vsegments 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(npm rebuild:*)",
+      "Bash(vsegments:*)",
+      "Bash(node -e:*)",
+      "Bash(git add:*)",
+      "Bash(git commit:*)",
+      "Bash(npm publish:*)"
+    ]
+  }
+}
package/bin/cli.js CHANGED
@@ -11,11 +11,13 @@ const VSegments = require('../src/index');
 
 const program = new Command();
 
+const pkg = require('../package.json');
+
 program
   .name('vsegments')
   .description('Visual segmentation and bounding box detection using Google Gemini AI')
-  .version('0.1.0')
-  .requiredOption('-f, --file <image>', 'Path to input image file')
+  .version(pkg.version)
+  .option('-f, --file <image>', 'Path to input image file')
   .option('--segment', 'Perform segmentation instead of bounding box detection')
   .option('--api-key <key>', 'Google API key (default: GOOGLE_API_KEY env var)')
   .option('-m, --model <model>', 'Model name to use', 'gemini-flash-latest')
@@ -38,6 +40,42 @@ program.parse(process.argv);
 
 const options = program.opts();
 
+// Show welcome message if no file provided
+if (!options.file) {
+  console.log(`
+vsegments v${pkg.version}
+Visual segmentation and bounding box detection using Google Gemini AI
+
+QUICK START
+  vsegments -f image.jpg                Detect objects with bounding boxes
+  vsegments -f image.jpg --segment      Perform segmentation with masks
+  vsegments -f image.jpg -o output.png  Save visualization to file
+
+CUSTOM PROMPTS
+  vsegments -f photo.jpg -p "find all faces"
+  vsegments -f room.jpg -p "furniture items"
+
+OUTPUT OPTIONS
+  --json results.json    Export detection data as JSON
+  --compact              Print minimal output: "1. label [x y xx yy]"
+  --raw                  Show raw API response
+
+CONFIGURATION
+  --api-key <key>        Google API key (or set GOOGLE_API_KEY env var)
+  --model <name>         Model to use (default: gemini-flash-latest)
+  --temperature <0-1>    Sampling temperature (default: 0.5)
+  --max-objects <n>      Max objects to detect (default: 25)
+
+VISUALIZATION
+  --line-width <n>       Bounding box line width (default: 4)
+  --font-size <n>        Label font size (default: 14)
+  --alpha <0-1>          Mask transparency (default: 0.7)
+
+Run 'vsegments --help' for full options.
+`);
+  process.exit(0);
+}
+
 async function main() {
   try {
     // Validate file exists
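For reference, here is a minimal standalone sketch of the commander pattern this cli.js change moves to: the version string is read from package.json instead of being hard-coded, -f/--file becomes optional, and a short banner is printed when it is omitted. The option list is trimmed to the flags visible in this diff; nothing else about the real CLI is assumed.

#!/usr/bin/env node
// Sketch only: mirrors the flags shown in this diff, not the full vsegments CLI.
const { Command } = require('commander');
const pkg = require('../package.json'); // single source of truth for the version

const program = new Command();

program
  .name('vsegments')
  .description('Visual segmentation and bounding box detection using Google Gemini AI')
  .version(pkg.version)                                      // was the hard-coded '0.1.0'
  .option('-f, --file <image>', 'Path to input image file')  // was requiredOption(...)
  .option('--segment', 'Perform segmentation instead of bounding box detection');

program.parse(process.argv);
const options = program.opts();

// With --file no longer required, a missing file triggers a friendly banner
// instead of commander's "required option" error.
if (!options.file) {
  console.log(`vsegments v${pkg.version} (run 'vsegments --help' for full options)`);
  process.exit(0);
}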
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "vsegments",
-  "version": "0.1.4",
+  "version": "0.1.6",
   "description": "Visual segmentation and bounding box detection using Google Gemini AI",
   "main": "src/index.js",
   "types": "src/index.d.ts",
@@ -43,7 +43,8 @@
     "@google/generative-ai": "^0.21.0",
     "canvas": "^2.11.2",
     "commander": "^12.0.0",
-    "sharp": "^0.33.0"
+    "sharp": "^0.33.0",
+    "vsegments": "^0.1.4"
   },
   "devDependencies": {
     "@types/jest": "^30.0.0",
package/src/core.js CHANGED
@@ -7,11 +7,11 @@ const { loadImage } = require('canvas');
 const fs = require('fs').promises;
 const { SegmentationResult } = require('./models');
 const { parseBoundingBoxes, parseSegmentationMasks } = require('./utils');
-const {
-  loadImageToCanvas,
-  plotBoundingBoxes,
+const {
+  loadImageToCanvas,
+  plotBoundingBoxes,
   plotSegmentationMasks,
-  saveCanvas
+  saveCanvas
 } = require('./visualize');
 
 class VSegments {
@@ -25,26 +25,38 @@ class VSegments {
    */
   constructor(options = {}) {
     this.apiKey = options.apiKey || process.env.GOOGLE_API_KEY;
-
+
     if (!this.apiKey) {
       throw new Error(
         'API key must be provided or set in GOOGLE_API_KEY environment variable'
       );
     }
-
+
     this.model = options.model || 'gemini-3-pro-preview';
     this.temperature = options.temperature !== undefined ? options.temperature : 0.5;
     this.maxObjects = options.maxObjects || 25;
-
+
     // Initialize Google AI client
     this.genAI = new GoogleGenerativeAI(this.apiKey);
-
+
     // Default system instructions
     this.defaultSystemInstructions = `
-Return bounding boxes as a JSON array with labels. Never return masks or code fencing. Limit to ${this.maxObjects} objects.
-If an object is present multiple times, name them according to their unique characteristic (colors, size, position, unique characteristics, etc..).
+Return bounding boxes as a JSON array with labels.
+Never return masks or code fencing. Limit to ${this.maxObjects} objects.
+
+Follow this intuition:
+
+If the object is a face number them according to how someone would draw the features of the face. First the left eye, then the right eye
+then the nose, then the left ear, then the right ear, then the mouth, then the chin.
+Same with Animal faces.
+
+General guideline:
+Follow a drawing order intuitively. People usually do not draw first an eye, and then the background and then the shirt but they
+follow a more natural order with symmetry in mind.
+
+No more than 10 features!
     `.trim();
-
+
     // Safety settings
     this.safetySettings = [
       {
@@ -53,7 +65,7 @@ If an object is present multiple times, name them according to their unique char
       },
     ];
   }
-
+
   /**
    * Load image from file and convert to format for API
   * @param {string} imagePath - Path to image file
@@ -62,18 +74,44 @@ If an object is present multiple times, name them according to their unique char
   async _loadImage(imagePath) {
     let imageBuffer = await fs.readFile(imagePath);
     let mimeType = this._getMimeType(imagePath);
-
+
     // Convert SVG to PNG for API compatibility
     if (mimeType === 'image/svg+xml') {
       const sharp = require('sharp');
-      imageBuffer = await sharp(imageBuffer)
+
+      // Remove common registration/cut line colors from SVG before conversion
+      // These colors are often used for print registration marks, cut lines, etc.
+      let svgString = imageBuffer.toString('utf-8');
+      const registrationColors = [
+        '#ec008c', '#ED008C', // Magenta/pink registration
+        '#00ff00', '#00FF00', // Green registration
+        '#ff0000', '#FF0000', // Red registration (when used for cut lines)
+      ];
+
+      // Remove elements with registration colors
+      for (const color of registrationColors) {
+        // Remove stroke colors
+        const strokeRegex = new RegExp(`stroke="${color}"`, 'gi');
+        svgString = svgString.replace(strokeRegex, 'stroke="none"');
+        // Remove fill colors
+        const fillRegex = new RegExp(`fill="${color}"`, 'gi');
+        svgString = svgString.replace(fillRegex, 'fill="none"');
+      }
+
+      // Increase thin stroke widths for better visibility
+      svgString = svgString.replace(/stroke-width:\s*0\.5pt/gi, 'stroke-width: 2pt');
+      svgString = svgString.replace(/stroke-width="0\.5pt"/gi, 'stroke-width="2pt"');
+
+      imageBuffer = await sharp(Buffer.from(svgString), { density: 300 })
+        .resize(1024, 1024, { fit: 'inside', withoutEnlargement: false })
+        .flatten({ background: { r: 245, g: 245, b: 245 } }) // Light gray background for better contrast
         .png()
         .toBuffer();
       mimeType = 'image/png';
     }
-
+
     const base64Data = imageBuffer.toString('base64');
-
+
     return {
       inlineData: {
         data: base64Data,
@@ -81,7 +119,7 @@ If an object is present multiple times, name them according to their unique char
       }
     };
   }
-
+
   /**
    * Get MIME type from file extension
   * @param {string} filePath - File path
@@ -99,7 +137,7 @@ If an object is present multiple times, name them according to their unique char
     };
     return mimeTypes[ext] || 'image/jpeg';
   }
-
+
   /**
   * Get system instructions with custom additions
   * @param {string} customInstructions - Additional instructions
@@ -107,14 +145,14 @@ If an object is present multiple times, name them according to their unique char
    */
   _getSystemInstructions(customInstructions) {
     let instructions = this.defaultSystemInstructions;
-
+
     if (customInstructions) {
       instructions += '\n' + customInstructions;
     }
-
+
     return instructions;
   }
-
+
   /**
    * Detect bounding boxes in an image
   * @param {string} imagePath - Path to image file
@@ -130,17 +168,17 @@ If an object is present multiple times, name them according to their unique char
       customInstructions = null,
       maxSize = 1024
     } = options;
-
+
     // Load image
     const image = await this._loadImage(imagePath);
-
+
     // Get model
     const model = this.genAI.getGenerativeModel({
       model: this.model,
       safetySettings: this.safetySettings,
       systemInstruction: this._getSystemInstructions(customInstructions)
     });
-
+
     // Generate content
     let result, response, text;
     try {
@@ -150,7 +188,7 @@ If an object is present multiple times, name them according to their unique char
           temperature: this.temperature,
         }
       });
-
+
       response = result.response;
       text = response.text();
     } catch (error) {
@@ -161,13 +199,13 @@ If an object is present multiple times, name them according to their unique char
       }
       throw error;
     }
-
+
     // Parse response
     const boxes = parseBoundingBoxes(text);
-
+
     return new SegmentationResult(boxes, null, text);
   }
-
+
   /**
    * Perform segmentation on an image
   * @param {string} imagePath - Path to image file
@@ -181,21 +219,21 @@ If an object is present multiple times, name them according to their unique char
       prompt = 'Give the segmentation masks for the objects. Output a JSON list of segmentation masks where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels.',
       maxSize = 1024
     } = options;
-
+
     // Load image
     const image = await this._loadImage(imagePath);
-
+
     // Get image dimensions
     const img = await loadImage(imagePath);
     const imgWidth = img.width;
     const imgHeight = img.height;
-
+
     // Get model (no system instructions for segmentation)
     const model = this.genAI.getGenerativeModel({
       model: this.model,
       safetySettings: this.safetySettings
     });
-
+
     // Generate content
     let result, response, text;
     try {
@@ -205,7 +243,7 @@ If an object is present multiple times, name them according to their unique char
           temperature: this.temperature,
         }
       });
-
+
       response = result.response;
       text = response.text();
     } catch (error) {
@@ -216,14 +254,14 @@ If an object is present multiple times, name them according to their unique char
       }
       throw error;
     }
-
+
     // Parse response
     const boxes = parseBoundingBoxes(text);
     const masks = await parseSegmentationMasks(text, imgHeight, imgWidth);
-
+
     return new SegmentationResult(boxes, masks, text);
  }
-
+
   /**
    * Visualize detection/segmentation results
   * @param {string} imagePath - Path to original image
@@ -242,10 +280,10 @@ If an object is present multiple times, name them according to their unique char
       fontSize = 14,
       alpha = 0.7
     } = options;
-
+
     // Load image to canvas
     const canvas = await loadImageToCanvas(imagePath, 2048);
-
+
     // Draw visualizations
     if (result.masks) {
       plotSegmentationMasks(canvas, result.masks, {
@@ -259,12 +297,12 @@ If an object is present multiple times, name them according to their unique char
         fontSize
       });
     }
-
+
     // Save if requested
     if (outputPath) {
       await saveCanvas(canvas, outputPath);
     }
-
+
     return canvas;
   }
 }
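
The SVG branch added to _loadImage above can be read as a self-contained preprocessing step. The sketch below isolates it; the helper name svgToPng is hypothetical and the package does not export such a function, but the regexes, colors, and sharp options are taken directly from the diff: registration/cut-line colors are blanked out, 0.5pt hairlines are thickened, then sharp rasterizes the cleaned SVG at 300 DPI, fits it inside 1024x1024, and flattens it onto a light gray background before encoding PNG.

// Sketch of the SVG preprocessing introduced in core.js (illustrative helper, not part of the package API).
const sharp = require('sharp');

async function svgToPng(svgBuffer) {
  let svgString = svgBuffer.toString('utf-8');

  // Colors commonly used for print registration marks / cut lines, as listed in the diff.
  const registrationColors = ['#ec008c', '#ED008C', '#00ff00', '#00FF00', '#ff0000', '#FF0000'];

  for (const color of registrationColors) {
    // Hide any stroke or fill that uses a registration color.
    svgString = svgString.replace(new RegExp(`stroke="${color}"`, 'gi'), 'stroke="none"');
    svgString = svgString.replace(new RegExp(`fill="${color}"`, 'gi'), 'fill="none"');
  }

  // Thicken 0.5pt hairlines so they survive rasterization.
  svgString = svgString.replace(/stroke-width:\s*0\.5pt/gi, 'stroke-width: 2pt');
  svgString = svgString.replace(/stroke-width="0\.5pt"/gi, 'stroke-width="2pt"');

  // Rasterize at 300 DPI, cap at 1024x1024, flatten onto light gray for contrast, return PNG bytes.
  return sharp(Buffer.from(svgString), { density: 300 })
    .resize(1024, 1024, { fit: 'inside', withoutEnlargement: false })
    .flatten({ background: { r: 245, g: 245, b: 245 } })
    .png()
    .toBuffer();
}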