vsegments 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +12 -0
- package/bin/cli.js +40 -2
- package/package.json +3 -2
- package/src/core.js +78 -40
package/bin/cli.js
CHANGED
|
@@ -11,11 +11,13 @@ const VSegments = require('../src/index');
|
|
|
11
11
|
|
|
12
12
|
const program = new Command();
|
|
13
13
|
|
|
14
|
+
const pkg = require('../package.json');
|
|
15
|
+
|
|
14
16
|
program
|
|
15
17
|
.name('vsegments')
|
|
16
18
|
.description('Visual segmentation and bounding box detection using Google Gemini AI')
|
|
17
|
-
.version(
|
|
18
|
-
.
|
|
19
|
+
.version(pkg.version)
|
|
20
|
+
.option('-f, --file <image>', 'Path to input image file')
|
|
19
21
|
.option('--segment', 'Perform segmentation instead of bounding box detection')
|
|
20
22
|
.option('--api-key <key>', 'Google API key (default: GOOGLE_API_KEY env var)')
|
|
21
23
|
.option('-m, --model <model>', 'Model name to use', 'gemini-flash-latest')
|
|
@@ -38,6 +40,42 @@ program.parse(process.argv);
|
|
|
38
40
|
|
|
39
41
|
const options = program.opts();
|
|
40
42
|
|
|
43
|
+
// Show welcome message if no file provided
|
|
44
|
+
if (!options.file) {
|
|
45
|
+
console.log(`
|
|
46
|
+
vsegments v${pkg.version}
|
|
47
|
+
Visual segmentation and bounding box detection using Google Gemini AI
|
|
48
|
+
|
|
49
|
+
QUICK START
|
|
50
|
+
vsegments -f image.jpg Detect objects with bounding boxes
|
|
51
|
+
vsegments -f image.jpg --segment Perform segmentation with masks
|
|
52
|
+
vsegments -f image.jpg -o output.png Save visualization to file
|
|
53
|
+
|
|
54
|
+
CUSTOM PROMPTS
|
|
55
|
+
vsegments -f photo.jpg -p "find all faces"
|
|
56
|
+
vsegments -f room.jpg -p "furniture items"
|
|
57
|
+
|
|
58
|
+
OUTPUT OPTIONS
|
|
59
|
+
--json results.json Export detection data as JSON
|
|
60
|
+
--compact Print minimal output: "1. label [x y xx yy]"
|
|
61
|
+
--raw Show raw API response
|
|
62
|
+
|
|
63
|
+
CONFIGURATION
|
|
64
|
+
--api-key <key> Google API key (or set GOOGLE_API_KEY env var)
|
|
65
|
+
--model <name> Model to use (default: gemini-flash-latest)
|
|
66
|
+
--temperature <0-1> Sampling temperature (default: 0.5)
|
|
67
|
+
--max-objects <n> Max objects to detect (default: 25)
|
|
68
|
+
|
|
69
|
+
VISUALIZATION
|
|
70
|
+
--line-width <n> Bounding box line width (default: 4)
|
|
71
|
+
--font-size <n> Label font size (default: 14)
|
|
72
|
+
--alpha <0-1> Mask transparency (default: 0.7)
|
|
73
|
+
|
|
74
|
+
Run 'vsegments --help' for full options.
|
|
75
|
+
`);
|
|
76
|
+
process.exit(0);
|
|
77
|
+
}
|
|
78
|
+
|
|
41
79
|
async function main() {
|
|
42
80
|
try {
|
|
43
81
|
// Validate file exists
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "vsegments",
|
|
3
|
-
"version": "0.1.4",
|
|
3
|
+
"version": "0.1.6",
|
|
4
4
|
"description": "Visual segmentation and bounding box detection using Google Gemini AI",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "src/index.d.ts",
|
|
@@ -43,7 +43,8 @@
|
|
|
43
43
|
"@google/generative-ai": "^0.21.0",
|
|
44
44
|
"canvas": "^2.11.2",
|
|
45
45
|
"commander": "^12.0.0",
|
|
46
|
-
"sharp": "^0.33.0"
|
|
46
|
+
"sharp": "^0.33.0",
|
|
47
|
+
"vsegments": "^0.1.4"
|
|
47
48
|
},
|
|
48
49
|
"devDependencies": {
|
|
49
50
|
"@types/jest": "^30.0.0",
|
package/src/core.js
CHANGED
|
@@ -7,11 +7,11 @@ const { loadImage } = require('canvas');
|
|
|
7
7
|
const fs = require('fs').promises;
|
|
8
8
|
const { SegmentationResult } = require('./models');
|
|
9
9
|
const { parseBoundingBoxes, parseSegmentationMasks } = require('./utils');
|
|
10
|
-
const {
|
|
11
|
-
loadImageToCanvas,
|
|
12
|
-
plotBoundingBoxes,
|
|
10
|
+
const {
|
|
11
|
+
loadImageToCanvas,
|
|
12
|
+
plotBoundingBoxes,
|
|
13
13
|
plotSegmentationMasks,
|
|
14
|
-
saveCanvas
|
|
14
|
+
saveCanvas
|
|
15
15
|
} = require('./visualize');
|
|
16
16
|
|
|
17
17
|
class VSegments {
|
|
@@ -25,26 +25,38 @@ class VSegments {
|
|
|
25
25
|
*/
|
|
26
26
|
constructor(options = {}) {
|
|
27
27
|
this.apiKey = options.apiKey || process.env.GOOGLE_API_KEY;
|
|
28
|
-
|
|
28
|
+
|
|
29
29
|
if (!this.apiKey) {
|
|
30
30
|
throw new Error(
|
|
31
31
|
'API key must be provided or set in GOOGLE_API_KEY environment variable'
|
|
32
32
|
);
|
|
33
33
|
}
|
|
34
|
-
|
|
34
|
+
|
|
35
35
|
this.model = options.model || 'gemini-3-pro-preview';
|
|
36
36
|
this.temperature = options.temperature !== undefined ? options.temperature : 0.5;
|
|
37
37
|
this.maxObjects = options.maxObjects || 25;
|
|
38
|
-
|
|
38
|
+
|
|
39
39
|
// Initialize Google AI client
|
|
40
40
|
this.genAI = new GoogleGenerativeAI(this.apiKey);
|
|
41
|
-
|
|
41
|
+
|
|
42
42
|
// Default system instructions
|
|
43
43
|
this.defaultSystemInstructions = `
|
|
44
|
-
Return bounding boxes as a JSON array with labels.
|
|
45
|
-
|
|
44
|
+
Return bounding boxes as a JSON array with labels.
|
|
45
|
+
Never return masks or code fencing. Limit to ${this.maxObjects} objects.
|
|
46
|
+
|
|
47
|
+
Follow this intuition:
|
|
48
|
+
|
|
49
|
+
If the object is a face number them according to how someone would draw the features of the face. First the left eye, then the right eye
|
|
50
|
+
then the nose, then the left ear, then the right ear, then the mouth, then the chin.
|
|
51
|
+
Same with Animal faces.
|
|
52
|
+
|
|
53
|
+
General guideline:
|
|
54
|
+
Follow a drawing order intuitively. People usually do not draw first an eye, and then the background and then the shirt but they
|
|
55
|
+
follow a more natural order with symmetry in mind.
|
|
56
|
+
|
|
57
|
+
No more than 10 features!
|
|
46
58
|
`.trim();
|
|
47
|
-
|
|
59
|
+
|
|
48
60
|
// Safety settings
|
|
49
61
|
this.safetySettings = [
|
|
50
62
|
{
|
|
@@ -53,7 +65,7 @@ If an object is present multiple times, name them according to their unique char
|
|
|
53
65
|
},
|
|
54
66
|
];
|
|
55
67
|
}
|
|
56
|
-
|
|
68
|
+
|
|
57
69
|
/**
|
|
58
70
|
* Load image from file and convert to format for API
|
|
59
71
|
* @param {string} imagePath - Path to image file
|
|
@@ -62,18 +74,44 @@ If an object is present multiple times, name them according to their unique char
|
|
|
62
74
|
async _loadImage(imagePath) {
|
|
63
75
|
let imageBuffer = await fs.readFile(imagePath);
|
|
64
76
|
let mimeType = this._getMimeType(imagePath);
|
|
65
|
-
|
|
77
|
+
|
|
66
78
|
// Convert SVG to PNG for API compatibility
|
|
67
79
|
if (mimeType === 'image/svg+xml') {
|
|
68
80
|
const sharp = require('sharp');
|
|
69
|
-
|
|
81
|
+
|
|
82
|
+
// Remove common registration/cut line colors from SVG before conversion
|
|
83
|
+
// These colors are often used for print registration marks, cut lines, etc.
|
|
84
|
+
let svgString = imageBuffer.toString('utf-8');
|
|
85
|
+
const registrationColors = [
|
|
86
|
+
'#ec008c', '#ED008C', // Magenta/pink registration
|
|
87
|
+
'#00ff00', '#00FF00', // Green registration
|
|
88
|
+
'#ff0000', '#FF0000', // Red registration (when used for cut lines)
|
|
89
|
+
];
|
|
90
|
+
|
|
91
|
+
// Remove elements with registration colors
|
|
92
|
+
for (const color of registrationColors) {
|
|
93
|
+
// Remove stroke colors
|
|
94
|
+
const strokeRegex = new RegExp(`stroke="${color}"`, 'gi');
|
|
95
|
+
svgString = svgString.replace(strokeRegex, 'stroke="none"');
|
|
96
|
+
// Remove fill colors
|
|
97
|
+
const fillRegex = new RegExp(`fill="${color}"`, 'gi');
|
|
98
|
+
svgString = svgString.replace(fillRegex, 'fill="none"');
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
// Increase thin stroke widths for better visibility
|
|
102
|
+
svgString = svgString.replace(/stroke-width:\s*0\.5pt/gi, 'stroke-width: 2pt');
|
|
103
|
+
svgString = svgString.replace(/stroke-width="0\.5pt"/gi, 'stroke-width="2pt"');
|
|
104
|
+
|
|
105
|
+
imageBuffer = await sharp(Buffer.from(svgString), { density: 300 })
|
|
106
|
+
.resize(1024, 1024, { fit: 'inside', withoutEnlargement: false })
|
|
107
|
+
.flatten({ background: { r: 245, g: 245, b: 245 } }) // Light gray background for better contrast
|
|
70
108
|
.png()
|
|
71
109
|
.toBuffer();
|
|
72
110
|
mimeType = 'image/png';
|
|
73
111
|
}
|
|
74
|
-
|
|
112
|
+
|
|
75
113
|
const base64Data = imageBuffer.toString('base64');
|
|
76
|
-
|
|
114
|
+
|
|
77
115
|
return {
|
|
78
116
|
inlineData: {
|
|
79
117
|
data: base64Data,
|
|
@@ -81,7 +119,7 @@ If an object is present multiple times, name them according to their unique char
|
|
|
81
119
|
}
|
|
82
120
|
};
|
|
83
121
|
}
|
|
84
|
-
|
|
122
|
+
|
|
85
123
|
/**
|
|
86
124
|
* Get MIME type from file extension
|
|
87
125
|
* @param {string} filePath - File path
|
|
@@ -99,7 +137,7 @@ If an object is present multiple times, name them according to their unique char
|
|
|
99
137
|
};
|
|
100
138
|
return mimeTypes[ext] || 'image/jpeg';
|
|
101
139
|
}
|
|
102
|
-
|
|
140
|
+
|
|
103
141
|
/**
|
|
104
142
|
* Get system instructions with custom additions
|
|
105
143
|
* @param {string} customInstructions - Additional instructions
|
|
@@ -107,14 +145,14 @@ If an object is present multiple times, name them according to their unique char
|
|
|
107
145
|
*/
|
|
108
146
|
_getSystemInstructions(customInstructions) {
|
|
109
147
|
let instructions = this.defaultSystemInstructions;
|
|
110
|
-
|
|
148
|
+
|
|
111
149
|
if (customInstructions) {
|
|
112
150
|
instructions += '\n' + customInstructions;
|
|
113
151
|
}
|
|
114
|
-
|
|
152
|
+
|
|
115
153
|
return instructions;
|
|
116
154
|
}
|
|
117
|
-
|
|
155
|
+
|
|
118
156
|
/**
|
|
119
157
|
* Detect bounding boxes in an image
|
|
120
158
|
* @param {string} imagePath - Path to image file
|
|
@@ -130,17 +168,17 @@ If an object is present multiple times, name them according to their unique char
|
|
|
130
168
|
customInstructions = null,
|
|
131
169
|
maxSize = 1024
|
|
132
170
|
} = options;
|
|
133
|
-
|
|
171
|
+
|
|
134
172
|
// Load image
|
|
135
173
|
const image = await this._loadImage(imagePath);
|
|
136
|
-
|
|
174
|
+
|
|
137
175
|
// Get model
|
|
138
176
|
const model = this.genAI.getGenerativeModel({
|
|
139
177
|
model: this.model,
|
|
140
178
|
safetySettings: this.safetySettings,
|
|
141
179
|
systemInstruction: this._getSystemInstructions(customInstructions)
|
|
142
180
|
});
|
|
143
|
-
|
|
181
|
+
|
|
144
182
|
// Generate content
|
|
145
183
|
let result, response, text;
|
|
146
184
|
try {
|
|
@@ -150,7 +188,7 @@ If an object is present multiple times, name them according to their unique char
|
|
|
150
188
|
temperature: this.temperature,
|
|
151
189
|
}
|
|
152
190
|
});
|
|
153
|
-
|
|
191
|
+
|
|
154
192
|
response = result.response;
|
|
155
193
|
text = response.text();
|
|
156
194
|
} catch (error) {
|
|
@@ -161,13 +199,13 @@ If an object is present multiple times, name them according to their unique char
|
|
|
161
199
|
}
|
|
162
200
|
throw error;
|
|
163
201
|
}
|
|
164
|
-
|
|
202
|
+
|
|
165
203
|
// Parse response
|
|
166
204
|
const boxes = parseBoundingBoxes(text);
|
|
167
|
-
|
|
205
|
+
|
|
168
206
|
return new SegmentationResult(boxes, null, text);
|
|
169
207
|
}
|
|
170
|
-
|
|
208
|
+
|
|
171
209
|
/**
|
|
172
210
|
* Perform segmentation on an image
|
|
173
211
|
* @param {string} imagePath - Path to image file
|
|
@@ -181,21 +219,21 @@ If an object is present multiple times, name them according to their unique char
|
|
|
181
219
|
prompt = 'Give the segmentation masks for the objects. Output a JSON list of segmentation masks where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels.',
|
|
182
220
|
maxSize = 1024
|
|
183
221
|
} = options;
|
|
184
|
-
|
|
222
|
+
|
|
185
223
|
// Load image
|
|
186
224
|
const image = await this._loadImage(imagePath);
|
|
187
|
-
|
|
225
|
+
|
|
188
226
|
// Get image dimensions
|
|
189
227
|
const img = await loadImage(imagePath);
|
|
190
228
|
const imgWidth = img.width;
|
|
191
229
|
const imgHeight = img.height;
|
|
192
|
-
|
|
230
|
+
|
|
193
231
|
// Get model (no system instructions for segmentation)
|
|
194
232
|
const model = this.genAI.getGenerativeModel({
|
|
195
233
|
model: this.model,
|
|
196
234
|
safetySettings: this.safetySettings
|
|
197
235
|
});
|
|
198
|
-
|
|
236
|
+
|
|
199
237
|
// Generate content
|
|
200
238
|
let result, response, text;
|
|
201
239
|
try {
|
|
@@ -205,7 +243,7 @@ If an object is present multiple times, name them according to their unique char
|
|
|
205
243
|
temperature: this.temperature,
|
|
206
244
|
}
|
|
207
245
|
});
|
|
208
|
-
|
|
246
|
+
|
|
209
247
|
response = result.response;
|
|
210
248
|
text = response.text();
|
|
211
249
|
} catch (error) {
|
|
@@ -216,14 +254,14 @@ If an object is present multiple times, name them according to their unique char
|
|
|
216
254
|
}
|
|
217
255
|
throw error;
|
|
218
256
|
}
|
|
219
|
-
|
|
257
|
+
|
|
220
258
|
// Parse response
|
|
221
259
|
const boxes = parseBoundingBoxes(text);
|
|
222
260
|
const masks = await parseSegmentationMasks(text, imgHeight, imgWidth);
|
|
223
|
-
|
|
261
|
+
|
|
224
262
|
return new SegmentationResult(boxes, masks, text);
|
|
225
263
|
}
|
|
226
|
-
|
|
264
|
+
|
|
227
265
|
/**
|
|
228
266
|
* Visualize detection/segmentation results
|
|
229
267
|
* @param {string} imagePath - Path to original image
|
|
@@ -242,10 +280,10 @@ If an object is present multiple times, name them according to their unique char
|
|
|
242
280
|
fontSize = 14,
|
|
243
281
|
alpha = 0.7
|
|
244
282
|
} = options;
|
|
245
|
-
|
|
283
|
+
|
|
246
284
|
// Load image to canvas
|
|
247
285
|
const canvas = await loadImageToCanvas(imagePath, 2048);
|
|
248
|
-
|
|
286
|
+
|
|
249
287
|
// Draw visualizations
|
|
250
288
|
if (result.masks) {
|
|
251
289
|
plotSegmentationMasks(canvas, result.masks, {
|
|
@@ -259,12 +297,12 @@ If an object is present multiple times, name them according to their unique char
|
|
|
259
297
|
fontSize
|
|
260
298
|
});
|
|
261
299
|
}
|
|
262
|
-
|
|
300
|
+
|
|
263
301
|
// Save if requested
|
|
264
302
|
if (outputPath) {
|
|
265
303
|
await saveCanvas(canvas, outputPath);
|
|
266
304
|
}
|
|
267
|
-
|
|
305
|
+
|
|
268
306
|
return canvas;
|
|
269
307
|
}
|
|
270
308
|
}
|