vsegments 0.1.4 → 0.1.5

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (2):
  1. package/package.json +3 -2
  2. package/src/core.js +50 -39
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "vsegments",
3
- "version": "0.1.4",
3
+ "version": "0.1.5",
4
4
  "description": "Visual segmentation and bounding box detection using Google Gemini AI",
5
5
  "main": "src/index.js",
6
6
  "types": "src/index.d.ts",
@@ -43,7 +43,8 @@
43
43
  "@google/generative-ai": "^0.21.0",
44
44
  "canvas": "^2.11.2",
45
45
  "commander": "^12.0.0",
46
- "sharp": "^0.33.0"
46
+ "sharp": "^0.33.0",
47
+ "vsegments": "^0.1.4"
47
48
  },
48
49
  "devDependencies": {
49
50
  "@types/jest": "^30.0.0",
package/src/core.js CHANGED
@@ -7,11 +7,11 @@ const { loadImage } = require('canvas');
7
7
  const fs = require('fs').promises;
8
8
  const { SegmentationResult } = require('./models');
9
9
  const { parseBoundingBoxes, parseSegmentationMasks } = require('./utils');
10
- const {
11
- loadImageToCanvas,
12
- plotBoundingBoxes,
10
+ const {
11
+ loadImageToCanvas,
12
+ plotBoundingBoxes,
13
13
  plotSegmentationMasks,
14
- saveCanvas
14
+ saveCanvas
15
15
  } = require('./visualize');
16
16
 
17
17
  class VSegments {
@@ -25,26 +25,37 @@ class VSegments {
25
25
  */
26
26
  constructor(options = {}) {
27
27
  this.apiKey = options.apiKey || process.env.GOOGLE_API_KEY;
28
-
28
+
29
29
  if (!this.apiKey) {
30
30
  throw new Error(
31
31
  'API key must be provided or set in GOOGLE_API_KEY environment variable'
32
32
  );
33
33
  }
34
-
34
+
35
35
  this.model = options.model || 'gemini-3-pro-preview';
36
36
  this.temperature = options.temperature !== undefined ? options.temperature : 0.5;
37
37
  this.maxObjects = options.maxObjects || 25;
38
-
38
+
39
39
  // Initialize Google AI client
40
40
  this.genAI = new GoogleGenerativeAI(this.apiKey);
41
-
41
+
42
42
  // Default system instructions
43
43
  this.defaultSystemInstructions = `
44
- Return bounding boxes as a JSON array with labels. Never return masks or code fencing. Limit to ${this.maxObjects} objects.
45
- If an object is present multiple times, name them according to their unique characteristic (colors, size, position, unique characteristics, etc..).
44
+ Return bounding boxes as a JSON array with labels.
45
+ Never return masks or code fencing. Limit to ${this.maxObjects} objects.
46
+
47
+ Follow this intuition:
48
+ If an object is present multiple times, name them according to their unique characteristic
49
+ (colors, size, position, unique characteristics, etc..).
50
+ If the object is a face number them according to how someone would draw the features of the face. First the left eye, then the right eye
51
+ then the nose, then the left ear, then the right ear, then the mouth, then the chin.
52
+ Same with Animal faces.
53
+
54
+ General guideline:
55
+ Follow a drawing order intuitively. People usually do not draw first an eye, and then the background and then the shirt but they
56
+ follow a more natural order with symmetry in mind.
46
57
  `.trim();
47
-
58
+
48
59
  // Safety settings
49
60
  this.safetySettings = [
50
61
  {
@@ -53,7 +64,7 @@ If an object is present multiple times, name them according to their unique char
53
64
  },
54
65
  ];
55
66
  }
56
-
67
+
57
68
  /**
58
69
  * Load image from file and convert to format for API
59
70
  * @param {string} imagePath - Path to image file
@@ -62,7 +73,7 @@ If an object is present multiple times, name them according to their unique char
62
73
  async _loadImage(imagePath) {
63
74
  let imageBuffer = await fs.readFile(imagePath);
64
75
  let mimeType = this._getMimeType(imagePath);
65
-
76
+
66
77
  // Convert SVG to PNG for API compatibility
67
78
  if (mimeType === 'image/svg+xml') {
68
79
  const sharp = require('sharp');
@@ -71,9 +82,9 @@ If an object is present multiple times, name them according to their unique char
71
82
  .toBuffer();
72
83
  mimeType = 'image/png';
73
84
  }
74
-
85
+
75
86
  const base64Data = imageBuffer.toString('base64');
76
-
87
+
77
88
  return {
78
89
  inlineData: {
79
90
  data: base64Data,
@@ -81,7 +92,7 @@ If an object is present multiple times, name them according to their unique char
81
92
  }
82
93
  };
83
94
  }
84
-
95
+
85
96
  /**
86
97
  * Get MIME type from file extension
87
98
  * @param {string} filePath - File path
@@ -99,7 +110,7 @@ If an object is present multiple times, name them according to their unique char
99
110
  };
100
111
  return mimeTypes[ext] || 'image/jpeg';
101
112
  }
102
-
113
+
103
114
  /**
104
115
  * Get system instructions with custom additions
105
116
  * @param {string} customInstructions - Additional instructions
@@ -107,14 +118,14 @@ If an object is present multiple times, name them according to their unique char
107
118
  */
108
119
  _getSystemInstructions(customInstructions) {
109
120
  let instructions = this.defaultSystemInstructions;
110
-
121
+
111
122
  if (customInstructions) {
112
123
  instructions += '\n' + customInstructions;
113
124
  }
114
-
125
+
115
126
  return instructions;
116
127
  }
117
-
128
+
118
129
  /**
119
130
  * Detect bounding boxes in an image
120
131
  * @param {string} imagePath - Path to image file
@@ -130,17 +141,17 @@ If an object is present multiple times, name them according to their unique char
130
141
  customInstructions = null,
131
142
  maxSize = 1024
132
143
  } = options;
133
-
144
+
134
145
  // Load image
135
146
  const image = await this._loadImage(imagePath);
136
-
147
+
137
148
  // Get model
138
149
  const model = this.genAI.getGenerativeModel({
139
150
  model: this.model,
140
151
  safetySettings: this.safetySettings,
141
152
  systemInstruction: this._getSystemInstructions(customInstructions)
142
153
  });
143
-
154
+
144
155
  // Generate content
145
156
  let result, response, text;
146
157
  try {
@@ -150,7 +161,7 @@ If an object is present multiple times, name them according to their unique char
150
161
  temperature: this.temperature,
151
162
  }
152
163
  });
153
-
164
+
154
165
  response = result.response;
155
166
  text = response.text();
156
167
  } catch (error) {
@@ -161,13 +172,13 @@ If an object is present multiple times, name them according to their unique char
161
172
  }
162
173
  throw error;
163
174
  }
164
-
175
+
165
176
  // Parse response
166
177
  const boxes = parseBoundingBoxes(text);
167
-
178
+
168
179
  return new SegmentationResult(boxes, null, text);
169
180
  }
170
-
181
+
171
182
  /**
172
183
  * Perform segmentation on an image
173
184
  * @param {string} imagePath - Path to image file
@@ -181,21 +192,21 @@ If an object is present multiple times, name them according to their unique char
181
192
  prompt = 'Give the segmentation masks for the objects. Output a JSON list of segmentation masks where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels.',
182
193
  maxSize = 1024
183
194
  } = options;
184
-
195
+
185
196
  // Load image
186
197
  const image = await this._loadImage(imagePath);
187
-
198
+
188
199
  // Get image dimensions
189
200
  const img = await loadImage(imagePath);
190
201
  const imgWidth = img.width;
191
202
  const imgHeight = img.height;
192
-
203
+
193
204
  // Get model (no system instructions for segmentation)
194
205
  const model = this.genAI.getGenerativeModel({
195
206
  model: this.model,
196
207
  safetySettings: this.safetySettings
197
208
  });
198
-
209
+
199
210
  // Generate content
200
211
  let result, response, text;
201
212
  try {
@@ -205,7 +216,7 @@ If an object is present multiple times, name them according to their unique char
205
216
  temperature: this.temperature,
206
217
  }
207
218
  });
208
-
219
+
209
220
  response = result.response;
210
221
  text = response.text();
211
222
  } catch (error) {
@@ -216,14 +227,14 @@ If an object is present multiple times, name them according to their unique char
216
227
  }
217
228
  throw error;
218
229
  }
219
-
230
+
220
231
  // Parse response
221
232
  const boxes = parseBoundingBoxes(text);
222
233
  const masks = await parseSegmentationMasks(text, imgHeight, imgWidth);
223
-
234
+
224
235
  return new SegmentationResult(boxes, masks, text);
225
236
  }
226
-
237
+
227
238
  /**
228
239
  * Visualize detection/segmentation results
229
240
  * @param {string} imagePath - Path to original image
@@ -242,10 +253,10 @@ If an object is present multiple times, name them according to their unique char
242
253
  fontSize = 14,
243
254
  alpha = 0.7
244
255
  } = options;
245
-
256
+
246
257
  // Load image to canvas
247
258
  const canvas = await loadImageToCanvas(imagePath, 2048);
248
-
259
+
249
260
  // Draw visualizations
250
261
  if (result.masks) {
251
262
  plotSegmentationMasks(canvas, result.masks, {
@@ -259,12 +270,12 @@ If an object is present multiple times, name them according to their unique char
259
270
  fontSize
260
271
  });
261
272
  }
262
-
273
+
263
274
  // Save if requested
264
275
  if (outputPath) {
265
276
  await saveCanvas(canvas, outputPath);
266
277
  }
267
-
278
+
268
279
  return canvas;
269
280
  }
270
281
  }