vsegments 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/core.js ADDED
@@ -0,0 +1,241 @@
1
+ /**
2
+ * Core VSegments class for image segmentation and bounding box detection
3
+ */
4
+
5
+ const { GoogleGenerativeAI } = require('@google/generative-ai');
6
+ const { loadImage } = require('canvas');
7
+ const fs = require('fs').promises;
8
+ const { SegmentationResult } = require('./models');
9
+ const { parseBoundingBoxes, parseSegmentationMasks } = require('./utils');
10
+ const {
11
+ loadImageToCanvas,
12
+ plotBoundingBoxes,
13
+ plotSegmentationMasks,
14
+ saveCanvas
15
+ } = require('./visualize');
16
+
17
class VSegments {
  /**
   * Main class for visual segmentation using Google Gemini AI.
   * @param {Object} options - Configuration options
   * @param {string} [options.apiKey] - Google API key (defaults to GOOGLE_API_KEY env var)
   * @param {string} [options.model] - Model name (default: gemini-flash-latest)
   * @param {number} [options.temperature] - Sampling temperature (default: 0.5)
   * @param {number} [options.maxObjects] - Maximum objects to detect (default: 25)
   * @throws {Error} If no API key is provided and GOOGLE_API_KEY is unset
   */
  constructor(options = {}) {
    this.apiKey = options.apiKey || process.env.GOOGLE_API_KEY;

    if (!this.apiKey) {
      throw new Error(
        'API key must be provided or set in GOOGLE_API_KEY environment variable'
      );
    }

    this.model = options.model || 'gemini-flash-latest';
    // Explicit undefined checks (not ||) so falsy-but-valid values such as
    // temperature 0 are not silently replaced by the defaults.
    this.temperature = options.temperature !== undefined ? options.temperature : 0.5;
    this.maxObjects = options.maxObjects !== undefined ? options.maxObjects : 25;

    // Initialize Google AI client
    this.genAI = new GoogleGenerativeAI(this.apiKey);

    // Default system instructions used for bounding-box detection.
    this.defaultSystemInstructions = `
Return bounding boxes as a JSON array with labels. Never return masks or code fencing. Limit to ${this.maxObjects} objects.
If an object is present multiple times, name them according to their unique characteristic (colors, size, position, unique characteristics, etc..).
`.trim();

    // Safety settings forwarded verbatim to the Gemini API.
    this.safetySettings = [
      {
        category: 'HARM_CATEGORY_DANGEROUS_CONTENT',
        threshold: 'BLOCK_ONLY_HIGH',
      },
    ];
  }

  /**
   * Read an image file and wrap it in the inline-data shape expected by the
   * Gemini generateContent API.
   * @param {string} imagePath - Path to image file
   * @returns {Promise<Object>} - { inlineData: { data, mimeType } } with base64 payload
   */
  async _loadImage(imagePath) {
    const imageBuffer = await fs.readFile(imagePath);
    const base64Data = imageBuffer.toString('base64');
    const mimeType = this._getMimeType(imagePath);

    return {
      inlineData: {
        data: base64Data,
        mimeType: mimeType
      }
    };
  }

  /**
   * Guess a MIME type from the file extension.
   * @param {string} filePath - File path
   * @returns {string} - MIME type; falls back to image/jpeg for unknown extensions
   */
  _getMimeType(filePath) {
    const ext = filePath.toLowerCase().split('.').pop();
    const mimeTypes = {
      'jpg': 'image/jpeg',
      'jpeg': 'image/jpeg',
      'png': 'image/png',
      'gif': 'image/gif',
      'webp': 'image/webp'
    };
    return mimeTypes[ext] || 'image/jpeg';
  }

  /**
   * Build the full system instructions, appending any caller-supplied text.
   * @param {string|null} customInstructions - Additional instructions (optional)
   * @returns {string} - Complete system instructions
   */
  _getSystemInstructions(customInstructions) {
    let instructions = this.defaultSystemInstructions;

    if (customInstructions) {
      instructions += '\n' + customInstructions;
    }

    return instructions;
  }

  /**
   * Detect bounding boxes in an image.
   * @param {string} imagePath - Path to image file
   * @param {Object} [options] - Detection options
   * @param {string} [options.prompt] - Custom prompt
   * @param {string} [options.customInstructions] - Additional system instructions
   * @param {number} [options.maxSize] - Maximum image dimension
   * @returns {Promise<SegmentationResult>} - Detection results (boxes only, masks null)
   */
  async detectBoxes(imagePath, options = {}) {
    const {
      prompt = 'Detect the 2d bounding boxes',
      customInstructions = null,
      // NOTE(review): maxSize is accepted but never applied — the image is
      // sent at full resolution. TODO: wire up downscaling or drop the option.
      maxSize = 1024
    } = options;

    // Load image
    const image = await this._loadImage(imagePath);

    // Get model
    const model = this.genAI.getGenerativeModel({
      model: this.model,
      safetySettings: this.safetySettings,
      systemInstruction: this._getSystemInstructions(customInstructions)
    });

    // Generate content
    const result = await model.generateContent({
      contents: [{ role: 'user', parts: [{ text: prompt }, image] }],
      generationConfig: {
        temperature: this.temperature,
      }
    });

    const response = result.response;
    const text = response.text();

    // Parse response
    const boxes = parseBoundingBoxes(text);

    return new SegmentationResult(boxes, null, text);
  }

  /**
   * Perform segmentation on an image.
   * @param {string} imagePath - Path to image file
   * @param {Object} [options] - Segmentation options
   * @param {string} [options.prompt] - Custom prompt
   * @param {number} [options.maxSize] - Maximum image dimension
   * @returns {Promise<SegmentationResult>} - Results with both boxes and masks
   */
  async segment(imagePath, options = {}) {
    const {
      prompt = 'Give the segmentation masks for the objects. Output a JSON list of segmentation masks where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels.',
      // NOTE(review): maxSize is accepted but never applied — see detectBoxes.
      maxSize = 1024
    } = options;

    // Load image
    const image = await this._loadImage(imagePath);

    // Get image dimensions (needed to scale masks back to pixel space)
    const img = await loadImage(imagePath);
    const imgWidth = img.width;
    const imgHeight = img.height;

    // Get model (no system instructions for segmentation — the prompt
    // itself specifies the expected JSON output)
    const model = this.genAI.getGenerativeModel({
      model: this.model,
      safetySettings: this.safetySettings
    });

    // Generate content
    const result = await model.generateContent({
      contents: [{ role: 'user', parts: [{ text: prompt }, image] }],
      generationConfig: {
        temperature: this.temperature,
      }
    });

    const response = result.response;
    const text = response.text();

    // Parse response
    const boxes = parseBoundingBoxes(text);
    const masks = await parseSegmentationMasks(text, imgHeight, imgWidth);

    return new SegmentationResult(boxes, masks, text);
  }

  /**
   * Visualize detection/segmentation results on a canvas.
   * @param {string} imagePath - Path to original image
   * @param {SegmentationResult} result - Detection/segmentation results
   * @param {Object} [options] - Visualization options
   * @param {string} [options.outputPath] - Path to save output (skipped when null)
   * @param {number} [options.lineWidth] - Bounding box line width (default 4)
   * @param {number} [options.fontSize] - Label font size (default 14)
   * @param {number} [options.alpha] - Mask transparency 0-1 (default 0.7)
   * @param {number} [options.maxSize] - Maximum canvas dimension (default 2048,
   *   previously hard-coded)
   * @returns {Promise<Canvas>} - Canvas with visualizations drawn on it
   */
  async visualize(imagePath, result, options = {}) {
    const {
      outputPath = null,
      lineWidth = 4,
      fontSize = 14,
      alpha = 0.7,
      maxSize = 2048
    } = options;

    // Load image to canvas
    const canvas = await loadImageToCanvas(imagePath, maxSize);

    // Prefer mask rendering when masks are present; otherwise draw boxes.
    if (result.masks) {
      plotSegmentationMasks(canvas, result.masks, {
        lineWidth,
        fontSize,
        alpha
      });
    } else {
      plotBoundingBoxes(canvas, result.boxes, {
        lineWidth,
        fontSize
      });
    }

    // Save if requested
    if (outputPath) {
      await saveCanvas(canvas, outputPath);
    }

    return canvas;
  }
}
240
+
241
// Sole export of this module: the VSegments class itself.
module.exports = VSegments;
package/src/index.d.ts ADDED
@@ -0,0 +1,83 @@
1
/**
 * TypeScript definitions for vsegments
 */

/// <reference types="node" />

import { Canvas } from 'canvas';

/** Plain-object shape of a single detection as returned by the API. */
export interface BoundingBoxData {
  label: string;
  y1: number;
  x1: number;
  y2: number;
  x2: number;
}

/** A labelled 2D bounding box; coordinates are normalized to 0-1000. */
export class BoundingBox {
  label: string;
  y1: number;
  x1: number;
  y2: number;
  x2: number;

  constructor(label: string, y1: number, x1: number, y2: number, x2: number);
  /** Convert normalized coordinates to absolute pixels: [x1, y1, x2, y2]. */
  toAbsolute(imgWidth: number, imgHeight: number): [number, number, number, number];
  /** Build a BoundingBox from a raw API item containing box_2d and label. */
  static fromDict(data: any): BoundingBox;
}

/** A per-object segmentation mask with absolute-pixel box coordinates. */
export class SegmentationMask {
  y0: number;
  x0: number;
  y1: number;
  x1: number;
  /** Full-image-size mask buffer (height * width bytes), values 0-255. */
  mask: Buffer;
  label: string;

  constructor(y0: number, x0: number, y1: number, x1: number, mask: Buffer, label: string);
}

/** Container for detection/segmentation results. */
export class SegmentationResult {
  boxes: BoundingBox[];
  /** Null for box-only detection results. */
  masks: SegmentationMask[] | null;
  rawResponse: string | null;

  constructor(boxes: BoundingBox[], masks?: SegmentationMask[] | null, rawResponse?: string | null);
  /** Number of detected objects (boxes.length). */
  get length(): number;
}

/** Constructor options for VSegments. */
export interface VSegmentsOptions {
  /** Google API key; falls back to the GOOGLE_API_KEY environment variable. */
  apiKey?: string;
  /** Model name (default: gemini-flash-latest). */
  model?: string;
  /** Sampling temperature (default: 0.5). */
  temperature?: number;
  /** Maximum objects to detect (default: 25). */
  maxObjects?: number;
}

export interface DetectBoxesOptions {
  prompt?: string;
  customInstructions?: string;
  /** NOTE(review): accepted but not currently applied by the implementation. */
  maxSize?: number;
}

export interface SegmentOptions {
  prompt?: string;
  /** NOTE(review): accepted but not currently applied by the implementation. */
  maxSize?: number;
}

export interface VisualizeOptions {
  outputPath?: string;
  lineWidth?: number;
  fontSize?: number;
  /** Mask transparency in [0, 1]. */
  alpha?: number;
}

/** Main entry point: visual segmentation using Google Gemini AI. */
export default class VSegments {
  constructor(options?: VSegmentsOptions);

  detectBoxes(imagePath: string, options?: DetectBoxesOptions): Promise<SegmentationResult>;
  segment(imagePath: string, options?: SegmentOptions): Promise<SegmentationResult>;
  visualize(imagePath: string, result: SegmentationResult, options?: VisualizeOptions): Promise<Canvas>;
}

export { VSegments };
export const version: string;
package/src/index.js ADDED
@@ -0,0 +1,20 @@
1
+ /**
2
+ * vsegments - Visual Segmentation Library
3
+ *
4
+ * A Node.js library for image segmentation and bounding box detection
5
+ * using Google Gemini AI.
6
+ *
7
+ * @module vsegments
8
+ * @author Marco Kotrotsos
9
+ * @license MIT
10
+ */
11
+
12
const VSegments = require('./core');
const { BoundingBox, SegmentationMask, SegmentationResult } = require('./models');

// The package's main export is the VSegments class itself; the data-model
// classes and the package version are attached as properties so callers can
// either use `require('vsegments')` directly or destructure named exports.
module.exports = VSegments;
module.exports.VSegments = VSegments;
Object.assign(module.exports, { BoundingBox, SegmentationMask, SegmentationResult });
module.exports.version = '0.1.0';
package/src/models.js ADDED
@@ -0,0 +1,99 @@
1
+ /**
2
+ * Data models for vsegments
3
+ */
4
+
5
class BoundingBox {
  /**
   * A labelled 2D bounding box whose coordinates are normalized to 0-1000.
   * @param {string} label - Object label
   * @param {number} y1 - Top coordinate (normalized 0-1000)
   * @param {number} x1 - Left coordinate (normalized 0-1000)
   * @param {number} y2 - Bottom coordinate (normalized 0-1000)
   * @param {number} x2 - Right coordinate (normalized 0-1000)
   */
  constructor(label, y1, x1, y2, x2) {
    Object.assign(this, { label, y1, x1, y2, x2 });
  }

  /**
   * Scale the normalized coordinates up to absolute pixel coordinates.
   * @param {number} imgWidth - Image width in pixels
   * @param {number} imgHeight - Image height in pixels
   * @returns {number[]} - [absX1, absY1, absX2, absY2]
   */
  toAbsolute(imgWidth, imgHeight) {
    const scaleX = (v) => Math.round((v / 1000) * imgWidth);
    const scaleY = (v) => Math.round((v / 1000) * imgHeight);
    return [scaleX(this.x1), scaleY(this.y1), scaleX(this.x2), scaleY(this.y2)];
  }

  /**
   * Construct a BoundingBox from a raw API response item.
   * @param {Object} data - Raw item with `box_2d` ([y1, x1, y2, x2]) and `label`
   * @returns {BoundingBox}
   */
  static fromDict(data) {
    const [y1, x1, y2, x2] = data.box_2d;
    return new BoundingBox(data.label, y1, x1, y2, x2);
  }
}
52
+
53
class SegmentationMask {
  /**
   * A segmentation mask for one detected object.
   * @param {number} y0 - Top coordinate (absolute pixels)
   * @param {number} x0 - Left coordinate (absolute pixels)
   * @param {number} y1 - Bottom coordinate (absolute pixels)
   * @param {number} x1 - Right coordinate (absolute pixels)
   * @param {Buffer} mask - Mask data [height, width] with values 0-255
   * @param {string} label - Object label
   */
  constructor(y0, x0, y1, x1, mask, label) {
    Object.assign(this, { y0, x0, y1, x1, mask, label });
  }
}
72
+
73
class SegmentationResult {
  /**
   * Container for detection/segmentation results.
   * @param {BoundingBox[]} boxes - Array of bounding boxes
   * @param {SegmentationMask[]|null} masks - Array of masks, or null for
   *   box-only detection results
   * @param {string|null} rawResponse - Raw API response text (optional)
   */
  constructor(boxes, masks = null, rawResponse = null) {
    Object.assign(this, { boxes, masks, rawResponse });
  }

  /**
   * Number of detected objects.
   * @returns {number}
   */
  get length() {
    return this.boxes.length;
  }
}
94
+
95
// Public data-model exports, consumed by core.js and utils.js.
module.exports = {
  BoundingBox,
  SegmentationMask,
  SegmentationResult
};
package/src/utils.js ADDED
@@ -0,0 +1,118 @@
1
+ /**
2
+ * Utility functions for parsing and processing API responses
3
+ */
4
+
5
+ const { BoundingBox, SegmentationMask } = require('./models');
6
+ const { createCanvas, loadImage } = require('canvas');
7
+
8
/**
 * Strip markdown code fencing from a model response so it can be passed to
 * JSON.parse. Handles both "```json" and bare "```" opening fences; text
 * without any fence is returned unchanged (trimmed).
 * @param {string} jsonOutput - Raw response text, possibly fenced
 * @returns {string} - Cleaned JSON string
 */
function parseJson(jsonOutput) {
  const lines = jsonOutput.split('\n');
  for (let i = 0; i < lines.length; i++) {
    const marker = lines[i].trim();
    // Models sometimes fence output despite instructions; accept a bare
    // "```" opening fence in addition to the "```json" variant.
    if (marker === '```json' || marker === '```') {
      jsonOutput = lines.slice(i + 1).join('\n');
      jsonOutput = jsonOutput.split('```')[0];
      break;
    }
  }
  return jsonOutput.trim();
}

/**
 * Parse bounding boxes from an API response.
 * @param {string} responseText - Raw response text from API
 * @returns {BoundingBox[]} - One BoundingBox per entry that carries a box_2d key
 * @throws {Error} If the cleaned response is not valid JSON or not a JSON array
 */
function parseBoundingBoxes(responseText) {
  const cleanedJson = parseJson(responseText);
  const data = JSON.parse(cleanedJson);

  // Fail fast with a clear message instead of the opaque TypeError that
  // iterating a non-array value would produce.
  if (!Array.isArray(data)) {
    throw new Error(`Expected a JSON array of detections, got ${typeof data}`);
  }

  return data
    .filter((item) => item && item.box_2d)
    .map((item) => BoundingBox.fromDict(item));
}
43
+
44
/**
 * Parse segmentation masks from an API response.
 *
 * Each response item is expected to carry `box_2d` ([y0, x0, y1, x1],
 * normalized 0-1000), a base64 PNG in `mask`, and a `label`. The PNG is
 * decoded, resized to the box's pixel size, and its alpha channel is copied
 * into a full-image-size byte buffer at the box's position.
 *
 * NOTE(review): a zero- or negative-area box would make createCanvas throw;
 * assumes well-formed boxes from the API — confirm upstream validation.
 *
 * @param {string} responseText - Raw response text
 * @param {number} imgHeight - Image height
 * @param {number} imgWidth - Image width
 * @returns {Promise<SegmentationMask[]>} - Array of SegmentationMask objects
 */
async function parseSegmentationMasks(responseText, imgHeight, imgWidth) {
  const cleanedJson = parseJson(responseText);
  const data = JSON.parse(cleanedJson);

  const masks = [];

  for (const item of data) {
    // Skip entries that lack either a box or a mask payload.
    if (!item.box_2d || !item.mask) continue;

    const box = item.box_2d;
    const label = item.label;

    // Convert normalized bbox (0-1000) to absolute pixel coordinates.
    const absY0 = Math.round((box[0] / 1000) * imgHeight);
    const absX0 = Math.round((box[1] / 1000) * imgWidth);
    const absY1 = Math.round((box[2] / 1000) * imgHeight);
    const absX1 = Math.round((box[3] / 1000) * imgWidth);

    const bboxWidth = absX1 - absX0;
    const bboxHeight = absY1 - absY0;

    // Decode base64 mask; tolerate an optional data-URL prefix.
    const maskBase64 = item.mask.replace(/^data:image\/png;base64,/, '');
    const maskBuffer = Buffer.from(maskBase64, 'base64');

    // Load the mask PNG and draw it scaled to the bbox size.
    const maskImg = await loadImage(maskBuffer);
    const canvas = createCanvas(bboxWidth, bboxHeight);
    const ctx = canvas.getContext('2d');
    ctx.drawImage(maskImg, 0, 0, bboxWidth, bboxHeight);

    // Extract the alpha channel (every 4th byte of RGBA) as the mask values.
    const imageData = ctx.getImageData(0, 0, bboxWidth, bboxHeight);
    const maskData = new Uint8Array(bboxWidth * bboxHeight);

    for (let i = 0; i < imageData.data.length; i += 4) {
      maskData[i / 4] = imageData.data[i + 3]; // Alpha channel
    }

    // Paste the bbox-sized mask into a full-image-size buffer (row-major,
    // one byte per pixel, zero outside the box).
    const fullMask = Buffer.alloc(imgHeight * imgWidth);

    for (let y = 0; y < bboxHeight; y++) {
      for (let x = 0; x < bboxWidth; x++) {
        const srcIdx = y * bboxWidth + x;
        const dstY = absY0 + y;
        const dstX = absX0 + x;

        // Clip writes that would fall outside the image bounds.
        if (dstY < imgHeight && dstX < imgWidth) {
          const dstIdx = dstY * imgWidth + dstX;
          fullMask[dstIdx] = maskData[srcIdx];
        }
      }
    }

    masks.push(new SegmentationMask(
      absY0, absX0, absY1, absX1, fullMask, label
    ));
  }

  return masks;
}
113
+
114
// Parsing helpers consumed by core.js (and exposed for direct use/testing).
module.exports = {
  parseJson,
  parseBoundingBoxes,
  parseSegmentationMasks
};