voyageai-cli 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +193 -0
- package/package.json +37 -0
- package/src/cli.js +24 -0
- package/src/commands/embed.js +68 -0
- package/src/commands/index.js +156 -0
- package/src/commands/models.js +54 -0
- package/src/commands/rerank.js +110 -0
- package/src/commands/search.js +111 -0
- package/src/commands/store.js +186 -0
- package/src/lib/api.js +107 -0
- package/src/lib/catalog.js +26 -0
- package/src/lib/format.js +24 -0
- package/src/lib/input.js +40 -0
- package/src/lib/mongo.js +55 -0
- package/voyageai-cli-1.1.0.tgz +0 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Michael Lynn
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
# voyageai-cli
|
|
2
|
+
|
|
3
|
+
CLI for [Voyage AI](https://www.mongodb.com/docs/voyageai/) embeddings, reranking, and [MongoDB Atlas Vector Search](https://www.mongodb.com/docs/atlas/atlas-vector-search/). Pure Node.js — no Python required.
|
|
4
|
+
|
|
5
|
+
Generate embeddings, rerank search results, store vectors in Atlas, and run semantic search — all from the command line.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
npm install -g voyageai-cli
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Quick Start
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# Set your API key (get one from MongoDB Atlas → AI Models)
|
|
17
|
+
export VOYAGE_API_KEY="your-key"
|
|
18
|
+
|
|
19
|
+
# Generate an embedding
|
|
20
|
+
vai embed "What is MongoDB?"
|
|
21
|
+
|
|
22
|
+
# List available models
|
|
23
|
+
vai models
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Commands
|
|
27
|
+
|
|
28
|
+
### `vai embed` — Generate embeddings
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# Single text
|
|
32
|
+
vai embed "Hello, world"
|
|
33
|
+
|
|
34
|
+
# With options
|
|
35
|
+
vai embed "search query" --model voyage-4-large --input-type query --dimensions 512
|
|
36
|
+
|
|
37
|
+
# From a file
|
|
38
|
+
vai embed --file document.txt --input-type document
|
|
39
|
+
|
|
40
|
+
# Bulk from stdin (newline-delimited)
|
|
41
|
+
cat texts.txt | vai embed
|
|
42
|
+
|
|
43
|
+
# Raw array output
|
|
44
|
+
vai embed "hello" --output-format array
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### `vai rerank` — Rerank documents by relevance
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
# Inline documents
|
|
51
|
+
vai rerank --query "database performance" \
|
|
52
|
+
--documents "MongoDB is fast" "Redis is cached" "SQL is relational"
|
|
53
|
+
|
|
54
|
+
# From a file (JSON array or newline-delimited)
|
|
55
|
+
vai rerank --query "best database" --documents-file candidates.json --top-k 3
|
|
56
|
+
|
|
57
|
+
# Different model
|
|
58
|
+
vai rerank --query "query" --documents "doc1" "doc2" --model rerank-2.5-lite
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### `vai store` — Embed and insert into MongoDB Atlas
|
|
62
|
+
|
|
63
|
+
Requires `MONGODB_URI` environment variable.
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
# Single document with metadata
|
|
67
|
+
vai store --db myapp --collection docs --field embedding \
|
|
68
|
+
--text "MongoDB Atlas is a cloud database" \
|
|
69
|
+
--metadata '{"source": "docs", "category": "product"}'
|
|
70
|
+
|
|
71
|
+
# From a file
|
|
72
|
+
vai store --db myapp --collection docs --field embedding \
|
|
73
|
+
--file article.txt
|
|
74
|
+
|
|
75
|
+
# Batch from JSONL (one {"text": "...", "metadata": {...}} per line)
|
|
76
|
+
vai store --db myapp --collection docs --field embedding \
|
|
77
|
+
--file documents.jsonl
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### `vai search` — Vector similarity search
|
|
81
|
+
|
|
82
|
+
Requires `MONGODB_URI` environment variable.
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
# Basic search
|
|
86
|
+
vai search --query "cloud database" \
|
|
87
|
+
--db myapp --collection docs \
|
|
88
|
+
--index vector_index --field embedding
|
|
89
|
+
|
|
90
|
+
# With pre-filter and limit
|
|
91
|
+
vai search --query "performance tuning" \
|
|
92
|
+
--db myapp --collection docs \
|
|
93
|
+
--index vector_index --field embedding \
|
|
94
|
+
--filter '{"category": "guides"}' --limit 5
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### `vai index` — Manage Atlas Vector Search indexes
|
|
98
|
+
|
|
99
|
+
Requires `MONGODB_URI` environment variable.
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
# Create an index
|
|
103
|
+
vai index create --db myapp --collection docs --field embedding \
|
|
104
|
+
--dimensions 1024 --similarity cosine --index-name my_index
|
|
105
|
+
|
|
106
|
+
# List indexes
|
|
107
|
+
vai index list --db myapp --collection docs
|
|
108
|
+
|
|
109
|
+
# Delete an index
|
|
110
|
+
vai index delete --db myapp --collection docs --index-name my_index
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### `vai models` — List available models
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
# All models
|
|
117
|
+
vai models
|
|
118
|
+
|
|
119
|
+
# Filter by type
|
|
120
|
+
vai models --type embedding
|
|
121
|
+
vai models --type reranking
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Full Pipeline Example
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
export VOYAGE_API_KEY="your-key"
|
|
128
|
+
export MONGODB_URI="mongodb+srv://user:pass@cluster.mongodb.net/"
|
|
129
|
+
|
|
130
|
+
# 1. Store documents with embeddings
|
|
131
|
+
vai store --db myapp --collection articles --field embedding \
|
|
132
|
+
--text "MongoDB Atlas provides a fully managed cloud database" \
|
|
133
|
+
--metadata '{"title": "Atlas Overview"}'
|
|
134
|
+
|
|
135
|
+
vai store --db myapp --collection articles --field embedding \
|
|
136
|
+
--text "Vector search enables semantic similarity matching" \
|
|
137
|
+
--metadata '{"title": "Vector Search Guide"}'
|
|
138
|
+
|
|
139
|
+
# 2. Create a vector search index
|
|
140
|
+
vai index create --db myapp --collection articles --field embedding \
|
|
141
|
+
--dimensions 1024 --similarity cosine --index-name article_search
|
|
142
|
+
|
|
143
|
+
# 3. Search (wait ~60s for index to build on small collections)
|
|
144
|
+
vai search --query "how does cloud database work" \
|
|
145
|
+
--db myapp --collection articles --index article_search --field embedding
|
|
146
|
+
|
|
147
|
+
# 4. Rerank for precision
|
|
148
|
+
vai rerank --query "how does cloud database work" \
|
|
149
|
+
--documents "MongoDB Atlas provides a fully managed cloud database" \
|
|
150
|
+
"Vector search enables semantic similarity matching"
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Environment Variables
|
|
154
|
+
|
|
155
|
+
| Variable | Required For | Description |
|
|
156
|
+
|----------|-------------|-------------|
|
|
157
|
+
| `VOYAGE_API_KEY` | embed, rerank, store, search | [Model API key](https://www.mongodb.com/docs/voyageai/management/api-keys/) from MongoDB Atlas |
|
|
158
|
+
| `MONGODB_URI` | store, search, index | MongoDB Atlas connection string |
|
|
159
|
+
|
|
160
|
+
## Global Flags
|
|
161
|
+
|
|
162
|
+
All commands support:
|
|
163
|
+
|
|
164
|
+
| Flag | Description |
|
|
165
|
+
|------|-------------|
|
|
166
|
+
| `--json` | Machine-readable JSON output |
|
|
167
|
+
| `--quiet` | Suppress non-essential output |
|
|
168
|
+
|
|
169
|
+
## Models
|
|
170
|
+
|
|
171
|
+
| Model | Type | Dimensions | Price/1M tokens | Best For |
|
|
172
|
+
|-------|------|-----------|----------------|----------|
|
|
173
|
+
| voyage-4-large | embedding | 1024 (default), 256-2048 | $0.12 | Best quality |
|
|
174
|
+
| voyage-4 | embedding | 1024 (default), 256-2048 | $0.06 | Balanced |
|
|
175
|
+
| voyage-4-lite | embedding | 1024 (default), 256-2048 | $0.02 | Lowest cost |
|
|
176
|
+
| voyage-code-3 | embedding | 1024 (default), 256-2048 | $0.18 | Code |
|
|
177
|
+
| voyage-finance-2 | embedding | 1024 | $0.12 | Finance |
|
|
178
|
+
| voyage-law-2 | embedding | 1024 | $0.12 | Legal |
|
|
179
|
+
| voyage-multimodal-3.5 | embedding | 1024 (default), 256-2048 | $0.12 + pixels | Text + images |
|
|
180
|
+
| rerank-2.5 | reranking | — | $0.05 | Best reranking |
|
|
181
|
+
| rerank-2.5-lite | reranking | — | $0.02 | Fast reranking |
|
|
182
|
+
|
|
183
|
+
Free tier: 200M tokens for most models (50M for domain-specific models). All Voyage 4 series models share the same embedding space.
|
|
184
|
+
|
|
185
|
+
## Requirements
|
|
186
|
+
|
|
187
|
+
- Node.js 18+
|
|
188
|
+
- A [MongoDB Atlas](https://www.mongodb.com/atlas) account (free tier works)
|
|
189
|
+
- A [Voyage AI model API key](https://www.mongodb.com/docs/voyageai/management/api-keys/) (created in Atlas)
|
|
190
|
+
|
|
191
|
+
## License
|
|
192
|
+
|
|
193
|
+
MIT
|
package/package.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "voyageai-cli",
|
|
3
|
+
"version": "1.1.0",
|
|
4
|
+
"description": "CLI for Voyage AI embeddings, reranking, and MongoDB Atlas Vector Search",
|
|
5
|
+
"bin": {
|
|
6
|
+
"vai": "./src/cli.js"
|
|
7
|
+
},
|
|
8
|
+
"keywords": [
|
|
9
|
+
"voyage-ai",
|
|
10
|
+
"voyageai",
|
|
11
|
+
"embeddings",
|
|
12
|
+
"vector-search",
|
|
13
|
+
"reranking",
|
|
14
|
+
"mongodb",
|
|
15
|
+
"atlas",
|
|
16
|
+
"semantic-search",
|
|
17
|
+
"rag",
|
|
18
|
+
"cli"
|
|
19
|
+
],
|
|
20
|
+
"author": "Michael Lynn",
|
|
21
|
+
"license": "MIT",
|
|
22
|
+
"repository": {
|
|
23
|
+
"type": "git",
|
|
24
|
+
"url": "https://github.com/mrlynn/voyageai-cli"
|
|
25
|
+
},
|
|
26
|
+
"homepage": "https://github.com/mrlynn/voyageai-cli#readme",
|
|
27
|
+
"bugs": {
|
|
28
|
+
"url": "https://github.com/mrlynn/voyageai-cli/issues"
|
|
29
|
+
},
|
|
30
|
+
"engines": {
|
|
31
|
+
"node": ">=18.0.0"
|
|
32
|
+
},
|
|
33
|
+
"dependencies": {
|
|
34
|
+
"commander": "^12.0.0",
|
|
35
|
+
"mongodb": "^6.0.0"
|
|
36
|
+
}
|
|
37
|
+
}
|
package/src/cli.js
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
const { program } = require('commander');
|
|
5
|
+
const { registerEmbed } = require('./commands/embed');
|
|
6
|
+
const { registerRerank } = require('./commands/rerank');
|
|
7
|
+
const { registerStore } = require('./commands/store');
|
|
8
|
+
const { registerSearch } = require('./commands/search');
|
|
9
|
+
const { registerIndex } = require('./commands/index');
|
|
10
|
+
const { registerModels } = require('./commands/models');
|
|
11
|
+
|
|
12
|
+
// Take the version from package.json so `vai --version` always matches the
// published package instead of a hand-maintained literal that can go stale.
const { version } = require('../package.json');

program
  .name('vai')
  .description('Voyage AI embeddings, reranking, and Atlas Vector Search CLI')
  .version(version);

// Attach every subcommand to the shared Commander program.
registerEmbed(program);
registerRerank(program);
registerStore(program);
registerSearch(program);
registerIndex(program);
registerModels(program);

// The command actions are async, so use parseAsync() (Commander's entry point
// for async handlers) and report anything that escapes a handler's own
// try/catch instead of leaving a floating promise.
program.parseAsync().catch((err) => {
  console.error(`Error: ${err.message}`);
  process.exit(1);
});
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { DEFAULT_EMBED_MODEL } = require('../lib/catalog');
|
|
4
|
+
const { generateEmbeddings } = require('../lib/api');
|
|
5
|
+
const { resolveTextInput } = require('../lib/input');
|
|
6
|
+
|
|
7
|
+
/**
 * Register the embed command on a Commander program.
 *
 * The action resolves the input text(s) with resolveTextInput, requests
 * embeddings with generateEmbeddings, and prints them in one of three forms:
 * raw array(s) (`--output-format array`), the full API response (`--json`),
 * or a short human-readable summary with a preview of each vector.
 * Any failure is printed to stderr and the process exits with code 1.
 *
 * @param {import('commander').Command} program
 */
function registerEmbed(program) {
  program
    .command('embed [text]')
    .description('Generate embeddings for text')
    .option('-m, --model <model>', 'Embedding model', DEFAULT_EMBED_MODEL)
    .option('-t, --input-type <type>', 'Input type: query or document')
    .option('-d, --dimensions <n>', 'Output dimensions', (v) => parseInt(v, 10))
    .option('-f, --file <path>', 'Read text from file')
    .option('-o, --output-format <format>', 'Output format: json or array', 'json')
    .option('--json', 'Machine-readable JSON output')
    .option('-q, --quiet', 'Suppress non-essential output')
    .action(async (text, opts) => {
      try {
        // parseInt() yields NaN for non-numeric input; fail loudly instead of
        // silently forwarding a meaningless dimension to the API.
        if (
          opts.dimensions !== undefined &&
          !(Number.isInteger(opts.dimensions) && opts.dimensions > 0)
        ) {
          throw new Error('--dimensions must be a positive integer');
        }

        const texts = await resolveTextInput(text, opts.file);

        const result = await generateEmbeddings(texts, {
          model: opts.model,
          inputType: opts.inputType,
          dimensions: opts.dimensions,
        });

        // Raw vectors only: a single vector for one input, an array of vectors otherwise.
        if (opts.outputFormat === 'array') {
          if (result.data.length === 1) {
            console.log(JSON.stringify(result.data[0].embedding));
          } else {
            console.log(JSON.stringify(result.data.map(d => d.embedding)));
          }
          return;
        }

        if (opts.json) {
          console.log(JSON.stringify(result, null, 2));
          return;
        }

        // Friendly output
        if (!opts.quiet) {
          console.log(`Model: ${result.model}`);
          console.log(`Texts: ${result.data.length}`);
          if (result.usage) {
            console.log(`Tokens: ${result.usage.total_tokens}`);
          }
          console.log(`Dimensions: ${result.data[0]?.embedding?.length || 'N/A'}`);
          console.log('');
        }

        // The per-vector preview (first five components) is printed even with --quiet.
        for (const item of result.data) {
          const preview = item.embedding.slice(0, 5).map(v => v.toFixed(6)).join(', ');
          console.log(`[${item.index}] [${preview}, ...] (${item.embedding.length} dims)`);
        }
      } catch (err) {
        console.error(`Error: ${err.message}`);
        process.exit(1);
      }
    });
}
|
|
67
|
+
|
|
68
|
+
module.exports = { registerEmbed };
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { DEFAULT_DIMENSIONS } = require('../lib/catalog');
|
|
4
|
+
const { getMongoCollection } = require('../lib/mongo');
|
|
5
|
+
|
|
6
|
+
/**
 * Register the index command (with create, list, delete subcommands) on a Commander program.
 *
 * Each subcommand obtains a collection through getMongoCollection, performs one
 * search-index operation on it, prints the outcome (JSON with `--json`,
 * otherwise human-readable unless `--quiet`), and closes the client in a
 * `finally` block. Errors are printed to stderr and exit the process with code 1.
 *
 * @param {import('commander').Command} program
 */
function registerIndex(program) {
  const indexCmd = program
    .command('index')
    .description('Manage Atlas Vector Search indexes');

  // ── index create ──
  indexCmd
    .command('create')
    .description('Create a vector search index')
    .requiredOption('--db <database>', 'Database name')
    .requiredOption('--collection <name>', 'Collection name')
    .requiredOption('--field <name>', 'Embedding field name')
    .option('-d, --dimensions <n>', 'Vector dimensions', (v) => parseInt(v, 10), DEFAULT_DIMENSIONS)
    .option('-s, --similarity <type>', 'Similarity function: cosine, dotProduct, euclidean', 'cosine')
    .option('-n, --index-name <name>', 'Index name', 'default')
    .option('--json', 'Machine-readable JSON output')
    .option('-q, --quiet', 'Suppress non-essential output')
    .action(async (opts) => {
      let client;
      try {
        const { client: c, collection } = await getMongoCollection(opts.db, opts.collection);
        client = c;

        // Resolve the dimension count once (falling back to the default when
        // the option parsed to NaN or 0) so the index definition and the
        // printed summary always report the same number.
        const numDimensions = Number.parseInt(opts.dimensions, 10) || DEFAULT_DIMENSIONS;

        const indexDef = {
          name: opts.indexName,
          type: 'vectorSearch',
          definition: {
            fields: [
              {
                type: 'vector',
                path: opts.field,
                numDimensions,
                similarity: opts.similarity,
              },
            ],
          },
        };

        const result = await collection.createSearchIndex(indexDef);

        if (opts.json) {
          console.log(JSON.stringify({ indexName: result, definition: indexDef }, null, 2));
        } else if (!opts.quiet) {
          console.log(`✓ Vector search index created: "${result}"`);
          console.log(` Database: ${opts.db}`);
          console.log(` Collection: ${opts.collection}`);
          console.log(` Field: ${opts.field}`);
          console.log(` Dimensions: ${numDimensions}`);
          console.log(` Similarity: ${opts.similarity}`);
          console.log('');
          console.log('Note: Index may take a few minutes to become ready.');
        }
      } catch (err) {
        // Give a targeted hint for the duplicate-name case; everything else is reported verbatim.
        if (err.message && err.message.includes('already exists')) {
          console.error(`Error: Index "${opts.indexName}" already exists on ${opts.db}.${opts.collection}`);
          console.error('Use a different --index-name or delete the existing index first.');
        } else {
          console.error(`Error: ${err.message}`);
        }
        process.exit(1);
      } finally {
        if (client) await client.close();
      }
    });

  // ── index list ──
  indexCmd
    .command('list')
    .description('List all search indexes on a collection')
    .requiredOption('--db <database>', 'Database name')
    .requiredOption('--collection <name>', 'Collection name')
    .option('--json', 'Machine-readable JSON output')
    .option('-q, --quiet', 'Suppress non-essential output')
    .action(async (opts) => {
      let client;
      try {
        const { client: c, collection } = await getMongoCollection(opts.db, opts.collection);
        client = c;

        const indexes = await collection.listSearchIndexes().toArray();

        if (opts.json) {
          console.log(JSON.stringify(indexes, null, 2));
          return;
        }

        if (indexes.length === 0) {
          console.log(`No search indexes found on ${opts.db}.${opts.collection}`);
          return;
        }

        // --quiet suppresses only the heading; the index entries are always printed.
        if (!opts.quiet) {
          console.log(`Search indexes on ${opts.db}.${opts.collection}:`);
          console.log('');
        }

        for (const idx of indexes) {
          console.log(` Name: ${idx.name}`);
          console.log(` Type: ${idx.type || 'N/A'}`);
          console.log(` Status: ${idx.status || 'N/A'}`);
          if (idx.latestDefinition) {
            console.log(` Fields: ${JSON.stringify(idx.latestDefinition.fields || [])}`);
          }
          console.log('');
        }
      } catch (err) {
        console.error(`Error: ${err.message}`);
        process.exit(1);
      } finally {
        if (client) await client.close();
      }
    });

  // ── index delete ──
  indexCmd
    .command('delete')
    .description('Drop a search index')
    .requiredOption('--db <database>', 'Database name')
    .requiredOption('--collection <name>', 'Collection name')
    .requiredOption('-n, --index-name <name>', 'Index name to delete')
    .option('--json', 'Machine-readable JSON output')
    .option('-q, --quiet', 'Suppress non-essential output')
    .action(async (opts) => {
      let client;
      try {
        const { client: c, collection } = await getMongoCollection(opts.db, opts.collection);
        client = c;

        await collection.dropSearchIndex(opts.indexName);

        if (opts.json) {
          console.log(JSON.stringify({ dropped: opts.indexName }, null, 2));
        } else if (!opts.quiet) {
          console.log(`✓ Dropped search index: "${opts.indexName}"`);
          console.log(` Database: ${opts.db}`);
          console.log(` Collection: ${opts.collection}`);
        }
      } catch (err) {
        console.error(`Error: ${err.message}`);
        process.exit(1);
      } finally {
        if (client) await client.close();
      }
    });
}
|
|
155
|
+
|
|
156
|
+
module.exports = { registerIndex };
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { MODEL_CATALOG } = require('../lib/catalog');
|
|
4
|
+
const { API_BASE } = require('../lib/api');
|
|
5
|
+
const { formatTable } = require('../lib/format');
|
|
6
|
+
|
|
7
|
+
/**
 * Register the models command on a Commander program.
 *
 * The command prints entries of MODEL_CATALOG, optionally restricted to one
 * type, either as JSON (`--json`) or as a table built by formatTable with a
 * heading and footer that `--quiet` suppresses.
 *
 * @param {import('commander').Command} program
 */
function registerModels(program) {
  // Column titles for the table view, in the order the row cells are emitted.
  const columns = ['Model', 'Type', 'Context', 'Dimensions', 'Price', 'Best For'];

  const listModels = (opts) => {
    const wantedType = opts.type;
    const selected = wantedType === 'all'
      ? MODEL_CATALOG
      : MODEL_CATALOG.filter((entry) => entry.type === wantedType);

    if (opts.json) {
      console.log(JSON.stringify(selected, null, 2));
      return;
    }

    if (selected.length === 0) {
      console.log(`No models found for type: ${wantedType}`);
      return;
    }

    const verbose = !opts.quiet;

    if (verbose) {
      console.log('Voyage AI Models');
      console.log(`(via MongoDB AI API — ${API_BASE})`);
      console.log('');
    }

    const tableRows = selected.map((entry) => [
      entry.name,
      entry.type,
      entry.context,
      entry.dimensions,
      entry.price,
      entry.bestFor,
    ]);
    console.log(formatTable(columns, tableRows));

    if (verbose) {
      console.log('');
      console.log('Free tier: 200M tokens (most models), 50M (domain-specific)');
      console.log('All 4-series models share the same embedding space.');
    }
  };

  program
    .command('models')
    .description('List available Voyage AI models')
    .option('-t, --type <type>', 'Filter by type: embedding, reranking, or all', 'all')
    .option('--json', 'Machine-readable JSON output')
    .option('-q, --quiet', 'Suppress non-essential output')
    .action(listModels);
}
|
|
53
|
+
|
|
54
|
+
module.exports = { registerModels };
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const fs = require('fs');
|
|
4
|
+
const { DEFAULT_RERANK_MODEL } = require('../lib/catalog');
|
|
5
|
+
const { apiRequest } = require('../lib/api');
|
|
6
|
+
|
|
7
|
+
/**
 * Turn raw file/stdin content into a list of documents to rerank.
 *
 * - JSON array: strings are used as-is, objects with a truthy `text` property
 *   contribute that property, anything else is re-serialized with JSON.stringify.
 * - Any other JSON value: becomes a single document.
 * - Not JSON: treated as newline-delimited text with blank lines dropped.
 *
 * Only a JSON.parse failure triggers the newline fallback, so an unusual item
 * (e.g. `null`) inside valid JSON can no longer cause the whole input to be
 * re-read as plain text.
 *
 * @param {string} raw - Input content (callers pass it already trimmed).
 * @returns {Array} Documents in input order.
 */
function parseRerankDocuments(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return raw.split('\n').filter(line => line.trim());
  }

  if (!Array.isArray(parsed)) {
    return [typeof parsed === 'string' ? parsed : JSON.stringify(parsed)];
  }

  return parsed.map(item => {
    if (typeof item === 'string') return item;
    // Guard against null before reading `.text`.
    if (item && item.text) return item.text;
    return JSON.stringify(item);
  });
}

/**
 * Register the rerank command on a Commander program.
 *
 * Documents come from `--documents`, from `--documents-file` (which takes
 * precedence when both are given), or from piped stdin when neither supplied
 * any. The query, documents, model and optional top_k are posted to `/rerank`
 * through apiRequest, and the response is printed as JSON (`--json`) or as one
 * scored line per result. Errors are printed to stderr and exit with code 1.
 *
 * @param {import('commander').Command} program
 */
function registerRerank(program) {
  program
    .command('rerank')
    .description('Rerank documents against a query')
    .requiredOption('--query <text>', 'Search query')
    .option('--documents <docs...>', 'Documents to rerank')
    .option('--documents-file <path>', 'File with documents (JSON array or newline-delimited)')
    .option('-m, --model <model>', 'Reranking model', DEFAULT_RERANK_MODEL)
    .option('-k, --top-k <n>', 'Return top K results', (v) => parseInt(v, 10))
    .option('--json', 'Machine-readable JSON output')
    .option('-q, --quiet', 'Suppress non-essential output')
    .action(async (opts) => {
      try {
        let documents = opts.documents;

        if (opts.documentsFile) {
          const content = fs.readFileSync(opts.documentsFile, 'utf-8').trim();
          documents = parseRerankDocuments(content);
        }

        // Also support stdin for documents
        if (!documents && !process.stdin.isTTY) {
          const chunks = [];
          for await (const chunk of process.stdin) {
            chunks.push(chunk);
          }
          const input = Buffer.concat(chunks).toString('utf-8').trim();
          // Same parsing rules as --documents-file, including non-array JSON.
          documents = parseRerankDocuments(input);
        }

        if (!documents || documents.length === 0) {
          console.error('Error: No documents provided. Use --documents, --documents-file, or pipe via stdin.');
          process.exit(1);
        }

        const body = {
          query: opts.query,
          documents,
          model: opts.model,
        };
        // top_k is only sent when a usable (truthy) number was given.
        if (opts.topK) {
          body.top_k = opts.topK;
        }

        const result = await apiRequest('/rerank', body);

        if (opts.json) {
          console.log(JSON.stringify(result, null, 2));
          return;
        }

        if (!opts.quiet) {
          console.log(`Model: ${result.model}`);
          console.log(`Query: "${opts.query}"`);
          console.log(`Results: ${result.data?.length || 0}`);
          if (result.usage) {
            console.log(`Tokens: ${result.usage.total_tokens}`);
          }
          console.log('');
        }

        if (result.data) {
          for (const item of result.data) {
            // item.index refers back into the submitted documents array; show at most 80 characters.
            const docPreview = documents[item.index].substring(0, 80);
            const ellipsis = documents[item.index].length > 80 ? '...' : '';
            console.log(`[${item.index}] Score: ${item.relevance_score.toFixed(6)} "${docPreview}${ellipsis}"`);
          }
        }
      } catch (err) {
        console.error(`Error: ${err.message}`);
        process.exit(1);
      }
    });
}
|
|
109
|
+
|
|
110
|
+
module.exports = { registerRerank };
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { DEFAULT_EMBED_MODEL } = require('../lib/catalog');
|
|
4
|
+
const { generateEmbeddings } = require('../lib/api');
|
|
5
|
+
const { getMongoCollection } = require('../lib/mongo');
|
|
6
|
+
|
|
7
|
+
/**
 * Register the search command on a Commander program.
 *
 * The action embeds the query text with generateEmbeddings, runs a
 * `$vectorSearch` aggregation on the requested collection, strips the
 * embedding field from every hit, and prints the hits as JSON (`--json`) or as
 * a short per-result summary. The MongoDB client is closed in a `finally`
 * block; errors are printed to stderr and exit the process with code 1.
 *
 * @param {import('commander').Command} program
 */
function registerSearch(program) {
  program
    .command('search')
    .description('Vector search against Atlas collection')
    .requiredOption('--query <text>', 'Search query text')
    .requiredOption('--db <database>', 'Database name')
    .requiredOption('--collection <name>', 'Collection name')
    .requiredOption('--index <name>', 'Vector search index name')
    .requiredOption('--field <name>', 'Embedding field name')
    .option('-m, --model <model>', 'Embedding model', DEFAULT_EMBED_MODEL)
    .option('--input-type <type>', 'Input type for query embedding', 'query')
    .option('-d, --dimensions <n>', 'Output dimensions', (v) => parseInt(v, 10))
    .option('-l, --limit <n>', 'Maximum results', (v) => parseInt(v, 10), 10)
    .option('--min-score <n>', 'Minimum similarity score', parseFloat)
    .option('--num-candidates <n>', 'Number of candidates for ANN search', (v) => parseInt(v, 10))
    .option('--filter <json>', 'Pre-filter JSON for $vectorSearch (e.g. \'{"category": "docs"}\')')
    .option('--json', 'Machine-readable JSON output')
    .option('-q, --quiet', 'Suppress non-essential output')
    .action(async (opts) => {
      let client;
      try {
        // Validate --filter first: a malformed filter should fail before the
        // embedding request is made and before a MongoDB connection is opened.
        let filter;
        if (opts.filter) {
          try {
            filter = JSON.parse(opts.filter);
          } catch {
            throw new Error('Invalid filter JSON. Ensure it is valid JSON.');
          }
        }

        const embedResult = await generateEmbeddings([opts.query], {
          model: opts.model,
          inputType: opts.inputType,
          dimensions: opts.dimensions,
        });

        const queryVector = embedResult.data[0].embedding;
        // Default candidate pool: 15x the result limit, capped at 10000.
        const numCandidates = opts.numCandidates || Math.min(opts.limit * 15, 10000);

        const { client: c, collection } = await getMongoCollection(opts.db, opts.collection);
        client = c;

        const vectorSearchStage = {
          index: opts.index,
          path: opts.field,
          queryVector,
          numCandidates,
          limit: opts.limit,
        };

        // Add pre-filter if provided
        if (opts.filter) {
          vectorSearchStage.filter = filter;
        }

        const pipeline = [
          { $vectorSearch: vectorSearchStage },
          { $addFields: { score: { $meta: 'vectorSearchScore' } } },
          ...(opts.minScore ? [{ $match: { score: { $gte: opts.minScore } } }] : []),
        ];

        const results = await collection.aggregate(pipeline).toArray();

        // Drop the stored embedding from each hit so the output stays readable.
        const cleanResults = results.map(doc => {
          const clean = { ...doc };
          delete clean[opts.field];
          return clean;
        });

        if (opts.json) {
          console.log(JSON.stringify(cleanResults, null, 2));
          return;
        }

        if (!opts.quiet) {
          console.log(`Query: "${opts.query}"`);
          console.log(`Results: ${cleanResults.length}`);
          console.log('');
        }

        if (cleanResults.length === 0) {
          console.log('No results found.');
          return;
        }

        for (let i = 0; i < cleanResults.length; i++) {
          const doc = cleanResults[i];
          const score = doc.score?.toFixed(6) || 'N/A';
          console.log(`── Result ${i + 1} (score: ${score}) ──`);
          // Preview at most 200 characters of the document's `text` field, if it has one.
          const textPreview = doc.text ? doc.text.substring(0, 200) : 'No text field';
          const ellipsis = doc.text && doc.text.length > 200 ? '...' : '';
          console.log(` ${textPreview}${ellipsis}`);
          console.log(` _id: ${doc._id}`);
          console.log('');
        }
      } catch (err) {
        console.error(`Error: ${err.message}`);
        process.exit(1);
      } finally {
        if (client) await client.close();
      }
    });
}
|
|
110
|
+
|
|
111
|
+
module.exports = { registerSearch };
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const fs = require('fs');
|
|
4
|
+
const { DEFAULT_EMBED_MODEL } = require('../lib/catalog');
|
|
5
|
+
const { generateEmbeddings } = require('../lib/api');
|
|
6
|
+
const { resolveTextInput } = require('../lib/input');
|
|
7
|
+
const { getMongoCollection } = require('../lib/mongo');
|
|
8
|
+
|
|
9
|
+
/**
 * Register the `store` command on a Commander program.
 *
 * The command embeds one text (taken from --text, --file, or whatever
 * resolveTextInput returns) and inserts a single document into the target
 * MongoDB collection. When --file ends in `.jsonl` the work is delegated to
 * handleBatchStore instead.
 *
 * @param {import('commander').Command} program
 */
function registerStore(program) {
  program
    .command('store')
    .description('Embed text and store in MongoDB Atlas')
    .requiredOption('--db <database>', 'Database name')
    .requiredOption('--collection <name>', 'Collection name')
    .requiredOption('--field <name>', 'Embedding field name')
    .option('--text <text>', 'Text to embed and store')
    .option('-f, --file <path>', 'File to embed and store (text file or .jsonl for batch mode)')
    .option('-m, --model <model>', 'Embedding model', DEFAULT_EMBED_MODEL)
    .option('--input-type <type>', 'Input type: query or document', 'document')
    .option('-d, --dimensions <n>', 'Output dimensions', (v) => parseInt(v, 10))
    .option('--metadata <json>', 'Additional metadata as JSON')
    .option('--json', 'Machine-readable JSON output')
    .option('-q, --quiet', 'Suppress non-essential output')
    .action(async (opts) => {
      let client;
      try {
        // Batch mode: .jsonl file
        if (opts.file && opts.file.endsWith('.jsonl')) {
          await handleBatchStore(opts);
          return;
        }

        // Validate --metadata before any network call, so a JSON typo fails
        // fast instead of after an embedding request has already been made.
        let metadata = null;
        if (opts.metadata) {
          try {
            metadata = JSON.parse(opts.metadata);
          } catch {
            console.error('Error: Invalid metadata JSON. Ensure it is valid JSON.');
            process.exit(1);
          }
          // Only a JSON object can be merged into the document; arrays and
          // scalars would otherwise be spread into index keys or ignored.
          if (metadata === null || typeof metadata !== 'object' || Array.isArray(metadata)) {
            console.error('Error: --metadata must be a JSON object, e.g. \'{"source": "docs"}\'.');
            process.exit(1);
          }
        }

        // Only the first entry returned by resolveTextInput is stored.
        const texts = await resolveTextInput(opts.text, opts.file);
        const textContent = texts[0];

        const embedResult = await generateEmbeddings([textContent], {
          model: opts.model,
          inputType: opts.inputType,
          dimensions: opts.dimensions,
        });

        const embedding = embedResult.data[0].embedding;

        // Metadata is spread last, so its keys win over the built-in fields.
        const doc = {
          text: textContent,
          [opts.field]: embedding,
          model: opts.model || DEFAULT_EMBED_MODEL,
          dimensions: embedding.length,
          createdAt: new Date(),
          ...metadata,
        };

        const { client: c, collection } = await getMongoCollection(opts.db, opts.collection);
        client = c;
        const result = await collection.insertOne(doc);

        if (opts.json) {
          console.log(JSON.stringify({
            insertedId: result.insertedId,
            dimensions: embedding.length,
            model: doc.model,
            tokens: embedResult.usage?.total_tokens,
          }, null, 2));
        } else if (!opts.quiet) {
          console.log(`✓ Stored document: ${result.insertedId}`);
          console.log(` Database: ${opts.db}`);
          console.log(` Collection: ${opts.collection}`);
          console.log(` Field: ${opts.field}`);
          console.log(` Dimensions: ${embedding.length}`);
          console.log(` Model: ${doc.model}`);
          if (embedResult.usage) {
            console.log(` Tokens: ${embedResult.usage.total_tokens}`);
          }
        }
      } catch (err) {
        console.error(`Error: ${err.message}`);
        process.exit(1);
      } finally {
        if (client) await client.close();
      }
    });
}
|
|
96
|
+
|
|
97
|
+
/**
 * Handle batch store from a .jsonl file.
 * Each line: {"text": "...", "metadata": {...}}
 *
 * Blank lines are skipped. Every remaining line must parse as JSON and carry
 * a truthy "text" field; the first offending line is reported with its real
 * line number in the file and the process exits with code 1. All texts are
 * embedded in one generateEmbeddings call and inserted with insertMany.
 * Only per-line "metadata" is merged into documents; opts.metadata is not
 * read here.
 *
 * @param {object} opts - Command options
 */
async function handleBatchStore(opts) {
  let client;
  try {
    const content = fs.readFileSync(opts.file, 'utf-8');

    // Pair every non-blank line with its 1-based position in the file so that
    // error messages stay accurate even when blank lines are present.
    const entries = content
      .split('\n')
      .map((line, index) => ({ line, lineNumber: index + 1 }))
      .filter(({ line }) => line.trim());

    if (entries.length === 0) {
      console.error('Error: JSONL file is empty.');
      process.exit(1);
    }

    const records = entries.map(({ line, lineNumber }) => {
      let record;
      try {
        record = JSON.parse(line);
      } catch (e) {
        console.error(`Error: Invalid JSON on line ${lineNumber}: ${e.message}`);
        process.exit(1);
      }
      // `null` is valid JSON, so guard before touching `.text`.
      if (!record || !record.text) {
        console.error(`Error: Each JSONL line must have a "text" field (line ${lineNumber}).`);
        process.exit(1);
      }
      return record;
    });

    const texts = records.map((record) => record.text);

    if (!opts.quiet) {
      console.log(`Embedding ${texts.length} documents...`);
    }

    const embedResult = await generateEmbeddings(texts, {
      model: opts.model,
      inputType: opts.inputType,
      dimensions: opts.dimensions,
    });

    // embedResult.data[i] is taken as the embedding for records[i].
    const docs = records.map((record, i) => {
      const embedding = embedResult.data[i].embedding;
      const doc = {
        text: record.text,
        [opts.field]: embedding,
        model: opts.model || DEFAULT_EMBED_MODEL,
        dimensions: embedding.length,
        createdAt: new Date(),
      };
      if (record.metadata) {
        // Per-line metadata keys override the built-in fields on conflict.
        Object.assign(doc, record.metadata);
      }
      return doc;
    });

    const { client: c, collection } = await getMongoCollection(opts.db, opts.collection);
    client = c;
    const result = await collection.insertMany(docs);

    if (opts.json) {
      console.log(JSON.stringify({
        insertedCount: result.insertedCount,
        insertedIds: result.insertedIds,
        dimensions: docs[0]?.dimensions,
        model: opts.model || DEFAULT_EMBED_MODEL,
        tokens: embedResult.usage?.total_tokens,
      }, null, 2));
    } else if (!opts.quiet) {
      console.log(`✓ Stored ${result.insertedCount} documents`);
      console.log(` Database: ${opts.db}`);
      console.log(` Collection: ${opts.collection}`);
      console.log(` Field: ${opts.field}`);
      console.log(` Dimensions: ${docs[0]?.dimensions}`);
      console.log(` Model: ${opts.model || DEFAULT_EMBED_MODEL}`);
      if (embedResult.usage) {
        console.log(` Tokens: ${embedResult.usage.total_tokens}`);
      }
    }
  } catch (err) {
    console.error(`Error: ${err.message}`);
    process.exit(1);
  } finally {
    if (client) await client.close();
  }
}
|
|
185
|
+
|
|
186
|
+
module.exports = { registerStore };
|
package/src/lib/api.js
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const API_BASE = 'https://ai.mongodb.com/v1';
|
|
4
|
+
const MAX_RETRIES = 3;
|
|
5
|
+
|
|
6
|
+
/**
 * Return the Voyage API key from the VOYAGE_API_KEY environment variable.
 * When the variable is unset or empty, print setup instructions to stderr
 * and exit the process with code 1.
 * @returns {string}
 */
function requireApiKey() {
  const { VOYAGE_API_KEY: apiKey } = process.env;
  if (apiKey) {
    return apiKey;
  }

  const helpLines = [
    'Error: VOYAGE_API_KEY environment variable is not set.',
    '',
    'Get one from MongoDB Atlas → AI Models → Create model API key',
    'Then: export VOYAGE_API_KEY="your-key-here"',
  ];
  for (const line of helpLines) {
    console.error(line);
  }
  process.exit(1);
  return apiKey;
}
|
|
21
|
+
|
|
22
|
+
/**
 * Pause for a number of milliseconds.
 * @param {number} ms - Delay passed to setTimeout
 * @returns {Promise<void>} Resolves once the timer fires
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
30
|
+
|
|
31
|
+
/**
 * Make an authenticated POST request to the Voyage AI API with retry on 429.
 *
 * A 429 response is retried up to MAX_RETRIES times. The wait is taken from a
 * numeric `Retry-After` header when present; otherwise (header missing or not
 * a number of seconds, e.g. an HTTP-date) it falls back to exponential
 * backoff of 2^attempt seconds. Any other non-OK response is printed to
 * stderr and the process exits with code 1.
 *
 * @param {string} endpoint - API endpoint path (e.g., '/embeddings')
 * @param {object} body - Request body, sent as JSON
 * @returns {Promise<object>} Parsed JSON body of the successful response
 */
async function apiRequest(endpoint, body) {
  const apiKey = requireApiKey();
  const url = `${API_BASE}${endpoint}`;

  for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
    const response = await fetch(url, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`,
      },
      body: JSON.stringify(body),
    });

    if (response.status === 429 && attempt < MAX_RETRIES) {
      const retryAfterSeconds = Number.parseInt(response.headers.get('Retry-After'), 10);
      const waitMs = Number.isFinite(retryAfterSeconds) && retryAfterSeconds >= 0
        ? retryAfterSeconds * 1000
        : Math.pow(2, attempt) * 1000;
      console.error(`Rate limited (429). Retrying in ${waitMs / 1000}s... (attempt ${attempt + 1}/${MAX_RETRIES})`);
      await sleep(waitMs);
      continue;
    }

    if (!response.ok) {
      // A response body can be consumed only once, so read it as text and
      // attempt JSON parsing on the string; the raw text is the fallback.
      const rawBody = await response.text();
      let errorDetail = rawBody;
      try {
        const errBody = JSON.parse(rawBody);
        errorDetail = errBody.detail || errBody.message || errBody.error?.message || JSON.stringify(errBody);
      } catch {
        // Not JSON (or not an object): keep the raw text.
      }
      console.error(`API Error (${response.status}): ${errorDetail}`);
      process.exit(1);
    }

    return response.json();
  }
}
|
|
74
|
+
|
|
75
|
+
/**
|
|
76
|
+
* Generate embeddings for an array of texts.
|
|
77
|
+
* @param {string[]} texts - Array of texts to embed
|
|
78
|
+
* @param {object} options - Embedding options
|
|
79
|
+
* @param {string} [options.model] - Model name
|
|
80
|
+
* @param {string} [options.inputType] - Input type (query|document)
|
|
81
|
+
* @param {number} [options.dimensions] - Output dimensions
|
|
82
|
+
* @returns {Promise<object>} API response with embeddings
|
|
83
|
+
*/
|
|
84
|
+
async function generateEmbeddings(texts, options = {}) {
|
|
85
|
+
const { DEFAULT_EMBED_MODEL } = require('./catalog');
|
|
86
|
+
|
|
87
|
+
const body = {
|
|
88
|
+
input: texts,
|
|
89
|
+
model: options.model || DEFAULT_EMBED_MODEL,
|
|
90
|
+
};
|
|
91
|
+
|
|
92
|
+
if (options.inputType) {
|
|
93
|
+
body.input_type = options.inputType;
|
|
94
|
+
}
|
|
95
|
+
if (options.dimensions) {
|
|
96
|
+
body.output_dimension = options.dimensions;
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
return apiRequest('/embeddings', body);
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
module.exports = {
|
|
103
|
+
API_BASE,
|
|
104
|
+
requireApiKey,
|
|
105
|
+
apiRequest,
|
|
106
|
+
generateEmbeddings,
|
|
107
|
+
};
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Defaults used when the caller does not pick a model or dimension count.
const DEFAULT_EMBED_MODEL = 'voyage-4-large';
const DEFAULT_RERANK_MODEL = 'rerank-2.5';
const DEFAULT_DIMENSIONS = 1024;

// Dimension description shared by the models that list several output sizes.
const FLEXIBLE_DIMENSIONS = '1024 (default), 256, 512, 2048';

/**
 * Display catalog of supported models. Rows are written as
 * [name, type, context, dimensions, price, bestFor] and expanded into objects.
 * @type {Array<{name: string, type: string, context: string, dimensions: string, price: string, bestFor: string}>}
 */
const MODEL_CATALOG = [
  ['voyage-4-large', 'embedding', '32K', FLEXIBLE_DIMENSIONS, '$0.12/1M tokens', 'Best quality, multilingual'],
  ['voyage-4', 'embedding', '32K', FLEXIBLE_DIMENSIONS, '$0.06/1M tokens', 'Balanced quality/perf'],
  ['voyage-4-lite', 'embedding', '32K', FLEXIBLE_DIMENSIONS, '$0.02/1M tokens', 'Lowest cost'],
  ['voyage-code-3', 'embedding', '32K', FLEXIBLE_DIMENSIONS, '$0.18/1M tokens', 'Code retrieval'],
  ['voyage-finance-2', 'embedding', '32K', '1024', '$0.12/1M tokens', 'Finance'],
  ['voyage-law-2', 'embedding', '16K', '1024', '$0.12/1M tokens', 'Legal'],
  ['voyage-context-3', 'embedding', '32K', FLEXIBLE_DIMENSIONS, '$0.18/1M tokens', 'Contextualized chunks'],
  ['voyage-multimodal-3.5', 'embedding', '32K', FLEXIBLE_DIMENSIONS, '$0.12/M + $0.60/B px', 'Text + images + video'],
  ['rerank-2.5', 'reranking', '32K', '—', '$0.05/1M tokens', 'Best quality reranking'],
  ['rerank-2.5-lite', 'reranking', '32K', '—', '$0.02/1M tokens', 'Fast reranking'],
].map(([name, type, context, dimensions, price, bestFor]) => ({
  name,
  type,
  context,
  dimensions,
  price,
  bestFor,
}));
|
|
20
|
+
|
|
21
|
+
module.exports = {
|
|
22
|
+
DEFAULT_EMBED_MODEL,
|
|
23
|
+
DEFAULT_RERANK_MODEL,
|
|
24
|
+
DEFAULT_DIMENSIONS,
|
|
25
|
+
MODEL_CATALOG,
|
|
26
|
+
};
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
 * Render headers and rows as a plain-text table using box-drawing separators.
 * Each column is as wide as its longest header or cell; falsy cells render
 * as empty strings.
 * @param {string[]} headers - Column headers
 * @param {string[][]} rows - Table rows
 * @returns {string} Header line, separator line, then one line per row
 */
function formatTable(headers, rows) {
  const widths = [];
  headers.forEach((header, col) => {
    let widestCell = 0;
    for (const row of rows) {
      widestCell = Math.max(widestCell, (row[col] || '').length);
    }
    widths.push(Math.max(header.length, widestCell));
  });

  const output = [];
  output.push(headers.map((header, col) => ` ${header.padEnd(widths[col])} `).join('│'));
  output.push(widths.map((width) => '─'.repeat(width + 2)).join('┼'));
  for (const row of rows) {
    output.push(row.map((cell, col) => ` ${(cell || '').padEnd(widths[col])} `).join('│'));
  }

  return output.join('\n');
}
|
|
23
|
+
|
|
24
|
+
module.exports = { formatTable };
|
package/src/lib/input.js
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const fs = require('fs');
|
|
4
|
+
|
|
5
|
+
/**
 * Read text input from the --file flag, the text argument, or stdin.
 *
 * Precedence: a file path wins over the text argument, which wins over piped
 * stdin. File content is trimmed and returned as a single entry; piped stdin
 * is split on newlines with blank lines dropped. If no usable input is found
 * (including an empty file or empty stdin) an error is printed and the
 * process exits with code 1.
 *
 * @param {string|undefined} textArg - Text argument from CLI
 * @param {string|undefined} filePath - File path from --file flag
 * @returns {Promise<string[]>} Array of text strings
 */
async function resolveTextInput(textArg, filePath) {
  if (filePath) {
    const content = fs.readFileSync(filePath, 'utf-8').trim();
    // Mirror the empty-stdin check below instead of handing '' to the caller.
    if (!content) {
      console.error(`Error: File "${filePath}" is empty.`);
      process.exit(1);
    }
    return [content];
  }

  if (textArg) {
    return [textArg];
  }

  // Try reading from stdin (piped input)
  if (!process.stdin.isTTY) {
    const chunks = [];
    for await (const chunk of process.stdin) {
      chunks.push(chunk);
    }
    const input = Buffer.concat(chunks).toString('utf-8').trim();
    if (!input) {
      console.error('Error: No input provided. Pass text as an argument, use --file, or pipe via stdin.');
      process.exit(1);
    }
    // Split by newlines for bulk embedding
    return input.split('\n').filter(line => line.trim());
  }

  console.error('Error: No input provided. Pass text as an argument, use --file, or pipe via stdin.');
  process.exit(1);
}
|
|
39
|
+
|
|
40
|
+
module.exports = { resolveTextInput };
|
package/src/lib/mongo.js
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
 * Return the MongoDB connection string from the MONGODB_URI environment
 * variable. When it is unset or empty, print setup instructions to stderr and
 * exit the process with code 1.
 * @returns {string}
 */
function requireMongoUri() {
  const { MONGODB_URI: mongoUri } = process.env;
  if (mongoUri) {
    return mongoUri;
  }

  const helpLines = [
    'Error: MONGODB_URI environment variable is not set.',
    '',
    'Set your Atlas connection string:',
    ' export MONGODB_URI="mongodb+srv://user:pass@cluster.mongodb.net/"',
  ];
  for (const line of helpLines) {
    console.error(line);
  }
  process.exit(1);
  return mongoUri;
}
|
|
18
|
+
|
|
19
|
+
/**
|
|
20
|
+
* Get a connected MongoDB client and target collection.
|
|
21
|
+
* Lazy-requires the mongodb driver.
|
|
22
|
+
* @param {string} db - Database name
|
|
23
|
+
* @param {string} collectionName - Collection name
|
|
24
|
+
* @returns {Promise<{client: import('mongodb').MongoClient, collection: import('mongodb').Collection}>}
|
|
25
|
+
*/
|
|
26
|
+
async function getMongoCollection(db, collectionName) {
|
|
27
|
+
const { MongoClient } = require('mongodb');
|
|
28
|
+
const uri = requireMongoUri();
|
|
29
|
+
const client = new MongoClient(uri);
|
|
30
|
+
await client.connect();
|
|
31
|
+
const collection = client.db(db).collection(collectionName);
|
|
32
|
+
return { client, collection };
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
/**
 * Run a callback against a collection on a freshly opened connection and
 * close the client afterwards, whether the callback resolves or throws.
 * @param {string} db - Database name
 * @param {string} collectionName - Collection name
 * @param {(collection: import('mongodb').Collection) => Promise<*>} fn - Function to run
 * @returns {Promise<*>} Whatever `fn` resolves to
 */
async function connectAndClose(db, collectionName, fn) {
  const connection = await getMongoCollection(db, collectionName);
  try {
    const outcome = await fn(connection.collection);
    return outcome;
  } finally {
    await connection.client.close();
  }
}
|
|
50
|
+
|
|
51
|
+
module.exports = {
|
|
52
|
+
requireMongoUri,
|
|
53
|
+
getMongoCollection,
|
|
54
|
+
connectAndClose,
|
|
55
|
+
};
|
|
Binary file
|