voyageai-cli 1.20.6 → 1.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +142 -26
- package/README.md +130 -2
- package/package.json +3 -2
- package/src/cli.js +10 -0
- package/src/commands/bug.js +249 -0
- package/src/commands/eval.js +420 -10
- package/src/commands/generate.js +220 -0
- package/src/commands/playground.js +93 -0
- package/src/commands/purge.js +271 -0
- package/src/commands/refresh.js +322 -0
- package/src/commands/scaffold.js +217 -0
- package/src/lib/codegen.js +339 -0
- package/src/lib/explanations.js +155 -0
- package/src/lib/scaffold-structure.js +114 -0
- package/src/lib/templates/nextjs/README.md.tpl +106 -0
- package/src/lib/templates/nextjs/env.example.tpl +8 -0
- package/src/lib/templates/nextjs/layout.jsx.tpl +29 -0
- package/src/lib/templates/nextjs/lib-mongo.js.tpl +111 -0
- package/src/lib/templates/nextjs/lib-voyage.js.tpl +103 -0
- package/src/lib/templates/nextjs/package.json.tpl +33 -0
- package/src/lib/templates/nextjs/page-search.jsx.tpl +147 -0
- package/src/lib/templates/nextjs/route-ingest.js.tpl +114 -0
- package/src/lib/templates/nextjs/route-search.js.tpl +97 -0
- package/src/lib/templates/nextjs/theme.js.tpl +84 -0
- package/src/lib/templates/python/README.md.tpl +145 -0
- package/src/lib/templates/python/app.py.tpl +221 -0
- package/src/lib/templates/python/chunker.py.tpl +127 -0
- package/src/lib/templates/python/env.example.tpl +12 -0
- package/src/lib/templates/python/mongo_client.py.tpl +125 -0
- package/src/lib/templates/python/requirements.txt.tpl +10 -0
- package/src/lib/templates/python/voyage_client.py.tpl +124 -0
- package/src/lib/templates/vanilla/README.md.tpl +156 -0
- package/src/lib/templates/vanilla/client.js.tpl +103 -0
- package/src/lib/templates/vanilla/connection.js.tpl +126 -0
- package/src/lib/templates/vanilla/env.example.tpl +11 -0
- package/src/lib/templates/vanilla/ingest.js.tpl +231 -0
- package/src/lib/templates/vanilla/package.json.tpl +31 -0
- package/src/lib/templates/vanilla/retrieval.js.tpl +100 -0
- package/src/lib/templates/vanilla/search-api.js.tpl +175 -0
- package/src/lib/templates/vanilla/server.js.tpl +81 -0
- package/src/lib/zip.js +130 -0
- package/src/playground/index.html +708 -3
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# {{projectName}}
|
|
2
|
+
|
|
3
|
+
A semantic search API powered by Voyage AI embeddings and MongoDB Atlas Vector Search.
|
|
4
|
+
|
|
5
|
+
## Configuration
|
|
6
|
+
|
|
7
|
+
| Setting | Value |
|
|
8
|
+
|---------|-------|
|
|
9
|
+
| Embedding Model | `{{model}}` |
|
|
10
|
+
| Dimensions | {{dimensions}} |
|
|
11
|
+
| Database | `{{db}}` |
|
|
12
|
+
| Collection | `{{collection}}` |
|
|
13
|
+
| Vector Index | `{{index}}` |
|
|
14
|
+
{{#if rerank}}
|
|
15
|
+
| Rerank Model | `{{rerankModel}}` |
|
|
16
|
+
{{/if}}
|
|
17
|
+
|
|
18
|
+
## Setup
|
|
19
|
+
|
|
20
|
+
### 1. Install dependencies
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
npm install
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### 2. Configure environment
|
|
27
|
+
|
|
28
|
+
Copy `.env.example` to `.env` and fill in your credentials:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
cp .env.example .env
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Required variables:
|
|
35
|
+
- `VOYAGE_API_KEY` - Your Voyage AI API key from [dash.voyageai.com](https://dash.voyageai.com)
|
|
36
|
+
- `MONGODB_URI` - Your MongoDB Atlas connection string
|
|
37
|
+
|
|
38
|
+
### 3. Create vector index
|
|
39
|
+
|
|
40
|
+
In MongoDB Atlas, create a vector search index on your collection:
|
|
41
|
+
|
|
42
|
+
```json
|
|
43
|
+
{
|
|
44
|
+
"fields": [
|
|
45
|
+
{
|
|
46
|
+
"type": "vector",
|
|
47
|
+
"path": "{{field}}",
|
|
48
|
+
"numDimensions": {{dimensions}},
|
|
49
|
+
"similarity": "cosine"
|
|
50
|
+
}
|
|
51
|
+
]
|
|
52
|
+
}
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Name the index `{{index}}`.
|
|
56
|
+
|
|
57
|
+
### 4. Ingest documents
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
# Ingest a directory of markdown/text files
|
|
61
|
+
npm run ingest -- ./docs
|
|
62
|
+
|
|
63
|
+
# Or ingest a single file
|
|
64
|
+
npm run ingest -- ./docs/guide.md
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### 5. Start the server
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
npm start
|
|
71
|
+
# or for development with auto-reload
|
|
72
|
+
npm run dev
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## API Endpoints
|
|
76
|
+
|
|
77
|
+
### POST /api/search
|
|
78
|
+
|
|
79
|
+
Search for relevant documents.
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
curl -X POST http://localhost:3000/api/search \
|
|
83
|
+
-H "Content-Type: application/json" \
|
|
84
|
+
-d '{"query": "How does vector search work?", "limit": 5}'
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Response:
|
|
88
|
+
```json
|
|
89
|
+
{
|
|
90
|
+
"results": [
|
|
91
|
+
{
|
|
92
|
+
"text": "Vector search uses...",
|
|
93
|
+
"score": 0.95,
|
|
94
|
+
"metadata": { "source": "docs/guide.md" }
|
|
95
|
+
}
|
|
96
|
+
],
|
|
97
|
+
"meta": {
|
|
98
|
+
"model": "{{model}}",
|
|
99
|
+
"took": 123
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### POST /api/ingest
|
|
105
|
+
|
|
106
|
+
Ingest text or a file.
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
# Ingest text
|
|
110
|
+
curl -X POST http://localhost:3000/api/ingest \
|
|
111
|
+
-H "Content-Type: application/json" \
|
|
112
|
+
-d '{"text": "Your document content...", "metadata": {"source": "api"}}'
|
|
113
|
+
|
|
114
|
+
# Ingest a file
|
|
115
|
+
curl -X POST http://localhost:3000/api/ingest \
|
|
116
|
+
-H "Content-Type: application/json" \
|
|
117
|
+
-d '{"path": "./docs/new-doc.md"}'
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### GET /api/health
|
|
121
|
+
|
|
122
|
+
Check API health and database connection.
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
curl http://localhost:3000/api/health
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Project Structure
|
|
129
|
+
|
|
130
|
+
```
|
|
131
|
+
{{projectName}}/
|
|
132
|
+
├── server.js # Express server entry point
|
|
133
|
+
├── lib/
|
|
134
|
+
│ ├── client.js # Voyage AI API client
|
|
135
|
+
│ ├── connection.js # MongoDB connection helper
|
|
136
|
+
│ ├── retrieval.js # RAG retrieval module
|
|
137
|
+
│ ├── ingest.js # Document ingestion pipeline
|
|
138
|
+
│ └── search-api.js # Express API routes
|
|
139
|
+
├── .env.example
|
|
140
|
+
├── package.json
|
|
141
|
+
└── README.md
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Chunking Configuration
|
|
145
|
+
|
|
146
|
+
Documents are chunked with the following settings:
|
|
147
|
+
|
|
148
|
+
| Setting | Value |
|
|
149
|
+
|---------|-------|
|
|
150
|
+
| Strategy | `{{chunkStrategy}}` |
|
|
151
|
+
| Chunk Size | {{chunkSize}} characters |
|
|
152
|
+
| Overlap | {{chunkOverlap}} characters |
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
Generated by [vai](https://github.com/mrlynn/voyageai-cli) v{{vaiVersion}}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Voyage AI API Client
|
|
3
|
+
* Generated by vai v{{vaiVersion}} on {{generatedAt}}
|
|
4
|
+
*
|
|
5
|
+
* Model: {{model}}
|
|
6
|
+
* Dimensions: {{dimensions}}
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
// Read both settings from the environment in one place.
const { VOYAGE_API_KEY, VOYAGE_API_URL: apiUrlOverride } = process.env;

// Base URL every request path is appended to; the environment value wins when set.
const VOYAGE_API_URL = apiUrlOverride || 'https://api.voyageai.com/v1';

// Fail at load time rather than on the first request when no key is configured.
if (!VOYAGE_API_KEY) {
  throw new Error('VOYAGE_API_KEY environment variable is required');
}
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* Generate embeddings for text(s) using Voyage AI.
|
|
18
|
+
* @param {string|string[]} input - Text or array of texts to embed
|
|
19
|
+
* @param {object} options - Optional parameters
|
|
20
|
+
* @param {string} options.model - Embedding model (default: {{model}})
|
|
21
|
+
* @param {string} options.inputType - 'document' or 'query' (default: {{inputType}})
|
|
22
|
+
* @param {number} options.outputDimension - Output dimensions (default: {{dimensions}})
|
|
23
|
+
* @returns {Promise<{embeddings: number[][], usage: {total_tokens: number}}>}
|
|
24
|
+
*/
|
|
25
|
+
async function embed(input, options = {}) {
|
|
26
|
+
const texts = Array.isArray(input) ? input : [input];
|
|
27
|
+
|
|
28
|
+
const response = await fetch(`${VOYAGE_API_URL}/embeddings`, {
|
|
29
|
+
method: 'POST',
|
|
30
|
+
headers: {
|
|
31
|
+
'Content-Type': 'application/json',
|
|
32
|
+
'Authorization': `Bearer ${VOYAGE_API_KEY}`,
|
|
33
|
+
},
|
|
34
|
+
body: JSON.stringify({
|
|
35
|
+
model: options.model || '{{model}}',
|
|
36
|
+
input: texts,
|
|
37
|
+
input_type: options.inputType || '{{inputType}}',
|
|
38
|
+
output_dimension: options.outputDimension || {{dimensions}},
|
|
39
|
+
}),
|
|
40
|
+
});
|
|
41
|
+
|
|
42
|
+
if (!response.ok) {
|
|
43
|
+
const error = await response.text();
|
|
44
|
+
throw new Error(`Voyage AI API error: ${response.status} ${error}`);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
const data = await response.json();
|
|
48
|
+
return {
|
|
49
|
+
embeddings: data.data.map(d => d.embedding),
|
|
50
|
+
usage: data.usage,
|
|
51
|
+
};
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
{{#if rerank}}
|
|
55
|
+
/**
 * Score documents against a query using the Voyage AI `/rerank` endpoint.
 *
 * The request always sets `return_documents: true`, and each response item is
 * mapped to camelCase (`relevance_score` becomes `relevanceScore`).
 *
 * @param {string} query - The query to rank against
 * @param {string[]} documents - Documents to rerank
 * @param {object} [options] - Per-call overrides
 * @param {string} [options.model] - Rerank model (default: {{rerankModel}})
 * @param {number} [options.topK] - Number of results to return; left out of
 *   the request body when undefined
 * @returns {Promise<{results: Array<{index: number, relevanceScore: number, document: string}>, usage: object}>}
 *   `usage` is passed through from the response unchanged.
 * @throws {Error} When the API answers with a non-OK status; the message
 *   contains the status code and the raw response body.
 */
async function rerank(query, documents, options = {}) {
  const requestBody = JSON.stringify({
    model: options.model || '{{rerankModel}}',
    query,
    documents,
    top_k: options.topK,
    return_documents: true,
  });

  const res = await fetch(`${VOYAGE_API_URL}/rerank`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${VOYAGE_API_KEY}`,
    },
    body: requestBody,
  });

  if (!res.ok) {
    const detail = await res.text();
    throw new Error(`Voyage AI rerank error: ${res.status} ${detail}`);
  }

  const parsed = await res.json();
  const results = parsed.data.map((item) => ({
    index: item.index,
    relevanceScore: item.relevance_score,
    document: item.document,
  }));

  return { results, usage: parsed.usage };
}
|
|
95
|
+
{{/if}}
|
|
96
|
+
|
|
97
|
+
module.exports = {
|
|
98
|
+
embed,
|
|
99
|
+
{{#if rerank}}
|
|
100
|
+
rerank,
|
|
101
|
+
{{/if}}
|
|
102
|
+
VOYAGE_API_URL,
|
|
103
|
+
};
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MongoDB Connection Helper
|
|
3
|
+
* Generated by vai v{{vaiVersion}} on {{generatedAt}}
|
|
4
|
+
*
|
|
5
|
+
* Database: {{db}}
|
|
6
|
+
* Collection: {{collection}}
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
const { MongoClient } = require('mongodb');
|
|
10
|
+
|
|
11
|
+
// Connection string used by connect(); read once when the module loads.
const { MONGODB_URI } = process.env;

// Fail at load time rather than on the first query when no URI is configured.
if (!MONGODB_URI) {
  throw new Error('MONGODB_URI environment variable is required');
}
|
|
16
|
+
|
|
17
|
+
// Cached client and database handle shared by the helpers in this module.
let client = null;
let db = null;
// In-flight connection attempt, so concurrent callers share a single MongoClient.
let connecting = null;

/**
 * Connect to MongoDB and return the database instance.
 *
 * The first successful call caches the client and database handle; later calls
 * return the cached handle. Calls made while a connection attempt is still in
 * progress wait on that same attempt instead of opening additional clients.
 * A failed attempt closes its client and leaves nothing cached, so the next
 * call starts from scratch.
 *
 * @returns {Promise<import('mongodb').Db>}
 * @throws Whatever error `MongoClient.connect()` rejects with
 */
async function connect() {
  if (db) return db;

  if (!connecting) {
    connecting = (async () => {
      const candidate = new MongoClient(MONGODB_URI);
      try {
        await candidate.connect();
      } catch (err) {
        // Best-effort cleanup of the half-open client; the connect error is
        // the one the caller needs to see, so a close failure is ignored.
        await candidate.close().catch(() => {});
        throw err;
      }

      // Publish the handles only once the connection is known to be usable.
      client = candidate;
      db = candidate.db('{{db}}');

      console.log('Connected to MongoDB: {{db}}');
      return db;
    })().finally(() => {
      // Success is served from `db` afterwards; failure must allow a retry.
      connecting = null;
    });
  }

  return connecting;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Resolve the `{{collection}}` collection, connecting first if necessary.
 * @returns {Promise<import('mongodb').Collection>}
 */
async function getCollection() {
  return (await connect()).collection('{{collection}}');
}
|
|
44
|
+
|
|
45
|
+
/**
 * Close the MongoDB client, if one is open, and clear the cached handles so
 * that a later connect() opens a fresh connection. Does nothing when no
 * client is cached.
 * @returns {Promise<void>}
 */
async function close() {
  if (!client) return;

  await client.close();
  client = null;
  db = null;
}
|
|
55
|
+
|
|
56
|
+
/**
|
|
57
|
+
* Perform a vector search on the collection.
|
|
58
|
+
* @param {number[]} embedding - Query embedding vector
|
|
59
|
+
* @param {object} options - Search options
|
|
60
|
+
* @param {number} options.limit - Number of results (default: 10)
|
|
61
|
+
* @param {number} options.numCandidates - Candidates to consider (default: limit * 10)
|
|
62
|
+
* @param {object} options.filter - Optional pre-filter
|
|
63
|
+
* @returns {Promise<Array<{document: object, score: number}>>}
|
|
64
|
+
*/
|
|
65
|
+
async function vectorSearch(embedding, options = {}) {
|
|
66
|
+
const collection = await getCollection();
|
|
67
|
+
const limit = options.limit || 10;
|
|
68
|
+
const numCandidates = options.numCandidates || limit * 10;
|
|
69
|
+
|
|
70
|
+
const pipeline = [
|
|
71
|
+
{
|
|
72
|
+
$vectorSearch: {
|
|
73
|
+
index: '{{index}}',
|
|
74
|
+
path: '{{field}}',
|
|
75
|
+
queryVector: embedding,
|
|
76
|
+
numCandidates,
|
|
77
|
+
limit,
|
|
78
|
+
{{#if filter}}
|
|
79
|
+
filter: options.filter,
|
|
80
|
+
{{/if}}
|
|
81
|
+
},
|
|
82
|
+
},
|
|
83
|
+
{
|
|
84
|
+
$project: {
|
|
85
|
+
_id: 1,
|
|
86
|
+
text: 1,
|
|
87
|
+
metadata: 1,
|
|
88
|
+
score: { $meta: 'vectorSearchScore' },
|
|
89
|
+
},
|
|
90
|
+
},
|
|
91
|
+
];
|
|
92
|
+
|
|
93
|
+
const results = await collection.aggregate(pipeline).toArray();
|
|
94
|
+
return results.map(doc => ({
|
|
95
|
+
document: doc,
|
|
96
|
+
score: doc.score,
|
|
97
|
+
}));
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
/**
 * Insert documents with embeddings.
 *
 * Each input is stored as `{ text, {{field}}, metadata, _embeddedAt, _model }`,
 * where `_embeddedAt` is the insertion time and `_model` is the embedding
 * model this project was generated for.
 *
 * @param {Array<{text: string, embedding: number[], metadata?: object}>} docs
 * @returns {Promise<{insertedCount: number}>} `insertedCount` is 0 when
 *   `docs` is empty; no database call is made in that case.
 */
async function insertDocuments(docs) {
  // insertMany rejects an empty batch, so treat "nothing to insert" as a no-op.
  if (docs.length === 0) {
    return { insertedCount: 0 };
  }

  const collection = await getCollection();

  const documents = docs.map((doc) => ({
    text: doc.text,
    // Quoted computed key: stays valid JavaScript even when the configured
    // field name is not a plain identifier.
    ['{{field}}']: doc.embedding,
    metadata: doc.metadata || {},
    _embeddedAt: new Date(),
    _model: '{{model}}',
  }));

  const result = await collection.insertMany(documents);
  return { insertedCount: result.insertedCount };
}
|
|
119
|
+
|
|
120
|
+
module.exports = {
|
|
121
|
+
connect,
|
|
122
|
+
getCollection,
|
|
123
|
+
close,
|
|
124
|
+
vectorSearch,
|
|
125
|
+
insertDocuments,
|
|
126
|
+
};
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Voyage AI API
|
|
2
|
+
VOYAGE_API_KEY=your_voyage_api_key_here
|
|
3
|
+
|
|
4
|
+
# MongoDB Atlas
|
|
5
|
+
MONGODB_URI=mongodb+srv://username:password@cluster.mongodb.net/{{db}}?retryWrites=true&w=majority
|
|
6
|
+
|
|
7
|
+
# Server
|
|
8
|
+
PORT=3000
|
|
9
|
+
|
|
10
|
+
# Optional: Override defaults
|
|
11
|
+
# VOYAGE_API_URL=https://api.voyageai.com/v1
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Ingestion Pipeline
|
|
3
|
+
* Generated by vai v{{vaiVersion}} on {{generatedAt}}
|
|
4
|
+
*
|
|
5
|
+
* Model: {{model}}
|
|
6
|
+
* Chunk Strategy: {{chunkStrategy}}
|
|
7
|
+
* Chunk Size: {{chunkSize}} characters
|
|
8
|
+
* Overlap: {{chunkOverlap}} characters
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
const fs = require('fs');
|
|
12
|
+
const path = require('path');
|
|
13
|
+
const { embed } = require('./client');
|
|
14
|
+
const { insertDocuments, close } = require('./connection');
|
|
15
|
+
|
|
16
|
+
/**
 * Split text into chunks of at most `size` characters.
 *
 * Two strategies are available:
 *   - 'fixed': a sliding window of `size` characters that advances by
 *     `size - overlap` characters.
 *   - any other value: recursive splitting that prefers paragraph, line,
 *     sentence and word boundaries (in that order) and only falls back to the
 *     sliding window for a piece that has no usable separator.
 *
 * Every chunk is trimmed and empty chunks are dropped. Overlap is applied by
 * the sliding window only.
 *
 * @param {string} text - Text to chunk; empty or missing text yields []
 * @param {object} [options] - Chunking options
 * @param {number} [options.size] - Maximum chunk length in characters
 *   (default: {{chunkSize}}); a falsy value selects the default
 * @param {number} [options.overlap] - Characters shared by consecutive window
 *   chunks (default: {{chunkOverlap}}); 0 disables overlap. Values outside
 *   0..size-1 are clamped so the window always moves forward.
 * @param {string} [options.strategy] - Chunking strategy
 *   (default: '{{chunkStrategy}}')
 * @returns {string[]} Array of chunks
 * @throws {RangeError} If the resolved chunk size is not a number >= 1
 */
function chunkText(text, options = {}) {
  if (!text) return [];

  // Generated defaults are quoted so this file is valid JavaScript both
  // before and after the placeholders are rendered.
  const size = options.size || Number('{{chunkSize}}');
  // `??` rather than `||`: an explicit overlap of 0 must not fall back to the default.
  const requestedOverlap = options.overlap ?? Number('{{chunkOverlap}}');
  const strategy = options.strategy || '{{chunkStrategy}}';

  if (!(size >= 1)) {
    throw new RangeError(`Chunk size must be a number >= 1, got ${size}`);
  }

  // An overlap >= size would stop the window from advancing (endless loop) and
  // a negative one would skip text, so clamp into 0..size-1.
  const overlap = Math.min(Math.max(requestedOverlap || 0, 0), size - 1);
  const step = size - overlap;

  // Sliding-window split used by the 'fixed' strategy and as the last resort
  // of the recursive strategy.
  function splitFixed(str) {
    const chunks = [];
    for (let start = 0; start < str.length; start += step) {
      const piece = str.slice(start, start + size).trim();
      if (piece.length > 0) chunks.push(piece);
      // Once a window reaches the end, a further window would contain nothing
      // but overlap that has already been emitted.
      if (start + size >= str.length) break;
    }
    return chunks;
  }

  if (strategy === 'fixed') {
    return splitFixed(text);
  }

  // Recursive chunking with smart boundaries, coarsest separator first.
  const separators = ['\n\n', '\n', '. ', ' '];

  function splitRecursive(str, separatorIndex = 0) {
    if (str.length <= size) {
      const trimmed = str.trim();
      return trimmed.length > 0 ? [trimmed] : [];
    }

    if (separatorIndex >= separators.length) {
      return splitFixed(str);
    }

    const separator = separators[separatorIndex];
    const chunks = [];
    let current = '';

    // Greedily pack parts into `current`; when the next part would overflow,
    // flush `current` (re-splitting it with a finer separator if needed).
    for (const part of str.split(separator)) {
      const potential = current ? current + separator + part : part;

      if (potential.length <= size) {
        current = potential;
      } else {
        if (current) chunks.push(...splitRecursive(current, separatorIndex + 1));
        current = part;
      }
    }

    if (current) {
      chunks.push(...splitRecursive(current, separatorIndex + 1));
    }

    return chunks;
  }

  return splitRecursive(text);
}
|
|
86
|
+
|
|
87
|
+
/**
 * Read a file as UTF-8 and return its text plus basic metadata.
 *
 * For `.md` / `.mdx` files that start with a `---` frontmatter block, the
 * block is removed from the returned text and its `title:` entry, if present,
 * is copied to `metadata.title`. Both LF and CRLF line endings are accepted,
 * and the closing `---` may be the last line of the file. Only `title` is
 * extracted; the frontmatter is not parsed as full YAML.
 *
 * @param {string} filePath - Path to the file
 * @returns {{text: string, metadata: object}} `metadata` contains `source`
 *   (the path as given), `filename`, the lower-cased `extension`, and
 *   optionally `title`.
 * @throws Whatever error `fs.readFileSync` throws, e.g. for a missing file
 */
function readFile(filePath) {
  const ext = path.extname(filePath).toLowerCase();
  const content = fs.readFileSync(filePath, 'utf8');

  const metadata = {
    source: filePath,
    filename: path.basename(filePath),
    extension: ext,
  };

  // For markdown, optionally extract frontmatter
  if (ext === '.md' || ext === '.mdx') {
    // Group 1 is the frontmatter, group 2 the body (absent when the closing
    // delimiter is the final line). `\r?` keeps CRLF files working.
    const frontmatterMatch = content.match(
      /^---\r?\n([\s\S]*?)\r?\n---(?:\r?\n([\s\S]*))?$/
    );
    if (frontmatterMatch) {
      const yaml = frontmatterMatch[1];
      const body = frontmatterMatch[2] || '';

      // Anchored to the start of a line so keys such as `subtitle:` do not
      // match, and restricted to one line so an empty `title:` cannot pick
      // up the following entry.
      const titleMatch = yaml.match(/^[ \t]*title:[ \t]*["']?(.+?)["']?[ \t\r]*$/m);
      if (titleMatch) metadata.title = titleMatch[1];

      return { text: body, metadata };
    }
  }

  return { text: content, metadata };
}
|
|
119
|
+
|
|
120
|
+
/**
 * Ingest a single file: read, chunk, embed, and store.
 *
 * Chunks are embedded in batches rather than in one request, because the
 * embeddings API limits how many texts and tokens a single request may carry
 * (see the Voyage AI embeddings reference for the limits of your model).
 * All chunks are embedded before anything is written, and they are written
 * with a single insert.
 *
 * @param {string} filePath - Path to the file
 * @param {object} [options] - Ingestion options; also forwarded to chunkText
 * @param {number} [options.batchSize] - Chunks per embedding request
 *   (default: 128); ignored unless it is a positive integer
 * @returns {Promise<{chunks: number, tokens: number}>} Number of chunks stored
 *   and total tokens reported by the embedding calls
 * @throws {Error} If an embedding response does not contain exactly one
 *   vector per chunk sent; errors from reading, embedding or inserting
 *   propagate unchanged
 */
async function ingestFile(filePath, options = {}) {
  const { text, metadata } = readFile(filePath);
  const chunks = chunkText(text, options);

  if (chunks.length === 0) {
    return { chunks: 0, tokens: 0 };
  }

  // 128 is a conservative default batch size.
  const batchSize =
    Number.isInteger(options.batchSize) && options.batchSize > 0
      ? options.batchSize
      : 128;

  // Embed batch by batch, keeping vectors in chunk order.
  const embeddings = [];
  let tokens = 0;
  for (let start = 0; start < chunks.length; start += batchSize) {
    const batch = chunks.slice(start, start + batchSize);
    const result = await embed(batch, { inputType: 'document' });

    // A short response would silently pair chunks with the wrong vectors.
    if (result.embeddings.length !== batch.length) {
      throw new Error(
        `Expected ${batch.length} embeddings for ${filePath}, received ${result.embeddings.length}`
      );
    }

    embeddings.push(...result.embeddings);
    // Usage may be missing from a response; count that as zero tokens.
    tokens += result.usage?.total_tokens ?? 0;
  }

  // Prepare documents for insertion
  const documents = chunks.map((chunk, i) => ({
    text: chunk,
    embedding: embeddings[i],
    metadata: {
      ...metadata,
      chunkIndex: i,
      totalChunks: chunks.length,
    },
  }));

  // Insert into MongoDB
  await insertDocuments(documents);

  return {
    chunks: chunks.length,
    tokens,
  };
}
|
|
156
|
+
|
|
157
|
+
/**
 * Ingest every matching file found under a directory.
 *
 * The directory tree is walked depth-first; sub-directories whose name starts
 * with '.' are skipped. Files are ingested one after another with ingestFile,
 * and each file path is logged before it is processed.
 *
 * @param {string} dirPath - Directory path
 * @param {object} [options] - Ingestion options, also forwarded to ingestFile
 * @param {string[]} [options.extensions] - Lower-case file extensions to
 *   include (default: ['.txt', '.md', '.mdx'])
 * @returns {Promise<{files: number, chunks: number, tokens: number}>} Number
 *   of files found and the summed chunk/token counts reported per file
 */
async function ingestDirectory(dirPath, options = {}) {
  const extensions = options.extensions || ['.txt', '.md', '.mdx'];

  // Collect matching file paths beneath `dir`, in directory-listing order.
  const collect = (dir) =>
    fs.readdirSync(dir, { withFileTypes: true }).flatMap((entry) => {
      const fullPath = path.join(dir, entry.name);

      if (entry.isDirectory() && !entry.name.startsWith('.')) {
        return collect(fullPath);
      }

      const wanted =
        entry.isFile() && extensions.includes(path.extname(entry.name).toLowerCase());
      return wanted ? [fullPath] : [];
    });

  const files = collect(dirPath);
  const totals = { chunks: 0, tokens: 0 };

  for (const file of files) {
    console.log(`Ingesting: ${file}`);
    const outcome = await ingestFile(file, options);
    totals.chunks += outcome.chunks;
    totals.tokens += outcome.tokens;
  }

  return {
    files: files.length,
    chunks: totals.chunks,
    tokens: totals.tokens,
  };
}
|
|
200
|
+
|
|
201
|
+
// CLI entry point: runs only when this module is executed directly. The first
// argument is the file or directory to ingest; it defaults to the current
// directory.
if (require.main === module) {
  const target = process.argv[2] || '.';

  // Ingest the target, print a summary, then close the database connection.
  const runCli = async () => {
    if (fs.statSync(target).isDirectory()) {
      const summary = await ingestDirectory(target);
      console.log(`\n✓ Ingested ${summary.files} files (${summary.chunks} chunks, ${summary.tokens} tokens)`);
    } else {
      const summary = await ingestFile(target);
      console.log(`\n✓ Ingested ${summary.chunks} chunks (${summary.tokens} tokens)`);
    }

    await close();
  };

  // Any failure is reported by message and ends the process with exit code 1.
  runCli().catch((err) => {
    console.error('Error:', err.message);
    process.exit(1);
  });
}
|
|
225
|
+
|
|
226
|
+
module.exports = {
|
|
227
|
+
chunkText,
|
|
228
|
+
readFile,
|
|
229
|
+
ingestFile,
|
|
230
|
+
ingestDirectory,
|
|
231
|
+
};
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "{{projectName}}",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Voyage AI RAG application",
|
|
5
|
+
"type": "commonjs",
|
|
6
|
+
"main": "server.js",
|
|
7
|
+
"scripts": {
|
|
8
|
+
"start": "node server.js",
|
|
9
|
+
"ingest": "node lib/ingest.js",
|
|
10
|
+
"dev": "node --watch server.js"
|
|
11
|
+
},
|
|
12
|
+
"dependencies": {
|
|
13
|
+
"express": "^4.18.2",
|
|
14
|
+
"mongodb": "^6.3.0"
|
|
15
|
+
},
|
|
16
|
+
"engines": {
|
|
17
|
+
"node": ">=18.0.0"
|
|
18
|
+
},
|
|
19
|
+
"keywords": [
|
|
20
|
+
"voyage-ai",
|
|
21
|
+
"rag",
|
|
22
|
+
"vector-search",
|
|
23
|
+
"mongodb-atlas"
|
|
24
|
+
],
|
|
25
|
+
"generated": {
|
|
26
|
+
"by": "vai",
|
|
27
|
+
"version": "{{vaiVersion}}",
|
|
28
|
+
"model": "{{model}}",
|
|
29
|
+
"at": "{{generatedAt}}"
|
|
30
|
+
}
|
|
31
|
+
}
|