simple-dynamsoft-mcp 7.2.0 → 7.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +5 -3
- package/README.md +1 -2
- package/package.json +3 -2
- package/src/data/download-utils.js +11 -3
- package/src/index.js +1 -2
- package/src/rag/config.js +16 -3
- package/src/rag/gemini-retry.js +1 -1
- package/src/rag/index.js +6 -2
- package/src/rag/logger.js +4 -19
- package/src/rag/providers.js +15 -46
- package/src/rag/search-utils.js +8 -1
- package/src/rag/vector-cache.js +173 -171
- package/scripts/compute-repo-signatures.mjs +0 -210
- package/scripts/data-sync-azure.mjs +0 -364
- package/src/data/shared-state.js +0 -214
package/src/rag/vector-cache.js
CHANGED
|
@@ -1,11 +1,8 @@
|
|
|
1
|
-
import { existsSync, mkdirSync,
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
normalizeRepoKey,
|
|
7
|
-
normalizeRepoPath
|
|
8
|
-
} from "../data/shared-state.js";
|
|
1
|
+
import { readFileSync, existsSync, writeFileSync, mkdirSync, readdirSync, copyFileSync, rmSync, statSync } from "node:fs";
|
|
2
|
+
import { join, basename, resolve } from "node:path";
|
|
3
|
+
import { fileURLToPath } from "node:url";
|
|
4
|
+
import { tmpdir } from "node:os";
|
|
5
|
+
import * as tar from "tar";
|
|
9
6
|
|
|
10
7
|
function ensureDirectory(path) {
|
|
11
8
|
if (!existsSync(path)) {
|
|
@@ -105,197 +102,206 @@ function clearVectorIndexCheckpoint(checkpointFile) {
|
|
|
105
102
|
}
|
|
106
103
|
}
|
|
107
104
|
|
|
108
|
-
function
|
|
109
|
-
|
|
110
|
-
const
|
|
111
|
-
|
|
105
|
+
/**
 * Collects the paths of all regular files located beneath a directory.
 *
 * The tree is walked iteratively with an explicit last-in/first-out work list
 * instead of recursion. Entries for which neither `isDirectory()` nor
 * `isFile()` is true are ignored.
 *
 * @param {string} rootDir - Directory at which the walk starts.
 * @returns {string[]} Paths of every regular file found, each formed by
 *   joining entry names onto `rootDir`.
 */
function listFilesRecursive(rootDir) {
  const found = [];
  const pending = [rootDir];
  while (pending.length !== 0) {
    const dir = pending.pop();
    for (const dirent of readdirSync(dir, { withFileTypes: true })) {
      const entryPath = join(dir, dirent.name);
      if (dirent.isDirectory()) {
        // Defer sub-directories; they are visited after the current listing.
        pending.push(entryPath);
        continue;
      }
      if (dirent.isFile()) {
        found.push(entryPath);
      }
    }
  }
  return found;
}
|
|
112
122
|
|
|
123
|
+
/**
 * Extracts the `packageVersion` field from a JSON-encoded signature.
 *
 * @param {string} signatureRaw - JSON text of the signature; may be empty or missing.
 * @returns {string} The package version as a string, or "" when the input is
 *   falsy, is not valid JSON, or carries no truthy `packageVersion`.
 */
function readSignaturePackageVersion(signatureRaw) {
  if (signatureRaw) {
    try {
      const version = JSON.parse(signatureRaw)?.packageVersion;
      return version ? String(version) : "";
    } catch {
      // Any parse or conversion failure is reported as "no version" below.
    }
  }
  return "";
}
|
|
126
132
|
|
|
127
|
-
function
|
|
128
|
-
const
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
133
|
+
/**
 * Lists the cache files of an extracted archive that could serve the given
 * provider, best match first and without duplicates.
 *
 * Only files whose path ends in ".json" (case-insensitive) are considered,
 * in sorted path order. The result is ordered as:
 *   1. the first file whose base name equals `expectedCacheFileName`;
 *   2. the first file named `rag-<provider>-…-<first 12 chars of cacheKey>.json`;
 *   3. every file whose base name starts with `rag-<provider>-`.
 *
 * @param {string} extractRoot - Directory the archive was extracted into.
 * @param {string} expectedCacheFileName - Exact base name preferred above all others.
 * @param {string} cacheKey - Cache key; its first 12 characters form the suffix match.
 * @param {string} provider - Provider name used in the `rag-<provider>-` prefix.
 * @returns {string[]} Candidate file paths in priority order.
 */
function listDownloadedCacheCandidatesByProvider(extractRoot, expectedCacheFileName, cacheKey, provider) {
  const jsonFiles = listFilesRecursive(extractRoot)
    .filter((file) => file.toLowerCase().endsWith(".json"))
    .sort();

  const providerPrefix = `rag-${provider}-`;
  const keySuffix = `-${cacheKey.slice(0, 12)}.json`;

  const exactMatch = jsonFiles.find((file) => basename(file) === expectedCacheFileName);
  const keyMatch = jsonFiles.find((file) => {
    const fileName = basename(file);
    return fileName.startsWith(providerPrefix) && fileName.endsWith(keySuffix);
  });
  const providerMatches = jsonFiles.filter((file) => basename(file).startsWith(providerPrefix));

  // A Set keeps first-insertion order, which yields the priority ordering.
  const ordered = new Set();
  for (const file of [exactMatch, keyMatch, ...providerMatches]) {
    if (file) {
      ordered.add(file);
    }
  }
  return [...ordered];
}
|
|
135
151
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
}
|
|
152
|
+
/**
 * Builds the ordered list of locations to try for a prebuilt index archive.
 *
 * An explicit `ragConfig.prebuiltIndexUrl` override wins outright. Otherwise
 * the provider-specific URL (gemini only) is tried first, followed by the
 * legacy URL. Every value is stringified and trimmed before use, so blank or
 * padded entries can neither produce an empty candidate nor defeat
 * de-duplication.
 *
 * @param {string} provider - Embedding provider name.
 * @param {object} ragConfig - RAG configuration; reads `prebuiltIndexUrl` and
 *   `prebuiltIndexUrlGemini`.
 * @param {string} [legacyPrebuiltIndexUrl] - Fallback location tried last.
 * @returns {string[]} Unique, non-empty candidates in priority order.
 */
function resolvePrebuiltIndexUrlCandidates(provider, ragConfig, legacyPrebuiltIndexUrl) {
  const normalize = (value) => String(value || "").trim();

  const override = normalize(ragConfig.prebuiltIndexUrl);
  if (override) {
    return [override];
  }

  const candidates = [];
  if (provider === "gemini") {
    candidates.push(normalize(ragConfig.prebuiltIndexUrlGemini));
  }
  // The legacy URL gets the same normalization as the configured ones.
  candidates.push(normalize(legacyPrebuiltIndexUrl));

  return [...new Set(candidates.filter(Boolean))];
}
|
|
164
169
|
|
|
165
|
-
function
|
|
166
|
-
const
|
|
167
|
-
|
|
170
|
+
/**
 * Fetches a prebuilt index archive and stores it at `outputPath`.
 *
 * Supported sources:
 *   - `file://` URLs (scheme matched case-insensitively) — copied from disk;
 *   - anything that is not an http(s) URL — treated as a filesystem path and copied;
 *   - `http://` / `https://` URLs — downloaded with `fetch`, fully buffered in
 *     memory, then written out.
 *
 * @param {string} url - Archive location; surrounding whitespace is ignored.
 * @param {string} outputPath - Destination file path.
 * @param {number} timeoutMs - Download time limit in milliseconds. Values below
 *   1000 are raised to 1000; a missing or non-numeric value also falls back to
 *   1000 instead of producing a NaN timer delay.
 * @returns {Promise<{sourceType: "file" | "http", size: number}>} Where the
 *   archive came from and its size in bytes.
 * @throws {Error} When the URL is empty, the copy fails, the server answers
 *   with a non-2xx status (`HTTP <status>`), or the download times out.
 */
async function downloadPrebuiltArchive(url, outputPath, timeoutMs) {
  const source = String(url || "").trim();
  if (!source) {
    throw new Error("prebuilt URL is empty");
  }

  if (/^file:\/\//i.test(source)) {
    copyFileSync(fileURLToPath(source), outputPath);
    return { sourceType: "file", size: statSync(outputPath).size };
  }

  if (!/^https?:\/\//i.test(source)) {
    copyFileSync(resolve(source), outputPath);
    return { sourceType: "file", size: statSync(outputPath).size };
  }

  // Math.max(1000, NaN) is NaN, and a NaN delay makes the timer fire almost
  // immediately; guard against that before clamping.
  const requestedMs = Number(timeoutMs);
  const effectiveTimeoutMs = Number.isFinite(requestedMs) ? Math.max(1000, requestedMs) : 1000;

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), effectiveTimeoutMs);
  try {
    const response = await fetch(source, { signal: controller.signal });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
    // The timer stays armed while the body is read, so it bounds the whole transfer.
    const arrayBuffer = await response.arrayBuffer();
    writeFileSync(outputPath, Buffer.from(arrayBuffer));
    return { sourceType: "http", size: arrayBuffer.byteLength };
  } catch (error) {
    if (controller.signal.aborted) {
      // Only this function's timer aborts the controller, so this is a timeout.
      throw new Error(`timed out after ${effectiveTimeoutMs}ms`, { cause: error });
    }
    throw error;
  } finally {
    clearTimeout(timer);
  }
}
|
|
183
200
|
|
|
184
|
-
function
|
|
185
|
-
|
|
186
|
-
chunkSize: ragConfig.chunkSize,
|
|
187
|
-
chunkOverlap: ragConfig.chunkOverlap,
|
|
188
|
-
maxChunksPerDoc: ragConfig.maxChunksPerDoc,
|
|
189
|
-
maxTextChars: ragConfig.maxTextChars
|
|
190
|
-
};
|
|
191
|
-
}
|
|
201
|
+
function createVectorCacheHelpers({ ragConfig, pkgVersion, legacyPrebuiltIndexUrl, logRag }) {
|
|
202
|
+
const prebuiltDownloadAttempts = new Map();
|
|
192
203
|
|
|
193
|
-
function
|
|
194
|
-
async function maybeLoadSharedVectorIndex({ provider, model, cacheKey, signature, cacheFile }) {
|
|
204
|
+
/**
 * Tries to install a prebuilt vector index into the local cache.
 *
 * Only the "gemini" provider is supported, and only when
 * `ragConfig.prebuiltIndexAutoDownload` is enabled and at least one source URL
 * resolves. Each candidate URL is downloaded into a scratch directory under
 * the OS temp dir, extracted with `tar`, and searched for cache files. A cache
 * file is accepted when it loads for this provider/model and the
 * `packageVersion` recorded in its signature equals `pkgVersion`; it is then
 * re-saved under `cacheFile` with the current `cacheKey`, provider, model and
 * signature. The scratch directory is always removed.
 *
 * The resulting promise is memoized per provider, cache key and URL list, so
 * repeated or concurrent calls share one attempt (including a failed one).
 *
 * @param {object} params
 * @param {string} params.provider - Embedding provider name.
 * @param {string} params.model - Embedding model name.
 * @param {string} params.cacheKey - Cache key of the index being requested.
 * @param {string} params.signature - Signature stored in the installed cache's meta.
 * @param {string} params.cacheFile - Cache file name the index is saved under.
 * @returns {Promise<{downloaded: boolean, reason: string}>} Outcome; on
 *   failure `reason` describes the last URL tried and its error.
 */
async function maybeDownloadPrebuiltVectorIndex({ provider, model, cacheKey, signature, cacheFile }) {
  if (provider !== "gemini") {
    return { downloaded: false, reason: "provider_not_supported" };
  }
  if (!ragConfig.prebuiltIndexAutoDownload) {
    return { downloaded: false, reason: "auto_download_disabled" };
  }

  const sourceUrls = resolvePrebuiltIndexUrlCandidates(provider, ragConfig, legacyPrebuiltIndexUrl);
  if (sourceUrls.length === 0) {
    return { downloaded: false, reason: "url_not_set" };
  }

  const attemptKey = `${provider}:${cacheKey}:${sourceUrls.join("|")}`;
  if (prebuiltDownloadAttempts.has(attemptKey)) {
    return prebuiltDownloadAttempts.get(attemptKey);
  }

  const expectedCacheFileName = makeCacheFileName(provider, model, cacheKey);
  const attempt = (async () => {
    let lastReason = "not_attempted";
    for (const sourceUrl of sourceUrls) {
      const tempRoot = join(
        tmpdir(),
        `simple-dynamsoft-mcp-rag-prebuilt-${Date.now()}-${Math.random().toString(16).slice(2)}`
      );
      const archivePath = join(tempRoot, "prebuilt-rag-index.tar.gz");
      const extractRoot = join(tempRoot, "extract");

      try {
        // Created inside the try so that a scratch-directory failure is handled
        // like any other per-URL failure instead of rejecting the memoized
        // promise and skipping the remaining URLs.
        ensureDirectory(extractRoot);

        logRag(
          `prebuilt index download start provider=${provider} url=${sourceUrl} timeout_ms=${ragConfig.prebuiltIndexTimeoutMs}`
        );
        const downloaded = await downloadPrebuiltArchive(sourceUrl, archivePath, ragConfig.prebuiltIndexTimeoutMs);
        logRag(
          `prebuilt index downloaded provider=${provider} source=${downloaded.sourceType} size=${downloaded.size}B url=${sourceUrl}`
        );

        await tar.x({
          file: archivePath,
          cwd: extractRoot,
          strict: true
        });

        const candidateFiles = listDownloadedCacheCandidatesByProvider(
          extractRoot,
          expectedCacheFileName,
          cacheKey,
          provider
        );
        if (candidateFiles.length === 0) {
          throw new Error(`cache_file_not_found expected=${expectedCacheFileName}`);
        }

        for (const sourceCacheFile of candidateFiles) {
          const candidateCache = loadVectorIndexCache(sourceCacheFile, {
            provider,
            model
          });
          if (!candidateCache.hit) {
            continue;
          }

          // Compatibility is decided on package version alone, not on the full signature.
          const cachePackageVersion = readSignaturePackageVersion(candidateCache.payload?.meta?.signature);
          if (!cachePackageVersion || cachePackageVersion !== pkgVersion) {
            continue;
          }

          // Re-label the accepted cache with the key and signature of this request.
          const migratedPayload = {
            ...candidateCache.payload,
            cacheKey,
            meta: {
              ...(candidateCache.payload.meta || {}),
              provider,
              model,
              signature
            }
          };
          saveVectorIndexCache(ragConfig.cacheDir, cacheFile, migratedPayload);
          logRag(
            `prebuilt index installed provider=${provider} cache_file=${cacheFile} source=${basename(sourceCacheFile)} mode=version_only_compat version=${cachePackageVersion}`
          );
          return { downloaded: true, reason: "installed_version_only_compat" };
        }

        throw new Error(
          `no_compatible_cache expected=${expectedCacheFileName} found=${candidateFiles.map((path) => basename(path)).join(",")}`
        );
      } catch (error) {
        // Anything can be thrown in JavaScript; do not assume an Error instance.
        const message = error instanceof Error ? error.message : String(error);
        lastReason = `${sourceUrl} => ${message}`;
        logRag(`prebuilt index unavailable provider=${provider} url=${sourceUrl} reason=${message}`);
      } finally {
        rmSync(tempRoot, { recursive: true, force: true });
      }
    }
    return { downloaded: false, reason: lastReason };
  })();

  prebuiltDownloadAttempts.set(attemptKey, attempt);
  return attempt;
}
|
|
300
306
|
|
|
301
307
|
return {
|
|
@@ -306,11 +312,7 @@ function createVectorCacheHelpers({ ragConfig, logRag }) {
|
|
|
306
312
|
loadVectorIndexCheckpoint,
|
|
307
313
|
saveVectorIndexCheckpoint: (checkpointFile, payload) => saveVectorIndexCheckpoint(ragConfig.cacheDir, checkpointFile, payload),
|
|
308
314
|
clearVectorIndexCheckpoint,
|
|
309
|
-
|
|
310
|
-
maybeDownloadPrebuiltVectorIndex: async () => ({
|
|
311
|
-
downloaded: false,
|
|
312
|
-
reason: "runtime_prebuilt_download_disabled"
|
|
313
|
-
})
|
|
315
|
+
maybeDownloadPrebuiltVectorIndex
|
|
314
316
|
};
|
|
315
317
|
}
|
|
316
318
|
|
|
@@ -1,210 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
3
|
-
import { dirname, isAbsolute, join } from "node:path";
|
|
4
|
-
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
5
|
-
|
|
6
|
-
import {
|
|
7
|
-
SHARED_STATE_SCHEMA_VERSION,
|
|
8
|
-
computeRepoSignature,
|
|
9
|
-
createSharedState,
|
|
10
|
-
normalizeRepoKey,
|
|
11
|
-
normalizeRepoPath
|
|
12
|
-
} from "../src/data/shared-state.js";
|
|
13
|
-
|
|
14
|
-
// Directory that contains this script, derived from the module's own URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Parent of the script directory; toAbsolutePath joins relative paths onto it.
const projectRoot = join(__dirname, "..");
// Defaults used by parseArgs when neither the environment nor the command
// line supplies a value.
const DEFAULT_EMBEDDING_MODEL = "models/gemini-embedding-001";
const DEFAULT_INDEX_VERSION = "azure-shared-v1";
const DEFAULT_INDEX_CONFIG = {
  chunkSize: 1200,
  chunkOverlap: 200,
  maxChunksPerDoc: 6,
  maxTextChars: 4000
};
|
|
24
|
-
|
|
25
|
-
/**
 * Parses an option value as a base-10 integer and enforces a lower bound.
 *
 * @param {string} flag - Option name, used only in error messages.
 * @param {*} rawValue - Value to parse; null/undefined are treated as "".
 * @param {{min?: number}} [options] - `min` is the smallest accepted value (default 0).
 * @returns {number} The parsed integer.
 * @throws {Error} When the trimmed text is not an optionally-signed run of
 *   digits, is not a safe integer, or is below `min`.
 */
function parseIntegerOption(flag, rawValue, { min = 0 } = {}) {
  const text = String(rawValue ?? "").trim();
  const looksLikeInteger = /^-?\d+$/.test(text);
  if (!looksLikeInteger) {
    throw new Error(`Invalid ${flag}: expected an integer, received '${rawValue}'`);
  }

  const parsed = Number.parseInt(text, 10);
  const isSafe = Number.isSafeInteger(parsed);
  if (!isSafe) {
    throw new Error(`Invalid ${flag}: expected a safe integer, received '${rawValue}'`);
  }

  const belowMinimum = parsed < min;
  if (belowMinimum) {
    throw new Error(`Invalid ${flag}: expected integer >= ${min}, received '${rawValue}'`);
  }
  return parsed;
}
|
|
41
|
-
|
|
42
|
-
/**
 * Builds the script options from environment variables and command-line flags.
 *
 * Index settings start from the DATA_SYNC_AZURE_* environment variables (when
 * set and non-empty) or the module defaults; recognised `--flag value` pairs
 * then override them. A flag that is not followed by a non-empty value is
 * ignored, as is any unrecognised argument.
 *
 * @param {string[]} argv - Command-line arguments without the node/script prefix.
 * @param {object} [env=process.env] - Environment to read defaults from.
 * @returns {object} Options: manifest, output, embeddingModel, indexVersion,
 *   generatedAt, schemaVersion and indexConfig.
 * @throws {Error} When an integer setting fails parseIntegerOption validation.
 */
function parseArgs(argv, env = process.env) {
  // Environment-provided integers are validated under their CLI flag name.
  const integerFromEnv = (flag, variable, min, fallback) => {
    const raw = env[variable];
    return raw ? parseIntegerOption(flag, raw, { min }) : fallback;
  };

  const indexConfig = {
    chunkSize: integerFromEnv("--chunk-size", "DATA_SYNC_AZURE_CHUNK_SIZE", 0, DEFAULT_INDEX_CONFIG.chunkSize),
    chunkOverlap: integerFromEnv("--chunk-overlap", "DATA_SYNC_AZURE_CHUNK_OVERLAP", 0, DEFAULT_INDEX_CONFIG.chunkOverlap),
    maxChunksPerDoc: integerFromEnv(
      "--max-chunks-per-doc",
      "DATA_SYNC_AZURE_MAX_CHUNKS_PER_DOC",
      1,
      DEFAULT_INDEX_CONFIG.maxChunksPerDoc
    ),
    maxTextChars: integerFromEnv("--max-text-chars", "DATA_SYNC_AZURE_MAX_TEXT_CHARS", 0, DEFAULT_INDEX_CONFIG.maxTextChars)
  };

  const args = {
    manifest: "data/metadata/data-manifest.json",
    output: "",
    embeddingModel: env.DATA_SYNC_AZURE_EMBEDDING_MODEL || DEFAULT_EMBEDDING_MODEL,
    indexVersion: env.DATA_SYNC_AZURE_INDEX_VERSION || DEFAULT_INDEX_VERSION,
    generatedAt: new Date().toISOString(),
    schemaVersion: SHARED_STATE_SCHEMA_VERSION,
    indexConfig
  };

  // One setter per recognised flag; a Map avoids matching inherited object keys.
  const setters = new Map([
    ["--manifest", (value) => { args.manifest = value; }],
    ["--output", (value) => { args.output = value; }],
    ["--embedding-model", (value) => { args.embeddingModel = value; }],
    ["--index-version", (value) => { args.indexVersion = String(value).trim(); }],
    ["--generated-at", (value) => { args.generatedAt = value; }],
    ["--schema-version", (value) => {
      args.schemaVersion = parseIntegerOption("--schema-version", value, { min: 1 });
    }],
    ["--chunk-size", (value) => {
      args.indexConfig.chunkSize = parseIntegerOption("--chunk-size", value, { min: 0 });
    }],
    ["--chunk-overlap", (value) => {
      args.indexConfig.chunkOverlap = parseIntegerOption("--chunk-overlap", value, { min: 0 });
    }],
    ["--max-chunks-per-doc", (value) => {
      args.indexConfig.maxChunksPerDoc = parseIntegerOption("--max-chunks-per-doc", value, { min: 1 });
    }],
    ["--max-text-chars", (value) => {
      args.indexConfig.maxTextChars = parseIntegerOption("--max-text-chars", value, { min: 0 });
    }]
  ]);

  let position = 0;
  while (position < argv.length) {
    const applyFlag = setters.get(argv[position]);
    const operand = argv[position + 1];
    if (applyFlag && operand) {
      applyFlag(operand);
      position += 2; // skip the flag and the value it consumed
    } else {
      position += 1;
    }
  }

  return args;
}
|
|
125
|
-
|
|
126
|
-
/**
 * Resolves a path against the project root unless it is already absolute.
 *
 * Absoluteness is decided by `path.isAbsolute`, so platform-specific absolute
 * forms (for example Windows drive-letter paths) are returned unchanged
 * instead of being joined onto the project root.
 *
 * @param {string} relativeOrAbsolutePath - Path to resolve; may be empty.
 * @returns {string} "" for a falsy input, the input itself when absolute,
 *   otherwise the input joined onto `projectRoot`.
 */
function toAbsolutePath(relativeOrAbsolutePath) {
  if (!relativeOrAbsolutePath) return "";
  if (isAbsolute(relativeOrAbsolutePath)) return relativeOrAbsolutePath;
  return join(projectRoot, relativeOrAbsolutePath);
}
|
|
131
|
-
|
|
132
|
-
/**
 * Reads and parses the data manifest.
 *
 * @param {string} pathToManifest - Location of the manifest JSON file.
 * @returns {object} The parsed manifest, which has a `repos` array.
 * @throws {Error} When the file does not exist or `repos` is not an array.
 *   Invalid JSON surfaces as the error thrown by `JSON.parse`.
 */
function loadManifest(pathToManifest) {
  const isPresent = existsSync(pathToManifest);
  if (!isPresent) {
    throw new Error(`Manifest not found: ${pathToManifest}`);
  }

  const parsed = JSON.parse(readFileSync(pathToManifest, "utf8"));
  if (Array.isArray(parsed?.repos)) {
    return parsed;
  }
  throw new Error("Manifest must include a repos array");
}
|
|
142
|
-
|
|
143
|
-
/**
 * Returns the shard location for a repo signature.
 *
 * @param {string} repoPath - Not used when building the path.
 * @param {string} signature - Repo signature embedded in the file name.
 * @returns {string} `rag/cache/gemini-<signature>.json`.
 */
function buildShardPath(repoPath, signature) {
  const fileName = `gemini-${signature}.json`;
  return ["rag", "cache", fileName].join("/");
}
|
|
146
|
-
|
|
147
|
-
/**
 * Builds the per-repo section of the shared state from manifest entries.
 *
 * Each entry's path is normalized and turned into a key; entries without a
 * key are skipped. Two different normalized paths mapping to the same key are
 * rejected.
 *
 * @param {Iterable<object>} manifestRepos - Manifest entries; `path` and
 *   `commit` are read from each.
 * @param {object} options - Supplies `embeddingModel`, `indexConfig`,
 *   `indexVersion` and `schemaVersion` for the signature.
 * @returns {object} Map of repo key to `{ path, commit, signature, shardPath }`.
 * @throws {Error} On a repo key collision.
 */
function buildReposState(manifestRepos, options) {
  const pathByKey = new Map();
  const state = {};

  for (const repo of manifestRepos) {
    const normalizedPath = normalizeRepoPath(repo.path);
    const repoKey = normalizeRepoKey(normalizedPath);
    if (repoKey) {
      const seenPath = pathByKey.get(repoKey);
      const collides = Boolean(seenPath) && seenPath !== normalizedPath;
      if (collides) {
        throw new Error(
          `Repo key collision for '${repoKey}': '${seenPath}' and '${normalizedPath}' normalize to the same key`
        );
      }
      pathByKey.set(repoKey, normalizedPath);

      const signature = computeRepoSignature({
        repo,
        embeddingModel: options.embeddingModel,
        indexConfig: options.indexConfig,
        indexVersion: options.indexVersion,
        schemaVersion: options.schemaVersion
      });
      const commit = String(repo.commit || "").trim();
      const shardPath = buildShardPath(normalizedPath, signature);

      state[repoKey] = { path: normalizedPath, commit, signature, shardPath };
    }
  }

  return state;
}
|
|
182
|
-
|
|
183
|
-
/**
 * Script entry point: computes the shared state for the manifest named on the
 * command line and writes it as indented JSON, either to the `--output` file
 * or, when no output path is given, to standard output.
 */
function main() {
  const cli = parseArgs(process.argv.slice(2));
  const manifestFile = toAbsolutePath(cli.manifest);
  const destination = toAbsolutePath(cli.output);
  const manifest = loadManifest(manifestFile);

  const { schemaVersion, generatedAt, indexVersion } = cli;
  const sharedState = createSharedState({
    schemaVersion,
    generatedAt,
    indexVersion,
    repos: buildReposState(manifest.repos, cli)
  });

  const serialized = `${JSON.stringify(sharedState, null, 2)}\n`;
  if (!destination) {
    process.stdout.write(serialized);
  } else {
    writeFileSync(destination, serialized);
    console.log(`Wrote shared state to ${destination}`);
  }
}
|
|
205
|
-
|
|
206
|
-
// Run main() only when this file is the script Node was started with, i.e.
// the file URL built from process.argv[1] equals this module's URL.
if (process.argv[1] && pathToFileURL(process.argv[1]).href === import.meta.url) {
  main();
}

// Helpers made available to importing modules.
export { buildReposState, parseArgs, parseIntegerOption };
|