@memvid/maw 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +188 -0
- package/dist/bin/maw.d.ts +6 -0
- package/dist/bin/maw.d.ts.map +1 -0
- package/dist/bin/maw.js +275 -0
- package/dist/bin/maw.js.map +1 -0
- package/dist/src/crawler/index.d.ts +71 -0
- package/dist/src/crawler/index.d.ts.map +1 -0
- package/dist/src/crawler/index.js +249 -0
- package/dist/src/crawler/index.js.map +1 -0
- package/dist/src/crawler/robots.d.ts +26 -0
- package/dist/src/crawler/robots.d.ts.map +1 -0
- package/dist/src/crawler/robots.js +179 -0
- package/dist/src/crawler/robots.js.map +1 -0
- package/dist/src/crawler/sitemap.d.ts +36 -0
- package/dist/src/crawler/sitemap.d.ts.map +1 -0
- package/dist/src/crawler/sitemap.js +209 -0
- package/dist/src/crawler/sitemap.js.map +1 -0
- package/dist/src/engine/detector.d.ts +18 -0
- package/dist/src/engine/detector.d.ts.map +1 -0
- package/dist/src/engine/detector.js +155 -0
- package/dist/src/engine/detector.js.map +1 -0
- package/dist/src/engine/fetch.d.ts +18 -0
- package/dist/src/engine/fetch.d.ts.map +1 -0
- package/dist/src/engine/fetch.js +53 -0
- package/dist/src/engine/fetch.js.map +1 -0
- package/dist/src/engine/index.d.ts +39 -0
- package/dist/src/engine/index.d.ts.map +1 -0
- package/dist/src/engine/index.js +116 -0
- package/dist/src/engine/index.js.map +1 -0
- package/dist/src/engine/playwright.d.ts +23 -0
- package/dist/src/engine/playwright.d.ts.map +1 -0
- package/dist/src/engine/playwright.js +88 -0
- package/dist/src/engine/playwright.js.map +1 -0
- package/dist/src/engine/rebrowser.d.ts +22 -0
- package/dist/src/engine/rebrowser.d.ts.map +1 -0
- package/dist/src/engine/rebrowser.js +142 -0
- package/dist/src/engine/rebrowser.js.map +1 -0
- package/dist/src/extractor/cleaner.d.ts +13 -0
- package/dist/src/extractor/cleaner.d.ts.map +1 -0
- package/dist/src/extractor/cleaner.js +122 -0
- package/dist/src/extractor/cleaner.js.map +1 -0
- package/dist/src/extractor/index.d.ts +29 -0
- package/dist/src/extractor/index.d.ts.map +1 -0
- package/dist/src/extractor/index.js +162 -0
- package/dist/src/extractor/index.js.map +1 -0
- package/dist/src/extractor/links.d.ts +22 -0
- package/dist/src/extractor/links.d.ts.map +1 -0
- package/dist/src/extractor/links.js +92 -0
- package/dist/src/extractor/links.js.map +1 -0
- package/dist/src/extractor/markdown.d.ts +13 -0
- package/dist/src/extractor/markdown.d.ts.map +1 -0
- package/dist/src/extractor/markdown.js +94 -0
- package/dist/src/extractor/markdown.js.map +1 -0
- package/dist/src/git/index.d.ts +40 -0
- package/dist/src/git/index.d.ts.map +1 -0
- package/dist/src/git/index.js +303 -0
- package/dist/src/git/index.js.map +1 -0
- package/dist/src/index.d.ts +103 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +229 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/ingestor/index.d.ts +95 -0
- package/dist/src/ingestor/index.d.ts.map +1 -0
- package/dist/src/ingestor/index.js +471 -0
- package/dist/src/ingestor/index.js.map +1 -0
- package/dist/src/utils/dedup.d.ts +66 -0
- package/dist/src/utils/dedup.d.ts.map +1 -0
- package/dist/src/utils/dedup.js +296 -0
- package/dist/src/utils/dedup.js.map +1 -0
- package/dist/src/utils/index.d.ts +3 -0
- package/dist/src/utils/index.d.ts.map +1 -0
- package/dist/src/utils/index.js +3 -0
- package/dist/src/utils/index.js.map +1 -0
- package/dist/src/utils/logger.d.ts +12 -0
- package/dist/src/utils/logger.d.ts.map +1 -0
- package/dist/src/utils/logger.js +49 -0
- package/dist/src/utils/logger.js.map +1 -0
- package/dist/src/utils/ui.d.ts +126 -0
- package/dist/src/utils/ui.d.ts.map +1 -0
- package/dist/src/utils/ui.js +357 -0
- package/dist/src/utils/ui.js.map +1 -0
- package/dist/src/utils/url.d.ts +21 -0
- package/dist/src/utils/url.d.ts.map +1 -0
- package/dist/src/utils/url.js +107 -0
- package/dist/src/utils/url.js.map +1 -0
- package/package.json +71 -0
|
@@ -0,0 +1,471 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MV2 ingestor - saves crawled content to .mv2 files
|
|
3
|
+
*/
|
|
4
|
+
import { existsSync } from 'fs';
|
|
5
|
+
import { stat } from 'fs/promises';
|
|
6
|
+
import { createLogger } from '../utils/logger.js';
|
|
7
|
+
const log = createLogger();

// Cached namespace of the dynamically imported @memvid/sdk module.
// Stays null until getSDK() has been awaited at least once.
let sdkModule = null;

/**
 * Load @memvid/sdk on first use and hand back the cached module afterwards.
 *
 * @returns the module namespace object produced by `import('@memvid/sdk')`
 */
async function getSDK() {
    if (sdkModule) {
        return sdkModule;
    }
    sdkModule = await import('@memvid/sdk');
    return sdkModule;
}
|
|
16
|
+
/**
 * Ingest crawled results into an MV2 file.
 *
 * Opens `options.output` when it already exists, otherwise creates it. When
 * MEMVID_API_KEY is set the SDK is configured with it and, if the output file
 * does not exist yet and no `options.memoryId` was given, a cloud memory is
 * created and bound to the new file. Without an API key, ingestion stops once
 * the estimated or actual file size passes `options.maxSizeMB` (default 40).
 *
 * @param results async iterable of crawl results; each item is read for
 *   `url`, `finalUrl`, `depth`, `engine` and `extracted.{markdown, title,
 *   description, author, publishedDate, wordCount, byteSize}`
 * @param options `output` (file path), plus optional `memoryId`, `memoryName`,
 *   `label`, `batchSize`, `maxSizeMB`, `enableEmbedding`, `embeddingModel`
 * @returns `{ pages, bytes, duration, stoppedAtLimit, skippedDupes, memoryId }`
 * @throws rethrows any `putMany` error that is not a size-limit error
 */
export async function ingestToMv2(results, options) {
    const { create, use, configure, createMemory } = await getSDK();
    const startTime = Date.now();
    const apiKey = process.env.MEMVID_API_KEY;
    const dashboardUrl = process.env.MEMVID_DASHBOARD_URL || 'https://memvid.com';
    // Checked once so the "auto-create memory" and "open vs create" decisions agree.
    const outputExists = existsSync(options.output);
    let memoryId = options.memoryId;

    // Configure SDK if API key is set
    if (apiKey) {
        configure({
            apiKey,
            dashboardUrl,
        });
        // Auto-create memory if API key is set but no memoryId provided
        if (!memoryId && !outputExists) {
            try {
                const memoryName = options.memoryName || `maw-${Date.now()}`;
                log.dim(` Creating cloud memory: ${memoryName}`);
                const memory = await createMemory({
                    name: memoryName,
                    description: `Created by maw CLI`,
                });
                memoryId = memory.id;
                log.dim(` Memory ID: ${memoryId}`);
            }
            catch (err) {
                // Failed to create memory, continue without cloud binding
                log.warn(` Could not create cloud memory: ${err.message}`);
            }
        }
    }

    // Create or open the MV2 file with optional cloud binding
    let mem;
    if (outputExists) {
        mem = await use('basic', options.output);
        // Sync tickets if we have API key and memory ID
        if (apiKey && memoryId) {
            try {
                await mem.syncTickets(memoryId, apiKey, dashboardUrl);
            }
            catch (err) {
                // Best effort: keep working on the local file, but say so.
                log.warn(` Could not sync tickets, continuing with local file: ${err?.message ?? err}`);
            }
        }
    }
    else {
        // Create new file, optionally bound to cloud memory
        const createOpts = {};
        if (apiKey && memoryId) {
            createOpts.memoryId = memoryId;
            createOpts.memvidApiKey = apiKey;
        }
        mem = await create(options.output, 'basic', createOpts);
    }

    let pages = 0;
    let totalBytes = 0;
    let skippedDupes = 0;
    let stoppedAtLimit = false;
    let estimatedFileSize = 0;
    // Smaller batch size to work around SDK putMany hang at ~30 docs
    const batchSize = options.batchSize || 3;
    const batch = [];
    // Size limit (default 40MB to stay safely under 50MB free tier with buffer)
    const maxSizeBytes = (options.maxSizeMB || 40) * 1024 * 1024;
    // The size cap is only enforced when no API key is configured.
    const enforceSizeLimit = !apiKey;
    // Resolved once so the status log names the model that is actually passed to putMany.
    const embeddingModel = options.embeddingModel || 'openai';
    // Content fingerprints to skip near-duplicates
    const contentHashes = new Set();

    const isSizeLimitError = (err) => Boolean(err?.message?.includes('exceeds') && err.message.includes('limit'));

    // Writes the pending batch. Resolves to true when the SDK rejected it with a
    // size-limit error, false on success; any other error is rethrown.
    const flushBatch = async (statusMessage) => {
        try {
            if (options.enableEmbedding) {
                log.status(statusMessage);
            }
            await mem.putMany(batch, options.enableEmbedding ? {
                enableEmbedding: true,
                embeddingModel,
            } : undefined);
            batch.length = 0;
            return false;
        }
        catch (err) {
            if (!isSizeLimitError(err)) {
                throw err;
            }
            // Drop the rejected docs so the final flush does not retry them.
            // The counters still include them, as before.
            // NOTE(review): whether putMany writes partially before throwing is not visible here.
            batch.length = 0;
            return true;
        }
    };

    for await (const result of results) {
        const markdown = result.extracted.markdown;
        // Simple content fingerprint to skip near-duplicates (first 2000 chars)
        const fingerprint = markdown.slice(0, 2000).replace(/\s+/g, ' ').trim();
        if (contentHashes.has(fingerprint)) {
            skippedDupes++;
            continue;
        }
        contentHashes.add(fingerprint);

        const doc = {
            title: result.extracted.title,
            label: options.label || 'web',
            text: markdown,
            metadata: {
                url: result.url,
                finalUrl: result.finalUrl,
                description: result.extracted.description,
                author: result.extracted.author,
                publishedDate: result.extracted.publishedDate,
                wordCount: result.extracted.wordCount,
                crawlDepth: result.depth,
                engine: result.engine,
                crawledAt: new Date().toISOString(),
            },
        };

        // Estimate document size (text + metadata + embeddings overhead ~3x)
        const docSize = markdown.length * 3;
        // Check if we'd exceed the limit (only if no API key)
        if (enforceSizeLimit && estimatedFileSize + docSize > maxSizeBytes) {
            stoppedAtLimit = true;
            log.warn(` Reached ~${Math.round(estimatedFileSize / 1024 / 1024)}MB limit. Set MEMVID_API_KEY for unlimited.`);
            break;
        }

        batch.push(doc);
        pages++;
        totalBytes += result.extracted.byteSize;
        estimatedFileSize += docSize;
        // Show progress (a missing title must not abort the crawl)
        log.progress(pages, pages, (result.extracted.title || '').slice(0, 40));

        // Flush batch
        if (batch.length >= batchSize) {
            const rejected = await flushBatch(`Embedding batch ${Math.ceil(pages / batchSize)} (${embeddingModel})...`);
            if (rejected) {
                stoppedAtLimit = true;
                break;
            }
            // Check actual file size after flush (only if no API key)
            if (enforceSizeLimit) {
                const currentSize = await getFileSize(options.output);
                if (currentSize > maxSizeBytes) {
                    stoppedAtLimit = true;
                    log.warn(` Reached ${Math.round(currentSize / 1024 / 1024)}MB limit. Set MEMVID_API_KEY for unlimited.`);
                    break;
                }
            }
        }
    }

    // Flush remaining docs. This also runs after an estimated-limit stop: the
    // docs still pending at that point were accepted under the limit and are
    // already counted in `pages`, so they must be written too.
    if (batch.length > 0) {
        const rejected = await flushBatch(`Embedding final batch (${embeddingModel})...`);
        if (rejected) {
            stoppedAtLimit = true;
        }
    }

    const duration = Date.now() - startTime;
    return {
        pages,
        bytes: totalBytes,
        duration,
        stoppedAtLimit,
        skippedDupes,
        memoryId,
    };
}
|
|
185
|
+
/**
 * Ingest git repo files into an MV2 file.
 *
 * Opens `options.output` when it already exists, otherwise creates it. When
 * MEMVID_API_KEY is set the SDK is configured with it and, if the output file
 * does not exist yet and no `options.memoryId` was given, a cloud memory is
 * created and bound to the new file. Without an API key, ingestion stops once
 * the estimated or actual file size passes `options.maxSizeMB` (default 40).
 *
 * @param files async iterable of files; each item is read for `path`,
 *   `language`, `content` and `size`
 * @param options `output` (file path), plus optional `memoryId`, `memoryName`,
 *   `label`, `batchSize` (default 3), `maxSizeMB`, `enableEmbedding`,
 *   `embeddingModel`
 * @returns `{ files, bytes, duration, memoryId, stoppedAtLimit }`
 * @throws rethrows any `putMany` error that is not a size-limit error
 */
export async function ingestGitToMv2(files, options) {
    const { create, use, configure, createMemory } = await getSDK();
    const startTime = Date.now();
    const apiKey = process.env.MEMVID_API_KEY;
    const dashboardUrl = process.env.MEMVID_DASHBOARD_URL || 'https://memvid.com';
    // Checked once so the "auto-create memory" and "open vs create" decisions agree.
    const outputExists = existsSync(options.output);
    let memoryId = options.memoryId;

    // Configure SDK if API key is set
    if (apiKey) {
        configure({ apiKey, dashboardUrl });
        // Auto-create memory if API key is set but no memoryId provided
        if (!memoryId && !outputExists) {
            try {
                const memoryName = options.memoryName || `maw-repo-${Date.now()}`;
                log.dim(` Creating cloud memory: ${memoryName}`);
                const memory = await createMemory({
                    name: memoryName,
                    description: `Git repo ingested by maw CLI`,
                });
                memoryId = memory.id;
                log.dim(` Memory ID: ${memoryId}`);
            }
            catch (err) {
                // Continue without cloud binding
                log.warn(` Could not create cloud memory: ${err.message}`);
            }
        }
    }

    // Create or open the MV2 file
    let mem;
    if (outputExists) {
        mem = await use('basic', options.output);
        if (apiKey && memoryId) {
            try {
                await mem.syncTickets(memoryId, apiKey, dashboardUrl);
            }
            catch (err) {
                // Best effort: keep working on the local file, but say so.
                log.warn(` Could not sync tickets, continuing with local file: ${err?.message ?? err}`);
            }
        }
    }
    else {
        const createOpts = {};
        if (apiKey && memoryId) {
            createOpts.memoryId = memoryId;
            createOpts.memvidApiKey = apiKey;
        }
        mem = await create(options.output, 'basic', createOpts);
    }

    let fileCount = 0;
    let totalBytes = 0;
    let stoppedAtLimit = false;
    let estimatedFileSize = 0;
    // Smaller batch size to work around SDK putMany hang; overridable like in ingestToMv2
    const batchSize = options.batchSize || 3;
    const batch = [];
    // Size limit (default 40MB to stay safely under 50MB free tier)
    const maxSizeBytes = (options.maxSizeMB || 40) * 1024 * 1024;
    // The size cap is only enforced when no API key is configured.
    const enforceSizeLimit = !apiKey;
    // Resolved once so the status log names the model that is actually passed to putMany.
    const embeddingModel = options.embeddingModel || 'openai';
    // Embedding overhead: ~6KB per chunk for 1536-dim vectors (OpenAI), less for smaller models
    const embeddingOverhead = options.enableEmbedding ? 6000 : 0;

    // Path classifiers, hoisted out of the per-file loop.
    const readmePattern = /readme\.md$/i;
    const docsDirPattern = /^(docs|documentation)\//i;
    const docsExtPattern = /\.(md|mdx|rst)$/i;

    const isSizeLimitError = (err) => Boolean(err?.message?.includes('exceeds') && err.message.includes('limit'));

    // Writes the pending batch. Resolves to true when the SDK rejected it with a
    // size-limit error, false on success; any other error is rethrown.
    const flushBatch = async (statusMessage) => {
        try {
            if (options.enableEmbedding) {
                log.status(statusMessage);
            }
            await mem.putMany(batch, options.enableEmbedding ? {
                enableEmbedding: true,
                embeddingModel,
            } : undefined);
            batch.length = 0;
            return false;
        }
        catch (err) {
            if (!isSizeLimitError(err)) {
                throw err;
            }
            // Drop the rejected docs so the final flush does not retry them.
            // The counters still include them, as before.
            // NOTE(review): whether putMany writes partially before throwing is not visible here.
            batch.length = 0;
            return true;
        }
    };

    for await (const file of files) {
        // Check if this is a README or documentation file
        const isReadme = readmePattern.test(file.path);
        const isDocs = docsDirPattern.test(file.path) || docsExtPattern.test(file.path);

        // Include file path and language as context ahead of the actual content
        let text = `File: ${file.path}\nLanguage: ${file.language}\n\n${file.content}`;
        // For README files, prepend searchable context to improve retrieval
        if (isReadme) {
            // Only a root-level README contributes its first "# heading" as the project name.
            const projectName = file.path.includes('/') ? '' : file.content.match(/^#\s+(.+)/m)?.[1] || '';
            text = `Project Overview: ${projectName}\nThis is the main README documentation.\nIntroduction and description of the project.\n\n${text}`;
        }

        const doc = {
            title: `${file.path} (${file.language})`,
            label: options.label || 'code',
            text,
            uri: `file://${file.path}`, // Use file path as URI for scope filtering
            metadata: {
                path: file.path,
                language: file.language,
                size: file.size,
                type: isReadme ? 'readme' : (isDocs ? 'docs' : 'code'),
                isReadme,
                ingestedAt: new Date().toISOString(),
            },
        };
        // Add labels for better categorization
        if (isReadme) {
            doc.labels = ['README', 'Documentation', 'Overview', 'Introduction'];
        }
        else if (isDocs) {
            doc.labels = ['Documentation'];
        }

        // Estimate document size (text + metadata + embedding vectors)
        const docSize = text.length * 2 + embeddingOverhead;
        // Check if we'd exceed the limit (only if no API key)
        if (enforceSizeLimit && estimatedFileSize + docSize > maxSizeBytes) {
            stoppedAtLimit = true;
            log.warn(` Reached ~${Math.round(estimatedFileSize / 1024 / 1024)}MB limit. Set MEMVID_API_KEY for unlimited.`);
            break;
        }

        batch.push(doc);
        fileCount++;
        totalBytes += file.size;
        estimatedFileSize += docSize;
        // Show progress
        log.progress(fileCount, fileCount, file.path.slice(-40));

        // Flush batch
        if (batch.length >= batchSize) {
            const rejected = await flushBatch(`Embedding ${fileCount} files (${embeddingModel})...`);
            if (rejected) {
                stoppedAtLimit = true;
                break;
            }
            // Check actual file size after flush (only if no API key)
            if (enforceSizeLimit) {
                const currentSize = await getFileSize(options.output);
                if (currentSize > maxSizeBytes) {
                    stoppedAtLimit = true;
                    log.warn(` Reached ${Math.round(currentSize / 1024 / 1024)}MB limit. Set MEMVID_API_KEY for unlimited.`);
                    break;
                }
            }
        }
    }

    // Flush remaining docs. This also runs after an estimated-limit stop: the
    // docs still pending at that point were accepted under the limit and are
    // already counted in `fileCount`, so they must be written too.
    if (batch.length > 0) {
        const rejected = await flushBatch(`Embedding final ${batch.length} files (${embeddingModel})...`);
        if (rejected) {
            stoppedAtLimit = true;
        }
    }

    const duration = Date.now() - startTime;
    return {
        files: fileCount,
        bytes: totalBytes,
        duration,
        memoryId,
        stoppedAtLimit,
    };
}
|
|
355
|
+
/**
 * Report the size in bytes of the file at `path`.
 *
 * Any failure of `stat` (for example, the file does not exist) yields 0
 * instead of an exception.
 *
 * @param path file system path to inspect
 * @returns the `size` field from `stat`, or 0 when `stat` fails
 */
export async function getFileSize(path) {
    let size = 0;
    try {
        ({ size } = await stat(path));
    }
    catch {
        size = 0;
    }
    return size;
}
|
|
367
|
+
/**
 * Open an existing MV2 file for querying.
 *
 * @param path location of the .mv2 file
 * @returns whatever the SDK's `use('basic', path)` call produces
 */
export async function openMv2(path) {
    const { use: openMemory } = await getSDK();
    return openMemory('basic', path);
}
|
|
374
|
+
/**
 * Search in an MV2 file.
 *
 * The query embedding model is `options.embeddingModel` when given, otherwise
 * 'openai' if OPENAI_API_KEY is set. With a model the search mode is 'auto';
 * without one it is 'lex'.
 *
 * @param path location of the .mv2 file
 * @param query search text handed to `find`
 * @param options optional `k` (default 10) and `embeddingModel`
 * @returns the result of the memory's `find` call
 */
export async function searchMv2(path, query, options = {}) {
    const memory = await openMv2(path);

    // Pick the model used to embed the query, if any.
    let queryEmbeddingModel = options.embeddingModel;
    if (!queryEmbeddingModel) {
        queryEmbeddingModel = process.env.OPENAI_API_KEY ? 'openai' : undefined;
    }

    const findOptions = {
        k: options.k || 10,
        mode: queryEmbeddingModel ? 'auto' : 'lex',
        queryEmbeddingModel,
    };
    return memory.find(query, findOptions);
}
|
|
389
|
+
/**
 * Detect if a question is asking about what something is/does.
 *
 * The question is trimmed and lower-cased before matching, so leading
 * whitespace or capitalisation does not defeat the start-anchored patterns.
 * A null/undefined question is treated as an empty string.
 *
 * @param question the user's question
 * @returns true when the question starts with "what is/does/are", "explain",
 *   "describe", "tell me about" or "how does ... work", or mentions
 *   "overview", "introduction" or "getting started" anywhere
 */
function isOverviewQuestion(question) {
    const lowerQ = String(question ?? '').trim().toLowerCase();
    // Input is already lower-cased, so no case-insensitive flag is needed.
    return (/^what (is|does|are)\b/.test(lowerQ) ||
        /^(explain|describe|tell me about)\b/.test(lowerQ) ||
        /^how does .+ work/.test(lowerQ) ||
        /overview|introduction|getting started/.test(lowerQ));
}
|
|
399
|
+
/**
 * Ask a question using an MV2 file.
 *
 * Overview-style questions (per isOverviewQuestion) get a larger default `k`
 * and a larger LLM context budget. The query embedding model is
 * `options.embeddingModel` when given, otherwise 'openai' if OPENAI_API_KEY is
 * set; with a model the mode is 'auto', without one it is 'lex'.
 *
 * @param path location of the .mv2 file
 * @param question question text handed to `ask`
 * @param options optional `k`, `model` (default 'gpt-4o-mini'), `apiKey`
 *   (falls back to OPENAI_API_KEY) and `embeddingModel`
 * @returns the result of the memory's `ask` call
 */
export async function askMv2(path, question, options = {}) {
    const memory = await openMv2(path);

    // Overview questions pull more hits and more context so README chunks make it in.
    const overview = isOverviewQuestion(question);
    const defaultK = overview ? 15 : 8;
    const contextChars = overview ? 15000 : 8000;

    // Pick the model used to embed the query, if any.
    let queryEmbeddingModel = options.embeddingModel;
    if (!queryEmbeddingModel) {
        queryEmbeddingModel = process.env.OPENAI_API_KEY ? 'openai' : undefined;
    }

    const askOptions = {
        model: options.model || 'gpt-4o-mini',
        modelApiKey: options.apiKey || process.env.OPENAI_API_KEY,
        k: options.k || defaultK,
        llmContextChars: contextChars,
        // 'auto' when a query embedding model is available, 'lex' otherwise
        mode: queryEmbeddingModel ? 'auto' : 'lex',
        queryEmbeddingModel,
    };
    return memory.ask(question, askOptions);
}
|
|
420
|
+
/**
 * List documents in an MV2 file.
 *
 * Prefers the memory's `timeline` method when it exists and otherwise falls
 * back to `find` with an empty query.
 *
 * @param path location of the .mv2 file
 * @param options optional `limit` (default 100)
 * @returns the result of `timeline({ limit })` or `find('', { k: limit })`
 */
export async function listMv2(path, options = {}) {
    const memory = await openMv2(path);
    const limit = options.limit || 100;
    return typeof memory.timeline === 'function'
        ? memory.timeline({ limit })
        : memory.find('', { k: limit });
}
|
|
432
|
+
/**
 * Export documents from an MV2 file with full content.
 *
 * Reads the timeline, then fetches the content of every frame that either has
 * child frames or is not listed as a child of any other frame. Frames that
 * are only children are skipped because their parent is exported instead.
 *
 * @param path location of the .mv2 file
 * @param options optional `limit` on the number of timeline frames (default 10000)
 * @returns array of `{ title, uri, content }`, in timeline order; `title` is
 *   the frame URI without its `file://` / `http(s)://` prefix, or
 *   `Frame <id>` when the frame has no URI
 */
export async function exportMv2(path, options = {}) {
    const mem = await openMv2(path);
    // Get frame list; the timeline may be the frame array itself or wrap it in `.frames`.
    const timeline = await mem.timeline({ limit: options.limit || 10000 });
    const frames = timeline?.frames || timeline || [];

    // Collect every child frame id once, instead of rescanning all frames per frame.
    const childIds = new Set();
    for (const frame of frames) {
        for (const childId of frame.child_frames || []) {
            childIds.add(childId);
        }
    }

    // Get full content for each parent or standalone frame
    const results = [];
    for (const frame of frames) {
        const hasChildren = Boolean(frame.child_frames && frame.child_frames.length > 0);
        // Skip frames that are only children (they're included in their parent)
        if (!hasChildren && childIds.has(frame.frame_id)) {
            continue;
        }
        try {
            const content = await mem.view(frame.frame_id);
            const uri = frame.uri || '';
            const title = uri.replace('file://', '').replace(/^https?:\/\//, '') || `Frame ${frame.frame_id}`;
            results.push({ title, uri, content });
        }
        catch {
            // Frame might not exist, skip
        }
    }
    return results;
}
|
|
471
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/ingestor/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC;AAChC,OAAO,EAAE,IAAI,EAAE,MAAM,aAAa,CAAC;AAEnC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAElD,MAAM,GAAG,GAAG,YAAY,EAAE,CAAC;AAE3B,iCAAiC;AACjC,IAAI,SAAS,GAAQ,IAAI,CAAC;AAE1B,KAAK,UAAU,MAAM;IACnB,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,SAAS,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,CAAC;IAC1C,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAsBD;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,OAAmC,EACnC,OAAsB;IAEtB,MAAM,GAAG,GAAG,MAAM,MAAM,EAAE,CAAC;IAC3B,MAAM,EAAE,MAAM,EAAE,GAAG,EAAE,SAAS,EAAE,YAAY,EAAE,GAAG,GAAG,CAAC;IACrD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE7B,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC;IAC1C,MAAM,YAAY,GAAG,OAAO,CAAC,GAAG,CAAC,oBAAoB,IAAI,oBAAoB,CAAC;IAE9E,IAAI,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC;IAEhC,kCAAkC;IAClC,IAAI,MAAM,EAAE,CAAC;QACX,SAAS,CAAC;YACR,MAAM;YACN,YAAY;SACb,CAAC,CAAC;QAEH,gEAAgE;QAChE,IAAI,CAAC,QAAQ,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;YAC7C,IAAI,CAAC;gBACH,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,IAAI,OAAO,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC;gBAC7D,GAAG,CAAC,GAAG,CAAC,4BAA4B,UAAU,EAAE,CAAC,CAAC;gBAClD,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC;oBAChC,IAAI,EAAE,UAAU;oBAChB,WAAW,EAAE,oBAAoB;iBAClC,CAAC,CAAC;gBACH,QAAQ,GAAG,MAAM,CAAC,EAAE,CAAC;gBACrB,GAAG,CAAC,GAAG,CAAC,gBAAgB,QAAQ,EAAE,CAAC,CAAC;YACtC,CAAC;YAAC,OAAO,GAAQ,EAAE,CAAC;gBAClB,0DAA0D;gBAC1D,GAAG,CAAC,IAAI,CAAC,oCAAoC,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;YAC9D,CAAC;QACH,CAAC;IACH,CAAC;IAED,0DAA0D;IAC1D,IAAI,GAAG,CAAC;IACR,IAAI,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAC/B,GAAG,GAAG,MAAM,GAAG,CAAC,OAAO,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;QACzC,gDAAgD;QAChD,IAAI,MAAM,IAAI,QAAQ,EAAE,CAAC;YACvB,IAAI,CAAC;gBACH,MAAM,GAAG,CAAC,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,YAAY,CAAC,CAAC;YACxD,CAAC;YAAC,MAAM,CAAC;gBACP,mCAAmC;YACrC,CAAC;QACH,CAAC;IACH,CAAC;SAAM,CAAC;QACN,oDAAoD;QACpD,MAAM,UAAU,GAAQ,EAAE,CAAC;QAC3B,IAAI,MAAM,IAAI,QAAQ,EAAE,CAAC;YACvB,UAAU,CAAC,QAAQ,GAAG,QAAQ,CAAC;YAC/B,UAAU,CAAC,YAAY,G
AAG,MAAM,CAAC;QACnC,CAAC;QACD,GAAG,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,MAAM,EAAE,OAAO,EAAE,UAAU,CAAC,CAAC;IAC1D,CAAC;IAED,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,IAAI,YAAY,GAAG,CAAC,CAAC;IACrB,IAAI,cAAc,GAAG,KAAK,CAAC;IAC3B,IAAI,iBAAiB,GAAG,CAAC,CAAC;IAC1B,iEAAiE;IACjE,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,CAAC,CAAC;IACzC,MAAM,KAAK,GAAyF,EAAE,CAAC;IAEvG,4EAA4E;IAC5E,MAAM,YAAY,GAAG,CAAC,OAAO,CAAC,SAAS,IAAI,EAAE,CAAC,GAAG,IAAI,GAAG,IAAI,CAAC;IAC7D,MAAM,SAAS,GAAG,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC;IAE/C,+CAA+C;IAC/C,MAAM,aAAa,GAAG,IAAI,GAAG,EAAU,CAAC;IAExC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QACnC,MAAM,QAAQ,GAAG,MAAM,CAAC,SAAS,CAAC,QAAQ,CAAC;QAE3C,wEAAwE;QACxE,MAAM,WAAW,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QACxE,IAAI,aAAa,CAAC,GAAG,CAAC,WAAW,CAAC,EAAE,CAAC;YACnC,YAAY,EAAE,CAAC;YACf,SAAS;QACX,CAAC;QACD,aAAa,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC;QAE/B,MAAM,GAAG,GAAG;YACV,KAAK,EAAE,MAAM,CAAC,SAAS,CAAC,KAAK;YAC7B,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,KAAK;YAC7B,IAAI,EAAE,QAAQ;YACd,QAAQ,EAAE;gBACR,GAAG,EAAE,MAAM,CAAC,GAAG;gBACf,QAAQ,EAAE,MAAM,CAAC,QAAQ;gBACzB,WAAW,EAAE,MAAM,CAAC,SAAS,CAAC,WAAW;gBACzC,MAAM,EAAE,MAAM,CAAC,SAAS,CAAC,MAAM;gBAC/B,aAAa,EAAE,MAAM,CAAC,SAAS,CAAC,aAAa;gBAC7C,SAAS,EAAE,MAAM,CAAC,SAAS,CAAC,SAAS;gBACrC,UAAU,EAAE,MAAM,CAAC,KAAK;gBACxB,MAAM,EAAE,MAAM,CAAC,MAAM;gBACrB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACpC;SACF,CAAC;QAEF,qEAAqE;QACrE,MAAM,OAAO,GAAG,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC;QAEpC,sDAAsD;QACtD,IAAI,CAAC,SAAS,IAAI,iBAAiB,GAAG,OAAO,GAAG,YAAY,EAAE,CAAC;YAC7D,cAAc,GAAG,IAAI,CAAC;YACtB,GAAG,CAAC,IAAI,CAAC,cAAc,IAAI,CAAC,KAAK,CAAC,iBAAiB,GAAG,IAAI,GAAG,IAAI,CAAC,6CAA6C,CAAC,CAAC;YACjH,MAAM;QACR,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAChB,KAAK,EAAE,CAAC;QACR,UAAU,IAAI,MAAM,CAAC,SAAS,CAAC,QAAQ,CAAC;QACxC,iBAAiB,IAAI,OAAO,CAAC;QAE7B,gBAAgB;QAChB,GAAG,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,EAAE,MAAM,CAAC,SAAS,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;QAEhE,cAAc;QACd,IAAI,K
AAK,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;YAC9B,IAAI,CAAC;gBACH,IAAI,OAAO,CAAC,eAAe,EAAE,CAAC;oBAC5B,GAAG,CAAC,MAAM,CAAC,mBAAmB,IAAI,CAAC,IAAI,CAAC,KAAK,GAAG,SAAS,CAAC,KAAK,OAAO,CAAC,cAAc,IAAI,WAAW,MAAM,CAAC,CAAC;gBAC9G,CAAC;gBACD,MAAM,GAAG,CAAC,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,eAAe,CAAC,CAAC,CAAC;oBACjD,eAAe,EAAE,IAAI;oBACrB,cAAc,EAAE,OAAO,CAAC,cAAc,IAAI,QAAQ;iBACnD,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;gBACf,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC;gBAEjB,0DAA0D;gBAC1D,IAAI,CAAC,SAAS,EAAE,CAAC;oBACf,MAAM,WAAW,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;oBACtD,IAAI,WAAW,GAAG,YAAY,EAAE,CAAC;wBAC/B,cAAc,GAAG,IAAI,CAAC;wBACtB,MAAM;oBACR,CAAC;gBACH,CAAC;YACH,CAAC;YAAC,OAAO,GAAQ,EAAE,CAAC;gBAClB,yCAAyC;gBACzC,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;oBACvE,cAAc,GAAG,IAAI,CAAC;oBACtB,MAAM;gBACR,CAAC;gBACD,MAAM,GAAG,CAAC;YACZ,CAAC;QACH,CAAC;IACH,CAAC;IAED,wCAAwC;IACxC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,cAAc,EAAE,CAAC;QACxC,IAAI,CAAC;YACH,IAAI,OAAO,CAAC,eAAe,EAAE,CAAC;gBAC5B,GAAG,CAAC,MAAM,CAAC,0BAA0B,OAAO,CAAC,cAAc,IAAI,WAAW,MAAM,CAAC,CAAC;YACpF,CAAC;YACD,MAAM,GAAG,CAAC,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,eAAe,CAAC,CAAC,CAAC;gBACjD,eAAe,EAAE,IAAI;gBACrB,cAAc,EAAE,OAAO,CAAC,cAAc,IAAI,QAAQ;aACnD,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QACjB,CAAC;QAAC,OAAO,GAAQ,EAAE,CAAC;YAClB,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;gBACvE,cAAc,GAAG,IAAI,CAAC;YACxB,CAAC;iBAAM,CAAC;gBACN,MAAM,GAAG,CAAC;YACZ,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;IAExC,OAAO;QACL,KAAK;QACL,KAAK,EAAE,UAAU;QACjB,QAAQ;QACR,cAAc;QACd,YAAY;QACZ,QAAQ;KACT,CAAC;AACJ,CAAC;AAoBD;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,KAAuF,EACvF,OAAyB;IAEzB,MAAM,GAAG,GAAG,MAAM,MAAM,EAAE,CAAC;IAC3B,MAAM,EAAE,MAAM,EAAE,GAAG,EAAE,SAAS,EAAE,YAAY,EAAE,GAAG,GAAG,CAAC;IACrD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE7B,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC;IAC1C,MAAM,YAAY,GAAG,OAAO,CAAC,GAAG,CAAC,oBAAoB,IAAI,oBAAoB,
CAAC;IAE9E,IAAI,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC;IAEhC,kCAAkC;IAClC,IAAI,MAAM,EAAE,CAAC;QACX,SAAS,CAAC,EAAE,MAAM,EAAE,YAAY,EAAE,CAAC,CAAC;QAEpC,gEAAgE;QAChE,IAAI,CAAC,QAAQ,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;YAC7C,IAAI,CAAC;gBACH,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,IAAI,YAAY,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC;gBAClE,GAAG,CAAC,GAAG,CAAC,4BAA4B,UAAU,EAAE,CAAC,CAAC;gBAClD,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC;oBAChC,IAAI,EAAE,UAAU;oBAChB,WAAW,EAAE,8BAA8B;iBAC5C,CAAC,CAAC;gBACH,QAAQ,GAAG,MAAM,CAAC,EAAE,CAAC;gBACrB,GAAG,CAAC,GAAG,CAAC,gBAAgB,QAAQ,EAAE,CAAC,CAAC;YACtC,CAAC;YAAC,OAAO,GAAQ,EAAE,CAAC;gBAClB,GAAG,CAAC,IAAI,CAAC,oCAAoC,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;YAC9D,CAAC;QACH,CAAC;IACH,CAAC;IAED,8BAA8B;IAC9B,IAAI,GAAG,CAAC;IACR,IAAI,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAC/B,GAAG,GAAG,MAAM,GAAG,CAAC,OAAO,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;QACzC,IAAI,MAAM,IAAI,QAAQ,EAAE,CAAC;YACvB,IAAI,CAAC;gBACH,MAAM,GAAG,CAAC,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,YAAY,CAAC,CAAC;YACxD,CAAC;YAAC,MAAM,CAAC,CAAA,CAAC;QACZ,CAAC;IACH,CAAC;SAAM,CAAC;QACN,MAAM,UAAU,GAAQ,EAAE,CAAC;QAC3B,IAAI,MAAM,IAAI,QAAQ,EAAE,CAAC;YACvB,UAAU,CAAC,QAAQ,GAAG,QAAQ,CAAC;YAC/B,UAAU,CAAC,YAAY,GAAG,MAAM,CAAC;QACnC,CAAC;QACD,GAAG,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,MAAM,EAAE,OAAO,EAAE,UAAU,CAAC,CAAC;IAC1D,CAAC;IAED,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,IAAI,cAAc,GAAG,KAAK,CAAC;IAC3B,IAAI,iBAAiB,GAAG,CAAC,CAAC;IAC1B,qDAAqD;IACrD,MAAM,SAAS,GAAG,CAAC,CAAC;IACpB,MAAM,KAAK,GAAyF,EAAE,CAAC;IAEvG,gEAAgE;IAChE,MAAM,YAAY,GAAG,CAAC,OAAO,CAAC,SAAS,IAAI,EAAE,CAAC,GAAG,IAAI,GAAG,IAAI,CAAC;IAC7D,MAAM,SAAS,GAAG,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC;IAE/C,4FAA4F;IAC5F,MAAM,iBAAiB,GAAG,OAAO,CAAC,eAAe,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IAE7D,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QAC/B,kDAAkD;QAClD,MAAM,QAAQ,GAAG,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChD,MAAM,MAAM,GAAG,0BAA0B,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,kBAAkB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAEhG,4DAA4D;QAC5D,IAAI,IAAI,GAAG,SAAS,IAAI,CAAC,
IAAI,eAAe,IAAI,CAAC,QAAQ,OAAO,IAAI,CAAC,OAAO,EAAE,CAAC;QAE/E,+CAA+C;QAC/C,IAAI,QAAQ,EAAE,CAAC;YACb,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,YAAY,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAC/F,IAAI,GAAG,qBAAqB,WAAW,6FAA6F,IAAI,EAAE,CAAC;QAC7I,CAAC;QAED,oCAAoC;QACpC,6DAA6D;QAC7D,0DAA0D;QAC1D,MAAM,GAAG,GAAQ;YACf,KAAK,EAAE,GAAG,IAAI,CAAC,IAAI,KAAK,IAAI,CAAC,QAAQ,GAAG;YACxC,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,MAAM;YAC9B,IAAI;YACJ,GAAG,EAAE,UAAU,IAAI,CAAC,IAAI,EAAE,EAAE,2CAA2C;YACvE,QAAQ,EAAE;gBACR,IAAI,EAAE,IAAI,CAAC,IAAI;gBACf,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,IAAI,EAAE,IAAI,CAAC,IAAI;gBACf,IAAI,EAAE,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;gBACtD,QAAQ;gBACR,UAAU,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACrC;SACF,CAAC;QAEF,uCAAuC;QACvC,IAAI,QAAQ,EAAE,CAAC;YACb,GAAG,CAAC,MAAM,GAAG,CAAC,QAAQ,EAAE,eAAe,EAAE,UAAU,EAAE,cAAc,CAAC,CAAC;QACvE,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,GAAG,CAAC,MAAM,GAAG,CAAC,eAAe,CAAC,CAAC;QACjC,CAAC;QAED,+DAA+D;QAC/D,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,GAAG,iBAAiB,CAAC;QAEpD,sDAAsD;QACtD,IAAI,CAAC,SAAS,IAAI,iBAAiB,GAAG,OAAO,GAAG,YAAY,EAAE,CAAC;YAC7D,cAAc,GAAG,IAAI,CAAC;YACtB,GAAG,CAAC,IAAI,CAAC,cAAc,IAAI,CAAC,KAAK,CAAC,iBAAiB,GAAG,IAAI,GAAG,IAAI,CAAC,6CAA6C,CAAC,CAAC;YACjH,MAAM;QACR,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAChB,SAAS,EAAE,CAAC;QACZ,UAAU,IAAI,IAAI,CAAC,IAAI,CAAC;QACxB,iBAAiB,IAAI,OAAO,CAAC;QAE7B,gBAAgB;QAChB,GAAG,CAAC,QAAQ,CAAC,SAAS,EAAE,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QAEzD,cAAc;QACd,IAAI,KAAK,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;YAC9B,IAAI,CAAC;gBACH,IAAI,OAAO,CAAC,eAAe,EAAE,CAAC;oBAC5B,GAAG,CAAC,MAAM,CAAC,aAAa,SAAS,WAAW,OAAO,CAAC,cAAc,IAAI,WAAW,MAAM,CAAC,CAAC;gBAC3F,CAAC;gBACD,MAAM,GAAG,CAAC,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,eAAe,CAAC,CAAC,CAAC;oBACjD,eAAe,EAAE,IAAI;oBACrB,cAAc,EAAE,OAAO,CAAC,cAAc,IAAI,QAAQ;iBACnD,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;gBACf,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC;gBAEj
B,0DAA0D;gBAC1D,IAAI,CAAC,SAAS,EAAE,CAAC;oBACf,MAAM,WAAW,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;oBACtD,IAAI,WAAW,GAAG,YAAY,EAAE,CAAC;wBAC/B,cAAc,GAAG,IAAI,CAAC;wBACtB,GAAG,CAAC,IAAI,CAAC,aAAa,IAAI,CAAC,KAAK,CAAC,WAAW,GAAG,IAAI,GAAG,IAAI,CAAC,6CAA6C,CAAC,CAAC;wBAC1G,MAAM;oBACR,CAAC;gBACH,CAAC;YACH,CAAC;YAAC,OAAO,GAAQ,EAAE,CAAC;gBAClB,yCAAyC;gBACzC,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;oBACvE,cAAc,GAAG,IAAI,CAAC;oBACtB,MAAM;gBACR,CAAC;gBACD,MAAM,GAAG,CAAC;YACZ,CAAC;QACH,CAAC;IACH,CAAC;IAED,wCAAwC;IACxC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,cAAc,EAAE,CAAC;QACxC,IAAI,CAAC;YACH,IAAI,OAAO,CAAC,eAAe,EAAE,CAAC;gBAC5B,GAAG,CAAC,MAAM,CAAC,mBAAmB,KAAK,CAAC,MAAM,WAAW,OAAO,CAAC,cAAc,IAAI,WAAW,MAAM,CAAC,CAAC;YACpG,CAAC;YACD,MAAM,GAAG,CAAC,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,eAAe,CAAC,CAAC,CAAC;gBACjD,eAAe,EAAE,IAAI;gBACrB,cAAc,EAAE,OAAO,CAAC,cAAc,IAAI,QAAQ;aACnD,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QACjB,CAAC;QAAC,OAAO,GAAQ,EAAE,CAAC;YAClB,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;gBACvE,cAAc,GAAG,IAAI,CAAC;YACxB,CAAC;iBAAM,CAAC;gBACN,MAAM,GAAG,CAAC;YACZ,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;IAExC,OAAO;QACL,KAAK,EAAE,SAAS;QAChB,KAAK,EAAE,UAAU;QACjB,QAAQ;QACR,QAAQ;QACR,cAAc;KACf,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,IAAY;IAC5C,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,CAAC;QAC/B,OAAO,KAAK,CAAC,IAAI,CAAC;IACpB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,CAAC,CAAC;IACX,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,IAAY;IACxC,MAAM,EAAE,GAAG,EAAE,GAAG,MAAM,MAAM,EAAE,CAAC;IAC/B,OAAO,GAAG,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;AAC5B,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,IAAY,EACZ,KAAa,EACb,UAAmD,EAAE;IAErD,MAAM,GAAG,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;IAEhC,iFAAiF;IACjF,MAAM,mBAAmB,GAAG,OAAO,CAAC,cAAc,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;IAC1G,
MAAM,IAAI,GAAG,mBAAmB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC;IAElD,OAAO,GAAG,CAAC,IAAI,CAAC,KAAK,EAAE;QACrB,CAAC,EAAE,OAAO,CAAC,CAAC,IAAI,EAAE;QAClB,IAAI;QACJ,mBAAmB;KACpB,CAAC,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,kBAAkB,CAAC,QAAgB;IAC1C,MAAM,MAAM,GAAG,QAAQ,CAAC,WAAW,EAAE,CAAC;IACtC,OAAO,CACL,uBAAuB,CAAC,IAAI,CAAC,MAAM,CAAC;QACpC,qCAAqC,CAAC,IAAI,CAAC,MAAM,CAAC;QAClD,mBAAmB,CAAC,IAAI,CAAC,MAAM,CAAC;QAChC,wCAAwC,CAAC,IAAI,CAAC,MAAM,CAAC,CACtD,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,MAAM,CAC1B,IAAY,EACZ,QAAgB,EAChB,UAAoF,EAAE;IAEtF,MAAM,GAAG,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;IAEhC,2FAA2F;IAC3F,MAAM,UAAU,GAAG,kBAAkB,CAAC,QAAQ,CAAC,CAAC;IAChD,MAAM,UAAU,GAAG,OAAO,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAEtD,sFAAsF;IACtF,MAAM,mBAAmB,GAAG,OAAO,CAAC,cAAc,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;IAC1G,MAAM,IAAI,GAAG,mBAAmB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,sDAAsD;IAEzG,OAAO,GAAG,CAAC,GAAG,CAAC,QAAQ,EAAE;QACvB,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,aAAa;QACrC,WAAW,EAAE,OAAO,CAAC,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,cAAc;QACzD,CAAC,EAAE,UAAU;QACb,eAAe,EAAE,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE,sCAAsC;QAClF,IAAI;QACJ,mBAAmB;KACpB,CAAC,CAAC;AACL,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO,CAC3B,IAAY,EACZ,UAA+C,EAAE;IAEjD,MAAM,GAAG,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;IAChC,2CAA2C;IAC3C,IAAI,OAAO,GAAG,CAAC,QAAQ,KAAK,UAAU,EAAE,CAAC;QACvC,OAAO,GAAG,CAAC,QAAQ,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,GAAG,EAAE,CAAC,CAAC;IACvD,CAAC;IACD,oCAAoC;IACpC,OAAO,GAAG,CAAC,IAAI,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,OAAO,CAAC,KAAK,IAAI,GAAG,EAAE,CAAC,CAAC;AACnD,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,IAAY,EACZ,UAA8B,EAAE;IAEhC,MAAM,GAAG,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;IAEhC,iBAAiB;IACjB,MAAM,QAAQ,GAAG,MAAM,GAAG,CAAC,QAAQ,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,KAAK,EAAE,CAAC,CAAC;IACvE,MAAM,MAAM,GAAG,QAAQ,CAAC,MAAM,IAAI,QAAQ,CAAC;IAE3C,kCAAkC;IAClC,MAAM,OAAO,GAA2D,EAAE,CAAC;IAE3E,
KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,iDAAiD;QACjD,IAAI,KAAK,CAAC,YAAY,IAAI,KAAK,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACxD,gDAAgD;YAChD,IAAI,CAAC;gBACH,MAAM,OAAO,GAAG,MAAM,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;gBAC/C,MAAM,GAAG,GAAG,KAAK,CAAC,GAAG,IAAI,EAAE,CAAC;gBAC5B,MAAM,KAAK,GAAG,GAAG,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC,IAAI,SAAS,KAAK,CAAC,QAAQ,EAAE,CAAC;gBAClG,OAAO,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,OAAO,EAAE,CAAC,CAAC;YACxC,CAAC;YAAC,MAAM,CAAC;gBACP,8BAA8B;YAChC,CAAC;QACH,CAAC;aAAM,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,YAAY,EAAE,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC;YAC9E,sDAAsD;YACtD,IAAI,CAAC;gBACH,MAAM,OAAO,GAAG,MAAM,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;gBAC/C,MAAM,GAAG,GAAG,KAAK,CAAC,GAAG,IAAI,EAAE,CAAC;gBAC5B,MAAM,KAAK,GAAG,GAAG,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC,IAAI,SAAS,KAAK,CAAC,QAAQ,EAAE,CAAC;gBAClG,OAAO,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,OAAO,EAAE,CAAC,CAAC;YACxC,CAAC;YAAC,MAAM,CAAC;gBACP,8BAA8B;YAChC,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smart duplicate detection for web crawling
|
|
3
|
+
* - Detects localized URLs (e.g., /en-us/, /de-de/, /fr-fr/)
|
|
4
|
+
* - Content similarity fingerprinting
|
|
5
|
+
* - URL path normalization
|
|
6
|
+
*/
|
|
7
|
+
export interface LocaleInfo {
|
|
8
|
+
hasLocale: boolean;
|
|
9
|
+
locale?: string;
|
|
10
|
+
language?: string;
|
|
11
|
+
country?: string;
|
|
12
|
+
canonicalPath: string;
|
|
13
|
+
}
|
|
14
|
+
/**
|
|
15
|
+
* Extract locale information from a URL
|
|
16
|
+
*/
|
|
17
|
+
export declare function extractLocale(url: string): LocaleInfo;
|
|
18
|
+
/**
|
|
19
|
+
* Generate a content fingerprint for similarity detection
|
|
20
|
+
* Uses multiple techniques for robust matching
|
|
21
|
+
*/
|
|
22
|
+
export declare function generateFingerprint(text: string): string;
|
|
23
|
+
/**
|
|
24
|
+
* Calculate similarity between two fingerprints (0-1)
|
|
25
|
+
*/
|
|
26
|
+
export declare function calculateSimilarity(fp1: string, fp2: string): number;
|
|
27
|
+
/**
|
|
28
|
+
* Smart deduplication tracker
|
|
29
|
+
*/
|
|
30
|
+
export declare class DedupTracker {
|
|
31
|
+
private canonicalPaths;
|
|
32
|
+
private fingerprints;
|
|
33
|
+
stats: {
|
|
34
|
+
localeSkipped: number;
|
|
35
|
+
similarSkipped: number;
|
|
36
|
+
total: number;
|
|
37
|
+
};
|
|
38
|
+
private preferredLanguage;
|
|
39
|
+
constructor(preferredLanguage?: string);
|
|
40
|
+
/**
|
|
41
|
+
* Check if URL should be skipped (is duplicate)
|
|
42
|
+
* Returns reason if should skip, undefined if should crawl
|
|
43
|
+
*/
|
|
44
|
+
shouldSkip(url: string): {
|
|
45
|
+
skip: boolean;
|
|
46
|
+
reason?: string;
|
|
47
|
+
};
|
|
48
|
+
/**
|
|
49
|
+
* Check content similarity against previously seen content
|
|
50
|
+
*/
|
|
51
|
+
checkContentSimilarity(url: string, content: string, threshold?: number): {
|
|
52
|
+
skip: boolean;
|
|
53
|
+
reason?: string;
|
|
54
|
+
};
|
|
55
|
+
/**
|
|
56
|
+
* Get dedup statistics
|
|
57
|
+
*/
|
|
58
|
+
getStats(): {
|
|
59
|
+
uniquePaths: number;
|
|
60
|
+
uniqueContent: number;
|
|
61
|
+
localeSkipped: number;
|
|
62
|
+
similarSkipped: number;
|
|
63
|
+
total: number;
|
|
64
|
+
};
|
|
65
|
+
}
|
|
66
|
+
//# sourceMappingURL=dedup.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dedup.d.ts","sourceRoot":"","sources":["../../../src/utils/dedup.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAmDH,MAAM,WAAW,UAAU;IACzB,SAAS,EAAE,OAAO,CAAC;IACnB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,aAAa,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,wBAAgB,aAAa,CAAC,GAAG,EAAE,MAAM,GAAG,UAAU,CAsHrD;AAED;;;GAGG;AACH,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAoBxD;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAiBpE;AAED;;GAEG;AACH,qBAAa,YAAY;IAEvB,OAAO,CAAC,cAAc,CAAuD;IAG7E,OAAO,CAAC,YAAY,CAA6B;IAG1C,KAAK;;;;MAIV;IAGF,OAAO,CAAC,iBAAiB,CAAS;gBAEtB,iBAAiB,SAAO;IAIpC;;;OAGG;IACH,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG;QAAE,IAAI,EAAE,OAAO,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAA;KAAE;IAqC3D;;OAEG;IACH,sBAAsB,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,SAAO,GAAG;QAAE,IAAI,EAAE,OAAO,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAA;KAAE;IAiB1G;;OAEG;IACH,QAAQ;;;;;;;CAOT"}
|