@memvid/maw 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/README.md +188 -0
  2. package/dist/bin/maw.d.ts +6 -0
  3. package/dist/bin/maw.d.ts.map +1 -0
  4. package/dist/bin/maw.js +275 -0
  5. package/dist/bin/maw.js.map +1 -0
  6. package/dist/src/crawler/index.d.ts +71 -0
  7. package/dist/src/crawler/index.d.ts.map +1 -0
  8. package/dist/src/crawler/index.js +249 -0
  9. package/dist/src/crawler/index.js.map +1 -0
  10. package/dist/src/crawler/robots.d.ts +26 -0
  11. package/dist/src/crawler/robots.d.ts.map +1 -0
  12. package/dist/src/crawler/robots.js +179 -0
  13. package/dist/src/crawler/robots.js.map +1 -0
  14. package/dist/src/crawler/sitemap.d.ts +36 -0
  15. package/dist/src/crawler/sitemap.d.ts.map +1 -0
  16. package/dist/src/crawler/sitemap.js +209 -0
  17. package/dist/src/crawler/sitemap.js.map +1 -0
  18. package/dist/src/engine/detector.d.ts +18 -0
  19. package/dist/src/engine/detector.d.ts.map +1 -0
  20. package/dist/src/engine/detector.js +155 -0
  21. package/dist/src/engine/detector.js.map +1 -0
  22. package/dist/src/engine/fetch.d.ts +18 -0
  23. package/dist/src/engine/fetch.d.ts.map +1 -0
  24. package/dist/src/engine/fetch.js +53 -0
  25. package/dist/src/engine/fetch.js.map +1 -0
  26. package/dist/src/engine/index.d.ts +39 -0
  27. package/dist/src/engine/index.d.ts.map +1 -0
  28. package/dist/src/engine/index.js +116 -0
  29. package/dist/src/engine/index.js.map +1 -0
  30. package/dist/src/engine/playwright.d.ts +23 -0
  31. package/dist/src/engine/playwright.d.ts.map +1 -0
  32. package/dist/src/engine/playwright.js +88 -0
  33. package/dist/src/engine/playwright.js.map +1 -0
  34. package/dist/src/engine/rebrowser.d.ts +22 -0
  35. package/dist/src/engine/rebrowser.d.ts.map +1 -0
  36. package/dist/src/engine/rebrowser.js +142 -0
  37. package/dist/src/engine/rebrowser.js.map +1 -0
  38. package/dist/src/extractor/cleaner.d.ts +13 -0
  39. package/dist/src/extractor/cleaner.d.ts.map +1 -0
  40. package/dist/src/extractor/cleaner.js +122 -0
  41. package/dist/src/extractor/cleaner.js.map +1 -0
  42. package/dist/src/extractor/index.d.ts +29 -0
  43. package/dist/src/extractor/index.d.ts.map +1 -0
  44. package/dist/src/extractor/index.js +162 -0
  45. package/dist/src/extractor/index.js.map +1 -0
  46. package/dist/src/extractor/links.d.ts +22 -0
  47. package/dist/src/extractor/links.d.ts.map +1 -0
  48. package/dist/src/extractor/links.js +92 -0
  49. package/dist/src/extractor/links.js.map +1 -0
  50. package/dist/src/extractor/markdown.d.ts +13 -0
  51. package/dist/src/extractor/markdown.d.ts.map +1 -0
  52. package/dist/src/extractor/markdown.js +94 -0
  53. package/dist/src/extractor/markdown.js.map +1 -0
  54. package/dist/src/git/index.d.ts +40 -0
  55. package/dist/src/git/index.d.ts.map +1 -0
  56. package/dist/src/git/index.js +303 -0
  57. package/dist/src/git/index.js.map +1 -0
  58. package/dist/src/index.d.ts +103 -0
  59. package/dist/src/index.d.ts.map +1 -0
  60. package/dist/src/index.js +229 -0
  61. package/dist/src/index.js.map +1 -0
  62. package/dist/src/ingestor/index.d.ts +95 -0
  63. package/dist/src/ingestor/index.d.ts.map +1 -0
  64. package/dist/src/ingestor/index.js +471 -0
  65. package/dist/src/ingestor/index.js.map +1 -0
  66. package/dist/src/utils/dedup.d.ts +66 -0
  67. package/dist/src/utils/dedup.d.ts.map +1 -0
  68. package/dist/src/utils/dedup.js +296 -0
  69. package/dist/src/utils/dedup.js.map +1 -0
  70. package/dist/src/utils/index.d.ts +3 -0
  71. package/dist/src/utils/index.d.ts.map +1 -0
  72. package/dist/src/utils/index.js +3 -0
  73. package/dist/src/utils/index.js.map +1 -0
  74. package/dist/src/utils/logger.d.ts +12 -0
  75. package/dist/src/utils/logger.d.ts.map +1 -0
  76. package/dist/src/utils/logger.js +49 -0
  77. package/dist/src/utils/logger.js.map +1 -0
  78. package/dist/src/utils/ui.d.ts +126 -0
  79. package/dist/src/utils/ui.d.ts.map +1 -0
  80. package/dist/src/utils/ui.js +357 -0
  81. package/dist/src/utils/ui.js.map +1 -0
  82. package/dist/src/utils/url.d.ts +21 -0
  83. package/dist/src/utils/url.d.ts.map +1 -0
  84. package/dist/src/utils/url.js +107 -0
  85. package/dist/src/utils/url.js.map +1 -0
  86. package/package.json +71 -0
@@ -0,0 +1,471 @@
1
+ /**
2
+ * MV2 ingestor - saves crawled content to .mv2 files
3
+ */
4
+ import { existsSync } from 'fs';
5
+ import { stat } from 'fs/promises';
6
+ import { createLogger } from '../utils/logger.js';
7
+ const log = createLogger();
// Cache for the lazily imported @memvid/sdk module; null until getSDK() first runs.
let sdkModule = null;
/**
 * Resolve the @memvid/sdk module.
 *
 * The module is imported dynamically on the first call and the same module
 * object is handed back on every later call.
 */
async function getSDK() {
    if (sdkModule) {
        return sdkModule;
    }
    sdkModule = await import('@memvid/sdk');
    return sdkModule;
}
/**
 * Ingest crawled results into an MV2 file.
 *
 * Opens `options.output` when it already exists, otherwise creates it. When
 * MEMVID_API_KEY is set the SDK is configured with it and, if no
 * `options.memoryId` was supplied and the output file does not exist yet, a
 * cloud memory is created (best effort) and the new file is bound to it.
 *
 * Results are de-duplicated on a whitespace-normalised prefix of their
 * markdown and written through `mem.putMany` in small batches. Without an API
 * key, ingestion stops once the estimated or the actual file size would pass
 * `options.maxSizeMB`.
 *
 * @param results Async iterable of crawl results; each item is read for
 *   `url`, `finalUrl`, `depth`, `engine` and `extracted.{markdown, title,
 *   description, author, publishedDate, wordCount, byteSize}`.
 * @param options Reads `output`, `memoryId`, `memoryName`, `label`,
 *   `batchSize` (default 3), `maxSizeMB` (default 40), `enableEmbedding` and
 *   `embeddingModel` (default 'openai').
 * @returns `{ pages, bytes, duration, stoppedAtLimit, skippedDupes, memoryId }`
 *   where `duration` is in milliseconds.
 * @throws Re-throws any `putMany` error whose message does not contain both
 *   "exceeds" and "limit".
 */
export async function ingestToMv2(results, options) {
    const sdk = await getSDK();
    const { create, use, configure, createMemory } = sdk;
    const startTime = Date.now();
    const apiKey = process.env.MEMVID_API_KEY;
    const dashboardUrl = process.env.MEMVID_DASHBOARD_URL || 'https://memvid.com';
    let memoryId = options.memoryId;
    // Configure SDK if API key is set
    if (apiKey) {
        configure({
            apiKey,
            dashboardUrl,
        });
        // Auto-create memory if API key is set but no memoryId provided
        if (!memoryId && !existsSync(options.output)) {
            try {
                const memoryName = options.memoryName || `maw-${Date.now()}`;
                log.dim(` Creating cloud memory: ${memoryName}`);
                const memory = await createMemory({
                    name: memoryName,
                    description: `Created by maw CLI`,
                });
                memoryId = memory.id;
                log.dim(` Memory ID: ${memoryId}`);
            }
            catch (err) {
                // Failed to create memory, continue without cloud binding
                log.warn(` Could not create cloud memory: ${err.message}`);
            }
        }
    }
    // Create or open the MV2 file with optional cloud binding
    let mem;
    if (existsSync(options.output)) {
        mem = await use('basic', options.output);
        // Sync tickets if we have API key and memory ID
        if (apiKey && memoryId) {
            try {
                await mem.syncTickets(memoryId, apiKey, dashboardUrl);
            }
            catch {
                // Sync failed, continue with local
            }
        }
    }
    else {
        // Create new file, optionally bound to cloud memory
        const createOpts = apiKey && memoryId ? { memoryId, memvidApiKey: apiKey } : {};
        mem = await create(options.output, 'basic', createOpts);
    }
    let pages = 0;
    let totalBytes = 0;
    let skippedDupes = 0;
    let stoppedAtLimit = false;
    // True only when the store itself refused a batch; retrying the same
    // batch in the final flush would just fail again.
    let storeRejectedBatch = false;
    let estimatedFileSize = 0;
    // Smaller batch size to work around SDK putMany hang at ~30 docs
    const batchSize = options.batchSize || 3;
    const batch = [];
    // Size limit (default 40MB to stay safely under 50MB free tier with buffer)
    const maxSizeBytes = (options.maxSizeMB || 40) * 1024 * 1024;
    const hasApiKey = Boolean(apiKey);
    // Resolve the model once so the status line names the model that is
    // actually passed to putMany.
    const embeddingModel = options.embeddingModel || 'openai';
    // Writes the pending batch. Resolves to true on success (batch emptied)
    // and to false when the SDK reports a size-limit error; any other error
    // propagates to the caller.
    const writeBatch = async (statusLabel) => {
        if (options.enableEmbedding) {
            log.status(`Embedding ${statusLabel} (${embeddingModel})...`);
        }
        try {
            await mem.putMany(batch, options.enableEmbedding ? {
                enableEmbedding: true,
                embeddingModel,
            } : undefined);
            batch.length = 0;
            return true;
        }
        catch (err) {
            // Handle SDK size limit error gracefully
            if (err.message?.includes('exceeds') && err.message?.includes('limit')) {
                return false;
            }
            throw err;
        }
    };
    // Content fingerprints to skip near-duplicates
    const contentHashes = new Set();
    for await (const result of results) {
        const markdown = result.extracted.markdown;
        // Simple content fingerprint to skip near-duplicates (first 2000 chars)
        const fingerprint = markdown.slice(0, 2000).replace(/\s+/g, ' ').trim();
        if (contentHashes.has(fingerprint)) {
            skippedDupes++;
            continue;
        }
        contentHashes.add(fingerprint);
        const doc = {
            title: result.extracted.title,
            label: options.label || 'web',
            text: markdown,
            metadata: {
                url: result.url,
                finalUrl: result.finalUrl,
                description: result.extracted.description,
                author: result.extracted.author,
                publishedDate: result.extracted.publishedDate,
                wordCount: result.extracted.wordCount,
                crawlDepth: result.depth,
                engine: result.engine,
                crawledAt: new Date().toISOString(),
            },
        };
        // Estimate document size (text + metadata + embeddings overhead ~3x)
        const docSize = markdown.length * 3;
        // Check if we'd exceed the limit (only if no API key)
        if (!hasApiKey && estimatedFileSize + docSize > maxSizeBytes) {
            stoppedAtLimit = true;
            log.warn(` Reached ~${Math.round(estimatedFileSize / 1024 / 1024)}MB limit. Set MEMVID_API_KEY for unlimited.`);
            break;
        }
        batch.push(doc);
        pages++;
        totalBytes += result.extracted.byteSize;
        estimatedFileSize += docSize;
        // Show progress (a page without a title must not abort the ingest)
        log.progress(pages, pages, (result.extracted.title || '').slice(0, 40));
        if (batch.length < batchSize) {
            continue;
        }
        // Flush batch
        const written = await writeBatch(`batch ${Math.ceil(pages / batchSize)}`);
        if (!written) {
            stoppedAtLimit = true;
            storeRejectedBatch = true;
            break;
        }
        // Check actual file size after flush (only if no API key)
        if (!hasApiKey) {
            const currentSize = await getFileSize(options.output);
            if (currentSize > maxSizeBytes) {
                stoppedAtLimit = true;
                log.warn(` Reached ${Math.round(currentSize / 1024 / 1024)}MB limit. Set MEMVID_API_KEY for unlimited.`);
                break;
            }
        }
    }
    // Flush remaining docs. This also runs after an estimate-based stop:
    // those docs were admitted under the limit and are already counted in
    // `pages`/`bytes`, so skipping the write would silently drop them.
    if (batch.length > 0 && !storeRejectedBatch) {
        const written = await writeBatch('final batch');
        if (!written) {
            stoppedAtLimit = true;
        }
    }
    const duration = Date.now() - startTime;
    return {
        pages,
        bytes: totalBytes,
        duration,
        stoppedAtLimit,
        skippedDupes,
        memoryId,
    };
}
/**
 * Ingest git repo files into an MV2 file.
 *
 * Opens `options.output` when it already exists, otherwise creates it. When
 * MEMVID_API_KEY is set the SDK is configured with it and, if no
 * `options.memoryId` was supplied and the output file does not exist yet, a
 * cloud memory is created (best effort) and the new file is bound to it.
 *
 * Each file becomes one document whose text is prefixed with its path and
 * language; README files additionally get a short overview preamble and
 * extra labels. Documents are written through `mem.putMany` in small batches.
 * Without an API key, ingestion stops once the estimated or the actual file
 * size would pass `options.maxSizeMB`.
 *
 * @param files Async iterable of files; each item is read for `path`,
 *   `language`, `content` and `size`.
 * @param options Reads `output`, `memoryId`, `memoryName`, `label`,
 *   `batchSize` (default 3), `maxSizeMB` (default 40), `enableEmbedding` and
 *   `embeddingModel` (default 'openai').
 * @returns `{ files, bytes, duration, memoryId, stoppedAtLimit }` where
 *   `duration` is in milliseconds.
 * @throws Re-throws any `putMany` error whose message does not contain both
 *   "exceeds" and "limit".
 */
export async function ingestGitToMv2(files, options) {
    const sdk = await getSDK();
    const { create, use, configure, createMemory } = sdk;
    const startTime = Date.now();
    const apiKey = process.env.MEMVID_API_KEY;
    const dashboardUrl = process.env.MEMVID_DASHBOARD_URL || 'https://memvid.com';
    let memoryId = options.memoryId;
    // Configure SDK if API key is set
    if (apiKey) {
        configure({ apiKey, dashboardUrl });
        // Auto-create memory if API key is set but no memoryId provided
        if (!memoryId && !existsSync(options.output)) {
            try {
                const memoryName = options.memoryName || `maw-repo-${Date.now()}`;
                log.dim(` Creating cloud memory: ${memoryName}`);
                const memory = await createMemory({
                    name: memoryName,
                    description: `Git repo ingested by maw CLI`,
                });
                memoryId = memory.id;
                log.dim(` Memory ID: ${memoryId}`);
            }
            catch (err) {
                // Failed to create memory, continue without cloud binding
                log.warn(` Could not create cloud memory: ${err.message}`);
            }
        }
    }
    // Create or open the MV2 file
    let mem;
    if (existsSync(options.output)) {
        mem = await use('basic', options.output);
        if (apiKey && memoryId) {
            try {
                await mem.syncTickets(memoryId, apiKey, dashboardUrl);
            }
            catch {
                // Sync failed, continue with local
            }
        }
    }
    else {
        const createOpts = apiKey && memoryId ? { memoryId, memvidApiKey: apiKey } : {};
        mem = await create(options.output, 'basic', createOpts);
    }
    let fileCount = 0;
    let totalBytes = 0;
    let stoppedAtLimit = false;
    // True only when the store itself refused a batch; retrying the same
    // batch in the final flush would just fail again.
    let storeRejectedBatch = false;
    let estimatedFileSize = 0;
    // Smaller batch size to work around SDK putMany hang; overridable via
    // options.batchSize, matching ingestToMv2.
    const batchSize = options.batchSize || 3;
    const batch = [];
    // Size limit (default 40MB to stay safely under 50MB free tier)
    const maxSizeBytes = (options.maxSizeMB || 40) * 1024 * 1024;
    const hasApiKey = Boolean(apiKey);
    // Embedding overhead: ~6KB per chunk for 1536-dim vectors (OpenAI), less for smaller models
    const embeddingOverhead = options.enableEmbedding ? 6000 : 0;
    // Resolve the model once so the status line names the model that is
    // actually passed to putMany.
    const embeddingModel = options.embeddingModel || 'openai';
    // Path classifiers, hoisted out of the per-file loop.
    const readmePattern = /readme\.md$/i;
    const docsDirPattern = /^(docs|documentation)\//i;
    const docsExtPattern = /\.(md|mdx|rst)$/i;
    // Writes the pending batch. Resolves to true on success (batch emptied)
    // and to false when the SDK reports a size-limit error; any other error
    // propagates to the caller.
    const writeBatch = async (statusLabel) => {
        if (options.enableEmbedding) {
            log.status(`Embedding ${statusLabel} (${embeddingModel})...`);
        }
        try {
            await mem.putMany(batch, options.enableEmbedding ? {
                enableEmbedding: true,
                embeddingModel,
            } : undefined);
            batch.length = 0;
            return true;
        }
        catch (err) {
            // Handle SDK size limit error gracefully
            if (err.message?.includes('exceeds') && err.message?.includes('limit')) {
                return false;
            }
            throw err;
        }
    };
    for await (const file of files) {
        // Check if this is a README or documentation file
        const isReadme = readmePattern.test(file.path);
        const isDocs = docsDirPattern.test(file.path) || docsExtPattern.test(file.path);
        // Include the file path and language as context ahead of the content
        let text = `File: ${file.path}\nLanguage: ${file.language}\n\n${file.content}`;
        // For README files, prepend searchable context. The project name is
        // taken from the first markdown H1, and only for a top-level README.
        if (isReadme) {
            const projectName = file.path.includes('/') ? '' : file.content.match(/^#\s+(.+)/m)?.[1] || '';
            text = `Project Overview: ${projectName}\nThis is the main README documentation.\nIntroduction and description of the project.\n\n${text}`;
        }
        // Labels for better categorization; code files get none
        let labels;
        if (isReadme) {
            labels = ['README', 'Documentation', 'Overview', 'Introduction'];
        }
        else if (isDocs) {
            labels = ['Documentation'];
        }
        const doc = {
            title: `${file.path} (${file.language})`,
            label: options.label || 'code',
            text,
            uri: `file://${file.path}`, // Use file path as URI for scope filtering
            metadata: {
                path: file.path,
                language: file.language,
                size: file.size,
                type: isReadme ? 'readme' : (isDocs ? 'docs' : 'code'),
                isReadme,
                ingestedAt: new Date().toISOString(),
            },
            ...(labels ? { labels } : {}),
        };
        // Estimate document size (text + metadata + embedding vectors)
        const docSize = text.length * 2 + embeddingOverhead;
        // Check if we'd exceed the limit (only if no API key)
        if (!hasApiKey && estimatedFileSize + docSize > maxSizeBytes) {
            stoppedAtLimit = true;
            log.warn(` Reached ~${Math.round(estimatedFileSize / 1024 / 1024)}MB limit. Set MEMVID_API_KEY for unlimited.`);
            break;
        }
        batch.push(doc);
        fileCount++;
        totalBytes += file.size;
        estimatedFileSize += docSize;
        // Show progress
        log.progress(fileCount, fileCount, file.path.slice(-40));
        if (batch.length < batchSize) {
            continue;
        }
        // Flush batch
        const written = await writeBatch(`${fileCount} files`);
        if (!written) {
            stoppedAtLimit = true;
            storeRejectedBatch = true;
            break;
        }
        // Check actual file size after flush (only if no API key)
        if (!hasApiKey) {
            const currentSize = await getFileSize(options.output);
            if (currentSize > maxSizeBytes) {
                stoppedAtLimit = true;
                log.warn(` Reached ${Math.round(currentSize / 1024 / 1024)}MB limit. Set MEMVID_API_KEY for unlimited.`);
                break;
            }
        }
    }
    // Flush remaining docs. This also runs after an estimate-based stop:
    // those files were admitted under the limit and are already counted in
    // `files`/`bytes`, so skipping the write would silently drop them.
    if (batch.length > 0 && !storeRejectedBatch) {
        const written = await writeBatch(`final ${batch.length} files`);
        if (!written) {
            stoppedAtLimit = true;
        }
    }
    const duration = Date.now() - startTime;
    return {
        files: fileCount,
        bytes: totalBytes,
        duration,
        memoryId,
        stoppedAtLimit,
    };
}
/**
 * Report the size in bytes of the file at `path`.
 *
 * Any failure from stat() (for example a path that does not exist) is
 * reported as a size of 0 instead of being thrown.
 */
export async function getFileSize(path) {
    let size = 0;
    try {
        ({ size } = await stat(path));
    }
    catch {
        // stat failed: keep the 0 default
    }
    return size;
}
367
+ /**
368
+ * Open an existing MV2 file for querying
369
+ */
370
+ export async function openMv2(path) {
371
+ const { use } = await getSDK();
372
+ return use('basic', path);
373
+ }
/**
 * Run a search query against an MV2 file.
 *
 * The query embedding model is `options.embeddingModel` when given, else
 * 'openai' when OPENAI_API_KEY is set, else undefined. With a model the
 * search mode is 'auto'; without one it is 'lex'. `options.k` defaults to 10.
 */
export async function searchMv2(path, query, options = {}) {
    const mem = await openMv2(path);
    let queryEmbeddingModel = options.embeddingModel;
    if (!queryEmbeddingModel) {
        // Fall back to OpenAI query embeddings only when a key is configured
        queryEmbeddingModel = process.env.OPENAI_API_KEY ? 'openai' : undefined;
    }
    let mode = 'lex';
    if (queryEmbeddingModel) {
        mode = 'auto';
    }
    const findOptions = {
        k: options.k || 10,
        mode,
        queryEmbeddingModel,
    };
    return mem.find(query, findOptions);
}
/**
 * Detect whether a question asks what something is or does.
 *
 * The question is trimmed, lower-cased and has runs of whitespace collapsed
 * to a single space before matching, so leading indentation or doubled
 * spaces do not defeat the anchored patterns. A question counts as an
 * overview question when it
 *   - starts with "what is/does/are" or the contraction "what's",
 *   - starts with "explain", "describe" or "tell me about",
 *   - starts with "how does ... work", or
 *   - mentions "overview", "introduction" or "getting started" anywhere.
 *
 * @param question The user's question text.
 * @returns true when one of the patterns above matches, otherwise false.
 */
function isOverviewQuestion(question) {
    const normalized = question.trim().toLowerCase().replace(/\s+/g, ' ');
    return (/^what('s| (is|does|are))\b/.test(normalized) ||
        /^(explain|describe|tell me about)\b/.test(normalized) ||
        /^how does .+ work/.test(normalized) ||
        /overview|introduction|getting started/.test(normalized));
}
/**
 * Ask a question against an MV2 file.
 *
 * Overview-style questions (see isOverviewQuestion) get a larger default
 * `k` (15 instead of 8) and a larger context budget (15000 instead of 8000
 * characters). The query embedding model is `options.embeddingModel` when
 * given, else 'openai' when OPENAI_API_KEY is set; with a model the search
 * mode is 'auto' (hybrid semantic + lexical), otherwise 'lex' (BM25 only).
 */
export async function askMv2(path, question, options = {}) {
    const mem = await openMv2(path);
    const overview = isOverviewQuestion(question);
    // An explicit k always wins over the overview-dependent default
    let k = options.k;
    if (!k) {
        k = overview ? 15 : 8;
    }
    let queryEmbeddingModel = options.embeddingModel;
    if (!queryEmbeddingModel) {
        queryEmbeddingModel = process.env.OPENAI_API_KEY ? 'openai' : undefined;
    }
    let mode = 'lex';
    if (queryEmbeddingModel) {
        mode = 'auto';
    }
    const askOptions = {
        model: options.model || 'gpt-4o-mini',
        modelApiKey: options.apiKey || process.env.OPENAI_API_KEY,
        k,
        llmContextChars: overview ? 15000 : 8000,
        mode,
        queryEmbeddingModel,
    };
    return mem.ask(question, askOptions);
}
/**
 * List documents stored in an MV2 file.
 *
 * Uses the memory's `timeline` method when it exists; otherwise falls back
 * to `find` with an empty query. `options.limit` defaults to 100 either way.
 */
export async function listMv2(path, options = {}) {
    const mem = await openMv2(path);
    const limit = options.limit || 100;
    return typeof mem.timeline === 'function'
        ? mem.timeline({ limit })
        : mem.find('', { k: limit });
}
/**
 * Export documents from an MV2 file with full content.
 *
 * Reads the timeline (up to `options.limit` entries, default 10000) and
 * fetches the content of every frame that either has child frames itself or
 * is not listed in any other frame's `child_frames`. Frames that appear only
 * as a child of another frame are skipped.
 *
 * @param path Path of the MV2 file to open.
 * @param options Reads `limit`.
 * @returns Array of `{ title, uri, content }`. `title` is the frame URI with
 *   a `file://` occurrence and a leading `http(s)://` removed, or
 *   `Frame <frame_id>` when that leaves an empty string.
 */
export async function exportMv2(path, options = {}) {
    const mem = await openMv2(path);
    // Get frame list
    const timeline = await mem.timeline({ limit: options.limit || 10000 });
    const frames = timeline.frames || timeline;
    // Collect every id referenced as a child once, so the per-frame check
    // below is a set lookup instead of a rescan of the whole timeline.
    const childIds = new Set();
    for (const frame of frames) {
        for (const childId of frame.child_frames ?? []) {
            childIds.add(childId);
        }
    }
    // Get full content for each exportable frame
    const results = [];
    for (const frame of frames) {
        const hasChildren = Boolean(frame.child_frames && frame.child_frames.length > 0);
        // Pure child frames are skipped; parents and standalone frames are exported
        if (!hasChildren && childIds.has(frame.frame_id)) {
            continue;
        }
        try {
            const content = await mem.view(frame.frame_id);
            const uri = frame.uri || '';
            const title = uri.replace('file://', '').replace(/^https?:\/\//, '') || `Frame ${frame.frame_id}`;
            results.push({ title, uri, content });
        }
        catch {
            // Frame might not exist, skip
        }
    }
    return results;
}
471
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/ingestor/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC;AAChC,OAAO,EAAE,IAAI,EAAE,MAAM,aAAa,CAAC;AAEnC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAElD,MAAM,GAAG,GAAG,YAAY,EAAE,CAAC;AAE3B,iCAAiC;AACjC,IAAI,SAAS,GAAQ,IAAI,CAAC;AAE1B,KAAK,UAAU,MAAM;IACnB,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,SAAS,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,CAAC;IAC1C,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAsBD;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,OAAmC,EACnC,OAAsB;IAEtB,MAAM,GAAG,GAAG,MAAM,MAAM,EAAE,CAAC;IAC3B,MAAM,EAAE,MAAM,EAAE,GAAG,EAAE,SAAS,EAAE,YAAY,EAAE,GAAG,GAAG,CAAC;IACrD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE7B,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC;IAC1C,MAAM,YAAY,GAAG,OAAO,CAAC,GAAG,CAAC,oBAAoB,IAAI,oBAAoB,CAAC;IAE9E,IAAI,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC;IAEhC,kCAAkC;IAClC,IAAI,MAAM,EAAE,CAAC;QACX,SAAS,CAAC;YACR,MAAM;YACN,YAAY;SACb,CAAC,CAAC;QAEH,gEAAgE;QAChE,IAAI,CAAC,QAAQ,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;YAC7C,IAAI,CAAC;gBACH,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,IAAI,OAAO,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC;gBAC7D,GAAG,CAAC,GAAG,CAAC,4BAA4B,UAAU,EAAE,CAAC,CAAC;gBAClD,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC;oBAChC,IAAI,EAAE,UAAU;oBAChB,WAAW,EAAE,oBAAoB;iBAClC,CAAC,CAAC;gBACH,QAAQ,GAAG,MAAM,CAAC,EAAE,CAAC;gBACrB,GAAG,CAAC,GAAG,CAAC,gBAAgB,QAAQ,EAAE,CAAC,CAAC;YACtC,CAAC;YAAC,OAAO,GAAQ,EAAE,CAAC;gBAClB,0DAA0D;gBAC1D,GAAG,CAAC,IAAI,CAAC,oCAAoC,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;YAC9D,CAAC;QACH,CAAC;IACH,CAAC;IAED,0DAA0D;IAC1D,IAAI,GAAG,CAAC;IACR,IAAI,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAC/B,GAAG,GAAG,MAAM,GAAG,CAAC,OAAO,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;QACzC,gDAAgD;QAChD,IAAI,MAAM,IAAI,QAAQ,EAAE,CAAC;YACvB,IAAI,CAAC;gBACH,MAAM,GAAG,CAAC,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,YAAY,CAAC,CAAC;YACxD,CAAC;YAAC,MAAM,CAAC;gBACP,mCAAmC;YACrC,CAAC;QACH,CAAC;IACH,CAAC;SAAM,CAAC;QACN,oDAAoD;QACpD,MAAM,UAAU,GAAQ,EAAE,CAAC;QAC3B,IAAI,MAAM,IAAI,QAAQ,EAAE,CAAC;YACvB,UAAU,CAAC,QAAQ,GAAG,QAAQ,CAAC;YAC/B,UAAU,CAAC,YAAY
,GAAG,MAAM,CAAC;QACnC,CAAC;QACD,GAAG,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,MAAM,EAAE,OAAO,EAAE,UAAU,CAAC,CAAC;IAC1D,CAAC;IAED,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,IAAI,YAAY,GAAG,CAAC,CAAC;IACrB,IAAI,cAAc,GAAG,KAAK,CAAC;IAC3B,IAAI,iBAAiB,GAAG,CAAC,CAAC;IAC1B,iEAAiE;IACjE,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,CAAC,CAAC;IACzC,MAAM,KAAK,GAAyF,EAAE,CAAC;IAEvG,4EAA4E;IAC5E,MAAM,YAAY,GAAG,CAAC,OAAO,CAAC,SAAS,IAAI,EAAE,CAAC,GAAG,IAAI,GAAG,IAAI,CAAC;IAC7D,MAAM,SAAS,GAAG,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC;IAE/C,+CAA+C;IAC/C,MAAM,aAAa,GAAG,IAAI,GAAG,EAAU,CAAC;IAExC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QACnC,MAAM,QAAQ,GAAG,MAAM,CAAC,SAAS,CAAC,QAAQ,CAAC;QAE3C,wEAAwE;QACxE,MAAM,WAAW,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QACxE,IAAI,aAAa,CAAC,GAAG,CAAC,WAAW,CAAC,EAAE,CAAC;YACnC,YAAY,EAAE,CAAC;YACf,SAAS;QACX,CAAC;QACD,aAAa,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC;QAE/B,MAAM,GAAG,GAAG;YACV,KAAK,EAAE,MAAM,CAAC,SAAS,CAAC,KAAK;YAC7B,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,KAAK;YAC7B,IAAI,EAAE,QAAQ;YACd,QAAQ,EAAE;gBACR,GAAG,EAAE,MAAM,CAAC,GAAG;gBACf,QAAQ,EAAE,MAAM,CAAC,QAAQ;gBACzB,WAAW,EAAE,MAAM,CAAC,SAAS,CAAC,WAAW;gBACzC,MAAM,EAAE,MAAM,CAAC,SAAS,CAAC,MAAM;gBAC/B,aAAa,EAAE,MAAM,CAAC,SAAS,CAAC,aAAa;gBAC7C,SAAS,EAAE,MAAM,CAAC,SAAS,CAAC,SAAS;gBACrC,UAAU,EAAE,MAAM,CAAC,KAAK;gBACxB,MAAM,EAAE,MAAM,CAAC,MAAM;gBACrB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACpC;SACF,CAAC;QAEF,qEAAqE;QACrE,MAAM,OAAO,GAAG,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC;QAEpC,sDAAsD;QACtD,IAAI,CAAC,SAAS,IAAI,iBAAiB,GAAG,OAAO,GAAG,YAAY,EAAE,CAAC;YAC7D,cAAc,GAAG,IAAI,CAAC;YACtB,GAAG,CAAC,IAAI,CAAC,cAAc,IAAI,CAAC,KAAK,CAAC,iBAAiB,GAAG,IAAI,GAAG,IAAI,CAAC,6CAA6C,CAAC,CAAC;YACjH,MAAM;QACR,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAChB,KAAK,EAAE,CAAC;QACR,UAAU,IAAI,MAAM,CAAC,SAAS,CAAC,QAAQ,CAAC;QACxC,iBAAiB,IAAI,OAAO,CAAC;QAE7B,gBAAgB;QAChB,GAAG,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,EAAE,MAAM,CAAC,SAAS,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;QAEhE,cAAc;QACd,IAAI
,KAAK,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;YAC9B,IAAI,CAAC;gBACH,IAAI,OAAO,CAAC,eAAe,EAAE,CAAC;oBAC5B,GAAG,CAAC,MAAM,CAAC,mBAAmB,IAAI,CAAC,IAAI,CAAC,KAAK,GAAG,SAAS,CAAC,KAAK,OAAO,CAAC,cAAc,IAAI,WAAW,MAAM,CAAC,CAAC;gBAC9G,CAAC;gBACD,MAAM,GAAG,CAAC,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,eAAe,CAAC,CAAC,CAAC;oBACjD,eAAe,EAAE,IAAI;oBACrB,cAAc,EAAE,OAAO,CAAC,cAAc,IAAI,QAAQ;iBACnD,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;gBACf,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC;gBAEjB,0DAA0D;gBAC1D,IAAI,CAAC,SAAS,EAAE,CAAC;oBACf,MAAM,WAAW,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;oBACtD,IAAI,WAAW,GAAG,YAAY,EAAE,CAAC;wBAC/B,cAAc,GAAG,IAAI,CAAC;wBACtB,MAAM;oBACR,CAAC;gBACH,CAAC;YACH,CAAC;YAAC,OAAO,GAAQ,EAAE,CAAC;gBAClB,yCAAyC;gBACzC,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;oBACvE,cAAc,GAAG,IAAI,CAAC;oBACtB,MAAM;gBACR,CAAC;gBACD,MAAM,GAAG,CAAC;YACZ,CAAC;QACH,CAAC;IACH,CAAC;IAED,wCAAwC;IACxC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,cAAc,EAAE,CAAC;QACxC,IAAI,CAAC;YACH,IAAI,OAAO,CAAC,eAAe,EAAE,CAAC;gBAC5B,GAAG,CAAC,MAAM,CAAC,0BAA0B,OAAO,CAAC,cAAc,IAAI,WAAW,MAAM,CAAC,CAAC;YACpF,CAAC;YACD,MAAM,GAAG,CAAC,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,eAAe,CAAC,CAAC,CAAC;gBACjD,eAAe,EAAE,IAAI;gBACrB,cAAc,EAAE,OAAO,CAAC,cAAc,IAAI,QAAQ;aACnD,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QACjB,CAAC;QAAC,OAAO,GAAQ,EAAE,CAAC;YAClB,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;gBACvE,cAAc,GAAG,IAAI,CAAC;YACxB,CAAC;iBAAM,CAAC;gBACN,MAAM,GAAG,CAAC;YACZ,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;IAExC,OAAO;QACL,KAAK;QACL,KAAK,EAAE,UAAU;QACjB,QAAQ;QACR,cAAc;QACd,YAAY;QACZ,QAAQ;KACT,CAAC;AACJ,CAAC;AAoBD;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,KAAuF,EACvF,OAAyB;IAEzB,MAAM,GAAG,GAAG,MAAM,MAAM,EAAE,CAAC;IAC3B,MAAM,EAAE,MAAM,EAAE,GAAG,EAAE,SAAS,EAAE,YAAY,EAAE,GAAG,GAAG,CAAC;IACrD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE7B,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC;IAC1C,MAAM,YAAY,GAAG,OAAO,CAAC,GAAG,CAAC,oBAAoB,IAAI,oBAAo
B,CAAC;IAE9E,IAAI,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC;IAEhC,kCAAkC;IAClC,IAAI,MAAM,EAAE,CAAC;QACX,SAAS,CAAC,EAAE,MAAM,EAAE,YAAY,EAAE,CAAC,CAAC;QAEpC,gEAAgE;QAChE,IAAI,CAAC,QAAQ,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;YAC7C,IAAI,CAAC;gBACH,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,IAAI,YAAY,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC;gBAClE,GAAG,CAAC,GAAG,CAAC,4BAA4B,UAAU,EAAE,CAAC,CAAC;gBAClD,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC;oBAChC,IAAI,EAAE,UAAU;oBAChB,WAAW,EAAE,8BAA8B;iBAC5C,CAAC,CAAC;gBACH,QAAQ,GAAG,MAAM,CAAC,EAAE,CAAC;gBACrB,GAAG,CAAC,GAAG,CAAC,gBAAgB,QAAQ,EAAE,CAAC,CAAC;YACtC,CAAC;YAAC,OAAO,GAAQ,EAAE,CAAC;gBAClB,GAAG,CAAC,IAAI,CAAC,oCAAoC,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;YAC9D,CAAC;QACH,CAAC;IACH,CAAC;IAED,8BAA8B;IAC9B,IAAI,GAAG,CAAC;IACR,IAAI,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAC/B,GAAG,GAAG,MAAM,GAAG,CAAC,OAAO,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;QACzC,IAAI,MAAM,IAAI,QAAQ,EAAE,CAAC;YACvB,IAAI,CAAC;gBACH,MAAM,GAAG,CAAC,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,YAAY,CAAC,CAAC;YACxD,CAAC;YAAC,MAAM,CAAC,CAAA,CAAC;QACZ,CAAC;IACH,CAAC;SAAM,CAAC;QACN,MAAM,UAAU,GAAQ,EAAE,CAAC;QAC3B,IAAI,MAAM,IAAI,QAAQ,EAAE,CAAC;YACvB,UAAU,CAAC,QAAQ,GAAG,QAAQ,CAAC;YAC/B,UAAU,CAAC,YAAY,GAAG,MAAM,CAAC;QACnC,CAAC;QACD,GAAG,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,MAAM,EAAE,OAAO,EAAE,UAAU,CAAC,CAAC;IAC1D,CAAC;IAED,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,IAAI,cAAc,GAAG,KAAK,CAAC;IAC3B,IAAI,iBAAiB,GAAG,CAAC,CAAC;IAC1B,qDAAqD;IACrD,MAAM,SAAS,GAAG,CAAC,CAAC;IACpB,MAAM,KAAK,GAAyF,EAAE,CAAC;IAEvG,gEAAgE;IAChE,MAAM,YAAY,GAAG,CAAC,OAAO,CAAC,SAAS,IAAI,EAAE,CAAC,GAAG,IAAI,GAAG,IAAI,CAAC;IAC7D,MAAM,SAAS,GAAG,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC;IAE/C,4FAA4F;IAC5F,MAAM,iBAAiB,GAAG,OAAO,CAAC,eAAe,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IAE7D,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QAC/B,kDAAkD;QAClD,MAAM,QAAQ,GAAG,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChD,MAAM,MAAM,GAAG,0BAA0B,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,kBAAkB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAEhG,4DAA4D;QAC5D,IAAI,IAAI,GAAG,SAAS,IAAI,CAA
C,IAAI,eAAe,IAAI,CAAC,QAAQ,OAAO,IAAI,CAAC,OAAO,EAAE,CAAC;QAE/E,+CAA+C;QAC/C,IAAI,QAAQ,EAAE,CAAC;YACb,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,YAAY,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAC/F,IAAI,GAAG,qBAAqB,WAAW,6FAA6F,IAAI,EAAE,CAAC;QAC7I,CAAC;QAED,oCAAoC;QACpC,6DAA6D;QAC7D,0DAA0D;QAC1D,MAAM,GAAG,GAAQ;YACf,KAAK,EAAE,GAAG,IAAI,CAAC,IAAI,KAAK,IAAI,CAAC,QAAQ,GAAG;YACxC,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,MAAM;YAC9B,IAAI;YACJ,GAAG,EAAE,UAAU,IAAI,CAAC,IAAI,EAAE,EAAE,2CAA2C;YACvE,QAAQ,EAAE;gBACR,IAAI,EAAE,IAAI,CAAC,IAAI;gBACf,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,IAAI,EAAE,IAAI,CAAC,IAAI;gBACf,IAAI,EAAE,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;gBACtD,QAAQ;gBACR,UAAU,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACrC;SACF,CAAC;QAEF,uCAAuC;QACvC,IAAI,QAAQ,EAAE,CAAC;YACb,GAAG,CAAC,MAAM,GAAG,CAAC,QAAQ,EAAE,eAAe,EAAE,UAAU,EAAE,cAAc,CAAC,CAAC;QACvE,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,GAAG,CAAC,MAAM,GAAG,CAAC,eAAe,CAAC,CAAC;QACjC,CAAC;QAED,+DAA+D;QAC/D,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,GAAG,iBAAiB,CAAC;QAEpD,sDAAsD;QACtD,IAAI,CAAC,SAAS,IAAI,iBAAiB,GAAG,OAAO,GAAG,YAAY,EAAE,CAAC;YAC7D,cAAc,GAAG,IAAI,CAAC;YACtB,GAAG,CAAC,IAAI,CAAC,cAAc,IAAI,CAAC,KAAK,CAAC,iBAAiB,GAAG,IAAI,GAAG,IAAI,CAAC,6CAA6C,CAAC,CAAC;YACjH,MAAM;QACR,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAChB,SAAS,EAAE,CAAC;QACZ,UAAU,IAAI,IAAI,CAAC,IAAI,CAAC;QACxB,iBAAiB,IAAI,OAAO,CAAC;QAE7B,gBAAgB;QAChB,GAAG,CAAC,QAAQ,CAAC,SAAS,EAAE,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QAEzD,cAAc;QACd,IAAI,KAAK,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;YAC9B,IAAI,CAAC;gBACH,IAAI,OAAO,CAAC,eAAe,EAAE,CAAC;oBAC5B,GAAG,CAAC,MAAM,CAAC,aAAa,SAAS,WAAW,OAAO,CAAC,cAAc,IAAI,WAAW,MAAM,CAAC,CAAC;gBAC3F,CAAC;gBACD,MAAM,GAAG,CAAC,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,eAAe,CAAC,CAAC,CAAC;oBACjD,eAAe,EAAE,IAAI;oBACrB,cAAc,EAAE,OAAO,CAAC,cAAc,IAAI,QAAQ;iBACnD,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;gBACf,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC;gBA
EjB,0DAA0D;gBAC1D,IAAI,CAAC,SAAS,EAAE,CAAC;oBACf,MAAM,WAAW,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;oBACtD,IAAI,WAAW,GAAG,YAAY,EAAE,CAAC;wBAC/B,cAAc,GAAG,IAAI,CAAC;wBACtB,GAAG,CAAC,IAAI,CAAC,aAAa,IAAI,CAAC,KAAK,CAAC,WAAW,GAAG,IAAI,GAAG,IAAI,CAAC,6CAA6C,CAAC,CAAC;wBAC1G,MAAM;oBACR,CAAC;gBACH,CAAC;YACH,CAAC;YAAC,OAAO,GAAQ,EAAE,CAAC;gBAClB,yCAAyC;gBACzC,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;oBACvE,cAAc,GAAG,IAAI,CAAC;oBACtB,MAAM;gBACR,CAAC;gBACD,MAAM,GAAG,CAAC;YACZ,CAAC;QACH,CAAC;IACH,CAAC;IAED,wCAAwC;IACxC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,cAAc,EAAE,CAAC;QACxC,IAAI,CAAC;YACH,IAAI,OAAO,CAAC,eAAe,EAAE,CAAC;gBAC5B,GAAG,CAAC,MAAM,CAAC,mBAAmB,KAAK,CAAC,MAAM,WAAW,OAAO,CAAC,cAAc,IAAI,WAAW,MAAM,CAAC,CAAC;YACpG,CAAC;YACD,MAAM,GAAG,CAAC,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,eAAe,CAAC,CAAC,CAAC;gBACjD,eAAe,EAAE,IAAI;gBACrB,cAAc,EAAE,OAAO,CAAC,cAAc,IAAI,QAAQ;aACnD,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QACjB,CAAC;QAAC,OAAO,GAAQ,EAAE,CAAC;YAClB,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;gBACvE,cAAc,GAAG,IAAI,CAAC;YACxB,CAAC;iBAAM,CAAC;gBACN,MAAM,GAAG,CAAC;YACZ,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;IAExC,OAAO;QACL,KAAK,EAAE,SAAS;QAChB,KAAK,EAAE,UAAU;QACjB,QAAQ;QACR,QAAQ;QACR,cAAc;KACf,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,IAAY;IAC5C,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,CAAC;QAC/B,OAAO,KAAK,CAAC,IAAI,CAAC;IACpB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,CAAC,CAAC;IACX,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,IAAY;IACxC,MAAM,EAAE,GAAG,EAAE,GAAG,MAAM,MAAM,EAAE,CAAC;IAC/B,OAAO,GAAG,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;AAC5B,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,IAAY,EACZ,KAAa,EACb,UAAmD,EAAE;IAErD,MAAM,GAAG,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;IAEhC,iFAAiF;IACjF,MAAM,mBAAmB,GAAG,OAAO,CAAC,cAAc,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;IAC1
G,MAAM,IAAI,GAAG,mBAAmB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC;IAElD,OAAO,GAAG,CAAC,IAAI,CAAC,KAAK,EAAE;QACrB,CAAC,EAAE,OAAO,CAAC,CAAC,IAAI,EAAE;QAClB,IAAI;QACJ,mBAAmB;KACpB,CAAC,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,kBAAkB,CAAC,QAAgB;IAC1C,MAAM,MAAM,GAAG,QAAQ,CAAC,WAAW,EAAE,CAAC;IACtC,OAAO,CACL,uBAAuB,CAAC,IAAI,CAAC,MAAM,CAAC;QACpC,qCAAqC,CAAC,IAAI,CAAC,MAAM,CAAC;QAClD,mBAAmB,CAAC,IAAI,CAAC,MAAM,CAAC;QAChC,wCAAwC,CAAC,IAAI,CAAC,MAAM,CAAC,CACtD,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,MAAM,CAC1B,IAAY,EACZ,QAAgB,EAChB,UAAoF,EAAE;IAEtF,MAAM,GAAG,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;IAEhC,2FAA2F;IAC3F,MAAM,UAAU,GAAG,kBAAkB,CAAC,QAAQ,CAAC,CAAC;IAChD,MAAM,UAAU,GAAG,OAAO,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAEtD,sFAAsF;IACtF,MAAM,mBAAmB,GAAG,OAAO,CAAC,cAAc,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;IAC1G,MAAM,IAAI,GAAG,mBAAmB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,sDAAsD;IAEzG,OAAO,GAAG,CAAC,GAAG,CAAC,QAAQ,EAAE;QACvB,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,aAAa;QACrC,WAAW,EAAE,OAAO,CAAC,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,cAAc;QACzD,CAAC,EAAE,UAAU;QACb,eAAe,EAAE,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE,sCAAsC;QAClF,IAAI;QACJ,mBAAmB;KACpB,CAAC,CAAC;AACL,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO,CAC3B,IAAY,EACZ,UAA+C,EAAE;IAEjD,MAAM,GAAG,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;IAChC,2CAA2C;IAC3C,IAAI,OAAO,GAAG,CAAC,QAAQ,KAAK,UAAU,EAAE,CAAC;QACvC,OAAO,GAAG,CAAC,QAAQ,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,GAAG,EAAE,CAAC,CAAC;IACvD,CAAC;IACD,oCAAoC;IACpC,OAAO,GAAG,CAAC,IAAI,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,OAAO,CAAC,KAAK,IAAI,GAAG,EAAE,CAAC,CAAC;AACnD,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,IAAY,EACZ,UAA8B,EAAE;IAEhC,MAAM,GAAG,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;IAEhC,iBAAiB;IACjB,MAAM,QAAQ,GAAG,MAAM,GAAG,CAAC,QAAQ,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,KAAK,EAAE,CAAC,CAAC;IACvE,MAAM,MAAM,GAAG,QAAQ,CAAC,MAAM,IAAI,QAAQ,CAAC;IAE3C,kCAAkC;IAClC,MAAM,OAAO,GAA2D,EAAE,CAAC;IAE3
E,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,iDAAiD;QACjD,IAAI,KAAK,CAAC,YAAY,IAAI,KAAK,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACxD,gDAAgD;YAChD,IAAI,CAAC;gBACH,MAAM,OAAO,GAAG,MAAM,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;gBAC/C,MAAM,GAAG,GAAG,KAAK,CAAC,GAAG,IAAI,EAAE,CAAC;gBAC5B,MAAM,KAAK,GAAG,GAAG,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC,IAAI,SAAS,KAAK,CAAC,QAAQ,EAAE,CAAC;gBAClG,OAAO,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,OAAO,EAAE,CAAC,CAAC;YACxC,CAAC;YAAC,MAAM,CAAC;gBACP,8BAA8B;YAChC,CAAC;QACH,CAAC;aAAM,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,YAAY,EAAE,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC;YAC9E,sDAAsD;YACtD,IAAI,CAAC;gBACH,MAAM,OAAO,GAAG,MAAM,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;gBAC/C,MAAM,GAAG,GAAG,KAAK,CAAC,GAAG,IAAI,EAAE,CAAC;gBAC5B,MAAM,KAAK,GAAG,GAAG,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC,IAAI,SAAS,KAAK,CAAC,QAAQ,EAAE,CAAC;gBAClG,OAAO,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,OAAO,EAAE,CAAC,CAAC;YACxC,CAAC;YAAC,MAAM,CAAC;gBACP,8BAA8B;YAChC,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
@@ -0,0 +1,66 @@
1
+ /**
2
+ * Smart duplicate detection for web crawling
3
+ * - Detects localized URLs (e.g., /en-us/, /de-de/, /fr-fr/)
4
+ * - Content similarity fingerprinting
5
+ * - URL path normalization
6
+ */
7
+ export interface LocaleInfo { // Return shape of extractLocale (see its declaration in this file).
8
+ hasLocale: boolean; // presumably true when the URL carries a locale segment — confirm in src/utils/dedup.ts
9
+ locale?: string; // optional; file header cites examples like /en-us/, /de-de/ — exact stored format TODO confirm
10
+ language?: string; // optional; presumably the language part of the locale — confirm
11
+ country?: string; // optional; presumably the country/region part of the locale — confirm
12
+ canonicalPath: string; // always present; presumably the URL path with the locale removed — confirm
13
+ }
14
+ /**
15
+ * Extract locale information from a URL string and return it as a LocaleInfo object.
16
+ */
17
+ export declare function extractLocale(url: string): LocaleInfo;
18
+ /**
19
+ * Generate a content fingerprint (returned as a string) from the given text, for similarity detection.
20
+ * Uses multiple techniques for robust matching (the techniques themselves are not visible in this declaration file).
21
+ */
22
+ export declare function generateFingerprint(text: string): string;
23
+ /**
24
+ * Calculate the similarity between two fingerprint strings as a number in the documented range 0-1 (presumably 1 means identical — confirm).
25
+ */
26
+ export declare function calculateSimilarity(fp1: string, fp2: string): number;
27
+ /**
28
+ * Smart deduplication tracker: offers a URL-level check (shouldSkip), a content-level check (checkContentSimilarity) and counters (stats / getStats).
29
+ */
30
+ export declare class DedupTracker {
31
+ private canonicalPaths;
32
+ private fingerprints;
33
+ stats: { // public counters; the same three fields also appear in the getStats() result
34
+ localeSkipped: number;
35
+ similarSkipped: number;
36
+ total: number;
37
+ };
38
+ private preferredLanguage;
39
+ constructor(preferredLanguage?: string); // preferredLanguage is optional; its default is not visible in this declaration
40
+ /**
41
+ * Check if URL should be skipped (is duplicate)
42
+ * Returns an object: skip tells whether to skip the URL; reason is optional (presumably set only when skip is true — confirm)
43
+ */
44
+ shouldSkip(url: string): {
45
+ skip: boolean;
46
+ reason?: string;
47
+ };
48
+ /**
49
+ * Check content similarity against previously seen content; returns the same { skip, reason? } shape as shouldSkip. threshold is optional (default not visible here).
50
+ */
51
+ checkContentSimilarity(url: string, content: string, threshold?: number): {
52
+ skip: boolean;
53
+ reason?: string;
54
+ };
55
+ /**
56
+ * Get dedup statistics: the three stats counters (localeSkipped, similarSkipped, total) plus uniquePaths and uniqueContent
57
+ */
58
+ getStats(): {
59
+ uniquePaths: number;
60
+ uniqueContent: number;
61
+ localeSkipped: number;
62
+ similarSkipped: number;
63
+ total: number;
64
+ };
65
+ }
66
+ //# sourceMappingURL=dedup.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dedup.d.ts","sourceRoot":"","sources":["../../../src/utils/dedup.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAmDH,MAAM,WAAW,UAAU;IACzB,SAAS,EAAE,OAAO,CAAC;IACnB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,aAAa,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,wBAAgB,aAAa,CAAC,GAAG,EAAE,MAAM,GAAG,UAAU,CAsHrD;AAED;;;GAGG;AACH,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAoBxD;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAiBpE;AAED;;GAEG;AACH,qBAAa,YAAY;IAEvB,OAAO,CAAC,cAAc,CAAuD;IAG7E,OAAO,CAAC,YAAY,CAA6B;IAG1C,KAAK;;;;MAIV;IAGF,OAAO,CAAC,iBAAiB,CAAS;gBAEtB,iBAAiB,SAAO;IAIpC;;;OAGG;IACH,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG;QAAE,IAAI,EAAE,OAAO,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAA;KAAE;IAqC3D;;OAEG;IACH,sBAAsB,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,SAAO,GAAG;QAAE,IAAI,EAAE,OAAO,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAA;KAAE;IAiB1G;;OAEG;IACH,QAAQ;;;;;;;CAOT"}