@snap-agent/rag-web 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +44 -1
- package/dist/index.d.ts +44 -1
- package/dist/index.js +194 -11
- package/dist/index.mjs +194 -11
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -118,6 +118,8 @@ interface CrawlPageStatusEntry {
|
|
|
118
118
|
interface CrawlLedgerDocument {
|
|
119
119
|
tenantId: string;
|
|
120
120
|
agentId: string;
|
|
121
|
+
/** Correlates ledger rows with a single ingest run (from crawl metadata.ingestionId). */
|
|
122
|
+
ingestionId?: string;
|
|
121
123
|
urlNormalized: string;
|
|
122
124
|
url: string;
|
|
123
125
|
domain: string;
|
|
@@ -369,10 +371,46 @@ interface RSSConfig {
|
|
|
369
371
|
/**
|
|
370
372
|
* Crawl result for sitemap/URL crawling
|
|
371
373
|
*/
|
|
374
|
+
type CrawlProgressPhase = 'discovering' | 'crawling' | 'indexing';
|
|
375
|
+
/** Live crawl progress (via `metadata.onCrawlProgress` on WebsiteCrawlConfig). */
|
|
376
|
+
interface CrawlProgressUpdate {
|
|
377
|
+
phase: CrawlProgressPhase;
|
|
378
|
+
/** URLs found in sitemap/BFS (may exceed urlsScheduled when capped by maxPages). */
|
|
379
|
+
urlsDiscovered?: number;
|
|
380
|
+
/** URLs that will be crawled in this run (≤ maxPages). */
|
|
381
|
+
urlsScheduled?: number;
|
|
382
|
+
/** During crawl: URLs processed so far (updated after each batch). During indexing: documents fully embedded so far. */
|
|
383
|
+
pagesProcessed?: number;
|
|
384
|
+
/** During indexing: total text chunks to embed (drives web_content writes). */
|
|
385
|
+
chunksTotal?: number;
|
|
386
|
+
/** During indexing: chunks embedded so far. */
|
|
387
|
+
chunksProcessed?: number;
|
|
388
|
+
}
|
|
389
|
+
type CrawlProgressCallback = (update: CrawlProgressUpdate) => void;
|
|
390
|
+
type BulkProgressPhase = 'processing' | 'indexing';
|
|
391
|
+
/** Live bulk progress (via `metadata.onBulkProgress` on IngestOptions). */
|
|
392
|
+
interface BulkProgressUpdate {
|
|
393
|
+
phase: BulkProgressPhase;
|
|
394
|
+
opsTotal: number;
|
|
395
|
+
opsDone: number;
|
|
396
|
+
currentOpType?: 'insert' | 'update' | 'delete';
|
|
397
|
+
currentUrl?: string;
|
|
398
|
+
}
|
|
399
|
+
type BulkProgressCallback = (update: BulkProgressUpdate) => void;
|
|
400
|
+
/** Per-URL crawl lifecycle (via `metadata.onCrawlPage` on WebsiteCrawlConfig). */
|
|
401
|
+
type CrawlPageEvent = {
|
|
402
|
+
url: string;
|
|
403
|
+
event: 'start' | 'done';
|
|
404
|
+
status?: string;
|
|
405
|
+
error?: string;
|
|
406
|
+
};
|
|
407
|
+
type CrawlPageCallback = (event: CrawlPageEvent) => void;
|
|
372
408
|
interface CrawlResult extends WebIngestResult {
|
|
373
409
|
urlsCrawled: number;
|
|
374
410
|
urlsSkipped: number;
|
|
375
411
|
urlsFailed: number;
|
|
412
|
+
/** URLs selected for this crawl run (≤ maxPages); for progress UI. */
|
|
413
|
+
urlsScheduled?: number;
|
|
376
414
|
crawledAt: Date;
|
|
377
415
|
}
|
|
378
416
|
/**
|
|
@@ -423,6 +461,7 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
423
461
|
private cacheStats;
|
|
424
462
|
constructor(config: WebRAGConfig);
|
|
425
463
|
private getCollection;
|
|
464
|
+
private ledgerIndexesEnsured;
|
|
426
465
|
private getLedgerCollection;
|
|
427
466
|
/**
|
|
428
467
|
* List recent crawl ledger rows (for dashboards / pagination in the frontend).
|
|
@@ -430,6 +469,7 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
430
469
|
listCrawlLedger(options?: {
|
|
431
470
|
agentId?: string;
|
|
432
471
|
domain?: string;
|
|
472
|
+
ingestionId?: string;
|
|
433
473
|
status?: CrawlLedgerStatus;
|
|
434
474
|
limit?: number;
|
|
435
475
|
skip?: number;
|
|
@@ -670,6 +710,9 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
670
710
|
private crawlPageSmart;
|
|
671
711
|
private crawlPageRendered;
|
|
672
712
|
private discoverSitemaps;
|
|
713
|
+
private emitBulkProgress;
|
|
714
|
+
private emitCrawlProgress;
|
|
715
|
+
private emitCrawlPage;
|
|
673
716
|
private createDebugCollector;
|
|
674
717
|
/**
|
|
675
718
|
* Clean extracted text content
|
|
@@ -740,4 +783,4 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
740
783
|
getConfig(): Record<string, any>;
|
|
741
784
|
}
|
|
742
785
|
|
|
743
|
-
export { type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageStatusEntry, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
|
|
786
|
+
export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
|
package/dist/index.d.ts
CHANGED
|
@@ -118,6 +118,8 @@ interface CrawlPageStatusEntry {
|
|
|
118
118
|
interface CrawlLedgerDocument {
|
|
119
119
|
tenantId: string;
|
|
120
120
|
agentId: string;
|
|
121
|
+
/** Correlates ledger rows with a single ingest run (from crawl metadata.ingestionId). */
|
|
122
|
+
ingestionId?: string;
|
|
121
123
|
urlNormalized: string;
|
|
122
124
|
url: string;
|
|
123
125
|
domain: string;
|
|
@@ -369,10 +371,46 @@ interface RSSConfig {
|
|
|
369
371
|
/**
|
|
370
372
|
* Crawl result for sitemap/URL crawling
|
|
371
373
|
*/
|
|
374
|
+
type CrawlProgressPhase = 'discovering' | 'crawling' | 'indexing';
|
|
375
|
+
/** Live crawl progress (via `metadata.onCrawlProgress` on WebsiteCrawlConfig). */
|
|
376
|
+
interface CrawlProgressUpdate {
|
|
377
|
+
phase: CrawlProgressPhase;
|
|
378
|
+
/** URLs found in sitemap/BFS (may exceed urlsScheduled when capped by maxPages). */
|
|
379
|
+
urlsDiscovered?: number;
|
|
380
|
+
/** URLs that will be crawled in this run (≤ maxPages). */
|
|
381
|
+
urlsScheduled?: number;
|
|
382
|
+
/** During crawl: URLs processed so far (updated after each batch). During indexing: documents fully embedded so far. */
|
|
383
|
+
pagesProcessed?: number;
|
|
384
|
+
/** During indexing: total text chunks to embed (drives web_content writes). */
|
|
385
|
+
chunksTotal?: number;
|
|
386
|
+
/** During indexing: chunks embedded so far. */
|
|
387
|
+
chunksProcessed?: number;
|
|
388
|
+
}
|
|
389
|
+
type CrawlProgressCallback = (update: CrawlProgressUpdate) => void;
|
|
390
|
+
type BulkProgressPhase = 'processing' | 'indexing';
|
|
391
|
+
/** Live bulk progress (via `metadata.onBulkProgress` on IngestOptions). */
|
|
392
|
+
interface BulkProgressUpdate {
|
|
393
|
+
phase: BulkProgressPhase;
|
|
394
|
+
opsTotal: number;
|
|
395
|
+
opsDone: number;
|
|
396
|
+
currentOpType?: 'insert' | 'update' | 'delete';
|
|
397
|
+
currentUrl?: string;
|
|
398
|
+
}
|
|
399
|
+
type BulkProgressCallback = (update: BulkProgressUpdate) => void;
|
|
400
|
+
/** Per-URL crawl lifecycle (via `metadata.onCrawlPage` on WebsiteCrawlConfig). */
|
|
401
|
+
type CrawlPageEvent = {
|
|
402
|
+
url: string;
|
|
403
|
+
event: 'start' | 'done';
|
|
404
|
+
status?: string;
|
|
405
|
+
error?: string;
|
|
406
|
+
};
|
|
407
|
+
type CrawlPageCallback = (event: CrawlPageEvent) => void;
|
|
372
408
|
interface CrawlResult extends WebIngestResult {
|
|
373
409
|
urlsCrawled: number;
|
|
374
410
|
urlsSkipped: number;
|
|
375
411
|
urlsFailed: number;
|
|
412
|
+
/** URLs selected for this crawl run (≤ maxPages); for progress UI. */
|
|
413
|
+
urlsScheduled?: number;
|
|
376
414
|
crawledAt: Date;
|
|
377
415
|
}
|
|
378
416
|
/**
|
|
@@ -423,6 +461,7 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
423
461
|
private cacheStats;
|
|
424
462
|
constructor(config: WebRAGConfig);
|
|
425
463
|
private getCollection;
|
|
464
|
+
private ledgerIndexesEnsured;
|
|
426
465
|
private getLedgerCollection;
|
|
427
466
|
/**
|
|
428
467
|
* List recent crawl ledger rows (for dashboards / pagination in the frontend).
|
|
@@ -430,6 +469,7 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
430
469
|
listCrawlLedger(options?: {
|
|
431
470
|
agentId?: string;
|
|
432
471
|
domain?: string;
|
|
472
|
+
ingestionId?: string;
|
|
433
473
|
status?: CrawlLedgerStatus;
|
|
434
474
|
limit?: number;
|
|
435
475
|
skip?: number;
|
|
@@ -670,6 +710,9 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
670
710
|
private crawlPageSmart;
|
|
671
711
|
private crawlPageRendered;
|
|
672
712
|
private discoverSitemaps;
|
|
713
|
+
private emitBulkProgress;
|
|
714
|
+
private emitCrawlProgress;
|
|
715
|
+
private emitCrawlPage;
|
|
673
716
|
private createDebugCollector;
|
|
674
717
|
/**
|
|
675
718
|
* Clean extracted text content
|
|
@@ -740,4 +783,4 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
740
783
|
getConfig(): Record<string, any>;
|
|
741
784
|
}
|
|
742
785
|
|
|
743
|
-
export { type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageStatusEntry, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
|
|
786
|
+
export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
|
package/dist/index.js
CHANGED
|
@@ -40,6 +40,24 @@ var import_openai = __toESM(require("openai"));
|
|
|
40
40
|
var cheerio = __toESM(require("cheerio"));
|
|
41
41
|
var fs = __toESM(require("fs"));
|
|
42
42
|
var path = __toESM(require("path"));
|
|
43
|
+
function bulkOpCurrentUrl(op) {
|
|
44
|
+
const meta = op.document?.metadata;
|
|
45
|
+
if (typeof meta?.url === "string" && meta.url.trim()) return meta.url.trim();
|
|
46
|
+
if (typeof meta?.source === "string" && meta.source.trim()) return meta.source.trim();
|
|
47
|
+
return void 0;
|
|
48
|
+
}
|
|
49
|
+
function isUrlListingInsert(document) {
|
|
50
|
+
const meta = document.metadata;
|
|
51
|
+
if (meta?.type !== "url") return false;
|
|
52
|
+
const url = typeof meta.url === "string" ? meta.url.trim() : "";
|
|
53
|
+
if (!url) return false;
|
|
54
|
+
try {
|
|
55
|
+
const parsed = new URL(url);
|
|
56
|
+
return parsed.protocol === "http:" || parsed.protocol === "https:";
|
|
57
|
+
} catch {
|
|
58
|
+
return false;
|
|
59
|
+
}
|
|
60
|
+
}
|
|
43
61
|
var WebRAGPlugin = class _WebRAGPlugin {
|
|
44
62
|
name = "web-rag";
|
|
45
63
|
type = "rag";
|
|
@@ -78,6 +96,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
78
96
|
}
|
|
79
97
|
return this.db.collection(this.config.collection);
|
|
80
98
|
}
|
|
99
|
+
ledgerIndexesEnsured = false;
|
|
81
100
|
async getLedgerCollection() {
|
|
82
101
|
if (!this.client) {
|
|
83
102
|
this.client = new import_mongodb.MongoClient(this.config.mongoUri);
|
|
@@ -85,7 +104,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
85
104
|
this.db = this.client.db(this.config.dbName);
|
|
86
105
|
}
|
|
87
106
|
const name = this.config.crawlLedger?.collection ?? "web_crawl_ledger";
|
|
88
|
-
|
|
107
|
+
const col = this.db.collection(name);
|
|
108
|
+
if (!this.ledgerIndexesEnsured) {
|
|
109
|
+
this.ledgerIndexesEnsured = true;
|
|
110
|
+
await col.createIndex(
|
|
111
|
+
{ tenantId: 1, agentId: 1, urlNormalized: 1 },
|
|
112
|
+
{ unique: true }
|
|
113
|
+
);
|
|
114
|
+
await col.createIndex({ tenantId: 1, agentId: 1, ingestionId: 1, lastCrawledAt: -1 });
|
|
115
|
+
}
|
|
116
|
+
return col;
|
|
89
117
|
}
|
|
90
118
|
/**
|
|
91
119
|
* List recent crawl ledger rows (for dashboards / pagination in the frontend).
|
|
@@ -95,6 +123,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
95
123
|
const filter = { tenantId: this.config.tenantId };
|
|
96
124
|
filter.agentId = options.agentId ?? "shared";
|
|
97
125
|
if (options.domain) filter.domain = options.domain;
|
|
126
|
+
if (options.ingestionId) filter.ingestionId = options.ingestionId;
|
|
98
127
|
if (options.status) filter.lastStatus = options.status;
|
|
99
128
|
const limit = Math.min(Math.max(options.limit ?? 50, 1), 500);
|
|
100
129
|
const skip = Math.max(options.skip ?? 0, 0);
|
|
@@ -163,6 +192,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
163
192
|
lastCrawledAt: now,
|
|
164
193
|
updatedAt: now
|
|
165
194
|
};
|
|
195
|
+
if (params.ingestionId) {
|
|
196
|
+
$set.ingestionId = params.ingestionId;
|
|
197
|
+
}
|
|
166
198
|
if (errMsg !== void 0) {
|
|
167
199
|
$set.errorMessage = errMsg;
|
|
168
200
|
} else if (params.status === "indexed" && params.doc) {
|
|
@@ -175,9 +207,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
175
207
|
$set.docId = params.doc.id;
|
|
176
208
|
} else {
|
|
177
209
|
$set.modeUsed = params.diag?.modeUsed;
|
|
178
|
-
$set.contentLength = null;
|
|
179
|
-
$set.title = null;
|
|
180
|
-
$set.docId = null;
|
|
210
|
+
$set.contentLength = params.contentLength ?? null;
|
|
211
|
+
$set.title = params.title ?? null;
|
|
212
|
+
$set.docId = params.docId ?? null;
|
|
181
213
|
}
|
|
182
214
|
await col.updateOne(
|
|
183
215
|
{
|
|
@@ -421,9 +453,27 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
421
453
|
let indexed = 0;
|
|
422
454
|
const errors = [];
|
|
423
455
|
const agentId = options?.agentId || "shared";
|
|
424
|
-
|
|
456
|
+
const onCrawlProgress = options?.metadata?.onCrawlProgress;
|
|
457
|
+
const indexingTotal = documents.length;
|
|
458
|
+
const chunkPlan = documents.map((doc) => this.chunkContent(doc.content));
|
|
459
|
+
const chunksTotal = chunkPlan.reduce((sum, chunks) => sum + chunks.length, 0);
|
|
460
|
+
let chunksProcessed = 0;
|
|
461
|
+
if (onCrawlProgress && indexingTotal > 0) {
|
|
462
|
+
this.emitCrawlProgress(
|
|
463
|
+
{ metadata: options?.metadata },
|
|
464
|
+
{
|
|
465
|
+
phase: "indexing",
|
|
466
|
+
urlsScheduled: indexingTotal,
|
|
467
|
+
pagesProcessed: 0,
|
|
468
|
+
chunksTotal,
|
|
469
|
+
chunksProcessed: 0
|
|
470
|
+
}
|
|
471
|
+
);
|
|
472
|
+
}
|
|
473
|
+
for (let docIndex = 0; docIndex < documents.length; docIndex++) {
|
|
474
|
+
const doc = documents[docIndex];
|
|
475
|
+
const chunks = chunkPlan[docIndex];
|
|
425
476
|
try {
|
|
426
|
-
const chunks = this.chunkContent(doc.content);
|
|
427
477
|
const isChunked = chunks.length > 1;
|
|
428
478
|
if (isChunked) {
|
|
429
479
|
await collection.deleteMany({
|
|
@@ -458,6 +508,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
458
508
|
},
|
|
459
509
|
{ upsert: true }
|
|
460
510
|
);
|
|
511
|
+
chunksProcessed++;
|
|
512
|
+
if (onCrawlProgress) {
|
|
513
|
+
this.emitCrawlProgress(
|
|
514
|
+
{ metadata: options?.metadata },
|
|
515
|
+
{
|
|
516
|
+
phase: "indexing",
|
|
517
|
+
urlsScheduled: indexingTotal,
|
|
518
|
+
pagesProcessed: docIndex + (i + 1 === chunks.length ? 1 : 0),
|
|
519
|
+
chunksTotal,
|
|
520
|
+
chunksProcessed
|
|
521
|
+
}
|
|
522
|
+
);
|
|
523
|
+
}
|
|
461
524
|
}
|
|
462
525
|
indexed++;
|
|
463
526
|
} catch (error) {
|
|
@@ -537,23 +600,57 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
537
600
|
let deleted = 0;
|
|
538
601
|
let failed = 0;
|
|
539
602
|
const errors = [];
|
|
603
|
+
const opsTotal = operations.length;
|
|
604
|
+
let opsDone = 0;
|
|
605
|
+
const ingestOptions = options ?? {};
|
|
606
|
+
this.emitBulkProgress(ingestOptions, {
|
|
607
|
+
phase: "processing",
|
|
608
|
+
opsTotal,
|
|
609
|
+
opsDone: 0
|
|
610
|
+
});
|
|
540
611
|
for (const op of operations) {
|
|
612
|
+
const currentUrl = bulkOpCurrentUrl(op);
|
|
541
613
|
try {
|
|
542
614
|
switch (op.type) {
|
|
543
615
|
case "insert":
|
|
544
616
|
if (op.document) {
|
|
545
|
-
|
|
546
|
-
|
|
617
|
+
if (isUrlListingInsert(op.document)) {
|
|
618
|
+
const url = bulkOpCurrentUrl(op);
|
|
619
|
+
const crawlResult = await this.ingestSinglePageFromUrl(
|
|
620
|
+
{
|
|
621
|
+
url,
|
|
622
|
+
metadata: {
|
|
623
|
+
...op.document.metadata ?? {},
|
|
624
|
+
url
|
|
625
|
+
}
|
|
626
|
+
},
|
|
627
|
+
ingestOptions
|
|
628
|
+
);
|
|
629
|
+
if (crawlResult.indexed > 0) {
|
|
630
|
+
inserted++;
|
|
631
|
+
} else {
|
|
632
|
+
failed++;
|
|
633
|
+
const err = crawlResult.errors?.[0]?.error ?? `Failed to crawl ${url}`;
|
|
634
|
+
errors.push({
|
|
635
|
+
id: op.id,
|
|
636
|
+
operation: op.type,
|
|
637
|
+
error: err
|
|
638
|
+
});
|
|
639
|
+
}
|
|
640
|
+
} else {
|
|
641
|
+
await this.ingest([op.document], ingestOptions);
|
|
642
|
+
inserted++;
|
|
643
|
+
}
|
|
547
644
|
}
|
|
548
645
|
break;
|
|
549
646
|
case "update":
|
|
550
647
|
if (op.document) {
|
|
551
|
-
await this.update(op.id, op.document,
|
|
648
|
+
await this.update(op.id, op.document, ingestOptions);
|
|
552
649
|
updated++;
|
|
553
650
|
}
|
|
554
651
|
break;
|
|
555
652
|
case "delete":
|
|
556
|
-
const count = await this.delete(op.id,
|
|
653
|
+
const count = await this.delete(op.id, ingestOptions);
|
|
557
654
|
deleted += count;
|
|
558
655
|
break;
|
|
559
656
|
}
|
|
@@ -564,6 +661,15 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
564
661
|
operation: op.type,
|
|
565
662
|
error: error.message || "Unknown error"
|
|
566
663
|
});
|
|
664
|
+
} finally {
|
|
665
|
+
opsDone++;
|
|
666
|
+
this.emitBulkProgress(ingestOptions, {
|
|
667
|
+
phase: "processing",
|
|
668
|
+
opsTotal,
|
|
669
|
+
opsDone,
|
|
670
|
+
currentOpType: op.type,
|
|
671
|
+
...currentUrl ? { currentUrl } : {}
|
|
672
|
+
});
|
|
567
673
|
}
|
|
568
674
|
}
|
|
569
675
|
return {
|
|
@@ -1130,6 +1236,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1130
1236
|
};
|
|
1131
1237
|
}
|
|
1132
1238
|
const dbg = this.createDebugCollector(config.debug);
|
|
1239
|
+
this.emitCrawlProgress(config, { phase: "discovering", urlsDiscovered: 0 });
|
|
1133
1240
|
const base = this.normalizeWebsiteUrl(config.baseUrl, stripQueryParams);
|
|
1134
1241
|
if (!base) {
|
|
1135
1242
|
return {
|
|
@@ -1161,6 +1268,10 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1161
1268
|
if (config.excludePatterns?.length) {
|
|
1162
1269
|
filteredUrls = filteredUrls.filter((u) => !config.excludePatterns.some((p) => u.includes(p)));
|
|
1163
1270
|
}
|
|
1271
|
+
this.emitCrawlProgress(config, {
|
|
1272
|
+
phase: "discovering",
|
|
1273
|
+
urlsDiscovered: filteredUrls.length
|
|
1274
|
+
});
|
|
1164
1275
|
urlsToCrawl = filteredUrls.slice(0, maxPages);
|
|
1165
1276
|
urlsSkipped = Math.max(0, filteredUrls.length - urlsToCrawl.length);
|
|
1166
1277
|
break;
|
|
@@ -1182,7 +1293,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1182
1293
|
urlsToCrawl = discovery.urls;
|
|
1183
1294
|
urlsSkipped = discovery.skipped;
|
|
1184
1295
|
dbg.log("discovery.linkLookup", { discovered: urlsToCrawl.length, skipped: urlsSkipped });
|
|
1296
|
+
this.emitCrawlProgress(config, {
|
|
1297
|
+
phase: "discovering",
|
|
1298
|
+
urlsDiscovered: urlsToCrawl.length
|
|
1299
|
+
});
|
|
1185
1300
|
}
|
|
1301
|
+
this.emitCrawlProgress(config, {
|
|
1302
|
+
phase: "crawling",
|
|
1303
|
+
urlsDiscovered: urlsToCrawl.length,
|
|
1304
|
+
urlsScheduled: urlsToCrawl.length
|
|
1305
|
+
});
|
|
1186
1306
|
const result = await this.crawlUrls(urlsToCrawl, {
|
|
1187
1307
|
contentSelector: config.contentSelector,
|
|
1188
1308
|
titleSelector: config.titleSelector,
|
|
@@ -1204,9 +1324,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1204
1324
|
return {
|
|
1205
1325
|
...result,
|
|
1206
1326
|
urlsSkipped,
|
|
1327
|
+
/** URLs selected for this crawl (≤ maxPages); use as the progress UI denominator. */
|
|
1328
|
+
urlsScheduled: urlsToCrawl.length,
|
|
1207
1329
|
crawledAt: /* @__PURE__ */ new Date(),
|
|
1208
1330
|
metadata: {
|
|
1209
1331
|
...result.metadata || {},
|
|
1332
|
+
urlsScheduled: urlsToCrawl.length,
|
|
1210
1333
|
discoveryDebug: dbg.summary()
|
|
1211
1334
|
}
|
|
1212
1335
|
};
|
|
@@ -1434,6 +1557,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1434
1557
|
const forceRecrawl = !!(options && options.forceRecrawl);
|
|
1435
1558
|
const agentId = options?.agentId ?? "shared";
|
|
1436
1559
|
const stripQ = config.stripQueryParams ?? false;
|
|
1560
|
+
const ingestionId = typeof config.metadata?.ingestionId === "string" && config.metadata.ingestionId.trim() ? config.metadata.ingestionId.trim() : void 0;
|
|
1437
1561
|
const urlByNorm = /* @__PURE__ */ new Map();
|
|
1438
1562
|
for (const u of urls) {
|
|
1439
1563
|
const norm = this.normalizeLedgerUrl(u, stripQ) || u;
|
|
@@ -1462,6 +1586,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1462
1586
|
const results = await Promise.allSettled(
|
|
1463
1587
|
batch.map(async (url) => {
|
|
1464
1588
|
const urlNormalized = this.normalizeLedgerUrl(url, stripQ) || url;
|
|
1589
|
+
this.emitCrawlPage(config, { url, event: "start" });
|
|
1465
1590
|
if (ledgerOpts && !forceRecrawl) {
|
|
1466
1591
|
const entry = await this.findLedgerEntry(urlNormalized, agentId);
|
|
1467
1592
|
if (this.shouldSkipLedger(
|
|
@@ -1482,6 +1607,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1482
1607
|
docId: entry?.docId
|
|
1483
1608
|
});
|
|
1484
1609
|
dbg.log("crawl.ledgerSkip", { url, urlNormalized, lastStatus: entry?.lastStatus });
|
|
1610
|
+
if (ledgerOpts) {
|
|
1611
|
+
await this.upsertLedgerRecord({
|
|
1612
|
+
url,
|
|
1613
|
+
urlNormalized,
|
|
1614
|
+
agentId,
|
|
1615
|
+
ingestionId,
|
|
1616
|
+
status: "skipped_ledger",
|
|
1617
|
+
title: entry?.title,
|
|
1618
|
+
docId: entry?.docId,
|
|
1619
|
+
contentLength: entry?.contentLength
|
|
1620
|
+
});
|
|
1621
|
+
}
|
|
1622
|
+
this.emitCrawlPage(config, { url, event: "done", status: "skipped_ledger" });
|
|
1485
1623
|
return { kind: "ledger_skip", url };
|
|
1486
1624
|
}
|
|
1487
1625
|
}
|
|
@@ -1505,6 +1643,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1505
1643
|
url,
|
|
1506
1644
|
urlNormalized,
|
|
1507
1645
|
agentId,
|
|
1646
|
+
ingestionId,
|
|
1508
1647
|
status: crawlSt,
|
|
1509
1648
|
doc,
|
|
1510
1649
|
diag
|
|
@@ -1521,6 +1660,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1521
1660
|
docId: doc?.id,
|
|
1522
1661
|
error: diag?.errorMessage
|
|
1523
1662
|
});
|
|
1663
|
+
this.emitCrawlPage(config, {
|
|
1664
|
+
url,
|
|
1665
|
+
event: "done",
|
|
1666
|
+
status: crawlSt,
|
|
1667
|
+
error: diag?.errorMessage
|
|
1668
|
+
});
|
|
1524
1669
|
return { kind: "doc", doc, url };
|
|
1525
1670
|
} catch (error) {
|
|
1526
1671
|
const msg = error instanceof Error ? error.message : String(error);
|
|
@@ -1529,6 +1674,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1529
1674
|
url,
|
|
1530
1675
|
urlNormalized,
|
|
1531
1676
|
agentId,
|
|
1677
|
+
ingestionId,
|
|
1532
1678
|
status: "error",
|
|
1533
1679
|
errorMessage: msg
|
|
1534
1680
|
});
|
|
@@ -1539,6 +1685,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1539
1685
|
status: "error",
|
|
1540
1686
|
error: msg
|
|
1541
1687
|
});
|
|
1688
|
+
this.emitCrawlPage(config, { url, event: "done", status: "error", error: msg });
|
|
1542
1689
|
throw { url, error };
|
|
1543
1690
|
}
|
|
1544
1691
|
})
|
|
@@ -1561,12 +1708,23 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1561
1708
|
});
|
|
1562
1709
|
}
|
|
1563
1710
|
}
|
|
1711
|
+
this.emitCrawlProgress(config, {
|
|
1712
|
+
phase: "crawling",
|
|
1713
|
+
urlsScheduled: uniqueUrls.length,
|
|
1714
|
+
pagesProcessed: Math.min(i + batch.length, uniqueUrls.length)
|
|
1715
|
+
});
|
|
1564
1716
|
if (i + concurrency < uniqueUrls.length) {
|
|
1565
1717
|
await this.delay(delayMs);
|
|
1566
1718
|
}
|
|
1567
1719
|
}
|
|
1568
1720
|
if (documents.length > 0) {
|
|
1569
|
-
const ingestResult = await this.ingest(documents,
|
|
1721
|
+
const ingestResult = await this.ingest(documents, {
|
|
1722
|
+
...options,
|
|
1723
|
+
metadata: {
|
|
1724
|
+
...options?.metadata ?? {},
|
|
1725
|
+
onCrawlProgress: config.metadata?.onCrawlProgress
|
|
1726
|
+
}
|
|
1727
|
+
});
|
|
1570
1728
|
indexed = ingestResult.indexed;
|
|
1571
1729
|
if (ingestResult.errors) {
|
|
1572
1730
|
errors.push(...ingestResult.errors);
|
|
@@ -1957,6 +2115,30 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1957
2115
|
}
|
|
1958
2116
|
return Array.from(found);
|
|
1959
2117
|
}
|
|
2118
|
+
emitBulkProgress(options, update) {
|
|
2119
|
+
const fn = options?.metadata?.onBulkProgress;
|
|
2120
|
+
if (!fn) return;
|
|
2121
|
+
try {
|
|
2122
|
+
fn(update);
|
|
2123
|
+
} catch {
|
|
2124
|
+
}
|
|
2125
|
+
}
|
|
2126
|
+
emitCrawlProgress(config, update) {
|
|
2127
|
+
const fn = config.metadata?.onCrawlProgress;
|
|
2128
|
+
if (!fn) return;
|
|
2129
|
+
try {
|
|
2130
|
+
fn(update);
|
|
2131
|
+
} catch {
|
|
2132
|
+
}
|
|
2133
|
+
}
|
|
2134
|
+
emitCrawlPage(config, event) {
|
|
2135
|
+
const fn = config.metadata?.onCrawlPage;
|
|
2136
|
+
if (!fn) return;
|
|
2137
|
+
try {
|
|
2138
|
+
fn(event);
|
|
2139
|
+
} catch {
|
|
2140
|
+
}
|
|
2141
|
+
}
|
|
1960
2142
|
createDebugCollector(debug) {
|
|
1961
2143
|
const enabled = !!debug?.enabled;
|
|
1962
2144
|
const level = debug?.level || "summary";
|
|
@@ -2245,6 +2427,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2245
2427
|
filterableFields: this.config.filterableFields,
|
|
2246
2428
|
typeBoosts: this.config.typeBoosts,
|
|
2247
2429
|
recencyBoost: this.config.recencyBoost,
|
|
2430
|
+
crawlLedger: this.config.crawlLedger,
|
|
2248
2431
|
priority: this.priority
|
|
2249
2432
|
};
|
|
2250
2433
|
}
|
package/dist/index.mjs
CHANGED
|
@@ -4,6 +4,24 @@ import OpenAI from "openai";
|
|
|
4
4
|
import * as cheerio from "cheerio";
|
|
5
5
|
import * as fs from "fs";
|
|
6
6
|
import * as path from "path";
|
|
7
|
+
function bulkOpCurrentUrl(op) {
|
|
8
|
+
const meta = op.document?.metadata;
|
|
9
|
+
if (typeof meta?.url === "string" && meta.url.trim()) return meta.url.trim();
|
|
10
|
+
if (typeof meta?.source === "string" && meta.source.trim()) return meta.source.trim();
|
|
11
|
+
return void 0;
|
|
12
|
+
}
|
|
13
|
+
function isUrlListingInsert(document) {
|
|
14
|
+
const meta = document.metadata;
|
|
15
|
+
if (meta?.type !== "url") return false;
|
|
16
|
+
const url = typeof meta.url === "string" ? meta.url.trim() : "";
|
|
17
|
+
if (!url) return false;
|
|
18
|
+
try {
|
|
19
|
+
const parsed = new URL(url);
|
|
20
|
+
return parsed.protocol === "http:" || parsed.protocol === "https:";
|
|
21
|
+
} catch {
|
|
22
|
+
return false;
|
|
23
|
+
}
|
|
24
|
+
}
|
|
7
25
|
var WebRAGPlugin = class _WebRAGPlugin {
|
|
8
26
|
name = "web-rag";
|
|
9
27
|
type = "rag";
|
|
@@ -42,6 +60,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
42
60
|
}
|
|
43
61
|
return this.db.collection(this.config.collection);
|
|
44
62
|
}
|
|
63
|
+
ledgerIndexesEnsured = false;
|
|
45
64
|
async getLedgerCollection() {
|
|
46
65
|
if (!this.client) {
|
|
47
66
|
this.client = new MongoClient(this.config.mongoUri);
|
|
@@ -49,7 +68,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
49
68
|
this.db = this.client.db(this.config.dbName);
|
|
50
69
|
}
|
|
51
70
|
const name = this.config.crawlLedger?.collection ?? "web_crawl_ledger";
|
|
52
|
-
|
|
71
|
+
const col = this.db.collection(name);
|
|
72
|
+
if (!this.ledgerIndexesEnsured) {
|
|
73
|
+
this.ledgerIndexesEnsured = true;
|
|
74
|
+
await col.createIndex(
|
|
75
|
+
{ tenantId: 1, agentId: 1, urlNormalized: 1 },
|
|
76
|
+
{ unique: true }
|
|
77
|
+
);
|
|
78
|
+
await col.createIndex({ tenantId: 1, agentId: 1, ingestionId: 1, lastCrawledAt: -1 });
|
|
79
|
+
}
|
|
80
|
+
return col;
|
|
53
81
|
}
|
|
54
82
|
/**
|
|
55
83
|
* List recent crawl ledger rows (for dashboards / pagination in the frontend).
|
|
@@ -59,6 +87,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
59
87
|
const filter = { tenantId: this.config.tenantId };
|
|
60
88
|
filter.agentId = options.agentId ?? "shared";
|
|
61
89
|
if (options.domain) filter.domain = options.domain;
|
|
90
|
+
if (options.ingestionId) filter.ingestionId = options.ingestionId;
|
|
62
91
|
if (options.status) filter.lastStatus = options.status;
|
|
63
92
|
const limit = Math.min(Math.max(options.limit ?? 50, 1), 500);
|
|
64
93
|
const skip = Math.max(options.skip ?? 0, 0);
|
|
@@ -127,6 +156,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
127
156
|
lastCrawledAt: now,
|
|
128
157
|
updatedAt: now
|
|
129
158
|
};
|
|
159
|
+
if (params.ingestionId) {
|
|
160
|
+
$set.ingestionId = params.ingestionId;
|
|
161
|
+
}
|
|
130
162
|
if (errMsg !== void 0) {
|
|
131
163
|
$set.errorMessage = errMsg;
|
|
132
164
|
} else if (params.status === "indexed" && params.doc) {
|
|
@@ -139,9 +171,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
139
171
|
$set.docId = params.doc.id;
|
|
140
172
|
} else {
|
|
141
173
|
$set.modeUsed = params.diag?.modeUsed;
|
|
142
|
-
$set.contentLength = null;
|
|
143
|
-
$set.title = null;
|
|
144
|
-
$set.docId = null;
|
|
174
|
+
$set.contentLength = params.contentLength ?? null;
|
|
175
|
+
$set.title = params.title ?? null;
|
|
176
|
+
$set.docId = params.docId ?? null;
|
|
145
177
|
}
|
|
146
178
|
await col.updateOne(
|
|
147
179
|
{
|
|
@@ -385,9 +417,27 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
385
417
|
let indexed = 0;
|
|
386
418
|
const errors = [];
|
|
387
419
|
const agentId = options?.agentId || "shared";
|
|
388
|
-
|
|
420
|
+
const onCrawlProgress = options?.metadata?.onCrawlProgress;
|
|
421
|
+
const indexingTotal = documents.length;
|
|
422
|
+
const chunkPlan = documents.map((doc) => this.chunkContent(doc.content));
|
|
423
|
+
const chunksTotal = chunkPlan.reduce((sum, chunks) => sum + chunks.length, 0);
|
|
424
|
+
let chunksProcessed = 0;
|
|
425
|
+
if (onCrawlProgress && indexingTotal > 0) {
|
|
426
|
+
this.emitCrawlProgress(
|
|
427
|
+
{ metadata: options?.metadata },
|
|
428
|
+
{
|
|
429
|
+
phase: "indexing",
|
|
430
|
+
urlsScheduled: indexingTotal,
|
|
431
|
+
pagesProcessed: 0,
|
|
432
|
+
chunksTotal,
|
|
433
|
+
chunksProcessed: 0
|
|
434
|
+
}
|
|
435
|
+
);
|
|
436
|
+
}
|
|
437
|
+
for (let docIndex = 0; docIndex < documents.length; docIndex++) {
|
|
438
|
+
const doc = documents[docIndex];
|
|
439
|
+
const chunks = chunkPlan[docIndex];
|
|
389
440
|
try {
|
|
390
|
-
const chunks = this.chunkContent(doc.content);
|
|
391
441
|
const isChunked = chunks.length > 1;
|
|
392
442
|
if (isChunked) {
|
|
393
443
|
await collection.deleteMany({
|
|
@@ -422,6 +472,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
422
472
|
},
|
|
423
473
|
{ upsert: true }
|
|
424
474
|
);
|
|
475
|
+
chunksProcessed++;
|
|
476
|
+
if (onCrawlProgress) {
|
|
477
|
+
this.emitCrawlProgress(
|
|
478
|
+
{ metadata: options?.metadata },
|
|
479
|
+
{
|
|
480
|
+
phase: "indexing",
|
|
481
|
+
urlsScheduled: indexingTotal,
|
|
482
|
+
pagesProcessed: docIndex + (i + 1 === chunks.length ? 1 : 0),
|
|
483
|
+
chunksTotal,
|
|
484
|
+
chunksProcessed
|
|
485
|
+
}
|
|
486
|
+
);
|
|
487
|
+
}
|
|
425
488
|
}
|
|
426
489
|
indexed++;
|
|
427
490
|
} catch (error) {
|
|
@@ -501,23 +564,57 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
501
564
|
let deleted = 0;
|
|
502
565
|
let failed = 0;
|
|
503
566
|
const errors = [];
|
|
567
|
+
const opsTotal = operations.length;
|
|
568
|
+
let opsDone = 0;
|
|
569
|
+
const ingestOptions = options ?? {};
|
|
570
|
+
this.emitBulkProgress(ingestOptions, {
|
|
571
|
+
phase: "processing",
|
|
572
|
+
opsTotal,
|
|
573
|
+
opsDone: 0
|
|
574
|
+
});
|
|
504
575
|
for (const op of operations) {
|
|
576
|
+
const currentUrl = bulkOpCurrentUrl(op);
|
|
505
577
|
try {
|
|
506
578
|
switch (op.type) {
|
|
507
579
|
case "insert":
|
|
508
580
|
if (op.document) {
|
|
509
|
-
|
|
510
|
-
|
|
581
|
+
if (isUrlListingInsert(op.document)) {
|
|
582
|
+
const url = bulkOpCurrentUrl(op);
|
|
583
|
+
const crawlResult = await this.ingestSinglePageFromUrl(
|
|
584
|
+
{
|
|
585
|
+
url,
|
|
586
|
+
metadata: {
|
|
587
|
+
...op.document.metadata ?? {},
|
|
588
|
+
url
|
|
589
|
+
}
|
|
590
|
+
},
|
|
591
|
+
ingestOptions
|
|
592
|
+
);
|
|
593
|
+
if (crawlResult.indexed > 0) {
|
|
594
|
+
inserted++;
|
|
595
|
+
} else {
|
|
596
|
+
failed++;
|
|
597
|
+
const err = crawlResult.errors?.[0]?.error ?? `Failed to crawl ${url}`;
|
|
598
|
+
errors.push({
|
|
599
|
+
id: op.id,
|
|
600
|
+
operation: op.type,
|
|
601
|
+
error: err
|
|
602
|
+
});
|
|
603
|
+
}
|
|
604
|
+
} else {
|
|
605
|
+
await this.ingest([op.document], ingestOptions);
|
|
606
|
+
inserted++;
|
|
607
|
+
}
|
|
511
608
|
}
|
|
512
609
|
break;
|
|
513
610
|
case "update":
|
|
514
611
|
if (op.document) {
|
|
515
|
-
await this.update(op.id, op.document,
|
|
612
|
+
await this.update(op.id, op.document, ingestOptions);
|
|
516
613
|
updated++;
|
|
517
614
|
}
|
|
518
615
|
break;
|
|
519
616
|
case "delete":
|
|
520
|
-
const count = await this.delete(op.id,
|
|
617
|
+
const count = await this.delete(op.id, ingestOptions);
|
|
521
618
|
deleted += count;
|
|
522
619
|
break;
|
|
523
620
|
}
|
|
@@ -528,6 +625,15 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
528
625
|
operation: op.type,
|
|
529
626
|
error: error.message || "Unknown error"
|
|
530
627
|
});
|
|
628
|
+
} finally {
|
|
629
|
+
opsDone++;
|
|
630
|
+
this.emitBulkProgress(ingestOptions, {
|
|
631
|
+
phase: "processing",
|
|
632
|
+
opsTotal,
|
|
633
|
+
opsDone,
|
|
634
|
+
currentOpType: op.type,
|
|
635
|
+
...currentUrl ? { currentUrl } : {}
|
|
636
|
+
});
|
|
531
637
|
}
|
|
532
638
|
}
|
|
533
639
|
return {
|
|
@@ -1094,6 +1200,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1094
1200
|
};
|
|
1095
1201
|
}
|
|
1096
1202
|
const dbg = this.createDebugCollector(config.debug);
|
|
1203
|
+
this.emitCrawlProgress(config, { phase: "discovering", urlsDiscovered: 0 });
|
|
1097
1204
|
const base = this.normalizeWebsiteUrl(config.baseUrl, stripQueryParams);
|
|
1098
1205
|
if (!base) {
|
|
1099
1206
|
return {
|
|
@@ -1125,6 +1232,10 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1125
1232
|
if (config.excludePatterns?.length) {
|
|
1126
1233
|
filteredUrls = filteredUrls.filter((u) => !config.excludePatterns.some((p) => u.includes(p)));
|
|
1127
1234
|
}
|
|
1235
|
+
this.emitCrawlProgress(config, {
|
|
1236
|
+
phase: "discovering",
|
|
1237
|
+
urlsDiscovered: filteredUrls.length
|
|
1238
|
+
});
|
|
1128
1239
|
urlsToCrawl = filteredUrls.slice(0, maxPages);
|
|
1129
1240
|
urlsSkipped = Math.max(0, filteredUrls.length - urlsToCrawl.length);
|
|
1130
1241
|
break;
|
|
@@ -1146,7 +1257,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1146
1257
|
urlsToCrawl = discovery.urls;
|
|
1147
1258
|
urlsSkipped = discovery.skipped;
|
|
1148
1259
|
dbg.log("discovery.linkLookup", { discovered: urlsToCrawl.length, skipped: urlsSkipped });
|
|
1260
|
+
this.emitCrawlProgress(config, {
|
|
1261
|
+
phase: "discovering",
|
|
1262
|
+
urlsDiscovered: urlsToCrawl.length
|
|
1263
|
+
});
|
|
1149
1264
|
}
|
|
1265
|
+
this.emitCrawlProgress(config, {
|
|
1266
|
+
phase: "crawling",
|
|
1267
|
+
urlsDiscovered: urlsToCrawl.length,
|
|
1268
|
+
urlsScheduled: urlsToCrawl.length
|
|
1269
|
+
});
|
|
1150
1270
|
const result = await this.crawlUrls(urlsToCrawl, {
|
|
1151
1271
|
contentSelector: config.contentSelector,
|
|
1152
1272
|
titleSelector: config.titleSelector,
|
|
@@ -1168,9 +1288,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1168
1288
|
return {
|
|
1169
1289
|
...result,
|
|
1170
1290
|
urlsSkipped,
|
|
1291
|
+
/** URLs selected for this crawl (≤ maxPages); use as the denominator in progress UIs. */
|
|
1292
|
+
urlsScheduled: urlsToCrawl.length,
|
|
1171
1293
|
crawledAt: /* @__PURE__ */ new Date(),
|
|
1172
1294
|
metadata: {
|
|
1173
1295
|
...result.metadata || {},
|
|
1296
|
+
urlsScheduled: urlsToCrawl.length,
|
|
1174
1297
|
discoveryDebug: dbg.summary()
|
|
1175
1298
|
}
|
|
1176
1299
|
};
|
|
@@ -1398,6 +1521,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1398
1521
|
const forceRecrawl = !!(options && options.forceRecrawl);
|
|
1399
1522
|
const agentId = options?.agentId ?? "shared";
|
|
1400
1523
|
const stripQ = config.stripQueryParams ?? false;
|
|
1524
|
+
const ingestionId = typeof config.metadata?.ingestionId === "string" && config.metadata.ingestionId.trim() ? config.metadata.ingestionId.trim() : void 0;
|
|
1401
1525
|
const urlByNorm = /* @__PURE__ */ new Map();
|
|
1402
1526
|
for (const u of urls) {
|
|
1403
1527
|
const norm = this.normalizeLedgerUrl(u, stripQ) || u;
|
|
@@ -1426,6 +1550,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1426
1550
|
const results = await Promise.allSettled(
|
|
1427
1551
|
batch.map(async (url) => {
|
|
1428
1552
|
const urlNormalized = this.normalizeLedgerUrl(url, stripQ) || url;
|
|
1553
|
+
this.emitCrawlPage(config, { url, event: "start" });
|
|
1429
1554
|
if (ledgerOpts && !forceRecrawl) {
|
|
1430
1555
|
const entry = await this.findLedgerEntry(urlNormalized, agentId);
|
|
1431
1556
|
if (this.shouldSkipLedger(
|
|
@@ -1446,6 +1571,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1446
1571
|
docId: entry?.docId
|
|
1447
1572
|
});
|
|
1448
1573
|
dbg.log("crawl.ledgerSkip", { url, urlNormalized, lastStatus: entry?.lastStatus });
|
|
1574
|
+
if (ledgerOpts) {
|
|
1575
|
+
await this.upsertLedgerRecord({
|
|
1576
|
+
url,
|
|
1577
|
+
urlNormalized,
|
|
1578
|
+
agentId,
|
|
1579
|
+
ingestionId,
|
|
1580
|
+
status: "skipped_ledger",
|
|
1581
|
+
title: entry?.title,
|
|
1582
|
+
docId: entry?.docId,
|
|
1583
|
+
contentLength: entry?.contentLength
|
|
1584
|
+
});
|
|
1585
|
+
}
|
|
1586
|
+
this.emitCrawlPage(config, { url, event: "done", status: "skipped_ledger" });
|
|
1449
1587
|
return { kind: "ledger_skip", url };
|
|
1450
1588
|
}
|
|
1451
1589
|
}
|
|
@@ -1469,6 +1607,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1469
1607
|
url,
|
|
1470
1608
|
urlNormalized,
|
|
1471
1609
|
agentId,
|
|
1610
|
+
ingestionId,
|
|
1472
1611
|
status: crawlSt,
|
|
1473
1612
|
doc,
|
|
1474
1613
|
diag
|
|
@@ -1485,6 +1624,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1485
1624
|
docId: doc?.id,
|
|
1486
1625
|
error: diag?.errorMessage
|
|
1487
1626
|
});
|
|
1627
|
+
this.emitCrawlPage(config, {
|
|
1628
|
+
url,
|
|
1629
|
+
event: "done",
|
|
1630
|
+
status: crawlSt,
|
|
1631
|
+
error: diag?.errorMessage
|
|
1632
|
+
});
|
|
1488
1633
|
return { kind: "doc", doc, url };
|
|
1489
1634
|
} catch (error) {
|
|
1490
1635
|
const msg = error instanceof Error ? error.message : String(error);
|
|
@@ -1493,6 +1638,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1493
1638
|
url,
|
|
1494
1639
|
urlNormalized,
|
|
1495
1640
|
agentId,
|
|
1641
|
+
ingestionId,
|
|
1496
1642
|
status: "error",
|
|
1497
1643
|
errorMessage: msg
|
|
1498
1644
|
});
|
|
@@ -1503,6 +1649,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1503
1649
|
status: "error",
|
|
1504
1650
|
error: msg
|
|
1505
1651
|
});
|
|
1652
|
+
this.emitCrawlPage(config, { url, event: "done", status: "error", error: msg });
|
|
1506
1653
|
throw { url, error };
|
|
1507
1654
|
}
|
|
1508
1655
|
})
|
|
@@ -1525,12 +1672,23 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1525
1672
|
});
|
|
1526
1673
|
}
|
|
1527
1674
|
}
|
|
1675
|
+
this.emitCrawlProgress(config, {
|
|
1676
|
+
phase: "crawling",
|
|
1677
|
+
urlsScheduled: uniqueUrls.length,
|
|
1678
|
+
pagesProcessed: Math.min(i + batch.length, uniqueUrls.length)
|
|
1679
|
+
});
|
|
1528
1680
|
if (i + concurrency < uniqueUrls.length) {
|
|
1529
1681
|
await this.delay(delayMs);
|
|
1530
1682
|
}
|
|
1531
1683
|
}
|
|
1532
1684
|
if (documents.length > 0) {
|
|
1533
|
-
const ingestResult = await this.ingest(documents,
|
|
1685
|
+
const ingestResult = await this.ingest(documents, {
|
|
1686
|
+
...options,
|
|
1687
|
+
metadata: {
|
|
1688
|
+
...options?.metadata ?? {},
|
|
1689
|
+
onCrawlProgress: config.metadata?.onCrawlProgress
|
|
1690
|
+
}
|
|
1691
|
+
});
|
|
1534
1692
|
indexed = ingestResult.indexed;
|
|
1535
1693
|
if (ingestResult.errors) {
|
|
1536
1694
|
errors.push(...ingestResult.errors);
|
|
@@ -1921,6 +2079,30 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1921
2079
|
}
|
|
1922
2080
|
return Array.from(found);
|
|
1923
2081
|
}
|
|
2082
|
+
emitBulkProgress(options, update) {
|
|
2083
|
+
const fn = options?.metadata?.onBulkProgress;
|
|
2084
|
+
if (!fn) return;
|
|
2085
|
+
try {
|
|
2086
|
+
fn(update);
|
|
2087
|
+
} catch {
|
|
2088
|
+
}
|
|
2089
|
+
}
|
|
2090
|
+
emitCrawlProgress(config, update) {
|
|
2091
|
+
const fn = config.metadata?.onCrawlProgress;
|
|
2092
|
+
if (!fn) return;
|
|
2093
|
+
try {
|
|
2094
|
+
fn(update);
|
|
2095
|
+
} catch {
|
|
2096
|
+
}
|
|
2097
|
+
}
|
|
2098
|
+
emitCrawlPage(config, event) {
|
|
2099
|
+
const fn = config.metadata?.onCrawlPage;
|
|
2100
|
+
if (!fn) return;
|
|
2101
|
+
try {
|
|
2102
|
+
fn(event);
|
|
2103
|
+
} catch {
|
|
2104
|
+
}
|
|
2105
|
+
}
|
|
1924
2106
|
createDebugCollector(debug) {
|
|
1925
2107
|
const enabled = !!debug?.enabled;
|
|
1926
2108
|
const level = debug?.level || "summary";
|
|
@@ -2209,6 +2391,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2209
2391
|
filterableFields: this.config.filterableFields,
|
|
2210
2392
|
typeBoosts: this.config.typeBoosts,
|
|
2211
2393
|
recencyBoost: this.config.recencyBoost,
|
|
2394
|
+
crawlLedger: this.config.crawlLedger,
|
|
2212
2395
|
priority: this.priority
|
|
2213
2396
|
};
|
|
2214
2397
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@snap-agent/rag-web",
|
|
3
|
-
"version": "0.1.4",
|
|
3
|
+
"version": "0.1.5",
|
|
4
4
|
"description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"module": "./dist/index.mjs",
|