@snap-agent/rag-web 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +49 -1
- package/dist/index.d.ts +49 -1
- package/dist/index.js +231 -12
- package/dist/index.mjs +231 -12
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -118,6 +118,8 @@ interface CrawlPageStatusEntry {
|
|
|
118
118
|
interface CrawlLedgerDocument {
|
|
119
119
|
tenantId: string;
|
|
120
120
|
agentId: string;
|
|
121
|
+
/** Correlates ledger rows with a single ingest run (from crawl metadata.ingestionId). */
|
|
122
|
+
ingestionId?: string;
|
|
121
123
|
urlNormalized: string;
|
|
122
124
|
url: string;
|
|
123
125
|
domain: string;
|
|
@@ -369,10 +371,46 @@ interface RSSConfig {
|
|
|
369
371
|
/**
|
|
370
372
|
* Crawl result for sitemap/URL crawling
|
|
371
373
|
*/
|
|
374
|
+
type CrawlProgressPhase = 'discovering' | 'crawling' | 'indexing';
|
|
375
|
+
/** Live crawl progress (via `metadata.onCrawlProgress` on WebsiteCrawlConfig). */
|
|
376
|
+
interface CrawlProgressUpdate {
|
|
377
|
+
phase: CrawlProgressPhase;
|
|
378
|
+
/** URLs found in sitemap/BFS (may exceed urlsScheduled when capped by maxPages). */
|
|
379
|
+
urlsDiscovered?: number;
|
|
380
|
+
/** URLs that will be crawled in this run (≤ maxPages). */
|
|
381
|
+
urlsScheduled?: number;
|
|
382
|
+
/** During crawl: URLs processed so far (updated after each batch). During indexing: documents fully embedded so far. */
|
|
383
|
+
pagesProcessed?: number;
|
|
384
|
+
/** During indexing: total text chunks to embed (drives web_content writes). */
|
|
385
|
+
chunksTotal?: number;
|
|
386
|
+
/** During indexing: chunks embedded so far. */
|
|
387
|
+
chunksProcessed?: number;
|
|
388
|
+
}
|
|
389
|
+
type CrawlProgressCallback = (update: CrawlProgressUpdate) => void;
|
|
390
|
+
type BulkProgressPhase = 'processing' | 'indexing';
|
|
391
|
+
/** Live bulk progress (via `metadata.onBulkProgress` on IngestOptions). */
|
|
392
|
+
interface BulkProgressUpdate {
|
|
393
|
+
phase: BulkProgressPhase;
|
|
394
|
+
opsTotal: number;
|
|
395
|
+
opsDone: number;
|
|
396
|
+
currentOpType?: 'insert' | 'update' | 'delete';
|
|
397
|
+
currentUrl?: string;
|
|
398
|
+
}
|
|
399
|
+
type BulkProgressCallback = (update: BulkProgressUpdate) => void;
|
|
400
|
+
/** Per-URL crawl lifecycle (via `metadata.onCrawlPage` on WebsiteCrawlConfig). */
|
|
401
|
+
type CrawlPageEvent = {
|
|
402
|
+
url: string;
|
|
403
|
+
event: 'start' | 'done';
|
|
404
|
+
status?: string;
|
|
405
|
+
error?: string;
|
|
406
|
+
};
|
|
407
|
+
type CrawlPageCallback = (event: CrawlPageEvent) => void;
|
|
372
408
|
interface CrawlResult extends WebIngestResult {
|
|
373
409
|
urlsCrawled: number;
|
|
374
410
|
urlsSkipped: number;
|
|
375
411
|
urlsFailed: number;
|
|
412
|
+
/** URLs selected for this crawl run (≤ maxPages); for progress UI. */
|
|
413
|
+
urlsScheduled?: number;
|
|
376
414
|
crawledAt: Date;
|
|
377
415
|
}
|
|
378
416
|
/**
|
|
@@ -423,6 +461,7 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
423
461
|
private cacheStats;
|
|
424
462
|
constructor(config: WebRAGConfig);
|
|
425
463
|
private getCollection;
|
|
464
|
+
private ledgerIndexesEnsured;
|
|
426
465
|
private getLedgerCollection;
|
|
427
466
|
/**
|
|
428
467
|
* List recent crawl ledger rows (for dashboards / pagination in the front).
|
|
@@ -430,6 +469,7 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
430
469
|
listCrawlLedger(options?: {
|
|
431
470
|
agentId?: string;
|
|
432
471
|
domain?: string;
|
|
472
|
+
ingestionId?: string;
|
|
433
473
|
status?: CrawlLedgerStatus;
|
|
434
474
|
limit?: number;
|
|
435
475
|
skip?: number;
|
|
@@ -660,11 +700,19 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
660
700
|
private extractBestContentText;
|
|
661
701
|
private bodyTextLengthHint;
|
|
662
702
|
private extractDocumentFromHtml;
|
|
703
|
+
/**
|
|
704
|
+
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
705
|
+
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
706
|
+
*/
|
|
707
|
+
private extractHeroImage;
|
|
663
708
|
private looksLikeDynamicShell;
|
|
664
709
|
private diagFromRenderedAttempt;
|
|
665
710
|
private crawlPageSmart;
|
|
666
711
|
private crawlPageRendered;
|
|
667
712
|
private discoverSitemaps;
|
|
713
|
+
private emitBulkProgress;
|
|
714
|
+
private emitCrawlProgress;
|
|
715
|
+
private emitCrawlPage;
|
|
668
716
|
private createDebugCollector;
|
|
669
717
|
/**
|
|
670
718
|
* Clean extracted text content
|
|
@@ -735,4 +783,4 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
735
783
|
getConfig(): Record<string, any>;
|
|
736
784
|
}
|
|
737
785
|
|
|
738
|
-
export { type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageStatusEntry, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
|
|
786
|
+
export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
|
package/dist/index.d.ts
CHANGED
|
@@ -118,6 +118,8 @@ interface CrawlPageStatusEntry {
|
|
|
118
118
|
interface CrawlLedgerDocument {
|
|
119
119
|
tenantId: string;
|
|
120
120
|
agentId: string;
|
|
121
|
+
/** Correlates ledger rows with a single ingest run (from crawl metadata.ingestionId). */
|
|
122
|
+
ingestionId?: string;
|
|
121
123
|
urlNormalized: string;
|
|
122
124
|
url: string;
|
|
123
125
|
domain: string;
|
|
@@ -369,10 +371,46 @@ interface RSSConfig {
|
|
|
369
371
|
/**
|
|
370
372
|
* Crawl result for sitemap/URL crawling
|
|
371
373
|
*/
|
|
374
|
+
type CrawlProgressPhase = 'discovering' | 'crawling' | 'indexing';
|
|
375
|
+
/** Live crawl progress (via `metadata.onCrawlProgress` on WebsiteCrawlConfig). */
|
|
376
|
+
interface CrawlProgressUpdate {
|
|
377
|
+
phase: CrawlProgressPhase;
|
|
378
|
+
/** URLs found in sitemap/BFS (may exceed urlsScheduled when capped by maxPages). */
|
|
379
|
+
urlsDiscovered?: number;
|
|
380
|
+
/** URLs that will be crawled in this run (≤ maxPages). */
|
|
381
|
+
urlsScheduled?: number;
|
|
382
|
+
/** During crawl: URLs processed so far (updated after each batch). During indexing: documents fully embedded so far. */
|
|
383
|
+
pagesProcessed?: number;
|
|
384
|
+
/** During indexing: total text chunks to embed (drives web_content writes). */
|
|
385
|
+
chunksTotal?: number;
|
|
386
|
+
/** During indexing: chunks embedded so far. */
|
|
387
|
+
chunksProcessed?: number;
|
|
388
|
+
}
|
|
389
|
+
type CrawlProgressCallback = (update: CrawlProgressUpdate) => void;
|
|
390
|
+
type BulkProgressPhase = 'processing' | 'indexing';
|
|
391
|
+
/** Live bulk progress (via `metadata.onBulkProgress` on IngestOptions). */
|
|
392
|
+
interface BulkProgressUpdate {
|
|
393
|
+
phase: BulkProgressPhase;
|
|
394
|
+
opsTotal: number;
|
|
395
|
+
opsDone: number;
|
|
396
|
+
currentOpType?: 'insert' | 'update' | 'delete';
|
|
397
|
+
currentUrl?: string;
|
|
398
|
+
}
|
|
399
|
+
type BulkProgressCallback = (update: BulkProgressUpdate) => void;
|
|
400
|
+
/** Per-URL crawl lifecycle (via `metadata.onCrawlPage` on WebsiteCrawlConfig). */
|
|
401
|
+
type CrawlPageEvent = {
|
|
402
|
+
url: string;
|
|
403
|
+
event: 'start' | 'done';
|
|
404
|
+
status?: string;
|
|
405
|
+
error?: string;
|
|
406
|
+
};
|
|
407
|
+
type CrawlPageCallback = (event: CrawlPageEvent) => void;
|
|
372
408
|
interface CrawlResult extends WebIngestResult {
|
|
373
409
|
urlsCrawled: number;
|
|
374
410
|
urlsSkipped: number;
|
|
375
411
|
urlsFailed: number;
|
|
412
|
+
/** URLs selected for this crawl run (≤ maxPages); for progress UI. */
|
|
413
|
+
urlsScheduled?: number;
|
|
376
414
|
crawledAt: Date;
|
|
377
415
|
}
|
|
378
416
|
/**
|
|
@@ -423,6 +461,7 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
423
461
|
private cacheStats;
|
|
424
462
|
constructor(config: WebRAGConfig);
|
|
425
463
|
private getCollection;
|
|
464
|
+
private ledgerIndexesEnsured;
|
|
426
465
|
private getLedgerCollection;
|
|
427
466
|
/**
|
|
428
467
|
* List recent crawl ledger rows (for dashboards / pagination in the front).
|
|
@@ -430,6 +469,7 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
430
469
|
listCrawlLedger(options?: {
|
|
431
470
|
agentId?: string;
|
|
432
471
|
domain?: string;
|
|
472
|
+
ingestionId?: string;
|
|
433
473
|
status?: CrawlLedgerStatus;
|
|
434
474
|
limit?: number;
|
|
435
475
|
skip?: number;
|
|
@@ -660,11 +700,19 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
660
700
|
private extractBestContentText;
|
|
661
701
|
private bodyTextLengthHint;
|
|
662
702
|
private extractDocumentFromHtml;
|
|
703
|
+
/**
|
|
704
|
+
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
705
|
+
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
706
|
+
*/
|
|
707
|
+
private extractHeroImage;
|
|
663
708
|
private looksLikeDynamicShell;
|
|
664
709
|
private diagFromRenderedAttempt;
|
|
665
710
|
private crawlPageSmart;
|
|
666
711
|
private crawlPageRendered;
|
|
667
712
|
private discoverSitemaps;
|
|
713
|
+
private emitBulkProgress;
|
|
714
|
+
private emitCrawlProgress;
|
|
715
|
+
private emitCrawlPage;
|
|
668
716
|
private createDebugCollector;
|
|
669
717
|
/**
|
|
670
718
|
* Clean extracted text content
|
|
@@ -735,4 +783,4 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
735
783
|
getConfig(): Record<string, any>;
|
|
736
784
|
}
|
|
737
785
|
|
|
738
|
-
export { type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageStatusEntry, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
|
|
786
|
+
export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
|
package/dist/index.js
CHANGED
|
@@ -40,6 +40,24 @@ var import_openai = __toESM(require("openai"));
|
|
|
40
40
|
var cheerio = __toESM(require("cheerio"));
|
|
41
41
|
var fs = __toESM(require("fs"));
|
|
42
42
|
var path = __toESM(require("path"));
|
|
43
|
+
function bulkOpCurrentUrl(op) {
|
|
44
|
+
const meta = op.document?.metadata;
|
|
45
|
+
if (typeof meta?.url === "string" && meta.url.trim()) return meta.url.trim();
|
|
46
|
+
if (typeof meta?.source === "string" && meta.source.trim()) return meta.source.trim();
|
|
47
|
+
return void 0;
|
|
48
|
+
}
|
|
49
|
+
function isUrlListingInsert(document) {
|
|
50
|
+
const meta = document.metadata;
|
|
51
|
+
if (meta?.type !== "url") return false;
|
|
52
|
+
const url = typeof meta.url === "string" ? meta.url.trim() : "";
|
|
53
|
+
if (!url) return false;
|
|
54
|
+
try {
|
|
55
|
+
const parsed = new URL(url);
|
|
56
|
+
return parsed.protocol === "http:" || parsed.protocol === "https:";
|
|
57
|
+
} catch {
|
|
58
|
+
return false;
|
|
59
|
+
}
|
|
60
|
+
}
|
|
43
61
|
var WebRAGPlugin = class _WebRAGPlugin {
|
|
44
62
|
name = "web-rag";
|
|
45
63
|
type = "rag";
|
|
@@ -78,6 +96,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
78
96
|
}
|
|
79
97
|
return this.db.collection(this.config.collection);
|
|
80
98
|
}
|
|
99
|
+
ledgerIndexesEnsured = false;
|
|
81
100
|
async getLedgerCollection() {
|
|
82
101
|
if (!this.client) {
|
|
83
102
|
this.client = new import_mongodb.MongoClient(this.config.mongoUri);
|
|
@@ -85,7 +104,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
85
104
|
this.db = this.client.db(this.config.dbName);
|
|
86
105
|
}
|
|
87
106
|
const name = this.config.crawlLedger?.collection ?? "web_crawl_ledger";
|
|
88
|
-
|
|
107
|
+
const col = this.db.collection(name);
|
|
108
|
+
if (!this.ledgerIndexesEnsured) {
|
|
109
|
+
this.ledgerIndexesEnsured = true;
|
|
110
|
+
await col.createIndex(
|
|
111
|
+
{ tenantId: 1, agentId: 1, urlNormalized: 1 },
|
|
112
|
+
{ unique: true }
|
|
113
|
+
);
|
|
114
|
+
await col.createIndex({ tenantId: 1, agentId: 1, ingestionId: 1, lastCrawledAt: -1 });
|
|
115
|
+
}
|
|
116
|
+
return col;
|
|
89
117
|
}
|
|
90
118
|
/**
|
|
91
119
|
* List recent crawl ledger rows (for dashboards / pagination in the front).
|
|
@@ -95,6 +123,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
95
123
|
const filter = { tenantId: this.config.tenantId };
|
|
96
124
|
filter.agentId = options.agentId ?? "shared";
|
|
97
125
|
if (options.domain) filter.domain = options.domain;
|
|
126
|
+
if (options.ingestionId) filter.ingestionId = options.ingestionId;
|
|
98
127
|
if (options.status) filter.lastStatus = options.status;
|
|
99
128
|
const limit = Math.min(Math.max(options.limit ?? 50, 1), 500);
|
|
100
129
|
const skip = Math.max(options.skip ?? 0, 0);
|
|
@@ -163,6 +192,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
163
192
|
lastCrawledAt: now,
|
|
164
193
|
updatedAt: now
|
|
165
194
|
};
|
|
195
|
+
if (params.ingestionId) {
|
|
196
|
+
$set.ingestionId = params.ingestionId;
|
|
197
|
+
}
|
|
166
198
|
if (errMsg !== void 0) {
|
|
167
199
|
$set.errorMessage = errMsg;
|
|
168
200
|
} else if (params.status === "indexed" && params.doc) {
|
|
@@ -175,9 +207,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
175
207
|
$set.docId = params.doc.id;
|
|
176
208
|
} else {
|
|
177
209
|
$set.modeUsed = params.diag?.modeUsed;
|
|
178
|
-
$set.contentLength = null;
|
|
179
|
-
$set.title = null;
|
|
180
|
-
$set.docId = null;
|
|
210
|
+
$set.contentLength = params.contentLength ?? null;
|
|
211
|
+
$set.title = params.title ?? null;
|
|
212
|
+
$set.docId = params.docId ?? null;
|
|
181
213
|
}
|
|
182
214
|
await col.updateOne(
|
|
183
215
|
{
|
|
@@ -254,6 +286,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
254
286
|
type: doc.metadata.type,
|
|
255
287
|
title: doc.metadata.title,
|
|
256
288
|
url: doc.metadata.url,
|
|
289
|
+
imageUrl: doc.metadata.imageUrl,
|
|
290
|
+
description: doc.metadata.description,
|
|
257
291
|
score: doc.score
|
|
258
292
|
}))
|
|
259
293
|
}
|
|
@@ -419,9 +453,27 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
419
453
|
let indexed = 0;
|
|
420
454
|
const errors = [];
|
|
421
455
|
const agentId = options?.agentId || "shared";
|
|
422
|
-
|
|
456
|
+
const onCrawlProgress = options?.metadata?.onCrawlProgress;
|
|
457
|
+
const indexingTotal = documents.length;
|
|
458
|
+
const chunkPlan = documents.map((doc) => this.chunkContent(doc.content));
|
|
459
|
+
const chunksTotal = chunkPlan.reduce((sum, chunks) => sum + chunks.length, 0);
|
|
460
|
+
let chunksProcessed = 0;
|
|
461
|
+
if (onCrawlProgress && indexingTotal > 0) {
|
|
462
|
+
this.emitCrawlProgress(
|
|
463
|
+
{ metadata: options?.metadata },
|
|
464
|
+
{
|
|
465
|
+
phase: "indexing",
|
|
466
|
+
urlsScheduled: indexingTotal,
|
|
467
|
+
pagesProcessed: 0,
|
|
468
|
+
chunksTotal,
|
|
469
|
+
chunksProcessed: 0
|
|
470
|
+
}
|
|
471
|
+
);
|
|
472
|
+
}
|
|
473
|
+
for (let docIndex = 0; docIndex < documents.length; docIndex++) {
|
|
474
|
+
const doc = documents[docIndex];
|
|
475
|
+
const chunks = chunkPlan[docIndex];
|
|
423
476
|
try {
|
|
424
|
-
const chunks = this.chunkContent(doc.content);
|
|
425
477
|
const isChunked = chunks.length > 1;
|
|
426
478
|
if (isChunked) {
|
|
427
479
|
await collection.deleteMany({
|
|
@@ -456,6 +508,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
456
508
|
},
|
|
457
509
|
{ upsert: true }
|
|
458
510
|
);
|
|
511
|
+
chunksProcessed++;
|
|
512
|
+
if (onCrawlProgress) {
|
|
513
|
+
this.emitCrawlProgress(
|
|
514
|
+
{ metadata: options?.metadata },
|
|
515
|
+
{
|
|
516
|
+
phase: "indexing",
|
|
517
|
+
urlsScheduled: indexingTotal,
|
|
518
|
+
pagesProcessed: docIndex + (i + 1 === chunks.length ? 1 : 0),
|
|
519
|
+
chunksTotal,
|
|
520
|
+
chunksProcessed
|
|
521
|
+
}
|
|
522
|
+
);
|
|
523
|
+
}
|
|
459
524
|
}
|
|
460
525
|
indexed++;
|
|
461
526
|
} catch (error) {
|
|
@@ -535,23 +600,57 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
535
600
|
let deleted = 0;
|
|
536
601
|
let failed = 0;
|
|
537
602
|
const errors = [];
|
|
603
|
+
const opsTotal = operations.length;
|
|
604
|
+
let opsDone = 0;
|
|
605
|
+
const ingestOptions = options ?? {};
|
|
606
|
+
this.emitBulkProgress(ingestOptions, {
|
|
607
|
+
phase: "processing",
|
|
608
|
+
opsTotal,
|
|
609
|
+
opsDone: 0
|
|
610
|
+
});
|
|
538
611
|
for (const op of operations) {
|
|
612
|
+
const currentUrl = bulkOpCurrentUrl(op);
|
|
539
613
|
try {
|
|
540
614
|
switch (op.type) {
|
|
541
615
|
case "insert":
|
|
542
616
|
if (op.document) {
|
|
543
|
-
|
|
544
|
-
|
|
617
|
+
if (isUrlListingInsert(op.document)) {
|
|
618
|
+
const url = bulkOpCurrentUrl(op);
|
|
619
|
+
const crawlResult = await this.ingestSinglePageFromUrl(
|
|
620
|
+
{
|
|
621
|
+
url,
|
|
622
|
+
metadata: {
|
|
623
|
+
...op.document.metadata ?? {},
|
|
624
|
+
url
|
|
625
|
+
}
|
|
626
|
+
},
|
|
627
|
+
ingestOptions
|
|
628
|
+
);
|
|
629
|
+
if (crawlResult.indexed > 0) {
|
|
630
|
+
inserted++;
|
|
631
|
+
} else {
|
|
632
|
+
failed++;
|
|
633
|
+
const err = crawlResult.errors?.[0]?.error ?? `Failed to crawl ${url}`;
|
|
634
|
+
errors.push({
|
|
635
|
+
id: op.id,
|
|
636
|
+
operation: op.type,
|
|
637
|
+
error: err
|
|
638
|
+
});
|
|
639
|
+
}
|
|
640
|
+
} else {
|
|
641
|
+
await this.ingest([op.document], ingestOptions);
|
|
642
|
+
inserted++;
|
|
643
|
+
}
|
|
545
644
|
}
|
|
546
645
|
break;
|
|
547
646
|
case "update":
|
|
548
647
|
if (op.document) {
|
|
549
|
-
await this.update(op.id, op.document,
|
|
648
|
+
await this.update(op.id, op.document, ingestOptions);
|
|
550
649
|
updated++;
|
|
551
650
|
}
|
|
552
651
|
break;
|
|
553
652
|
case "delete":
|
|
554
|
-
const count = await this.delete(op.id,
|
|
653
|
+
const count = await this.delete(op.id, ingestOptions);
|
|
555
654
|
deleted += count;
|
|
556
655
|
break;
|
|
557
656
|
}
|
|
@@ -562,6 +661,15 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
562
661
|
operation: op.type,
|
|
563
662
|
error: error.message || "Unknown error"
|
|
564
663
|
});
|
|
664
|
+
} finally {
|
|
665
|
+
opsDone++;
|
|
666
|
+
this.emitBulkProgress(ingestOptions, {
|
|
667
|
+
phase: "processing",
|
|
668
|
+
opsTotal,
|
|
669
|
+
opsDone,
|
|
670
|
+
currentOpType: op.type,
|
|
671
|
+
...currentUrl ? { currentUrl } : {}
|
|
672
|
+
});
|
|
565
673
|
}
|
|
566
674
|
}
|
|
567
675
|
return {
|
|
@@ -1128,6 +1236,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1128
1236
|
};
|
|
1129
1237
|
}
|
|
1130
1238
|
const dbg = this.createDebugCollector(config.debug);
|
|
1239
|
+
this.emitCrawlProgress(config, { phase: "discovering", urlsDiscovered: 0 });
|
|
1131
1240
|
const base = this.normalizeWebsiteUrl(config.baseUrl, stripQueryParams);
|
|
1132
1241
|
if (!base) {
|
|
1133
1242
|
return {
|
|
@@ -1159,6 +1268,10 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1159
1268
|
if (config.excludePatterns?.length) {
|
|
1160
1269
|
filteredUrls = filteredUrls.filter((u) => !config.excludePatterns.some((p) => u.includes(p)));
|
|
1161
1270
|
}
|
|
1271
|
+
this.emitCrawlProgress(config, {
|
|
1272
|
+
phase: "discovering",
|
|
1273
|
+
urlsDiscovered: filteredUrls.length
|
|
1274
|
+
});
|
|
1162
1275
|
urlsToCrawl = filteredUrls.slice(0, maxPages);
|
|
1163
1276
|
urlsSkipped = Math.max(0, filteredUrls.length - urlsToCrawl.length);
|
|
1164
1277
|
break;
|
|
@@ -1180,7 +1293,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1180
1293
|
urlsToCrawl = discovery.urls;
|
|
1181
1294
|
urlsSkipped = discovery.skipped;
|
|
1182
1295
|
dbg.log("discovery.linkLookup", { discovered: urlsToCrawl.length, skipped: urlsSkipped });
|
|
1296
|
+
this.emitCrawlProgress(config, {
|
|
1297
|
+
phase: "discovering",
|
|
1298
|
+
urlsDiscovered: urlsToCrawl.length
|
|
1299
|
+
});
|
|
1183
1300
|
}
|
|
1301
|
+
this.emitCrawlProgress(config, {
|
|
1302
|
+
phase: "crawling",
|
|
1303
|
+
urlsDiscovered: urlsToCrawl.length,
|
|
1304
|
+
urlsScheduled: urlsToCrawl.length
|
|
1305
|
+
});
|
|
1184
1306
|
const result = await this.crawlUrls(urlsToCrawl, {
|
|
1185
1307
|
contentSelector: config.contentSelector,
|
|
1186
1308
|
titleSelector: config.titleSelector,
|
|
@@ -1202,9 +1324,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1202
1324
|
return {
|
|
1203
1325
|
...result,
|
|
1204
1326
|
urlsSkipped,
|
|
1327
|
+
/** URLs selected for this crawl (≤ maxPages); use for progress UI denominator. */
|
|
1328
|
+
urlsScheduled: urlsToCrawl.length,
|
|
1205
1329
|
crawledAt: /* @__PURE__ */ new Date(),
|
|
1206
1330
|
metadata: {
|
|
1207
1331
|
...result.metadata || {},
|
|
1332
|
+
urlsScheduled: urlsToCrawl.length,
|
|
1208
1333
|
discoveryDebug: dbg.summary()
|
|
1209
1334
|
}
|
|
1210
1335
|
};
|
|
@@ -1432,6 +1557,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1432
1557
|
const forceRecrawl = !!(options && options.forceRecrawl);
|
|
1433
1558
|
const agentId = options?.agentId ?? "shared";
|
|
1434
1559
|
const stripQ = config.stripQueryParams ?? false;
|
|
1560
|
+
const ingestionId = typeof config.metadata?.ingestionId === "string" && config.metadata.ingestionId.trim() ? config.metadata.ingestionId.trim() : void 0;
|
|
1435
1561
|
const urlByNorm = /* @__PURE__ */ new Map();
|
|
1436
1562
|
for (const u of urls) {
|
|
1437
1563
|
const norm = this.normalizeLedgerUrl(u, stripQ) || u;
|
|
@@ -1460,6 +1586,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1460
1586
|
const results = await Promise.allSettled(
|
|
1461
1587
|
batch.map(async (url) => {
|
|
1462
1588
|
const urlNormalized = this.normalizeLedgerUrl(url, stripQ) || url;
|
|
1589
|
+
this.emitCrawlPage(config, { url, event: "start" });
|
|
1463
1590
|
if (ledgerOpts && !forceRecrawl) {
|
|
1464
1591
|
const entry = await this.findLedgerEntry(urlNormalized, agentId);
|
|
1465
1592
|
if (this.shouldSkipLedger(
|
|
@@ -1480,6 +1607,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1480
1607
|
docId: entry?.docId
|
|
1481
1608
|
});
|
|
1482
1609
|
dbg.log("crawl.ledgerSkip", { url, urlNormalized, lastStatus: entry?.lastStatus });
|
|
1610
|
+
if (ledgerOpts) {
|
|
1611
|
+
await this.upsertLedgerRecord({
|
|
1612
|
+
url,
|
|
1613
|
+
urlNormalized,
|
|
1614
|
+
agentId,
|
|
1615
|
+
ingestionId,
|
|
1616
|
+
status: "skipped_ledger",
|
|
1617
|
+
title: entry?.title,
|
|
1618
|
+
docId: entry?.docId,
|
|
1619
|
+
contentLength: entry?.contentLength
|
|
1620
|
+
});
|
|
1621
|
+
}
|
|
1622
|
+
this.emitCrawlPage(config, { url, event: "done", status: "skipped_ledger" });
|
|
1483
1623
|
return { kind: "ledger_skip", url };
|
|
1484
1624
|
}
|
|
1485
1625
|
}
|
|
@@ -1503,6 +1643,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1503
1643
|
url,
|
|
1504
1644
|
urlNormalized,
|
|
1505
1645
|
agentId,
|
|
1646
|
+
ingestionId,
|
|
1506
1647
|
status: crawlSt,
|
|
1507
1648
|
doc,
|
|
1508
1649
|
diag
|
|
@@ -1519,6 +1660,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1519
1660
|
docId: doc?.id,
|
|
1520
1661
|
error: diag?.errorMessage
|
|
1521
1662
|
});
|
|
1663
|
+
this.emitCrawlPage(config, {
|
|
1664
|
+
url,
|
|
1665
|
+
event: "done",
|
|
1666
|
+
status: crawlSt,
|
|
1667
|
+
error: diag?.errorMessage
|
|
1668
|
+
});
|
|
1522
1669
|
return { kind: "doc", doc, url };
|
|
1523
1670
|
} catch (error) {
|
|
1524
1671
|
const msg = error instanceof Error ? error.message : String(error);
|
|
@@ -1527,6 +1674,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1527
1674
|
url,
|
|
1528
1675
|
urlNormalized,
|
|
1529
1676
|
agentId,
|
|
1677
|
+
ingestionId,
|
|
1530
1678
|
status: "error",
|
|
1531
1679
|
errorMessage: msg
|
|
1532
1680
|
});
|
|
@@ -1537,6 +1685,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1537
1685
|
status: "error",
|
|
1538
1686
|
error: msg
|
|
1539
1687
|
});
|
|
1688
|
+
this.emitCrawlPage(config, { url, event: "done", status: "error", error: msg });
|
|
1540
1689
|
throw { url, error };
|
|
1541
1690
|
}
|
|
1542
1691
|
})
|
|
@@ -1559,12 +1708,23 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1559
1708
|
});
|
|
1560
1709
|
}
|
|
1561
1710
|
}
|
|
1711
|
+
this.emitCrawlProgress(config, {
|
|
1712
|
+
phase: "crawling",
|
|
1713
|
+
urlsScheduled: uniqueUrls.length,
|
|
1714
|
+
pagesProcessed: Math.min(i + batch.length, uniqueUrls.length)
|
|
1715
|
+
});
|
|
1562
1716
|
if (i + concurrency < uniqueUrls.length) {
|
|
1563
1717
|
await this.delay(delayMs);
|
|
1564
1718
|
}
|
|
1565
1719
|
}
|
|
1566
1720
|
if (documents.length > 0) {
|
|
1567
|
-
const ingestResult = await this.ingest(documents,
|
|
1721
|
+
const ingestResult = await this.ingest(documents, {
|
|
1722
|
+
...options,
|
|
1723
|
+
metadata: {
|
|
1724
|
+
...options?.metadata ?? {},
|
|
1725
|
+
onCrawlProgress: config.metadata?.onCrawlProgress
|
|
1726
|
+
}
|
|
1727
|
+
});
|
|
1568
1728
|
indexed = ingestResult.indexed;
|
|
1569
1729
|
if (ingestResult.errors) {
|
|
1570
1730
|
errors.push(...ingestResult.errors);
|
|
@@ -1659,7 +1819,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1659
1819
|
const content = this.extractBestContentText($, config);
|
|
1660
1820
|
const minChars = config.minExtractedContentLength ?? 50;
|
|
1661
1821
|
if (!content || content.length < minChars) return null;
|
|
1662
|
-
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") ||
|
|
1822
|
+
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
|
|
1823
|
+
this.extractHeroImage($, url) || void 0;
|
|
1663
1824
|
let imageUrl;
|
|
1664
1825
|
if (image) {
|
|
1665
1826
|
try {
|
|
@@ -1692,6 +1853,39 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1692
1853
|
}
|
|
1693
1854
|
};
|
|
1694
1855
|
}
|
|
1856
|
+
/**
|
|
1857
|
+
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
1858
|
+
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
1859
|
+
*/
|
|
1860
|
+
extractHeroImage($, pageUrl) {
|
|
1861
|
+
const containers = $('main, article, [role="main"], #content, .content');
|
|
1862
|
+
const scope = containers.length > 0 ? containers : $("body");
|
|
1863
|
+
let best;
|
|
1864
|
+
scope.find("img[src]").each((_, el) => {
|
|
1865
|
+
if (best) return false;
|
|
1866
|
+
const src = $(el).attr("src") || "";
|
|
1867
|
+
const alt = ($(el).attr("alt") || "").toLowerCase();
|
|
1868
|
+
const width = parseInt($(el).attr("width") || "0", 10);
|
|
1869
|
+
const height = parseInt($(el).attr("height") || "0", 10);
|
|
1870
|
+
if (width > 0 && width < 80 || height > 0 && height < 80) return;
|
|
1871
|
+
if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
|
|
1872
|
+
if (src.startsWith("data:") || src.endsWith(".svg")) return;
|
|
1873
|
+
if (src.includes("/_next/image")) {
|
|
1874
|
+
try {
|
|
1875
|
+
const nextUrl = new URL(src, pageUrl);
|
|
1876
|
+
const realUrl = nextUrl.searchParams.get("url");
|
|
1877
|
+
if (realUrl) {
|
|
1878
|
+
best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
|
|
1879
|
+
return false;
|
|
1880
|
+
}
|
|
1881
|
+
} catch {
|
|
1882
|
+
}
|
|
1883
|
+
}
|
|
1884
|
+
best = src;
|
|
1885
|
+
return false;
|
|
1886
|
+
});
|
|
1887
|
+
return best;
|
|
1888
|
+
}
|
|
1695
1889
|
looksLikeDynamicShell(html) {
|
|
1696
1890
|
const lower = html.toLowerCase();
|
|
1697
1891
|
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
|
|
@@ -1921,6 +2115,30 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1921
2115
|
}
|
|
1922
2116
|
return Array.from(found);
|
|
1923
2117
|
}
|
|
2118
|
+
emitBulkProgress(options, update) {
|
|
2119
|
+
const fn = options?.metadata?.onBulkProgress;
|
|
2120
|
+
if (!fn) return;
|
|
2121
|
+
try {
|
|
2122
|
+
fn(update);
|
|
2123
|
+
} catch {
|
|
2124
|
+
}
|
|
2125
|
+
}
|
|
2126
|
+
emitCrawlProgress(config, update) {
|
|
2127
|
+
const fn = config.metadata?.onCrawlProgress;
|
|
2128
|
+
if (!fn) return;
|
|
2129
|
+
try {
|
|
2130
|
+
fn(update);
|
|
2131
|
+
} catch {
|
|
2132
|
+
}
|
|
2133
|
+
}
|
|
2134
|
+
emitCrawlPage(config, event) {
|
|
2135
|
+
const fn = config.metadata?.onCrawlPage;
|
|
2136
|
+
if (!fn) return;
|
|
2137
|
+
try {
|
|
2138
|
+
fn(event);
|
|
2139
|
+
} catch {
|
|
2140
|
+
}
|
|
2141
|
+
}
|
|
1924
2142
|
createDebugCollector(debug) {
|
|
1925
2143
|
const enabled = !!debug?.enabled;
|
|
1926
2144
|
const level = debug?.level || "summary";
|
|
@@ -2209,6 +2427,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2209
2427
|
filterableFields: this.config.filterableFields,
|
|
2210
2428
|
typeBoosts: this.config.typeBoosts,
|
|
2211
2429
|
recencyBoost: this.config.recencyBoost,
|
|
2430
|
+
crawlLedger: this.config.crawlLedger,
|
|
2212
2431
|
priority: this.priority
|
|
2213
2432
|
};
|
|
2214
2433
|
}
|
package/dist/index.mjs
CHANGED
|
@@ -4,6 +4,24 @@ import OpenAI from "openai";
|
|
|
4
4
|
import * as cheerio from "cheerio";
|
|
5
5
|
import * as fs from "fs";
|
|
6
6
|
import * as path from "path";
|
|
7
|
+
function bulkOpCurrentUrl(op) {
|
|
8
|
+
const meta = op.document?.metadata;
|
|
9
|
+
if (typeof meta?.url === "string" && meta.url.trim()) return meta.url.trim();
|
|
10
|
+
if (typeof meta?.source === "string" && meta.source.trim()) return meta.source.trim();
|
|
11
|
+
return void 0;
|
|
12
|
+
}
|
|
13
|
+
/**
 * Tells whether a document is a "URL listing" insert: its metadata has
 * `type === "url"` and a non-blank string `url` that parses as an absolute
 * http(s) URL.
 *
 * @param document Document whose optional `metadata` is inspected.
 * @returns `true` only for a parseable `http:` or `https:` URL; `false` otherwise.
 */
function isUrlListingInsert(document) {
  const metadata = document.metadata;
  const candidate = metadata?.type === "url" && typeof metadata.url === "string" ? metadata.url.trim() : "";
  if (candidate === "") return false;
  let protocol;
  try {
    protocol = new URL(candidate).protocol;
  } catch {
    // Not an absolute URL.
    return false;
  }
  return ["http:", "https:"].includes(protocol);
}
|
|
7
25
|
var WebRAGPlugin = class _WebRAGPlugin {
|
|
8
26
|
name = "web-rag";
|
|
9
27
|
type = "rag";
|
|
@@ -42,6 +60,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
42
60
|
}
|
|
43
61
|
return this.db.collection(this.config.collection);
|
|
44
62
|
}
|
|
63
|
+
ledgerIndexesEnsured = false;
|
|
45
64
|
async getLedgerCollection() {
|
|
46
65
|
if (!this.client) {
|
|
47
66
|
this.client = new MongoClient(this.config.mongoUri);
|
|
@@ -49,7 +68,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
49
68
|
this.db = this.client.db(this.config.dbName);
|
|
50
69
|
}
|
|
51
70
|
const name = this.config.crawlLedger?.collection ?? "web_crawl_ledger";
|
|
52
|
-
|
|
71
|
+
const col = this.db.collection(name);
|
|
72
|
+
if (!this.ledgerIndexesEnsured) {
|
|
73
|
+
this.ledgerIndexesEnsured = true;
|
|
74
|
+
await col.createIndex(
|
|
75
|
+
{ tenantId: 1, agentId: 1, urlNormalized: 1 },
|
|
76
|
+
{ unique: true }
|
|
77
|
+
);
|
|
78
|
+
await col.createIndex({ tenantId: 1, agentId: 1, ingestionId: 1, lastCrawledAt: -1 });
|
|
79
|
+
}
|
|
80
|
+
return col;
|
|
53
81
|
}
|
|
54
82
|
/**
|
|
55
83
|
* List recent crawl ledger rows (for dashboards / pagination in the front).
|
|
@@ -59,6 +87,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
59
87
|
const filter = { tenantId: this.config.tenantId };
|
|
60
88
|
filter.agentId = options.agentId ?? "shared";
|
|
61
89
|
if (options.domain) filter.domain = options.domain;
|
|
90
|
+
if (options.ingestionId) filter.ingestionId = options.ingestionId;
|
|
62
91
|
if (options.status) filter.lastStatus = options.status;
|
|
63
92
|
const limit = Math.min(Math.max(options.limit ?? 50, 1), 500);
|
|
64
93
|
const skip = Math.max(options.skip ?? 0, 0);
|
|
@@ -127,6 +156,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
127
156
|
lastCrawledAt: now,
|
|
128
157
|
updatedAt: now
|
|
129
158
|
};
|
|
159
|
+
if (params.ingestionId) {
|
|
160
|
+
$set.ingestionId = params.ingestionId;
|
|
161
|
+
}
|
|
130
162
|
if (errMsg !== void 0) {
|
|
131
163
|
$set.errorMessage = errMsg;
|
|
132
164
|
} else if (params.status === "indexed" && params.doc) {
|
|
@@ -139,9 +171,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
139
171
|
$set.docId = params.doc.id;
|
|
140
172
|
} else {
|
|
141
173
|
$set.modeUsed = params.diag?.modeUsed;
|
|
142
|
-
$set.contentLength = null;
|
|
143
|
-
$set.title = null;
|
|
144
|
-
$set.docId = null;
|
|
174
|
+
$set.contentLength = params.contentLength ?? null;
|
|
175
|
+
$set.title = params.title ?? null;
|
|
176
|
+
$set.docId = params.docId ?? null;
|
|
145
177
|
}
|
|
146
178
|
await col.updateOne(
|
|
147
179
|
{
|
|
@@ -218,6 +250,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
218
250
|
type: doc.metadata.type,
|
|
219
251
|
title: doc.metadata.title,
|
|
220
252
|
url: doc.metadata.url,
|
|
253
|
+
imageUrl: doc.metadata.imageUrl,
|
|
254
|
+
description: doc.metadata.description,
|
|
221
255
|
score: doc.score
|
|
222
256
|
}))
|
|
223
257
|
}
|
|
@@ -383,9 +417,27 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
383
417
|
let indexed = 0;
|
|
384
418
|
const errors = [];
|
|
385
419
|
const agentId = options?.agentId || "shared";
|
|
386
|
-
|
|
420
|
+
const onCrawlProgress = options?.metadata?.onCrawlProgress;
|
|
421
|
+
const indexingTotal = documents.length;
|
|
422
|
+
const chunkPlan = documents.map((doc) => this.chunkContent(doc.content));
|
|
423
|
+
const chunksTotal = chunkPlan.reduce((sum, chunks) => sum + chunks.length, 0);
|
|
424
|
+
let chunksProcessed = 0;
|
|
425
|
+
if (onCrawlProgress && indexingTotal > 0) {
|
|
426
|
+
this.emitCrawlProgress(
|
|
427
|
+
{ metadata: options?.metadata },
|
|
428
|
+
{
|
|
429
|
+
phase: "indexing",
|
|
430
|
+
urlsScheduled: indexingTotal,
|
|
431
|
+
pagesProcessed: 0,
|
|
432
|
+
chunksTotal,
|
|
433
|
+
chunksProcessed: 0
|
|
434
|
+
}
|
|
435
|
+
);
|
|
436
|
+
}
|
|
437
|
+
for (let docIndex = 0; docIndex < documents.length; docIndex++) {
|
|
438
|
+
const doc = documents[docIndex];
|
|
439
|
+
const chunks = chunkPlan[docIndex];
|
|
387
440
|
try {
|
|
388
|
-
const chunks = this.chunkContent(doc.content);
|
|
389
441
|
const isChunked = chunks.length > 1;
|
|
390
442
|
if (isChunked) {
|
|
391
443
|
await collection.deleteMany({
|
|
@@ -420,6 +472,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
420
472
|
},
|
|
421
473
|
{ upsert: true }
|
|
422
474
|
);
|
|
475
|
+
chunksProcessed++;
|
|
476
|
+
if (onCrawlProgress) {
|
|
477
|
+
this.emitCrawlProgress(
|
|
478
|
+
{ metadata: options?.metadata },
|
|
479
|
+
{
|
|
480
|
+
phase: "indexing",
|
|
481
|
+
urlsScheduled: indexingTotal,
|
|
482
|
+
pagesProcessed: docIndex + (i + 1 === chunks.length ? 1 : 0),
|
|
483
|
+
chunksTotal,
|
|
484
|
+
chunksProcessed
|
|
485
|
+
}
|
|
486
|
+
);
|
|
487
|
+
}
|
|
423
488
|
}
|
|
424
489
|
indexed++;
|
|
425
490
|
} catch (error) {
|
|
@@ -499,23 +564,57 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
499
564
|
let deleted = 0;
|
|
500
565
|
let failed = 0;
|
|
501
566
|
const errors = [];
|
|
567
|
+
const opsTotal = operations.length;
|
|
568
|
+
let opsDone = 0;
|
|
569
|
+
const ingestOptions = options ?? {};
|
|
570
|
+
this.emitBulkProgress(ingestOptions, {
|
|
571
|
+
phase: "processing",
|
|
572
|
+
opsTotal,
|
|
573
|
+
opsDone: 0
|
|
574
|
+
});
|
|
502
575
|
for (const op of operations) {
|
|
576
|
+
const currentUrl = bulkOpCurrentUrl(op);
|
|
503
577
|
try {
|
|
504
578
|
switch (op.type) {
|
|
505
579
|
case "insert":
|
|
506
580
|
if (op.document) {
|
|
507
|
-
|
|
508
|
-
|
|
581
|
+
if (isUrlListingInsert(op.document)) {
|
|
582
|
+
const url = bulkOpCurrentUrl(op);
|
|
583
|
+
const crawlResult = await this.ingestSinglePageFromUrl(
|
|
584
|
+
{
|
|
585
|
+
url,
|
|
586
|
+
metadata: {
|
|
587
|
+
...op.document.metadata ?? {},
|
|
588
|
+
url
|
|
589
|
+
}
|
|
590
|
+
},
|
|
591
|
+
ingestOptions
|
|
592
|
+
);
|
|
593
|
+
if (crawlResult.indexed > 0) {
|
|
594
|
+
inserted++;
|
|
595
|
+
} else {
|
|
596
|
+
failed++;
|
|
597
|
+
const err = crawlResult.errors?.[0]?.error ?? `Failed to crawl ${url}`;
|
|
598
|
+
errors.push({
|
|
599
|
+
id: op.id,
|
|
600
|
+
operation: op.type,
|
|
601
|
+
error: err
|
|
602
|
+
});
|
|
603
|
+
}
|
|
604
|
+
} else {
|
|
605
|
+
await this.ingest([op.document], ingestOptions);
|
|
606
|
+
inserted++;
|
|
607
|
+
}
|
|
509
608
|
}
|
|
510
609
|
break;
|
|
511
610
|
case "update":
|
|
512
611
|
if (op.document) {
|
|
513
|
-
await this.update(op.id, op.document,
|
|
612
|
+
await this.update(op.id, op.document, ingestOptions);
|
|
514
613
|
updated++;
|
|
515
614
|
}
|
|
516
615
|
break;
|
|
517
616
|
case "delete":
|
|
518
|
-
const count = await this.delete(op.id,
|
|
617
|
+
const count = await this.delete(op.id, ingestOptions);
|
|
519
618
|
deleted += count;
|
|
520
619
|
break;
|
|
521
620
|
}
|
|
@@ -526,6 +625,15 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
526
625
|
operation: op.type,
|
|
527
626
|
error: error.message || "Unknown error"
|
|
528
627
|
});
|
|
628
|
+
} finally {
|
|
629
|
+
opsDone++;
|
|
630
|
+
this.emitBulkProgress(ingestOptions, {
|
|
631
|
+
phase: "processing",
|
|
632
|
+
opsTotal,
|
|
633
|
+
opsDone,
|
|
634
|
+
currentOpType: op.type,
|
|
635
|
+
...currentUrl ? { currentUrl } : {}
|
|
636
|
+
});
|
|
529
637
|
}
|
|
530
638
|
}
|
|
531
639
|
return {
|
|
@@ -1092,6 +1200,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1092
1200
|
};
|
|
1093
1201
|
}
|
|
1094
1202
|
const dbg = this.createDebugCollector(config.debug);
|
|
1203
|
+
this.emitCrawlProgress(config, { phase: "discovering", urlsDiscovered: 0 });
|
|
1095
1204
|
const base = this.normalizeWebsiteUrl(config.baseUrl, stripQueryParams);
|
|
1096
1205
|
if (!base) {
|
|
1097
1206
|
return {
|
|
@@ -1123,6 +1232,10 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1123
1232
|
if (config.excludePatterns?.length) {
|
|
1124
1233
|
filteredUrls = filteredUrls.filter((u) => !config.excludePatterns.some((p) => u.includes(p)));
|
|
1125
1234
|
}
|
|
1235
|
+
this.emitCrawlProgress(config, {
|
|
1236
|
+
phase: "discovering",
|
|
1237
|
+
urlsDiscovered: filteredUrls.length
|
|
1238
|
+
});
|
|
1126
1239
|
urlsToCrawl = filteredUrls.slice(0, maxPages);
|
|
1127
1240
|
urlsSkipped = Math.max(0, filteredUrls.length - urlsToCrawl.length);
|
|
1128
1241
|
break;
|
|
@@ -1144,7 +1257,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1144
1257
|
urlsToCrawl = discovery.urls;
|
|
1145
1258
|
urlsSkipped = discovery.skipped;
|
|
1146
1259
|
dbg.log("discovery.linkLookup", { discovered: urlsToCrawl.length, skipped: urlsSkipped });
|
|
1260
|
+
this.emitCrawlProgress(config, {
|
|
1261
|
+
phase: "discovering",
|
|
1262
|
+
urlsDiscovered: urlsToCrawl.length
|
|
1263
|
+
});
|
|
1147
1264
|
}
|
|
1265
|
+
this.emitCrawlProgress(config, {
|
|
1266
|
+
phase: "crawling",
|
|
1267
|
+
urlsDiscovered: urlsToCrawl.length,
|
|
1268
|
+
urlsScheduled: urlsToCrawl.length
|
|
1269
|
+
});
|
|
1148
1270
|
const result = await this.crawlUrls(urlsToCrawl, {
|
|
1149
1271
|
contentSelector: config.contentSelector,
|
|
1150
1272
|
titleSelector: config.titleSelector,
|
|
@@ -1166,9 +1288,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1166
1288
|
return {
|
|
1167
1289
|
...result,
|
|
1168
1290
|
urlsSkipped,
|
|
1291
|
+
/** URLs selected for this crawl (≤ maxPages); use for progress UI denominator. */
|
|
1292
|
+
urlsScheduled: urlsToCrawl.length,
|
|
1169
1293
|
crawledAt: /* @__PURE__ */ new Date(),
|
|
1170
1294
|
metadata: {
|
|
1171
1295
|
...result.metadata || {},
|
|
1296
|
+
urlsScheduled: urlsToCrawl.length,
|
|
1172
1297
|
discoveryDebug: dbg.summary()
|
|
1173
1298
|
}
|
|
1174
1299
|
};
|
|
@@ -1396,6 +1521,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1396
1521
|
const forceRecrawl = !!(options && options.forceRecrawl);
|
|
1397
1522
|
const agentId = options?.agentId ?? "shared";
|
|
1398
1523
|
const stripQ = config.stripQueryParams ?? false;
|
|
1524
|
+
const ingestionId = typeof config.metadata?.ingestionId === "string" && config.metadata.ingestionId.trim() ? config.metadata.ingestionId.trim() : void 0;
|
|
1399
1525
|
const urlByNorm = /* @__PURE__ */ new Map();
|
|
1400
1526
|
for (const u of urls) {
|
|
1401
1527
|
const norm = this.normalizeLedgerUrl(u, stripQ) || u;
|
|
@@ -1424,6 +1550,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1424
1550
|
const results = await Promise.allSettled(
|
|
1425
1551
|
batch.map(async (url) => {
|
|
1426
1552
|
const urlNormalized = this.normalizeLedgerUrl(url, stripQ) || url;
|
|
1553
|
+
this.emitCrawlPage(config, { url, event: "start" });
|
|
1427
1554
|
if (ledgerOpts && !forceRecrawl) {
|
|
1428
1555
|
const entry = await this.findLedgerEntry(urlNormalized, agentId);
|
|
1429
1556
|
if (this.shouldSkipLedger(
|
|
@@ -1444,6 +1571,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1444
1571
|
docId: entry?.docId
|
|
1445
1572
|
});
|
|
1446
1573
|
dbg.log("crawl.ledgerSkip", { url, urlNormalized, lastStatus: entry?.lastStatus });
|
|
1574
|
+
if (ledgerOpts) {
|
|
1575
|
+
await this.upsertLedgerRecord({
|
|
1576
|
+
url,
|
|
1577
|
+
urlNormalized,
|
|
1578
|
+
agentId,
|
|
1579
|
+
ingestionId,
|
|
1580
|
+
status: "skipped_ledger",
|
|
1581
|
+
title: entry?.title,
|
|
1582
|
+
docId: entry?.docId,
|
|
1583
|
+
contentLength: entry?.contentLength
|
|
1584
|
+
});
|
|
1585
|
+
}
|
|
1586
|
+
this.emitCrawlPage(config, { url, event: "done", status: "skipped_ledger" });
|
|
1447
1587
|
return { kind: "ledger_skip", url };
|
|
1448
1588
|
}
|
|
1449
1589
|
}
|
|
@@ -1467,6 +1607,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1467
1607
|
url,
|
|
1468
1608
|
urlNormalized,
|
|
1469
1609
|
agentId,
|
|
1610
|
+
ingestionId,
|
|
1470
1611
|
status: crawlSt,
|
|
1471
1612
|
doc,
|
|
1472
1613
|
diag
|
|
@@ -1483,6 +1624,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1483
1624
|
docId: doc?.id,
|
|
1484
1625
|
error: diag?.errorMessage
|
|
1485
1626
|
});
|
|
1627
|
+
this.emitCrawlPage(config, {
|
|
1628
|
+
url,
|
|
1629
|
+
event: "done",
|
|
1630
|
+
status: crawlSt,
|
|
1631
|
+
error: diag?.errorMessage
|
|
1632
|
+
});
|
|
1486
1633
|
return { kind: "doc", doc, url };
|
|
1487
1634
|
} catch (error) {
|
|
1488
1635
|
const msg = error instanceof Error ? error.message : String(error);
|
|
@@ -1491,6 +1638,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1491
1638
|
url,
|
|
1492
1639
|
urlNormalized,
|
|
1493
1640
|
agentId,
|
|
1641
|
+
ingestionId,
|
|
1494
1642
|
status: "error",
|
|
1495
1643
|
errorMessage: msg
|
|
1496
1644
|
});
|
|
@@ -1501,6 +1649,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1501
1649
|
status: "error",
|
|
1502
1650
|
error: msg
|
|
1503
1651
|
});
|
|
1652
|
+
this.emitCrawlPage(config, { url, event: "done", status: "error", error: msg });
|
|
1504
1653
|
throw { url, error };
|
|
1505
1654
|
}
|
|
1506
1655
|
})
|
|
@@ -1523,12 +1672,23 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1523
1672
|
});
|
|
1524
1673
|
}
|
|
1525
1674
|
}
|
|
1675
|
+
this.emitCrawlProgress(config, {
|
|
1676
|
+
phase: "crawling",
|
|
1677
|
+
urlsScheduled: uniqueUrls.length,
|
|
1678
|
+
pagesProcessed: Math.min(i + batch.length, uniqueUrls.length)
|
|
1679
|
+
});
|
|
1526
1680
|
if (i + concurrency < uniqueUrls.length) {
|
|
1527
1681
|
await this.delay(delayMs);
|
|
1528
1682
|
}
|
|
1529
1683
|
}
|
|
1530
1684
|
if (documents.length > 0) {
|
|
1531
|
-
const ingestResult = await this.ingest(documents,
|
|
1685
|
+
const ingestResult = await this.ingest(documents, {
|
|
1686
|
+
...options,
|
|
1687
|
+
metadata: {
|
|
1688
|
+
...options?.metadata ?? {},
|
|
1689
|
+
onCrawlProgress: config.metadata?.onCrawlProgress
|
|
1690
|
+
}
|
|
1691
|
+
});
|
|
1532
1692
|
indexed = ingestResult.indexed;
|
|
1533
1693
|
if (ingestResult.errors) {
|
|
1534
1694
|
errors.push(...ingestResult.errors);
|
|
@@ -1623,7 +1783,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1623
1783
|
const content = this.extractBestContentText($, config);
|
|
1624
1784
|
const minChars = config.minExtractedContentLength ?? 50;
|
|
1625
1785
|
if (!content || content.length < minChars) return null;
|
|
1626
|
-
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") ||
|
|
1786
|
+
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
|
|
1787
|
+
this.extractHeroImage($, url) || void 0;
|
|
1627
1788
|
let imageUrl;
|
|
1628
1789
|
if (image) {
|
|
1629
1790
|
try {
|
|
@@ -1656,6 +1817,39 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1656
1817
|
}
|
|
1657
1818
|
};
|
|
1658
1819
|
}
|
|
1820
|
+
/**
|
|
1821
|
+
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
1822
|
+
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
1823
|
+
*/
|
|
1824
|
+
extractHeroImage($, pageUrl) {
|
|
1825
|
+
const containers = $('main, article, [role="main"], #content, .content');
|
|
1826
|
+
const scope = containers.length > 0 ? containers : $("body");
|
|
1827
|
+
let best;
|
|
1828
|
+
scope.find("img[src]").each((_, el) => {
|
|
1829
|
+
if (best) return false;
|
|
1830
|
+
const src = $(el).attr("src") || "";
|
|
1831
|
+
const alt = ($(el).attr("alt") || "").toLowerCase();
|
|
1832
|
+
const width = parseInt($(el).attr("width") || "0", 10);
|
|
1833
|
+
const height = parseInt($(el).attr("height") || "0", 10);
|
|
1834
|
+
if (width > 0 && width < 80 || height > 0 && height < 80) return;
|
|
1835
|
+
if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
|
|
1836
|
+
if (src.startsWith("data:") || src.endsWith(".svg")) return;
|
|
1837
|
+
if (src.includes("/_next/image")) {
|
|
1838
|
+
try {
|
|
1839
|
+
const nextUrl = new URL(src, pageUrl);
|
|
1840
|
+
const realUrl = nextUrl.searchParams.get("url");
|
|
1841
|
+
if (realUrl) {
|
|
1842
|
+
best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
|
|
1843
|
+
return false;
|
|
1844
|
+
}
|
|
1845
|
+
} catch {
|
|
1846
|
+
}
|
|
1847
|
+
}
|
|
1848
|
+
best = src;
|
|
1849
|
+
return false;
|
|
1850
|
+
});
|
|
1851
|
+
return best;
|
|
1852
|
+
}
|
|
1659
1853
|
looksLikeDynamicShell(html) {
|
|
1660
1854
|
const lower = html.toLowerCase();
|
|
1661
1855
|
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
|
|
@@ -1885,6 +2079,30 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1885
2079
|
}
|
|
1886
2080
|
return Array.from(found);
|
|
1887
2081
|
}
|
|
2082
|
+
emitBulkProgress(options, update) {
|
|
2083
|
+
const fn = options?.metadata?.onBulkProgress;
|
|
2084
|
+
if (!fn) return;
|
|
2085
|
+
try {
|
|
2086
|
+
fn(update);
|
|
2087
|
+
} catch {
|
|
2088
|
+
}
|
|
2089
|
+
}
|
|
2090
|
+
emitCrawlProgress(config, update) {
|
|
2091
|
+
const fn = config.metadata?.onCrawlProgress;
|
|
2092
|
+
if (!fn) return;
|
|
2093
|
+
try {
|
|
2094
|
+
fn(update);
|
|
2095
|
+
} catch {
|
|
2096
|
+
}
|
|
2097
|
+
}
|
|
2098
|
+
emitCrawlPage(config, event) {
|
|
2099
|
+
const fn = config.metadata?.onCrawlPage;
|
|
2100
|
+
if (!fn) return;
|
|
2101
|
+
try {
|
|
2102
|
+
fn(event);
|
|
2103
|
+
} catch {
|
|
2104
|
+
}
|
|
2105
|
+
}
|
|
1888
2106
|
createDebugCollector(debug) {
|
|
1889
2107
|
const enabled = !!debug?.enabled;
|
|
1890
2108
|
const level = debug?.level || "summary";
|
|
@@ -2173,6 +2391,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2173
2391
|
filterableFields: this.config.filterableFields,
|
|
2174
2392
|
typeBoosts: this.config.typeBoosts,
|
|
2175
2393
|
recencyBoost: this.config.recencyBoost,
|
|
2394
|
+
crawlLedger: this.config.crawlLedger,
|
|
2176
2395
|
priority: this.priority
|
|
2177
2396
|
};
|
|
2178
2397
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@snap-agent/rag-web",
|
|
3
|
-
"version": "0.1.3",
|
|
3
|
+
"version": "0.1.5",
|
|
4
4
|
"description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"module": "./dist/index.mjs",
|