@snap-agent/rag-web 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +40 -18
- package/dist/index.d.ts +40 -18
- package/dist/index.js +333 -133
- package/dist/index.mjs +325 -132
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -690,21 +690,8 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
690
690
|
* Crawl a single page and extract content
|
|
691
691
|
*/
|
|
692
692
|
private crawlPage;
|
|
693
|
-
/**
|
|
694
|
-
* Default chain works for many WordPress / Elementor / block themes where `.first()`
|
|
695
|
-
* would otherwise hit an empty wrapper.
|
|
696
|
-
*/
|
|
697
|
-
private static readonly DEFAULT_CONTENT_SELECTOR;
|
|
698
|
-
private stripNoiseFromDom;
|
|
699
|
-
/** Longest cleaned text among selector matches and full body (after noise strip). */
|
|
700
|
-
private extractBestContentText;
|
|
701
693
|
private bodyTextLengthHint;
|
|
702
694
|
private extractDocumentFromHtml;
|
|
703
|
-
/**
|
|
704
|
-
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
705
|
-
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
706
|
-
*/
|
|
707
|
-
private extractHeroImage;
|
|
708
695
|
private looksLikeDynamicShell;
|
|
709
696
|
private diagFromRenderedAttempt;
|
|
710
697
|
private crawlPageSmart;
|
|
@@ -717,10 +704,6 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
717
704
|
/**
|
|
718
705
|
* Clean extracted text content
|
|
719
706
|
*/
|
|
720
|
-
private cleanContent;
|
|
721
|
-
/**
|
|
722
|
-
* Convert URL to a stable document ID
|
|
723
|
-
*/
|
|
724
707
|
private urlToId;
|
|
725
708
|
/**
|
|
726
709
|
* Delay helper
|
|
@@ -783,4 +766,43 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
783
766
|
getConfig(): Record<string, any>;
|
|
784
767
|
}
|
|
785
768
|
|
|
786
|
-
|
|
769
|
+
interface HtmlPageExtractOptions {
|
|
770
|
+
titleSelector?: string;
|
|
771
|
+
contentSelector?: string;
|
|
772
|
+
removeSelectors?: string[];
|
|
773
|
+
defaultType?: string;
|
|
774
|
+
typeFromUrl?: Record<string, string>;
|
|
775
|
+
minExtractedContentLength?: number;
|
|
776
|
+
metadata?: Record<string, unknown>;
|
|
777
|
+
}
|
|
778
|
+
interface HtmlPageExtractResult {
|
|
779
|
+
id: string;
|
|
780
|
+
metadata: Record<string, unknown>;
|
|
781
|
+
content: string;
|
|
782
|
+
/** True when content meets minExtractedContentLength (default 50). */
|
|
783
|
+
indexable: boolean;
|
|
784
|
+
contentPreview: string;
|
|
785
|
+
}
|
|
786
|
+
declare function urlToDocumentId(url: string): string;
|
|
787
|
+
declare function bodyTextLengthHint(html: string, options?: HtmlPageExtractOptions): number;
|
|
788
|
+
/**
|
|
789
|
+
* Extract full page metadata + main content the same way web-rag does on HTML ingest.
|
|
790
|
+
* Unlike ingest, always returns metadata even when content is too short to index.
|
|
791
|
+
*/
|
|
792
|
+
declare function extractPageFromHtml(url: string, html: string, options?: HtmlPageExtractOptions): HtmlPageExtractResult;
|
|
793
|
+
|
|
794
|
+
interface ProductMetadata {
|
|
795
|
+
price?: number;
|
|
796
|
+
currency?: string;
|
|
797
|
+
availability?: string;
|
|
798
|
+
}
|
|
799
|
+
/**
|
|
800
|
+
* Extract structured product fields from HTML (JSON-LD, Open Graph, microdata).
|
|
801
|
+
* Per-field priority: JSON-LD → Open Graph → microdata.
|
|
802
|
+
*/
|
|
803
|
+
declare function extractProductMetadata(html: string): ProductMetadata;
|
|
804
|
+
declare function parsePrice(value: unknown): number | undefined;
|
|
805
|
+
declare function normalizeCurrency(value: unknown): string | undefined;
|
|
806
|
+
declare function normalizeAvailability(value: unknown): string | undefined;
|
|
807
|
+
|
|
808
|
+
export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, normalizeAvailability, normalizeCurrency, parsePrice, urlToDocumentId };
|
package/dist/index.d.ts
CHANGED
|
@@ -690,21 +690,8 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
690
690
|
* Crawl a single page and extract content
|
|
691
691
|
*/
|
|
692
692
|
private crawlPage;
|
|
693
|
-
/**
|
|
694
|
-
* Default chain works for many WordPress / Elementor / block themes where `.first()`
|
|
695
|
-
* would otherwise hit an empty wrapper.
|
|
696
|
-
*/
|
|
697
|
-
private static readonly DEFAULT_CONTENT_SELECTOR;
|
|
698
|
-
private stripNoiseFromDom;
|
|
699
|
-
/** Longest cleaned text among selector matches and full body (after noise strip). */
|
|
700
|
-
private extractBestContentText;
|
|
701
693
|
private bodyTextLengthHint;
|
|
702
694
|
private extractDocumentFromHtml;
|
|
703
|
-
/**
|
|
704
|
-
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
705
|
-
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
706
|
-
*/
|
|
707
|
-
private extractHeroImage;
|
|
708
695
|
private looksLikeDynamicShell;
|
|
709
696
|
private diagFromRenderedAttempt;
|
|
710
697
|
private crawlPageSmart;
|
|
@@ -717,10 +704,6 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
717
704
|
/**
|
|
718
705
|
* Clean extracted text content
|
|
719
706
|
*/
|
|
720
|
-
private cleanContent;
|
|
721
|
-
/**
|
|
722
|
-
* Convert URL to a stable document ID
|
|
723
|
-
*/
|
|
724
707
|
private urlToId;
|
|
725
708
|
/**
|
|
726
709
|
* Delay helper
|
|
@@ -783,4 +766,43 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
783
766
|
getConfig(): Record<string, any>;
|
|
784
767
|
}
|
|
785
768
|
|
|
786
|
-
|
|
769
|
+
interface HtmlPageExtractOptions {
|
|
770
|
+
titleSelector?: string;
|
|
771
|
+
contentSelector?: string;
|
|
772
|
+
removeSelectors?: string[];
|
|
773
|
+
defaultType?: string;
|
|
774
|
+
typeFromUrl?: Record<string, string>;
|
|
775
|
+
minExtractedContentLength?: number;
|
|
776
|
+
metadata?: Record<string, unknown>;
|
|
777
|
+
}
|
|
778
|
+
interface HtmlPageExtractResult {
|
|
779
|
+
id: string;
|
|
780
|
+
metadata: Record<string, unknown>;
|
|
781
|
+
content: string;
|
|
782
|
+
/** True when content meets minExtractedContentLength (default 50). */
|
|
783
|
+
indexable: boolean;
|
|
784
|
+
contentPreview: string;
|
|
785
|
+
}
|
|
786
|
+
declare function urlToDocumentId(url: string): string;
|
|
787
|
+
declare function bodyTextLengthHint(html: string, options?: HtmlPageExtractOptions): number;
|
|
788
|
+
/**
|
|
789
|
+
* Extract full page metadata + main content the same way web-rag does on HTML ingest.
|
|
790
|
+
* Unlike ingest, always returns metadata even when content is too short to index.
|
|
791
|
+
*/
|
|
792
|
+
declare function extractPageFromHtml(url: string, html: string, options?: HtmlPageExtractOptions): HtmlPageExtractResult;
|
|
793
|
+
|
|
794
|
+
interface ProductMetadata {
|
|
795
|
+
price?: number;
|
|
796
|
+
currency?: string;
|
|
797
|
+
availability?: string;
|
|
798
|
+
}
|
|
799
|
+
/**
|
|
800
|
+
* Extract structured product fields from HTML (JSON-LD, Open Graph, microdata).
|
|
801
|
+
* Per-field priority: JSON-LD → Open Graph → microdata.
|
|
802
|
+
*/
|
|
803
|
+
declare function extractProductMetadata(html: string): ProductMetadata;
|
|
804
|
+
declare function parsePrice(value: unknown): number | undefined;
|
|
805
|
+
declare function normalizeCurrency(value: unknown): string | undefined;
|
|
806
|
+
declare function normalizeAvailability(value: unknown): string | undefined;
|
|
807
|
+
|
|
808
|
+
export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, normalizeAvailability, normalizeCurrency, parsePrice, urlToDocumentId };
|
package/dist/index.js
CHANGED
|
@@ -30,16 +30,319 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
30
30
|
// src/index.ts
|
|
31
31
|
var index_exports = {};
|
|
32
32
|
__export(index_exports, {
|
|
33
|
-
WebRAGPlugin: () => WebRAGPlugin
|
|
33
|
+
WebRAGPlugin: () => WebRAGPlugin,
|
|
34
|
+
bodyTextLengthHint: () => bodyTextLengthHint,
|
|
35
|
+
extractPageFromHtml: () => extractPageFromHtml,
|
|
36
|
+
extractProductMetadata: () => extractProductMetadata,
|
|
37
|
+
normalizeAvailability: () => normalizeAvailability,
|
|
38
|
+
normalizeCurrency: () => normalizeCurrency,
|
|
39
|
+
parsePrice: () => parsePrice,
|
|
40
|
+
urlToDocumentId: () => urlToDocumentId
|
|
34
41
|
});
|
|
35
42
|
module.exports = __toCommonJS(index_exports);
|
|
36
43
|
|
|
37
44
|
// src/WebRAGPlugin.ts
|
|
38
45
|
var import_mongodb = require("mongodb");
|
|
39
46
|
var import_openai = __toESM(require("openai"));
|
|
40
|
-
var cheerio = __toESM(require("cheerio"));
|
|
47
|
+
var cheerio3 = __toESM(require("cheerio"));
|
|
41
48
|
var fs = __toESM(require("fs"));
|
|
42
49
|
var path = __toESM(require("path"));
|
|
50
|
+
|
|
51
|
+
// src/htmlPageExtract.ts
|
|
52
|
+
var cheerio2 = __toESM(require("cheerio"));
|
|
53
|
+
|
|
54
|
+
// src/productMetadata.ts
|
|
55
|
+
var cheerio = __toESM(require("cheerio"));
|
|
56
|
+
function extractProductMetadata(html) {
|
|
57
|
+
const $ = cheerio.load(html);
|
|
58
|
+
const fromJsonLd = extractFromJsonLd($);
|
|
59
|
+
const fromOg = extractFromOpenGraph($);
|
|
60
|
+
const fromMicrodata = extractFromMicrodata($);
|
|
61
|
+
const result = {};
|
|
62
|
+
const price = fromJsonLd.price ?? fromOg.price ?? fromMicrodata.price;
|
|
63
|
+
if (price != null) result.price = price;
|
|
64
|
+
const currency = fromJsonLd.currency ?? fromOg.currency ?? fromMicrodata.currency;
|
|
65
|
+
if (currency) result.currency = currency;
|
|
66
|
+
const availability = fromJsonLd.availability ?? fromOg.availability ?? fromMicrodata.availability;
|
|
67
|
+
if (availability) result.availability = availability;
|
|
68
|
+
return result;
|
|
69
|
+
}
|
|
70
|
+
function extractFromJsonLd($) {
|
|
71
|
+
const result = {};
|
|
72
|
+
$('script[type="application/ld+json"]').each((_, el) => {
|
|
73
|
+
if (result.price != null && result.currency && result.availability) return false;
|
|
74
|
+
const raw = $(el).html()?.trim();
|
|
75
|
+
if (!raw) return;
|
|
76
|
+
let parsed;
|
|
77
|
+
try {
|
|
78
|
+
parsed = JSON.parse(raw);
|
|
79
|
+
} catch {
|
|
80
|
+
return;
|
|
81
|
+
}
|
|
82
|
+
for (const node of collectJsonLdNodes(parsed)) {
|
|
83
|
+
if (!isProductType(node)) continue;
|
|
84
|
+
const offer = pickOffer(node);
|
|
85
|
+
if (!offer) continue;
|
|
86
|
+
if (result.price == null) {
|
|
87
|
+
const price = parsePrice(offer.price ?? offer.lowPrice ?? offer.highPrice);
|
|
88
|
+
if (price != null) result.price = price;
|
|
89
|
+
}
|
|
90
|
+
if (!result.currency) {
|
|
91
|
+
const currency = normalizeCurrency(offer.priceCurrency);
|
|
92
|
+
if (currency) result.currency = currency;
|
|
93
|
+
}
|
|
94
|
+
if (!result.availability) {
|
|
95
|
+
const availability = normalizeAvailability(offer.availability);
|
|
96
|
+
if (availability) result.availability = availability;
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
});
|
|
100
|
+
return result;
|
|
101
|
+
}
|
|
102
|
+
function extractFromOpenGraph($) {
|
|
103
|
+
const result = {};
|
|
104
|
+
const priceRaw = $('meta[property="product:price:amount"]').attr("content") || $('meta[property="og:price:amount"]').attr("content");
|
|
105
|
+
const price = parsePrice(priceRaw);
|
|
106
|
+
if (price != null) result.price = price;
|
|
107
|
+
const currency = normalizeCurrency(
|
|
108
|
+
$('meta[property="product:price:currency"]').attr("content") || $('meta[property="og:price:currency"]').attr("content")
|
|
109
|
+
);
|
|
110
|
+
if (currency) result.currency = currency;
|
|
111
|
+
const availability = normalizeAvailability(
|
|
112
|
+
$('meta[property="product:availability"]').attr("content") || $('meta[property="og:availability"]').attr("content")
|
|
113
|
+
);
|
|
114
|
+
if (availability) result.availability = availability;
|
|
115
|
+
return result;
|
|
116
|
+
}
|
|
117
|
+
function microdataField($, itemprop) {
|
|
118
|
+
const scope = $('[itemtype*="schema.org/Product"], [itemtype*="schema.org/product"]').first();
|
|
119
|
+
return scope.length > 0 ? scope.find(`[itemprop="${itemprop}"]`).first() : $(`[itemprop="${itemprop}"]`).first();
|
|
120
|
+
}
|
|
121
|
+
function extractFromMicrodata($) {
|
|
122
|
+
const result = {};
|
|
123
|
+
const priceEl = microdataField($, "price");
|
|
124
|
+
const price = parsePrice(priceEl.attr("content") || priceEl.text());
|
|
125
|
+
if (price != null) result.price = price;
|
|
126
|
+
const currencyEl = microdataField($, "priceCurrency");
|
|
127
|
+
const currency = normalizeCurrency(currencyEl.attr("content") || currencyEl.text());
|
|
128
|
+
if (currency) result.currency = currency;
|
|
129
|
+
const availabilityEl = microdataField($, "availability");
|
|
130
|
+
const availability = normalizeAvailability(
|
|
131
|
+
availabilityEl.attr("content") || availabilityEl.attr("href") || availabilityEl.text()
|
|
132
|
+
);
|
|
133
|
+
if (availability) result.availability = availability;
|
|
134
|
+
return result;
|
|
135
|
+
}
|
|
136
|
+
function collectJsonLdNodes(data) {
|
|
137
|
+
const nodes = [];
|
|
138
|
+
const visit = (value) => {
|
|
139
|
+
if (value == null) return;
|
|
140
|
+
if (Array.isArray(value)) {
|
|
141
|
+
value.forEach(visit);
|
|
142
|
+
return;
|
|
143
|
+
}
|
|
144
|
+
if (typeof value !== "object") return;
|
|
145
|
+
const obj = value;
|
|
146
|
+
nodes.push(obj);
|
|
147
|
+
if (obj["@graph"]) visit(obj["@graph"]);
|
|
148
|
+
};
|
|
149
|
+
visit(data);
|
|
150
|
+
return nodes;
|
|
151
|
+
}
|
|
152
|
+
function isProductType(node) {
|
|
153
|
+
const type = node["@type"];
|
|
154
|
+
const types = Array.isArray(type) ? type : type != null ? [type] : [];
|
|
155
|
+
return types.some((t) => {
|
|
156
|
+
const s = String(t).toLowerCase();
|
|
157
|
+
return s === "product" || s.endsWith("/product");
|
|
158
|
+
});
|
|
159
|
+
}
|
|
160
|
+
function pickOffer(product) {
|
|
161
|
+
const offers = product.offers;
|
|
162
|
+
if (offers == null) return null;
|
|
163
|
+
if (Array.isArray(offers)) {
|
|
164
|
+
const first = offers.find((o) => o && typeof o === "object");
|
|
165
|
+
return first ?? null;
|
|
166
|
+
}
|
|
167
|
+
if (typeof offers === "object") return offers;
|
|
168
|
+
return null;
|
|
169
|
+
}
|
|
170
|
+
function parsePrice(value) {
|
|
171
|
+
if (value == null || value === "") return void 0;
|
|
172
|
+
if (typeof value === "number" && Number.isFinite(value)) return value;
|
|
173
|
+
let s = String(value).trim();
|
|
174
|
+
if (!s) return void 0;
|
|
175
|
+
s = s.replace(/[^\d.,\-]/g, "");
|
|
176
|
+
if (!s || s === "-" || s === ".") return void 0;
|
|
177
|
+
const lastComma = s.lastIndexOf(",");
|
|
178
|
+
const lastDot = s.lastIndexOf(".");
|
|
179
|
+
if (lastComma > -1 && lastDot > -1) {
|
|
180
|
+
if (lastComma > lastDot) {
|
|
181
|
+
s = s.replace(/\./g, "").replace(",", ".");
|
|
182
|
+
} else {
|
|
183
|
+
s = s.replace(/,/g, "");
|
|
184
|
+
}
|
|
185
|
+
} else if (lastComma > -1) {
|
|
186
|
+
const parts = s.split(",");
|
|
187
|
+
if (parts.length === 2 && parts[1].length <= 2) {
|
|
188
|
+
s = parts[0].replace(/\./g, "") + "." + parts[1];
|
|
189
|
+
} else {
|
|
190
|
+
s = s.replace(/,/g, "");
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
const num = parseFloat(s);
|
|
194
|
+
return Number.isFinite(num) ? num : void 0;
|
|
195
|
+
}
|
|
196
|
+
function normalizeCurrency(value) {
|
|
197
|
+
if (value == null) return void 0;
|
|
198
|
+
const s = String(value).trim().toUpperCase();
|
|
199
|
+
if (!s) return void 0;
|
|
200
|
+
const iso = s.match(/[A-Z]{3}/);
|
|
201
|
+
return iso ? iso[0] : s.length <= 4 ? s : void 0;
|
|
202
|
+
}
|
|
203
|
+
function normalizeAvailability(value) {
|
|
204
|
+
if (value == null) return void 0;
|
|
205
|
+
let s = String(value).trim();
|
|
206
|
+
if (!s) return void 0;
|
|
207
|
+
if (s.includes("schema.org/")) {
|
|
208
|
+
const parts = s.split("/");
|
|
209
|
+
s = parts[parts.length - 1] || s;
|
|
210
|
+
}
|
|
211
|
+
s = s.replace(/^https?:\/\/[^/]+\//, "");
|
|
212
|
+
if (s.includes("/")) {
|
|
213
|
+
const parts = s.split("/");
|
|
214
|
+
s = parts[parts.length - 1] || s;
|
|
215
|
+
}
|
|
216
|
+
return s.replace(/\s+/g, "") || void 0;
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
// src/htmlPageExtract.ts
|
|
220
|
+
var DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
|
|
221
|
+
var DEFAULT_REMOVE_SELECTORS = [
|
|
222
|
+
"script",
|
|
223
|
+
"style",
|
|
224
|
+
"nav",
|
|
225
|
+
"header",
|
|
226
|
+
"footer",
|
|
227
|
+
".sidebar",
|
|
228
|
+
".navigation",
|
|
229
|
+
".menu",
|
|
230
|
+
".comments",
|
|
231
|
+
'[role="navigation"]',
|
|
232
|
+
'[role="banner"]'
|
|
233
|
+
];
|
|
234
|
+
function urlToDocumentId(url) {
|
|
235
|
+
return url.replace(/^https?:\/\//, "").replace(/[^a-zA-Z0-9]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "").substring(0, 100);
|
|
236
|
+
}
|
|
237
|
+
function cleanContent(text) {
|
|
238
|
+
return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
|
|
239
|
+
}
|
|
240
|
+
function bodyTextLengthHint(html, options = {}) {
|
|
241
|
+
const $ = cheerio2.load(html);
|
|
242
|
+
stripNoiseFromDom($, options);
|
|
243
|
+
return cleanContent($("body").text().trim()).length;
|
|
244
|
+
}
|
|
245
|
+
function extractPageFromHtml(url, html, options = {}) {
|
|
246
|
+
const $ = cheerio2.load(html);
|
|
247
|
+
stripNoiseFromDom($, options);
|
|
248
|
+
const titleSelector = options.titleSelector || "h1, title";
|
|
249
|
+
let title = $(titleSelector).first().text().trim();
|
|
250
|
+
if (!title) {
|
|
251
|
+
title = $("title").text().trim();
|
|
252
|
+
}
|
|
253
|
+
const content = extractBestContentText($, options);
|
|
254
|
+
const minChars = options.minExtractedContentLength ?? 50;
|
|
255
|
+
const indexable = Boolean(content && content.length >= minChars);
|
|
256
|
+
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || extractHeroImage($, url) || void 0;
|
|
257
|
+
let imageUrl;
|
|
258
|
+
if (image) {
|
|
259
|
+
try {
|
|
260
|
+
imageUrl = new URL(image, url).href;
|
|
261
|
+
} catch {
|
|
262
|
+
imageUrl = image;
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
|
|
266
|
+
let type = options.defaultType || "page";
|
|
267
|
+
if (options.typeFromUrl) {
|
|
268
|
+
for (const [pattern, typeName] of Object.entries(options.typeFromUrl)) {
|
|
269
|
+
if (url.includes(pattern)) {
|
|
270
|
+
type = typeName;
|
|
271
|
+
break;
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
const productMeta = extractProductMetadata(html);
|
|
276
|
+
const metadata = {
|
|
277
|
+
type,
|
|
278
|
+
...title ? { title } : {},
|
|
279
|
+
url,
|
|
280
|
+
...imageUrl ? { imageUrl } : {},
|
|
281
|
+
...description ? { description } : {},
|
|
282
|
+
...productMeta.price != null ? { price: productMeta.price } : {},
|
|
283
|
+
...productMeta.currency ? { currency: productMeta.currency } : {},
|
|
284
|
+
...productMeta.availability ? { availability: productMeta.availability } : {},
|
|
285
|
+
...options.metadata
|
|
286
|
+
};
|
|
287
|
+
const previewLen = 400;
|
|
288
|
+
const contentPreview = content.length > previewLen ? `${content.slice(0, previewLen)}\u2026` : content;
|
|
289
|
+
return {
|
|
290
|
+
id: urlToDocumentId(url),
|
|
291
|
+
metadata,
|
|
292
|
+
content,
|
|
293
|
+
indexable,
|
|
294
|
+
contentPreview
|
|
295
|
+
};
|
|
296
|
+
}
|
|
297
|
+
function stripNoiseFromDom($, options) {
|
|
298
|
+
const removeSelectors = options.removeSelectors ?? DEFAULT_REMOVE_SELECTORS;
|
|
299
|
+
removeSelectors.forEach((selector) => $(selector).remove());
|
|
300
|
+
}
|
|
301
|
+
function extractBestContentText($, options) {
|
|
302
|
+
const contentSelector = options.contentSelector || DEFAULT_CONTENT_SELECTOR;
|
|
303
|
+
const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
|
|
304
|
+
let best = "";
|
|
305
|
+
for (const sel of selectors) {
|
|
306
|
+
$(sel).each((_, el) => {
|
|
307
|
+
const t = cleanContent($(el).text().trim());
|
|
308
|
+
if (t.length > best.length) best = t;
|
|
309
|
+
});
|
|
310
|
+
}
|
|
311
|
+
const bodyText = cleanContent($("body").text().trim());
|
|
312
|
+
if (bodyText.length > best.length) best = bodyText;
|
|
313
|
+
return best;
|
|
314
|
+
}
|
|
315
|
+
function extractHeroImage($, pageUrl) {
|
|
316
|
+
const containers = $('main, article, [role="main"], #content, .content');
|
|
317
|
+
const scope = containers.length > 0 ? containers : $("body");
|
|
318
|
+
let best;
|
|
319
|
+
scope.find("img[src]").each((_, el) => {
|
|
320
|
+
if (best) return false;
|
|
321
|
+
const src = $(el).attr("src") || "";
|
|
322
|
+
const alt = ($(el).attr("alt") || "").toLowerCase();
|
|
323
|
+
const width = parseInt($(el).attr("width") || "0", 10);
|
|
324
|
+
const height = parseInt($(el).attr("height") || "0", 10);
|
|
325
|
+
if (width > 0 && width < 80 || height > 0 && height < 80) return;
|
|
326
|
+
if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
|
|
327
|
+
if (src.startsWith("data:") || src.endsWith(".svg")) return;
|
|
328
|
+
if (src.includes("/_next/image")) {
|
|
329
|
+
try {
|
|
330
|
+
const nextUrl = new URL(src, pageUrl);
|
|
331
|
+
const realUrl = nextUrl.searchParams.get("url");
|
|
332
|
+
if (realUrl) {
|
|
333
|
+
best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
|
|
334
|
+
return false;
|
|
335
|
+
}
|
|
336
|
+
} catch {
|
|
337
|
+
}
|
|
338
|
+
}
|
|
339
|
+
best = src;
|
|
340
|
+
return false;
|
|
341
|
+
});
|
|
342
|
+
return best;
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
// src/WebRAGPlugin.ts
|
|
43
346
|
function bulkOpCurrentUrl(op) {
|
|
44
347
|
const meta = op.document?.metadata;
|
|
45
348
|
if (typeof meta?.url === "string" && meta.url.trim()) return meta.url.trim();
|
|
@@ -58,7 +361,7 @@ function isUrlListingInsert(document) {
|
|
|
58
361
|
return false;
|
|
59
362
|
}
|
|
60
363
|
}
|
|
61
|
-
var WebRAGPlugin = class _WebRAGPlugin {
|
|
364
|
+
var WebRAGPlugin = class {
|
|
62
365
|
name = "web-rag";
|
|
63
366
|
type = "rag";
|
|
64
367
|
priority;
|
|
@@ -288,6 +591,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
288
591
|
url: doc.metadata.url,
|
|
289
592
|
imageUrl: doc.metadata.imageUrl,
|
|
290
593
|
description: doc.metadata.description,
|
|
594
|
+
...doc.metadata.price != null ? { price: doc.metadata.price } : {},
|
|
595
|
+
...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
|
|
596
|
+
...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
|
|
291
597
|
score: doc.score
|
|
292
598
|
}))
|
|
293
599
|
}
|
|
@@ -1458,7 +1764,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1458
1764
|
return await response.text();
|
|
1459
1765
|
}
|
|
1460
1766
|
extractInternalLinks(html, base, stripQueryParams) {
|
|
1461
|
-
const $ = cheerio.load(html);
|
|
1767
|
+
const $ = cheerio3.load(html);
|
|
1462
1768
|
const links = /* @__PURE__ */ new Set();
|
|
1463
1769
|
$("a[href]").each((_, el) => {
|
|
1464
1770
|
const href = ($(el).attr("href") || "").trim();
|
|
@@ -1624,7 +1930,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1624
1930
|
}
|
|
1625
1931
|
}
|
|
1626
1932
|
try {
|
|
1627
|
-
const { doc, diag, bodyTextLengthHint } = await this.crawlPageSmart(url, config, timeout, {
|
|
1933
|
+
const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
|
|
1628
1934
|
renderMode,
|
|
1629
1935
|
renderOptions,
|
|
1630
1936
|
minContentLength,
|
|
@@ -1655,7 +1961,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1655
1961
|
status: crawlSt,
|
|
1656
1962
|
modeUsed: diag?.modeUsed,
|
|
1657
1963
|
contentLength: doc?.content?.length,
|
|
1658
|
-
bodyTextLengthHint,
|
|
1964
|
+
bodyTextLengthHint: bodyTextLengthHint2,
|
|
1659
1965
|
title: doc?.metadata?.title,
|
|
1660
1966
|
docId: doc?.id,
|
|
1661
1967
|
error: diag?.errorMessage
|
|
@@ -1767,125 +2073,18 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1767
2073
|
const html = await response.text();
|
|
1768
2074
|
return this.extractDocumentFromHtml(url, html, config);
|
|
1769
2075
|
}
|
|
1770
|
-
/**
|
|
1771
|
-
* Default chain works for many WordPress / Elementor / block themes where `.first()`
|
|
1772
|
-
* would otherwise hit an empty wrapper.
|
|
1773
|
-
*/
|
|
1774
|
-
static DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
|
|
1775
|
-
stripNoiseFromDom($, config) {
|
|
1776
|
-
const removeSelectors = config.removeSelectors || [
|
|
1777
|
-
"script",
|
|
1778
|
-
"style",
|
|
1779
|
-
"nav",
|
|
1780
|
-
"header",
|
|
1781
|
-
"footer",
|
|
1782
|
-
".sidebar",
|
|
1783
|
-
".navigation",
|
|
1784
|
-
".menu",
|
|
1785
|
-
".comments",
|
|
1786
|
-
'[role="navigation"]',
|
|
1787
|
-
'[role="banner"]'
|
|
1788
|
-
];
|
|
1789
|
-
removeSelectors.forEach((selector) => $(selector).remove());
|
|
1790
|
-
}
|
|
1791
|
-
/** Longest cleaned text among selector matches and full body (after noise strip). */
|
|
1792
|
-
extractBestContentText($, config) {
|
|
1793
|
-
const contentSelector = config.contentSelector || _WebRAGPlugin.DEFAULT_CONTENT_SELECTOR;
|
|
1794
|
-
const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
|
|
1795
|
-
let best = "";
|
|
1796
|
-
for (const sel of selectors) {
|
|
1797
|
-
$(sel).each((_, el) => {
|
|
1798
|
-
const t = this.cleanContent($(el).text().trim());
|
|
1799
|
-
if (t.length > best.length) best = t;
|
|
1800
|
-
});
|
|
1801
|
-
}
|
|
1802
|
-
const bodyText = this.cleanContent($("body").text().trim());
|
|
1803
|
-
if (bodyText.length > best.length) best = bodyText;
|
|
1804
|
-
return best;
|
|
1805
|
-
}
|
|
1806
2076
|
bodyTextLengthHint(html, config) {
|
|
1807
|
-
const $ = cheerio.load(html);
|
1808
|
-
this.stripNoiseFromDom($, config);
|
|
1809
|
-
return this.cleanContent($("body").text().trim()).length;
|
|
2077
|
+
return bodyTextLengthHint(html, config);
|
|
1810
2078
|
}
|
|
1811
2079
|
extractDocumentFromHtml(url, html, config) {
|
|
1812
|
-
const $ = cheerio.load(html);
|
|
1813
|
-
this.stripNoiseFromDom($, config);
|
1814
|
-
const titleSelector = config.titleSelector || "h1, title";
|
|
1815
|
-
let title = $(titleSelector).first().text().trim();
|
|
1816
|
-
if (!title) {
|
|
1817
|
-
title = $("title").text().trim();
|
|
1818
|
-
}
|
|
1819
|
-
const content = this.extractBestContentText($, config);
|
|
1820
|
-
const minChars = config.minExtractedContentLength ?? 50;
|
|
1821
|
-
if (!content || content.length < minChars) return null;
|
|
1822
|
-
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
|
|
1823
|
-
this.extractHeroImage($, url) || void 0;
|
|
1824
|
-
let imageUrl;
|
|
1825
|
-
if (image) {
|
|
1826
|
-
try {
|
|
1827
|
-
imageUrl = new URL(image, url).href;
|
|
1828
|
-
} catch {
|
|
1829
|
-
imageUrl = image;
|
|
1830
|
-
}
|
|
1831
|
-
}
|
|
1832
|
-
const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
|
|
1833
|
-
let type = config.defaultType || "page";
|
|
1834
|
-
if (config.typeFromUrl) {
|
|
1835
|
-
for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
|
|
1836
|
-
if (url.includes(pattern)) {
|
|
1837
|
-
type = typeName;
|
|
1838
|
-
break;
|
|
1839
|
-
}
|
|
1840
|
-
}
|
|
1841
|
-
}
|
|
1842
|
-
const id = this.urlToId(url);
|
|
2080
|
+
const extracted = extractPageFromHtml(url, html, config);
|
|
2081
|
+
if (!extracted.indexable) return null;
|
|
1843
2082
|
return {
|
|
1844
|
-
id,
|
|
1845
|
-
content,
|
|
1846
|
-
metadata: {
|
|
1847
|
-
type,
|
|
1848
|
-
title,
|
|
1849
|
-
url,
|
|
1850
|
-
...imageUrl ? { imageUrl } : {},
|
|
1851
|
-
...description ? { description } : {},
|
|
1852
|
-
...config.metadata
|
|
1853
|
-
}
|
|
2083
|
+
id: extracted.id,
|
|
2084
|
+
content: extracted.content,
|
|
2085
|
+
metadata: extracted.metadata
|
|
1854
2086
|
};
|
|
1855
2087
|
}
|
|
1856
|
-
/**
|
|
1857
|
-
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
1858
|
-
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
1859
|
-
*/
|
|
1860
|
-
extractHeroImage($, pageUrl) {
|
|
1861
|
-
const containers = $('main, article, [role="main"], #content, .content');
|
|
1862
|
-
const scope = containers.length > 0 ? containers : $("body");
|
|
1863
|
-
let best;
|
|
1864
|
-
scope.find("img[src]").each((_, el) => {
|
|
1865
|
-
if (best) return false;
|
|
1866
|
-
const src = $(el).attr("src") || "";
|
|
1867
|
-
const alt = ($(el).attr("alt") || "").toLowerCase();
|
|
1868
|
-
const width = parseInt($(el).attr("width") || "0", 10);
|
|
1869
|
-
const height = parseInt($(el).attr("height") || "0", 10);
|
|
1870
|
-
if (width > 0 && width < 80 || height > 0 && height < 80) return;
|
|
1871
|
-
if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
|
|
1872
|
-
if (src.startsWith("data:") || src.endsWith(".svg")) return;
|
|
1873
|
-
if (src.includes("/_next/image")) {
|
|
1874
|
-
try {
|
|
1875
|
-
const nextUrl = new URL(src, pageUrl);
|
|
1876
|
-
const realUrl = nextUrl.searchParams.get("url");
|
|
1877
|
-
if (realUrl) {
|
|
1878
|
-
best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
|
|
1879
|
-
return false;
|
|
1880
|
-
}
|
|
1881
|
-
} catch {
|
|
1882
|
-
}
|
|
1883
|
-
}
|
|
1884
|
-
best = src;
|
|
1885
|
-
return false;
|
|
1886
|
-
});
|
|
1887
|
-
return best;
|
|
1888
|
-
}
|
|
1889
2088
|
looksLikeDynamicShell(html) {
|
|
1890
2089
|
const lower = html.toLowerCase();
|
|
1891
2090
|
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
|
|
@@ -1903,7 +2102,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1903
2102
|
const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
|
|
1904
2103
|
return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
|
|
1905
2104
|
}
|
|
1906
|
-
diagFromRenderedAttempt(doc,
|
|
2105
|
+
diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
|
|
1907
2106
|
if (blockedSuspected) {
|
|
1908
2107
|
return {
|
|
1909
2108
|
doc: null,
|
|
@@ -1919,12 +2118,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1919
2118
|
return {
|
|
1920
2119
|
doc,
|
|
1921
2120
|
diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
|
|
1922
|
-
bodyTextLengthHint: doc ? void 0 :
|
|
2121
|
+
bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
|
|
1923
2122
|
};
|
|
1924
2123
|
}
|
|
1925
2124
|
async crawlPageSmart(url, config, timeout, ctx) {
|
|
1926
2125
|
if (ctx.renderMode === true) {
|
|
1927
|
-
const { doc, bodyTextLengthHint, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
2126
|
+
const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
1928
2127
|
url,
|
|
1929
2128
|
config,
|
|
1930
2129
|
timeout,
|
|
@@ -1933,7 +2132,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1933
2132
|
);
|
|
1934
2133
|
return this.diagFromRenderedAttempt(
|
|
1935
2134
|
doc,
|
|
1936
|
-
|
|
2135
|
+
bodyTextLengthHint2,
|
|
1937
2136
|
renderFailure,
|
|
1938
2137
|
blockedSuspected,
|
|
1939
2138
|
"render_ok",
|
|
@@ -2050,7 +2249,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2050
2249
|
}
|
|
2051
2250
|
}
|
|
2052
2251
|
const html = await page.content();
|
|
2053
|
-
const
|
|
2252
|
+
const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
|
|
2054
2253
|
const doc = this.extractDocumentFromHtml(url, html, config);
|
|
2055
2254
|
if (config.debug?.saveDir && config.debug?.enabled) {
|
|
2056
2255
|
try {
|
|
@@ -2065,7 +2264,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2065
2264
|
dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
|
|
2066
2265
|
}
|
|
2067
2266
|
}
|
|
2068
|
-
return { doc, bodyTextLengthHint };
|
|
2267
|
+
return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
|
|
2069
2268
|
} catch (e) {
|
|
2070
2269
|
const msg = String(e?.message || e || "render_failed");
|
|
2071
2270
|
const lower = msg.toLowerCase();
|
|
@@ -2157,14 +2356,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2157
2356
|
/**
|
|
2158
2357
|
* Clean extracted text content
|
|
2159
2358
|
*/
|
|
2160
|
-
cleanContent(text) {
|
|
2161
|
-
return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
|
|
2162
|
-
}
|
|
2163
|
-
/**
|
|
2164
|
-
* Convert URL to a stable document ID
|
|
2165
|
-
*/
|
|
2166
2359
|
urlToId(url) {
|
|
2167
|
-
return url
|
|
2360
|
+
return urlToDocumentId(url);
|
|
2168
2361
|
}
|
|
2169
2362
|
/**
|
|
2170
2363
|
* Delay helper
|
|
@@ -2434,5 +2627,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2434
2627
|
};
|
|
2435
2628
|
// Annotate the CommonJS export names for ESM import in node:
|
|
2436
2629
|
0 && (module.exports = {
|
|
2437
|
-
WebRAGPlugin
|
|
2630
|
+
WebRAGPlugin,
|
|
2631
|
+
bodyTextLengthHint,
|
|
2632
|
+
extractPageFromHtml,
|
|
2633
|
+
extractProductMetadata,
|
|
2634
|
+
normalizeAvailability,
|
|
2635
|
+
normalizeCurrency,
|
|
2636
|
+
parsePrice,
|
|
2637
|
+
urlToDocumentId
|
|
2438
2638
|
});
|
package/dist/index.mjs
CHANGED
|
@@ -1,9 +1,305 @@
|
|
|
1
1
|
// src/WebRAGPlugin.ts
|
|
2
2
|
import { MongoClient } from "mongodb";
|
|
3
3
|
import OpenAI from "openai";
|
|
4
|
-
import * as
|
|
4
|
+
import * as cheerio3 from "cheerio";
|
|
5
5
|
import * as fs from "fs";
|
|
6
6
|
import * as path from "path";
|
|
7
|
+
|
|
8
|
+
// src/htmlPageExtract.ts
|
|
9
|
+
import * as cheerio2 from "cheerio";
|
|
10
|
+
|
|
11
|
+
// src/productMetadata.ts
|
|
12
|
+
import * as cheerio from "cheerio";
|
|
13
|
+
function extractProductMetadata(html) {
|
|
14
|
+
const $ = cheerio.load(html);
|
|
15
|
+
const fromJsonLd = extractFromJsonLd($);
|
|
16
|
+
const fromOg = extractFromOpenGraph($);
|
|
17
|
+
const fromMicrodata = extractFromMicrodata($);
|
|
18
|
+
const result = {};
|
|
19
|
+
const price = fromJsonLd.price ?? fromOg.price ?? fromMicrodata.price;
|
|
20
|
+
if (price != null) result.price = price;
|
|
21
|
+
const currency = fromJsonLd.currency ?? fromOg.currency ?? fromMicrodata.currency;
|
|
22
|
+
if (currency) result.currency = currency;
|
|
23
|
+
const availability = fromJsonLd.availability ?? fromOg.availability ?? fromMicrodata.availability;
|
|
24
|
+
if (availability) result.availability = availability;
|
|
25
|
+
return result;
|
|
26
|
+
}
|
|
27
|
+
function extractFromJsonLd($) {
|
|
28
|
+
const result = {};
|
|
29
|
+
$('script[type="application/ld+json"]').each((_, el) => {
|
|
30
|
+
if (result.price != null && result.currency && result.availability) return false;
|
|
31
|
+
const raw = $(el).html()?.trim();
|
|
32
|
+
if (!raw) return;
|
|
33
|
+
let parsed;
|
|
34
|
+
try {
|
|
35
|
+
parsed = JSON.parse(raw);
|
|
36
|
+
} catch {
|
|
37
|
+
return;
|
|
38
|
+
}
|
|
39
|
+
for (const node of collectJsonLdNodes(parsed)) {
|
|
40
|
+
if (!isProductType(node)) continue;
|
|
41
|
+
const offer = pickOffer(node);
|
|
42
|
+
if (!offer) continue;
|
|
43
|
+
if (result.price == null) {
|
|
44
|
+
const price = parsePrice(offer.price ?? offer.lowPrice ?? offer.highPrice);
|
|
45
|
+
if (price != null) result.price = price;
|
|
46
|
+
}
|
|
47
|
+
if (!result.currency) {
|
|
48
|
+
const currency = normalizeCurrency(offer.priceCurrency);
|
|
49
|
+
if (currency) result.currency = currency;
|
|
50
|
+
}
|
|
51
|
+
if (!result.availability) {
|
|
52
|
+
const availability = normalizeAvailability(offer.availability);
|
|
53
|
+
if (availability) result.availability = availability;
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
});
|
|
57
|
+
return result;
|
|
58
|
+
}
|
|
59
|
+
function extractFromOpenGraph($) {
|
|
60
|
+
const result = {};
|
|
61
|
+
const priceRaw = $('meta[property="product:price:amount"]').attr("content") || $('meta[property="og:price:amount"]').attr("content");
|
|
62
|
+
const price = parsePrice(priceRaw);
|
|
63
|
+
if (price != null) result.price = price;
|
|
64
|
+
const currency = normalizeCurrency(
|
|
65
|
+
$('meta[property="product:price:currency"]').attr("content") || $('meta[property="og:price:currency"]').attr("content")
|
|
66
|
+
);
|
|
67
|
+
if (currency) result.currency = currency;
|
|
68
|
+
const availability = normalizeAvailability(
|
|
69
|
+
$('meta[property="product:availability"]').attr("content") || $('meta[property="og:availability"]').attr("content")
|
|
70
|
+
);
|
|
71
|
+
if (availability) result.availability = availability;
|
|
72
|
+
return result;
|
|
73
|
+
}
|
|
74
|
+
function microdataField($, itemprop) {
|
|
75
|
+
const scope = $('[itemtype*="schema.org/Product"], [itemtype*="schema.org/product"]').first();
|
|
76
|
+
return scope.length > 0 ? scope.find(`[itemprop="${itemprop}"]`).first() : $(`[itemprop="${itemprop}"]`).first();
|
|
77
|
+
}
|
|
78
|
+
function extractFromMicrodata($) {
|
|
79
|
+
const result = {};
|
|
80
|
+
const priceEl = microdataField($, "price");
|
|
81
|
+
const price = parsePrice(priceEl.attr("content") || priceEl.text());
|
|
82
|
+
if (price != null) result.price = price;
|
|
83
|
+
const currencyEl = microdataField($, "priceCurrency");
|
|
84
|
+
const currency = normalizeCurrency(currencyEl.attr("content") || currencyEl.text());
|
|
85
|
+
if (currency) result.currency = currency;
|
|
86
|
+
const availabilityEl = microdataField($, "availability");
|
|
87
|
+
const availability = normalizeAvailability(
|
|
88
|
+
availabilityEl.attr("content") || availabilityEl.attr("href") || availabilityEl.text()
|
|
89
|
+
);
|
|
90
|
+
if (availability) result.availability = availability;
|
|
91
|
+
return result;
|
|
92
|
+
}
|
|
93
|
+
function collectJsonLdNodes(data) {
|
|
94
|
+
const nodes = [];
|
|
95
|
+
const visit = (value) => {
|
|
96
|
+
if (value == null) return;
|
|
97
|
+
if (Array.isArray(value)) {
|
|
98
|
+
value.forEach(visit);
|
|
99
|
+
return;
|
|
100
|
+
}
|
|
101
|
+
if (typeof value !== "object") return;
|
|
102
|
+
const obj = value;
|
|
103
|
+
nodes.push(obj);
|
|
104
|
+
if (obj["@graph"]) visit(obj["@graph"]);
|
|
105
|
+
};
|
|
106
|
+
visit(data);
|
|
107
|
+
return nodes;
|
|
108
|
+
}
|
|
109
|
+
function isProductType(node) {
|
|
110
|
+
const type = node["@type"];
|
|
111
|
+
const types = Array.isArray(type) ? type : type != null ? [type] : [];
|
|
112
|
+
return types.some((t) => {
|
|
113
|
+
const s = String(t).toLowerCase();
|
|
114
|
+
return s === "product" || s.endsWith("/product");
|
|
115
|
+
});
|
|
116
|
+
}
|
|
117
|
+
function pickOffer(product) {
|
|
118
|
+
const offers = product.offers;
|
|
119
|
+
if (offers == null) return null;
|
|
120
|
+
if (Array.isArray(offers)) {
|
|
121
|
+
const first = offers.find((o) => o && typeof o === "object");
|
|
122
|
+
return first ?? null;
|
|
123
|
+
}
|
|
124
|
+
if (typeof offers === "object") return offers;
|
|
125
|
+
return null;
|
|
126
|
+
}
|
|
127
|
+
function parsePrice(value) {
|
|
128
|
+
if (value == null || value === "") return void 0;
|
|
129
|
+
if (typeof value === "number" && Number.isFinite(value)) return value;
|
|
130
|
+
let s = String(value).trim();
|
|
131
|
+
if (!s) return void 0;
|
|
132
|
+
s = s.replace(/[^\d.,\-]/g, "");
|
|
133
|
+
if (!s || s === "-" || s === ".") return void 0;
|
|
134
|
+
const lastComma = s.lastIndexOf(",");
|
|
135
|
+
const lastDot = s.lastIndexOf(".");
|
|
136
|
+
if (lastComma > -1 && lastDot > -1) {
|
|
137
|
+
if (lastComma > lastDot) {
|
|
138
|
+
s = s.replace(/\./g, "").replace(",", ".");
|
|
139
|
+
} else {
|
|
140
|
+
s = s.replace(/,/g, "");
|
|
141
|
+
}
|
|
142
|
+
} else if (lastComma > -1) {
|
|
143
|
+
const parts = s.split(",");
|
|
144
|
+
if (parts.length === 2 && parts[1].length <= 2) {
|
|
145
|
+
s = parts[0].replace(/\./g, "") + "." + parts[1];
|
|
146
|
+
} else {
|
|
147
|
+
s = s.replace(/,/g, "");
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
const num = parseFloat(s);
|
|
151
|
+
return Number.isFinite(num) ? num : void 0;
|
|
152
|
+
}
|
|
153
|
+
function normalizeCurrency(value) {
|
|
154
|
+
if (value == null) return void 0;
|
|
155
|
+
const s = String(value).trim().toUpperCase();
|
|
156
|
+
if (!s) return void 0;
|
|
157
|
+
const iso = s.match(/[A-Z]{3}/);
|
|
158
|
+
return iso ? iso[0] : s.length <= 4 ? s : void 0;
|
|
159
|
+
}
|
|
160
|
+
function normalizeAvailability(value) {
|
|
161
|
+
if (value == null) return void 0;
|
|
162
|
+
let s = String(value).trim();
|
|
163
|
+
if (!s) return void 0;
|
|
164
|
+
if (s.includes("schema.org/")) {
|
|
165
|
+
const parts = s.split("/");
|
|
166
|
+
s = parts[parts.length - 1] || s;
|
|
167
|
+
}
|
|
168
|
+
s = s.replace(/^https?:\/\/[^/]+\//, "");
|
|
169
|
+
if (s.includes("/")) {
|
|
170
|
+
const parts = s.split("/");
|
|
171
|
+
s = parts[parts.length - 1] || s;
|
|
172
|
+
}
|
|
173
|
+
return s.replace(/\s+/g, "") || void 0;
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
// src/htmlPageExtract.ts
|
|
177
|
+
var DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
|
|
178
|
+
var DEFAULT_REMOVE_SELECTORS = [
|
|
179
|
+
"script",
|
|
180
|
+
"style",
|
|
181
|
+
"nav",
|
|
182
|
+
"header",
|
|
183
|
+
"footer",
|
|
184
|
+
".sidebar",
|
|
185
|
+
".navigation",
|
|
186
|
+
".menu",
|
|
187
|
+
".comments",
|
|
188
|
+
'[role="navigation"]',
|
|
189
|
+
'[role="banner"]'
|
|
190
|
+
];
|
|
191
|
+
function urlToDocumentId(url) {
|
|
192
|
+
return url.replace(/^https?:\/\//, "").replace(/[^a-zA-Z0-9]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "").substring(0, 100);
|
|
193
|
+
}
|
|
194
|
+
function cleanContent(text) {
|
|
195
|
+
return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
|
|
196
|
+
}
|
|
197
|
+
function bodyTextLengthHint(html, options = {}) {
|
|
198
|
+
const $ = cheerio2.load(html);
|
|
199
|
+
stripNoiseFromDom($, options);
|
|
200
|
+
return cleanContent($("body").text().trim()).length;
|
|
201
|
+
}
|
|
202
|
+
function extractPageFromHtml(url, html, options = {}) {
|
|
203
|
+
const $ = cheerio2.load(html);
|
|
204
|
+
stripNoiseFromDom($, options);
|
|
205
|
+
const titleSelector = options.titleSelector || "h1, title";
|
|
206
|
+
let title = $(titleSelector).first().text().trim();
|
|
207
|
+
if (!title) {
|
|
208
|
+
title = $("title").text().trim();
|
|
209
|
+
}
|
|
210
|
+
const content = extractBestContentText($, options);
|
|
211
|
+
const minChars = options.minExtractedContentLength ?? 50;
|
|
212
|
+
const indexable = Boolean(content && content.length >= minChars);
|
|
213
|
+
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || extractHeroImage($, url) || void 0;
|
|
214
|
+
let imageUrl;
|
|
215
|
+
if (image) {
|
|
216
|
+
try {
|
|
217
|
+
imageUrl = new URL(image, url).href;
|
|
218
|
+
} catch {
|
|
219
|
+
imageUrl = image;
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
|
|
223
|
+
let type = options.defaultType || "page";
|
|
224
|
+
if (options.typeFromUrl) {
|
|
225
|
+
for (const [pattern, typeName] of Object.entries(options.typeFromUrl)) {
|
|
226
|
+
if (url.includes(pattern)) {
|
|
227
|
+
type = typeName;
|
|
228
|
+
break;
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
}
|
|
232
|
+
const productMeta = extractProductMetadata(html);
|
|
233
|
+
const metadata = {
|
|
234
|
+
type,
|
|
235
|
+
...title ? { title } : {},
|
|
236
|
+
url,
|
|
237
|
+
...imageUrl ? { imageUrl } : {},
|
|
238
|
+
...description ? { description } : {},
|
|
239
|
+
...productMeta.price != null ? { price: productMeta.price } : {},
|
|
240
|
+
...productMeta.currency ? { currency: productMeta.currency } : {},
|
|
241
|
+
...productMeta.availability ? { availability: productMeta.availability } : {},
|
|
242
|
+
...options.metadata
|
|
243
|
+
};
|
|
244
|
+
const previewLen = 400;
|
|
245
|
+
const contentPreview = content.length > previewLen ? `${content.slice(0, previewLen)}\u2026` : content;
|
|
246
|
+
return {
|
|
247
|
+
id: urlToDocumentId(url),
|
|
248
|
+
metadata,
|
|
249
|
+
content,
|
|
250
|
+
indexable,
|
|
251
|
+
contentPreview
|
|
252
|
+
};
|
|
253
|
+
}
|
|
254
|
+
function stripNoiseFromDom($, options) {
|
|
255
|
+
const removeSelectors = options.removeSelectors ?? DEFAULT_REMOVE_SELECTORS;
|
|
256
|
+
removeSelectors.forEach((selector) => $(selector).remove());
|
|
257
|
+
}
|
|
258
|
+
function extractBestContentText($, options) {
|
|
259
|
+
const contentSelector = options.contentSelector || DEFAULT_CONTENT_SELECTOR;
|
|
260
|
+
const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
|
|
261
|
+
let best = "";
|
|
262
|
+
for (const sel of selectors) {
|
|
263
|
+
$(sel).each((_, el) => {
|
|
264
|
+
const t = cleanContent($(el).text().trim());
|
|
265
|
+
if (t.length > best.length) best = t;
|
|
266
|
+
});
|
|
267
|
+
}
|
|
268
|
+
const bodyText = cleanContent($("body").text().trim());
|
|
269
|
+
if (bodyText.length > best.length) best = bodyText;
|
|
270
|
+
return best;
|
|
271
|
+
}
|
|
272
|
+
function extractHeroImage($, pageUrl) {
|
|
273
|
+
const containers = $('main, article, [role="main"], #content, .content');
|
|
274
|
+
const scope = containers.length > 0 ? containers : $("body");
|
|
275
|
+
let best;
|
|
276
|
+
scope.find("img[src]").each((_, el) => {
|
|
277
|
+
if (best) return false;
|
|
278
|
+
const src = $(el).attr("src") || "";
|
|
279
|
+
const alt = ($(el).attr("alt") || "").toLowerCase();
|
|
280
|
+
const width = parseInt($(el).attr("width") || "0", 10);
|
|
281
|
+
const height = parseInt($(el).attr("height") || "0", 10);
|
|
282
|
+
if (width > 0 && width < 80 || height > 0 && height < 80) return;
|
|
283
|
+
if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
|
|
284
|
+
if (src.startsWith("data:") || src.endsWith(".svg")) return;
|
|
285
|
+
if (src.includes("/_next/image")) {
|
|
286
|
+
try {
|
|
287
|
+
const nextUrl = new URL(src, pageUrl);
|
|
288
|
+
const realUrl = nextUrl.searchParams.get("url");
|
|
289
|
+
if (realUrl) {
|
|
290
|
+
best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
|
|
291
|
+
return false;
|
|
292
|
+
}
|
|
293
|
+
} catch {
|
|
294
|
+
}
|
|
295
|
+
}
|
|
296
|
+
best = src;
|
|
297
|
+
return false;
|
|
298
|
+
});
|
|
299
|
+
return best;
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
// src/WebRAGPlugin.ts
|
|
7
303
|
function bulkOpCurrentUrl(op) {
|
|
8
304
|
const meta = op.document?.metadata;
|
|
9
305
|
if (typeof meta?.url === "string" && meta.url.trim()) return meta.url.trim();
|
|
@@ -22,7 +318,7 @@ function isUrlListingInsert(document) {
|
|
|
22
318
|
return false;
|
|
23
319
|
}
|
|
24
320
|
}
|
|
25
|
-
var WebRAGPlugin = class
|
|
321
|
+
var WebRAGPlugin = class {
|
|
26
322
|
name = "web-rag";
|
|
27
323
|
type = "rag";
|
|
28
324
|
priority;
|
|
@@ -252,6 +548,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
252
548
|
url: doc.metadata.url,
|
|
253
549
|
imageUrl: doc.metadata.imageUrl,
|
|
254
550
|
description: doc.metadata.description,
|
|
551
|
+
...doc.metadata.price != null ? { price: doc.metadata.price } : {},
|
|
552
|
+
...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
|
|
553
|
+
...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
|
|
255
554
|
score: doc.score
|
|
256
555
|
}))
|
|
257
556
|
}
|
|
@@ -1422,7 +1721,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1422
1721
|
return await response.text();
|
|
1423
1722
|
}
|
|
1424
1723
|
extractInternalLinks(html, base, stripQueryParams) {
|
|
1425
|
-
const $ =
|
|
1724
|
+
const $ = cheerio3.load(html);
|
|
1426
1725
|
const links = /* @__PURE__ */ new Set();
|
|
1427
1726
|
$("a[href]").each((_, el) => {
|
|
1428
1727
|
const href = ($(el).attr("href") || "").trim();
|
|
@@ -1588,7 +1887,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1588
1887
|
}
|
|
1589
1888
|
}
|
|
1590
1889
|
try {
|
|
1591
|
-
const { doc, diag, bodyTextLengthHint } = await this.crawlPageSmart(url, config, timeout, {
|
|
1890
|
+
const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
|
|
1592
1891
|
renderMode,
|
|
1593
1892
|
renderOptions,
|
|
1594
1893
|
minContentLength,
|
|
@@ -1619,7 +1918,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1619
1918
|
status: crawlSt,
|
|
1620
1919
|
modeUsed: diag?.modeUsed,
|
|
1621
1920
|
contentLength: doc?.content?.length,
|
|
1622
|
-
bodyTextLengthHint,
|
|
1921
|
+
bodyTextLengthHint: bodyTextLengthHint2,
|
|
1623
1922
|
title: doc?.metadata?.title,
|
|
1624
1923
|
docId: doc?.id,
|
|
1625
1924
|
error: diag?.errorMessage
|
|
@@ -1731,125 +2030,18 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1731
2030
|
const html = await response.text();
|
|
1732
2031
|
return this.extractDocumentFromHtml(url, html, config);
|
|
1733
2032
|
}
|
|
1734
|
-
/**
|
|
1735
|
-
* Default chain works for many WordPress / Elementor / block themes where `.first()`
|
|
1736
|
-
* would otherwise hit an empty wrapper.
|
|
1737
|
-
*/
|
|
1738
|
-
static DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
|
|
1739
|
-
stripNoiseFromDom($, config) {
|
|
1740
|
-
const removeSelectors = config.removeSelectors || [
|
|
1741
|
-
"script",
|
|
1742
|
-
"style",
|
|
1743
|
-
"nav",
|
|
1744
|
-
"header",
|
|
1745
|
-
"footer",
|
|
1746
|
-
".sidebar",
|
|
1747
|
-
".navigation",
|
|
1748
|
-
".menu",
|
|
1749
|
-
".comments",
|
|
1750
|
-
'[role="navigation"]',
|
|
1751
|
-
'[role="banner"]'
|
|
1752
|
-
];
|
|
1753
|
-
removeSelectors.forEach((selector) => $(selector).remove());
|
|
1754
|
-
}
|
|
1755
|
-
/** Longest cleaned text among selector matches and full body (after noise strip). */
|
|
1756
|
-
extractBestContentText($, config) {
|
|
1757
|
-
const contentSelector = config.contentSelector || _WebRAGPlugin.DEFAULT_CONTENT_SELECTOR;
|
|
1758
|
-
const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
|
|
1759
|
-
let best = "";
|
|
1760
|
-
for (const sel of selectors) {
|
|
1761
|
-
$(sel).each((_, el) => {
|
|
1762
|
-
const t = this.cleanContent($(el).text().trim());
|
|
1763
|
-
if (t.length > best.length) best = t;
|
|
1764
|
-
});
|
|
1765
|
-
}
|
|
1766
|
-
const bodyText = this.cleanContent($("body").text().trim());
|
|
1767
|
-
if (bodyText.length > best.length) best = bodyText;
|
|
1768
|
-
return best;
|
|
1769
|
-
}
|
|
1770
2033
|
bodyTextLengthHint(html, config) {
|
|
1771
|
-
|
|
1772
|
-
this.stripNoiseFromDom($, config);
|
|
1773
|
-
return this.cleanContent($("body").text().trim()).length;
|
|
2034
|
+
return bodyTextLengthHint(html, config);
|
|
1774
2035
|
}
|
|
1775
2036
|
extractDocumentFromHtml(url, html, config) {
|
|
1776
|
-
const
|
|
1777
|
-
|
|
1778
|
-
const titleSelector = config.titleSelector || "h1, title";
|
|
1779
|
-
let title = $(titleSelector).first().text().trim();
|
|
1780
|
-
if (!title) {
|
|
1781
|
-
title = $("title").text().trim();
|
|
1782
|
-
}
|
|
1783
|
-
const content = this.extractBestContentText($, config);
|
|
1784
|
-
const minChars = config.minExtractedContentLength ?? 50;
|
|
1785
|
-
if (!content || content.length < minChars) return null;
|
|
1786
|
-
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
|
|
1787
|
-
this.extractHeroImage($, url) || void 0;
|
|
1788
|
-
let imageUrl;
|
|
1789
|
-
if (image) {
|
|
1790
|
-
try {
|
|
1791
|
-
imageUrl = new URL(image, url).href;
|
|
1792
|
-
} catch {
|
|
1793
|
-
imageUrl = image;
|
|
1794
|
-
}
|
|
1795
|
-
}
|
|
1796
|
-
const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
|
|
1797
|
-
let type = config.defaultType || "page";
|
|
1798
|
-
if (config.typeFromUrl) {
|
|
1799
|
-
for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
|
|
1800
|
-
if (url.includes(pattern)) {
|
|
1801
|
-
type = typeName;
|
|
1802
|
-
break;
|
|
1803
|
-
}
|
|
1804
|
-
}
|
|
1805
|
-
}
|
|
1806
|
-
const id = this.urlToId(url);
|
|
2037
|
+
const extracted = extractPageFromHtml(url, html, config);
|
|
2038
|
+
if (!extracted.indexable) return null;
|
|
1807
2039
|
return {
|
|
1808
|
-
id,
|
|
1809
|
-
content,
|
|
1810
|
-
metadata:
|
|
1811
|
-
type,
|
|
1812
|
-
title,
|
|
1813
|
-
url,
|
|
1814
|
-
...imageUrl ? { imageUrl } : {},
|
|
1815
|
-
...description ? { description } : {},
|
|
1816
|
-
...config.metadata
|
|
1817
|
-
}
|
|
2040
|
+
id: extracted.id,
|
|
2041
|
+
content: extracted.content,
|
|
2042
|
+
metadata: extracted.metadata
|
|
1818
2043
|
};
|
|
1819
2044
|
}
|
|
1820
|
-
/**
|
|
1821
|
-
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
1822
|
-
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
1823
|
-
*/
|
|
1824
|
-
extractHeroImage($, pageUrl) {
|
|
1825
|
-
const containers = $('main, article, [role="main"], #content, .content');
|
|
1826
|
-
const scope = containers.length > 0 ? containers : $("body");
|
|
1827
|
-
let best;
|
|
1828
|
-
scope.find("img[src]").each((_, el) => {
|
|
1829
|
-
if (best) return false;
|
|
1830
|
-
const src = $(el).attr("src") || "";
|
|
1831
|
-
const alt = ($(el).attr("alt") || "").toLowerCase();
|
|
1832
|
-
const width = parseInt($(el).attr("width") || "0", 10);
|
|
1833
|
-
const height = parseInt($(el).attr("height") || "0", 10);
|
|
1834
|
-
if (width > 0 && width < 80 || height > 0 && height < 80) return;
|
|
1835
|
-
if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
|
|
1836
|
-
if (src.startsWith("data:") || src.endsWith(".svg")) return;
|
|
1837
|
-
if (src.includes("/_next/image")) {
|
|
1838
|
-
try {
|
|
1839
|
-
const nextUrl = new URL(src, pageUrl);
|
|
1840
|
-
const realUrl = nextUrl.searchParams.get("url");
|
|
1841
|
-
if (realUrl) {
|
|
1842
|
-
best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
|
|
1843
|
-
return false;
|
|
1844
|
-
}
|
|
1845
|
-
} catch {
|
|
1846
|
-
}
|
|
1847
|
-
}
|
|
1848
|
-
best = src;
|
|
1849
|
-
return false;
|
|
1850
|
-
});
|
|
1851
|
-
return best;
|
|
1852
|
-
}
|
|
1853
2045
|
looksLikeDynamicShell(html) {
|
|
1854
2046
|
const lower = html.toLowerCase();
|
|
1855
2047
|
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
|
|
@@ -1867,7 +2059,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1867
2059
|
const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
|
|
1868
2060
|
return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
|
|
1869
2061
|
}
|
|
1870
|
-
diagFromRenderedAttempt(doc,
|
|
2062
|
+
diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
|
|
1871
2063
|
if (blockedSuspected) {
|
|
1872
2064
|
return {
|
|
1873
2065
|
doc: null,
|
|
@@ -1883,12 +2075,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1883
2075
|
return {
|
|
1884
2076
|
doc,
|
|
1885
2077
|
diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
|
|
1886
|
-
bodyTextLengthHint: doc ? void 0 :
|
|
2078
|
+
bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
|
|
1887
2079
|
};
|
|
1888
2080
|
}
|
|
1889
2081
|
async crawlPageSmart(url, config, timeout, ctx) {
|
|
1890
2082
|
if (ctx.renderMode === true) {
|
|
1891
|
-
const { doc, bodyTextLengthHint, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
2083
|
+
const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
1892
2084
|
url,
|
|
1893
2085
|
config,
|
|
1894
2086
|
timeout,
|
|
@@ -1897,7 +2089,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1897
2089
|
);
|
|
1898
2090
|
return this.diagFromRenderedAttempt(
|
|
1899
2091
|
doc,
|
|
1900
|
-
|
|
2092
|
+
bodyTextLengthHint2,
|
|
1901
2093
|
renderFailure,
|
|
1902
2094
|
blockedSuspected,
|
|
1903
2095
|
"render_ok",
|
|
@@ -2014,7 +2206,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2014
2206
|
}
|
|
2015
2207
|
}
|
|
2016
2208
|
const html = await page.content();
|
|
2017
|
-
const
|
|
2209
|
+
const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
|
|
2018
2210
|
const doc = this.extractDocumentFromHtml(url, html, config);
|
|
2019
2211
|
if (config.debug?.saveDir && config.debug?.enabled) {
|
|
2020
2212
|
try {
|
|
@@ -2029,7 +2221,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2029
2221
|
dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
|
|
2030
2222
|
}
|
|
2031
2223
|
}
|
|
2032
|
-
return { doc, bodyTextLengthHint };
|
|
2224
|
+
return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
|
|
2033
2225
|
} catch (e) {
|
|
2034
2226
|
const msg = String(e?.message || e || "render_failed");
|
|
2035
2227
|
const lower = msg.toLowerCase();
|
|
@@ -2121,14 +2313,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2121
2313
|
/**
|
|
2122
2314
|
* Clean extracted text content
|
|
2123
2315
|
*/
|
|
2124
|
-
cleanContent(text) {
|
|
2125
|
-
return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
|
|
2126
|
-
}
|
|
2127
|
-
/**
|
|
2128
|
-
* Convert URL to a stable document ID
|
|
2129
|
-
*/
|
|
2130
2316
|
urlToId(url) {
|
|
2131
|
-
return url
|
|
2317
|
+
return urlToDocumentId(url);
|
|
2132
2318
|
}
|
|
2133
2319
|
/**
|
|
2134
2320
|
* Delay helper
|
|
@@ -2397,5 +2583,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2397
2583
|
}
|
|
2398
2584
|
};
|
|
2399
2585
|
export {
|
|
2400
|
-
WebRAGPlugin
|
|
2586
|
+
WebRAGPlugin,
|
|
2587
|
+
bodyTextLengthHint,
|
|
2588
|
+
extractPageFromHtml,
|
|
2589
|
+
extractProductMetadata,
|
|
2590
|
+
normalizeAvailability,
|
|
2591
|
+
normalizeCurrency,
|
|
2592
|
+
parsePrice,
|
|
2593
|
+
urlToDocumentId
|
|
2401
2594
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@snap-agent/rag-web",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.6",
|
|
4
4
|
"description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"module": "./dist/index.mjs",
|