edgecrawl 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,218 @@
1
+ // src/scraper.mjs
2
+ // Playwright headless browser scraping
3
+
4
+ import { chromium } from "playwright";
5
+
6
let browser = null;

// Launch currently in flight, shared by concurrent callers so that only one
// Chromium process is ever started. Reset to null as soon as it settles.
let pendingLaunch = null;

/**
 * Launch the shared browser instance.
 *
 * Does nothing when a browser is already running. When a launch is already
 * in progress, waits for that launch instead of starting a second one; in
 * that case the options of the first caller are the ones that take effect.
 *
 * @param {object} options
 * @param {boolean} options.headless - Headless mode (default: true)
 * @param {string} options.proxy - Proxy server URL
 * @returns {Promise<void>} Resolves once the shared browser is available;
 *   rejects with the launch error if the launch fails.
 */
export async function launchBrowser(options = {}) {
  if (browser) return;

  if (pendingLaunch) {
    await pendingLaunch;
    return;
  }

  const { headless = true, proxy } = options;

  const launching = chromium
    .launch({
      headless,
      ...(proxy && { proxy: { server: proxy } }),
    })
    .then((instance) => {
      browser = instance;
    });

  pendingLaunch = launching;
  try {
    await launching;
  } finally {
    // Clear on failure as well, so a later call can retry the launch.
    pendingLaunch = null;
  }
}
24
+
25
/**
 * Create a browser context on the shared browser (shared helper).
 *
 * Applies the optional user agent and viewport at creation time, then adds
 * cookies and extra HTTP headers. If adding cookies or headers fails, the
 * freshly created context is closed before the error is rethrown so it does
 * not leak.
 *
 * @param {object} options
 * @param {string|null} options.userAgent - User agent override (optional)
 * @param {number} options.viewportWidth - Viewport width; applied only when
 *   both width and height are truthy
 * @param {number} options.viewportHeight - Viewport height
 * @param {string[]} options.cookies - Cookies as "name=value" strings; only
 *   applied when targetUrl is given
 * @param {string[]} options.extraHeaders - Headers as "Name: value" strings;
 *   entries without a name before the colon are skipped
 * @param {string|null} targetUrl - URL the cookies are scoped to
 * @returns {Promise<object>} The new browser context
 */
async function createContext(options = {}, targetUrl = null) {
  const {
    userAgent = null,
    viewportWidth,
    viewportHeight,
    cookies = [],
    extraHeaders = [],
  } = options;

  const context = await browser.newContext({
    ...(userAgent && { userAgent }),
    ...(viewportWidth && viewportHeight && {
      viewport: { width: viewportWidth, height: viewportHeight },
    }),
  });

  try {
    // Set cookies
    if (cookies.length > 0 && targetUrl) {
      const parsed = cookies.map((cookie) => {
        // Split on the first "=" only; the value may itself contain "=".
        const [name, ...rest] = cookie.split("=");
        return { name: name.trim(), value: rest.join("="), url: targetUrl };
      });
      await context.addCookies(parsed);
    }

    // Set extra HTTP headers
    if (extraHeaders.length > 0) {
      const headers = {};
      for (const header of extraHeaders) {
        const separator = header.indexOf(":");
        if (separator > 0) {
          const headerName = header.slice(0, separator).trim();
          headers[headerName] = header.slice(separator + 1).trim();
        }
      }
      await context.setExtraHTTPHeaders(headers);
    }
  } catch (error) {
    // Best-effort cleanup: the setup error is the one the caller needs to see.
    await context.close().catch(() => {});
    throw error;
  }

  return context;
}
68
+
69
/**
 * Fetch the rendered HTML of a URL.
 *
 * Launches the shared browser on demand, opens a fresh context and page,
 * navigates, optionally waits for a selector, waits for the DOM to settle,
 * optionally scrolls to the bottom, and returns the page content. The
 * context is always closed afterwards, including when page creation, route
 * setup or navigation fails.
 *
 * @param {string} url
 * @param {object} options - Also forwarded to createContext
 * @param {string} options.waitUntil - Navigation wait condition (default: "load")
 * @param {number} options.timeout - Navigation timeout in ms (default: 30000)
 * @param {string|null} options.waitForSelector - Selector to wait for after
 *   navigation (waits up to 10000 ms)
 * @param {boolean} options.scrollToBottom - Scroll to the bottom before
 *   reading the content (default: false)
 * @param {boolean} options.blockMedia - Abort image, media and font requests
 *   (default: true)
 * @returns {Promise<{ html: string, url: string, status: number }>} `url` is
 *   the page URL after redirects; `status` is 0 when there is no response.
 */
export async function fetchPage(url, options = {}) {
  if (!browser) await launchBrowser();

  const {
    waitUntil = "load",
    timeout = 30000,
    waitForSelector = null,
    scrollToBottom = false,
    blockMedia = true,
  } = options;

  const context = await createContext(options, url);

  try {
    // Everything after context creation stays inside the try block so the
    // context is released even if creating the page or the route fails.
    const page = await context.newPage();

    // Block images, fonts, media for faster loading
    if (blockMedia) {
      const blockedTypes = ["image", "media", "font"];
      await page.route("**/*", (route) => {
        const type = route.request().resourceType();
        return blockedTypes.includes(type) ? route.abort() : route.continue();
      });
    }

    const response = await page.goto(url, { waitUntil, timeout });

    // Wait for dynamic content
    if (waitForSelector) {
      await page.waitForSelector(waitForSelector, { timeout: 10000 });
    }

    // SPA support: wait for DOM to stabilize
    await waitForDOMStable(page);

    // Lazy-loading: scroll to bottom
    if (scrollToBottom) {
      await autoScroll(page);
    }

    const html = await page.content();

    return {
      html,
      url: page.url(), // URL after redirects
      status: response?.status() ?? 0,
    };
  } finally {
    await context.close();
  }
}
127
+
128
/**
 * Fetch multiple URLs in parallel with a bounded number of workers.
 *
 * Failures are captured per URL instead of being thrown. Results are
 * returned in the same order as `urls`.
 *
 * @param {string[]} urls
 * @param {object} options - Passed through to fetchPage for every URL
 * @param {number} concurrency - Concurrency limit (default: 3). Values that
 *   are not a number of at least 1 are treated as 1 so every URL is still
 *   fetched.
 * @returns {Promise<object[]>} One entry per URL. On success: the fetchPage
 *   result (its `url` is the post-redirect URL) plus `error: null`. On
 *   failure: `{ url, html: null, status: 0, error }` with the error message.
 */
export async function fetchPages(urls, options = {}, concurrency = 3) {
  // Snapshot the list so later mutation by the caller cannot affect the run.
  const pending = [...urls];
  const results = new Array(pending.length);

  const requested = Math.floor(Number(concurrency)) || 1;
  const workerCount = Math.min(pending.length, Math.max(1, requested));

  let nextIndex = 0;
  const runWorker = async () => {
    while (nextIndex < pending.length) {
      // Claim the index synchronously so no two workers take the same URL.
      const index = nextIndex;
      nextIndex += 1;
      const url = pending[index];
      try {
        const page = await fetchPage(url, options);
        results[index] = { url, ...page, error: null };
      } catch (error) {
        results[index] = {
          url,
          html: null,
          status: 0,
          error: error?.message ?? String(error),
        };
      }
    }
  };

  await Promise.all(Array.from({ length: workerCount }, () => runWorker()));
  return results;
}
153
+
154
/**
 * Wait for DOM to stabilize (SPA support).
 *
 * Runs inside the page: a MutationObserver watches child-list changes in the
 * whole subtree and the wait ends once no mutation has been seen for
 * `stableMs`, or after `timeoutMs` at the latest. Documents without a body
 * fall back to the root element; if there is no element at all the wait
 * ends immediately instead of throwing.
 *
 * @param {object} page - Page object exposing `evaluate(fn, arg)`
 * @param {number} stableMs - Quiet period that counts as "stable" (default: 1000)
 * @param {number} timeoutMs - Upper bound for the whole wait (default: 10000)
 * @returns {Promise<void>}
 */
async function waitForDOMStable(page, stableMs = 1000, timeoutMs = 10000) {
  await page.evaluate(({ stableMs, timeoutMs }) => {
    return new Promise((resolve) => {
      const root = document.body || document.documentElement;
      if (!root) {
        resolve();
        return;
      }

      let quietTimer = null;
      let hardTimer = null;
      let observer = null;

      // Single exit path: stop observing and cancel both timers so nothing
      // is left pending in the page after the wait ends.
      const finish = () => {
        clearTimeout(quietTimer);
        clearTimeout(hardTimer);
        observer.disconnect();
        resolve();
      };

      observer = new MutationObserver(() => {
        // Every mutation restarts the quiet period.
        clearTimeout(quietTimer);
        quietTimer = setTimeout(finish, stableMs);
      });
      observer.observe(root, {
        childList: true,
        subtree: true,
      });

      quietTimer = setTimeout(finish, stableMs);
      hardTimer = setTimeout(finish, timeoutMs);
    });
  }, { stableMs, timeoutMs });
}
184
+
185
/**
 * Auto-scroll to bottom of page (for lazy-loaded content).
 *
 * Runs inside the page: scrolls down by `distance` pixels every
 * `intervalMs` until the accumulated distance reaches the current scroll
 * height, or until `timeoutMs` has elapsed. The scroll height is re-read on
 * every step. Documents without a body use the root element; if there is
 * no element at all the function returns immediately.
 *
 * @param {object} page - Page object exposing `evaluate(fn, arg)`
 * @param {object} options
 * @param {number} options.distance - Pixels per scroll step (default: 400)
 * @param {number} options.intervalMs - Delay between steps in ms (default: 100)
 * @param {number} options.timeoutMs - Upper bound for scrolling in ms (default: 10000)
 * @returns {Promise<void>}
 */
async function autoScroll(page, options = {}) {
  const { distance = 400, intervalMs = 100, timeoutMs = 10000 } = options;

  await page.evaluate(({ distance, intervalMs, timeoutMs }) => {
    return new Promise((resolve) => {
      const root = document.body || document.documentElement;
      if (!root) {
        resolve();
        return;
      }

      let scrolled = 0;
      let ticker = null;
      let deadline = null;

      // Single exit path so neither timer outlives the scroll.
      const finish = () => {
        clearInterval(ticker);
        clearTimeout(deadline);
        resolve();
      };

      ticker = setInterval(() => {
        const scrollHeight = root.scrollHeight;
        window.scrollBy(0, distance);
        scrolled += distance;
        if (scrolled >= scrollHeight) {
          finish();
        }
      }, intervalMs);

      deadline = setTimeout(finish, timeoutMs);
    });
  }, { distance, intervalMs, timeoutMs });
}
209
+
210
/**
 * Close the shared browser, if one is open.
 *
 * The module-level reference is cleared before `close()` is awaited, so a
 * failing close does not leave a dead handle behind and overlapping calls
 * close the browser only once.
 *
 * @returns {Promise<void>} Rejects with the error from `close()` if closing fails.
 */
export async function closeBrowser() {
  if (!browser) return;

  const instance = browser;
  browser = null;
  await instance.close();
}
@@ -0,0 +1,226 @@
1
+ // src/structured-extract.mjs
2
+ // Extract structured data (JSON-LD, Open Graph) from HTML
3
+ // If schema match coverage is sufficient, return result without LLM
4
+
5
+ import { JSDOM } from "jsdom";
6
+
7
/**
 * Extract JSON-LD data from HTML.
 *
 * Parses every `<script type="application/ld+json">` element. Arrays and
 * `@graph` containers are expanded (also when nested inside each other),
 * and only plain objects are returned: null and primitive entries are
 * dropped so callers can flatten every item safely. Scripts whose content
 * is not valid JSON are ignored.
 *
 * @param {string} html - Raw HTML
 * @returns {object[]} Array of JSON-LD objects
 */
function extractJsonLd(html) {
  const dom = new JSDOM(html);
  const scripts = dom.window.document.querySelectorAll('script[type="application/ld+json"]');
  const results = [];

  // Walk a parsed JSON-LD value and collect the node objects it contains.
  const collect = (node) => {
    if (Array.isArray(node)) {
      for (const entry of node) collect(entry);
      return;
    }
    if (node === null || typeof node !== "object") return;

    // Expand @graph if present
    if (Array.isArray(node["@graph"])) {
      for (const entry of node["@graph"]) collect(entry);
      return;
    }
    results.push(node);
  };

  for (const script of scripts) {
    let data;
    try {
      data = JSON.parse(script.textContent);
    } catch {
      // Ignore JSON parse failures: one malformed block must not discard
      // the other JSON-LD blocks on the page.
      continue;
    }
    collect(data);
  }

  return results;
}
35
+
36
/**
 * Extract Open Graph meta tags from HTML.
 *
 * Collects `og:*` and `twitter:*` meta tags plus the description meta tag
 * (as "meta:description") and the document title (as "meta:title"). Tags
 * with missing or empty content are skipped, so every value in the result
 * is a non-empty string.
 *
 * @param {string} html - Raw HTML
 * @returns {object} OG property dictionary { "og:title": "...", ... }
 */
function extractOpenGraph(html) {
  const dom = new JSDOM(html);
  const doc = dom.window.document;
  const og = {};

  // og:* meta tags
  for (const meta of doc.querySelectorAll('meta[property^="og:"]')) {
    const prop = meta.getAttribute("property");
    const content = meta.getAttribute("content");
    if (prop && content) {
      og[prop] = content;
    }
  }

  // twitter:* meta tags (sites use either the name or the property attribute)
  for (const meta of doc.querySelectorAll('meta[name^="twitter:"], meta[property^="twitter:"]')) {
    const prop = meta.getAttribute("name") || meta.getAttribute("property");
    const content = meta.getAttribute("content");
    if (prop && content) {
      og[prop] = content;
    }
  }

  // Basic meta tags. Same rule as above: a tag without usable content is
  // left out rather than stored as null or "".
  const description = doc.querySelector('meta[name="description"]')?.getAttribute("content");
  if (description) og["meta:description"] = description;

  const title = doc.querySelector("title")?.textContent.trim();
  if (title) og["meta:title"] = title;

  return og;
}
75
+
76
/**
 * Flatten JSON-LD data into key-value pairs.
 *
 * Nested objects become "parent.child" keys. Keys starting with "@"
 * (@type, @context, ...) are skipped. Arrays and null values are stored
 * as-is and not descended into.
 *
 * @param {object} obj - Object to flatten; null or a non-object yields {}
 * @param {string} prefix - Key prefix used while recursing (default: "")
 * @returns {object} Flat dictionary of dotted keys to values
 */
function flattenObject(obj, prefix = "") {
  const result = {};

  // Structured data comes from arbitrary pages; tolerate junk entries
  // instead of throwing from Object.entries.
  if (obj === null || typeof obj !== "object") return result;

  for (const [key, value] of Object.entries(obj)) {
    if (key.startsWith("@")) continue; // Skip @type, @context, etc.

    const fullKey = prefix ? `${prefix}.${key}` : key;
    const isNestedObject =
      value !== null && typeof value === "object" && !Array.isArray(value);

    if (isNestedObject) {
      Object.assign(result, flattenObject(value, fullKey));
    } else {
      result[fullKey] = value;
    }
  }

  return result;
}
97
+
98
// Key name alias map (common mapping patterns): schema-key fragment ->
// structured-data keys to try, in priority order.
const KEY_ALIASES = Object.freeze({
  title: ["name", "headline", "og:title", "twitter:title", "meta:title"],
  description: ["description", "abstract", "og:description", "twitter:description", "meta:description"],
  price: ["price", "offers.price", "offers.lowPrice"],
  currency: ["priceCurrency", "offers.priceCurrency"],
  image: ["image", "thumbnailUrl", "og:image", "twitter:image"],
  url: ["url", "mainEntityOfPage", "og:url"],
  author: ["author.name", "author", "creator"],
  date: ["datePublished", "dateCreated", "dateModified"],
  published: ["datePublished", "dateCreated"],
  brand: ["brand.name", "brand"],
  category: ["category", "articleSection"],
  rating: ["aggregateRating.ratingValue"],
  reviewCount: ["aggregateRating.reviewCount"],
  availability: ["offers.availability"],
  language: ["inLanguage"],
});

/**
 * Match structured data against a schema.
 *
 * For each schema key a value is looked up in three steps, first hit wins:
 *   1. Direct match on the exact key.
 *   2. Alias match: the schema key (lowercased, "_" and "-" removed)
 *      contains an alias group name, and one of that group's data keys
 *      exists.
 *   3. Partial match: the normalized data key contains the normalized
 *      schema key, or vice versa.
 * Only own properties of `structuredData` are considered.
 *
 * @param {object} structuredData - Flattened structured data
 * @param {object} schema - User-defined schema
 * @returns {{ matched: object, coverage: number }} Match results and
 *   coverage ratio (matched keys / schema keys, 0 for an empty schema)
 */
function matchToSchema(structuredData, schema) {
  const schemaKeys = Object.keys(schema);
  const matched = {};
  let matchCount = 0;

  // Own-property lookup, so schema keys such as "toString" or
  // "constructor" cannot match inherited Object.prototype members.
  const readOwn = (key) =>
    Object.prototype.hasOwnProperty.call(structuredData, key)
      ? structuredData[key]
      : undefined;

  // Alias groups compared in lowercase: the schema key is lowercased before
  // the comparison, so a mixed-case group name would never match.
  const aliasGroups = Object.entries(KEY_ALIASES).map(([group, aliases]) => [
    group.toLowerCase(),
    aliases,
  ]);

  // Normalized data keys, computed once instead of once per schema key.
  const normalizedData = Object.entries(structuredData).map(([dataKey, dataValue]) => [
    dataKey.toLowerCase().replace(/[_.-]/g, ""),
    dataValue,
  ]);

  const findByAlias = (normalizedKey) => {
    for (const [group, aliases] of aliasGroups) {
      if (!normalizedKey.includes(group)) continue;
      for (const alias of aliases) {
        const value = readOwn(alias);
        if (value !== undefined) return { value };
      }
    }
    return null;
  };

  const findByPartialKey = (normalizedKey) => {
    // An empty string is contained in every string; without this guard a
    // key that normalizes to "" would match arbitrary data.
    if (normalizedKey === "") return null;
    for (const [normalizedDataKey, value] of normalizedData) {
      if (normalizedDataKey === "") continue;
      if (normalizedDataKey.includes(normalizedKey) || normalizedKey.includes(normalizedDataKey)) {
        return { value };
      }
    }
    return null;
  };

  for (const schemaKey of schemaKeys) {
    const normalizedKey = schemaKey.toLowerCase().replace(/[_-]/g, "");

    // 1. Direct key match
    const direct = readOwn(schemaKey);
    const hit =
      direct !== undefined
        ? { value: direct }
        : // 2. Alias match, then 3. Partial match
          findByAlias(normalizedKey) ?? findByPartialKey(normalizedKey);

    if (hit) {
      matched[schemaKey] = hit.value;
      matchCount++;
    }
  }

  return {
    matched,
    coverage: schemaKeys.length > 0 ? matchCount / schemaKeys.length : 0,
  };
}
172
+
173
/**
 * Extract structured data from HTML and match against schema.
 * Returns result if coverage meets threshold (no LLM needed).
 *
 * Sources are tried in order and the first one reaching `minCoverage` wins:
 *   1. JSON-LD (all items flattened and merged, later items override earlier)
 *   2. Open Graph / meta tags
 *   3. JSON-LD and Open Graph merged (Open Graph values override JSON-LD)
 * Open Graph data is only extracted when JSON-LD alone is not sufficient.
 * JSON-LD items that are not objects are ignored.
 *
 * @param {string} html - Raw HTML
 * @param {object} schema - User-defined schema
 * @param {object} options
 * @param {number} options.minCoverage - Minimum coverage ratio (0-1, default: 0.5)
 * @returns {{ extracted: object, source: string, coverage: number } | null}
 *   null when no source reaches the threshold (caller falls back to the LLM)
 */
export function tryStructuredExtract(html, schema, options = {}) {
  const { minCoverage = 0.5 } = options;

  // Match one candidate data set; returns the result object or null.
  const attempt = (data, source) => {
    const { matched, coverage } = matchToSchema(data, schema);
    return coverage >= minCoverage ? { extracted: matched, source, coverage } : null;
  };

  // 1. Try extraction from JSON-LD
  // Pages can contain null or primitive JSON-LD entries; skip them rather
  // than letting one junk entry abort the whole extraction.
  const jsonLdItems = extractJsonLd(html).filter(
    (item) => item !== null && typeof item === "object",
  );
  const hasJsonLd = jsonLdItems.length > 0;

  // Merge and flatten all JSON-LD items once; reused by step 3.
  const jsonLdData = {};
  for (const item of jsonLdItems) {
    Object.assign(jsonLdData, flattenObject(item));
  }

  if (hasJsonLd) {
    const fromJsonLd = attempt(jsonLdData, "json-ld");
    if (fromJsonLd) return fromJsonLd;
  }

  // 2. Try extraction from Open Graph
  const ogData = extractOpenGraph(html);
  const hasOpenGraph = Object.keys(ogData).length > 0;

  if (hasOpenGraph) {
    const fromOpenGraph = attempt(ogData, "open-graph");
    if (fromOpenGraph) return fromOpenGraph;
  }

  // 3. Merge JSON-LD + OG and retry
  if (hasJsonLd && hasOpenGraph) {
    const fromBoth = attempt({ ...jsonLdData, ...ogData }, "json-ld+open-graph");
    if (fromBoth) return fromBoth;
  }

  // Coverage insufficient -> null (fallback to LLM)
  return null;
}