edgecrawl 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +260 -0
- package/index.mjs +22 -0
- package/package.json +63 -0
- package/schemas/article.json +15 -0
- package/schemas/news.json +19 -0
- package/schemas/product.json +20 -0
- package/src/cli.mjs +226 -0
- package/src/html2md.mjs +268 -0
- package/src/llm.mjs +281 -0
- package/src/pipeline.mjs +165 -0
- package/src/scraper.mjs +218 -0
- package/src/structured-extract.mjs +226 -0
package/src/scraper.mjs
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
// src/scraper.mjs
|
|
2
|
+
// Playwright headless browser scraping
|
|
3
|
+
|
|
4
|
+
import { chromium } from "playwright";
|
|
5
|
+
|
|
6
|
+
let browser = null;

// In-flight launch, shared by concurrent callers so only one browser process is started.
let browserLaunch = null;

/**
 * Launch the shared browser instance.
 *
 * Does nothing when a browser is already running. Calls made while a launch
 * is still in progress wait for that same launch instead of each starting
 * their own browser (the extra instances would never be closed). The options
 * of the call that starts the launch are the ones that take effect.
 *
 * @param {object} options
 * @param {boolean} options.headless - Headless mode (default: true)
 * @param {string} options.proxy - Proxy server URL
 * @returns {Promise<void>}
 */
export async function launchBrowser(options = {}) {
  if (browser) return;

  if (!browserLaunch) {
    const { headless = true, proxy } = options;

    browserLaunch = chromium
      .launch({
        headless,
        ...(proxy && { proxy: { server: proxy } }),
      })
      .then((launched) => {
        browser = launched;
      })
      .finally(() => {
        // Reset on success and on failure so a failed launch can be retried.
        browserLaunch = null;
      });
  }

  await browserLaunch;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Create a browser context (shared helper).
 *
 * Applies the optional user agent and viewport when the context is created,
 * then sets cookies and extra HTTP headers. If that setup fails, the new
 * context is closed before the error is rethrown so it does not stay open in
 * the shared browser.
 *
 * @param {object} options
 * @param {string|null} options.userAgent - User-Agent override
 * @param {number} options.viewportWidth - Viewport width; only used together with viewportHeight
 * @param {number} options.viewportHeight - Viewport height; only used together with viewportWidth
 * @param {string[]} options.cookies - Cookies as "name=value" strings
 * @param {string[]} options.extraHeaders - Headers as "Name: value" strings
 * @param {string|null} targetUrl - URL the cookies are attached to; cookies are skipped without it
 * @returns {Promise<object>} The configured browser context
 */
async function createContext(options = {}, targetUrl = null) {
  const {
    userAgent = null,
    viewportWidth,
    viewportHeight,
    cookies = [],
    extraHeaders = [],
  } = options;

  const context = await browser.newContext({
    ...(userAgent && { userAgent }),
    ...(viewportWidth && viewportHeight && {
      viewport: { width: viewportWidth, height: viewportHeight },
    }),
  });

  try {
    // Set cookies
    if (cookies.length > 0 && targetUrl) {
      const parsed = cookies.map((cookie) => {
        // Split on the first "=" only; the value itself may contain "=".
        const [name, ...rest] = cookie.split("=");
        return { name, value: rest.join("="), url: targetUrl };
      });
      await context.addCookies(parsed);
    }

    // Set extra HTTP headers
    if (extraHeaders.length > 0) {
      const headers = {};
      for (const header of extraHeaders) {
        const separator = header.indexOf(":");
        // Entries without a name in front of the colon are ignored.
        if (separator > 0) {
          headers[header.slice(0, separator).trim()] = header.slice(separator + 1).trim();
        }
      }
      await context.setExtraHTTPHeaders(headers);
    }
  } catch (error) {
    // Best-effort cleanup: the setup error is the one the caller needs to see.
    await context.close().catch(() => {});
    throw error;
  }

  return context;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Fetch the rendered HTML of a URL.
 *
 * Starts the shared browser with default launch options if none is running,
 * loads the page in a fresh context, waits for the DOM to settle and
 * optionally scrolls to the bottom. The context is closed again in every
 * case, including when opening the page or installing the request filter
 * fails.
 *
 * @param {string} url
 * @param {object} options - Also passed on to createContext
 * @param {string} options.waitUntil - Wait condition handed to page.goto (default: "load")
 * @param {number} options.timeout - Navigation timeout in ms (default: 30000)
 * @param {string|null} options.waitForSelector - Selector to wait for after navigation (waits up to 10 s)
 * @param {boolean} options.scrollToBottom - Scroll down to trigger lazy loading (default: false)
 * @param {boolean} options.blockMedia - Abort image, media and font requests (default: true)
 * @returns {Promise<{ html: string, url: string, status: number }>}
 */
export async function fetchPage(url, options = {}) {
  if (!browser) await launchBrowser();

  const {
    waitUntil = "load",
    timeout = 30000,
    waitForSelector = null,
    scrollToBottom = false,
    blockMedia = true,
  } = options;

  const context = await createContext(options, url);

  // Everything that uses the context sits inside try/finally so the context
  // is released even when newPage() or route() throws.
  try {
    const page = await context.newPage();

    // Block images, fonts, media for faster loading
    if (blockMedia) {
      await page.route("**/*", (route) => {
        const type = route.request().resourceType();
        if (["image", "media", "font"].includes(type)) {
          return route.abort();
        }
        return route.continue();
      });
    }

    const response = await page.goto(url, { waitUntil, timeout });

    // Wait for dynamic content
    if (waitForSelector) {
      await page.waitForSelector(waitForSelector, { timeout: 10000 });
    }

    // SPA support: wait for DOM to stabilize
    await waitForDOMStable(page);

    // Lazy-loading: scroll to bottom
    if (scrollToBottom) {
      await autoScroll(page);
    }

    const html = await page.content();

    return {
      html,
      url: page.url(), // URL after redirects
      status: response?.status() ?? 0, // 0 when goto() yields no response
    };
  } finally {
    await context.close();
  }
}
|
|
127
|
+
|
|
128
|
+
/**
 * Fetch multiple URLs in parallel.
 *
 * A pool of workers pulls URLs from a shared queue. A failed fetch does not
 * abort the batch: it is reported as an entry with html: null, status: 0 and
 * the error message. Entries are returned in the same order as `urls`,
 * regardless of which fetch finishes first.
 *
 * @param {string[]} urls
 * @param {object} options - Passed to fetchPage for every URL
 * @param {number} concurrency - Concurrency limit (default: 3); values below 1
 *   or non-numeric values are treated as 1 so the URLs are still fetched
 * @returns {Promise<object[]>} One entry per URL
 */
export async function fetchPages(urls, options = {}, concurrency = 3) {
  const queue = [...urls];
  const results = new Array(queue.length);
  let nextIndex = 0;

  // Never start zero workers (that would silently return nothing) and never
  // more workers than there are URLs.
  const requested = Math.floor(Number(concurrency));
  const workerCount = Math.min(
    queue.length,
    Math.max(1, Number.isNaN(requested) ? 1 : requested),
  );

  const runWorker = async () => {
    while (nextIndex < queue.length) {
      // Claiming the index is synchronous, so two workers never take the same URL.
      const index = nextIndex;
      nextIndex += 1;
      const url = queue[index];
      try {
        const result = await fetchPage(url, options);
        results[index] = { url, ...result, error: null };
      } catch (error) {
        results[index] = { url, html: null, status: 0, error: error.message };
      }
    }
  };

  await Promise.all(Array.from({ length: workerCount }, runWorker));
  return results;
}
|
|
153
|
+
|
|
154
|
+
/**
 * Wait for the DOM to stabilize (SPA support).
 *
 * Inside the page a MutationObserver watches for child-list changes in the
 * whole subtree. The wait ends once no such change has been seen for
 * `stableMs`, or after `timeoutMs` at the latest.
 *
 * @param {object} page - Page object providing evaluate()
 * @param {number} stableMs - Quiet period that counts as "stable" (default: 1000)
 * @param {number} timeoutMs - Upper bound for the whole wait (default: 10000)
 * @returns {Promise<void>}
 */
async function waitForDOMStable(page, stableMs = 1000, timeoutMs = 10000) {
  await page.evaluate(({ stableMs, timeoutMs }) => {
    return new Promise((resolve) => {
      // document.body is null when the document has no body element; observing
      // null would throw, so fall back to the root element.
      const root = document.body || document.documentElement;
      if (!root) {
        resolve();
        return;
      }

      let quietTimer = null;
      let hardTimer = null;

      // Single exit path: stop observing and cancel whichever timer is still pending.
      const finish = () => {
        clearTimeout(quietTimer);
        clearTimeout(hardTimer);
        observer.disconnect();
        resolve();
      };

      const observer = new MutationObserver(() => {
        // Every mutation restarts the quiet period.
        clearTimeout(quietTimer);
        quietTimer = setTimeout(finish, stableMs);
      });

      observer.observe(root, {
        childList: true,
        subtree: true,
      });

      quietTimer = setTimeout(finish, stableMs);
      hardTimer = setTimeout(finish, timeoutMs);
    });
  }, { stableMs, timeoutMs });
}
|
|
184
|
+
|
|
185
|
+
/**
 * Auto-scroll to the bottom of the page (for lazy-loaded content).
 *
 * Scrolls down by `distance` pixels every `intervalMs` until the scrolled
 * total reaches the document's scroll height, or until `timeoutMs` has
 * passed. The scroll height is re-read on every step.
 *
 * @param {object} page - Page object providing evaluate()
 * @param {object} [settings]
 * @param {number} [settings.distance] - Pixels per step (default: 400)
 * @param {number} [settings.intervalMs] - Delay between steps in ms (default: 100)
 * @param {number} [settings.timeoutMs] - Upper bound for the whole scroll in ms (default: 10000)
 * @returns {Promise<void>}
 */
async function autoScroll(page, { distance = 400, intervalMs = 100, timeoutMs = 10000 } = {}) {
  await page.evaluate(
    ({ distance, intervalMs, timeoutMs }) =>
      new Promise((resolve) => {
        let scrolled = 0;
        let ticker = null;
        let guard = null;

        // Single exit path so neither timer outlives the scroll.
        const finish = () => {
          clearInterval(ticker);
          clearTimeout(guard);
          resolve();
        };

        ticker = setInterval(() => {
          // document.body is null when the document has no body element.
          const root = document.body || document.documentElement;
          const scrollHeight = root ? root.scrollHeight : 0;
          window.scrollBy(0, distance);
          scrolled += distance;
          if (scrolled >= scrollHeight) {
            finish();
          }
        }, intervalMs);

        guard = setTimeout(finish, timeoutMs);
      }),
    { distance, intervalMs, timeoutMs },
  );
}
|
|
209
|
+
|
|
210
|
+
/**
 * Close the shared browser, if one is running.
 *
 * The module-level reference is cleared before close() is awaited. That way
 * a rejected close() cannot leave a stale browser behind, and overlapping
 * calls do not close the same browser twice.
 *
 * @returns {Promise<void>}
 */
export async function closeBrowser() {
  if (!browser) return;

  const closing = browser;
  browser = null;
  await closing.close();
}
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
// src/structured-extract.mjs
|
|
2
|
+
// Extract structured data (JSON-LD, Open Graph) from HTML
|
|
3
|
+
// If schema match coverage is sufficient, return result without LLM
|
|
4
|
+
|
|
5
|
+
import { JSDOM } from "jsdom";
|
|
6
|
+
|
|
7
|
+
/**
 * Extract JSON-LD data from HTML.
 *
 * Parses every script element of type application/ld+json. Arrays and
 * "@graph" containers are expanded into their entries, also when they are
 * nested inside each other. Script contents that are not valid JSON are
 * skipped, and so are entries that are not objects (null, numbers, strings),
 * because callers treat every returned item as an object.
 *
 * @param {string} html - Raw HTML
 * @returns {object[]} Array of JSON-LD objects
 */
function extractJsonLd(html) {
  const dom = new JSDOM(html);
  const scripts = dom.window.document.querySelectorAll('script[type="application/ld+json"]');
  const results = [];

  // Append every object reachable through arrays and @graph containers.
  const collect = (node) => {
    if (Array.isArray(node)) {
      for (const entry of node) collect(entry);
      return;
    }
    if (node === null || typeof node !== "object") return;

    if (Array.isArray(node["@graph"])) {
      collect(node["@graph"]);
    } else {
      results.push(node);
    }
  };

  for (const script of scripts) {
    let data;
    try {
      data = JSON.parse(script.textContent);
    } catch {
      // Ignore JSON parse failures
      continue;
    }
    collect(data);
  }

  return results;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Extract Open Graph meta tags from HTML.
 *
 * Besides og:* tags this also collects twitter:* tags (given via name or
 * property), the description meta tag as "meta:description" and the document
 * title as "meta:title". Entries without a non-empty value are left out for
 * every source, so an empty tag never shows up as a key.
 *
 * @param {string} html - Raw HTML
 * @returns {object} OG property dictionary { "og:title": "...", ... }
 */
function extractOpenGraph(html) {
  const dom = new JSDOM(html);
  const doc = dom.window.document;
  const og = {};

  // og:* meta tags
  for (const meta of doc.querySelectorAll('meta[property^="og:"]')) {
    const prop = meta.getAttribute("property");
    const content = meta.getAttribute("content");
    if (prop && content) {
      og[prop] = content;
    }
  }

  // twitter:* meta tags
  for (const meta of doc.querySelectorAll('meta[name^="twitter:"], meta[property^="twitter:"]')) {
    const prop = meta.getAttribute("name") || meta.getAttribute("property");
    const content = meta.getAttribute("content");
    if (prop && content) {
      og[prop] = content;
    }
  }

  // Basic meta tags; getAttribute() returns null when "content" is missing.
  const descMeta = doc.querySelector('meta[name="description"]');
  const description = descMeta ? descMeta.getAttribute("content") : null;
  if (description) og["meta:description"] = description;

  const titleEl = doc.querySelector("title");
  const title = titleEl ? titleEl.textContent.trim() : "";
  if (title) og["meta:title"] = title;

  return og;
}
|
|
75
|
+
|
|
76
|
+
/**
 * Flatten JSON-LD data into key-value pairs.
 *
 * Nested objects become "parent.child" keys. Keys starting with "@" (@type,
 * @context, ...) are skipped at every level. Arrays and null values are kept
 * as-is under their key. Input that is not an object (null, undefined,
 * strings, numbers) yields an empty result instead of throwing.
 *
 * @param {object} obj - Object to flatten
 * @param {string} prefix - Key prefix for nested levels (default: "")
 * @returns {object} Flat key-value dictionary
 */
function flattenObject(obj, prefix = "") {
  const result = {};

  // Object.entries() throws on null/undefined and splits strings into characters.
  if (obj === null || typeof obj !== "object") return result;

  for (const [key, value] of Object.entries(obj)) {
    if (key.startsWith("@")) continue; // Skip @type, @context, etc.

    const fullKey = prefix ? `${prefix}.${key}` : key;

    if (value && typeof value === "object" && !Array.isArray(value)) {
      Object.assign(result, flattenObject(value, fullKey));
    } else {
      result[fullKey] = value;
    }
  }

  return result;
}
|
|
97
|
+
|
|
98
|
+
/**
 * Match structured data against a schema.
 *
 * For each schema key a value is looked up in three steps, first hit wins:
 *   1. a data key identical to the schema key,
 *   2. an alias group whose name occurs in the normalized schema key
 *      (lower-cased, "_" and "-" removed),
 *   3. a data key whose normalized form contains the normalized schema key,
 *      or the other way round.
 * Only the keys of `schema` are used; its values are ignored. Steps 1 and 2
 * only consider own properties of the data, so schema keys such as
 * "constructor" do not match inherited members.
 *
 * NOTE(review): alias groups are tried in declaration order, so a key like
 * "priceCurrency" reaches the "price" group before "currency" - confirm
 * whether that is intended.
 *
 * @param {object} structuredData - Flattened structured data
 * @param {object} schema - User-defined schema
 * @returns {{ matched: object, coverage: number }} Match results and coverage ratio
 */
function matchToSchema(structuredData, schema) {
  const schemaKeys = Object.keys(schema);
  const matched = {};
  let matchCount = 0;

  // Key name alias map (common mapping patterns)
  const keyAliases = {
    title: ["name", "headline", "og:title", "twitter:title", "meta:title"],
    description: ["description", "abstract", "og:description", "twitter:description", "meta:description"],
    price: ["price", "offers.price", "offers.lowPrice"],
    currency: ["priceCurrency", "offers.priceCurrency"],
    image: ["image", "thumbnailUrl", "og:image", "twitter:image"],
    url: ["url", "mainEntityOfPage", "og:url"],
    author: ["author.name", "author", "creator"],
    date: ["datePublished", "dateCreated", "dateModified"],
    published: ["datePublished", "dateCreated"],
    brand: ["brand.name", "brand"],
    category: ["category", "articleSection"],
    rating: ["aggregateRating.ratingValue"],
    reviewCount: ["aggregateRating.reviewCount"],
    availability: ["offers.availability"],
    language: ["inLanguage"],
  };

  // Own-property read; inherited members such as "constructor" count as absent.
  const lookup = (key) =>
    Object.prototype.hasOwnProperty.call(structuredData, key) ? structuredData[key] : undefined;

  // First defined value among the aliases of every group named in the key.
  const lookupAlias = (normalizedKey) => {
    for (const [aliasGroup, aliases] of Object.entries(keyAliases)) {
      // The schema key is lower-cased, so the group name has to be as well
      // (otherwise "reviewCount" could never match).
      if (!normalizedKey.includes(aliasGroup.toLowerCase())) continue;
      for (const alias of aliases) {
        const value = lookup(alias);
        if (value !== undefined) return value;
      }
    }
    return undefined;
  };

  for (const schemaKey of schemaKeys) {
    const normalizedKey = schemaKey.toLowerCase().replace(/[_-]/g, "");

    // 1. Direct key match
    const direct = lookup(schemaKey);
    if (direct !== undefined) {
      matched[schemaKey] = direct;
      matchCount++;
      continue;
    }

    // 2. Alias match
    const aliased = lookupAlias(normalizedKey);
    if (aliased !== undefined) {
      matched[schemaKey] = aliased;
      matchCount++;
      continue;
    }

    // 3. Partial match (structured data key contains schema key or vice versa).
    // An empty normalized key is contained in every string, so it is excluded
    // on both sides.
    if (normalizedKey === "") continue;
    for (const [dataKey, dataValue] of Object.entries(structuredData)) {
      const normalizedDataKey = dataKey.toLowerCase().replace(/[_.-]/g, "");
      if (normalizedDataKey === "") continue;
      if (normalizedDataKey.includes(normalizedKey) || normalizedKey.includes(normalizedDataKey)) {
        matched[schemaKey] = dataValue;
        matchCount++;
        break;
      }
    }
  }

  return {
    matched,
    coverage: schemaKeys.length > 0 ? matchCount / schemaKeys.length : 0,
  };
}
|
|
172
|
+
|
|
173
|
+
/**
 * Extract structured data from HTML and match it against a schema.
 *
 * Sources are tried in order - JSON-LD, Open Graph, then both merged (Open
 * Graph values overriding JSON-LD ones) - and the first one whose coverage
 * reaches the threshold is returned, so no LLM call is needed. JSON-LD items
 * that are not objects are ignored rather than aborting the extraction.
 *
 * @param {string} html - Raw HTML
 * @param {object} schema - User-defined schema
 * @param {object} options
 * @param {number} options.minCoverage - Minimum coverage ratio (0-1, default: 0.5)
 * @returns {{ extracted: object, source: string, coverage: number } | null}
 *   null when no source reaches the threshold (caller falls back to the LLM)
 */
export function tryStructuredExtract(html, schema, options = {}) {
  const { minCoverage = 0.5 } = options;

  // Match one data set against the schema; null when coverage is too low.
  const attempt = (data, source) => {
    const { matched, coverage } = matchToSchema(data, schema);
    return coverage >= minCoverage ? { extracted: matched, source, coverage } : null;
  };

  // 1. Try extraction from JSON-LD.
  // Flattening a null item would throw, so non-object items are dropped first.
  const jsonLdItems = extractJsonLd(html).filter(
    (item) => item !== null && typeof item === "object",
  );
  const hasJsonLd = jsonLdItems.length > 0;

  // Merge and flatten all JSON-LD items once; later items override earlier ones.
  const jsonLdData = {};
  for (const item of jsonLdItems) {
    Object.assign(jsonLdData, flattenObject(item));
  }

  if (hasJsonLd) {
    const fromJsonLd = attempt(jsonLdData, "json-ld");
    if (fromJsonLd) return fromJsonLd;
  }

  // 2. Try extraction from Open Graph
  const ogData = extractOpenGraph(html);
  const hasOg = Object.keys(ogData).length > 0;

  if (hasOg) {
    const fromOg = attempt(ogData, "open-graph");
    if (fromOg) return fromOg;
  }

  // 3. Merge JSON-LD + OG and retry
  if (hasJsonLd && hasOg) {
    const fromBoth = attempt({ ...jsonLdData, ...ogData }, "json-ld+open-graph");
    if (fromBoth) return fromBoth;
  }

  // Coverage insufficient -> null (fallback to LLM)
  return null;
}
|