@olib-ai/owl-browser-sdk 2.0.4 → 2.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +107 -0
- package/dist/extraction/content-cleaner.d.ts +40 -0
- package/dist/extraction/content-cleaner.d.ts.map +1 -0
- package/dist/extraction/content-cleaner.js +393 -0
- package/dist/extraction/content-cleaner.js.map +1 -0
- package/dist/extraction/extractor.d.ts +139 -0
- package/dist/extraction/extractor.d.ts.map +1 -0
- package/dist/extraction/extractor.js +212 -0
- package/dist/extraction/extractor.js.map +1 -0
- package/dist/extraction/html-processor.d.ts +75 -0
- package/dist/extraction/html-processor.d.ts.map +1 -0
- package/dist/extraction/html-processor.js +192 -0
- package/dist/extraction/html-processor.js.map +1 -0
- package/dist/extraction/index.d.ts +14 -0
- package/dist/extraction/index.d.ts.map +1 -0
- package/dist/extraction/index.js +19 -0
- package/dist/extraction/index.js.map +1 -0
- package/dist/extraction/list-extractor.d.ts +24 -0
- package/dist/extraction/list-extractor.d.ts.map +1 -0
- package/dist/extraction/list-extractor.js +303 -0
- package/dist/extraction/list-extractor.js.map +1 -0
- package/dist/extraction/meta-extractor.d.ts +40 -0
- package/dist/extraction/meta-extractor.d.ts.map +1 -0
- package/dist/extraction/meta-extractor.js +216 -0
- package/dist/extraction/meta-extractor.js.map +1 -0
- package/dist/extraction/pagination.d.ts +29 -0
- package/dist/extraction/pagination.d.ts.map +1 -0
- package/dist/extraction/pagination.js +323 -0
- package/dist/extraction/pagination.js.map +1 -0
- package/dist/extraction/pattern-detector.d.ts +16 -0
- package/dist/extraction/pattern-detector.d.ts.map +1 -0
- package/dist/extraction/pattern-detector.js +390 -0
- package/dist/extraction/pattern-detector.js.map +1 -0
- package/dist/extraction/scrape-session.d.ts +23 -0
- package/dist/extraction/scrape-session.d.ts.map +1 -0
- package/dist/extraction/scrape-session.js +192 -0
- package/dist/extraction/scrape-session.js.map +1 -0
- package/dist/extraction/selector-engine.d.ts +23 -0
- package/dist/extraction/selector-engine.d.ts.map +1 -0
- package/dist/extraction/selector-engine.js +127 -0
- package/dist/extraction/selector-engine.js.map +1 -0
- package/dist/extraction/table-extractor.d.ts +29 -0
- package/dist/extraction/table-extractor.d.ts.map +1 -0
- package/dist/extraction/table-extractor.js +282 -0
- package/dist/extraction/table-extractor.js.map +1 -0
- package/dist/extraction/transforms.d.ts +47 -0
- package/dist/extraction/transforms.d.ts.map +1 -0
- package/dist/extraction/transforms.js +277 -0
- package/dist/extraction/transforms.js.map +1 -0
- package/dist/extraction/types.d.ts +199 -0
- package/dist/extraction/types.d.ts.map +1 -0
- package/dist/extraction/types.js +5 -0
- package/dist/extraction/types.js.map +1 -0
- package/dist/flow/executor.js +1 -1
- package/dist/flow/executor.js.map +1 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/playwright/browser-type.d.ts +101 -0
- package/dist/playwright/browser-type.d.ts.map +1 -0
- package/dist/playwright/browser-type.js +134 -0
- package/dist/playwright/browser-type.js.map +1 -0
- package/dist/playwright/browser.d.ts +98 -0
- package/dist/playwright/browser.d.ts.map +1 -0
- package/dist/playwright/browser.js +229 -0
- package/dist/playwright/browser.js.map +1 -0
- package/dist/playwright/context.d.ts +211 -0
- package/dist/playwright/context.d.ts.map +1 -0
- package/dist/playwright/context.js +466 -0
- package/dist/playwright/context.js.map +1 -0
- package/dist/playwright/extractor.d.ts +108 -0
- package/dist/playwright/extractor.d.ts.map +1 -0
- package/dist/playwright/extractor.js +404 -0
- package/dist/playwright/extractor.js.map +1 -0
- package/dist/playwright/frame.d.ts +147 -0
- package/dist/playwright/frame.d.ts.map +1 -0
- package/dist/playwright/frame.js +492 -0
- package/dist/playwright/frame.js.map +1 -0
- package/dist/playwright/index.d.ts +163 -0
- package/dist/playwright/index.d.ts.map +1 -0
- package/dist/playwright/index.js +313 -0
- package/dist/playwright/index.js.map +1 -0
- package/dist/playwright/keyboard.d.ts +74 -0
- package/dist/playwright/keyboard.d.ts.map +1 -0
- package/dist/playwright/keyboard.js +187 -0
- package/dist/playwright/keyboard.js.map +1 -0
- package/dist/playwright/locator.d.ts +237 -0
- package/dist/playwright/locator.d.ts.map +1 -0
- package/dist/playwright/locator.js +646 -0
- package/dist/playwright/locator.js.map +1 -0
- package/dist/playwright/mouse.d.ts +82 -0
- package/dist/playwright/mouse.d.ts.map +1 -0
- package/dist/playwright/mouse.js +137 -0
- package/dist/playwright/mouse.js.map +1 -0
- package/dist/playwright/page-helpers.d.ts +261 -0
- package/dist/playwright/page-helpers.d.ts.map +1 -0
- package/dist/playwright/page-helpers.js +423 -0
- package/dist/playwright/page-helpers.js.map +1 -0
- package/dist/playwright/page.d.ts +566 -0
- package/dist/playwright/page.d.ts.map +1 -0
- package/dist/playwright/page.js +1476 -0
- package/dist/playwright/page.js.map +1 -0
- package/dist/playwright/response.d.ts +100 -0
- package/dist/playwright/response.d.ts.map +1 -0
- package/dist/playwright/response.js +194 -0
- package/dist/playwright/response.js.map +1 -0
- package/dist/playwright/types.d.ts +354 -0
- package/dist/playwright/types.d.ts.map +1 -0
- package/dist/playwright/types.js +8 -0
- package/dist/playwright/types.js.map +1 -0
- package/openapi.json +343 -36
- package/package.json +10 -1
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Zero-config auto pattern discovery via pure DOM analysis.
|
|
3
|
+
*
|
|
4
|
+
* Finds repeating DOM structures without any AI — uses tag+class frequency,
|
|
5
|
+
* child consistency scoring, and semantic field inference.
|
|
6
|
+
*/
|
|
7
|
+
import { load } from 'cheerio';
|
|
8
|
+
import { extractAll } from './selector-engine.js';
|
|
9
|
+
import { analyzeStructure } from './list-extractor.js';
|
|
10
|
+
/**
 * Detect repeating patterns on the page.
 *
 * Candidates produced by findCandidates are scored, filtered against the
 * thresholds, given inferred fields, and sampled. A candidate whose
 * processing throws is dropped rather than failing the whole run.
 *
 * @param html - Markup handed to cheerio's `load`.
 * @param {object} [options] - Optional settings.
 * @param {number} [options.minCount=3] - Fewest items a group needs to qualify.
 * @param {number} [options.minConfidence=0.5] - Lowest score a group needs to qualify.
 * @param {string} [options.region] - Selector; the search is limited to its first match
 *   (defaults to `body`).
 * @returns {Array<object>} Patterns ordered by confidence, then by item count, both descending.
 */
export function detect(html, options) {
    const countThreshold = options?.minCount ?? 3;
    const confidenceFloor = options?.minConfidence ?? 0.5;
    const scope = options?.region;
    const $ = load(html);
    const root = scope ? $(scope).first() : $('body');
    if (root.length === 0) {
        return [];
    }
    // Turn one candidate into a pattern, or null when it does not qualify.
    const toPattern = ({ containerSelector, itemSelector, items }) => {
        if (items.length < countThreshold) {
            return null;
        }
        const confidence = scoreCandidate($, items);
        if (confidence < confidenceFloor) {
            return null;
        }
        // Field inference is delegated to analyzeStructure; no fields means nothing to extract.
        const fields = analyzeStructure(html, containerSelector, itemSelector);
        if (Object.keys(fields).length === 0) {
            return null;
        }
        const fullSelector = `${containerSelector} ${itemSelector}`;
        return {
            containerSelector,
            itemSelector: fullSelector,
            itemCount: items.length,
            confidence,
            fields,
            // Only the first 3 extracted items are kept as a preview.
            sample: extractAll(html, fullSelector, fields).slice(0, 3),
        };
    };
    const patterns = [];
    for (const candidate of findCandidates($, root, countThreshold)) {
        try {
            const pattern = toPattern(candidate);
            if (pattern !== null) {
                patterns.push(pattern);
            }
        }
        catch {
            // Skip candidates with invalid selectors (e.g. special chars in class names)
        }
    }
    // Highest confidence first; ties are broken by the larger item count.
    patterns.sort((left, right) => (right.confidence !== left.confidence
        ? right.confidence - left.confidence
        : right.itemCount - left.itemCount));
    return patterns;
}
|
|
62
|
+
/**
 * Detect patterns and immediately extract the best one.
 *
 * Takes the same arguments as `detect`.
 *
 * @returns The result of `extractAll` for the top-ranked pattern, or an empty
 *   array when `detect` finds nothing.
 */
export function detectAndExtract(html, options) {
    // detect() returns patterns already sorted, so the first entry is the best.
    const [topPattern] = detect(html, options);
    if (topPattern === undefined) {
        return [];
    }
    return extractAll(html, topPattern.itemSelector, topPattern.fields);
}
|
|
72
|
+
// --- Utility class detection for multi-class matching ---
// Class names that begin with one of these prefixes followed by a hyphen are treated as utility classes.
const UTILITY_PREFIXES = /^(p|m|w|h|min|max|flex|grid|text|bg|border|rounded|shadow|gap|space|overflow|z|opacity|cursor|font|leading|tracking|transition|duration|ease|scale|rotate|translate|skew|origin|ring|outline|decoration|placeholder|divide|sr)-/;
// Exact class names that are treated as utility classes.
const UTILITY_WORDS = new Set(['flex', 'block', 'hidden', 'relative', 'absolute', 'fixed', 'sticky', 'inline', 'grid', 'table', 'contents', 'static', 'visible', 'invisible', 'isolate', 'truncate', 'antialiased', 'italic', 'underline', 'uppercase', 'lowercase', 'capitalize', 'ordinal']);
/**
 * Decide whether a class name is worth using in a selector.
 *
 * @param {string} cls - A single class name.
 * @returns {boolean} `false` for names of at most one character, names matching
 *   UTILITY_PREFIXES, and names listed in UTILITY_WORDS; `true` otherwise.
 */
function isSemanticClass(cls) {
    return cls.length > 1
        && !UTILITY_PREFIXES.test(cls)
        && !UTILITY_WORDS.has(cls);
}
|
|
84
|
+
/**
 * Collect groups of repeating sibling elements that may represent list items.
 *
 * Five strategies run in order and share one de-duplication set:
 *  1. children of any element, grouped by a tag + class key;
 *  2. `ul`/`ol` with `li` children and `tbody` with `tr` children;
 *  3. `div`/`span`/`li` children whose first child is an element, grouped by
 *     tag + semantic classes;
 *  4. siblings sharing one of a fixed list of ARIA roles;
 *  5. siblings carrying one of a fixed list of data attributes.
 *
 * @param $ - Cheerio instance for the loaded document.
 * @param root - Cheerio selection whose descendants are searched.
 * @param {number} minCount - Fewest matching siblings required for a group.
 * @returns {Array<{containerSelector: string, itemSelector: string, items: Array}>}
 *   Candidates in discovery order; `items` holds one Cheerio wrapper per element.
 */
function findCandidates($, root, minCount) {
    const candidates = [];
    const seen = new Set();
    // Record a candidate unless one with the same de-duplication key exists already.
    const addCandidate = (dedupeKey, containerSelector, itemKey, nodes) => {
        if (seen.has(dedupeKey))
            return;
        seen.add(dedupeKey);
        candidates.push({
            containerSelector,
            itemSelector: `> ${itemKey}`,
            items: nodes.map(node => $(node)),
        });
    };
    // Build the "tag.class1.class2" key for a node from up to 3 semantic classes.
    // With `fallBackToFirstClass`, a node that has only utility classes uses its first class.
    const itemKeyFor = (node, fallBackToFirstClass) => {
        const tag = node.tagName.toLowerCase();
        const allClasses = $(node).attr('class')?.split(/\s+/).filter(Boolean) ?? [];
        const semanticClasses = allClasses.filter(isSemanticClass).slice(0, 3);
        if (semanticClasses.length > 0)
            return `${tag}.${semanticClasses.map(cssEscape).join('.')}`;
        if (fallBackToFirstClass && allClasses[0])
            return `${tag}.${cssEscape(allClasses[0])}`;
        return tag;
    };
    // Append a node to the group stored under `key`, creating the group on first use.
    const addToGroup = (groups, key, node) => {
        const group = groups.get(key);
        if (group) {
            group.push(node);
        }
        else {
            groups.set(key, [node]);
        }
    };
    // Copy the raw nodes of a Cheerio selection into a plain array.
    const nodesOf = (selection) => {
        const nodes = [];
        selection.each((_i, node) => { nodes.push(node); });
        return nodes;
    };
    // Emit one candidate per group that reaches minCount. The container selector
    // is the same for every group, so it is built at most once.
    const emitGroups = (groups, $container, keyInfix) => {
        let containerSel;
        for (const [itemKey, nodes] of groups) {
            if (nodes.length < minCount)
                continue;
            if (containerSel === undefined)
                containerSel = buildSelector($, $container);
            addCandidate(`${containerSel}|${keyInfix}${itemKey}`, containerSel, itemKey, nodes);
        }
    };
    // Strategy 1: Find elements with multiple same-tag+class children
    root.find('*').each((_i, el) => {
        if (el.type !== 'tag')
            return;
        const $el = $(el);
        const children = $el.children();
        if (children.length < minCount)
            return;
        const groups = new Map();
        children.each((_j, child) => {
            if (child.type !== 'tag')
                return;
            addToGroup(groups, itemKeyFor(child, true), child);
        });
        emitGroups(groups, $el, '');
    });
    // Strategy 2: Look for common list patterns (ul/ol with li, tbody with tr)
    for (const [containerQuery, childTag] of [['ul, ol', 'li'], ['tbody', 'tr']]) {
        root.find(containerQuery).each((_i, el) => {
            const $el = $(el);
            const rows = $el.children(childTag);
            if (rows.length < minCount)
                return;
            const containerSel = buildSelector($, $el);
            addCandidate(`${containerSel}|${childTag}`, containerSel, childTag, nodesOf(rows));
        });
    }
    // Strategy 3: generic wrapper children (div/span/li) that contain markup of their own.
    // The wrapper itself is the item; its key uses semantic classes only (no first-class fallback).
    root.find('*').each((_i, el) => {
        if (el.type !== 'tag')
            return;
        const $el = $(el);
        const wrappers = $el.children().filter((_j, child) => {
            if (child.type !== 'tag')
                return false;
            const tag = child.tagName.toLowerCase();
            return tag === 'div' || tag === 'span' || tag === 'li';
        });
        if (wrappers.length < minCount)
            return;
        const groups = new Map();
        wrappers.each((_j, wrapper) => {
            // Wrappers without children, or whose first child is not a 'tag' node, are ignored.
            const firstGrand = $(wrapper).children()[0];
            if (!firstGrand || firstGrand.type !== 'tag')
                return;
            addToGroup(groups, itemKeyFor(wrapper, false), wrapper);
        });
        // The "deep|" infix keeps these keys separate from Strategy 1's keys.
        emitGroups(groups, $el, 'deep|');
    });
    // Strategy 4: ARIA role detection
    const ariaRoles = ['listitem', 'article', 'row', 'option', 'menuitem', 'tab'];
    for (const role of ariaRoles) {
        const itemKey = `[role="${role}"]`;
        // Every element under the same parent yields the same sibling set, so each
        // parent only needs to be examined once per role.
        const visitedParents = new Set();
        root.find(itemKey).each((_i, el) => {
            if (el.type !== 'tag')
                return;
            const parent = $(el).parent();
            if (parent.length === 0)
                return;
            if (visitedParents.has(parent[0]))
                return;
            visitedParents.add(parent[0]);
            const siblings = parent.find(`> ${itemKey}`);
            if (siblings.length < minCount)
                return;
            const containerSel = buildSelector($, parent);
            addCandidate(`${containerSel}|${itemKey}`, containerSel, itemKey, nodesOf(siblings));
        });
    }
    // Strategy 5: Data-attribute patterns
    const dataAttrs = ['data-testid', 'data-id', 'data-index', 'data-item', 'data-row'];
    for (const attr of dataAttrs) {
        const itemKey = `[${attr}]`;
        const attrEls = root.find(itemKey);
        if (attrEls.length < minCount)
            continue;
        // Group by the parent's selector; the selector is cached per parent node so
        // it is built once per parent instead of once per element.
        const selectorByParent = new Map();
        const groups = new Map();
        attrEls.each((_i, el) => {
            if (el.type !== 'tag')
                return;
            const parent = $(el).parent();
            let containerSel = selectorByParent.get(parent[0]);
            if (containerSel === undefined) {
                containerSel = buildSelector($, parent);
                selectorByParent.set(parent[0], containerSel);
            }
            addToGroup(groups, containerSel, el);
        });
        for (const [containerSel, nodes] of groups) {
            if (nodes.length < minCount)
                continue;
            addCandidate(`${containerSel}|${itemKey}`, containerSel, itemKey, nodes);
        }
    }
    return candidates;
}
|
|
288
|
+
/**
 * Score a candidate by child consistency and content richness.
 *
 * Seven weighted signals are summed. Their weights add up to 1.2, so the sum
 * is clamped to 1 to keep the returned confidence inside [0, 1].
 *
 * @param $ - Cheerio instance (not used by the scoring itself).
 * @param {Array} items - Cheerio wrappers, one per candidate item.
 * @returns {number} Confidence rounded to two decimals, between 0 and 1;
 *   0 when `items` is empty.
 */
function scoreCandidate($, items) {
    if (items.length === 0)
        return 0;
    // Fraction of items for which the predicate is truthy.
    const shareOf = (predicate) => items.filter(predicate).length / items.length;
    let score = 0;
    // 1. Child tag structure consistency (0-0.4): share of items whose ordered
    //    list of child tag names equals the most common one.
    const structures = items.map(item => {
        const tags = [];
        item.children().each((_i, el) => {
            if (el.type === 'tag') {
                tags.push(el.tagName.toLowerCase());
            }
        });
        return tags.join(',');
    });
    const mostCommon = mode(structures);
    const consistency = structures.filter(s => s === mostCommon).length / structures.length;
    score += consistency * 0.4;
    // 2. Share of items with more than 10 characters of trimmed text (0-0.2)
    score += shareOf(item => item.text().trim().length > 10) * 0.2;
    // 3. Presence of links (0-0.15)
    score += shareOf(item => item.find('a').length > 0) * 0.15;
    // 4. Presence of images (0-0.1)
    score += shareOf(item => item.find('img').length > 0) * 0.1;
    // 5. Item count bonus (0-0.15), reaching its maximum at 20 items
    score += Math.min(items.length / 20, 1) * 0.15;
    // 6. ARIA role consistency (0-0.1)
    score += shareOf(item => item.attr('role')) * 0.1;
    // 7. Data-attribute consistency (0-0.1)
    score += shareOf(item => {
        const node = item[0];
        if (!node || node.type !== 'tag')
            return false;
        const attribs = node.attribs ?? {};
        return Object.keys(attribs).some(k => k.startsWith('data-'));
    }) * 0.1;
    // Clamp: an item set that maxes out every signal would otherwise report 1.2.
    return Math.min(1, Math.round(score * 100) / 100);
}
|
|
336
|
+
/**
 * Escape a string so it can be used as a CSS identifier (class name, id).
 *
 * Follows the CSSOM "serialize an identifier" rules (the algorithm behind
 * `CSS.escape`):
 *  - ASCII letters, digits, `-`, `_` and non-ASCII characters pass through;
 *  - a digit in first position, or in second position after a leading `-`,
 *    is written as a hexadecimal escape followed by a space (`2xl` -> `\32 xl`);
 *  - control characters are written as hexadecimal escapes, NUL becomes U+FFFD;
 *  - a lone `-` and every other character (`:`, `/`, `[`, space, ...) get a
 *    backslash prefix.
 *
 * @param {string} value - Raw identifier text.
 * @returns {string} The escaped identifier.
 */
function cssEscape(value) {
    const input = String(value);
    // A single hyphen is not a valid identifier on its own.
    if (input === '-')
        return '\\-';
    const startsWithHyphen = input.charCodeAt(0) === 0x2d;
    let escaped = '';
    for (let i = 0; i < input.length; i++) {
        const unit = input.charCodeAt(i);
        if (unit === 0x00) {
            escaped += '\uFFFD';
            continue;
        }
        const isDigit = unit >= 0x30 && unit <= 0x39;
        const isControl = unit <= 0x1f || unit === 0x7f;
        // Identifiers may not start with a digit, nor with "-" followed by a digit.
        const isLeadingDigit = isDigit && (i === 0 || (i === 1 && startsWithHyphen));
        if (isControl || isLeadingDigit) {
            // Hex escapes are terminated by a space so following hex digits are not absorbed.
            escaped += `\\${unit.toString(16)} `;
            continue;
        }
        const isLetter = (unit >= 0x41 && unit <= 0x5a) || (unit >= 0x61 && unit <= 0x7a);
        const isNameChar = unit >= 0x80 || unit === 0x2d || unit === 0x5f || isDigit || isLetter;
        escaped += isNameChar ? input.charAt(i) : `\\${input.charAt(i)}`;
    }
    return escaped;
}
|
|
343
|
+
/**
 * Build a CSS selector for an element.
 *
 * Resolution order:
 *  1. `tag#id` when the element has an id;
 *  2. `tag.class1.class2` (or the bare tag) when that matches at most one
 *     element in the document;
 *  3. otherwise a child-combinator path: each step carries `:nth-of-type(k)`
 *     when the element has same-tag siblings (k is its position among them),
 *     and ancestors are prepended until the path matches exactly one element,
 *     an ancestor with an id is reached, or there are no more ancestors.
 *
 * If querying a generated selector throws, the plain `tag.class` form is returned.
 *
 * @param $ - Cheerio instance used to test selector uniqueness.
 * @param el - Cheerio selection; only its first node is considered.
 * @returns {string} The selector, or `*` when `el` is empty or its first node is not a 'tag' node.
 */
function buildSelector($, el) {
    const node = el[0];
    if (!node || node.type !== 'tag')
        return '*';
    const own = describeElement(el);
    if (own.hasId)
        return own.selector;
    try {
        if ($(own.selector).length <= 1)
            return own.selector;
        // Not unique: walk up the tree, building "ancestor > ... > element".
        const segments = [];
        let current = el;
        while (current.length > 0 && current[0].type === 'tag') {
            const part = describeElement(current);
            if (part.hasId) {
                // An id anchors the path; no need to climb further.
                segments.unshift(part.selector);
                break;
            }
            segments.unshift(withSiblingPosition(current, part.selector));
            if ($(segments.join(' > ')).length === 1)
                break;
            current = current.parent();
        }
        return segments.join(' > ');
    }
    catch {
        // Skip uniqueness handling on invalid selectors
        return own.selector;
    }
}
/**
 * Describe one element as `tag#id`, `tag.class1.class2` or the bare tag.
 * Up to two semantic classes are used; if there are none, the first two classes are.
 */
function describeElement(el) {
    const tag = el[0].tagName.toLowerCase();
    const id = el.attr('id');
    if (id)
        return { selector: `${tag}#${cssEscape(id)}`, hasId: true };
    const allClasses = el.attr('class')?.split(/\s+/).filter(Boolean) ?? [];
    const semanticClasses = allClasses.filter(isSemanticClass).slice(0, 2);
    const chosen = semanticClasses.length > 0 ? semanticClasses : allClasses.slice(0, 2);
    const cls = chosen.map(cssEscape).join('.');
    return { selector: cls ? `${tag}.${cls}` : tag, hasId: false };
}
/**
 * Append `:nth-of-type(k)` when the element shares its parent with other
 * elements of the same tag; k is the 1-based position among those siblings.
 */
function withSiblingPosition(el, selector) {
    const tagName = el[0].tagName;
    const sameTagSiblings = el.parent().children().filter((_i, sibling) => sibling.type === 'tag' && sibling.tagName === tagName);
    if (sameTagSiblings.length <= 1)
        return selector;
    const position = sameTagSiblings.index(el);
    return position >= 0 ? `${selector}:nth-of-type(${position + 1})` : selector;
}
|
|
375
|
+
/**
 * Return the most frequent value of an iterable.
 *
 * @param arr - Values to tally (compared with Map key equality).
 * @returns The value with the highest count; on a tie, the one that appeared
 *   first. `undefined` when `arr` is empty.
 */
function mode(arr) {
    const tally = new Map();
    for (const entry of arr) {
        const previous = tally.has(entry) ? tally.get(entry) : 0;
        tally.set(entry, previous + 1);
    }
    // Map iteration follows insertion order, and the strict ">" keeps the
    // earlier entry when counts are equal.
    const [winner] = [...tally].reduce((leader, contender) => (contender[1] > leader[1] ? contender : leader), [undefined, 0]);
    return winner;
}
|
|
390
|
+
//# sourceMappingURL=pattern-detector.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pattern-detector.js","sourceRoot":"","sources":["../../src/extraction/pattern-detector.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,IAAI,EAAiC,MAAM,SAAS,CAAC;AAG9D,OAAO,EAAE,UAAU,EAAE,MAAM,sBAAsB,CAAC;AAClD,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAEvD;;GAEG;AACH,MAAM,UAAU,MAAM,CACpB,IAAY,EACZ,OAAuB;IAEvB,MAAM,QAAQ,GAAG,OAAO,EAAE,QAAQ,IAAI,CAAC,CAAC;IACxC,MAAM,aAAa,GAAG,OAAO,EAAE,aAAa,IAAI,GAAG,CAAC;IACpD,MAAM,MAAM,GAAG,OAAO,EAAE,MAAM,CAAC;IAE/B,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,MAAM,IAAI,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IACpD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEjC,gEAAgE;IAChE,MAAM,UAAU,GAAG,cAAc,CAAC,CAAC,EAAE,IAAI,EAAE,QAAQ,CAAC,CAAC;IAErD,4BAA4B;IAC5B,MAAM,QAAQ,GAAsB,EAAE,CAAC;IAEvC,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACnC,IAAI,CAAC;YACH,MAAM,EAAE,iBAAiB,EAAE,YAAY,EAAE,KAAK,EAAE,GAAG,SAAS,CAAC;YAC7D,IAAI,KAAK,CAAC,MAAM,GAAG,QAAQ;gBAAE,SAAS;YAEtC,MAAM,UAAU,GAAG,cAAc,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC;YAC5C,IAAI,UAAU,GAAG,aAAa;gBAAE,SAAS;YAEzC,+BAA+B;YAC/B,MAAM,MAAM,GAAG,gBAAgB,CAAC,IAAI,EAAE,iBAAiB,EAAE,YAAY,CAAC,CAAC;YACvE,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,KAAK,CAAC;gBAAE,SAAS;YAE/C,iCAAiC;YACjC,MAAM,YAAY,GAAG,GAAG,iBAAiB,IAAI,YAAY,EAAE,CAAC;YAC5D,MAAM,MAAM,GAAG,UAAU,CAAC,IAAI,EAAE,YAAY,EAAE,MAAM,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YAElE,QAAQ,CAAC,IAAI,CAAC;gBACZ,iBAAiB;gBACjB,YAAY,EAAE,YAAY;gBAC1B,SAAS,EAAE,KAAK,CAAC,MAAM;gBACvB,UAAU;gBACV,MAAM;gBACN,MAAM;aACP,CAAC,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACP,6EAA6E;YAC7E,SAAS;QACX,CAAC;IACH,CAAC;IAED,mDAAmD;IACnD,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACrB,IAAI,CAAC,CAAC,UAAU,KAAK,CAAC,CAAC,UAAU;YAAE,OAAO,CAAC,CAAC,UAAU,GAAG,CAAC,CAAC,UAAU,CAAC;QACtE,OAAO,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS,CAAC;IACnC,CAAC,CAAC,CAAC;IAEH,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,gBAAgB,CAC9B,IAAY,EACZ,OAAuB;IAEvB,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACvC,IAAI,QA
AQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAErC,MAAM,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAE,CAAC;IAC1B,OAAO,UAAU,CAAC,IAAI,EAAE,IAAI,CAAC,YAAY,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;AAC1D,CAAC;AAUD,2DAA2D;AAE3D,MAAM,gBAAgB,GAAG,iOAAiO,CAAC;AAC3P,MAAM,aAAa,GAAG,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,UAAU,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE,QAAQ,EAAE,SAAS,EAAE,WAAW,EAAE,SAAS,EAAE,UAAU,EAAE,aAAa,EAAE,QAAQ,EAAE,WAAW,EAAE,WAAW,EAAE,WAAW,EAAE,YAAY,EAAE,SAAS,CAAC,CAAC,CAAC;AAE/R,SAAS,eAAe,CAAC,GAAW;IAClC,IAAI,GAAG,CAAC,MAAM,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC;IAClC,IAAI,gBAAgB,CAAC,IAAI,CAAC,GAAG,CAAC;QAAE,OAAO,KAAK,CAAC;IAC7C,IAAI,aAAa,CAAC,GAAG,CAAC,GAAG,CAAC;QAAE,OAAO,KAAK,CAAC;IACzC,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,cAAc,CACrB,CAAa,EACb,IAAsB,EACtB,QAAgB;IAEhB,MAAM,UAAU,GAAgB,EAAE,CAAC;IACnC,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAE/B,kEAAkE;IAClE,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;QAC7B,IAAI,EAAE,CAAC,IAAI,KAAK,KAAK;YAAE,OAAO;QAC9B,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC;QAChC,IAAI,QAAQ,CAAC,MAAM,GAAG,QAAQ;YAAE,OAAO;QAEvC,uEAAuE;QACvE,MAAM,WAAW,GAAG,IAAI,GAAG,EAA+C,CAAC;QAC3E,QAAQ,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,KAAK,EAAE,EAAE;YAC1B,IAAI,KAAK,CAAC,IAAI,KAAK,KAAK;gBAAE,OAAO;YACjC,MAAM,GAAG,GAAI,KAA6B,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;YACjE,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;YAC9E,MAAM,eAAe,GAAG,UAAU,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YACvE,MAAM,GAAG,GAAG,eAAe,CAAC,MAAM,GAAG,CAAC;gBACpC,CAAC,CAAC,GAAG,GAAG,IAAI,eAAe,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE;gBACtD,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,GAAG,IAAI,SAAS,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;YACjE,MAAM,QAAQ,GAAG,WAAW,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACtC,IAAI,QAAQ,EAAE,CAAC;gBACb,QAAQ,CAAC,KAAK,EAAE,CAAC;gBACjB,QAAQ,CAAC,KAAK,CAAC,IA
AI,CAAC,KAAK,CAAC,CAAC;YAC7B,CAAC;iBAAM,CAAC;gBACN,WAAW,CAAC,GAAG,CAAC,GAAG,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;YACrD,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,KAAK,MAAM,CAAC,OAAO,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,IAAI,WAAW,EAAE,CAAC;YACtD,IAAI,KAAK,GAAG,QAAQ;gBAAE,SAAS;YAE/B,MAAM,YAAY,GAAG,aAAa,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YAC3C,MAAM,OAAO,GAAG,GAAG,YAAY,IAAI,OAAO,EAAE,CAAC;YAC7C,IAAI,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC;gBAAE,SAAS;YAChC,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;YAElB,UAAU,CAAC,IAAI,CAAC;gBACd,iBAAiB,EAAE,YAAY;gBAC/B,YAAY,EAAE,KAAK,OAAO,EAAE;gBAC5B,KAAK,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;aAC5B,CAAC,CAAC;QACL,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,2EAA2E;IAC3E,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;QAClC,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,GAAG,GAAG,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAC/B,IAAI,GAAG,CAAC,MAAM,IAAI,QAAQ,EAAE,CAAC;YAC3B,MAAM,GAAG,GAAG,aAAa,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YAClC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,KAAK,CAAC,EAAE,CAAC;gBAC3B,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,KAAK,CAAC,CAAC;gBACtB,MAAM,KAAK,GAAuB,EAAE,CAAC;gBACrC,GAAG,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC7C,UAAU,CAAC,IAAI,CAAC;oBACd,iBAAiB,EAAE,GAAG;oBACtB,YAAY,EAAE,MAAM;oBACpB,KAAK;iBACN,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;QACjC,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,GAAG,GAAG,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAC/B,IAAI,GAAG,CAAC,MAAM,IAAI,QAAQ,EAAE,CAAC;YAC3B,MAAM,GAAG,GAAG,aAAa,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YAClC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,KAAK,CAAC,EAAE,CAAC;gBAC3B,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,KAAK,CAAC,CAAC;gBACtB,MAAM,KAAK,GAAuB,EAAE,CAAC;gBACrC,GAAG,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC7C,UAA
U,CAAC,IAAI,CAAC;oBACd,iBAAiB,EAAE,GAAG;oBACtB,YAAY,EAAE,MAAM;oBACpB,KAAK;iBACN,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,0FAA0F;IAC1F,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;QAC7B,IAAI,EAAE,CAAC,IAAI,KAAK,KAAK;YAAE,OAAO;QAC9B,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC;QAChC,2FAA2F;QAC3F,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,KAAK,EAAE,EAAE;YAChD,IAAI,KAAK,CAAC,IAAI,KAAK,KAAK;gBAAE,OAAO,KAAK,CAAC;YACvC,MAAM,GAAG,GAAI,KAA6B,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;YACjE,OAAO,GAAG,KAAK,KAAK,IAAI,GAAG,KAAK,MAAM,IAAI,GAAG,KAAK,IAAI,CAAC;QACzD,CAAC,CAAC,CAAC;QACH,IAAI,WAAW,CAAC,MAAM,GAAG,QAAQ;YAAE,OAAO;QAE1C,gDAAgD;QAChD,MAAM,gBAAgB,GAAG,IAAI,GAAG,EAA+C,CAAC;QAChF,WAAW,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;YAC/B,MAAM,QAAQ,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;YAC5B,MAAM,aAAa,GAAG,QAAQ,CAAC,QAAQ,EAAE,CAAC;YAC1C,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC;gBAAE,OAAO;YACvC,8DAA8D;YAC9D,MAAM,UAAU,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;YACpC,IAAI,CAAC,UAAU,IAAI,UAAU,CAAC,IAAI,KAAK,KAAK;gBAAE,OAAO;YACrD,MAAM,IAAI,GAAI,UAAkC,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;YACvE,MAAM,QAAQ,GAAG,CAAC,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;YACrH,MAAM,IAAI,GAAG,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,IAAI,IAAI,QAAQ,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;YACzF,kEAAkE;YAClE,MAAM,IAAI,GAAI,OAA+B,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;YACpE,MAAM,QAAQ,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;YAClH,MAAM,IAAI,GAAG,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,IAAI,IAAI,QAAQ,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;YACzF,MAAM,QAAQ,GAAG,gBAAgB,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YAC5C,IAAI,QAAQ,EAAE,CAAC;gBACb,QAAQ,CAAC,
KAAK,EAAE,CAAC;gBACjB,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YAC/B,CAAC;iBAAM,CAAC;gBACN,gBAAgB,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;YAC7D,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,KAAK,MAAM,CAAC,OAAO,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,IAAI,gBAAgB,EAAE,CAAC;YAC3D,IAAI,KAAK,GAAG,QAAQ;gBAAE,SAAS;YAC/B,MAAM,YAAY,GAAG,aAAa,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YAC3C,MAAM,OAAO,GAAG,GAAG,YAAY,SAAS,OAAO,EAAE,CAAC;YAClD,IAAI,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC;gBAAE,SAAS;YAChC,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;YAClB,UAAU,CAAC,IAAI,CAAC;gBACd,iBAAiB,EAAE,YAAY;gBAC/B,YAAY,EAAE,KAAK,OAAO,EAAE;gBAC5B,KAAK,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;aAC5B,CAAC,CAAC;QACL,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,kCAAkC;IAClC,MAAM,SAAS,GAAG,CAAC,UAAU,EAAE,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,UAAU,EAAE,KAAK,CAAC,CAAC;IAC9E,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;QAC7B,IAAI,CAAC,IAAI,CAAC,UAAU,IAAI,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;YAC5C,IAAI,EAAE,CAAC,IAAI,KAAK,KAAK;gBAAE,OAAO;YAC9B,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;YAClB,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC;YAC5B,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;gBAAE,OAAO;YAChC,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,YAAY,IAAI,IAAI,CAAC,CAAC;YACnD,IAAI,QAAQ,CAAC,MAAM,GAAG,QAAQ;gBAAE,OAAO;YAEvC,MAAM,YAAY,GAAG,aAAa,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC;YAC9C,MAAM,OAAO,GAAG,UAAU,IAAI,IAAI,CAAC;YACnC,MAAM,OAAO,GAAG,GAAG,YAAY,IAAI,OAAO,EAAE,CAAC;YAC7C,IAAI,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC;gBAAE,OAAO;YAC9B,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;YAElB,MAAM,KAAK,GAAuB,EAAE,CAAC;YACrC,QAAQ,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,GAAG,EAAE,EAAE,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACpD,UAAU,CAAC,IAAI,CAAC;gBACd,iBAAiB,EAAE,YAAY;gBAC/B,YAAY,EAAE,KAAK,OAAO,EAAE;gBAC5B,KAAK;aACN,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;IAED,sCAAsC;IACtC,MAAM,SAAS,GAAG,CAAC,aAAa,EAAE,SAAS,EAAE,YAAY,EAAE,WAAW,EAAE,UAAU,CAAC,CAAC;IACpF,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;QAC7B,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAA
C,IAAI,IAAI,GAAG,CAAC,CAAC;QACvC,IAAI,OAAO,CAAC,MAAM,GAAG,QAAQ;YAAE,SAAS;QAExC,kBAAkB;QAClB,MAAM,YAAY,GAAG,IAAI,GAAG,EAA0D,CAAC;QACvF,OAAO,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;YACtB,IAAI,EAAE,CAAC,IAAI,KAAK,KAAK;gBAAE,OAAO;YAC9B,MAAM,MAAM,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC;YAC9B,MAAM,IAAI,GAAG,aAAa,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC;YACtC,MAAM,QAAQ,GAAG,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACxC,IAAI,QAAQ,EAAE,CAAC;gBACb,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YAC1B,CAAC;iBAAM,CAAC;gBACN,YAAY,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC;YAClD,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,KAAK,MAAM,CAAC,YAAY,EAAE,EAAE,KAAK,EAAE,CAAC,IAAI,YAAY,EAAE,CAAC;YACrD,IAAI,KAAK,CAAC,MAAM,GAAG,QAAQ;gBAAE,SAAS;YACtC,MAAM,OAAO,GAAG,IAAI,IAAI,GAAG,CAAC;YAC5B,MAAM,OAAO,GAAG,GAAG,YAAY,IAAI,OAAO,EAAE,CAAC;YAC7C,IAAI,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC;gBAAE,SAAS;YAChC,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;YAClB,UAAU,CAAC,IAAI,CAAC;gBACd,iBAAiB,EAAE,YAAY;gBAC/B,YAAY,EAAE,KAAK,OAAO,EAAE;gBAC5B,KAAK,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;aAC5B,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CAAC,CAAa,EAAE,KAAyB;IAC9D,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAEjC,IAAI,KAAK,GAAG,CAAC,CAAC;IAEd,6CAA6C;IAC7C,MAAM,UAAU,GAAG,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE;QAClC,MAAM,IAAI,GAAa,EAAE,CAAC;QAC1B,IAAI,CAAC,QAAQ,EAAE,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;YAC9B,IAAI,EAAE,CAAC,IAAI,KAAK,KAAK,EAAE,CAAC;gBACtB,IAAI,CAAC,IAAI,CAAE,EAA0B,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC;YAC/D,CAAC;QACH,CAAC,CAAC,CAAC;QACH,OAAO,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACxB,CAAC,CAAC,CAAC;IACH,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC;IACpC,MAAM,WAAW,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,KAAK,UAAU,CAAC,CAAC,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC;IACxF,KAAK,IAAI,WAAW,GAAG,GAAG,CAAC;IAE3B,0CAA0C;IAC1C,MAAM,YAAY,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE;QAC9C,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,I
AAI,EAAE,CAAC;QAChC,OAAO,GAAG,GAAG,CAAC,IAAI,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC1C,CAAC,EAAE,CAAC,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC;IACrB,KAAK,IAAI,IAAI,CAAC,GAAG,CAAC,YAAY,EAAE,CAAC,CAAC,GAAG,GAAG,CAAC;IAEzC,gCAAgC;IAChC,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;IACvF,KAAK,IAAI,QAAQ,GAAG,IAAI,CAAC;IAEzB,gCAAgC;IAChC,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;IAC1F,KAAK,IAAI,SAAS,GAAG,GAAG,CAAC;IAEzB,+BAA+B;IAC/B,KAAK,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC,CAAC,GAAG,IAAI,CAAC;IAE/C,mCAAmC;IACnC,MAAM,OAAO,GAAG,KAAK,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;IAC9E,KAAK,IAAI,OAAO,GAAG,GAAG,CAAC;IAEvB,wCAAwC;IACxC,MAAM,WAAW,GAAG,KAAK,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE;QACtC,MAAM,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;QACrB,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,KAAK,KAAK;YAAE,OAAO,KAAK,CAAC;QAC/C,MAAM,OAAO,GAAI,IAAY,CAAC,OAAO,IAAI,EAAE,CAAC;QAC5C,OAAO,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC;IAC/D,CAAC,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;IACzB,KAAK,IAAI,WAAW,GAAG,GAAG,CAAC;IAE3B,OAAO,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,GAAG,CAAC;AACvC,CAAC;AAED;;;GAGG;AACH,SAAS,SAAS,CAAC,KAAa;IAC9B,OAAO,KAAK,CAAC,OAAO,CAAC,uCAAuC,EAAE,MAAM,CAAC,CAAC;AACxE,CAAC;AAED;;GAEG;AACH,SAAS,aAAa,CAAC,CAAa,EAAE,EAAoB;IACxD,MAAM,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;IACnB,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,KAAK,KAAK;QAAE,OAAO,GAAG,CAAC;IAE7C,MAAM,GAAG,GAAI,IAA4B,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;IAChE,MAAM,EAAE,GAAG,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACzB,IAAI,EAAE;QAAE,OAAO,GAAG,GAAG,IAAI,SAAS,CAAC,EAAE,CAAC,EAAE,CAAC;IAEzC,MAAM,UAAU,GAAG,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,EA
AE,CAAC;IACxE,MAAM,eAAe,GAAG,UAAU,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACvE,MAAM,GAAG,GAAG,eAAe,CAAC,MAAM,GAAG,CAAC;QACpC,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC;QAC1C,CAAC,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAEpD,IAAI,QAAQ,GAAG,GAAG,CAAC,CAAC,CAAC,GAAG,GAAG,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC;IAE3C,0DAA0D;IAC1D,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC;QAC5B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvB,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;YAChC,IAAI,KAAK,IAAI,CAAC,EAAE,CAAC;gBACf,QAAQ,GAAG,GAAG,QAAQ,gBAAgB,KAAK,GAAG,CAAC,GAAG,CAAC;YACrD,CAAC;QACH,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,6CAA6C;IAC/C,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,SAAS,IAAI,CAAI,GAAQ;IACvB,MAAM,MAAM,GAAG,IAAI,GAAG,EAAa,CAAC;IACpC,KAAK,MAAM,IAAI,IAAI,GAAG,EAAE,CAAC;QACvB,MAAM,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAChD,CAAC;IACD,IAAI,IAAmB,CAAC;IACxB,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,KAAK,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,MAAM,EAAE,CAAC;QACnC,IAAI,KAAK,GAAG,SAAS,EAAE,CAAC;YACtB,SAAS,GAAG,KAAK,CAAC;YAClB,IAAI,GAAG,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC"}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-page scraping orchestrator.
|
|
3
|
+
*
|
|
4
|
+
* Combines content cleaning, extraction, pagination, deduplication,
|
|
5
|
+
* and optional detail-page following into a single scrape() call.
|
|
6
|
+
*/
|
|
7
|
+
import type { ExtractionResult, ScrapeOptions } from './types.js';
|
|
8
|
+
import type { HTMLProcessor } from './html-processor.js';
|
|
9
|
+
export declare class ScrapeSession {
    /**
     * Create a session bound to a single page processor.
     *
     * @param proc Processor that every navigation, wait and HTML read of
     *             this session goes through.
     */
    constructor(proc: HTMLProcessor);
    /**
     * Request cancellation of the scrape in progress.
     *
     * Only raises an internal flag. The running scrape() observes it at the
     * start of each page and before each detail-page visit, so work already
     * in flight is not interrupted.
     */
    abort(): void;
    /**
     * Scrape the items matched by a container selector across result pages.
     *
     * @param containerSelector Selector matching one element per item.
     * @param options Optional limits, field definitions, pagination
     *                settings, de-duplication key, detail-page following
     *                and progress callbacks.
     * @returns The collected items together with the visited page URLs, a
     *          completeness flag and the elapsed time in milliseconds.
     */
    scrape(containerSelector: string, options?: ScrapeOptions): Promise<ExtractionResult>;
    private readonly _proc;
    private _aborted;
    private _followDetailPages;
}
|
|
23
|
+
//# sourceMappingURL=scrape-session.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scrape-session.d.ts","sourceRoot":"","sources":["../../src/extraction/scrape-session.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EACV,gBAAgB,EAGhB,aAAa,EAEd,MAAM,YAAY,CAAC;AACpB,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAMzD,qBAAa,aAAa;IACxB,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAgB;IACtC,OAAO,CAAC,QAAQ,CAAS;gBAEb,IAAI,EAAE,aAAa;IAI/B;;OAEG;IACH,KAAK,IAAI,IAAI;IAIb;;OAEG;IACG,MAAM,CACV,iBAAiB,EAAE,MAAM,EACzB,OAAO,CAAC,EAAE,aAAa,GACtB,OAAO,CAAC,gBAAgB,CAAC;YAkJd,kBAAkB;CA0DjC"}
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-page scraping orchestrator.
|
|
3
|
+
*
|
|
4
|
+
* Combines content cleaning, extraction, pagination, deduplication,
|
|
5
|
+
* and optional detail-page following into a single scrape() call.
|
|
6
|
+
*/
|
|
7
|
+
import { extractAll } from './selector-engine.js';
|
|
8
|
+
import { removeObstructions } from './content-cleaner.js';
|
|
9
|
+
import { detectPagination, hasNextPage, goToNextPage } from './pagination.js';
|
|
10
|
+
import { resolveUrl } from './transforms.js';
|
|
11
|
+
export class ScrapeSession {
    /** Page processor used for every navigation, wait and HTML read. */
    _proc;
    /** Set by abort(); polled by scrape() and _followDetailPages(). */
    _aborted = false;
    /**
     * @param proc Processor object; this class calls its getUrl,
     *             waitForSelector, getHtml, wait, goto and evaluate methods.
     */
    constructor(proc) {
        this._proc = proc;
    }
    /**
     * Cancel a running scrape.
     *
     * Only raises a flag: the scrape stops at the next checkpoint (start of
     * a page, before a detail-page visit, before moving to the next page).
     */
    abort() {
        this._aborted = true;
    }
    /**
     * Run a multi-page scrape.
     *
     * @param containerSelector Selector matching one element per item.
     * @param options Optional settings read here: maxPages (default 10),
     *                maxItems (default unlimited), clean (default on),
     *                pageDelay in ms (default 500), deduplicateBy, fields,
     *                pagination, follow, onItem and onPage.
     * @returns Object with items, totalItems, pagesScraped, pageUrls,
     *          complete and durationMs. `complete` is false when the run
     *          was cut short (abort, maxItems reached, stale page, or an
     *          empty page after the first one).
     */
    async scrape(containerSelector, options) {
        this._aborted = false;
        const startTime = performance.now();
        const maxPages = options?.maxPages ?? 10;
        const maxItems = options?.maxItems ?? Infinity;
        const shouldClean = options?.clean !== false;
        const pageDelay = options?.pageDelay ?? 500;
        const deduplicateBy = options?.deduplicateBy;
        const fields = options?.fields ?? {};
        const allItems = [];
        const pageUrls = [];
        const seenKeys = new Set();
        let complete = true;
        let prevPageSignatures = null;
        // Use the caller's pagination config, otherwise auto-detect it once.
        const pagination = options?.pagination ?? (await detectPagination(this._proc));
        for (let page = 0; page < maxPages; page++) {
            if (this._aborted) {
                complete = false;
                break;
            }
            // Track page URL
            const currentUrl = await this._proc.getUrl();
            pageUrls.push(currentUrl);
            // Obstructions are only removed once, on the first page.
            if (shouldClean && page === 0) {
                await removeObstructions(this._proc);
            }
            // Wait for the container to appear (handles dynamic content).
            try {
                await this._proc.waitForSelector(containerSelector, 5000);
            }
            catch {
                // Deliberate best-effort: on timeout, try extracting anyway.
            }
            // Get HTML and extract
            const html = await this._proc.getHtml('basic');
            let pageItems = extractAll(html, containerSelector, fields);
            // Empty page guard
            if (pageItems.length === 0) {
                if (page > 0) {
                    // Retry once after a short wait (slow-loading SPAs).
                    await this._proc.wait(2000);
                    const retryHtml = await this._proc.getHtml('basic');
                    pageItems = extractAll(retryHtml, containerSelector, fields);
                }
                if (pageItems.length === 0) {
                    // Nothing found. An empty first page is a complete (empty)
                    // result; an empty later page means we lost the listing.
                    if (page > 0) {
                        complete = false;
                    }
                    break;
                }
            }
            // Stale page guard: if more than 80% of this page's items were
            // also on the previous page, navigation did not really happen.
            const currentSignatures = new Set(pageItems.map((item) => JSON.stringify(item)));
            if (prevPageSignatures && prevPageSignatures.size > 0) {
                let overlapCount = 0;
                for (const sig of currentSignatures) {
                    if (prevPageSignatures.has(sig)) {
                        overlapCount++;
                    }
                }
                const overlapRatio = overlapCount / Math.max(currentSignatures.size, 1);
                if (overlapRatio > 0.8) {
                    complete = false;
                    break;
                }
            }
            prevPageSignatures = currentSignatures;
            // Deduplicate across pages; items with an empty key are dropped.
            if (deduplicateBy) {
                pageItems = pageItems.filter((item) => {
                    const key = String(item[deduplicateBy] ?? '');
                    if (!key || seenKeys.has(key)) {
                        return false;
                    }
                    seenKeys.add(key);
                    return true;
                });
            }
            // Follow detail pages if configured. Only items that can still
            // fit under maxItems are followed; the overflow is kept as-is so
            // onPage still receives the full page, without paying for
            // navigations whose results would be discarded.
            if (options?.follow) {
                const remaining = Math.max(maxItems - allItems.length, 0);
                const followable = pageItems.slice(0, remaining);
                const overflow = pageItems.slice(remaining);
                const enriched = await this._followDetailPages(html, containerSelector, followable, options.follow, currentUrl);
                pageItems = [...enriched, ...overflow];
            }
            // Fire callbacks
            for (const item of pageItems) {
                if (allItems.length >= maxItems) {
                    complete = false;
                    break;
                }
                allItems.push(item);
                options?.onItem?.(item);
            }
            options?.onPage?.(page + 1, pageItems);
            // Check limits
            if (allItems.length >= maxItems) {
                complete = false;
                break;
            }
            // Navigate to next page
            if (page < maxPages - 1 && pagination) {
                const currentPageNum = (pagination.startPage ?? 1) + page;
                const hasMore = await hasNextPage(this._proc, pagination, currentPageNum);
                if (!hasMore) {
                    break;
                }
                // Do not navigate once a cancel was requested (e.g. from a
                // callback or during detail-page following on this page).
                if (this._aborted) {
                    complete = false;
                    break;
                }
                const success = await goToNextPage(this._proc, pagination, currentPageNum);
                if (!success) {
                    break;
                }
                if (pageDelay > 0) {
                    await this._proc.wait(pageDelay);
                }
            }
            else if (!pagination) {
                break; // No pagination detected
            }
        }
        return {
            items: allItems,
            totalItems: allItems.length,
            pagesScraped: pageUrls.length,
            pageUrls,
            complete,
            durationMs: performance.now() - startTime,
        };
    }
    /**
     * Visit each item's detail page and merge the extracted detail fields
     * into the item.
     *
     * The detail URL is taken from the first non-empty string among the
     * item's `link`, `url` and `href` fields, resolved against `baseUrl`.
     * Items without such a field, items reached after abort(), and items
     * whose detail visit fails are returned unchanged. Every input item
     * yields exactly one output item, in the same order.
     *
     * @param html Listing-page HTML (currently unused; kept for signature
     *             compatibility).
     * @param containerSelector Listing container selector (currently unused).
     * @param items Items extracted from the listing page.
     * @param follow Detail-page settings: fields, optional containerSelector
     *               (default 'body') and optional waitAfter in ms
     *               (default 1000).
     * @param baseUrl URL of the listing page, used to resolve relative links.
     * @returns The items, enriched where the detail visit succeeded.
     */
    async _followDetailPages(html, containerSelector, items, follow, baseUrl) {
        const enrichedItems = [];
        for (const item of items) {
            if (this._aborted) {
                enrichedItems.push(item);
                continue;
            }
            // Look for URL in common field names
            let url = null;
            for (const key of ['link', 'url', 'href']) {
                const val = item[key];
                if (typeof val === 'string' && val) {
                    url = resolveUrl(val, baseUrl);
                    break;
                }
            }
            if (!url) {
                enrichedItems.push(item);
                continue;
            }
            let result = item;
            let navigated = false;
            try {
                // Navigate to detail page
                await this._proc.goto(url);
                navigated = true;
                await this._proc.wait(follow.waitAfter ?? 1000);
                // Extract detail fields
                const detailHtml = await this._proc.getHtml('basic');
                const detailContainerSelector = follow.containerSelector ?? 'body';
                const { extractFirst } = await import('./selector-engine.js');
                const detailData = extractFirst(detailHtml, detailContainerSelector, follow.fields);
                // Detail fields win over listing fields of the same name.
                result = { ...item, ...detailData };
            }
            catch {
                // Deliberate best-effort: keep the listing item as-is.
            }
            // Return to the listing whenever we actually left it, including
            // when extraction failed, so the remaining items and the
            // pagination step do not run against a detail page.
            if (navigated) {
                try {
                    await this._proc.evaluate('window.history.back()');
                    await this._proc.wait(500);
                }
                catch {
                    // A failed back-navigation must not drop or duplicate
                    // the item; the result computed above is still pushed.
                }
            }
            enrichedItems.push(result);
        }
        return enrichedItems;
    }
}
|
|
192
|
+
//# sourceMappingURL=scrape-session.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scrape-session.js","sourceRoot":"","sources":["../../src/extraction/scrape-session.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAUH,OAAO,EAAE,UAAU,EAAE,MAAM,sBAAsB,CAAC;AAClD,OAAO,EAAE,kBAAkB,EAAE,MAAM,sBAAsB,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,WAAW,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAC9E,OAAO,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAE7C,MAAM,OAAO,aAAa;IACP,KAAK,CAAgB;IAC9B,QAAQ,GAAG,KAAK,CAAC;IAEzB,YAAY,IAAmB;QAC7B,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC;IACpB,CAAC;IAED;;OAEG;IACH,KAAK;QACH,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,MAAM,CACV,iBAAyB,EACzB,OAAuB;QAEvB,IAAI,CAAC,QAAQ,GAAG,KAAK,CAAC;QACtB,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAEpC,MAAM,QAAQ,GAAG,OAAO,EAAE,QAAQ,IAAI,EAAE,CAAC;QACzC,MAAM,QAAQ,GAAG,OAAO,EAAE,QAAQ,IAAI,QAAQ,CAAC;QAC/C,MAAM,WAAW,GAAG,OAAO,EAAE,KAAK,KAAK,KAAK,CAAC;QAC7C,MAAM,SAAS,GAAG,OAAO,EAAE,SAAS,IAAI,GAAG,CAAC;QAC5C,MAAM,aAAa,GAAG,OAAO,EAAE,aAAa,CAAC;QAC7C,MAAM,MAAM,GAAG,OAAO,EAAE,MAAM,IAAI,EAAE,CAAC;QAErC,MAAM,QAAQ,GAAsB,EAAE,CAAC;QACvC,MAAM,QAAQ,GAAa,EAAE,CAAC;QAC9B,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAC;QACnC,IAAI,QAAQ,GAAG,IAAI,CAAC;QACpB,IAAI,kBAAkB,GAAuB,IAAI,CAAC;QAElD,oCAAoC;QACpC,IAAI,UAAU,GACZ,OAAO,EAAE,UAAU,IAAI,CAAC,MAAM,gBAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;QAE9D,KAAK,IAAI,IAAI,GAAG,CAAC,EAAE,IAAI,GAAG,QAAQ,EAAE,IAAI,EAAE,EAAE,CAAC;YAC3C,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;gBAClB,QAAQ,GAAG,KAAK,CAAC;gBACjB,MAAM;YACR,CAAC;YAED,iBAAiB;YACjB,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC;YAC7C,QAAQ,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YAE1B,qBAAqB;YACrB,IAAI,WAAW,IAAI,IAAI,KAAK,CAAC,EAAE,CAAC;gBAC9B,MAAM,kBAAkB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACvC,CAAC;YAED,iEAAiE;YACjE,IAAI,CAAC;gBACH,MAAM,IAAI,CAAC,KAAK,CAAC,eAAe,CAAC,iBAAiB,EAAE,IAAI,CAAC,CAAC;YAC5D,CAAC;YAAC,MAAM,CAAC;gBACP,kCAAkC;YACpC,CAAC;YAED,uBAAuB;YACvB,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;YAC/C,IAAI,SAAS,GAAG,UAAU,CAAC,IAAI,EAAE,iBAAiB,EAAE,MAAM,CAAC,CAAC;YAE5D,mBAAmB;YACnB,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC3B,
IAAI,IAAI,GAAG,CAAC,EAAE,CAAC;oBACb,4DAA4D;oBAC5D,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBAC5B,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;oBACpD,SAAS,GAAG,UAAU,CAAC,SAAS,EAAE,iBAAiB,EAAE,MAAM,CAAC,CAAC;gBAC/D,CAAC;gBACD,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;oBAC3B,iCAAiC;oBACjC,IAAI,IAAI,GAAG,CAAC;wBAAE,QAAQ,GAAG,KAAK,CAAC;oBAC/B,MAAM;gBACR,CAAC;YACH,CAAC;YAED,2DAA2D;YAC3D,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAC/B,SAAS,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAC5C,CAAC;YACF,IAAI,kBAAkB,IAAI,kBAAkB,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;gBACtD,IAAI,YAAY,GAAG,CAAC,CAAC;gBACrB,KAAK,MAAM,GAAG,IAAI,iBAAiB,EAAE,CAAC;oBACpC,IAAI,kBAAkB,CAAC,GAAG,CAAC,GAAG,CAAC;wBAAE,YAAY,EAAE,CAAC;gBAClD,CAAC;gBACD,MAAM,YAAY,GAAG,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,iBAAiB,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;gBACxE,IAAI,YAAY,GAAG,GAAG,EAAE,CAAC;oBACvB,4BAA4B;oBAC5B,QAAQ,GAAG,KAAK,CAAC;oBACjB,MAAM;gBACR,CAAC;YACH,CAAC;YACD,kBAAkB,GAAG,iBAAiB,CAAC;YAEvC,cAAc;YACd,IAAI,aAAa,EAAE,CAAC;gBAClB,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE;oBAClC,MAAM,GAAG,GAAG,MAAM,CAAC,IAAI,CAAC,aAAa,CAAC,IAAI,EAAE,CAAC,CAAC;oBAC9C,IAAI,CAAC,GAAG,IAAI,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAC;wBAAE,OAAO,KAAK,CAAC;oBAC5C,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;oBAClB,OAAO,IAAI,CAAC;gBACd,CAAC,CAAC,CAAC;YACL,CAAC;YAED,oCAAoC;YACpC,IAAI,OAAO,EAAE,MAAM,EAAE,CAAC;gBACpB,SAAS,GAAG,MAAM,IAAI,CAAC,kBAAkB,CACvC,IAAI,EACJ,iBAAiB,EACjB,SAAS,EACT,OAAO,CAAC,MAAM,EACd,UAAU,CACX,CAAC;YACJ,CAAC;YAED,iBAAiB;YACjB,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;gBAC7B,IAAI,QAAQ,CAAC,MAAM,IAAI,QAAQ,EAAE,CAAC;oBAChC,QAAQ,GAAG,KAAK,CAAC;oBACjB,MAAM;gBACR,CAAC;gBACD,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACpB,OAAO,EAAE,MAAM,EAAE,CAAC,IAAI,CAAC,CAAC;YAC1B,CAAC;YAED,OAAO,EAAE,MAAM,EAAE,CAAC,IAAI,GAAG,CAAC,EAAE,SAAS,CAAC,CAAC;YAEvC,eAAe;YACf,IAAI,QAAQ,CAAC,MAAM,IAAI,QAAQ,EAAE,CAAC;gBAChC,QAAQ,GAAG,KAAK,CAAC;gBACjB,MAAM;YACR,CAAC;YAED,wBAAwB;YACxB,IAAI,IAAI,GAAG,QAAQ,GAAG,CAAC,IAAI,UAAU,EAAE,CAAC;gBACtC,MAAM,cAAc,GAAG,CAAC,UAAU,CAAC,S
AAS,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC;gBAC1D,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,IAAI,CAAC,KAAK,EAAE,UAAU,EAAE,cAAc,CAAC,CAAC;gBAC1E,IAAI,CAAC,OAAO;oBAAE,MAAM;gBAEpB,MAAM,OAAO,GAAG,MAAM,YAAY,CAAC,IAAI,CAAC,KAAK,EAAE,UAAU,EAAE,cAAc,CAAC,CAAC;gBAC3E,IAAI,CAAC,OAAO;oBAAE,MAAM;gBAEpB,IAAI,SAAS,GAAG,CAAC,EAAE,CAAC;oBAClB,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;gBACnC,CAAC;YACH,CAAC;iBAAM,IAAI,CAAC,UAAU,EAAE,CAAC;gBACvB,MAAM,CAAC,yBAAyB;YAClC,CAAC;QACH,CAAC;QAED,OAAO;YACL,KAAK,EAAE,QAAQ;YACf,UAAU,EAAE,QAAQ,CAAC,MAAM;YAC3B,YAAY,EAAE,QAAQ,CAAC,MAAM;YAC7B,QAAQ;YACR,QAAQ;YACR,UAAU,EAAE,WAAW,CAAC,GAAG,EAAE,GAAG,SAAS;SAC1C,CAAC;IACJ,CAAC;IAEO,KAAK,CAAC,kBAAkB,CAC9B,IAAY,EACZ,iBAAyB,EACzB,KAAwB,EACxB,MAA4C,EAC5C,OAAe;QAEf,MAAM,aAAa,GAAsB,EAAE,CAAC;QAE5C,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;gBAClB,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACzB,SAAS;YACX,CAAC;YAED,qCAAqC;YACrC,IAAI,GAAG,GAAkB,IAAI,CAAC;YAC9B,KAAK,MAAM,GAAG,IAAI,CAAC,MAAM,EAAE,KAAK,EAAE,MAAM,CAAC,EAAE,CAAC;gBAC1C,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC;gBACtB,IAAI,OAAO,GAAG,KAAK,QAAQ,IAAI,GAAG,EAAE,CAAC;oBACnC,GAAG,GAAG,UAAU,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;oBAC/B,MAAM;gBACR,CAAC;YACH,CAAC;YAED,IAAI,CAAC,GAAG,EAAE,CAAC;gBACT,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACzB,SAAS;YACX,CAAC;YAED,IAAI,CAAC;gBACH,0BAA0B;gBAC1B,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBAC3B,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,IAAI,IAAI,CAAC,CAAC;gBAEhD,wBAAwB;gBACxB,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;gBACrD,MAAM,uBAAuB,GAAG,MAAM,CAAC,iBAAiB,IAAI,MAAM,CAAC;gBACnE,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;gBAC9D,MAAM,UAAU,GAAG,YAAY,CAC7B,UAAU,EACV,uBAAuB,EACvB,MAAM,CAAC,MAAM,CACd,CAAC;gBAEF,8BAA8B;gBAC9B,aAAa,CAAC,IAAI,CAAC,EAAE,GAAG,IAAI,EAAE,GAAG,UAAU,EAAE,CAAC,CAAC;gBAE/C,gBAAgB;gBAChB,MAAM,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,uBAAuB,CAAC,CAAC;gBACnD,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAC7B,CAAC;YAAC,MAAM,CAAC;gBACP,aAAa,CAAC,IAAI,CAAC,IAAI,C
AAC,CAAC;YAC3B,CAAC;QACH,CAAC;QAED,OAAO,aAAa,CAAC;IACvB,CAAC;CACF"}
|