@olib-ai/owl-browser-sdk 2.0.5 → 2.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +107 -0
- package/dist/extraction/content-cleaner.d.ts +40 -0
- package/dist/extraction/content-cleaner.d.ts.map +1 -0
- package/dist/extraction/content-cleaner.js +393 -0
- package/dist/extraction/content-cleaner.js.map +1 -0
- package/dist/extraction/extractor.d.ts +139 -0
- package/dist/extraction/extractor.d.ts.map +1 -0
- package/dist/extraction/extractor.js +212 -0
- package/dist/extraction/extractor.js.map +1 -0
- package/dist/extraction/html-processor.d.ts +75 -0
- package/dist/extraction/html-processor.d.ts.map +1 -0
- package/dist/extraction/html-processor.js +192 -0
- package/dist/extraction/html-processor.js.map +1 -0
- package/dist/extraction/index.d.ts +14 -0
- package/dist/extraction/index.d.ts.map +1 -0
- package/dist/extraction/index.js +19 -0
- package/dist/extraction/index.js.map +1 -0
- package/dist/extraction/list-extractor.d.ts +24 -0
- package/dist/extraction/list-extractor.d.ts.map +1 -0
- package/dist/extraction/list-extractor.js +303 -0
- package/dist/extraction/list-extractor.js.map +1 -0
- package/dist/extraction/meta-extractor.d.ts +40 -0
- package/dist/extraction/meta-extractor.d.ts.map +1 -0
- package/dist/extraction/meta-extractor.js +216 -0
- package/dist/extraction/meta-extractor.js.map +1 -0
- package/dist/extraction/pagination.d.ts +29 -0
- package/dist/extraction/pagination.d.ts.map +1 -0
- package/dist/extraction/pagination.js +323 -0
- package/dist/extraction/pagination.js.map +1 -0
- package/dist/extraction/pattern-detector.d.ts +16 -0
- package/dist/extraction/pattern-detector.d.ts.map +1 -0
- package/dist/extraction/pattern-detector.js +390 -0
- package/dist/extraction/pattern-detector.js.map +1 -0
- package/dist/extraction/scrape-session.d.ts +23 -0
- package/dist/extraction/scrape-session.d.ts.map +1 -0
- package/dist/extraction/scrape-session.js +192 -0
- package/dist/extraction/scrape-session.js.map +1 -0
- package/dist/extraction/selector-engine.d.ts +23 -0
- package/dist/extraction/selector-engine.d.ts.map +1 -0
- package/dist/extraction/selector-engine.js +127 -0
- package/dist/extraction/selector-engine.js.map +1 -0
- package/dist/extraction/table-extractor.d.ts +29 -0
- package/dist/extraction/table-extractor.d.ts.map +1 -0
- package/dist/extraction/table-extractor.js +282 -0
- package/dist/extraction/table-extractor.js.map +1 -0
- package/dist/extraction/transforms.d.ts +47 -0
- package/dist/extraction/transforms.d.ts.map +1 -0
- package/dist/extraction/transforms.js +277 -0
- package/dist/extraction/transforms.js.map +1 -0
- package/dist/extraction/types.d.ts +199 -0
- package/dist/extraction/types.d.ts.map +1 -0
- package/dist/extraction/types.js +5 -0
- package/dist/extraction/types.js.map +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/playwright/browser-type.d.ts +101 -0
- package/dist/playwright/browser-type.d.ts.map +1 -0
- package/dist/playwright/browser-type.js +134 -0
- package/dist/playwright/browser-type.js.map +1 -0
- package/dist/playwright/browser.d.ts +98 -0
- package/dist/playwright/browser.d.ts.map +1 -0
- package/dist/playwright/browser.js +229 -0
- package/dist/playwright/browser.js.map +1 -0
- package/dist/playwright/context.d.ts +217 -0
- package/dist/playwright/context.d.ts.map +1 -0
- package/dist/playwright/context.js +518 -0
- package/dist/playwright/context.js.map +1 -0
- package/dist/playwright/extractor.d.ts +108 -0
- package/dist/playwright/extractor.d.ts.map +1 -0
- package/dist/playwright/extractor.js +404 -0
- package/dist/playwright/extractor.js.map +1 -0
- package/dist/playwright/frame.d.ts +147 -0
- package/dist/playwright/frame.d.ts.map +1 -0
- package/dist/playwright/frame.js +492 -0
- package/dist/playwright/frame.js.map +1 -0
- package/dist/playwright/index.d.ts +163 -0
- package/dist/playwright/index.d.ts.map +1 -0
- package/dist/playwright/index.js +313 -0
- package/dist/playwright/index.js.map +1 -0
- package/dist/playwright/keyboard.d.ts +74 -0
- package/dist/playwright/keyboard.d.ts.map +1 -0
- package/dist/playwright/keyboard.js +187 -0
- package/dist/playwright/keyboard.js.map +1 -0
- package/dist/playwright/locator.d.ts +237 -0
- package/dist/playwright/locator.d.ts.map +1 -0
- package/dist/playwright/locator.js +667 -0
- package/dist/playwright/locator.js.map +1 -0
- package/dist/playwright/mouse.d.ts +82 -0
- package/dist/playwright/mouse.d.ts.map +1 -0
- package/dist/playwright/mouse.js +137 -0
- package/dist/playwright/mouse.js.map +1 -0
- package/dist/playwright/page-helpers.d.ts +267 -0
- package/dist/playwright/page-helpers.d.ts.map +1 -0
- package/dist/playwright/page-helpers.js +449 -0
- package/dist/playwright/page-helpers.js.map +1 -0
- package/dist/playwright/page.d.ts +605 -0
- package/dist/playwright/page.d.ts.map +1 -0
- package/dist/playwright/page.js +1698 -0
- package/dist/playwright/page.js.map +1 -0
- package/dist/playwright/response.d.ts +100 -0
- package/dist/playwright/response.d.ts.map +1 -0
- package/dist/playwright/response.js +194 -0
- package/dist/playwright/response.js.map +1 -0
- package/dist/playwright/types.d.ts +354 -0
- package/dist/playwright/types.d.ts.map +1 -0
- package/dist/playwright/types.js +8 -0
- package/dist/playwright/types.js.map +1 -0
- package/openapi.json +327 -35
- package/package.json +10 -1
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pagination detection and navigation.
|
|
3
|
+
*
|
|
4
|
+
* Detects pagination type from DOM heuristics: next links, page numbers,
|
|
5
|
+
* scroll markers, load-more buttons, rel="next".
|
|
6
|
+
*/
|
|
7
|
+
import type { PaginationConfig } from './types.js';
|
|
8
|
+
import type { HTMLProcessor } from './html-processor.js';
|
|
9
|
+
/**
|
|
10
|
+
* Auto-detect pagination type from the current page DOM.
|
|
11
|
+
*
|
|
12
|
+
* Uses a single browser evaluate() call to check all DOM-based patterns
|
|
13
|
+
* (rel="next", known selectors, text-based detection, load-more) at once,
|
|
14
|
+
* reducing round-trips from ~20 to 1.
|
|
15
|
+
*/
|
|
16
|
+
export declare function detectPagination(proc: HTMLProcessor): Promise<PaginationConfig | null>;
|
|
17
|
+
/**
|
|
18
|
+
* Check if there's a next page available.
|
|
19
|
+
*/
|
|
20
|
+
export declare function hasNextPage(proc: HTMLProcessor, config: PaginationConfig, currentPage: number): Promise<boolean>;
|
|
21
|
+
/**
|
|
22
|
+
* Navigate to the next page. Resolves to `false` when no navigation is possible or the attempt fails.
|
|
23
|
+
*/
|
|
24
|
+
export declare function goToNextPage(proc: HTMLProcessor, config: PaginationConfig, currentPage: number): Promise<boolean>;
|
|
25
|
+
/**
|
|
26
|
+
* Resolve a URL pattern by substituting its `{page}` placeholder with the given page number.
|
|
27
|
+
*/
|
|
28
|
+
export declare function resolvePageUrl(pattern: string, page: number): string;
|
|
29
|
+
//# sourceMappingURL=pagination.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pagination.d.ts","sourceRoot":"","sources":["../../src/extraction/pagination.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AACnD,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAiCzD;;;;;;GAMG;AACH,wBAAsB,gBAAgB,CAAC,IAAI,EAAE,aAAa,GAAG,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC,CA0L5F;AAED;;GAEG;AACH,wBAAsB,WAAW,CAC/B,IAAI,EAAE,aAAa,EACnB,MAAM,EAAE,gBAAgB,EACxB,WAAW,EAAE,MAAM,GAClB,OAAO,CAAC,OAAO,CAAC,CA0BlB;AAED;;GAEG;AACH,wBAAsB,YAAY,CAChC,IAAI,EAAE,aAAa,EACnB,MAAM,EAAE,gBAAgB,EACxB,WAAW,EAAE,MAAM,GAClB,OAAO,CAAC,OAAO,CAAC,CAkClB;AAED;;GAEG;AACH,wBAAgB,cAAc,CAAC,OAAO,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
|
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pagination detection and navigation.
|
|
3
|
+
*
|
|
4
|
+
* Detects pagination type from DOM heuristics: next links, page numbers,
|
|
5
|
+
* scroll markers, load-more buttons, rel="next".
|
|
6
|
+
*/
|
|
7
|
+
/**
 * CSS selectors that commonly identify a "next page" link or button.
 *
 * The list is ordered: detection walks it front to back and stops at the
 * first selector that matches a visible element.
 */
const NEXT_LINK_PATTERNS = [
    'a[rel="next"]',
    'link[rel="next"]',
    '[class*="next"] a',
    '[class*="pagination"] a[class*="next"]',
    // ARIA labels: each tag gets a case-insensitive and a case-sensitive variant.
    ...['a', 'button'].flatMap((tag) => [
        `${tag}[aria-label*="next" i]`,
        `${tag}[aria-label*="Next"]`,
    ]),
    'nav[class*="pagination"] a:last-child',
    '.pagination .next a',
    '.pagination a.next',
    '.pager .next a',
    // "next-page" as an exact class and as a class substring, per tag.
    ...['a', 'button'].flatMap((tag) => [
        `${tag}.next-page`,
        `${tag}[class*="next-page"]`,
    ]),
];
|
|
26
|
+
/**
 * CSS selectors that commonly identify a "load more" / "show more" control.
 * Grouped by flavour; the flattened order is the order in which they are tried.
 */
const LOAD_MORE_PATTERNS = [
    // Elements whose own class mentions "load more".
    ['button[class*="load-more"]', 'button[class*="loadmore"]', 'a[class*="load-more"]'],
    // Buttons nested in a load-more wrapper, or tagged through a data attribute.
    ['[class*="load-more"] button', '[data-action="load-more"]'],
    // "Show more" variants.
    ['button[class*="show-more"]', 'a[class*="show-more"]'],
].flat();
|
|
36
|
+
/**
 * Auto-detect the pagination mechanism of the page currently loaded in `proc`.
 *
 * All DOM heuristics (rel="next", known next-button selectors, localized
 * "next" text inside pagination containers, load-more buttons) run inside a
 * single `proc.evaluate()` call. When none of them produces a usable result,
 * the current URL is inspected for `page=`/`p=`, `/page/N` and offset-style
 * parameters.
 *
 * Resolution order:
 *  1. rel="next" href that differs from the current URL by one page number
 *     -> `url-pattern`.
 *  2. A clickable `<a rel="next">` -> `click-next`.
 *  3. First visible known next selector / text-detected next control
 *     -> `click-next`.
 *  4. First visible load-more selector -> `load-more`.
 *  5. URL-only heuristics -> `url-pattern`.
 *
 * @param proc Page accessor; only `getUrl()` and `evaluate(script)` are used here.
 * @returns A pagination config, or `null` when nothing was detected.
 */
export async function detectPagination(proc) {
    const url = await proc.getUrl();
    // Single JS call that checks all DOM patterns at once.
    const dom = await proc.evaluate(`
    (function() {
      var result = { type: null, selector: null, href: null, relNextSelector: null };

      // Keep ids/classes containing special characters selector-safe.
      var cssEscape = (window.CSS && typeof window.CSS.escape === 'function')
        ? function(s) { return window.CSS.escape(s); }
        : function(s) { return s; };

      // 1-based position among ALL element children (what :nth-child counts).
      function childIndex(node) {
        var n = 1;
        for (var sib = node.previousElementSibling; sib; sib = sib.previousElementSibling) n++;
        return n;
      }

      // 1-based position among same-tag siblings (what :nth-of-type counts).
      function typeIndex(node) {
        var n = 1;
        for (var sib = node.previousElementSibling; sib; sib = sib.previousElementSibling) {
          if (sib.tagName === node.tagName) n++;
        }
        return n;
      }

      // 1. rel="next": remember the advertised URL, and whether something clickable exists.
      var relNext = document.querySelector('a[rel="next"], link[rel="next"]');
      if (relNext) {
        result.href = relNext.href || null;
        // A <link rel="next"> only advertises the URL; only an <a> can be clicked.
        if (document.querySelector('a[rel="next"]')) {
          result.relNextSelector = 'a[rel="next"]';
        }
      }

      // 2. Check known next-button selectors
      var nextSelectors = ${JSON.stringify(NEXT_LINK_PATTERNS)};
      for (var i = 0; i < nextSelectors.length; i++) {
        try {
          var el = document.querySelector(nextSelectors[i]);
          if (el && el.offsetParent !== null) {
            result.type = 'click-next';
            result.selector = nextSelectors[i];
            return result;
          }
        } catch(e) {}
      }

      // 3. Text-based "next" detection in pagination containers (with i18n).
      // Entries after the Latin-script words: Russian, Japanese, Chinese, an entry
      // kept verbatim from the previous list (NOTE(review): looks mis-encoded,
      // confirm intent), and Korean.
      var nextTexts = ['next', 'siguiente', 'suivant', 'weiter', 'volgende', 'avanti', '\\u0434\\u0430\\u043b\\u0435\\u0435', '\\u6b21\\u3078', '\\u4e0b\\u4e00\\u9875', '\\u45e4\\u44eb', '\\ub2e4\\uc74c'];
      var arrowTexts = ['>', '>>', '\\u203a', '\\u2192', '\\u276f'];
      var containers = document.querySelectorAll(
        'nav, [class*="pagination"], [class*="pager"], [role="navigation"]'
      );
      for (var i = 0; i < containers.length; i++) {
        var clickables = containers[i].querySelectorAll('a, button');
        for (var j = 0; j < clickables.length; j++) {
          var el = clickables[j];
          if (el.disabled) continue;
          var text = (el.textContent || '').trim().toLowerCase();
          var ariaLabel = (el.getAttribute('aria-label') || '').toLowerCase();

          // Words match as a prefix of the text or aria-label; arrows must match exactly.
          var isNext = false;
          for (var k = 0; k < nextTexts.length; k++) {
            if (text.indexOf(nextTexts[k]) === 0 || ariaLabel.indexOf(nextTexts[k]) === 0) {
              isNext = true;
              break;
            }
          }
          if (!isNext) {
            for (var k = 0; k < arrowTexts.length; k++) {
              if (text === arrowTexts[k]) { isNext = true; break; }
            }
          }

          if (isNext) {
            // Build a precise selector: id > aria-label > classes > parent context.
            var tag = el.tagName.toLowerCase();
            var sel = tag;
            var parent = el.parentElement;
            if (el.id) {
              sel = tag + '#' + cssEscape(el.id);
            } else if (el.getAttribute('aria-label')) {
              // Escape quotes and backslashes so the attribute value stays a valid CSS string.
              sel = tag + '[aria-label="' + el.getAttribute('aria-label').replace(/["\\\\]/g, '\\\\$&') + '"]';
            } else {
              var cls = Array.from(el.classList).slice(0, 3).map(cssEscape).join('.');
              if (cls) {
                sel = tag + '.' + cls;
                // Not unique: disambiguate by the position among all children of the parent.
                if (document.querySelectorAll(sel).length > 1 && parent) {
                  sel = sel + ':nth-child(' + childIndex(el) + ')';
                }
              } else if (parent) {
                // No classes: use parent context + position among same-tag siblings.
                var parentSel = '';
                if (parent.id) parentSel = '#' + cssEscape(parent.id);
                else if (parent.classList.length > 0) parentSel = parent.tagName.toLowerCase() + '.' + Array.from(parent.classList).slice(0, 2).map(cssEscape).join('.');
                else parentSel = parent.tagName.toLowerCase();
                sel = parentSel + ' > ' + tag + ':nth-of-type(' + typeIndex(el) + ')';
              }
            }
            result.type = 'click-next';
            result.selector = sel;
            return result;
          }
        }
      }

      // 4. Load-more buttons
      var loadMoreSelectors = ${JSON.stringify(LOAD_MORE_PATTERNS)};
      for (var i = 0; i < loadMoreSelectors.length; i++) {
        try {
          var el = document.querySelector(loadMoreSelectors[i]);
          if (el && el.offsetParent !== null) {
            result.type = 'load-more';
            result.selector = loadMoreSelectors[i];
            return result;
          }
        } catch(e) {}
      }

      return result;
    })()
  `);
    if (dom) {
        if (dom.href) {
            // Prefer direct URL navigation when the rel="next" target is "current page + 1".
            const inferred = detectUrlPattern(url, dom.href);
            if (inferred) {
                return {
                    type: 'url-pattern',
                    urlPattern: inferred.pattern,
                    startPage: inferred.currentPage,
                };
            }
            // Otherwise click the rel="next" anchor, but only if one actually exists.
            if (dom.relNextSelector) {
                return { type: 'click-next', nextSelector: dom.relNextSelector };
            }
        }
        if (dom.type === 'click-next' || dom.type === 'load-more') {
            return { type: dom.type, nextSelector: dom.selector };
        }
    }
    return detectPaginationFromUrl(url);
}
/**
 * URL-only fallback: derive a `{page}` template from the current URL alone.
 *
 * @param url Current page URL.
 * @returns A `url-pattern` config, or `null` if the URL carries no recognizable
 *          page/offset marker.
 */
function detectPaginationFromUrl(url) {
    // 5. page= / p= query parameter
    const pageParam = /[?&](page|p)=(\d+)/.exec(url);
    if (pageParam) {
        const [, name, value] = pageParam;
        return {
            type: 'url-pattern',
            urlPattern: url.replace(new RegExp(`([?&])${name}=\\d+`), `$1${name}={page}`),
            startPage: parseInt(value, 10),
        };
    }
    // 6. /page/N path segment
    const pathSegment = /\/page\/(\d+)/.exec(url);
    if (pathSegment) {
        return {
            type: 'url-pattern',
            urlPattern: url.replace(/\/page\/\d+/, '/page/{page}'),
            startPage: parseInt(pathSegment[1], 10),
        };
    }
    // 7. offset= / start= / skip= query parameter
    // NOTE(review): goToNextPage advances the placeholder by 1 per step, so an
    // offset template moves one record at a time rather than one page size
    // (limit/count/size) at a time. Confirm whether PaginationConfig can carry a step.
    const offsetParam = /[?&](offset|start|skip)=(\d+)/.exec(url);
    if (offsetParam) {
        const [, name, value] = offsetParam;
        return {
            type: 'url-pattern',
            urlPattern: url.replace(new RegExp(`([?&])${name}=\\d+`), `$1${name}={page}`),
            startPage: parseInt(value, 10),
        };
    }
    return null;
}
|
|
222
|
+
/**
 * Check whether another page can be requested for the given pagination config.
 *
 * - `url-list`: true while `currentPage` is a valid index into `config.urls`.
 * - `url-pattern` / `infinite-scroll`: always true; the caller has to notice
 *   when the next page yields nothing new.
 * - `click-next` / `load-more`: true when `config.nextSelector` matches an
 *   element that is rendered (`offsetParent !== null`) and not disabled.
 *
 * @param proc Page accessor; only `evaluate(script)` is used here.
 * @param config Pagination configuration.
 * @param currentPage Current page counter (used as an index for `url-list`).
 * @returns Always a real boolean, never `null`/`undefined`.
 */
export async function hasNextPage(proc, config, currentPage) {
    switch (config.type) {
        case 'url-list':
            // Array.isArray also guards against a null `urls`.
            return Array.isArray(config.urls) && currentPage < config.urls.length;
        case 'url-pattern':
            // We can always try the next page; it will be validated by the scraper.
            return true;
        case 'infinite-scroll':
            // Handled by scroll behavior.
            return true;
        case 'click-next':
        case 'load-more': {
            if (!config.nextSelector)
                return false;
            // NOTE(review): offsetParent is also null for position:fixed elements in
            // some browsers, so a fixed "next" button may be reported as unavailable.
            const usable = await proc.evaluate(`
        (function() {
          try {
            var el = document.querySelector(${JSON.stringify(config.nextSelector)});
            return !!el && el.offsetParent !== null && !el.disabled;
          } catch (e) {
            // Invalid selector: treat as "no next page" instead of failing the call.
            return false;
          }
        })()
      `);
            return Boolean(usable);
        }
        default:
            return false;
    }
}
|
|
249
|
+
/**
 * Advance the page held by `proc` to the next page according to `config`.
 *
 * Every successful navigation is followed by a pause of `config.waitAfter`
 * milliseconds (default 1000, or 1500 for infinite scroll). Any error thrown
 * while navigating is swallowed and reported as `false`.
 *
 * @param proc Page accessor; uses `click`, `goto`, `scrollToBottom` and `wait`.
 * @param config Pagination configuration.
 * @param currentPage Current page counter; the URL template is resolved with
 *        `currentPage + 1`, and a URL list is indexed with `currentPage`.
 * @returns `true` when a navigation step was performed, `false` otherwise.
 */
export async function goToNextPage(proc, config, currentPage) {
    // Post-navigation pause; an explicit waitAfter (including 0) wins over the default.
    const settle = (defaultMs) => proc.wait(config.waitAfter ?? defaultMs);
    try {
        switch (config.type) {
            case 'click-next':
            case 'load-more': {
                const target = config.nextSelector;
                if (!target) {
                    return false;
                }
                await proc.click(target);
                await settle(1000);
                return true;
            }
            case 'url-pattern': {
                const destination = resolvePageUrl(config.urlPattern, currentPage + 1);
                await proc.goto(destination);
                await settle(1000);
                return true;
            }
            case 'url-list': {
                const list = config.urls;
                if (!list || currentPage >= list.length) {
                    return false;
                }
                await proc.goto(list[currentPage]);
                await settle(1000);
                return true;
            }
            case 'infinite-scroll': {
                await proc.scrollToBottom();
                await settle(1500);
                return true;
            }
            default:
                return false;
        }
    }
    catch {
        // Best-effort navigation: report failure instead of propagating.
        return false;
    }
}
|
|
286
|
+
/**
 * Resolve a URL pattern by substituting every `{page}` placeholder with the
 * given page number.
 *
 * @param pattern URL template containing zero or more `{page}` placeholders.
 * @param page Page number (or offset) to insert.
 * @returns The pattern with all placeholders replaced; unchanged if it has none.
 */
export function resolvePageUrl(pattern, page) {
    const value = String(page);
    // Global regex: a string pattern would only replace the first placeholder.
    // Function replacer: the value is inserted verbatim, with no `$` expansion.
    return pattern.replace(/\{page\}/g, () => value);
}
|
|
292
|
+
// ==================== Internal ====================
|
|
293
|
+
/**
 * Infer a `{page}` URL template by comparing the current URL with the URL of
 * the next page.
 *
 * Two shapes are recognized:
 *  - a query parameter whose all-digit value grows by exactly one from
 *    `currentUrl` to `nextUrl`;
 *  - a `/page/N` path segment present in both URLs.
 *
 * @param currentUrl Absolute URL of the current page.
 * @param nextUrl Absolute URL the page advertises as "next".
 * @returns `{ pattern, currentPage }`, where `pattern` is `currentUrl` with the
 *          page number replaced by `{page}`; `null` when no pattern is found
 *          or either URL cannot be parsed.
 */
function detectUrlPattern(currentUrl, nextUrl) {
    const DIGITS = /^\d+$/;
    try {
        const current = new URL(currentUrl);
        const next = new URL(nextUrl);
        // Check query params
        for (const [key, value] of next.searchParams) {
            const currentValue = current.searchParams.get(key);
            if (currentValue === null || !DIGITS.test(value) || !DIGITS.test(currentValue)) {
                continue;
            }
            const currentNum = parseInt(currentValue, 10);
            if (parseInt(value, 10) !== currentNum + 1) {
                continue;
            }
            // Keys such as "page[number]" or "p.n" contain regex metacharacters.
            const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
            // Match the whole numeric value (handles zero padding), up to the next
            // parameter, the fragment or the end of the URL.
            const paramRe = new RegExp(`([?&])${escapedKey}=\\d+(?=[&#]|$)`);
            // Function replacer so "$" sequences in the key are not expanded.
            const pattern = currentUrl.replace(paramRe, (_match, sep) => `${sep}${key}={page}`);
            // If nothing was replaced (e.g. the raw URL encodes the key differently),
            // the result has no placeholder and must not be returned as a pattern.
            if (pattern !== currentUrl) {
                return { pattern, currentPage: currentNum };
            }
        }
        // Check path pattern /page/N
        const currentMatch = /\/page\/(\d+)/.exec(current.pathname);
        const nextMatch = /\/page\/(\d+)/.exec(next.pathname);
        if (currentMatch && nextMatch) {
            return {
                pattern: currentUrl.replace(/\/page\/\d+/, '/page/{page}'),
                currentPage: parseInt(currentMatch[1], 10),
            };
        }
    }
    catch {
        // Invalid URLs: fall through to "no pattern".
    }
    return null;
}
|
|
323
|
+
//# sourceMappingURL=pagination.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pagination.js","sourceRoot":"","sources":["../../src/extraction/pagination.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAKH,8CAA8C;AAC9C,MAAM,kBAAkB,GAAG;IACzB,eAAe;IACf,kBAAkB;IAClB,mBAAmB;IACnB,wCAAwC;IACxC,yBAAyB;IACzB,uBAAuB;IACvB,8BAA8B;IAC9B,4BAA4B;IAC5B,uCAAuC;IACvC,qBAAqB;IACrB,oBAAoB;IACpB,gBAAgB;IAChB,aAAa;IACb,uBAAuB;IACvB,kBAAkB;IAClB,4BAA4B;CAC7B,CAAC;AAEF,mCAAmC;AACnC,MAAM,kBAAkB,GAAG;IACzB,4BAA4B;IAC5B,2BAA2B;IAC3B,uBAAuB;IACvB,6BAA6B;IAC7B,2BAA2B;IAC3B,4BAA4B;IAC5B,uBAAuB;CACxB,CAAC;AAEF;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,IAAmB;IACxD,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,MAAM,EAAE,CAAC;IAEhC,kDAAkD;IAClD,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC;;;;;;;;;;;;;;4BAcT,IAAI,CAAC,SAAS,CAAC,kBAAkB,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gCA4F9B,IAAI,CAAC,SAAS,CAAC,kBAAkB,CAAC;;;;;;;;;;;;;;GAc/D,CAAmE,CAAC;IAErE,IAAI,MAAM,EAAE,CAAC;QACX,IAAI,MAAM,CAAC,IAAI,KAAK,UAAU,IAAI,MAAM,CAAC,IAAI,EAAE,CAAC;YAC9C,MAAM,OAAO,GAAG,gBAAgB,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC;YACnD,IAAI,OAAO,EAAE,CAAC;gBACZ,OAAO;oBACL,IAAI,EAAE,aAAa;oBACnB,UAAU,EAAE,OAAO,CAAC,OAAO;oBAC3B,SAAS,EAAE,OAAO,CAAC,WAAW;iBAC/B,CAAC;YACJ,CAAC;YACD,OAAO,EAAE,IAAI,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAC;QAC/D,CAAC;QAED,IAAI,MAAM,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;YACjC,OAAO,EAAE,IAAI,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAC;QAC/D,CAAC;QAED,IAAI,MAAM,CAAC,IAAI,KAAK,WAAW,EAAE,CAAC;YAChC,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,YAAY,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAC;QAC9D,CAAC;IACH,CAAC;IAED,2DAA2D;IAC3D,MAAM,QAAQ,GAAG,oBAAoB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAChD,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,KAAK,GAAG,QAAQ,CAAC,CAAC,CAAE,CAAC;QAC3B,MAAM,WAAW,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC,CAAC;QAC/C,OAAO;YACL,IAAI,EAAE,aAAa;YACnB,UAAU,EAAE,GAAG,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,SAAS,KAAK,OAAO,CAAC,EAAE,KAAK,KAAK,SAAS,CAAC;YAC/E,SAAS,EAAE,WAAW;SACvB,CAAC;IACJ,CAAC;IAED,sCAAsC;IACtC,MAAM,SAAS,GAAG,eAAe,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;
IAC5C,IAAI,SAAS,EAAE,CAAC;QACd,OAAO;YACL,IAAI,EAAE,aAAa;YACnB,UAAU,EAAE,GAAG,CAAC,OAAO,CAAC,aAAa,EAAE,cAAc,CAAC;YACtD,SAAS,EAAE,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC;SACvC,CAAC;IACJ,CAAC;IAED,uCAAuC;IACvC,MAAM,WAAW,GAAG,+BAA+B,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC9D,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,KAAK,GAAG,WAAW,CAAC,CAAC,CAAE,CAAC;QAC9B,MAAM,aAAa,GAAG,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC,CAAC;QACpD,4BAA4B;QAC5B,MAAM,UAAU,GAAG,8BAA8B,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC5D,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7D,OAAO;YACL,IAAI,EAAE,aAAa;YACnB,UAAU,EAAE,GAAG,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,SAAS,KAAK,OAAO,CAAC,EAAE,KAAK,KAAK,SAAS,CAAC;YAC/E,SAAS,EAAE,aAAa;SACzB,CAAC;IACJ,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,IAAmB,EACnB,MAAwB,EACxB,WAAmB;IAEnB,IAAI,MAAM,CAAC,IAAI,KAAK,UAAU,EAAE,CAAC;QAC/B,OAAO,MAAM,CAAC,IAAI,KAAK,SAAS,IAAI,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC;IACvE,CAAC;IAED,IAAI,MAAM,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;QAClC,qEAAqE;QACrE,OAAO,IAAI,CAAC;IACd,CAAC;IAED,IAAI,MAAM,CAAC,IAAI,KAAK,YAAY,IAAI,MAAM,CAAC,IAAI,KAAK,WAAW,EAAE,CAAC;QAChE,IAAI,CAAC,MAAM,CAAC,YAAY;YAAE,OAAO,KAAK,CAAC;QACvC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC;;0CAEG,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,YAAY,CAAC;;;KAGxE,CAAY,CAAC;QACd,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,IAAI,MAAM,CAAC,IAAI,KAAK,iBAAiB,EAAE,CAAC;QACtC,OAAO,IAAI,CAAC,CAAC,6BAA6B;IAC5C,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,IAAmB,EACnB,MAAwB,EACxB,WAAmB;IAEnB,IAAI,CAAC;QACH,IAAI,MAAM,CAAC,IAAI,KAAK,YAAY,IAAI,MAAM,CAAC,IAAI,KAAK,WAAW,EAAE,CAAC;YAChE,IAAI,CAAC,MAAM,CAAC,YAAY;gBAAE,OAAO,KAAK,CAAC;YACvC,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,YAAY,CAAC,CAAC;YACtC,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,IAAI,IAAI,CAAC,CAAC;YAC1C,OAAO,IAAI,CAAC;QACd,CAAC;QAED,IAAI,MAAM,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;YAClC,MAAM,QAAQ,GAAG,WAAW,GAAG,CAAC,CAAC;YACjC,MAAM,GAAG,GAAG,cAAc,CAAC,MAAM,CAAC,UAAW,EAAE,QAAQ,CAAC
,CAAC;YACzD,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACrB,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,IAAI,IAAI,CAAC,CAAC;YAC1C,OAAO,IAAI,CAAC;QACd,CAAC;QAED,IAAI,MAAM,CAAC,IAAI,KAAK,UAAU,EAAE,CAAC;YAC/B,IAAI,CAAC,MAAM,CAAC,IAAI,IAAI,WAAW,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM;gBAAE,OAAO,KAAK,CAAC;YACpE,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,WAAW,CAAE,CAAC,CAAC;YAC3C,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,IAAI,IAAI,CAAC,CAAC;YAC1C,OAAO,IAAI,CAAC;QACd,CAAC;QAED,IAAI,MAAM,CAAC,IAAI,KAAK,iBAAiB,EAAE,CAAC;YACtC,MAAM,IAAI,CAAC,cAAc,EAAE,CAAC;YAC5B,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,IAAI,IAAI,CAAC,CAAC;YAC1C,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,OAAe,EAAE,IAAY;IAC1D,OAAO,OAAO,CAAC,OAAO,CAAC,QAAQ,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;AACjD,CAAC;AAED,qDAAqD;AAErD,SAAS,gBAAgB,CACvB,UAAkB,EAClB,OAAe;IAEf,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,CAAC;QACpC,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,CAAC;QAE9B,qBAAqB;QACrB,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;YAC7C,MAAM,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACnD,MAAM,OAAO,GAAG,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;YACpC,MAAM,UAAU,GAAG,YAAY,CAAC,CAAC,CAAC,QAAQ,CAAC,YAAY,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;YAEnE,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,IAAI,OAAO,KAAK,UAAU,GAAG,CAAC,EAAE,CAAC;gBACxE,MAAM,OAAO,GAAG,UAAU,CAAC,OAAO,CAChC,IAAI,MAAM,CAAC,SAAS,GAAG,IAAI,UAAU,EAAE,CAAC,EACxC,KAAK,GAAG,SAAS,CAClB,CAAC;gBACF,OAAO,EAAE,OAAO,EAAE,WAAW,EAAE,UAAU,EAAE,CAAC;YAC9C,CAAC;QACH,CAAC;QAED,6BAA6B;QAC7B,MAAM,YAAY,GAAG,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QAC5D,MAAM,SAAS,GAAG,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACtD,IAAI,YAAY,IAAI,SAAS,EAAE,CAAC;YAC9B,MAAM,WAAW,GAAG,QAAQ,CAAC,YAAY,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC,CAAC;YACnD,OAAO;gBACL,OAAO,EAAE,UAAU,CAAC,OAAO,CAAC,aAAa,EAAE,cAAc,CAAC;gBAC1D,WAAW;aACZ,CAAC;QACJ,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP
,eAAe;IACjB,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Zero-config auto pattern discovery via pure DOM analysis.
|
|
3
|
+
*
|
|
4
|
+
* Finds repeating DOM structures without any AI — uses tag+class frequency,
|
|
5
|
+
* child consistency scoring, and semantic field inference.
|
|
6
|
+
*/
|
|
7
|
+
import type { DetectedPattern, DetectOptions, ExtractedRecord } from './types.js';
|
|
8
|
+
/**
|
|
9
|
+
* Detect repeating patterns on the page.
|
|
10
|
+
*/
|
|
11
|
+
export declare function detect(html: string, options?: DetectOptions): DetectedPattern[];
|
|
12
|
+
/**
|
|
13
|
+
* Detect patterns and immediately extract the best one.
|
|
14
|
+
*/
|
|
15
|
+
export declare function detectAndExtract(html: string, options?: DetectOptions): ExtractedRecord[];
|
|
16
|
+
//# sourceMappingURL=pattern-detector.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pattern-detector.d.ts","sourceRoot":"","sources":["../../src/extraction/pattern-detector.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,KAAK,EAAE,eAAe,EAAE,aAAa,EAAE,eAAe,EAAa,MAAM,YAAY,CAAC;AAI7F;;GAEG;AACH,wBAAgB,MAAM,CACpB,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,aAAa,GACtB,eAAe,EAAE,CAoDnB;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAC9B,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,aAAa,GACtB,eAAe,EAAE,CAMnB"}
|