@olib-ai/owl-browser-sdk 2.0.5 → 2.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. package/README.md +107 -0
  2. package/dist/extraction/content-cleaner.d.ts +40 -0
  3. package/dist/extraction/content-cleaner.d.ts.map +1 -0
  4. package/dist/extraction/content-cleaner.js +393 -0
  5. package/dist/extraction/content-cleaner.js.map +1 -0
  6. package/dist/extraction/extractor.d.ts +139 -0
  7. package/dist/extraction/extractor.d.ts.map +1 -0
  8. package/dist/extraction/extractor.js +212 -0
  9. package/dist/extraction/extractor.js.map +1 -0
  10. package/dist/extraction/html-processor.d.ts +75 -0
  11. package/dist/extraction/html-processor.d.ts.map +1 -0
  12. package/dist/extraction/html-processor.js +192 -0
  13. package/dist/extraction/html-processor.js.map +1 -0
  14. package/dist/extraction/index.d.ts +14 -0
  15. package/dist/extraction/index.d.ts.map +1 -0
  16. package/dist/extraction/index.js +19 -0
  17. package/dist/extraction/index.js.map +1 -0
  18. package/dist/extraction/list-extractor.d.ts +24 -0
  19. package/dist/extraction/list-extractor.d.ts.map +1 -0
  20. package/dist/extraction/list-extractor.js +303 -0
  21. package/dist/extraction/list-extractor.js.map +1 -0
  22. package/dist/extraction/meta-extractor.d.ts +40 -0
  23. package/dist/extraction/meta-extractor.d.ts.map +1 -0
  24. package/dist/extraction/meta-extractor.js +216 -0
  25. package/dist/extraction/meta-extractor.js.map +1 -0
  26. package/dist/extraction/pagination.d.ts +29 -0
  27. package/dist/extraction/pagination.d.ts.map +1 -0
  28. package/dist/extraction/pagination.js +323 -0
  29. package/dist/extraction/pagination.js.map +1 -0
  30. package/dist/extraction/pattern-detector.d.ts +16 -0
  31. package/dist/extraction/pattern-detector.d.ts.map +1 -0
  32. package/dist/extraction/pattern-detector.js +390 -0
  33. package/dist/extraction/pattern-detector.js.map +1 -0
  34. package/dist/extraction/scrape-session.d.ts +23 -0
  35. package/dist/extraction/scrape-session.d.ts.map +1 -0
  36. package/dist/extraction/scrape-session.js +192 -0
  37. package/dist/extraction/scrape-session.js.map +1 -0
  38. package/dist/extraction/selector-engine.d.ts +23 -0
  39. package/dist/extraction/selector-engine.d.ts.map +1 -0
  40. package/dist/extraction/selector-engine.js +127 -0
  41. package/dist/extraction/selector-engine.js.map +1 -0
  42. package/dist/extraction/table-extractor.d.ts +29 -0
  43. package/dist/extraction/table-extractor.d.ts.map +1 -0
  44. package/dist/extraction/table-extractor.js +282 -0
  45. package/dist/extraction/table-extractor.js.map +1 -0
  46. package/dist/extraction/transforms.d.ts +47 -0
  47. package/dist/extraction/transforms.d.ts.map +1 -0
  48. package/dist/extraction/transforms.js +277 -0
  49. package/dist/extraction/transforms.js.map +1 -0
  50. package/dist/extraction/types.d.ts +199 -0
  51. package/dist/extraction/types.d.ts.map +1 -0
  52. package/dist/extraction/types.js +5 -0
  53. package/dist/extraction/types.js.map +1 -0
  54. package/dist/index.d.ts +1 -0
  55. package/dist/index.d.ts.map +1 -1
  56. package/dist/index.js +2 -0
  57. package/dist/index.js.map +1 -1
  58. package/dist/playwright/browser-type.d.ts +101 -0
  59. package/dist/playwright/browser-type.d.ts.map +1 -0
  60. package/dist/playwright/browser-type.js +134 -0
  61. package/dist/playwright/browser-type.js.map +1 -0
  62. package/dist/playwright/browser.d.ts +98 -0
  63. package/dist/playwright/browser.d.ts.map +1 -0
  64. package/dist/playwright/browser.js +229 -0
  65. package/dist/playwright/browser.js.map +1 -0
  66. package/dist/playwright/context.d.ts +217 -0
  67. package/dist/playwright/context.d.ts.map +1 -0
  68. package/dist/playwright/context.js +518 -0
  69. package/dist/playwright/context.js.map +1 -0
  70. package/dist/playwright/extractor.d.ts +108 -0
  71. package/dist/playwright/extractor.d.ts.map +1 -0
  72. package/dist/playwright/extractor.js +404 -0
  73. package/dist/playwright/extractor.js.map +1 -0
  74. package/dist/playwright/frame.d.ts +147 -0
  75. package/dist/playwright/frame.d.ts.map +1 -0
  76. package/dist/playwright/frame.js +492 -0
  77. package/dist/playwright/frame.js.map +1 -0
  78. package/dist/playwright/index.d.ts +163 -0
  79. package/dist/playwright/index.d.ts.map +1 -0
  80. package/dist/playwright/index.js +313 -0
  81. package/dist/playwright/index.js.map +1 -0
  82. package/dist/playwright/keyboard.d.ts +74 -0
  83. package/dist/playwright/keyboard.d.ts.map +1 -0
  84. package/dist/playwright/keyboard.js +187 -0
  85. package/dist/playwright/keyboard.js.map +1 -0
  86. package/dist/playwright/locator.d.ts +237 -0
  87. package/dist/playwright/locator.d.ts.map +1 -0
  88. package/dist/playwright/locator.js +667 -0
  89. package/dist/playwright/locator.js.map +1 -0
  90. package/dist/playwright/mouse.d.ts +82 -0
  91. package/dist/playwright/mouse.d.ts.map +1 -0
  92. package/dist/playwright/mouse.js +137 -0
  93. package/dist/playwright/mouse.js.map +1 -0
  94. package/dist/playwright/page-helpers.d.ts +267 -0
  95. package/dist/playwright/page-helpers.d.ts.map +1 -0
  96. package/dist/playwright/page-helpers.js +449 -0
  97. package/dist/playwright/page-helpers.js.map +1 -0
  98. package/dist/playwright/page.d.ts +605 -0
  99. package/dist/playwright/page.d.ts.map +1 -0
  100. package/dist/playwright/page.js +1698 -0
  101. package/dist/playwright/page.js.map +1 -0
  102. package/dist/playwright/response.d.ts +100 -0
  103. package/dist/playwright/response.d.ts.map +1 -0
  104. package/dist/playwright/response.js +194 -0
  105. package/dist/playwright/response.js.map +1 -0
  106. package/dist/playwright/types.d.ts +354 -0
  107. package/dist/playwright/types.d.ts.map +1 -0
  108. package/dist/playwright/types.js +8 -0
  109. package/dist/playwright/types.js.map +1 -0
  110. package/openapi.json +327 -35
  111. package/package.json +10 -1
@@ -0,0 +1,29 @@
1
+ /**
2
+ * Pagination detection and navigation.
3
+ *
4
+ * Detects pagination type from DOM heuristics: next links, page numbers,
5
+ * scroll markers, load-more buttons, rel="next".
6
+ */
7
+ import type { PaginationConfig } from './types.js';
8
+ import type { HTMLProcessor } from './html-processor.js';
9
+ /**
10
+ * Auto-detect pagination type from the current page DOM.
11
+ *
12
+ * Uses a single browser evaluate() call to check all DOM-based patterns
13
+ * (rel="next", known selectors, text-based detection, load-more) at once,
14
+ * reducing round-trips from ~20 to 1. When the DOM yields nothing, the current URL is checked for page=/p=, /page/N and offset-style parameters; resolves to null if nothing matches.
15
+ */
16
+ export declare function detectPagination(proc: HTMLProcessor): Promise<PaginationConfig | null>;
17
+ /**
18
+ * Check if there's a next page available for the given pagination config (`currentPage` is only consulted for `url-list` configs).
19
+ */
20
+ export declare function hasNextPage(proc: HTMLProcessor, config: PaginationConfig, currentPage: number): Promise<boolean>;
21
+ /**
22
+ * Navigate to the next page. Resolves to `false` when the config cannot be acted on or the navigation throws (errors are swallowed, not rethrown).
23
+ */
24
+ export declare function goToNextPage(proc: HTMLProcessor, config: PaginationConfig, currentPage: number): Promise<boolean>;
25
+ /**
26
+ * Resolve a URL pattern with a page number by substituting the `{page}` placeholder.
27
+ */
28
+ export declare function resolvePageUrl(pattern: string, page: number): string;
29
+ //# sourceMappingURL=pagination.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pagination.d.ts","sourceRoot":"","sources":["../../src/extraction/pagination.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AACnD,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAiCzD;;;;;;GAMG;AACH,wBAAsB,gBAAgB,CAAC,IAAI,EAAE,aAAa,GAAG,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC,CA0L5F;AAED;;GAEG;AACH,wBAAsB,WAAW,CAC/B,IAAI,EAAE,aAAa,EACnB,MAAM,EAAE,gBAAgB,EACxB,WAAW,EAAE,MAAM,GAClB,OAAO,CAAC,OAAO,CAAC,CA0BlB;AAED;;GAEG;AACH,wBAAsB,YAAY,CAChC,IAAI,EAAE,aAAa,EACnB,MAAM,EAAE,gBAAgB,EACxB,WAAW,EAAE,MAAM,GAClB,OAAO,CAAC,OAAO,CAAC,CAkClB;AAED;;GAEG;AACH,wBAAgB,cAAc,CAAC,OAAO,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
@@ -0,0 +1,323 @@
1
+ /**
2
+ * Pagination detection and navigation.
3
+ *
4
+ * Detects pagination type from DOM heuristics: next links, page numbers,
5
+ * scroll markers, load-more buttons, rel="next".
6
+ */
7
// CSS selectors that commonly identify a "Next" link or button.
// They are probed in array order and the first visible match wins, so the
// order below is significant and must not be rearranged.
const NEXT_LINK_PATTERNS = [
    // rel="next" relations
    'a[rel="next"]',
    'link[rel="next"]',
    // class-name hints
    '[class*="next"] a',
    '[class*="pagination"] a[class*="next"]',
    // aria-label hints (case-insensitive form first, then a case-sensitive form)
    'a[aria-label*="next" i]',
    'a[aria-label*="Next"]',
    'button[aria-label*="next" i]',
    'button[aria-label*="Next"]',
    // conventional pagination / pager markup
    'nav[class*="pagination"] a:last-child',
    '.pagination .next a',
    '.pagination a.next',
    '.pager .next a',
    // explicit "next-page" classes
    'a.next-page',
    'a[class*="next-page"]',
    'button.next-page',
    'button[class*="next-page"]',
];
26
// CSS selectors that commonly identify a "Load more" / "Show more" control.
// Probed in array order; the first visible match wins.
const LOAD_MORE_PATTERNS = [
    // "load-more" / "loadmore" class or data hints
    'button[class*="load-more"]',
    'button[class*="loadmore"]',
    'a[class*="load-more"]',
    '[class*="load-more"] button',
    '[data-action="load-more"]',
    // "show-more" class hints
    'button[class*="show-more"]',
    'a[class*="show-more"]',
];
36
/**
 * Auto-detect pagination type from the current page DOM.
 *
 * Uses a single browser evaluate() call to check all DOM-based patterns
 * (rel="next", known selectors, text-based detection, load-more) at once,
 * reducing round-trips from ~20 to 1. If the DOM yields nothing usable, the
 * current URL is inspected for page=/p=, /page/N and offset-style parameters.
 *
 * @param proc Processor bound to the current page; this function calls its
 *   `getUrl()` and `evaluate(script)` methods.
 * @returns A pagination config (`url-pattern`, `click-next` or `load-more`),
 *   or `null` when no pagination could be detected.
 */
export async function detectPagination(proc) {
    const url = await proc.getUrl();
    // Single JS call that checks all patterns at once. The script is plain
    // ES5 because it is shipped to the page as source text.
    const result = await proc.evaluate(`
    (function() {
      var result = { type: null, selector: null, href: null };

      // Escape a value for use as a CSS identifier (id or class name).
      // Without this, names such as "md:flex" or "2col" yield invalid selectors.
      function escIdent(value) {
        return (window.CSS && typeof window.CSS.escape === 'function')
          ? window.CSS.escape(value)
          : value;
      }

      // 1-based position of el among ALL element children of its parent,
      // which is what :nth-child() counts.
      function childIndex(el) {
        return Array.prototype.indexOf.call(el.parentElement.children, el) + 1;
      }

      // Build a selector for a detected "next" control.
      function buildSelector(el) {
        var tag = el.tagName.toLowerCase();
        if (el.id) {
          return tag + '#' + escIdent(el.id);
        }
        var label = el.getAttribute('aria-label');
        if (label) {
          return tag + '[aria-label="' + label.replace(/"/g, '\\\\"') + '"]';
        }
        var parent = el.parentElement;
        var cls = Array.from(el.classList).slice(0, 3).map(escIdent).join('.');
        if (cls) {
          var sel = tag + '.' + cls;
          try {
            // Disambiguate by position only when the class selector is not unique
            if (document.querySelectorAll(sel).length > 1 && parent) {
              sel = sel + ':nth-child(' + childIndex(el) + ')';
            }
            return sel;
          } catch (e) {
            // Class names not expressible as a selector: use the positional form below
          }
        }
        if (!parent) {
          return tag;
        }
        var parentSel;
        if (parent.id) {
          parentSel = '#' + escIdent(parent.id);
        } else if (parent.classList.length > 0) {
          parentSel = parent.tagName.toLowerCase() + '.' +
            Array.from(parent.classList).slice(0, 2).map(escIdent).join('.');
        } else {
          parentSel = parent.tagName.toLowerCase();
        }
        return parentSel + ' > ' + tag + ':nth-child(' + childIndex(el) + ')';
      }

      // 1. rel="next": remember the advertised href. Only an <a> can be
      //    clicked, so a lone <link rel="next"> contributes its href but
      //    never a click selector.
      var relNext = document.querySelector('a[rel="next"], link[rel="next"]');
      if (relNext) {
        result.href = relNext.href || null;
      }
      if (document.querySelector('a[rel="next"]')) {
        result.type = 'rel-next';
        result.selector = 'a[rel="next"]';
        return result;
      }

      // 2. Check known next-button selectors (first visible match wins)
      var nextSelectors = ${JSON.stringify(NEXT_LINK_PATTERNS)};
      for (var i = 0; i < nextSelectors.length; i++) {
        try {
          var known = document.querySelector(nextSelectors[i]);
          if (known && known.offsetParent !== null) {
            result.type = 'click-next';
            result.selector = nextSelectors[i];
            return result;
          }
        } catch(e) {}
      }

      // 3. Text-based "next" detection in pagination containers (with i18n)
      // NOTE(review): the last entry is kept from the original list but does not
      // look like a word for "next"; Korean "next" (\\ub2e4\\uc74c) is added before it.
      var nextTexts = ['next', 'siguiente', 'suivant', 'weiter', 'volgende', 'avanti', '\\u0434\\u0430\\u043b\\u0435\\u0435', '\\u6b21\\u3078', '\\u4e0b\\u4e00\\u9875', '\\ub2e4\\uc74c', '\\u45e4\\u44eb'];
      var arrowTexts = ['>', '>>', '\\u203a', '\\u2192', '\\u276f'];
      var containers = document.querySelectorAll(
        'nav, [class*="pagination"], [class*="pager"], [role="navigation"]'
      );
      for (var c = 0; c < containers.length; c++) {
        var clickables = containers[c].querySelectorAll('a, button');
        for (var j = 0; j < clickables.length; j++) {
          var el = clickables[j];
          if (el.disabled) continue;
          var text = (el.textContent || '').trim().toLowerCase();
          var ariaLabel = (el.getAttribute('aria-label') || '').toLowerCase();

          var isNext = false;
          for (var k = 0; k < nextTexts.length; k++) {
            if (text.indexOf(nextTexts[k]) === 0 || ariaLabel.indexOf(nextTexts[k]) === 0) {
              isNext = true;
              break;
            }
          }
          if (!isNext) {
            for (var a = 0; a < arrowTexts.length; a++) {
              if (text === arrowTexts[a]) { isNext = true; break; }
            }
          }

          if (isNext) {
            result.type = 'click-next';
            result.selector = buildSelector(el);
            return result;
          }
        }
      }

      // 4. Load-more buttons
      var loadMoreSelectors = ${JSON.stringify(LOAD_MORE_PATTERNS)};
      for (var m = 0; m < loadMoreSelectors.length; m++) {
        try {
          var more = document.querySelector(loadMoreSelectors[m]);
          if (more && more.offsetParent !== null) {
            result.type = 'load-more';
            result.selector = loadMoreSelectors[m];
            return result;
          }
        } catch(e) {}
      }

      // Nothing clickable; still report a <link rel="next"> href if one exists
      return result.href ? result : null;
    })()
  `);
    if (result) {
        // A rel="next" href takes priority: try to derive a URL pattern from it.
        if (result.href) {
            const pattern = detectUrlPattern(url, result.href);
            if (pattern) {
                return {
                    type: 'url-pattern',
                    urlPattern: pattern.pattern,
                    startPage: pattern.currentPage,
                };
            }
        }
        // No derivable pattern: click the rel="next" anchor instead.
        if (result.type === 'rel-next' && result.href) {
            return { type: 'click-next', nextSelector: result.selector };
        }
        if (result.type === 'click-next') {
            return { type: 'click-next', nextSelector: result.selector };
        }
        if (result.type === 'load-more') {
            return { type: 'load-more', nextSelector: result.selector };
        }
    }
    // 5. Check for URL-based pagination (page= in current URL)
    const urlMatch = /[?&](page|p)=(\d+)/.exec(url);
    if (urlMatch) {
        const param = urlMatch[1];
        const currentPage = parseInt(urlMatch[2], 10);
        return {
            type: 'url-pattern',
            // param comes from the fixed alternation above, so it is regex-safe
            urlPattern: url.replace(new RegExp(`([?&])${param}=\\d+`), `$1${param}={page}`),
            startPage: currentPage,
        };
    }
    // 6. Check for /page/N pattern in URL
    const pathMatch = /\/page\/(\d+)/.exec(url);
    if (pathMatch) {
        return {
            type: 'url-pattern',
            urlPattern: url.replace(/\/page\/\d+/, '/page/{page}'),
            startPage: parseInt(pathMatch[1], 10),
        };
    }
    // 7. Check for offset-based pagination
    const offsetMatch = /[?&](offset|start|skip)=(\d+)/.exec(url);
    if (offsetMatch) {
        const param = offsetMatch[1];
        const currentOffset = parseInt(offsetMatch[2], 10);
        // NOTE(review): the placeholder is filled with "current + 1" by
        // goToNextPage in this file, so the offset advances by 1 rather than
        // by the page size (limit/count/size). Fixing that needs a step field
        // on the pagination config — confirm against the config type.
        return {
            type: 'url-pattern',
            urlPattern: url.replace(new RegExp(`([?&])${param}=\\d+`), `$1${param}={page}`),
            startPage: currentOffset,
        };
    }
    return null;
}
222
/**
 * Check if there's a next page available.
 *
 * @param proc Processor bound to the current page; only `evaluate(script)`
 *   is used, and only for `click-next` / `load-more` configs.
 * @param config Pagination config; its `type` selects the strategy.
 * @param currentPage Current page counter; only consulted for `url-list`,
 *   where it is compared against `config.urls.length`.
 * @returns Always a real boolean (never `null`/`undefined`).
 */
export async function hasNextPage(proc, config, currentPage) {
    switch (config.type) {
        case 'url-list':
            // Array.isArray also rejects a null `urls`, which used to throw
            return Array.isArray(config.urls) && currentPage < config.urls.length;
        case 'url-pattern':
            // We can always try the next page — will be validated by the scraper
            return true;
        case 'click-next':
        case 'load-more': {
            if (!config.nextSelector) {
                return false;
            }
            // The control must exist, be rendered (offsetParent is null for
            // display:none subtrees) and not be disabled. An invalid selector
            // makes querySelector throw; that counts as "no next page".
            const exists = await proc.evaluate(`
      (function() {
        try {
          var el = document.querySelector(${JSON.stringify(config.nextSelector)});
          return !!el && el.offsetParent !== null && !el.disabled;
        } catch (e) {
          return false;
        }
      })()
    `);
            // Coerce: the evaluate bridge may hand back null/undefined
            return Boolean(exists);
        }
        case 'infinite-scroll':
            return true; // Handled by scroll behavior
        default:
            return false;
    }
}
249
/**
 * Navigate to the next page.
 *
 * Dispatches on `config.type`: clicks the next/load-more control, loads the
 * URL for page `currentPage + 1`, loads `config.urls[currentPage]`, or
 * scrolls to the bottom. After acting it waits `config.waitAfter` ms
 * (default 1000, or 1500 for infinite scroll).
 *
 * @returns `true` once the action and wait have completed; `false` when the
 *   config cannot be acted on or any step throws (errors are swallowed).
 */
export async function goToNextPage(proc, config, currentPage) {
    // Pause after the action, falling back to the per-strategy default
    const settle = (fallbackMs) => proc.wait(config.waitAfter ?? fallbackMs);
    try {
        switch (config.type) {
            case 'click-next':
            case 'load-more': {
                if (!config.nextSelector) {
                    return false;
                }
                await proc.click(config.nextSelector);
                await settle(1000);
                return true;
            }
            case 'url-pattern': {
                const target = resolvePageUrl(config.urlPattern, currentPage + 1);
                await proc.goto(target);
                await settle(1000);
                return true;
            }
            case 'url-list': {
                const list = config.urls;
                if (!list || currentPage >= list.length) {
                    return false;
                }
                await proc.goto(list[currentPage]);
                await settle(1000);
                return true;
            }
            case 'infinite-scroll': {
                await proc.scrollToBottom();
                await settle(1500);
                return true;
            }
            default:
                return false;
        }
    }
    catch {
        // Best effort: any failure is reported as "could not advance"
        return false;
    }
}
286
/**
 * Resolve a URL pattern with a page number.
 *
 * Every `{page}` placeholder in the pattern is substituted, so a pattern
 * that repeats it (e.g. in both path and query) resolves consistently.
 * A pattern without the placeholder is returned unchanged.
 *
 * @param pattern URL template containing `{page}`.
 * @param page Page number to insert.
 * @returns The pattern with each `{page}` replaced by `String(page)`.
 */
export function resolvePageUrl(pattern, page) {
    // split/join replaces all occurrences without regex or "$" special-casing
    return pattern.split('{page}').join(String(page));
}
292
+ // ==================== Internal ====================
293
/**
 * Derive a `{page}` URL template by comparing the current URL with the URL
 * of the next page.
 *
 * Looks first for a query parameter whose integer value in `nextUrl` is
 * exactly one more than in `currentUrl`, then for a `/page/N` path segment
 * present in both URLs.
 *
 * @param currentUrl Absolute URL of the current page.
 * @param nextUrl Absolute URL of the next page.
 * @returns `{ pattern, currentPage }`, where `pattern` is `currentUrl` with
 *   the page number replaced by `{page}`; `null` when no pattern is found or
 *   either URL cannot be parsed.
 */
function detectUrlPattern(currentUrl, nextUrl) {
    // Query keys are arbitrary text (e.g. "p[page]"); escape before building a RegExp
    const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    try {
        const current = new URL(currentUrl);
        const next = new URL(nextUrl);
        // Check query params
        for (const [key, value] of next.searchParams) {
            const currentValue = current.searchParams.get(key);
            const nextNum = parseInt(value, 10);
            const currentNum = currentValue ? parseInt(currentValue, 10) : NaN;
            if (isNaN(nextNum) || isNaN(currentNum) || nextNum !== currentNum + 1) {
                continue;
            }
            // "0*" tolerates zero-padded values such as page=01
            const matcher = new RegExp(`([?&]${escapeRegExp(key)}=)0*${currentNum}`);
            // A replacer function keeps "$" in the key from being read as a replacement token
            const pattern = currentUrl.replace(matcher, (_match, prefix) => `${prefix}{page}`);
            // Only accept the result if a placeholder was actually inserted;
            // a template without {page} would resolve to the same URL forever
            if (pattern !== currentUrl) {
                return { pattern, currentPage: currentNum };
            }
        }
        // Check path pattern /page/N
        const currentMatch = /\/page\/(\d+)/.exec(current.pathname);
        const nextMatch = /\/page\/(\d+)/.exec(next.pathname);
        if (currentMatch && nextMatch) {
            const currentPage = parseInt(currentMatch[1], 10);
            return {
                pattern: currentUrl.replace(/\/page\/\d+/, '/page/{page}'),
                currentPage,
            };
        }
    }
    catch {
        // Invalid URLs
    }
    return null;
}
323
+ //# sourceMappingURL=pagination.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pagination.js","sourceRoot":"","sources":["../../src/extraction/pagination.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAKH,8CAA8C;AAC9C,MAAM,kBAAkB,GAAG;IACzB,eAAe;IACf,kBAAkB;IAClB,mBAAmB;IACnB,wCAAwC;IACxC,yBAAyB;IACzB,uBAAuB;IACvB,8BAA8B;IAC9B,4BAA4B;IAC5B,uCAAuC;IACvC,qBAAqB;IACrB,oBAAoB;IACpB,gBAAgB;IAChB,aAAa;IACb,uBAAuB;IACvB,kBAAkB;IAClB,4BAA4B;CAC7B,CAAC;AAEF,mCAAmC;AACnC,MAAM,kBAAkB,GAAG;IACzB,4BAA4B;IAC5B,2BAA2B;IAC3B,uBAAuB;IACvB,6BAA6B;IAC7B,2BAA2B;IAC3B,4BAA4B;IAC5B,uBAAuB;CACxB,CAAC;AAEF;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,IAAmB;IACxD,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,MAAM,EAAE,CAAC;IAEhC,kDAAkD;IAClD,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC;;;;;;;;;;;;;;4BAcT,IAAI,CAAC,SAAS,CAAC,kBAAkB,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gCA4F9B,IAAI,CAAC,SAAS,CAAC,kBAAkB,CAAC;;;;;;;;;;;;;;GAc/D,CAAmE,CAAC;IAErE,IAAI,MAAM,EAAE,CAAC;QACX,IAAI,MAAM,CAAC,IAAI,KAAK,UAAU,IAAI,MAAM,CAAC,IAAI,EAAE,CAAC;YAC9C,MAAM,OAAO,GAAG,gBAAgB,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC;YACnD,IAAI,OAAO,EAAE,CAAC;gBACZ,OAAO;oBACL,IAAI,EAAE,aAAa;oBACnB,UAAU,EAAE,OAAO,CAAC,OAAO;oBAC3B,SAAS,EAAE,OAAO,CAAC,WAAW;iBAC/B,CAAC;YACJ,CAAC;YACD,OAAO,EAAE,IAAI,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAC;QAC/D,CAAC;QAED,IAAI,MAAM,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;YACjC,OAAO,EAAE,IAAI,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAC;QAC/D,CAAC;QAED,IAAI,MAAM,CAAC,IAAI,KAAK,WAAW,EAAE,CAAC;YAChC,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,YAAY,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAC;QAC9D,CAAC;IACH,CAAC;IAED,2DAA2D;IAC3D,MAAM,QAAQ,GAAG,oBAAoB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAChD,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,KAAK,GAAG,QAAQ,CAAC,CAAC,CAAE,CAAC;QAC3B,MAAM,WAAW,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC,CAAC;QAC/C,OAAO;YACL,IAAI,EAAE,aAAa;YACnB,UAAU,EAAE,GAAG,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,SAAS,KAAK,OAAO,CAAC,EAAE,KAAK,KAAK,SAAS,CAAC;YAC/E,SAAS,EAAE,WAAW;SACvB,CAAC;IACJ,CAAC;IAED,sCAAsC;IACtC,MAAM,SAAS,GAAG,eAAe,CAAC,IAAI,CAAC,GAAG,CAAC,CAA
C;IAC5C,IAAI,SAAS,EAAE,CAAC;QACd,OAAO;YACL,IAAI,EAAE,aAAa;YACnB,UAAU,EAAE,GAAG,CAAC,OAAO,CAAC,aAAa,EAAE,cAAc,CAAC;YACtD,SAAS,EAAE,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC;SACvC,CAAC;IACJ,CAAC;IAED,uCAAuC;IACvC,MAAM,WAAW,GAAG,+BAA+B,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC9D,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,KAAK,GAAG,WAAW,CAAC,CAAC,CAAE,CAAC;QAC9B,MAAM,aAAa,GAAG,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC,CAAC;QACpD,4BAA4B;QAC5B,MAAM,UAAU,GAAG,8BAA8B,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC5D,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7D,OAAO;YACL,IAAI,EAAE,aAAa;YACnB,UAAU,EAAE,GAAG,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,SAAS,KAAK,OAAO,CAAC,EAAE,KAAK,KAAK,SAAS,CAAC;YAC/E,SAAS,EAAE,aAAa;SACzB,CAAC;IACJ,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,IAAmB,EACnB,MAAwB,EACxB,WAAmB;IAEnB,IAAI,MAAM,CAAC,IAAI,KAAK,UAAU,EAAE,CAAC;QAC/B,OAAO,MAAM,CAAC,IAAI,KAAK,SAAS,IAAI,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC;IACvE,CAAC;IAED,IAAI,MAAM,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;QAClC,qEAAqE;QACrE,OAAO,IAAI,CAAC;IACd,CAAC;IAED,IAAI,MAAM,CAAC,IAAI,KAAK,YAAY,IAAI,MAAM,CAAC,IAAI,KAAK,WAAW,EAAE,CAAC;QAChE,IAAI,CAAC,MAAM,CAAC,YAAY;YAAE,OAAO,KAAK,CAAC;QACvC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC;;0CAEG,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,YAAY,CAAC;;;KAGxE,CAAY,CAAC;QACd,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,IAAI,MAAM,CAAC,IAAI,KAAK,iBAAiB,EAAE,CAAC;QACtC,OAAO,IAAI,CAAC,CAAC,6BAA6B;IAC5C,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,IAAmB,EACnB,MAAwB,EACxB,WAAmB;IAEnB,IAAI,CAAC;QACH,IAAI,MAAM,CAAC,IAAI,KAAK,YAAY,IAAI,MAAM,CAAC,IAAI,KAAK,WAAW,EAAE,CAAC;YAChE,IAAI,CAAC,MAAM,CAAC,YAAY;gBAAE,OAAO,KAAK,CAAC;YACvC,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,YAAY,CAAC,CAAC;YACtC,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,IAAI,IAAI,CAAC,CAAC;YAC1C,OAAO,IAAI,CAAC;QACd,CAAC;QAED,IAAI,MAAM,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;YAClC,MAAM,QAAQ,GAAG,WAAW,GAAG,CAAC,CAAC;YACjC,MAAM,GAAG,GAAG,cAAc,CAAC,MAAM,CAAC,UAAW,EAAE,QAAQ,CA
AC,CAAC;YACzD,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACrB,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,IAAI,IAAI,CAAC,CAAC;YAC1C,OAAO,IAAI,CAAC;QACd,CAAC;QAED,IAAI,MAAM,CAAC,IAAI,KAAK,UAAU,EAAE,CAAC;YAC/B,IAAI,CAAC,MAAM,CAAC,IAAI,IAAI,WAAW,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM;gBAAE,OAAO,KAAK,CAAC;YACpE,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,WAAW,CAAE,CAAC,CAAC;YAC3C,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,IAAI,IAAI,CAAC,CAAC;YAC1C,OAAO,IAAI,CAAC;QACd,CAAC;QAED,IAAI,MAAM,CAAC,IAAI,KAAK,iBAAiB,EAAE,CAAC;YACtC,MAAM,IAAI,CAAC,cAAc,EAAE,CAAC;YAC5B,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,IAAI,IAAI,CAAC,CAAC;YAC1C,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,OAAe,EAAE,IAAY;IAC1D,OAAO,OAAO,CAAC,OAAO,CAAC,QAAQ,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;AACjD,CAAC;AAED,qDAAqD;AAErD,SAAS,gBAAgB,CACvB,UAAkB,EAClB,OAAe;IAEf,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,CAAC;QACpC,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,CAAC;QAE9B,qBAAqB;QACrB,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;YAC7C,MAAM,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACnD,MAAM,OAAO,GAAG,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;YACpC,MAAM,UAAU,GAAG,YAAY,CAAC,CAAC,CAAC,QAAQ,CAAC,YAAY,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;YAEnE,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,IAAI,OAAO,KAAK,UAAU,GAAG,CAAC,EAAE,CAAC;gBACxE,MAAM,OAAO,GAAG,UAAU,CAAC,OAAO,CAChC,IAAI,MAAM,CAAC,SAAS,GAAG,IAAI,UAAU,EAAE,CAAC,EACxC,KAAK,GAAG,SAAS,CAClB,CAAC;gBACF,OAAO,EAAE,OAAO,EAAE,WAAW,EAAE,UAAU,EAAE,CAAC;YAC9C,CAAC;QACH,CAAC;QAED,6BAA6B;QAC7B,MAAM,YAAY,GAAG,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QAC5D,MAAM,SAAS,GAAG,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACtD,IAAI,YAAY,IAAI,SAAS,EAAE,CAAC;YAC9B,MAAM,WAAW,GAAG,QAAQ,CAAC,YAAY,CAAC,CAAC,CAAE,EAAE,EAAE,CAAC,CAAC;YACnD,OAAO;gBACL,OAAO,EAAE,UAAU,CAAC,OAAO,CAAC,aAAa,EAAE,cAAc,CAAC;gBAC1D,WAAW;aACZ,CAAC;QACJ,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QA
CP,eAAe;IACjB,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC"}
@@ -0,0 +1,16 @@
1
+ /**
2
+ * Zero-config auto pattern discovery via pure DOM analysis.
3
+ *
4
+ * Finds repeating DOM structures without any AI — uses tag+class frequency,
5
+ * child consistency scoring, and semantic field inference.
6
+ */
7
+ import type { DetectedPattern, DetectOptions, ExtractedRecord } from './types.js';
8
+ /**
9
+ * Detect repeating patterns in the given page HTML; `options` may be omitted. Returns the detected patterns as an array.
10
+ */
11
+ export declare function detect(html: string, options?: DetectOptions): DetectedPattern[];
12
+ /**
13
+ * Detect patterns in the given page HTML and immediately extract the best one, returning the extracted records; `options` may be omitted.
14
+ */
15
+ export declare function detectAndExtract(html: string, options?: DetectOptions): ExtractedRecord[];
16
+ //# sourceMappingURL=pattern-detector.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pattern-detector.d.ts","sourceRoot":"","sources":["../../src/extraction/pattern-detector.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,KAAK,EAAE,eAAe,EAAE,aAAa,EAAE,eAAe,EAAa,MAAM,YAAY,CAAC;AAI7F;;GAEG;AACH,wBAAgB,MAAM,CACpB,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,aAAa,GACtB,eAAe,EAAE,CAoDnB;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAC9B,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,aAAa,GACtB,eAAe,EAAE,CAMnB"}