@olib-ai/owl-browser-sdk 2.0.5 → 2.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +107 -0
- package/dist/extraction/content-cleaner.d.ts +40 -0
- package/dist/extraction/content-cleaner.d.ts.map +1 -0
- package/dist/extraction/content-cleaner.js +393 -0
- package/dist/extraction/content-cleaner.js.map +1 -0
- package/dist/extraction/extractor.d.ts +139 -0
- package/dist/extraction/extractor.d.ts.map +1 -0
- package/dist/extraction/extractor.js +212 -0
- package/dist/extraction/extractor.js.map +1 -0
- package/dist/extraction/html-processor.d.ts +75 -0
- package/dist/extraction/html-processor.d.ts.map +1 -0
- package/dist/extraction/html-processor.js +192 -0
- package/dist/extraction/html-processor.js.map +1 -0
- package/dist/extraction/index.d.ts +14 -0
- package/dist/extraction/index.d.ts.map +1 -0
- package/dist/extraction/index.js +19 -0
- package/dist/extraction/index.js.map +1 -0
- package/dist/extraction/list-extractor.d.ts +24 -0
- package/dist/extraction/list-extractor.d.ts.map +1 -0
- package/dist/extraction/list-extractor.js +303 -0
- package/dist/extraction/list-extractor.js.map +1 -0
- package/dist/extraction/meta-extractor.d.ts +40 -0
- package/dist/extraction/meta-extractor.d.ts.map +1 -0
- package/dist/extraction/meta-extractor.js +216 -0
- package/dist/extraction/meta-extractor.js.map +1 -0
- package/dist/extraction/pagination.d.ts +29 -0
- package/dist/extraction/pagination.d.ts.map +1 -0
- package/dist/extraction/pagination.js +323 -0
- package/dist/extraction/pagination.js.map +1 -0
- package/dist/extraction/pattern-detector.d.ts +16 -0
- package/dist/extraction/pattern-detector.d.ts.map +1 -0
- package/dist/extraction/pattern-detector.js +390 -0
- package/dist/extraction/pattern-detector.js.map +1 -0
- package/dist/extraction/scrape-session.d.ts +23 -0
- package/dist/extraction/scrape-session.d.ts.map +1 -0
- package/dist/extraction/scrape-session.js +192 -0
- package/dist/extraction/scrape-session.js.map +1 -0
- package/dist/extraction/selector-engine.d.ts +23 -0
- package/dist/extraction/selector-engine.d.ts.map +1 -0
- package/dist/extraction/selector-engine.js +127 -0
- package/dist/extraction/selector-engine.js.map +1 -0
- package/dist/extraction/table-extractor.d.ts +29 -0
- package/dist/extraction/table-extractor.d.ts.map +1 -0
- package/dist/extraction/table-extractor.js +282 -0
- package/dist/extraction/table-extractor.js.map +1 -0
- package/dist/extraction/transforms.d.ts +47 -0
- package/dist/extraction/transforms.d.ts.map +1 -0
- package/dist/extraction/transforms.js +277 -0
- package/dist/extraction/transforms.js.map +1 -0
- package/dist/extraction/types.d.ts +199 -0
- package/dist/extraction/types.d.ts.map +1 -0
- package/dist/extraction/types.js +5 -0
- package/dist/extraction/types.js.map +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/playwright/browser-type.d.ts +101 -0
- package/dist/playwright/browser-type.d.ts.map +1 -0
- package/dist/playwright/browser-type.js +134 -0
- package/dist/playwright/browser-type.js.map +1 -0
- package/dist/playwright/browser.d.ts +98 -0
- package/dist/playwright/browser.d.ts.map +1 -0
- package/dist/playwright/browser.js +229 -0
- package/dist/playwright/browser.js.map +1 -0
- package/dist/playwright/context.d.ts +217 -0
- package/dist/playwright/context.d.ts.map +1 -0
- package/dist/playwright/context.js +518 -0
- package/dist/playwright/context.js.map +1 -0
- package/dist/playwright/extractor.d.ts +108 -0
- package/dist/playwright/extractor.d.ts.map +1 -0
- package/dist/playwright/extractor.js +404 -0
- package/dist/playwright/extractor.js.map +1 -0
- package/dist/playwright/frame.d.ts +147 -0
- package/dist/playwright/frame.d.ts.map +1 -0
- package/dist/playwright/frame.js +492 -0
- package/dist/playwright/frame.js.map +1 -0
- package/dist/playwright/index.d.ts +163 -0
- package/dist/playwright/index.d.ts.map +1 -0
- package/dist/playwright/index.js +313 -0
- package/dist/playwright/index.js.map +1 -0
- package/dist/playwright/keyboard.d.ts +74 -0
- package/dist/playwright/keyboard.d.ts.map +1 -0
- package/dist/playwright/keyboard.js +187 -0
- package/dist/playwright/keyboard.js.map +1 -0
- package/dist/playwright/locator.d.ts +237 -0
- package/dist/playwright/locator.d.ts.map +1 -0
- package/dist/playwright/locator.js +667 -0
- package/dist/playwright/locator.js.map +1 -0
- package/dist/playwright/mouse.d.ts +82 -0
- package/dist/playwright/mouse.d.ts.map +1 -0
- package/dist/playwright/mouse.js +137 -0
- package/dist/playwright/mouse.js.map +1 -0
- package/dist/playwright/page-helpers.d.ts +267 -0
- package/dist/playwright/page-helpers.d.ts.map +1 -0
- package/dist/playwright/page-helpers.js +449 -0
- package/dist/playwright/page-helpers.js.map +1 -0
- package/dist/playwright/page.d.ts +605 -0
- package/dist/playwright/page.d.ts.map +1 -0
- package/dist/playwright/page.js +1698 -0
- package/dist/playwright/page.js.map +1 -0
- package/dist/playwright/response.d.ts +100 -0
- package/dist/playwright/response.d.ts.map +1 -0
- package/dist/playwright/response.js +194 -0
- package/dist/playwright/response.js.map +1 -0
- package/dist/playwright/types.d.ts +354 -0
- package/dist/playwright/types.d.ts.map +1 -0
- package/dist/playwright/types.js +8 -0
- package/dist/playwright/types.js.map +1 -0
- package/openapi.json +327 -35
- package/package.json +10 -1
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core CSS selector extraction engine using cheerio.
|
|
3
|
+
*
|
|
4
|
+
* All methods are static/pure — they parse HTML and return extracted data.
|
|
5
|
+
* Reuses the transform pipeline from transforms.ts.
|
|
6
|
+
*/
|
|
7
|
+
import { type CheerioAPI, type Cheerio } from 'cheerio';
|
|
8
|
+
import type { AnyNode } from 'domhandler';
|
|
9
|
+
import type { FieldSpec, ExtractedRecord } from './types.js';
|
|
10
|
+
/**
|
|
11
|
+
* Extract structured data from all elements matching containerSelector.
|
|
12
|
+
*/
|
|
13
|
+
/**
 * Parses `html` with cheerio and builds one record per element matching
 * `containerSelector`.
 *
 * @param html - HTML markup to parse.
 * @param containerSelector - CSS selector for the elements that each yield a record.
 * @param fields - Map of record key to the field spec resolved inside each matched element.
 * @returns One record per matched element; an empty array when nothing matches.
 */
export declare function extractAll(html: string, containerSelector: string, fields: Record<string, FieldSpec>): ExtractedRecord[];
|
|
14
|
+
/**
|
|
15
|
+
* Extract structured data from the first element matching containerSelector.
|
|
16
|
+
*/
|
|
17
|
+
/**
 * Same as `extractAll`, but only the first element matching
 * `containerSelector` is processed.
 *
 * @param html - HTML markup to parse.
 * @param containerSelector - CSS selector for the record's root element.
 * @param fields - Map of record key to the field spec resolved inside the root element.
 * @returns The extracted record, or `null` when the selector matches nothing.
 */
export declare function extractFirst(html: string, containerSelector: string, fields: Record<string, FieldSpec>): ExtractedRecord | null;
|
|
18
|
+
/**
|
|
19
|
+
* Count elements matching a CSS selector.
|
|
20
|
+
*/
|
|
21
|
+
/**
 * @param html - HTML markup to parse.
 * @param selector - CSS selector to evaluate against the parsed document.
 * @returns Number of elements matched by `selector` (0 when none match).
 */
export declare function count(html: string, selector: string): number;
|
|
22
|
+
/**
 * Resolve a single field spec against an already-selected container.
 *
 * A string spec is treated as shorthand: `"selector"` yields the trimmed text
 * of the first match, and `"selector@attr"` yields an attribute (with
 * `outerHTML` / `innerHTML` handled as pseudo-attributes). Any non-string spec
 * is handled as an object spec.
 *
 * @param $ - Cheerio instance the container belongs to.
 * @param container - Element(s) that selectors in `spec` are resolved relative to.
 * @param spec - Field spec to resolve.
 * @returns The extracted value; its shape depends on the spec.
 */
export declare function resolveField($: CheerioAPI, container: Cheerio<AnyNode>, spec: FieldSpec): unknown;
|
|
23
|
+
//# sourceMappingURL=selector-engine.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"selector-engine.d.ts","sourceRoot":"","sources":["../../src/extraction/selector-engine.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAQ,KAAK,UAAU,EAAE,KAAK,OAAO,EAAE,MAAM,SAAS,CAAC;AAC9D,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,KAAK,EAAE,SAAS,EAAmB,eAAe,EAAE,MAAM,YAAY,CAAC;AAG9E;;GAEG;AACH,wBAAgB,UAAU,CACxB,IAAI,EAAE,MAAM,EACZ,iBAAiB,EAAE,MAAM,EACzB,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,GAChC,eAAe,EAAE,CAOnB;AAED;;GAEG;AACH,wBAAgB,YAAY,CAC1B,IAAI,EAAE,MAAM,EACZ,iBAAiB,EAAE,MAAM,EACzB,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,GAChC,eAAe,GAAG,IAAI,CAKxB;AAED;;GAEG;AACH,wBAAgB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,MAAM,CAG5D;AAgBD,wBAAgB,YAAY,CAC1B,CAAC,EAAE,UAAU,EACb,SAAS,EAAE,OAAO,CAAC,OAAO,CAAC,EAC3B,IAAI,EAAE,SAAS,GACd,OAAO,CAKT"}
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core CSS selector extraction engine using cheerio.
|
|
3
|
+
*
|
|
4
|
+
* All methods are static/pure — they parse HTML and return extracted data.
|
|
5
|
+
* Reuses the transform pipeline from transforms.ts.
|
|
6
|
+
*/
|
|
7
|
+
import { load } from 'cheerio';
|
|
8
|
+
import { applyTransforms, applyPattern, coerceType } from './transforms.js';
|
|
9
|
+
/**
 * Build one record per element matched by `containerSelector`.
 *
 * @param html - HTML markup to parse.
 * @param containerSelector - CSS selector identifying each record's root element.
 * @param fields - Map of output key to field spec, resolved relative to each root.
 * @returns One record per match, in match order; empty when nothing matches.
 */
export function extractAll(html, containerSelector, fields) {
    const $ = load(html);
    return $(containerSelector)
        .toArray()
        .map((node) => extractItem($, $(node), fields));
}
|
|
20
|
+
/**
 * Build a record from the first element matched by `containerSelector`.
 *
 * @param html - HTML markup to parse.
 * @param containerSelector - CSS selector identifying the record's root element.
 * @param fields - Map of output key to field spec, resolved relative to the root.
 * @returns The record, or `null` when the selector matches nothing.
 */
export function extractFirst(html, containerSelector, fields) {
    const $ = load(html);
    const firstMatch = $(containerSelector).first();
    return firstMatch.length > 0 ? extractItem($, firstMatch, fields) : null;
}
|
|
30
|
+
/**
 * Report how many elements in `html` match `selector`.
 *
 * @param html - HTML markup to parse.
 * @param selector - CSS selector to evaluate.
 * @returns The size of the matched set.
 */
export function count(html, selector) {
    const matches = load(html)(selector);
    return matches.length;
}
|
|
37
|
+
// ==================== Internal ====================
|
|
38
|
+
/**
 * Resolve every entry of `fields` against one container element.
 *
 * @param $ - Cheerio instance the container belongs to.
 * @param container - Element the field selectors are relative to.
 * @param fields - Map of output key to field spec.
 * @returns A record with one key per entry in `fields`.
 */
function extractItem($, container, fields) {
    const record = {};
    Object.keys(fields).forEach((name) => {
        record[name] = resolveField($, container, fields[name]);
    });
    return record;
}
|
|
45
|
+
/**
 * Dispatch a field spec to the matching extractor: string specs use the
 * shorthand parser, everything else is treated as an object spec.
 *
 * @param $ - Cheerio instance the container belongs to.
 * @param container - Element the spec is resolved relative to.
 * @param spec - Shorthand string or object field spec.
 * @returns The extracted value.
 */
export function resolveField($, container, spec) {
    return typeof spec === 'string'
        ? extractStringField($, container, spec)
        : extractObjectField($, container, spec);
}
|
|
51
|
+
function extractStringField($, container, spec) {
|
|
52
|
+
const atPos = spec.lastIndexOf('@');
|
|
53
|
+
if (atPos >= 0) {
|
|
54
|
+
const sel = spec.slice(0, atPos).trim();
|
|
55
|
+
const attr = spec.slice(atPos + 1);
|
|
56
|
+
const target = sel ? container.find(sel).first() : container;
|
|
57
|
+
if (target.length === 0)
|
|
58
|
+
return null;
|
|
59
|
+
if (attr === 'outerHTML')
|
|
60
|
+
return $.html(target) ?? null;
|
|
61
|
+
if (attr === 'innerHTML')
|
|
62
|
+
return target.html() ?? null;
|
|
63
|
+
return target.attr(attr) ?? null;
|
|
64
|
+
}
|
|
65
|
+
const el = spec.trim() ? container.find(spec).first() : container;
|
|
66
|
+
if (el.length === 0)
|
|
67
|
+
return null;
|
|
68
|
+
return el.text().trim() || null;
|
|
69
|
+
}
|
|
70
|
+
/**
 * Resolve an object field spec.
 *
 * A `nested` spec is delegated to `extractNested`. Otherwise the raw value of
 * each target goes through pattern matching, transforms and type coercion,
 * with `spec.default` (then `null`) filling in when the result is nullish.
 * With `spec.all` every match contributes an array entry; without it only the
 * first match is used.
 *
 * @param $ - Cheerio instance the container belongs to.
 * @param container - Element the spec's selector is resolved relative to.
 * @param spec - Object field spec.
 * @returns A single value, an array of values (`all`), or nested records (`nested`).
 */
function extractObjectField($, container, spec) {
    if (spec.nested) {
        return extractNested($, container, spec.nested);
    }
    const { selector, attribute, type } = spec;
    // Shared value pipeline: raw -> pattern -> transforms -> coercion -> default.
    const finish = (node) => {
        const raw = extractRawValue($, node, attribute, type);
        const matched = applyPattern(raw, spec.pattern, spec.group);
        const transformed = applyTransforms(matched, spec.transform);
        return coerceType(transformed, type) ?? spec.default ?? null;
    };
    if (spec.all) {
        const scope = selector ? container.find(selector) : container;
        const collected = [];
        scope.each((_idx, node) => {
            collected.push(finish($(node)));
        });
        return collected;
    }
    const target = selector ? container.find(selector).first() : container;
    return target.length === 0 ? (spec.default ?? null) : finish(target);
}
|
|
99
|
+
function extractRawValue($, el, attr, type) {
|
|
100
|
+
if (el.length === 0)
|
|
101
|
+
return null;
|
|
102
|
+
if (type === 'html')
|
|
103
|
+
return $.html(el) ?? null;
|
|
104
|
+
if (type === 'innerHtml')
|
|
105
|
+
return el.html() ?? null;
|
|
106
|
+
if (attr) {
|
|
107
|
+
if (attr === 'outerHTML')
|
|
108
|
+
return $.html(el) ?? null;
|
|
109
|
+
if (attr === 'innerHTML')
|
|
110
|
+
return el.html() ?? null;
|
|
111
|
+
return el.attr(attr) ?? null;
|
|
112
|
+
}
|
|
113
|
+
return el.text().trim() || null;
|
|
114
|
+
}
|
|
115
|
+
/**
 * Extract a list of sub-records from the elements matching `nested.selector`
 * inside `container`.
 *
 * @param $ - Cheerio instance the container belongs to.
 * @param container - Element the nested selector is resolved relative to.
 * @param nested - Nested spec providing `selector`, `fields` and an optional `limit`.
 * @returns One record per matched element, stopping once `limit` records were produced.
 */
function extractNested($, container, nested) {
    const matches = container.find(nested.selector).toArray();
    const limit = nested.limit ?? matches.length;
    const records = [];
    for (let i = 0; i < matches.length; i++) {
        if (i >= limit) {
            break;
        }
        records.push(extractItem($, $(matches[i]), nested.fields));
    }
    return records;
}
|
|
127
|
+
//# sourceMappingURL=selector-engine.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"selector-engine.js","sourceRoot":"","sources":["../../src/extraction/selector-engine.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,IAAI,EAAiC,MAAM,SAAS,CAAC;AAG9D,OAAO,EAAE,eAAe,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAE5E;;GAEG;AACH,MAAM,UAAU,UAAU,CACxB,IAAY,EACZ,iBAAyB,EACzB,MAAiC;IAEjC,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,MAAM,OAAO,GAAsB,EAAE,CAAC;IACtC,CAAC,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;QACnC,OAAO,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC;IAC9C,CAAC,CAAC,CAAC;IACH,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAC1B,IAAY,EACZ,iBAAyB,EACzB,MAAiC;IAEjC,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,MAAM,SAAS,GAAG,CAAC,CAAC,iBAAiB,CAAC,CAAC,KAAK,EAAE,CAAC;IAC/C,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IACxC,OAAO,WAAW,CAAC,CAAC,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;AAC3C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,KAAK,CAAC,IAAY,EAAE,QAAgB;IAClD,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,OAAO,CAAC,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC;AAC5B,CAAC;AAED,qDAAqD;AAErD,SAAS,WAAW,CAClB,CAAa,EACb,SAA2B,EAC3B,MAAiC;IAEjC,MAAM,MAAM,GAAoB,EAAE,CAAC;IACnC,KAAK,MAAM,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAClD,MAAM,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;IAClD,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,MAAM,UAAU,YAAY,CAC1B,CAAa,EACb,SAA2B,EAC3B,IAAe;IAEf,IAAI,OAAO,IAAI,KAAK,QAAQ,EAAE,CAAC;QAC7B,OAAO,kBAAkB,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;IAChD,CAAC;IACD,OAAO,kBAAkB,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;AAChD,CAAC;AAED,SAAS,kBAAkB,CACzB,CAAa,EACb,SAA2B,EAC3B,IAAY;IAEZ,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;IACpC,IAAI,KAAK,IAAI,CAAC,EAAE,CAAC;QACf,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC;QACxC,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;QACnC,MAAM,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;QAC7D,IA
AI,MAAM,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,IAAI,CAAC;QACrC,IAAI,IAAI,KAAK,WAAW;YAAE,OAAO,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,IAAI,CAAC;QACxD,IAAI,IAAI,KAAK,WAAW;YAAE,OAAO,MAAM,CAAC,IAAI,EAAE,IAAI,IAAI,CAAC;QACvD,OAAO,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC;IACnC,CAAC;IACD,MAAM,EAAE,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;IAClE,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IACjC,OAAO,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,IAAI,CAAC;AAClC,CAAC;AAED,SAAS,kBAAkB,CACzB,CAAa,EACb,SAA2B,EAC3B,IAAqB;IAErB,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;QAChB,OAAO,aAAa,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;IAClD,CAAC;IAED,MAAM,GAAG,GAAG,IAAI,CAAC,QAAQ,CAAC;IAC1B,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC;IAC5B,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC;IAE5B,IAAI,IAAI,CAAC,GAAG,EAAE,CAAC;QACb,MAAM,MAAM,GAAc,EAAE,CAAC;QAC7B,MAAM,OAAO,GAAG,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QACtD,OAAO,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;YACtB,IAAI,GAAG,GAAkB,eAAe,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,SAAS,CAAC,CAAC;YACpE,GAAG,GAAG,YAAY,CAAC,GAAG,EAAE,IAAI,CAAC,OAAO,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC;YAClD,GAAG,GAAG,eAAe,CAAC,GAAG,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;YAC3C,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE,SAAS,CAAC,CAAC;YAC3C,MAAM,CAAC,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,CAAC;QAC/C,CAAC,CAAC,CAAC;QACH,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,MAAM,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;IAC7D,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxB,OAAO,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC;IAC9B,CAAC;IAED,IAAI,GAAG,GAAkB,eAAe,CAAC,CAAC,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,CAAC,CAAC;IACrE,GAAG,GAAG,YAAY,CAAC,GAAG,EAAE,IAAI,CAAC,OAAO,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC;IAClD,GAAG,GAAG,eAAe,CAAC,GAAG,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;IAC3C,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE,SAAS,CAAC,CAAC;IAC3C,OAAO,OAAO,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC;AACzC,CAAC;AAED,SAAS,eAAe,CACt
B,CAAa,EACb,EAAoB,EACpB,IAAa,EACb,IAAa;IAEb,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IACjC,IAAI,IAAI,KAAK,MAAM;QAAE,OAAO,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,IAAI,CAAC;IAC/C,IAAI,IAAI,KAAK,WAAW;QAAE,OAAO,EAAE,CAAC,IAAI,EAAE,IAAI,IAAI,CAAC;IACnD,IAAI,IAAI,EAAE,CAAC;QACT,IAAI,IAAI,KAAK,WAAW;YAAE,OAAO,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,IAAI,CAAC;QACpD,IAAI,IAAI,KAAK,WAAW;YAAE,OAAO,EAAE,CAAC,IAAI,EAAE,IAAI,IAAI,CAAC;QACnD,OAAO,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC;IAC/B,CAAC;IACD,OAAO,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,IAAI,CAAC;AAClC,CAAC;AAED,SAAS,aAAa,CACpB,CAAa,EACb,SAA2B,EAC3B,MAA+E;IAE/E,MAAM,OAAO,GAAsB,EAAE,CAAC;IACtC,MAAM,QAAQ,GAAG,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;IACjD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,IAAI,QAAQ,CAAC,MAAM,CAAC;IAC9C,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACtB,IAAI,CAAC,IAAI,KAAK;YAAE,OAAO,KAAK,CAAC;QAC7B,OAAO,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;QACnD,OAAO,SAAS,CAAC;IACnB,CAAC,CAAC,CAAC;IACH,OAAO,OAAO,CAAC;AACjB,CAAC"}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Specialized table extraction.
|
|
3
|
+
*
|
|
4
|
+
* Handles <table> with colspan/rowspan, headerless tables, key-value transpose,
|
|
5
|
+
* CSS grid/flexbox "tables", and <dl>/<dt>/<dd> definition lists.
|
|
6
|
+
*/
|
|
7
|
+
import type { ExtractedRecord, TableOptions } from './types.js';
|
|
8
|
+
/**
|
|
9
|
+
* Extract a <table> as an array of records.
|
|
10
|
+
*/
|
|
11
|
+
/**
 * Only the first element matching `selector` is read.
 *
 * @param html - HTML markup to parse.
 * @param selector - CSS selector for the table; the implementation defaults it to `'table'`.
 * @param options - Optional overrides. The implementation reads `headers`
 *   (explicit column names, which disable header auto-detection), `skipRows`,
 *   `maxRows`, `transpose` and `columnTypes`.
 * @returns One record per data row (or per column when transposed); an empty
 *   array when the selector matches nothing.
 */
export declare function extractTable(html: string, selector?: string, options?: TableOptions): ExtractedRecord[];
|
|
12
|
+
/**
|
|
13
|
+
* Extract a CSS grid/flexbox "table" (div-based layout).
|
|
14
|
+
*/
|
|
15
|
+
/**
 * Each item becomes a record whose fields are the item's child elements. A
 * field is keyed by the child's `data-field`, then `aria-label`, then its first
 * class name, then `field_<index>`.
 *
 * @param html - HTML markup to parse.
 * @param containerSelector - CSS selector for the grid container; only the first match is used.
 * @param itemSelector - Optional selector for the items inside the container;
 *   when omitted the container's direct children are used.
 * @returns One record per item that has at least one child element; an empty
 *   array when the container is not found.
 */
export declare function extractGrid(html: string, containerSelector: string, itemSelector?: string): ExtractedRecord[];
|
|
16
|
+
/**
|
|
17
|
+
* Extract a <dl>/<dt>/<dd> definition list as key-value records.
|
|
18
|
+
*/
|
|
19
|
+
/**
 * Each `<dt>` text becomes a key whose value is the text of the `<dd>` that
 * follows it.
 *
 * @param html - HTML markup to parse.
 * @param selector - CSS selector for the list; the implementation defaults it
 *   to `'dl'` and uses only the first match.
 * @returns A single key/value record; an empty object when the selector matches nothing.
 */
export declare function extractDefinitionList(html: string, selector?: string): ExtractedRecord;
|
|
20
|
+
/**
|
|
21
|
+
* Auto-detect all table-like structures on the page.
|
|
22
|
+
*/
|
|
23
|
+
/**
 * Scans the document for `<table>` and `<dl>` elements and describes each one.
 * `<table>` entries are listed before `<dl>` entries.
 *
 * Entry fields:
 *  - `selector`: CSS selector derived from the element's id, class or position.
 *  - `type`: which kind of structure was found.
 *  - `rowCount`: number of `<tr>` (tables) or `<dt>` (definition lists) descendants.
 *  - `columnCount`: derived from the `td`/`th` cells of the table's first
 *    row(s); always 2 for definition lists.
 *
 * @param html - HTML markup to parse.
 * @returns One descriptor per detected structure.
 */
export declare function detectTables(html: string): Array<{
|
|
24
|
+
selector: string;
|
|
25
|
+
type: 'table' | 'definition-list';
|
|
26
|
+
rowCount: number;
|
|
27
|
+
columnCount: number;
|
|
28
|
+
}>;
|
|
29
|
+
//# sourceMappingURL=table-extractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"table-extractor.d.ts","sourceRoot":"","sources":["../../src/extraction/table-extractor.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,KAAK,EAAE,eAAe,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAEhE;;GAEG;AACH,wBAAgB,YAAY,CAC1B,IAAI,EAAE,MAAM,EACZ,QAAQ,GAAE,MAAgB,EAC1B,OAAO,CAAC,EAAE,YAAY,GACrB,eAAe,EAAE,CA4JnB;AAED;;GAEG;AACH,wBAAgB,WAAW,CACzB,IAAI,EAAE,MAAM,EACZ,iBAAiB,EAAE,MAAM,EACzB,YAAY,CAAC,EAAE,MAAM,GACpB,eAAe,EAAE,CA4BnB;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CACnC,IAAI,EAAE,MAAM,EACZ,QAAQ,GAAE,MAAa,GACtB,eAAe,CAoBjB;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,KAAK,CAAC;IAChD,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,OAAO,GAAG,iBAAiB,CAAC;IAClC,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;CACrB,CAAC,CAuCD"}
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Specialized table extraction.
|
|
3
|
+
*
|
|
4
|
+
* Handles <table> with colspan/rowspan, headerless tables, key-value transpose,
|
|
5
|
+
* CSS grid/flexbox "tables", and <dl>/<dt>/<dd> definition lists.
|
|
6
|
+
*/
|
|
7
|
+
import { load } from 'cheerio';
|
|
8
|
+
/**
 * Extract a <table> as an array of records.
 *
 * Only the first element matching `selector` is read. Column names come from,
 * in order: `options.headers`, the `<th>` cells of `<thead>` (multi-row heads
 * are joined with " / "), the `<th>` cells of the first row, or generated
 * `col_<n>` names. `colspan`/`rowspan` cells are expanded so every record has a
 * value for each column the cell covers.
 *
 * Only the table's own rows and cells are considered; cells that belong to a
 * table nested inside a cell are not flattened into the outer rows.
 *
 * @param html - HTML markup to parse.
 * @param selector - CSS selector for the table (default `'table'`).
 * @param options - Optional settings: `headers`, `skipRows`, `maxRows`,
 *   `transpose`, `columnTypes`.
 * @returns One record per data row (one per column when `transpose` is set);
 *   an empty array when the table is not found. A non-empty `<caption>` is
 *   attached to the first record as `_caption` (non-transposed output only).
 */
export function extractTable(html, selector = 'table', options) {
    const $ = load(html);
    const table = $(selector).first();
    if (table.length === 0)
        return [];
    const caption = table.find('> caption').first();
    const captionText = caption.length > 0 ? caption.text().trim() : null;
    // Explicit headers win; otherwise auto-detect from <th> cells.
    const headers = options?.headers ? [...options.headers] : [];
    if (headers.length === 0) {
        headers.push(...detectTableHeaders($, table));
    }
    // Direct-child selectors keep rows of nested tables out of the result.
    const tbody = table.find('> tbody').first();
    const rows = tbody.length > 0 ? tbody.find('> tr') : table.find('> tr');
    let startIdx = 0;
    if (headers.length === 0 && rows.length > 0) {
        const firstRow = $(rows[0]);
        const tds = firstRow.children('td');
        const ths = firstRow.children('th');
        if (ths.length > 0 && tds.length === 0) {
            // The first body row is a header row: consume it.
            ths.each((_i, cell) => {
                headers.push($(cell).text().trim());
            });
            startIdx = 1;
        }
        else {
            // The first row is data: keep it and generate column names.
            const colCount = tds.length || firstRow.children('td, th').length;
            for (let c = 0; c < colCount; c++) {
                headers.push(`col_${c}`);
            }
        }
    }
    const dataRows = [];
    rows.each((i, row) => {
        if (i < startIdx)
            return;
        const $row = $(row);
        // Look at the row's own cells only, so a nested table inside a <th>
        // cannot make a header row look like data.
        const isPureHeaderRow = $row.children('td').length === 0 && $row.children('th').length > 0;
        if (!isPureHeaderRow) {
            dataRows.push($row);
        }
    });
    const skipRows = options?.skipRows ?? 0;
    const maxRows = options?.maxRows;
    const afterSkip = dataRows.slice(skipRows);
    const limitedRows = maxRows ? afterSkip.slice(0, maxRows) : afterSkip;
    const grid = buildCellGrid($, limitedRows);
    if (options?.transpose) {
        return transposeGrid(grid, headers);
    }
    const results = grid.map((row) => gridRowToRecord(row, headers, options?.columnTypes));
    if (captionText && results.length > 0) {
        results[0]['_caption'] = captionText;
    }
    return results;
}
/** Collect header names from <thead>, or from the <th> cells of the first row when there is no <thead>. */
function detectTableHeaders($, table) {
    const cellText = (cell) => $(cell).text().trim();
    const thead = table.find('> thead').first();
    if (thead.length === 0) {
        const firstRow = table.find('> tbody > tr, > tr').first();
        return firstRow.children('th').toArray().map(cellText);
    }
    const headerRows = thead.children('tr');
    if (headerRows.length <= 1) {
        return headerRows.children('th').toArray().map(cellText);
    }
    // Multi-row header: "Category" over "Name" becomes "Category / Name".
    const headerGrid = headerRows
        .toArray()
        .map((row) => $(row).children('th').toArray().map(cellText));
    const maxCols = Math.max(...headerGrid.map((r) => r.length));
    const headers = [];
    for (let c = 0; c < maxCols; c++) {
        const parts = [];
        for (const row of headerGrid) {
            const val = row[c];
            if (val && !parts.includes(val))
                parts.push(val);
        }
        headers.push(parts.join(' / '));
    }
    return headers;
}
/** Parse a colspan/rowspan attribute; anything missing, non-numeric or below 1 counts as 1, and the result is capped at `max`. */
function parseTableSpan(raw, max) {
    const n = parseInt(raw ?? '1', 10);
    if (!(n >= 1))
        return 1;
    return Math.min(n, max);
}
/** Expand rows into a rectangular-ish text grid, duplicating cell text across the columns/rows a cell spans. */
function buildCellGrid($, rowEls) {
    // The HTML table model caps colspan at 1000; this also bounds work on hostile markup.
    const MAX_COLSPAN = 1000;
    const grid = rowEls.map(() => []);
    rowEls.forEach(($row, r) => {
        let cellIdx = 0;
        $row.children('td, th').each((_ci, cell) => {
            const $cell = $(cell);
            const colspan = parseTableSpan($cell.attr('colspan'), MAX_COLSPAN);
            const rowspan = parseTableSpan($cell.attr('rowspan'), Infinity);
            const text = $cell.text().trim();
            // Skip slots already filled by a rowspan from an earlier row.
            while (grid[r][cellIdx] !== undefined)
                cellIdx++;
            // Never spill past the collected rows: that would create phantom
            // records and defeat maxRows.
            const endRow = Math.min(r + rowspan, rowEls.length);
            for (let rr = r; rr < endRow; rr++) {
                for (let dc = 0; dc < colspan; dc++) {
                    grid[rr][cellIdx + dc] = text;
                }
            }
            cellIdx += colspan;
        });
    });
    return grid;
}
/** Turn one grid row into a record keyed by header name, applying `number` column coercion where requested. */
function gridRowToRecord(row, headers, columnTypes) {
    const record = {};
    for (let c = 0; c < headers.length; c++) {
        const key = headers[c] || `col_${c}`;
        let value = row[c] ?? null;
        if (columnTypes?.[key] === 'number' && typeof value === 'string') {
            // Strip currency symbols, thousands separators, units, etc.
            const n = parseFloat(value.replace(/[^0-9.\-]/g, ''));
            value = Number.isNaN(n) ? value : n;
        }
        record[key] = value;
    }
    return record;
}
|
|
164
|
+
/**
 * Extract a div-based (CSS grid / flexbox) "table".
 *
 * Every item yields a record whose fields are the item's child elements. The
 * key of a field is the child's `data-field`, else `aria-label`, else the key
 * produced by `inferFieldKey`; the value is the child's trimmed text, or
 * `null` when that text is empty.
 *
 * @param html - HTML markup to parse.
 * @param containerSelector - CSS selector for the container; only the first match is used.
 * @param itemSelector - Optional selector for items inside the container;
 *   defaults to the container's direct children.
 * @returns One record per item with at least one child element; empty when the
 *   container is not found.
 */
export function extractGrid(html, containerSelector, itemSelector) {
    const $ = load(html);
    const root = $(containerSelector).first();
    if (root.length === 0) {
        return [];
    }
    const items = itemSelector ? root.find(itemSelector) : root.children();
    const records = [];
    for (const item of items.toArray()) {
        const record = {};
        $(item).children().toArray().forEach((cellNode, position) => {
            const $cell = $(cellNode);
            const key = $cell.attr('data-field') ??
                $cell.attr('aria-label') ??
                inferFieldKey($cell, position);
            record[key] = $cell.text().trim() || null;
        });
        // Items without child elements produce no record.
        if (Object.keys(record).length > 0) {
            records.push(record);
        }
    }
    return records;
}
|
|
193
|
+
/**
 * Extract a <dl>/<dt>/<dd> definition list as a key-value record.
 *
 * Each `<dt>` sets the current key and the next `<dd>` supplies its value
 * (trimmed text, or `null` when empty). Further `<dd>` elements for the same
 * term are ignored, and a `<dd>` without a preceding non-empty `<dt>` is
 * skipped. `<div>` wrappers around `<dt>`/`<dd>` groups, which HTML permits as
 * direct children of `<dl>`, are looked through.
 *
 * @param html - HTML markup to parse.
 * @param selector - CSS selector for the list (default `'dl'`); only the first match is used.
 * @returns The key-value record; an empty object when the list is not found.
 */
export function extractDefinitionList(html, selector = 'dl') {
    const $ = load(html);
    const dl = $(selector).first();
    if (dl.length === 0)
        return {};
    const tagOf = (node) => (node.type === 'tag' ? node.tagName?.toLowerCase() : '');
    // Flatten one level of <div> grouping while preserving document order.
    const entries = [];
    dl.children().each((_i, child) => {
        if (tagOf(child) === 'div') {
            entries.push(...$(child).children().toArray());
        }
        else {
            entries.push(child);
        }
    });
    const record = {};
    let currentKey = null;
    for (const node of entries) {
        const tag = tagOf(node);
        if (tag === 'dt') {
            currentKey = $(node).text().trim();
        }
        else if (tag === 'dd' && currentKey) {
            record[currentKey] = $(node).text().trim() || null;
            // Only the first <dd> after a <dt> is recorded.
            currentKey = null;
        }
    }
    return record;
}
|
|
216
|
+
/**
 * Auto-detect all table-like structures on the page.
 *
 * Reports every `<table>` followed by every `<dl>`. Each entry's `selector`
 * resolves, via "first match", to the element it describes, so it can be passed
 * to `extractTable` / `extractDefinitionList`, which both read the first match
 * of the selector they receive.
 *
 * @param html - HTML markup to parse.
 * @returns Descriptors with `selector`, `type`, `rowCount` (all `<tr>`
 *   descendants for tables, `<dt>` descendants for lists) and `columnCount`
 *   (cells of the table's first row; always 2 for lists).
 */
export function detectTables(html) {
    const $ = load(html);
    const results = [];
    $('table').each((_i, el) => {
        const allRows = $(el).find('tr');
        results.push({
            selector: buildDetectedSelector($, el, 'table'),
            type: 'table',
            rowCount: allRows.length,
            // Count the cells of the first row only. `tr:first-child` would match
            // the first row of <thead> AND <tbody> (and of nested tables),
            // inflating the count.
            columnCount: allRows.first().children('td, th').length,
        });
    });
    $('dl').each((_i, el) => {
        results.push({
            selector: buildDetectedSelector($, el, 'dl'),
            type: 'definition-list',
            rowCount: $(el).find('dt').length,
            columnCount: 2,
        });
    });
    return results;
}
/**
 * Pick the shortest selector (`tag#id`, `tag.class`, bare `tag`) whose first
 * match is `el`; fall back to a structural path when none of them is unique
 * enough or valid.
 */
function buildDetectedSelector($, el, tag) {
    const $el = $(el);
    const id = $el.attr('id');
    const cls = $el.attr('class')?.split(/\s+/).find(Boolean);
    const candidates = [];
    if (id)
        candidates.push(`${tag}#${id}`);
    if (cls)
        candidates.push(`${tag}.${cls}`);
    candidates.push(tag);
    for (const candidate of candidates) {
        try {
            if ($(candidate)[0] === el)
                return candidate;
        }
        catch {
            // The id/class contained characters that do not form a valid
            // selector; try the next candidate.
        }
    }
    return buildStructuralPath($, el);
}
/** Build an unambiguous `a:nth-of-type(n) > b:nth-of-type(m) > ...` path from the root element down to `el`. */
function buildStructuralPath($, el) {
    const segments = [];
    let $node = $(el);
    while ($node.length > 0 && $node[0].type === 'tag') {
        const tag = $node[0].tagName;
        // nth-of-type is 1-based and counts same-tag siblings, not document order.
        const position = $node.prevAll(tag).length + 1;
        segments.unshift(`${tag}:nth-of-type(${position})`);
        $node = $node.parent();
    }
    return segments.join(' > ');
}
|
|
255
|
+
// ==================== Internal ====================
|
|
256
|
+
function transposeGrid(grid, headers) {
|
|
257
|
+
// In a transposed table, the first column becomes keys
|
|
258
|
+
if (grid.length === 0)
|
|
259
|
+
return [];
|
|
260
|
+
const results = [];
|
|
261
|
+
// Determine column count from grid
|
|
262
|
+
const colCount = Math.max(...grid.map(r => r?.length ?? 0));
|
|
263
|
+
for (let c = 1; c < colCount; c++) {
|
|
264
|
+
const record = {};
|
|
265
|
+
for (let r = 0; r < grid.length; r++) {
|
|
266
|
+
const key = grid[r]?.[0] ?? `row_${r}`;
|
|
267
|
+
record[key] = grid[r]?.[c] ?? null;
|
|
268
|
+
}
|
|
269
|
+
results.push(record);
|
|
270
|
+
}
|
|
271
|
+
return results;
|
|
272
|
+
}
|
|
273
|
+
function inferFieldKey(el, index) {
|
|
274
|
+
const classes = el.attr('class');
|
|
275
|
+
if (classes) {
|
|
276
|
+
const first = classes.split(/\s+/)[0];
|
|
277
|
+
if (first)
|
|
278
|
+
return first;
|
|
279
|
+
}
|
|
280
|
+
return `field_${index}`;
|
|
281
|
+
}
|
|
282
|
+
//# sourceMappingURL=table-extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"table-extractor.js","sourceRoot":"","sources":["../../src/extraction/table-extractor.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,IAAI,EAAiC,MAAM,SAAS,CAAC;AAI9D;;GAEG;AACH,MAAM,UAAU,YAAY,CAC1B,IAAY,EACZ,WAAmB,OAAO,EAC1B,OAAsB;IAEtB,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,MAAM,KAAK,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,KAAK,EAAE,CAAC;IAClC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAElC,IAAI,OAAO,GAAa,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAErE,kBAAkB;IAClB,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,KAAK,EAAE,CAAC;IAChD,MAAM,WAAW,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;IAEtE,yCAAyC;IACzC,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,KAAK,EAAE,CAAC;QAC5C,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACrB,kEAAkE;YAClE,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACtC,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC1B,mDAAmD;gBACnD,MAAM,UAAU,GAAe,EAAE,CAAC;gBAClC,UAAU,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE;oBAC3B,MAAM,UAAU,GAAa,EAAE,CAAC;oBAChC,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,EAAE,EAAE,EAAE;wBACnC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC;oBACvC,CAAC,CAAC,CAAC;oBACH,UAAU,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;gBAC9B,CAAC,CAAC,CAAC;gBACH,oDAAoD;gBACpD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC;gBAC3D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,EAAE,CAAC,EAAE,EAAE,CAAC;oBACjC,MAAM,KAAK,GAAa,EAAE,CAAC;oBAC3B,KAAK,MAAM,GAAG,IAAI,UAAU,EAAE,CAAC;wBAC7B,MAAM,GAAG,GAAG,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;wBAC3B,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC;4BAAE,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;oBACnD,CAAC;oBACD,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;gBAClC,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,oBAAoB;gBACpB
,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;oBACtC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC;gBACpC,CAAC,CAAC,CAAC;YACL,CAAC;QACH,CAAC;aAAM,CAAC;YACN,kDAAkD;YAClD,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC,KAAK,EAAE,CAAC;YAC1D,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxB,MAAM,GAAG,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBAClC,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACnB,GAAG,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;wBAClB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC;oBACpC,CAAC,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,wDAAwD;IACxD,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,KAAK,EAAE,CAAC;IAC5C,MAAM,IAAI,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC;QAC3B,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC;QACpB,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACvB,IAAI,QAAQ,GAAG,CAAC,CAAC;IAEjB,yEAAyE;IACzE,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5C,MAAM,QAAQ,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,CAAC;QAC7B,MAAM,GAAG,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAClC,MAAM,GAAG,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAClC,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvC,4BAA4B;YAC5B,GAAG,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,IAAI,EAAE,EAAE;gBACpB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC;YACtC,CAAC,CAAC,CAAC;YACH,QAAQ,GAAG,CAAC,CAAC;QACf,CAAC;aAAM,CAAC;YACN,4CAA4C;YAC5C,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,IAAI,QAAQ,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;YAClE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,EAAE,CAAC,EAAE,EAAE,CAAC;gBAClC,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;YAC3B,CAAC;YACD,mCAAmC;YACnC,QAAQ,GAAG,CAAC,CAAC;QACf,CAAC;IACH,CAAC;IAED,oBAAoB;IACpB,MAAM,QAAQ,GAAuB,EAAE,CAAC;IACxC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE;QACnB,IAAI,CAAC,GAAG,QAAQ;YAAE,OAAO;QACzB,MAAM,GAAG,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC9B,MAAM,GAAG,GAAG,CAA
C,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC9B,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO,CAAC,wBAAwB;QACxE,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACxB,CAAC,CAAC,CAAC;IAEH,iBAAiB;IACjB,MAAM,QAAQ,GAAG,OAAO,EAAE,QAAQ,IAAI,CAAC,CAAC;IACxC,MAAM,aAAa,GAAG,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IAE/C,gBAAgB;IAChB,MAAM,OAAO,GAAG,OAAO,EAAE,OAAO,CAAC;IACjC,MAAM,WAAW,GAAG,OAAO,CAAC,CAAC,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC;IAE9E,iCAAiC;IACjC,MAAM,IAAI,GAAwB,EAAE,CAAC;IACrC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5C,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;YAAE,IAAI,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC;QAC3B,MAAM,KAAK,GAAG,WAAW,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC7C,IAAI,OAAO,GAAG,CAAC,CAAC;QAChB,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE;YACvB,MAAM,KAAK,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC;YACtB,MAAM,OAAO,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC;YAChE,MAAM,OAAO,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC;YAChE,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAEjC,OAAO,IAAI,CAAC,CAAC,CAAE,CAAC,OAAO,CAAC,KAAK,SAAS;gBAAE,OAAO,EAAE,CAAC;YAElD,KAAK,IAAI,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,OAAO,EAAE,EAAE,EAAE,EAAE,CAAC;gBACpC,KAAK,IAAI,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,OAAO,EAAE,EAAE,EAAE,EAAE,CAAC;oBACpC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,CAAC;wBAAE,IAAI,CAAC,CAAC,GAAG,EAAE,CAAC,GAAG,EAAE,CAAC;oBACrC,IAAI,CAAC,CAAC,GAAG,EAAE,CAAE,CAAC,OAAO,GAAG,EAAE,CAAC,GAAG,IAAI,CAAC;gBACrC,CAAC;YACH,CAAC;YACD,OAAO,IAAI,OAAO,CAAC;QACrB,CAAC,CAAC,CAAC;IACL,CAAC;IAED,yBAAyB;IACzB,IAAI,OAAO,EAAE,SAAS,EAAE,CAAC;QACvB,OAAO,aAAa,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACtC,CAAC;IAED,0BAA0B;IAC1B,MAAM,OAAO,GAAsB,EAAE,CAAC;IACtC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,IAAI,CAAC,GAAG;YAAE,SAAS;QACnB,MAAM,MAAM,GAAoB,EAAE,CAAC;QACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,GA
AG,GAAG,OAAO,CAAC,CAAC,CAAC,IAAI,OAAO,CAAC,EAAE,CAAC;YACrC,IAAI,KAAK,GAAY,GAAG,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC;YACpC,IAAI,OAAO,EAAE,WAAW,EAAE,CAAC,GAAG,CAAC,KAAK,QAAQ,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;gBAC1E,MAAM,CAAC,GAAG,UAAU,CAAC,KAAK,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,CAAC,CAAC;gBACtD,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YAC/B,CAAC;YACD,MAAM,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;QACtB,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACvB,CAAC;IAED,yCAAyC;IACzC,IAAI,WAAW,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtC,OAAO,CAAC,CAAC,CAAE,CAAC,UAAU,CAAC,GAAG,WAAW,CAAC;IACxC,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CACzB,IAAY,EACZ,iBAAyB,EACzB,YAAqB;IAErB,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,MAAM,SAAS,GAAG,CAAC,CAAC,iBAAiB,CAAC,CAAC,KAAK,EAAE,CAAC;IAC/C,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEtC,MAAM,KAAK,GAAG,YAAY,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,QAAQ,EAAE,CAAC;IACjF,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,KAAK,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;QACpB,MAAM,MAAM,GAAoB,EAAE,CAAC;QACnC,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,kDAAkD;QAClD,GAAG,CAAC,QAAQ,EAAE,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,EAAE;YAC/B,MAAM,MAAM,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;YACxB,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAClC,0DAA0D;YAC1D,MAAM,GAAG,GACP,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC;gBACzB,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC;gBACzB,aAAa,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;YAC3B,MAAM,CAAC,GAAG,CAAC,GAAG,IAAI,IAAI,IAAI,CAAC;QAC7B,CAAC,CAAC,CAAC;QACH,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACnC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACvB,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACnC,IAAY,EACZ,WAAmB,IAAI;IAEvB,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,MAAM,EAAE,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,KAAK,EAAE,CAAC;IAC/B,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAE/B,MAAM,MAAM,GAAoB,EAAE,CAAC;IACnC,IAAI,UAAU
,GAAkB,IAAI,CAAC;IAErC,EAAE,CAAC,QAAQ,EAAE,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;QAC5B,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,GAAG,GAAG,EAAE,CAAC,IAAI,KAAK,KAAK,CAAC,CAAC,CAAE,EAA0B,CAAC,OAAO,EAAE,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACxF,IAAI,GAAG,KAAK,IAAI,EAAE,CAAC;YACjB,UAAU,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;QACjC,CAAC;aAAM,IAAI,GAAG,KAAK,IAAI,IAAI,UAAU,EAAE,CAAC;YACtC,MAAM,CAAC,UAAU,CAAC,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,IAAI,CAAC;YAC/C,UAAU,GAAG,IAAI,CAAC;QACpB,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,IAAY;IAMvC,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,MAAM,OAAO,GAKR,EAAE,CAAC;IAER,0BAA0B;IAC1B,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACxB,MAAM,MAAM,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QACrB,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;QACtC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,sCAAsC,CAAC,CAAC,MAAM,CAAC;QACxE,MAAM,EAAE,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7B,MAAM,GAAG,GAAG,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QAClD,IAAI,QAAQ,GAAG,OAAO,CAAC;QACvB,IAAI,EAAE;YAAE,QAAQ,GAAG,SAAS,EAAE,EAAE,CAAC;aAC5B,IAAI,GAAG;YAAE,QAAQ,GAAG,SAAS,GAAG,EAAE,CAAC;aACnC,IAAI,CAAC,GAAG,CAAC;YAAE,QAAQ,GAAG,qBAAqB,CAAC,GAAG,CAAC,GAAG,CAAC;QAEzD,OAAO,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,CAAC;IAC/E,CAAC,CAAC,CAAC;IAEH,uBAAuB;IACvB,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACrB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,OAAO,GAAG,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;QACtC,MAAM,EAAE,GAAG,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,MAAM,GAAG,GAAG,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/C,IAAI,QAAQ,GAAG,IAAI,CAAC;QACpB,IAAI,EAAE;YAAE,QAAQ,GAAG,MAAM,EAAE,EAAE,CAAC;aACzB,IAAI,GAAG;YAAE,QAAQ,GAAG,MAAM,GAAG,EAAE,CAAC;aAChC,IAAI,CAAC,GAAG,CAAC;YAAE,QAAQ,GAAG,kBAAkB,CAAC,GAAG,CAAC,GAAG,CAAC;QAEtD,OA
AO,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,iBAAiB,EAAE,QAAQ,EAAE,OAAO,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC;IACzF,CAAC,CAAC,CAAC;IAEH,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,qDAAqD;AAErD,SAAS,aAAa,CACpB,IAAyB,EACzB,OAAiB;IAEjB,uDAAuD;IACvD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IACjC,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,mCAAmC;IACnC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,MAAM,IAAI,CAAC,CAAC,CAAC,CAAC;IAE5D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,EAAE,CAAC,EAAE,EAAE,CAAC;QAClC,MAAM,MAAM,GAAoB,EAAE,CAAC;QACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,OAAO,CAAC,EAAE,CAAC;YACvC,MAAM,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC;QACrC,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACvB,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,aAAa,CAAC,EAAoB,EAAE,KAAa;IACxD,MAAM,OAAO,GAAG,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACjC,IAAI,OAAO,EAAE,CAAC;QACZ,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QACtC,IAAI,KAAK;YAAE,OAAO,KAAK,CAAC;IAC1B,CAAC;IACD,OAAO,SAAS,KAAK,EAAE,CAAC;AAC1B,CAAC"}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pure value transform pipeline for extraction.
|
|
3
|
+
*
|
|
4
|
+
* No browser dependency. All functions are stateless and side-effect free.
|
|
5
|
+
*
|
|
6
|
+
* Built-in transforms:
|
|
7
|
+
* - trim, lowercase, uppercase, number, clean, slug (from playwright extractor)
|
|
8
|
+
* - date: parse various date formats to ISO-8601 (including relative dates)
|
|
9
|
+
* - price: "$1,299.99" → 1299.99
|
|
10
|
+
* - url: resolve relative URLs to absolute
|
|
11
|
+
* - email: extract email address from text
|
|
12
|
+
* - stripHtml: remove HTML tags and decode all HTML entities
|
|
13
|
+
* - boolean: normalize truthy/falsy strings to "true"/"false"
|
|
14
|
+
* - compact: remove ALL whitespace (including newlines, tabs)
|
|
15
|
+
*/
|
|
16
|
+
import type { Transform } from './types.js';
|
|
17
|
+
/**
|
|
18
|
+
* Apply a single named transform to a string value.
|
|
19
|
+
*/
|
|
20
|
+
export declare function applyTransform(value: string, transform: Transform): string | null;
|
|
21
|
+
/**
|
|
22
|
+
* Apply a pipeline of transforms to a string value.
|
|
23
|
+
*/
|
|
24
|
+
export declare function applyTransforms(value: string | null, transforms?: Transform | Transform[]): string | null;
|
|
25
|
+
/**
|
|
26
|
+
* Apply a regex pattern to extract a substring.
|
|
27
|
+
*/
|
|
28
|
+
export declare function applyPattern(value: string | null, pattern?: string, group?: number): string | null;
|
|
29
|
+
/**
|
|
30
|
+
* Coerce a string to the specified type.
|
|
31
|
+
*/
|
|
32
|
+
export declare function coerceType(value: string | null, type?: string): unknown;
|
|
33
|
+
/**
|
|
34
|
+
* Parse a price string to a number.
|
|
35
|
+
* Handles: "$1,299.99", "EUR 1.299,99", "1 299,99 kr", etc.
|
|
36
|
+
*/
|
|
37
|
+
export declare function parsePrice(value: string): number | null;
|
|
38
|
+
/**
|
|
39
|
+
* Parse a date string to ISO-8601 format.
|
|
40
|
+
* Handles common formats: "Jan 15, 2024", "15/01/2024", "2024-01-15", etc.
|
|
41
|
+
*/
|
|
42
|
+
export declare function parseDate(value: string): string | null;
|
|
43
|
+
/**
|
|
44
|
+
* Resolve a relative URL against a base URL.
|
|
45
|
+
*/
|
|
46
|
+
export declare function resolveUrl(relative: string, baseUrl: string): string;
|
|
47
|
+
//# sourceMappingURL=transforms.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"transforms.d.ts","sourceRoot":"","sources":["../../src/extraction/transforms.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAEH,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAE5C;;GAEG;AACH,wBAAgB,cAAc,CAAC,KAAK,EAAE,MAAM,EAAE,SAAS,EAAE,SAAS,GAAG,MAAM,GAAG,IAAI,CA4CjF;AAED;;GAEG;AACH,wBAAgB,eAAe,CAC7B,KAAK,EAAE,MAAM,GAAG,IAAI,EACpB,UAAU,CAAC,EAAE,SAAS,GAAG,SAAS,EAAE,GACnC,MAAM,GAAG,IAAI,CASf;AAED;;GAEG;AACH,wBAAgB,YAAY,CAC1B,KAAK,EAAE,MAAM,GAAG,IAAI,EACpB,OAAO,CAAC,EAAE,MAAM,EAChB,KAAK,CAAC,EAAE,MAAM,GACb,MAAM,GAAG,IAAI,CAUf;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,MAAM,GAAG,IAAI,EAAE,IAAI,CAAC,EAAE,MAAM,GAAG,OAAO,CAQvE;AAED;;;GAGG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAmBvD;AAED;;;GAGG;AACH,wBAAgB,SAAS,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CA+BtD;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,CAMpE"}
|