@olib-ai/owl-browser-sdk 2.0.5 → 2.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. package/README.md +107 -0
  2. package/dist/extraction/content-cleaner.d.ts +40 -0
  3. package/dist/extraction/content-cleaner.d.ts.map +1 -0
  4. package/dist/extraction/content-cleaner.js +393 -0
  5. package/dist/extraction/content-cleaner.js.map +1 -0
  6. package/dist/extraction/extractor.d.ts +139 -0
  7. package/dist/extraction/extractor.d.ts.map +1 -0
  8. package/dist/extraction/extractor.js +212 -0
  9. package/dist/extraction/extractor.js.map +1 -0
  10. package/dist/extraction/html-processor.d.ts +75 -0
  11. package/dist/extraction/html-processor.d.ts.map +1 -0
  12. package/dist/extraction/html-processor.js +192 -0
  13. package/dist/extraction/html-processor.js.map +1 -0
  14. package/dist/extraction/index.d.ts +14 -0
  15. package/dist/extraction/index.d.ts.map +1 -0
  16. package/dist/extraction/index.js +19 -0
  17. package/dist/extraction/index.js.map +1 -0
  18. package/dist/extraction/list-extractor.d.ts +24 -0
  19. package/dist/extraction/list-extractor.d.ts.map +1 -0
  20. package/dist/extraction/list-extractor.js +303 -0
  21. package/dist/extraction/list-extractor.js.map +1 -0
  22. package/dist/extraction/meta-extractor.d.ts +40 -0
  23. package/dist/extraction/meta-extractor.d.ts.map +1 -0
  24. package/dist/extraction/meta-extractor.js +216 -0
  25. package/dist/extraction/meta-extractor.js.map +1 -0
  26. package/dist/extraction/pagination.d.ts +29 -0
  27. package/dist/extraction/pagination.d.ts.map +1 -0
  28. package/dist/extraction/pagination.js +323 -0
  29. package/dist/extraction/pagination.js.map +1 -0
  30. package/dist/extraction/pattern-detector.d.ts +16 -0
  31. package/dist/extraction/pattern-detector.d.ts.map +1 -0
  32. package/dist/extraction/pattern-detector.js +390 -0
  33. package/dist/extraction/pattern-detector.js.map +1 -0
  34. package/dist/extraction/scrape-session.d.ts +23 -0
  35. package/dist/extraction/scrape-session.d.ts.map +1 -0
  36. package/dist/extraction/scrape-session.js +192 -0
  37. package/dist/extraction/scrape-session.js.map +1 -0
  38. package/dist/extraction/selector-engine.d.ts +23 -0
  39. package/dist/extraction/selector-engine.d.ts.map +1 -0
  40. package/dist/extraction/selector-engine.js +127 -0
  41. package/dist/extraction/selector-engine.js.map +1 -0
  42. package/dist/extraction/table-extractor.d.ts +29 -0
  43. package/dist/extraction/table-extractor.d.ts.map +1 -0
  44. package/dist/extraction/table-extractor.js +282 -0
  45. package/dist/extraction/table-extractor.js.map +1 -0
  46. package/dist/extraction/transforms.d.ts +47 -0
  47. package/dist/extraction/transforms.d.ts.map +1 -0
  48. package/dist/extraction/transforms.js +277 -0
  49. package/dist/extraction/transforms.js.map +1 -0
  50. package/dist/extraction/types.d.ts +199 -0
  51. package/dist/extraction/types.d.ts.map +1 -0
  52. package/dist/extraction/types.js +5 -0
  53. package/dist/extraction/types.js.map +1 -0
  54. package/dist/index.d.ts +1 -0
  55. package/dist/index.d.ts.map +1 -1
  56. package/dist/index.js +2 -0
  57. package/dist/index.js.map +1 -1
  58. package/dist/playwright/browser-type.d.ts +101 -0
  59. package/dist/playwright/browser-type.d.ts.map +1 -0
  60. package/dist/playwright/browser-type.js +134 -0
  61. package/dist/playwright/browser-type.js.map +1 -0
  62. package/dist/playwright/browser.d.ts +98 -0
  63. package/dist/playwright/browser.d.ts.map +1 -0
  64. package/dist/playwright/browser.js +229 -0
  65. package/dist/playwright/browser.js.map +1 -0
  66. package/dist/playwright/context.d.ts +217 -0
  67. package/dist/playwright/context.d.ts.map +1 -0
  68. package/dist/playwright/context.js +518 -0
  69. package/dist/playwright/context.js.map +1 -0
  70. package/dist/playwright/extractor.d.ts +108 -0
  71. package/dist/playwright/extractor.d.ts.map +1 -0
  72. package/dist/playwright/extractor.js +404 -0
  73. package/dist/playwright/extractor.js.map +1 -0
  74. package/dist/playwright/frame.d.ts +147 -0
  75. package/dist/playwright/frame.d.ts.map +1 -0
  76. package/dist/playwright/frame.js +492 -0
  77. package/dist/playwright/frame.js.map +1 -0
  78. package/dist/playwright/index.d.ts +163 -0
  79. package/dist/playwright/index.d.ts.map +1 -0
  80. package/dist/playwright/index.js +313 -0
  81. package/dist/playwright/index.js.map +1 -0
  82. package/dist/playwright/keyboard.d.ts +74 -0
  83. package/dist/playwright/keyboard.d.ts.map +1 -0
  84. package/dist/playwright/keyboard.js +187 -0
  85. package/dist/playwright/keyboard.js.map +1 -0
  86. package/dist/playwright/locator.d.ts +237 -0
  87. package/dist/playwright/locator.d.ts.map +1 -0
  88. package/dist/playwright/locator.js +667 -0
  89. package/dist/playwright/locator.js.map +1 -0
  90. package/dist/playwright/mouse.d.ts +82 -0
  91. package/dist/playwright/mouse.d.ts.map +1 -0
  92. package/dist/playwright/mouse.js +137 -0
  93. package/dist/playwright/mouse.js.map +1 -0
  94. package/dist/playwright/page-helpers.d.ts +267 -0
  95. package/dist/playwright/page-helpers.d.ts.map +1 -0
  96. package/dist/playwright/page-helpers.js +449 -0
  97. package/dist/playwright/page-helpers.js.map +1 -0
  98. package/dist/playwright/page.d.ts +605 -0
  99. package/dist/playwright/page.d.ts.map +1 -0
  100. package/dist/playwright/page.js +1698 -0
  101. package/dist/playwright/page.js.map +1 -0
  102. package/dist/playwright/response.d.ts +100 -0
  103. package/dist/playwright/response.d.ts.map +1 -0
  104. package/dist/playwright/response.js +194 -0
  105. package/dist/playwright/response.js.map +1 -0
  106. package/dist/playwright/types.d.ts +354 -0
  107. package/dist/playwright/types.d.ts.map +1 -0
  108. package/dist/playwright/types.js +8 -0
  109. package/dist/playwright/types.js.map +1 -0
  110. package/openapi.json +327 -35
  111. package/package.json +10 -1
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Core CSS selector extraction engine using cheerio.
3
+ *
4
+ * All methods are static/pure — they parse HTML and return extracted data.
5
+ * Reuses the transform pipeline from transforms.ts.
6
+ */
7
+ import { type CheerioAPI, type Cheerio } from 'cheerio';
8
+ import type { AnyNode } from 'domhandler';
9
+ import type { FieldSpec, ExtractedRecord } from './types.js';
10
/**
 * Extract structured data from all elements matching containerSelector.
 *
 * @param html - HTML markup to parse.
 * @param containerSelector - CSS selector identifying each container element.
 * @param fields - Output field names mapped to the spec used to resolve each
 *   value relative to its container.
 * @returns One record per matching container; an empty array when nothing matches.
 */
export declare function extractAll(html: string, containerSelector: string, fields: Record<string, FieldSpec>): ExtractedRecord[];
14
/**
 * Extract structured data from the first element matching containerSelector.
 *
 * @param html - HTML markup to parse.
 * @param containerSelector - CSS selector identifying the container element.
 * @param fields - Output field names mapped to the spec used to resolve each
 *   value relative to the container.
 * @returns The record for the first match, or `null` when nothing matches.
 */
export declare function extractFirst(html: string, containerSelector: string, fields: Record<string, FieldSpec>): ExtractedRecord | null;
18
/**
 * Count elements matching a CSS selector.
 *
 * @param html - HTML markup to parse.
 * @param selector - CSS selector to evaluate against the parsed document.
 * @returns The number of matching elements (0 when there are none).
 */
export declare function count(html: string, selector: string): number;
22
/**
 * Resolve a single field spec against an already-parsed container element.
 *
 * @param $ - The cheerio instance the container belongs to.
 * @param container - Element(s) the spec's selector is evaluated relative to.
 * @param spec - Field spec; string specs and object specs are resolved by
 *   separate code paths.
 * @returns The extracted value; its shape depends on the spec.
 */
export declare function resolveField($: CheerioAPI, container: Cheerio<AnyNode>, spec: FieldSpec): unknown;
23
+ //# sourceMappingURL=selector-engine.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"selector-engine.d.ts","sourceRoot":"","sources":["../../src/extraction/selector-engine.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAQ,KAAK,UAAU,EAAE,KAAK,OAAO,EAAE,MAAM,SAAS,CAAC;AAC9D,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,KAAK,EAAE,SAAS,EAAmB,eAAe,EAAE,MAAM,YAAY,CAAC;AAG9E;;GAEG;AACH,wBAAgB,UAAU,CACxB,IAAI,EAAE,MAAM,EACZ,iBAAiB,EAAE,MAAM,EACzB,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,GAChC,eAAe,EAAE,CAOnB;AAED;;GAEG;AACH,wBAAgB,YAAY,CAC1B,IAAI,EAAE,MAAM,EACZ,iBAAiB,EAAE,MAAM,EACzB,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,GAChC,eAAe,GAAG,IAAI,CAKxB;AAED;;GAEG;AACH,wBAAgB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,MAAM,CAG5D;AAgBD,wBAAgB,YAAY,CAC1B,CAAC,EAAE,UAAU,EACb,SAAS,EAAE,OAAO,CAAC,OAAO,CAAC,EAC3B,IAAI,EAAE,SAAS,GACd,OAAO,CAKT"}
@@ -0,0 +1,127 @@
1
+ /**
2
+ * Core CSS selector extraction engine using cheerio.
3
+ *
4
+ * All methods are static/pure — they parse HTML and return extracted data.
5
+ * Reuses the transform pipeline from transforms.ts.
6
+ */
7
+ import { load } from 'cheerio';
8
+ import { applyTransforms, applyPattern, coerceType } from './transforms.js';
9
/**
 * Build one record for every element matched by `containerSelector`.
 *
 * @param html - HTML markup to parse.
 * @param containerSelector - CSS selector picking the container elements.
 * @param fields - Field name → field spec, resolved relative to each container.
 * @returns Records in document order; an empty array when nothing matches.
 */
export function extractAll(html, containerSelector, fields) {
    const $ = load(html);
    return $(containerSelector)
        .toArray()
        .map((node) => extractItem($, $(node), fields));
}
20
/**
 * Build a record from the first element matched by `containerSelector`.
 *
 * @param html - HTML markup to parse.
 * @param containerSelector - CSS selector picking the container element.
 * @param fields - Field name → field spec, resolved relative to the container.
 * @returns The record for the first match, or `null` when nothing matches.
 */
export function extractFirst(html, containerSelector, fields) {
    const $ = load(html);
    const firstMatch = $(containerSelector).first();
    return firstMatch.length > 0 ? extractItem($, firstMatch, fields) : null;
}
30
/**
 * Report how many elements in the markup match a CSS selector.
 *
 * @param html - HTML markup to parse.
 * @param selector - CSS selector to evaluate.
 * @returns Number of matching elements.
 */
export function count(html, selector) {
    const matches = load(html)(selector);
    return matches.length;
}
37
+ // ==================== Internal ====================
38
/**
 * Resolve every field spec against one container and collect the results.
 *
 * @param $ - Cheerio instance the container belongs to.
 * @param container - Element the field selectors are evaluated relative to.
 * @param fields - Field name → field spec.
 * @returns An object with one key per entry of `fields`.
 */
function extractItem($, container, fields) {
    const record = {};
    Object.keys(fields).forEach((fieldName) => {
        record[fieldName] = resolveField($, container, fields[fieldName]);
    });
    return record;
}
45
/**
 * Resolve a single field spec against a container.
 *
 * String specs use the shorthand syntax handled by `extractStringField`;
 * anything else is treated as an object spec.
 *
 * @param $ - Cheerio instance the container belongs to.
 * @param container - Element the spec is evaluated relative to.
 * @param spec - Shorthand string or object field spec.
 * @returns The extracted value.
 */
export function resolveField($, container, spec) {
    return typeof spec === 'string'
        ? extractStringField($, container, spec)
        : extractObjectField($, container, spec);
}
51
+ function extractStringField($, container, spec) {
52
+ const atPos = spec.lastIndexOf('@');
53
+ if (atPos >= 0) {
54
+ const sel = spec.slice(0, atPos).trim();
55
+ const attr = spec.slice(atPos + 1);
56
+ const target = sel ? container.find(sel).first() : container;
57
+ if (target.length === 0)
58
+ return null;
59
+ if (attr === 'outerHTML')
60
+ return $.html(target) ?? null;
61
+ if (attr === 'innerHTML')
62
+ return target.html() ?? null;
63
+ return target.attr(attr) ?? null;
64
+ }
65
+ const el = spec.trim() ? container.find(spec).first() : container;
66
+ if (el.length === 0)
67
+ return null;
68
+ return el.text().trim() || null;
69
+ }
70
/**
 * Resolve an object field spec.
 *
 * A spec with `nested` delegates to `extractNested`. Otherwise the raw value
 * is read from the target element and passed through pattern matching,
 * transforms and type coercion, falling back to `spec.default` and then
 * `null`. With `spec.all` every match is processed and an array is returned;
 * without it only the first match is used.
 *
 * @param $ - Cheerio instance the container belongs to.
 * @param container - Element the spec's selector is evaluated relative to.
 * @param spec - Object field spec.
 * @returns A single value, an array of values (`all`), or nested records.
 */
function extractObjectField($, container, spec) {
    if (spec.nested) {
        return extractNested($, container, spec.nested);
    }
    const { selector, attribute, type } = spec;
    // Shared raw → pattern → transform → coerce pipeline for one element.
    const finalize = (element) => {
        const raw = extractRawValue($, element, attribute, type);
        const matched = applyPattern(raw, spec.pattern, spec.group);
        const transformed = applyTransforms(matched, spec.transform);
        return coerceType(transformed, type) ?? spec.default ?? null;
    };
    if (spec.all) {
        const matches = selector ? container.find(selector) : container;
        return matches.toArray().map((node) => finalize($(node)));
    }
    const target = selector ? container.find(selector).first() : container;
    return target.length === 0 ? (spec.default ?? null) : finalize(target);
}
99
+ function extractRawValue($, el, attr, type) {
100
+ if (el.length === 0)
101
+ return null;
102
+ if (type === 'html')
103
+ return $.html(el) ?? null;
104
+ if (type === 'innerHtml')
105
+ return el.html() ?? null;
106
+ if (attr) {
107
+ if (attr === 'outerHTML')
108
+ return $.html(el) ?? null;
109
+ if (attr === 'innerHTML')
110
+ return el.html() ?? null;
111
+ return el.attr(attr) ?? null;
112
+ }
113
+ return el.text().trim() || null;
114
+ }
115
/**
 * Extract a list of sub-records from elements nested inside a container.
 *
 * @param $ - Cheerio instance the container belongs to.
 * @param container - Element searched for `nested.selector`.
 * @param nested - Nested spec providing `selector`, `fields` and an optional `limit`.
 * @returns One record per nested match, stopping once `limit` records were produced.
 */
function extractNested($, container, nested) {
    const matches = container.find(nested.selector);
    const cap = nested.limit ?? matches.length;
    const records = [];
    for (let idx = 0; idx < matches.length; idx++) {
        if (idx >= cap) {
            break;
        }
        records.push(extractItem($, $(matches[idx]), nested.fields));
    }
    return records;
}
127
+ //# sourceMappingURL=selector-engine.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"selector-engine.js","sourceRoot":"","sources":["../../src/extraction/selector-engine.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,IAAI,EAAiC,MAAM,SAAS,CAAC;AAG9D,OAAO,EAAE,eAAe,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAE5E;;GAEG;AACH,MAAM,UAAU,UAAU,CACxB,IAAY,EACZ,iBAAyB,EACzB,MAAiC;IAEjC,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,MAAM,OAAO,GAAsB,EAAE,CAAC;IACtC,CAAC,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;QACnC,OAAO,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC;IAC9C,CAAC,CAAC,CAAC;IACH,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAC1B,IAAY,EACZ,iBAAyB,EACzB,MAAiC;IAEjC,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,MAAM,SAAS,GAAG,CAAC,CAAC,iBAAiB,CAAC,CAAC,KAAK,EAAE,CAAC;IAC/C,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IACxC,OAAO,WAAW,CAAC,CAAC,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;AAC3C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,KAAK,CAAC,IAAY,EAAE,QAAgB;IAClD,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,OAAO,CAAC,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC;AAC5B,CAAC;AAED,qDAAqD;AAErD,SAAS,WAAW,CAClB,CAAa,EACb,SAA2B,EAC3B,MAAiC;IAEjC,MAAM,MAAM,GAAoB,EAAE,CAAC;IACnC,KAAK,MAAM,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAClD,MAAM,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;IAClD,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,MAAM,UAAU,YAAY,CAC1B,CAAa,EACb,SAA2B,EAC3B,IAAe;IAEf,IAAI,OAAO,IAAI,KAAK,QAAQ,EAAE,CAAC;QAC7B,OAAO,kBAAkB,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;IAChD,CAAC;IACD,OAAO,kBAAkB,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;AAChD,CAAC;AAED,SAAS,kBAAkB,CACzB,CAAa,EACb,SAA2B,EAC3B,IAAY;IAEZ,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;IACpC,IAAI,KAAK,IAAI,CAAC,EAAE,CAAC;QACf,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC;QACxC,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;QACnC,MAAM,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;QAC7D,
IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,IAAI,CAAC;QACrC,IAAI,IAAI,KAAK,WAAW;YAAE,OAAO,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,IAAI,CAAC;QACxD,IAAI,IAAI,KAAK,WAAW;YAAE,OAAO,MAAM,CAAC,IAAI,EAAE,IAAI,IAAI,CAAC;QACvD,OAAO,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC;IACnC,CAAC;IACD,MAAM,EAAE,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;IAClE,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IACjC,OAAO,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,IAAI,CAAC;AAClC,CAAC;AAED,SAAS,kBAAkB,CACzB,CAAa,EACb,SAA2B,EAC3B,IAAqB;IAErB,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;QAChB,OAAO,aAAa,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;IAClD,CAAC;IAED,MAAM,GAAG,GAAG,IAAI,CAAC,QAAQ,CAAC;IAC1B,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC;IAC5B,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC;IAE5B,IAAI,IAAI,CAAC,GAAG,EAAE,CAAC;QACb,MAAM,MAAM,GAAc,EAAE,CAAC;QAC7B,MAAM,OAAO,GAAG,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QACtD,OAAO,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;YACtB,IAAI,GAAG,GAAkB,eAAe,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,SAAS,CAAC,CAAC;YACpE,GAAG,GAAG,YAAY,CAAC,GAAG,EAAE,IAAI,CAAC,OAAO,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC;YAClD,GAAG,GAAG,eAAe,CAAC,GAAG,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;YAC3C,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE,SAAS,CAAC,CAAC;YAC3C,MAAM,CAAC,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,CAAC;QAC/C,CAAC,CAAC,CAAC;QACH,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,MAAM,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;IAC7D,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxB,OAAO,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC;IAC9B,CAAC;IAED,IAAI,GAAG,GAAkB,eAAe,CAAC,CAAC,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,CAAC,CAAC;IACrE,GAAG,GAAG,YAAY,CAAC,GAAG,EAAE,IAAI,CAAC,OAAO,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC;IAClD,GAAG,GAAG,eAAe,CAAC,GAAG,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;IAC3C,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE,SAAS,CAAC,CAAC;IAC3C,OAAO,OAAO,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC;AACzC,CAAC;AAED,SAAS,eAAe,CA
CtB,CAAa,EACb,EAAoB,EACpB,IAAa,EACb,IAAa;IAEb,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IACjC,IAAI,IAAI,KAAK,MAAM;QAAE,OAAO,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,IAAI,CAAC;IAC/C,IAAI,IAAI,KAAK,WAAW;QAAE,OAAO,EAAE,CAAC,IAAI,EAAE,IAAI,IAAI,CAAC;IACnD,IAAI,IAAI,EAAE,CAAC;QACT,IAAI,IAAI,KAAK,WAAW;YAAE,OAAO,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,IAAI,CAAC;QACpD,IAAI,IAAI,KAAK,WAAW;YAAE,OAAO,EAAE,CAAC,IAAI,EAAE,IAAI,IAAI,CAAC;QACnD,OAAO,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC;IAC/B,CAAC;IACD,OAAO,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,IAAI,CAAC;AAClC,CAAC;AAED,SAAS,aAAa,CACpB,CAAa,EACb,SAA2B,EAC3B,MAA+E;IAE/E,MAAM,OAAO,GAAsB,EAAE,CAAC;IACtC,MAAM,QAAQ,GAAG,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;IACjD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,IAAI,QAAQ,CAAC,MAAM,CAAC;IAC9C,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACtB,IAAI,CAAC,IAAI,KAAK;YAAE,OAAO,KAAK,CAAC;QAC7B,OAAO,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;QACnD,OAAO,SAAS,CAAC;IACnB,CAAC,CAAC,CAAC;IACH,OAAO,OAAO,CAAC;AACjB,CAAC"}
@@ -0,0 +1,29 @@
1
+ /**
2
+ * Specialized table extraction.
3
+ *
4
+ * Handles <table> with colspan/rowspan, headerless tables, key-value transpose,
5
+ * CSS grid/flexbox "tables", and <dl>/<dt>/<dd> definition lists.
6
+ */
7
+ import type { ExtractedRecord, TableOptions } from './types.js';
8
/**
 * Extract a <table> as an array of records.
 *
 * @param html - HTML markup to parse.
 * @param selector - CSS selector for the table; only the first match is used.
 *   Defaults to `'table'` in the implementation.
 * @param options - Optional table options (explicit headers, row skipping/limiting,
 *   transposition, per-column types).
 * @returns One record per data row; an empty array when the selector matches nothing.
 */
export declare function extractTable(html: string, selector?: string, options?: TableOptions): ExtractedRecord[];
12
/**
 * Extract a CSS grid/flexbox "table" (div-based layout).
 *
 * @param html - HTML markup to parse.
 * @param containerSelector - CSS selector for the layout container; only the first match is used.
 * @param itemSelector - Optional selector for the row-like items inside the container;
 *   when omitted the container's direct children are used.
 * @returns One record per item that has at least one child element.
 */
export declare function extractGrid(html: string, containerSelector: string, itemSelector?: string): ExtractedRecord[];
16
/**
 * Extract a <dl>/<dt>/<dd> definition list as key-value records.
 *
 * @param html - HTML markup to parse.
 * @param selector - CSS selector for the list; only the first match is used.
 *   Defaults to `'dl'` in the implementation.
 * @returns A single record keyed by term text; an empty object when the selector matches nothing.
 */
export declare function extractDefinitionList(html: string, selector?: string): ExtractedRecord;
20
/**
 * Auto-detect all table-like structures on the page.
 *
 * @param html - HTML markup to parse.
 * @returns One descriptor per `<table>` followed by one per `<dl>`, each with a
 *   selector for the element, its kind, and its row/column counts
 *   (definition lists always report two columns).
 */
export declare function detectTables(html: string): Array<{
    selector: string;
    type: 'table' | 'definition-list';
    rowCount: number;
    columnCount: number;
}>;
29
+ //# sourceMappingURL=table-extractor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"table-extractor.d.ts","sourceRoot":"","sources":["../../src/extraction/table-extractor.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,KAAK,EAAE,eAAe,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAEhE;;GAEG;AACH,wBAAgB,YAAY,CAC1B,IAAI,EAAE,MAAM,EACZ,QAAQ,GAAE,MAAgB,EAC1B,OAAO,CAAC,EAAE,YAAY,GACrB,eAAe,EAAE,CA4JnB;AAED;;GAEG;AACH,wBAAgB,WAAW,CACzB,IAAI,EAAE,MAAM,EACZ,iBAAiB,EAAE,MAAM,EACzB,YAAY,CAAC,EAAE,MAAM,GACpB,eAAe,EAAE,CA4BnB;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CACnC,IAAI,EAAE,MAAM,EACZ,QAAQ,GAAE,MAAa,GACtB,eAAe,CAoBjB;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,KAAK,CAAC;IAChD,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,OAAO,GAAG,iBAAiB,CAAC;IAClC,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;CACrB,CAAC,CAuCD"}
@@ -0,0 +1,282 @@
1
+ /**
2
+ * Specialized table extraction.
3
+ *
4
+ * Handles <table> with colspan/rowspan, headerless tables, key-value transpose,
5
+ * CSS grid/flexbox "tables", and <dl>/<dt>/<dd> definition lists.
6
+ */
7
+ import { load } from 'cheerio';
8
/**
 * Extract a <table> as an array of records.
 *
 * Headers come from, in order: `options.headers`, `<th>` cells in `<thead>`
 * (multi-row heads are merged as "A / B"), `<th>` cells in the first row, a
 * first body row consisting only of `<th>`, or generated `col_N` names.
 *
 * Only cells that are direct children of the table's own rows are read, so a
 * table nested inside a cell contributes to that cell's text but never adds
 * columns or rows of its own.
 *
 * @param html - HTML markup to parse.
 * @param selector - CSS selector for the table; only the first match is used.
 * @param options - Optional settings: `headers` (explicit column names),
 *   `skipRows` (data rows to drop from the top), `maxRows` (cap on data rows),
 *   `transpose` (treat the first column as keys), `columnTypes` (per-column
 *   `'number'` coercion).
 * @returns One record per data row (or per column when transposed); an empty
 *   array when the selector matches nothing. A `<caption>` is attached to the
 *   first record as `_caption`.
 */
export function extractTable(html, selector = 'table', options) {
    const $ = load(html);
    const table = $(selector).first();
    if (table.length === 0)
        return [];
    const caption = table.find('> caption').first();
    const captionText = caption.length > 0 ? caption.text().trim() : null;
    const headers = options?.headers ? [...options.headers] : [];
    if (headers.length === 0) {
        headers.push(...detectTableHeaders($, table));
    }
    // Use direct child selectors to avoid nested table rows
    const tbody = table.find('> tbody').first();
    const rows = tbody.length > 0 ? tbody.find('> tr') : table.find('> tr');
    let startIdx = 0;
    // If still no headers, use first row as headers or generate column names
    if (headers.length === 0 && rows.length > 0) {
        const firstRow = $(rows[0]);
        const tds = firstRow.children('td');
        const ths = firstRow.children('th');
        if (ths.length > 0 && tds.length === 0) {
            // First row is a header row
            ths.each((_i, cell) => {
                headers.push($(cell).text().trim());
            });
            startIdx = 1;
        }
        else {
            // First row is data: name every cell of that row, including row-header <th> cells.
            const colCount = firstRow.children('td, th').length;
            for (let c = 0; c < colCount; c++) {
                headers.push(`col_${c}`);
            }
        }
    }
    // Collect data rows, skipping rows that contain only <th> cells.
    const dataRows = [];
    rows.each((i, row) => {
        if (i < startIdx)
            return;
        const $row = $(row);
        if ($row.children('th').length > 0 && $row.children('td').length === 0)
            return;
        dataRows.push($row);
    });
    const skipRows = options?.skipRows ?? 0;
    const effectiveRows = dataRows.slice(skipRows);
    const maxRows = options?.maxRows;
    const limitedRows = maxRows ? effectiveRows.slice(0, maxRows) : effectiveRows;
    const grid = buildCellGrid($, limitedRows);
    if (options?.transpose) {
        return transposeGrid(grid, headers);
    }
    // Convert grid to records
    const results = [];
    for (const row of grid) {
        const record = {};
        for (let c = 0; c < headers.length; c++) {
            const key = headers[c] || `col_${c}`;
            let value = row[c] ?? null;
            if (options?.columnTypes?.[key] === 'number' && typeof value === 'string') {
                const n = parseFloat(value.replace(/[^0-9.\-]/g, ''));
                value = isNaN(n) ? value : n;
            }
            record[key] = value;
        }
        results.push(record);
    }
    // Add caption to first record if present
    if (captionText && results.length > 0) {
        results[0]['_caption'] = captionText;
    }
    return results;
}
/** Derive column names from the table's <th> cells (thead first, then the first row). */
function detectTableHeaders($, table) {
    const headers = [];
    const collect = (_i, th) => {
        headers.push($(th).text().trim());
    };
    const thead = table.find('> thead').first();
    if (thead.length === 0) {
        // No thead: use the <th> cells of the first row, if it has any.
        table.find('> tbody > tr, > tr').first().find('> th').each(collect);
        return headers;
    }
    const headerRows = thead.find('> tr');
    if (headerRows.length <= 1) {
        thead.find('> tr > th').each(collect);
        return headers;
    }
    // Multi-row header: "Category" + "Name" -> "Category / Name"
    const headerGrid = headerRows
        .toArray()
        .map((row) => $(row).find('> th').toArray().map((th) => $(th).text().trim()));
    const maxCols = Math.max(...headerGrid.map((r) => r.length));
    for (let c = 0; c < maxCols; c++) {
        const parts = [];
        for (const row of headerGrid) {
            const val = row[c];
            if (val && !parts.includes(val))
                parts.push(val);
        }
        headers.push(parts.join(' / '));
    }
    return headers;
}
/** Parse a colspan/rowspan attribute into an integer within [1, max]; invalid values count as 1. */
function parseSpan(raw, max) {
    const n = parseInt(raw ?? '1', 10);
    if (!Number.isFinite(n) || n < 1)
        return 1;
    return Math.min(n, max);
}
/** Lay the rows' direct cells out on a rectangular grid, expanding colspan/rowspan. */
function buildCellGrid($, rowElements) {
    const grid = rowElements.map(() => []);
    rowElements.forEach(($row, r) => {
        let cellIdx = 0;
        $row.children('td, th').each((_ci, cell) => {
            const $cell = $(cell);
            // 1000 / 65534 are the limits the HTML standard puts on colspan / rowspan.
            const colspan = parseSpan($cell.attr('colspan'), 1000);
            // A rowspan never reaches past the last selected row, so it cannot
            // create extra records or bypass maxRows.
            const rowspan = Math.min(parseSpan($cell.attr('rowspan'), 65534), rowElements.length - r);
            const text = $cell.text().trim();
            // Skip slots already filled by a rowspan from a previous row.
            while (grid[r][cellIdx] !== undefined)
                cellIdx++;
            for (let dr = 0; dr < rowspan; dr++) {
                for (let dc = 0; dc < colspan; dc++) {
                    grid[r + dr][cellIdx + dc] = text;
                }
            }
            cellIdx += colspan;
        });
    });
    return grid;
}
164
/**
 * Extract a div-based (CSS grid / flexbox) layout as records.
 *
 * Every item becomes one record and every child element of an item becomes
 * one field. The field key is the child's `data-field`, else its
 * `aria-label`, else a key inferred from its class name or position.
 *
 * @param html - HTML markup to parse.
 * @param containerSelector - CSS selector for the container; only the first match is used.
 * @param itemSelector - Optional selector for items inside the container;
 *   defaults to the container's direct children.
 * @returns Records for all items that have at least one child element.
 */
export function extractGrid(html, containerSelector, itemSelector) {
    const $ = load(html);
    const container = $(containerSelector).first();
    if (container.length === 0) {
        return [];
    }
    const items = itemSelector ? container.find(itemSelector) : container.children();
    const records = [];
    for (const item of items.toArray()) {
        const record = {};
        $(item).children().each((position, child) => {
            const $child = $(child);
            const key = $child.attr('data-field') ??
                $child.attr('aria-label') ??
                inferFieldKey($child, position);
            record[key] = $child.text().trim() || null;
        });
        // Items without child elements produce no fields and are dropped.
        if (Object.keys(record).length > 0) {
            records.push(record);
        }
    }
    return records;
}
193
/**
 * Extract a <dl>/<dt>/<dd> definition list as a single key-value record.
 *
 * Each `<dt>` supplies a key and the next `<dd>` supplies its value (an empty
 * `<dd>` yields `null`). Additional `<dd>` elements for the same term are
 * ignored. `<dt>`/`<dd>` groups wrapped in a `<div>` directly inside the
 * `<dl>` (permitted by the HTML standard) are read as well.
 *
 * @param html - HTML markup to parse.
 * @param selector - CSS selector for the list; only the first match is used.
 * @returns The key-value record; an empty object when the selector matches nothing.
 */
export function extractDefinitionList(html, selector = 'dl') {
    const $ = load(html);
    const dl = $(selector).first();
    if (dl.length === 0)
        return {};
    const record = {};
    let currentKey = null;
    const tagOf = (el) => (el.type === 'tag' ? el.tagName?.toLowerCase() : '');
    const consume = (el) => {
        const tag = tagOf(el);
        if (tag === 'dt') {
            currentKey = $(el).text().trim();
        }
        else if (tag === 'dd' && currentKey) {
            record[currentKey] = $(el).text().trim() || null;
            // One value per term: later <dd> siblings are skipped until the next <dt>.
            currentKey = null;
        }
    };
    dl.children().each((_i, el) => {
        if (tagOf(el) === 'div') {
            // Unwrap one level of <div> grouping around dt/dd pairs.
            $(el).children().each((_j, inner) => {
                consume(inner);
            });
            return;
        }
        consume(el);
    });
    return record;
}
216
/**
 * Auto-detect all table-like structures on the page.
 *
 * Reports every `<table>` followed by every `<dl>`. Each descriptor carries a
 * selector whose first match is the described element: `tag#id`, then
 * `tag.firstClass`, then the bare tag for the first element of its kind, and
 * otherwise a structural `parent > child:nth-of-type(n)` path.
 *
 * @param html - HTML markup to parse.
 * @returns Descriptors with `selector`, `type`, `rowCount` and `columnCount`.
 *   For tables `rowCount` counts all `<tr>` descendants and `columnCount` the
 *   cells of the first row; definition lists report their `<dt>` count and
 *   two columns.
 */
export function detectTables(html) {
    const $ = load(html);
    const results = [];
    // Detect <table> elements
    $('table').each((i, el) => {
        const allRows = $(el).find('tr');
        results.push({
            selector: buildElementSelector($, el, 'table', i),
            type: 'table',
            rowCount: allRows.length,
            // Cells of the first row only; `tr:first-child` would also match the
            // first row of every other section and of nested tables.
            columnCount: allRows.first().children('td, th').length,
        });
    });
    // Detect <dl> elements
    $('dl').each((i, el) => {
        results.push({
            selector: buildElementSelector($, el, 'dl', i),
            type: 'definition-list',
            rowCount: $(el).find('dt').length,
            columnCount: 2,
        });
    });
    return results;
}
/** Pick the simplest selector whose first match in the document is `el`. */
function buildElementSelector($, el, tag, index) {
    const $el = $(el);
    const candidates = [];
    const id = $el.attr('id');
    if (id)
        candidates.push(`${tag}#${escapeCssIdentifier(id)}`);
    // trim() so a class attribute with leading whitespace still yields its first class.
    const firstClass = ($el.attr('class') ?? '').trim().split(/\s+/)[0];
    if (firstClass)
        candidates.push(`${tag}.${escapeCssIdentifier(firstClass)}`);
    if (index === 0)
        candidates.push(tag);
    for (const candidate of candidates) {
        // Reject candidates that resolve to a different element (duplicate ids/classes).
        if ($(candidate).first()[0] === el)
            return candidate;
    }
    return buildStructuralPath($, el);
}
/** Build an unambiguous `a:nth-of-type(n) > b:nth-of-type(m)` path from the root element down to `el`. */
function buildStructuralPath($, el) {
    const segments = [];
    for (let node = $(el); node.length > 0; node = node.parent()) {
        const tagName = node[0].tagName.toLowerCase();
        // :nth-of-type counts same-tag siblings, so the position is relative to the parent.
        const position = node.prevAll(tagName).length + 1;
        segments.unshift(`${tagName}:nth-of-type(${position})`);
    }
    return segments.join(' > ');
}
/** Escape a raw id/class value so it can be embedded in a CSS selector. */
function escapeCssIdentifier(value) {
    // Hex escapes (with the terminating space) are valid for any character,
    // including a leading digit or a lone/leading-digit hyphen.
    return value.replace(/^[0-9]|^-(?=[0-9]|$)|[^a-zA-Z0-9_\-\u0080-\uffff]/g, (ch) => `\\${ch.codePointAt(0).toString(16)} `);
}
255
+ // ==================== Internal ====================
256
+ function transposeGrid(grid, headers) {
257
+ // In a transposed table, the first column becomes keys
258
+ if (grid.length === 0)
259
+ return [];
260
+ const results = [];
261
+ // Determine column count from grid
262
+ const colCount = Math.max(...grid.map(r => r?.length ?? 0));
263
+ for (let c = 1; c < colCount; c++) {
264
+ const record = {};
265
+ for (let r = 0; r < grid.length; r++) {
266
+ const key = grid[r]?.[0] ?? `row_${r}`;
267
+ record[key] = grid[r]?.[c] ?? null;
268
+ }
269
+ results.push(record);
270
+ }
271
+ return results;
272
+ }
273
+ function inferFieldKey(el, index) {
274
+ const classes = el.attr('class');
275
+ if (classes) {
276
+ const first = classes.split(/\s+/)[0];
277
+ if (first)
278
+ return first;
279
+ }
280
+ return `field_${index}`;
281
+ }
282
+ //# sourceMappingURL=table-extractor.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"table-extractor.js","sourceRoot":"","sources":["../../src/extraction/table-extractor.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,IAAI,EAAiC,MAAM,SAAS,CAAC;AAI9D;;GAEG;AACH,MAAM,UAAU,YAAY,CAC1B,IAAY,EACZ,WAAmB,OAAO,EAC1B,OAAsB;IAEtB,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,MAAM,KAAK,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,KAAK,EAAE,CAAC;IAClC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAElC,IAAI,OAAO,GAAa,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAErE,kBAAkB;IAClB,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,KAAK,EAAE,CAAC;IAChD,MAAM,WAAW,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;IAEtE,yCAAyC;IACzC,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,KAAK,EAAE,CAAC;QAC5C,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACrB,kEAAkE;YAClE,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACtC,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC1B,mDAAmD;gBACnD,MAAM,UAAU,GAAe,EAAE,CAAC;gBAClC,UAAU,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE;oBAC3B,MAAM,UAAU,GAAa,EAAE,CAAC;oBAChC,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,EAAE,EAAE,EAAE;wBACnC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC;oBACvC,CAAC,CAAC,CAAC;oBACH,UAAU,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;gBAC9B,CAAC,CAAC,CAAC;gBACH,oDAAoD;gBACpD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC;gBAC3D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,EAAE,CAAC,EAAE,EAAE,CAAC;oBACjC,MAAM,KAAK,GAAa,EAAE,CAAC;oBAC3B,KAAK,MAAM,GAAG,IAAI,UAAU,EAAE,CAAC;wBAC7B,MAAM,GAAG,GAAG,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;wBAC3B,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC;4BAAE,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;oBACnD,CAAC;oBACD,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;gBAClC,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,oBAAoB;gBAC
pB,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;oBACtC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC;gBACpC,CAAC,CAAC,CAAC;YACL,CAAC;QACH,CAAC;aAAM,CAAC;YACN,kDAAkD;YAClD,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC,KAAK,EAAE,CAAC;YAC1D,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxB,MAAM,GAAG,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBAClC,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACnB,GAAG,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;wBAClB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC;oBACpC,CAAC,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,wDAAwD;IACxD,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,KAAK,EAAE,CAAC;IAC5C,MAAM,IAAI,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC;QAC3B,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC;QACpB,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACvB,IAAI,QAAQ,GAAG,CAAC,CAAC;IAEjB,yEAAyE;IACzE,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5C,MAAM,QAAQ,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,CAAC;QAC7B,MAAM,GAAG,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAClC,MAAM,GAAG,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAClC,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvC,4BAA4B;YAC5B,GAAG,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,IAAI,EAAE,EAAE;gBACpB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC;YACtC,CAAC,CAAC,CAAC;YACH,QAAQ,GAAG,CAAC,CAAC;QACf,CAAC;aAAM,CAAC;YACN,4CAA4C;YAC5C,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,IAAI,QAAQ,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;YAClE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,EAAE,CAAC,EAAE,EAAE,CAAC;gBAClC,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;YAC3B,CAAC;YACD,mCAAmC;YACnC,QAAQ,GAAG,CAAC,CAAC;QACf,CAAC;IACH,CAAC;IAED,oBAAoB;IACpB,MAAM,QAAQ,GAAuB,EAAE,CAAC;IACxC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE;QACnB,IAAI,CAAC,GAAG,QAAQ;YAAE,OAAO;QACzB,MAAM,GAAG,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC9B,MAAM,GAAG,GAAG,C
AAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC9B,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO,CAAC,wBAAwB;QACxE,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACxB,CAAC,CAAC,CAAC;IAEH,iBAAiB;IACjB,MAAM,QAAQ,GAAG,OAAO,EAAE,QAAQ,IAAI,CAAC,CAAC;IACxC,MAAM,aAAa,GAAG,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IAE/C,gBAAgB;IAChB,MAAM,OAAO,GAAG,OAAO,EAAE,OAAO,CAAC;IACjC,MAAM,WAAW,GAAG,OAAO,CAAC,CAAC,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC;IAE9E,iCAAiC;IACjC,MAAM,IAAI,GAAwB,EAAE,CAAC;IACrC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5C,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;YAAE,IAAI,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC;QAC3B,MAAM,KAAK,GAAG,WAAW,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC7C,IAAI,OAAO,GAAG,CAAC,CAAC;QAChB,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE;YACvB,MAAM,KAAK,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC;YACtB,MAAM,OAAO,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC;YAChE,MAAM,OAAO,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC;YAChE,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAEjC,OAAO,IAAI,CAAC,CAAC,CAAE,CAAC,OAAO,CAAC,KAAK,SAAS;gBAAE,OAAO,EAAE,CAAC;YAElD,KAAK,IAAI,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,OAAO,EAAE,EAAE,EAAE,EAAE,CAAC;gBACpC,KAAK,IAAI,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,OAAO,EAAE,EAAE,EAAE,EAAE,CAAC;oBACpC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,CAAC;wBAAE,IAAI,CAAC,CAAC,GAAG,EAAE,CAAC,GAAG,EAAE,CAAC;oBACrC,IAAI,CAAC,CAAC,GAAG,EAAE,CAAE,CAAC,OAAO,GAAG,EAAE,CAAC,GAAG,IAAI,CAAC;gBACrC,CAAC;YACH,CAAC;YACD,OAAO,IAAI,OAAO,CAAC;QACrB,CAAC,CAAC,CAAC;IACL,CAAC;IAED,yBAAyB;IACzB,IAAI,OAAO,EAAE,SAAS,EAAE,CAAC;QACvB,OAAO,aAAa,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACtC,CAAC;IAED,0BAA0B;IAC1B,MAAM,OAAO,GAAsB,EAAE,CAAC;IACtC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,IAAI,CAAC,GAAG;YAAE,SAAS;QACnB,MAAM,MAAM,GAAoB,EAAE,CAAC;QACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,
GAAG,GAAG,OAAO,CAAC,CAAC,CAAC,IAAI,OAAO,CAAC,EAAE,CAAC;YACrC,IAAI,KAAK,GAAY,GAAG,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC;YACpC,IAAI,OAAO,EAAE,WAAW,EAAE,CAAC,GAAG,CAAC,KAAK,QAAQ,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;gBAC1E,MAAM,CAAC,GAAG,UAAU,CAAC,KAAK,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,CAAC,CAAC;gBACtD,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YAC/B,CAAC;YACD,MAAM,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;QACtB,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACvB,CAAC;IAED,yCAAyC;IACzC,IAAI,WAAW,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtC,OAAO,CAAC,CAAC,CAAE,CAAC,UAAU,CAAC,GAAG,WAAW,CAAC;IACxC,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CACzB,IAAY,EACZ,iBAAyB,EACzB,YAAqB;IAErB,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,MAAM,SAAS,GAAG,CAAC,CAAC,iBAAiB,CAAC,CAAC,KAAK,EAAE,CAAC;IAC/C,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEtC,MAAM,KAAK,GAAG,YAAY,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,QAAQ,EAAE,CAAC;IACjF,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,KAAK,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;QACpB,MAAM,MAAM,GAAoB,EAAE,CAAC;QACnC,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,kDAAkD;QAClD,GAAG,CAAC,QAAQ,EAAE,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,EAAE;YAC/B,MAAM,MAAM,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;YACxB,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAClC,0DAA0D;YAC1D,MAAM,GAAG,GACP,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC;gBACzB,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC;gBACzB,aAAa,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;YAC3B,MAAM,CAAC,GAAG,CAAC,GAAG,IAAI,IAAI,IAAI,CAAC;QAC7B,CAAC,CAAC,CAAC;QACH,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACnC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACvB,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACnC,IAAY,EACZ,WAAmB,IAAI;IAEvB,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,MAAM,EAAE,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,KAAK,EAAE,CAAC;IAC/B,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAE/B,MAAM,MAAM,GAAoB,EAAE,CAAC;IACnC,IAAI,UA
AU,GAAkB,IAAI,CAAC;IAErC,EAAE,CAAC,QAAQ,EAAE,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;QAC5B,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,GAAG,GAAG,EAAE,CAAC,IAAI,KAAK,KAAK,CAAC,CAAC,CAAE,EAA0B,CAAC,OAAO,EAAE,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACxF,IAAI,GAAG,KAAK,IAAI,EAAE,CAAC;YACjB,UAAU,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;QACjC,CAAC;aAAM,IAAI,GAAG,KAAK,IAAI,IAAI,UAAU,EAAE,CAAC;YACtC,MAAM,CAAC,UAAU,CAAC,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,IAAI,CAAC;YAC/C,UAAU,GAAG,IAAI,CAAC;QACpB,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,IAAY;IAMvC,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,MAAM,OAAO,GAKR,EAAE,CAAC;IAER,0BAA0B;IAC1B,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACxB,MAAM,MAAM,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QACrB,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;QACtC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,sCAAsC,CAAC,CAAC,MAAM,CAAC;QACxE,MAAM,EAAE,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7B,MAAM,GAAG,GAAG,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QAClD,IAAI,QAAQ,GAAG,OAAO,CAAC;QACvB,IAAI,EAAE;YAAE,QAAQ,GAAG,SAAS,EAAE,EAAE,CAAC;aAC5B,IAAI,GAAG;YAAE,QAAQ,GAAG,SAAS,GAAG,EAAE,CAAC;aACnC,IAAI,CAAC,GAAG,CAAC;YAAE,QAAQ,GAAG,qBAAqB,CAAC,GAAG,CAAC,GAAG,CAAC;QAEzD,OAAO,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,CAAC;IAC/E,CAAC,CAAC,CAAC;IAEH,uBAAuB;IACvB,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACrB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,OAAO,GAAG,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;QACtC,MAAM,EAAE,GAAG,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,MAAM,GAAG,GAAG,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/C,IAAI,QAAQ,GAAG,IAAI,CAAC;QACpB,IAAI,EAAE;YAAE,QAAQ,GAAG,MAAM,EAAE,EAAE,CAAC;aACzB,IAAI,GAAG;YAAE,QAAQ,GAAG,MAAM,GAAG,EAAE,CAAC;aAChC,IAAI,CAAC,GAAG,CAAC;YAAE,QAAQ,GAAG,kBAAkB,CAAC,GAAG,CAAC,GAAG,CAAC;QAEtD,
OAAO,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,iBAAiB,EAAE,QAAQ,EAAE,OAAO,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC;IACzF,CAAC,CAAC,CAAC;IAEH,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,qDAAqD;AAErD,SAAS,aAAa,CACpB,IAAyB,EACzB,OAAiB;IAEjB,uDAAuD;IACvD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IACjC,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,mCAAmC;IACnC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,MAAM,IAAI,CAAC,CAAC,CAAC,CAAC;IAE5D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,EAAE,CAAC,EAAE,EAAE,CAAC;QAClC,MAAM,MAAM,GAAoB,EAAE,CAAC;QACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,OAAO,CAAC,EAAE,CAAC;YACvC,MAAM,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC;QACrC,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACvB,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,aAAa,CAAC,EAAoB,EAAE,KAAa;IACxD,MAAM,OAAO,GAAG,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACjC,IAAI,OAAO,EAAE,CAAC;QACZ,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QACtC,IAAI,KAAK;YAAE,OAAO,KAAK,CAAC;IAC1B,CAAC;IACD,OAAO,SAAS,KAAK,EAAE,CAAC;AAC1B,CAAC"}
@@ -0,0 +1,47 @@
1
+ /**
2
+ * Pure value transform pipeline for extraction.
3
+ *
4
+ * No browser dependency. All functions are stateless and side-effect free.
5
+ *
6
+ * Built-in transforms:
7
+ * - trim, lowercase, uppercase, number, clean, slug (from playwright extractor)
8
+ * - date: parse various date formats to ISO-8601 (including relative dates)
9
+ * - price: "$1,299.99" → 1299.99
10
+ * - url: resolve relative URLs to absolute
11
+ * - email: extract email address from text
12
+ * - stripHtml: remove HTML tags and decode all HTML entities
13
+ * - boolean: normalize truthy/falsy strings to "true"/"false"
14
+ * - compact: remove ALL whitespace (including newlines, tabs)
15
+ */
16
+ import type { Transform } from './types.js';
17
+ /**
18
+ * Apply a single named transform to a string value.
19
+ */
20
+ export declare function applyTransform(value: string, transform: Transform): string | null;
21
+ /**
22
+ * Apply a pipeline of transforms to a string value.
23
+ */
24
+ export declare function applyTransforms(value: string | null, transforms?: Transform | Transform[]): string | null;
25
+ /**
26
+ * Apply a regex pattern to extract a substring.
27
+ */
28
+ export declare function applyPattern(value: string | null, pattern?: string, group?: number): string | null;
29
+ /**
30
+ * Coerce a string to the specified type.
31
+ */
32
+ export declare function coerceType(value: string | null, type?: string): unknown;
33
+ /**
34
+ * Parse a price string to a number.
35
+ * Handles: "$1,299.99", "EUR 1.299,99", "1 299,99 kr", etc.
36
+ */
37
+ export declare function parsePrice(value: string): number | null;
38
+ /**
39
+ * Parse a date string to ISO-8601 format.
40
+ * Handles common formats: "Jan 15, 2024", "15/01/2024", "2024-01-15", etc.
41
+ */
42
+ export declare function parseDate(value: string): string | null;
43
+ /**
44
+ * Resolve a relative URL against a base URL.
45
+ */
46
+ export declare function resolveUrl(relative: string, baseUrl: string): string;
47
+ //# sourceMappingURL=transforms.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"transforms.d.ts","sourceRoot":"","sources":["../../src/extraction/transforms.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAEH,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAE5C;;GAEG;AACH,wBAAgB,cAAc,CAAC,KAAK,EAAE,MAAM,EAAE,SAAS,EAAE,SAAS,GAAG,MAAM,GAAG,IAAI,CA4CjF;AAED;;GAEG;AACH,wBAAgB,eAAe,CAC7B,KAAK,EAAE,MAAM,GAAG,IAAI,EACpB,UAAU,CAAC,EAAE,SAAS,GAAG,SAAS,EAAE,GACnC,MAAM,GAAG,IAAI,CASf;AAED;;GAEG;AACH,wBAAgB,YAAY,CAC1B,KAAK,EAAE,MAAM,GAAG,IAAI,EACpB,OAAO,CAAC,EAAE,MAAM,EAChB,KAAK,CAAC,EAAE,MAAM,GACb,MAAM,GAAG,IAAI,CAUf;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,MAAM,GAAG,IAAI,EAAE,IAAI,CAAC,EAAE,MAAM,GAAG,OAAO,CAQvE;AAED;;;GAGG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAmBvD;AAED;;;GAGG;AACH,wBAAgB,SAAS,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CA+BtD;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,CAMpE"}