@olib-ai/owl-browser-sdk 2.0.4 → 2.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +107 -0
- package/dist/extraction/content-cleaner.d.ts +40 -0
- package/dist/extraction/content-cleaner.d.ts.map +1 -0
- package/dist/extraction/content-cleaner.js +393 -0
- package/dist/extraction/content-cleaner.js.map +1 -0
- package/dist/extraction/extractor.d.ts +139 -0
- package/dist/extraction/extractor.d.ts.map +1 -0
- package/dist/extraction/extractor.js +212 -0
- package/dist/extraction/extractor.js.map +1 -0
- package/dist/extraction/html-processor.d.ts +75 -0
- package/dist/extraction/html-processor.d.ts.map +1 -0
- package/dist/extraction/html-processor.js +192 -0
- package/dist/extraction/html-processor.js.map +1 -0
- package/dist/extraction/index.d.ts +14 -0
- package/dist/extraction/index.d.ts.map +1 -0
- package/dist/extraction/index.js +19 -0
- package/dist/extraction/index.js.map +1 -0
- package/dist/extraction/list-extractor.d.ts +24 -0
- package/dist/extraction/list-extractor.d.ts.map +1 -0
- package/dist/extraction/list-extractor.js +303 -0
- package/dist/extraction/list-extractor.js.map +1 -0
- package/dist/extraction/meta-extractor.d.ts +40 -0
- package/dist/extraction/meta-extractor.d.ts.map +1 -0
- package/dist/extraction/meta-extractor.js +216 -0
- package/dist/extraction/meta-extractor.js.map +1 -0
- package/dist/extraction/pagination.d.ts +29 -0
- package/dist/extraction/pagination.d.ts.map +1 -0
- package/dist/extraction/pagination.js +323 -0
- package/dist/extraction/pagination.js.map +1 -0
- package/dist/extraction/pattern-detector.d.ts +16 -0
- package/dist/extraction/pattern-detector.d.ts.map +1 -0
- package/dist/extraction/pattern-detector.js +390 -0
- package/dist/extraction/pattern-detector.js.map +1 -0
- package/dist/extraction/scrape-session.d.ts +23 -0
- package/dist/extraction/scrape-session.d.ts.map +1 -0
- package/dist/extraction/scrape-session.js +192 -0
- package/dist/extraction/scrape-session.js.map +1 -0
- package/dist/extraction/selector-engine.d.ts +23 -0
- package/dist/extraction/selector-engine.d.ts.map +1 -0
- package/dist/extraction/selector-engine.js +127 -0
- package/dist/extraction/selector-engine.js.map +1 -0
- package/dist/extraction/table-extractor.d.ts +29 -0
- package/dist/extraction/table-extractor.d.ts.map +1 -0
- package/dist/extraction/table-extractor.js +282 -0
- package/dist/extraction/table-extractor.js.map +1 -0
- package/dist/extraction/transforms.d.ts +47 -0
- package/dist/extraction/transforms.d.ts.map +1 -0
- package/dist/extraction/transforms.js +277 -0
- package/dist/extraction/transforms.js.map +1 -0
- package/dist/extraction/types.d.ts +199 -0
- package/dist/extraction/types.d.ts.map +1 -0
- package/dist/extraction/types.js +5 -0
- package/dist/extraction/types.js.map +1 -0
- package/dist/flow/executor.js +1 -1
- package/dist/flow/executor.js.map +1 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/playwright/browser-type.d.ts +101 -0
- package/dist/playwright/browser-type.d.ts.map +1 -0
- package/dist/playwright/browser-type.js +134 -0
- package/dist/playwright/browser-type.js.map +1 -0
- package/dist/playwright/browser.d.ts +98 -0
- package/dist/playwright/browser.d.ts.map +1 -0
- package/dist/playwright/browser.js +229 -0
- package/dist/playwright/browser.js.map +1 -0
- package/dist/playwright/context.d.ts +211 -0
- package/dist/playwright/context.d.ts.map +1 -0
- package/dist/playwright/context.js +466 -0
- package/dist/playwright/context.js.map +1 -0
- package/dist/playwright/extractor.d.ts +108 -0
- package/dist/playwright/extractor.d.ts.map +1 -0
- package/dist/playwright/extractor.js +404 -0
- package/dist/playwright/extractor.js.map +1 -0
- package/dist/playwright/frame.d.ts +147 -0
- package/dist/playwright/frame.d.ts.map +1 -0
- package/dist/playwright/frame.js +492 -0
- package/dist/playwright/frame.js.map +1 -0
- package/dist/playwright/index.d.ts +163 -0
- package/dist/playwright/index.d.ts.map +1 -0
- package/dist/playwright/index.js +313 -0
- package/dist/playwright/index.js.map +1 -0
- package/dist/playwright/keyboard.d.ts +74 -0
- package/dist/playwright/keyboard.d.ts.map +1 -0
- package/dist/playwright/keyboard.js +187 -0
- package/dist/playwright/keyboard.js.map +1 -0
- package/dist/playwright/locator.d.ts +237 -0
- package/dist/playwright/locator.d.ts.map +1 -0
- package/dist/playwright/locator.js +646 -0
- package/dist/playwright/locator.js.map +1 -0
- package/dist/playwright/mouse.d.ts +82 -0
- package/dist/playwright/mouse.d.ts.map +1 -0
- package/dist/playwright/mouse.js +137 -0
- package/dist/playwright/mouse.js.map +1 -0
- package/dist/playwright/page-helpers.d.ts +261 -0
- package/dist/playwright/page-helpers.d.ts.map +1 -0
- package/dist/playwright/page-helpers.js +423 -0
- package/dist/playwright/page-helpers.js.map +1 -0
- package/dist/playwright/page.d.ts +566 -0
- package/dist/playwright/page.d.ts.map +1 -0
- package/dist/playwright/page.js +1476 -0
- package/dist/playwright/page.js.map +1 -0
- package/dist/playwright/response.d.ts +100 -0
- package/dist/playwright/response.d.ts.map +1 -0
- package/dist/playwright/response.js +194 -0
- package/dist/playwright/response.js.map +1 -0
- package/dist/playwright/types.d.ts +354 -0
- package/dist/playwright/types.d.ts.map +1 -0
- package/dist/playwright/types.js +8 -0
- package/dist/playwright/types.js.map +1 -0
- package/openapi.json +343 -36
- package/package.json +10 -1
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Main Extractor class — the primary entry point for the extraction module.
|
|
3
|
+
*
|
|
4
|
+
* Constructs and coordinates all extraction strategies:
|
|
5
|
+
* select, table, meta, detect, lists, scrape, clean, and more.
|
|
6
|
+
*
|
|
7
|
+
* No AI dependencies — uses only deterministic browser tools and
|
|
8
|
+
* client-side HTML parsing (cheerio).
|
|
9
|
+
*/
|
|
10
|
+
import type { OwlBrowser } from '../client.js';
|
|
11
|
+
import type { ExtractedRecord, FieldSpec, ExtractionResult, MetaData, DetectedPattern, DetectOptions, TableOptions, CleanOptions, CleanResult, ScrapeOptions, ListOptions } from './types.js';
|
|
12
|
+
import { detectTables } from './table-extractor.js';
|
|
13
|
+
/**
|
|
14
|
+
* Universal data extraction from any website.
|
|
15
|
+
*
|
|
16
|
+
* @example
|
|
17
|
+
* ```typescript
|
|
18
|
+
* import { OwlBrowser } from '@olib-ai/owl-browser-sdk';
|
|
19
|
+
* import { Extractor } from '@olib-ai/owl-browser-sdk/extraction';
|
|
20
|
+
*
|
|
21
|
+
* const browser = new OwlBrowser({ url: '...', token: '...' });
|
|
22
|
+
* await browser.connect();
|
|
23
|
+
* const ctx = await browser.createContext();
|
|
24
|
+
*
|
|
25
|
+
* const ex = new Extractor(browser, ctx.context_id);
|
|
26
|
+
* await ex.goto('https://example.com/products');
|
|
27
|
+
*
|
|
28
|
+
* // Manual extraction with CSS selectors
|
|
29
|
+
* const products = await ex.select('.product-card', {
|
|
30
|
+
* name: 'h3',
|
|
31
|
+
* price: '.price',
|
|
32
|
+
* image: 'img@src',
|
|
33
|
+
* link: 'a@href',
|
|
34
|
+
* });
|
|
35
|
+
*
|
|
36
|
+
* // Auto-detect repeating patterns
|
|
37
|
+
* const patterns = await ex.detect();
|
|
38
|
+
*
|
|
39
|
+
* // Multi-page scraping
|
|
40
|
+
* const result = await ex.scrape('.product-card', {
|
|
41
|
+
* fields: { name: 'h3', price: '.price' },
|
|
42
|
+
* maxPages: 5,
|
|
43
|
+
* });
|
|
44
|
+
* ```
|
|
45
|
+
*/
|
|
46
|
+
export declare class Extractor {
|
|
47
|
+
private readonly _proc;
|
|
48
|
+
private _scrapeSession;
|
|
49
|
+
constructor(client: OwlBrowser, contextId: string);
|
|
50
|
+
/**
|
|
51
|
+
* Navigate to a URL and optionally wait for network idle (controlled by `options.waitForIdle`).
|
|
52
|
+
*/
|
|
53
|
+
goto(url: string, options?: {
|
|
54
|
+
waitForIdle?: boolean;
|
|
55
|
+
}): Promise<void>;
|
|
56
|
+
/**
|
|
57
|
+
* Extract structured data from all elements matching a selector.
|
|
58
|
+
*/
|
|
59
|
+
select(containerSelector: string, fields: Record<string, FieldSpec>): Promise<ExtractedRecord[]>;
|
|
60
|
+
/**
|
|
61
|
+
* Extract structured data from the first matching element.
|
|
62
|
+
*/
|
|
63
|
+
selectFirst(containerSelector: string, fields: Record<string, FieldSpec>): Promise<ExtractedRecord | null>;
|
|
64
|
+
/**
|
|
65
|
+
* Count elements matching a selector.
|
|
66
|
+
*/
|
|
67
|
+
count(selector: string): Promise<number>;
|
|
68
|
+
/**
|
|
69
|
+
* Extract a <table> as structured records.
|
|
70
|
+
*/
|
|
71
|
+
table(selector?: string, options?: TableOptions): Promise<ExtractedRecord[]>;
|
|
72
|
+
/**
|
|
73
|
+
* Extract a CSS grid/flexbox table.
|
|
74
|
+
*/
|
|
75
|
+
grid(containerSelector: string, itemSelector?: string): Promise<ExtractedRecord[]>;
|
|
76
|
+
/**
|
|
77
|
+
* Extract a <dl>/<dt>/<dd> definition list as key-value pairs.
|
|
78
|
+
*/
|
|
79
|
+
definitionList(selector?: string): Promise<ExtractedRecord>;
|
|
80
|
+
/**
|
|
81
|
+
* Auto-detect all table-like structures on the page.
|
|
82
|
+
*/
|
|
83
|
+
detectTables(): Promise<ReturnType<typeof detectTables>>;
|
|
84
|
+
/**
|
|
85
|
+
* Extract all structured metadata (title, OG, Twitter, JSON-LD, microdata, feeds).
|
|
86
|
+
*/
|
|
87
|
+
meta(): Promise<MetaData>;
|
|
88
|
+
/**
|
|
89
|
+
* Extract JSON-LD structured data.
|
|
90
|
+
*/
|
|
91
|
+
jsonLd(): Promise<object[]>;
|
|
92
|
+
/**
|
|
93
|
+
* Detect repeating patterns on the page.
|
|
94
|
+
*/
|
|
95
|
+
detect(options?: DetectOptions): Promise<DetectedPattern[]>;
|
|
96
|
+
/**
|
|
97
|
+
* Detect the best pattern and immediately extract all items.
|
|
98
|
+
*/
|
|
99
|
+
detectAndExtract(options?: DetectOptions): Promise<ExtractedRecord[]>;
|
|
100
|
+
/**
|
|
101
|
+
* Extract items from a list/card container with auto-field detection.
|
|
102
|
+
*/
|
|
103
|
+
lists(containerSelector: string, options?: ListOptions): Promise<ExtractedRecord[]>;
|
|
104
|
+
/**
|
|
105
|
+
* Run a multi-page scrape with pagination, deduplication, and detail following.
|
|
106
|
+
*/
|
|
107
|
+
scrape(containerSelector: string, options?: ScrapeOptions): Promise<ExtractionResult>;
|
|
108
|
+
/**
|
|
109
|
+
* Abort a running scrape session.
|
|
110
|
+
*/
|
|
111
|
+
abortScrape(): void;
|
|
112
|
+
/**
|
|
113
|
+
* Remove obstructions (cookie banners, modals, fixed elements, ads).
|
|
114
|
+
*/
|
|
115
|
+
clean(options?: CleanOptions): Promise<CleanResult>;
|
|
116
|
+
/**
|
|
117
|
+
* Get page HTML.
|
|
118
|
+
*/
|
|
119
|
+
html(options?: {
|
|
120
|
+
cleanLevel?: 'minimal' | 'basic' | 'aggressive';
|
|
121
|
+
}): Promise<string>;
|
|
122
|
+
/**
|
|
123
|
+
* Get page content as markdown.
|
|
124
|
+
*/
|
|
125
|
+
markdown(): Promise<string>;
|
|
126
|
+
/**
|
|
127
|
+
* Get text content, optionally filtered by selector and regex.
|
|
128
|
+
*/
|
|
129
|
+
text(selector?: string, regex?: string): Promise<string>;
|
|
130
|
+
/**
|
|
131
|
+
* Detect the site type (e.g., 'google', 'amazon', 'wikipedia').
|
|
132
|
+
*/
|
|
133
|
+
detectSite(): Promise<string>;
|
|
134
|
+
/**
|
|
135
|
+
* Extract structured data using built-in site templates.
|
|
136
|
+
*/
|
|
137
|
+
siteData(template?: string): Promise<unknown>;
|
|
138
|
+
}
|
|
139
|
+
//# sourceMappingURL=extractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extractor.d.ts","sourceRoot":"","sources":["../../src/extraction/extractor.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAC/C,OAAO,KAAK,EACV,eAAe,EACf,SAAS,EACT,gBAAgB,EAChB,QAAQ,EACR,eAAe,EACf,aAAa,EACb,YAAY,EACZ,YAAY,EACZ,WAAW,EACX,aAAa,EACb,WAAW,EACZ,MAAM,YAAY,CAAC;AAGpB,OAAO,EAIL,YAAY,EACb,MAAM,sBAAsB,CAAC;AAO9B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AACH,qBAAa,SAAS;IACpB,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAgB;IACtC,OAAO,CAAC,cAAc,CAA8B;gBAExC,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM;IAMjD;;OAEG;IACG,IAAI,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE;QAAE,WAAW,CAAC,EAAE,OAAO,CAAA;KAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAM3E;;OAEG;IACG,MAAM,CACV,iBAAiB,EAAE,MAAM,EACzB,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,GAChC,OAAO,CAAC,eAAe,EAAE,CAAC;IAK7B;;OAEG;IACG,WAAW,CACf,iBAAiB,EAAE,MAAM,EACzB,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,GAChC,OAAO,CAAC,eAAe,GAAG,IAAI,CAAC;IAKlC;;OAEG;IACG,KAAK,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAO9C;;OAEG;IACG,KAAK,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;IAKlF;;OAEG;IACG,IAAI,CACR,iBAAiB,EAAE,MAAM,EACzB,YAAY,CAAC,EAAE,MAAM,GACpB,OAAO,CAAC,eAAe,EAAE,CAAC;IAK7B;;OAEG;IACG,cAAc,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC;IAKjE;;OAEG;IACG,YAAY,IAAI,OAAO,CAAC,UAAU,CAAC,OAAO,YAAY,CAAC,CAAC;IAO9D;;OAEG;IACG,IAAI,IAAI,OAAO,CAAC,QAAQ,CAAC;IAK/B;;OAEG;IACG,MAAM,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;IAOjC;;OAEG;IACG,MAAM,CAAC,OAAO,CAAC,EAAE,aAAa,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;IAKjE;;OAEG;IACG,gBAAgB,CAAC,OAAO,CAAC,EAAE,aAAa,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;IAO3E;;OAEG;IACG,KAAK,CACT,iBAAiB,EAAE,MAAM,EACzB,OAAO,CAAC,EAAE,WAAW,GACpB,OAAO,CAAC,eAAe,EAAE,CAAC;IAO7B;;OAEG;IACG,MAAM,CACV,iBAAiB,EAAE,MAAM,EACzB,OAAO,CAAC,EAAE,aAAa,GACtB,OAAO,CAAC,gBAAgB,CAAC;IAS5B;;OAEG;IACH,WAAW,IAAI,IAAI;IAMnB;;OAEG;IACG,KAAK,CAAC,OAAO,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,WAAW,CAAC;IAMzD;;OAEG;IACG,IAAI,CAAC,OAAO,CAAC,EAAE;QAAE,UAAU,CAAC,EAAE,SAAS,GAAG,OAAO,GAAG,YAAY,CAAA;KAAE,GAAG,OAAO,CAAC,MAAM,CAAC
;IAI1F;;OAEG;IACG,QAAQ,IAAI,OAAO,CAAC,MAAM,CAAC;IAIjC;;OAEG;IACG,IAAI,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAM9D;;OAEG;IACG,UAAU,IAAI,OAAO,CAAC,MAAM,CAAC;IAInC;;OAEG;IACG,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;CAGpD"}
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Main Extractor class — the primary entry point for the extraction module.
|
|
3
|
+
*
|
|
4
|
+
* Constructs and coordinates all extraction strategies:
|
|
5
|
+
* select, table, meta, detect, lists, scrape, clean, and more.
|
|
6
|
+
*
|
|
7
|
+
* No AI dependencies — uses only deterministic browser tools and
|
|
8
|
+
* client-side HTML parsing (cheerio).
|
|
9
|
+
*/
|
|
10
|
+
import { HTMLProcessor } from './html-processor.js';
|
|
11
|
+
import { extractAll, extractFirst, count } from './selector-engine.js';
|
|
12
|
+
import { extractTable, extractGrid, extractDefinitionList, detectTables, } from './table-extractor.js';
|
|
13
|
+
import { extractMeta, extractJsonLd } from './meta-extractor.js';
|
|
14
|
+
import { removeObstructions } from './content-cleaner.js';
|
|
15
|
+
import { extract as extractList } from './list-extractor.js';
|
|
16
|
+
import { detect as detectPatterns, detectAndExtract } from './pattern-detector.js';
|
|
17
|
+
import { ScrapeSession } from './scrape-session.js';
|
|
18
|
+
/**
|
|
19
|
+
* Universal data extraction from any website.
|
|
20
|
+
*
|
|
21
|
+
* @example
|
|
22
|
+
* ```typescript
|
|
23
|
+
* import { OwlBrowser } from '@olib-ai/owl-browser-sdk';
|
|
24
|
+
* import { Extractor } from '@olib-ai/owl-browser-sdk/extraction';
|
|
25
|
+
*
|
|
26
|
+
* const browser = new OwlBrowser({ url: '...', token: '...' });
|
|
27
|
+
* await browser.connect();
|
|
28
|
+
* const ctx = await browser.createContext();
|
|
29
|
+
*
|
|
30
|
+
* const ex = new Extractor(browser, ctx.context_id);
|
|
31
|
+
* await ex.goto('https://example.com/products');
|
|
32
|
+
*
|
|
33
|
+
* // Manual extraction with CSS selectors
|
|
34
|
+
* const products = await ex.select('.product-card', {
|
|
35
|
+
* name: 'h3',
|
|
36
|
+
* price: '.price',
|
|
37
|
+
* image: 'img@src',
|
|
38
|
+
* link: 'a@href',
|
|
39
|
+
* });
|
|
40
|
+
*
|
|
41
|
+
* // Auto-detect repeating patterns
|
|
42
|
+
* const patterns = await ex.detect();
|
|
43
|
+
*
|
|
44
|
+
* // Multi-page scraping
|
|
45
|
+
* const result = await ex.scrape('.product-card', {
|
|
46
|
+
* fields: { name: 'h3', price: '.price' },
|
|
47
|
+
* maxPages: 5,
|
|
48
|
+
* });
|
|
49
|
+
* ```
|
|
50
|
+
*/
|
|
51
|
+
export class Extractor {
|
|
52
|
+
_proc;
|
|
53
|
+
_scrapeSession = null;
|
|
54
|
+
constructor(client, contextId) {
|
|
55
|
+
this._proc = new HTMLProcessor(client, contextId);
|
|
56
|
+
}
|
|
57
|
+
// ==================== Navigation ====================
|
|
58
|
+
/**
|
|
59
|
+
* Navigate to a URL and optionally wait for network idle (`options.waitForIdle` is forwarded to the HTML processor).
|
|
60
|
+
*/
|
|
61
|
+
async goto(url, options) {
|
|
62
|
+
await this._proc.goto(url, options?.waitForIdle);
|
|
63
|
+
}
|
|
64
|
+
// ==================== Manual Extraction ====================
|
|
65
|
+
/**
|
|
66
|
+
* Extract structured data from all elements matching a selector.
|
|
67
|
+
*/
|
|
68
|
+
async select(containerSelector, fields) {
|
|
69
|
+
const html = await this._proc.getHtml('basic');
|
|
70
|
+
return extractAll(html, containerSelector, fields);
|
|
71
|
+
}
|
|
72
|
+
/**
|
|
73
|
+
* Extract structured data from the first matching element.
|
|
74
|
+
*/
|
|
75
|
+
async selectFirst(containerSelector, fields) {
|
|
76
|
+
const html = await this._proc.getHtml('basic');
|
|
77
|
+
return extractFirst(html, containerSelector, fields);
|
|
78
|
+
}
|
|
79
|
+
/**
|
|
80
|
+
* Count elements matching a selector.
|
|
81
|
+
*/
|
|
82
|
+
async count(selector) {
|
|
83
|
+
const html = await this._proc.getHtml('basic');
|
|
84
|
+
return count(html, selector);
|
|
85
|
+
}
|
|
86
|
+
// ==================== Tables ====================
|
|
87
|
+
/**
|
|
88
|
+
* Extract a <table> as structured records.
|
|
89
|
+
*/
|
|
90
|
+
async table(selector, options) {
|
|
91
|
+
const html = await this._proc.getHtml('basic');
|
|
92
|
+
return extractTable(html, selector, options);
|
|
93
|
+
}
|
|
94
|
+
/**
|
|
95
|
+
* Extract a CSS grid/flexbox table.
|
|
96
|
+
*/
|
|
97
|
+
async grid(containerSelector, itemSelector) {
|
|
98
|
+
const html = await this._proc.getHtml('basic');
|
|
99
|
+
return extractGrid(html, containerSelector, itemSelector);
|
|
100
|
+
}
|
|
101
|
+
/**
|
|
102
|
+
* Extract a <dl>/<dt>/<dd> definition list as key-value pairs.
|
|
103
|
+
*/
|
|
104
|
+
async definitionList(selector) {
|
|
105
|
+
const html = await this._proc.getHtml('basic');
|
|
106
|
+
return extractDefinitionList(html, selector);
|
|
107
|
+
}
|
|
108
|
+
/**
|
|
109
|
+
* Auto-detect all table-like structures on the page.
|
|
110
|
+
*/
|
|
111
|
+
async detectTables() {
|
|
112
|
+
const html = await this._proc.getHtml('basic');
|
|
113
|
+
return detectTables(html);
|
|
114
|
+
}
|
|
115
|
+
// ==================== Metadata ====================
|
|
116
|
+
/**
|
|
117
|
+
* Extract all structured metadata (title, OG, Twitter, JSON-LD, microdata, feeds).
|
|
118
|
+
*/
|
|
119
|
+
async meta() {
|
|
120
|
+
const html = await this._proc.getHtml('minimal');
|
|
121
|
+
return extractMeta(html);
|
|
122
|
+
}
|
|
123
|
+
/**
|
|
124
|
+
* Extract JSON-LD structured data.
|
|
125
|
+
*/
|
|
126
|
+
async jsonLd() {
|
|
127
|
+
const html = await this._proc.getHtml('minimal');
|
|
128
|
+
return extractJsonLd(html);
|
|
129
|
+
}
|
|
130
|
+
// ==================== Auto-Detection ====================
|
|
131
|
+
/**
|
|
132
|
+
* Detect repeating patterns on the page.
|
|
133
|
+
*/
|
|
134
|
+
async detect(options) {
|
|
135
|
+
const html = await this._proc.getHtml('aggressive');
|
|
136
|
+
return detectPatterns(html, options);
|
|
137
|
+
}
|
|
138
|
+
/**
|
|
139
|
+
* Detect the best pattern and immediately extract all items.
|
|
140
|
+
*/
|
|
141
|
+
async detectAndExtract(options) {
|
|
142
|
+
const html = await this._proc.getHtml('aggressive');
|
|
143
|
+
return detectAndExtract(html, options);
|
|
144
|
+
}
|
|
145
|
+
// ==================== Lists ====================
|
|
146
|
+
/**
|
|
147
|
+
* Extract items from a list/card container with auto-field detection.
|
|
148
|
+
*/
|
|
149
|
+
async lists(containerSelector, options) {
|
|
150
|
+
const html = await this._proc.getHtml('basic');
|
|
151
|
+
return extractList(html, containerSelector, options);
|
|
152
|
+
}
|
|
153
|
+
// ==================== Multi-Page ====================
|
|
154
|
+
/**
|
|
155
|
+
* Run a multi-page scrape with pagination, deduplication, and detail following.
|
|
156
|
+
*/
|
|
157
|
+
async scrape(containerSelector, options) {
|
|
158
|
+
this._scrapeSession = new ScrapeSession(this._proc);
|
|
159
|
+
try {
|
|
160
|
+
return await this._scrapeSession.scrape(containerSelector, options);
|
|
161
|
+
}
|
|
162
|
+
finally {
|
|
163
|
+
this._scrapeSession = null;
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
/**
|
|
167
|
+
* Abort a running scrape session.
|
|
168
|
+
*/
|
|
169
|
+
abortScrape() {
|
|
170
|
+
this._scrapeSession?.abort();
|
|
171
|
+
}
|
|
172
|
+
// ==================== Content Cleaning ====================
|
|
173
|
+
/**
|
|
174
|
+
* Remove obstructions (cookie banners, modals, fixed elements, ads).
|
|
175
|
+
*/
|
|
176
|
+
async clean(options) {
|
|
177
|
+
return removeObstructions(this._proc, options);
|
|
178
|
+
}
|
|
179
|
+
// ==================== Raw Content ====================
|
|
180
|
+
/**
|
|
181
|
+
* Get page HTML.
|
|
182
|
+
*/
|
|
183
|
+
async html(options) {
|
|
184
|
+
return this._proc.getHtml(options?.cleanLevel);
|
|
185
|
+
}
|
|
186
|
+
/**
|
|
187
|
+
* Get page content as markdown.
|
|
188
|
+
*/
|
|
189
|
+
async markdown() {
|
|
190
|
+
return this._proc.getMarkdown();
|
|
191
|
+
}
|
|
192
|
+
/**
|
|
193
|
+
* Get text content, optionally filtered by selector and regex.
|
|
194
|
+
*/
|
|
195
|
+
async text(selector, regex) {
|
|
196
|
+
return this._proc.getText(selector, regex);
|
|
197
|
+
}
|
|
198
|
+
// ==================== Site Templates ====================
|
|
199
|
+
/**
|
|
200
|
+
* Detect the site type (e.g., 'google', 'amazon', 'wikipedia').
|
|
201
|
+
*/
|
|
202
|
+
async detectSite() {
|
|
203
|
+
return this._proc.detectSite();
|
|
204
|
+
}
|
|
205
|
+
/**
|
|
206
|
+
* Extract structured data using built-in site templates.
|
|
207
|
+
*/
|
|
208
|
+
async siteData(template) {
|
|
209
|
+
return this._proc.extractJson(template);
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
//# sourceMappingURL=extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extractor.js","sourceRoot":"","sources":["../../src/extraction/extractor.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAgBH,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,KAAK,EAAE,MAAM,sBAAsB,CAAC;AACvE,OAAO,EACL,YAAY,EACZ,WAAW,EACX,qBAAqB,EACrB,YAAY,GACb,MAAM,sBAAsB,CAAC;AAC9B,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACjE,OAAO,EAAE,kBAAkB,EAAE,MAAM,sBAAsB,CAAC;AAC1D,OAAO,EAAE,OAAO,IAAI,WAAW,EAAoB,MAAM,qBAAqB,CAAC;AAC/E,OAAO,EAAE,MAAM,IAAI,cAAc,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AACnF,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAEpD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AACH,MAAM,OAAO,SAAS;IACH,KAAK,CAAgB;IAC9B,cAAc,GAAyB,IAAI,CAAC;IAEpD,YAAY,MAAkB,EAAE,SAAiB;QAC/C,IAAI,CAAC,KAAK,GAAG,IAAI,aAAa,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IACpD,CAAC;IAED,uDAAuD;IAEvD;;OAEG;IACH,KAAK,CAAC,IAAI,CAAC,GAAW,EAAE,OAAmC;QACzD,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,EAAE,OAAO,EAAE,WAAW,CAAC,CAAC;IACnD,CAAC;IAED,8DAA8D;IAE9D;;OAEG;IACH,KAAK,CAAC,MAAM,CACV,iBAAyB,EACzB,MAAiC;QAEjC,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAC/C,OAAO,UAAU,CAAC,IAAI,EAAE,iBAAiB,EAAE,MAAM,CAAC,CAAC;IACrD,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,WAAW,CACf,iBAAyB,EACzB,MAAiC;QAEjC,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAC/C,OAAO,YAAY,CAAC,IAAI,EAAE,iBAAiB,EAAE,MAAM,CAAC,CAAC;IACvD,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,KAAK,CAAC,QAAgB;QAC1B,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAC/C,OAAO,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;IAC/B,CAAC;IAED,mDAAmD;IAEnD;;OAEG;IACH,KAAK,CAAC,KAAK,CAAC,QAAiB,EAAE,OAAsB;QACnD,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAC/C,OAAO,YAAY,CAAC,IAAI,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;IAC/C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI,CACR,iBAAyB,EACzB,YAAqB;QAErB,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAC/C,OAAO,WAAW,CAAC,IAAI,EAAE,iBAAiB,EAAE,YAAY,CAAC,CAAC;IAC5D,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,cAAc,CAAC,QAAiB;QACpC,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,KAAK,C
AAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAC/C,OAAO,qBAAqB,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;IAC/C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,YAAY;QAChB,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAC/C,OAAO,YAAY,CAAC,IAAI,CAAC,CAAC;IAC5B,CAAC;IAED,qDAAqD;IAErD;;OAEG;IACH,KAAK,CAAC,IAAI;QACR,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;QACjD,OAAO,WAAW,CAAC,IAAI,CAAC,CAAC;IAC3B,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,MAAM;QACV,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;QACjD,OAAO,aAAa,CAAC,IAAI,CAAC,CAAC;IAC7B,CAAC;IAED,2DAA2D;IAE3D;;OAEG;IACH,KAAK,CAAC,MAAM,CAAC,OAAuB;QAClC,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;QACpD,OAAO,cAAc,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACvC,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,gBAAgB,CAAC,OAAuB;QAC5C,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;QACpD,OAAO,gBAAgB,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACzC,CAAC;IAED,kDAAkD;IAElD;;OAEG;IACH,KAAK,CAAC,KAAK,CACT,iBAAyB,EACzB,OAAqB;QAErB,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAC/C,OAAO,WAAW,CAAC,IAAI,EAAE,iBAAiB,EAAE,OAAO,CAAC,CAAC;IACvD,CAAC;IAED,uDAAuD;IAEvD;;OAEG;IACH,KAAK,CAAC,MAAM,CACV,iBAAyB,EACzB,OAAuB;QAEvB,IAAI,CAAC,cAAc,GAAG,IAAI,aAAa,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACpD,IAAI,CAAC;YACH,OAAO,MAAM,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC,iBAAiB,EAAE,OAAO,CAAC,CAAC;QACtE,CAAC;gBAAS,CAAC;YACT,IAAI,CAAC,cAAc,GAAG,IAAI,CAAC;QAC7B,CAAC;IACH,CAAC;IAED;;OAEG;IACH,WAAW;QACT,IAAI,CAAC,cAAc,EAAE,KAAK,EAAE,CAAC;IAC/B,CAAC;IAED,6DAA6D;IAE7D;;OAEG;IACH,KAAK,CAAC,KAAK,CAAC,OAAsB;QAChC,OAAO,kBAAkB,CAAC,IAAI,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IACjD,CAAC;IAED,wDAAwD;IAExD;;OAEG;IACH,KAAK,CAAC,IAAI,CAAC,OAA6D;QACtE,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,EAAE,UAAU,CAAC,CAAC;IACjD,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,QAAQ;QACZ,OAAO,IAAI,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC;IAClC,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI,CAAC,QAAiB,EAAE,KAAc;QAC1C,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;IAC7C,CAAC;IAED,2DAA2D;IAE3D;;OAEG;IACH,KAAK,CAAC,UAAU;Q
ACd,OAAO,IAAI,CAAC,KAAK,CAAC,UAAU,EAAE,CAAC;IACjC,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,QAAQ,CAAC,QAAiB;QAC9B,OAAO,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC;IAC1C,CAAC;CACF"}
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Browser communication layer for extraction.
|
|
3
|
+
*
|
|
4
|
+
* Wraps browser tools (get_html, get_markdown, evaluate, navigate, etc.)
|
|
5
|
+
* into a clean async interface used by all extraction modules.
|
|
6
|
+
*/
|
|
7
|
+
import type { OwlBrowser } from '../client.js';
|
|
8
|
+
import type { PageInfo } from './types.js';
|
|
9
|
+
export declare class HTMLProcessor {
|
|
10
|
+
private readonly _client;
|
|
11
|
+
private readonly _contextId;
|
|
12
|
+
constructor(client: OwlBrowser, contextId: string);
|
|
13
|
+
/**
|
|
14
|
+
* Get page HTML from browser.
|
|
15
|
+
* @param cleanLevel - Cleaning level: 'minimal' | 'basic' | 'aggressive'. When omitted, no level is sent and the browser tool's own default applies (documented as 'basic').
|
|
16
|
+
*/
|
|
17
|
+
getHtml(cleanLevel?: 'minimal' | 'basic' | 'aggressive'): Promise<string>;
|
|
18
|
+
/**
|
|
19
|
+
* Get page content as markdown.
|
|
20
|
+
*/
|
|
21
|
+
getMarkdown(): Promise<string>;
|
|
22
|
+
/**
|
|
23
|
+
* Extract text from page, optionally filtered by selector and regex.
|
|
24
|
+
*/
|
|
25
|
+
getText(selector?: string, regex?: string): Promise<string>;
|
|
26
|
+
/**
|
|
27
|
+
* Get current page URL and title.
|
|
28
|
+
*/
|
|
29
|
+
getPageInfo(): Promise<PageInfo>;
|
|
30
|
+
/**
|
|
31
|
+
* Get current page URL.
|
|
32
|
+
*/
|
|
33
|
+
getUrl(): Promise<string>;
|
|
34
|
+
/**
|
|
35
|
+
* Execute JavaScript in the page and return the result.
|
|
36
|
+
*/
|
|
37
|
+
evaluate(expression: string): Promise<unknown>;
|
|
38
|
+
/**
|
|
39
|
+
* Navigate to a URL and optionally wait for network idle.
|
|
40
|
+
*/
|
|
41
|
+
goto(url: string, waitForIdle?: boolean): Promise<void>;
|
|
42
|
+
/**
|
|
43
|
+
* Wait for a CSS selector to appear in the DOM.
|
|
44
|
+
*/
|
|
45
|
+
waitForSelector(selector: string, timeout?: number): Promise<void>;
|
|
46
|
+
/**
|
|
47
|
+
* Wait for network to become idle.
|
|
48
|
+
*/
|
|
49
|
+
waitForNetworkIdle(timeout?: number): Promise<void>;
|
|
50
|
+
/**
|
|
51
|
+
* Wait for a specified number of milliseconds.
|
|
52
|
+
*/
|
|
53
|
+
wait(ms: number): Promise<void>;
|
|
54
|
+
/**
|
|
55
|
+
* Click an element by CSS selector.
|
|
56
|
+
*/
|
|
57
|
+
click(selector: string): Promise<void>;
|
|
58
|
+
/**
|
|
59
|
+
* Scroll by the given x and y amounts, in pixels.
|
|
60
|
+
*/
|
|
61
|
+
scrollBy(x: number, y: number): Promise<void>;
|
|
62
|
+
/**
|
|
63
|
+
* Scroll to the bottom of the page.
|
|
64
|
+
*/
|
|
65
|
+
scrollToBottom(): Promise<void>;
|
|
66
|
+
/**
|
|
67
|
+
* Detect the site type (e.g., 'google', 'amazon', 'wikipedia').
|
|
68
|
+
*/
|
|
69
|
+
detectSite(): Promise<string>;
|
|
70
|
+
/**
|
|
71
|
+
* Extract structured JSON using built-in site templates.
|
|
72
|
+
*/
|
|
73
|
+
extractJson(template?: string): Promise<unknown>;
|
|
74
|
+
}
|
|
75
|
+
//# sourceMappingURL=html-processor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"html-processor.d.ts","sourceRoot":"","sources":["../../src/extraction/html-processor.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAC/C,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAE3C,qBAAa,aAAa;IACxB,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAa;IACrC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAS;gBAExB,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM;IAKjD;;;OAGG;IACG,OAAO,CAAC,UAAU,CAAC,EAAE,SAAS,GAAG,OAAO,GAAG,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC;IAO/E;;OAEG;IACG,WAAW,IAAI,OAAO,CAAC,MAAM,CAAC;IAOpC;;OAEG;IACG,OAAO,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQjE;;OAEG;IACG,WAAW,IAAI,OAAO,CAAC,QAAQ,CAAC;IActC;;OAEG;IACG,MAAM,IAAI,OAAO,CAAC,MAAM,CAAC;IAK/B;;OAEG;IACG,QAAQ,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;IAYpD;;OAEG;IACG,IAAI,CAAC,GAAG,EAAE,MAAM,EAAE,WAAW,CAAC,EAAE,OAAO,GAAG,OAAO,CAAC,IAAI,CAAC;IAU7D;;OAEG;IACG,eAAe,CAAC,QAAQ,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IASxE;;OAEG;IACG,kBAAkB,CAAC,OAAO,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAMzD;;OAEG;IACG,IAAI,CAAC,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAOrC;;OAEG;IACG,KAAK,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAO5C;;OAEG;IACG,QAAQ,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAQnD;;OAEG;IACG,cAAc,IAAI,OAAO,CAAC,IAAI,CAAC;IAMrC;;OAEG;IACG,UAAU,IAAI,OAAO,CAAC,MAAM,CAAC;IAYnC;;OAEG;IACG,WAAW,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;CAKvD"}
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Browser communication layer for extraction.
|
|
3
|
+
*
|
|
4
|
+
* Wraps browser tools (get_html, get_markdown, evaluate, navigate, etc.)
|
|
5
|
+
* into a clean async interface used by all extraction modules.
|
|
6
|
+
*/
|
|
7
|
+
export class HTMLProcessor {
|
|
8
|
+
_client;
|
|
9
|
+
_contextId;
|
|
10
|
+
constructor(client, contextId) {
|
|
11
|
+
this._client = client;
|
|
12
|
+
this._contextId = contextId;
|
|
13
|
+
}
|
|
14
|
+
/**
|
|
15
|
+
* Get page HTML from browser.
|
|
16
|
+
* @param cleanLevel - Cleaning level: 'minimal' | 'basic' | 'aggressive'. When omitted, `clean_level` is not sent and the browser tool's own default applies (documented as 'basic').
|
|
17
|
+
*/
|
|
18
|
+
async getHtml(cleanLevel) {
|
|
19
|
+
const params = { context_id: this._contextId };
|
|
20
|
+
if (cleanLevel)
|
|
21
|
+
params['clean_level'] = cleanLevel;
|
|
22
|
+
const result = await this._client.execute('browser_get_html', params);
|
|
23
|
+
return unwrapString(result);
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Get page content as markdown.
|
|
27
|
+
*/
|
|
28
|
+
async getMarkdown() {
|
|
29
|
+
const result = await this._client.execute('browser_get_markdown', {
|
|
30
|
+
context_id: this._contextId,
|
|
31
|
+
});
|
|
32
|
+
return unwrapString(result);
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Extract text from page, optionally filtered by selector and regex.
|
|
36
|
+
*/
|
|
37
|
+
async getText(selector, regex) {
|
|
38
|
+
const params = { context_id: this._contextId };
|
|
39
|
+
if (selector)
|
|
40
|
+
params['selector'] = selector;
|
|
41
|
+
if (regex)
|
|
42
|
+
params['regex'] = regex;
|
|
43
|
+
const result = await this._client.execute('browser_extract_text', params);
|
|
44
|
+
return unwrapString(result);
|
|
45
|
+
}
|
|
46
|
+
/**
|
|
47
|
+
* Get current page URL and title.
|
|
48
|
+
*/
|
|
49
|
+
async getPageInfo() {
|
|
50
|
+
const result = await this._client.execute('browser_get_page_info', {
|
|
51
|
+
context_id: this._contextId,
|
|
52
|
+
});
|
|
53
|
+
if (typeof result === 'object' && result !== null) {
|
|
54
|
+
const r = result;
|
|
55
|
+
return {
|
|
56
|
+
url: String(r['url'] ?? ''),
|
|
57
|
+
title: String(r['title'] ?? ''),
|
|
58
|
+
};
|
|
59
|
+
}
|
|
60
|
+
return { url: '', title: '' };
|
|
61
|
+
}
|
|
62
|
+
/**
|
|
63
|
+
* Get current page URL.
|
|
64
|
+
*/
|
|
65
|
+
async getUrl() {
|
|
66
|
+
const info = await this.getPageInfo();
|
|
67
|
+
return info.url;
|
|
68
|
+
}
|
|
69
|
+
/**
|
|
70
|
+
* Execute JavaScript in the page and return the result.
|
|
71
|
+
*/
|
|
72
|
+
async evaluate(expression) {
|
|
73
|
+
const result = await this._client.execute('browser_evaluate', {
|
|
74
|
+
context_id: this._contextId,
|
|
75
|
+
expression,
|
|
76
|
+
});
|
|
77
|
+
if (typeof result === 'object' && result !== null) {
|
|
78
|
+
const r = result;
|
|
79
|
+
return r['result'] ?? result;
|
|
80
|
+
}
|
|
81
|
+
return result;
|
|
82
|
+
}
|
|
83
|
+
/**
|
|
84
|
+
* Navigate to a URL and optionally wait for network idle.
|
|
85
|
+
*/
|
|
86
|
+
async goto(url, waitForIdle) {
|
|
87
|
+
await this._client.execute('browser_navigate', {
|
|
88
|
+
context_id: this._contextId,
|
|
89
|
+
url,
|
|
90
|
+
});
|
|
91
|
+
if (waitForIdle !== false) {
|
|
92
|
+
await this.waitForNetworkIdle();
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
/**
|
|
96
|
+
* Wait for a CSS selector to appear in the DOM.
|
|
97
|
+
*/
|
|
98
|
+
async waitForSelector(selector, timeout) {
|
|
99
|
+
const params = {
|
|
100
|
+
context_id: this._contextId,
|
|
101
|
+
selector,
|
|
102
|
+
};
|
|
103
|
+
if (timeout !== undefined)
|
|
104
|
+
params['timeout'] = timeout;
|
|
105
|
+
await this._client.execute('browser_wait_for_selector', params);
|
|
106
|
+
}
|
|
107
|
+
/**
|
|
108
|
+
* Wait for network to become idle.
|
|
109
|
+
*/
|
|
110
|
+
async waitForNetworkIdle(timeout) {
|
|
111
|
+
const params = { context_id: this._contextId };
|
|
112
|
+
if (timeout !== undefined)
|
|
113
|
+
params['timeout'] = timeout;
|
|
114
|
+
await this._client.execute('browser_wait_for_network_idle', params);
|
|
115
|
+
}
|
|
116
|
+
/**
|
|
117
|
+
* Wait for a specified number of milliseconds.
|
|
118
|
+
*/
|
|
119
|
+
async wait(ms) {
|
|
120
|
+
await this._client.execute('browser_wait', {
|
|
121
|
+
context_id: this._contextId,
|
|
122
|
+
timeout: ms,
|
|
123
|
+
});
|
|
124
|
+
}
|
|
125
|
+
/**
|
|
126
|
+
* Click an element by CSS selector.
|
|
127
|
+
*/
|
|
128
|
+
async click(selector) {
|
|
129
|
+
await this._client.execute('browser_click', {
|
|
130
|
+
context_id: this._contextId,
|
|
131
|
+
selector,
|
|
132
|
+
});
|
|
133
|
+
}
|
|
134
|
+
/**
|
|
135
|
+
* Scroll down by a number of pixels.
|
|
136
|
+
*/
|
|
137
|
+
async scrollBy(x, y) {
|
|
138
|
+
await this._client.execute('browser_scroll_by', {
|
|
139
|
+
context_id: this._contextId,
|
|
140
|
+
x,
|
|
141
|
+
y,
|
|
142
|
+
});
|
|
143
|
+
}
|
|
144
|
+
/**
|
|
145
|
+
* Scroll to the bottom of the page.
|
|
146
|
+
*/
|
|
147
|
+
async scrollToBottom() {
|
|
148
|
+
await this._client.execute('browser_scroll_to_bottom', {
|
|
149
|
+
context_id: this._contextId,
|
|
150
|
+
});
|
|
151
|
+
}
|
|
152
|
+
/**
|
|
153
|
+
* Detect the site type (e.g., 'google', 'amazon', 'wikipedia').
|
|
154
|
+
*/
|
|
155
|
+
async detectSite() {
|
|
156
|
+
const result = await this._client.execute('browser_detect_site', {
|
|
157
|
+
context_id: this._contextId,
|
|
158
|
+
});
|
|
159
|
+
if (typeof result === 'string')
|
|
160
|
+
return result;
|
|
161
|
+
if (typeof result === 'object' && result !== null) {
|
|
162
|
+
const r = result;
|
|
163
|
+
return String(r['site_type'] ?? r['type'] ?? 'unknown');
|
|
164
|
+
}
|
|
165
|
+
return 'unknown';
|
|
166
|
+
}
|
|
167
|
+
/**
|
|
168
|
+
* Extract structured JSON using built-in site templates.
|
|
169
|
+
*/
|
|
170
|
+
async extractJson(template) {
|
|
171
|
+
const params = { context_id: this._contextId };
|
|
172
|
+
if (template)
|
|
173
|
+
params['template'] = template;
|
|
174
|
+
return await this._client.execute('browser_extract_json', params);
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
/**
 * Unwrap a tool result that may be a plain string or an object carrying its
 * payload under one of the keys html / markdown / text / content (checked in
 * that order; the first key holding a string wins).
 *
 * @param result - Raw tool result of any type.
 * @returns The string itself; the first string-valued recognised key of an
 *   object; `''` for null/undefined or for an object with no recognised
 *   string payload; otherwise `String(result)` (e.g. numbers, booleans).
 */
function unwrapString(result) {
    if (typeof result === 'string') {
        return result;
    }
    if (typeof result === 'object' && result !== null) {
        for (const key of ['html', 'markdown', 'text', 'content']) {
            if (typeof result[key] === 'string') {
                return result[key];
            }
        }
        // No usable payload: return empty content rather than letting
        // String(object) hand callers the literal "[object Object]".
        return '';
    }
    return String(result ?? '');
}
|
|
192
|
+
//# sourceMappingURL=html-processor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"html-processor.js","sourceRoot":"","sources":["../../src/extraction/html-processor.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAKH,MAAM,OAAO,aAAa;IACP,OAAO,CAAa;IACpB,UAAU,CAAS;IAEpC,YAAY,MAAkB,EAAE,SAAiB;QAC/C,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC;QACtB,IAAI,CAAC,UAAU,GAAG,SAAS,CAAC;IAC9B,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,OAAO,CAAC,UAA+C;QAC3D,MAAM,MAAM,GAA4B,EAAE,UAAU,EAAE,IAAI,CAAC,UAAU,EAAE,CAAC;QACxE,IAAI,UAAU;YAAE,MAAM,CAAC,aAAa,CAAC,GAAG,UAAU,CAAC;QACnD,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,kBAAkB,EAAE,MAAM,CAAC,CAAC;QACtE,OAAO,YAAY,CAAC,MAAM,CAAC,CAAC;IAC9B,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,WAAW;QACf,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,sBAAsB,EAAE;YAChE,UAAU,EAAE,IAAI,CAAC,UAAU;SAC5B,CAAC,CAAC;QACH,OAAO,YAAY,CAAC,MAAM,CAAC,CAAC;IAC9B,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO,CAAC,QAAiB,EAAE,KAAc;QAC7C,MAAM,MAAM,GAA4B,EAAE,UAAU,EAAE,IAAI,CAAC,UAAU,EAAE,CAAC;QACxE,IAAI,QAAQ;YAAE,MAAM,CAAC,UAAU,CAAC,GAAG,QAAQ,CAAC;QAC5C,IAAI,KAAK;YAAE,MAAM,CAAC,OAAO,CAAC,GAAG,KAAK,CAAC;QACnC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,sBAAsB,EAAE,MAAM,CAAC,CAAC;QAC1E,OAAO,YAAY,CAAC,MAAM,CAAC,CAAC;IAC9B,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,WAAW;QACf,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,uBAAuB,EAAE;YACjE,UAAU,EAAE,IAAI,CAAC,UAAU;SAC5B,CAAC,CAAC;QACH,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;YAClD,MAAM,CAAC,GAAG,MAAiC,CAAC;YAC5C,OAAO;gBACL,GAAG,EAAE,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;gBAC3B,KAAK,EAAE,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;aAChC,CAAC;QACJ,CAAC;QACD,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC;IAChC,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,MAAM;QACV,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,WAAW,EAAE,CAAC;QACtC,OAAO,IAAI,CAAC,GAAG,CAAC;IAClB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,QAAQ,CAAC,UAAkB;QAC/B,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,kBAAkB,EAAE;YAC5D,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,UAAU;SACX,CAAC,CAAC;QACH,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;YAClD,MAAM,CAAC,GAAG,MAAiC,CAAC;YAC5C,OAA
O,CAAC,CAAC,QAAQ,CAAC,IAAI,MAAM,CAAC;QAC/B,CAAC;QACD,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI,CAAC,GAAW,EAAE,WAAqB;QAC3C,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,kBAAkB,EAAE;YAC7C,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,GAAG;SACJ,CAAC,CAAC;QACH,IAAI,WAAW,KAAK,KAAK,EAAE,CAAC;YAC1B,MAAM,IAAI,CAAC,kBAAkB,EAAE,CAAC;QAClC,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,eAAe,CAAC,QAAgB,EAAE,OAAgB;QACtD,MAAM,MAAM,GAA4B;YACtC,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,QAAQ;SACT,CAAC;QACF,IAAI,OAAO,KAAK,SAAS;YAAE,MAAM,CAAC,SAAS,CAAC,GAAG,OAAO,CAAC;QACvD,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,2BAA2B,EAAE,MAAM,CAAC,CAAC;IAClE,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,kBAAkB,CAAC,OAAgB;QACvC,MAAM,MAAM,GAA4B,EAAE,UAAU,EAAE,IAAI,CAAC,UAAU,EAAE,CAAC;QACxE,IAAI,OAAO,KAAK,SAAS;YAAE,MAAM,CAAC,SAAS,CAAC,GAAG,OAAO,CAAC;QACvD,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,+BAA+B,EAAE,MAAM,CAAC,CAAC;IACtE,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI,CAAC,EAAU;QACnB,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,cAAc,EAAE;YACzC,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,OAAO,EAAE,EAAE;SACZ,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,KAAK,CAAC,QAAgB;QAC1B,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,eAAe,EAAE;YAC1C,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,QAAQ;SACT,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,QAAQ,CAAC,CAAS,EAAE,CAAS;QACjC,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,mBAAmB,EAAE;YAC9C,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,CAAC;YACD,CAAC;SACF,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,cAAc;QAClB,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,0BAA0B,EAAE;YACrD,UAAU,EAAE,IAAI,CAAC,UAAU;SAC5B,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,UAAU;QACd,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,qBAAqB,EAAE;YAC/D,UAAU,EAAE,IAAI,CAAC,UAAU;SAC5B,CAAC,CAAC;QACH,IAAI,OAAO,MAAM,KAAK,QAAQ;YAAE,OAAO,MAAM,CAAC;QAC9C,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;YAClD,MAAM,CAAC,GAAG,MAAiC,CAAC;YAC5C,OAAO,MAAM,CAAC,CAAC,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,IAAI,SAAS,CAAC,CAAC;QAC1D,CAAC;QACD,OAAO,SAAS,CAAC;IACnB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,WAAW,CAAC,QAAiB;QACjC,MAAM,MAAM,GAA4B,EAAE,UAAU,EA
AE,IAAI,CAAC,UAAU,EAAE,CAAC;QACxE,IAAI,QAAQ;YAAE,MAAM,CAAC,UAAU,CAAC,GAAG,QAAQ,CAAC;QAC5C,OAAO,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,sBAAsB,EAAE,MAAM,CAAC,CAAC;IACpE,CAAC;CACF;AAED;;GAEG;AACH,SAAS,YAAY,CAAC,MAAe;IACnC,IAAI,OAAO,MAAM,KAAK,QAAQ;QAAE,OAAO,MAAM,CAAC;IAC9C,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;QAClD,MAAM,CAAC,GAAG,MAAiC,CAAC;QAC5C,KAAK,MAAM,GAAG,IAAI,CAAC,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,CAAC;YAC1D,IAAI,OAAO,CAAC,CAAC,GAAG,CAAC,KAAK,QAAQ;gBAAE,OAAO,CAAC,CAAC,GAAG,CAAW,CAAC;QAC1D,CAAC;IACH,CAAC;IACD,OAAO,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC;AAC9B,CAAC"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Universal extraction module for Owl Browser Node.js SDK.
|
|
3
|
+
*
|
|
4
|
+
* Provides structured data extraction from any website using CSS selectors,
|
|
5
|
+
* pattern detection, table parsing, metadata extraction, and multi-page scraping.
|
|
6
|
+
*/
|
|
7
|
+
export { Extractor } from './extractor.js';
|
|
8
|
+
export type { Transform, FieldSpec, ObjectFieldSpec, ExtractedRecord, ExtractionResult, MetaData, FeedLink, DetectedPattern, DetectOptions, TableOptions, CleanOptions, CleanResult, ScrapeOptions, PaginationConfig, FollowConfig, PageInfo, ListOptions, } from './types.js';
|
|
9
|
+
export { applyTransform, applyTransforms, applyPattern, coerceType, parsePrice, parseDate, resolveUrl, } from './transforms.js';
|
|
10
|
+
export { extractAll, extractFirst, count, } from './selector-engine.js';
|
|
11
|
+
export { extractTable, extractGrid, extractDefinitionList, detectTables, } from './table-extractor.js';
|
|
12
|
+
export { extractMeta, extractJsonLd, extractOpenGraph, extractTwitterCard, extractMicrodata, extractFeeds, extractCanonical, extractFavicon, } from './meta-extractor.js';
|
|
13
|
+
export { detect, detectAndExtract, } from './pattern-detector.js';
|
|
14
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/extraction/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAG3C,YAAY,EACV,SAAS,EACT,SAAS,EACT,eAAe,EACf,eAAe,EACf,gBAAgB,EAChB,QAAQ,EACR,QAAQ,EACR,eAAe,EACf,aAAa,EACb,YAAY,EACZ,YAAY,EACZ,WAAW,EACX,aAAa,EACb,gBAAgB,EAChB,YAAY,EACZ,QAAQ,EACR,WAAW,GACZ,MAAM,YAAY,CAAC;AAGpB,OAAO,EACL,cAAc,EACd,eAAe,EACf,YAAY,EACZ,UAAU,EACV,UAAU,EACV,SAAS,EACT,UAAU,GACX,MAAM,iBAAiB,CAAC;AAGzB,OAAO,EACL,UAAU,EACV,YAAY,EACZ,KAAK,GACN,MAAM,sBAAsB,CAAC;AAG9B,OAAO,EACL,YAAY,EACZ,WAAW,EACX,qBAAqB,EACrB,YAAY,GACb,MAAM,sBAAsB,CAAC;AAG9B,OAAO,EACL,WAAW,EACX,aAAa,EACb,gBAAgB,EAChB,kBAAkB,EAClB,gBAAgB,EAChB,YAAY,EACZ,gBAAgB,EAChB,cAAc,GACf,MAAM,qBAAqB,CAAC;AAG7B,OAAO,EACL,MAAM,EACN,gBAAgB,GACjB,MAAM,uBAAuB,CAAC"}
|