webpeel 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -4
- package/dist/cli-auth.d.ts +6 -0
- package/dist/cli-auth.d.ts.map +1 -1
- package/dist/cli-auth.js.map +1 -1
- package/dist/cli.js +463 -22
- package/dist/cli.js.map +1 -1
- package/dist/core/challenge-detection.d.ts.map +1 -1
- package/dist/core/challenge-detection.js +39 -6
- package/dist/core/challenge-detection.js.map +1 -1
- package/dist/core/extract-listings.d.ts.map +1 -1
- package/dist/core/extract-listings.js +167 -36
- package/dist/core/extract-listings.js.map +1 -1
- package/dist/core/fetcher.d.ts +6 -0
- package/dist/core/fetcher.d.ts.map +1 -1
- package/dist/core/fetcher.js +147 -11
- package/dist/core/fetcher.js.map +1 -1
- package/dist/core/hotel-search.d.ts +121 -0
- package/dist/core/hotel-search.d.ts.map +1 -0
- package/dist/core/hotel-search.js +381 -0
- package/dist/core/hotel-search.js.map +1 -0
- package/dist/core/llm-extract.d.ts +42 -0
- package/dist/core/llm-extract.d.ts.map +1 -0
- package/dist/core/llm-extract.js +144 -0
- package/dist/core/llm-extract.js.map +1 -0
- package/dist/core/profiles.d.ts +48 -0
- package/dist/core/profiles.d.ts.map +1 -0
- package/dist/core/profiles.js +211 -0
- package/dist/core/profiles.js.map +1 -0
- package/dist/core/schema-extraction.d.ts +67 -0
- package/dist/core/schema-extraction.d.ts.map +1 -0
- package/dist/core/schema-extraction.js +353 -0
- package/dist/core/schema-extraction.js.map +1 -0
- package/dist/core/strategies.d.ts +5 -0
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +9 -2
- package/dist/core/strategies.js.map +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -1
- package/dist/index.js.map +1 -1
- package/dist/types.d.ts +6 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WebPeel Profile Management
|
|
3
|
+
*
|
|
4
|
+
* Manages named browser profiles stored in ~/.webpeel/profiles/<name>/
|
|
5
|
+
* Each profile contains:
|
|
6
|
+
* - storage-state.json (Playwright storage state: cookies, localStorage, origins)
|
|
7
|
+
* - metadata.json (name, created, lastUsed, domains, description)
|
|
8
|
+
*/
|
|
9
|
+
import { chromium } from 'playwright';
|
|
10
|
+
import { homedir } from 'os';
|
|
11
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync, rmSync, readdirSync, } from 'fs';
|
|
12
|
+
import path from 'path';
|
|
13
|
+
// ─── Paths ───────────────────────────────────────────────────────────────────
|
|
14
|
+
const PROFILES_DIR = path.join(homedir(), '.webpeel', 'profiles');
|
|
15
|
+
/**
 * Make sure the profiles root directory exists, creating it (together with
 * any missing parent directories) when it is absent.
 */
function ensureProfilesDir() {
    if (existsSync(PROFILES_DIR)) {
        return;
    }
    mkdirSync(PROFILES_DIR, { recursive: true });
}
|
|
20
|
+
// ─── Name validation ─────────────────────────────────────────────────────────
|
|
21
|
+
/**
 * Check whether a value is an acceptable profile name.
 *
 * A valid name is a string of 1-64 characters made up solely of ASCII
 * letters, digits and hyphens (no spaces, dots, slashes or other characters).
 *
 * Non-string input is rejected up front. Without the type guard the regex
 * would coerce its argument to a string, so an array such as `['abc']` would
 * be accepted and `null`/`undefined` would throw when `.length` is read.
 *
 * @param {string} name - Candidate profile name.
 * @returns {boolean} `true` when the name is valid, otherwise `false`.
 */
export function isValidProfileName(name) {
    if (typeof name !== 'string') {
        return false;
    }
    // The `+` quantifier already guarantees at least one character.
    return name.length <= 64 && /^[a-zA-Z0-9-]+$/.test(name);
}
|
|
27
|
+
// ─── Core helpers ─────────────────────────────────────────────────────────────
|
|
28
|
+
/**
 * Get the directory path for a named profile.
 *
 * A profile counts as existing only when its directory contains a
 * `metadata.json` file.
 *
 * @param {string} name - Profile name.
 * @returns {string | null} Absolute profile directory, or `null` when the
 *   name is invalid or no such profile exists.
 */
export function getProfilePath(name) {
    // Reject names that createProfile() could never have produced before
    // building a path from them, so input such as "../.." cannot resolve to a
    // directory outside PROFILES_DIR.
    if (!isValidProfileName(name)) {
        return null;
    }
    const dir = path.join(PROFILES_DIR, name);
    // If metadata.json exists, its parent directory necessarily exists too.
    return existsSync(path.join(dir, 'metadata.json')) ? dir : null;
}
|
|
38
|
+
/**
 * Load the Playwright storage state (cookies + localStorage) for a named profile.
 *
 * @param {string} name - Profile name.
 * @returns {object | null} The parsed contents of `storage-state.json`, or
 *   `null` when the name is invalid, the file does not exist, or the file
 *   cannot be read or parsed.
 */
export function loadStorageState(name) {
    // Validate before joining so a crafted name cannot make this read a
    // storage-state.json located outside PROFILES_DIR.
    if (!isValidProfileName(name)) {
        return null;
    }
    const statePath = path.join(PROFILES_DIR, name, 'storage-state.json');
    if (!existsSync(statePath)) {
        return null;
    }
    try {
        return JSON.parse(readFileSync(statePath, 'utf-8'));
    }
    catch {
        // Unreadable or corrupt state is reported the same way as a missing one.
        return null;
    }
}
|
|
53
|
+
/**
 * Update the lastUsed timestamp for a profile.
 *
 * This is a best-effort operation: an invalid name, a missing profile, or a
 * read/parse/write failure all leave the filesystem untouched and are not
 * reported to the caller.
 *
 * @param {string} name - Profile name.
 * @returns {void}
 */
export function touchProfile(name) {
    // Validate before joining so a crafted name cannot make this rewrite a
    // metadata.json located outside PROFILES_DIR.
    if (!isValidProfileName(name)) {
        return;
    }
    const metaPath = path.join(PROFILES_DIR, name, 'metadata.json');
    if (!existsSync(metaPath)) {
        return;
    }
    try {
        const meta = JSON.parse(readFileSync(metaPath, 'utf-8'));
        meta.lastUsed = new Date().toISOString();
        writeFileSync(metaPath, JSON.stringify(meta, null, 2));
    }
    catch {
        /* ignore — updating the timestamp must never break the caller */
    }
}
|
|
69
|
+
/**
 * List all profiles, sorted by lastUsed descending.
 *
 * Every sub-directory of the profiles root that contains a parseable
 * `metadata.json` holding a JSON object contributes one entry. Directories
 * with missing, unreadable or malformed metadata are skipped.
 *
 * @returns {object[]} Parsed metadata objects, most recently used first.
 *   Entries without a string `lastUsed` sort after all dated entries.
 */
export function listProfiles() {
    ensureProfilesDir();
    const profiles = [];
    try {
        for (const entry of readdirSync(PROFILES_DIR, { withFileTypes: true })) {
            if (!entry.isDirectory()) {
                continue;
            }
            const metaPath = path.join(PROFILES_DIR, entry.name, 'metadata.json');
            if (!existsSync(metaPath)) {
                continue;
            }
            try {
                const meta = JSON.parse(readFileSync(metaPath, 'utf-8'));
                // Valid JSON is not necessarily a metadata object ("null", "[]",
                // "42" all parse); only keep plain objects so the sort below
                // cannot dereference null.
                if (meta !== null && typeof meta === 'object' && !Array.isArray(meta)) {
                    profiles.push(meta);
                }
            }
            catch {
                /* skip corrupt profile */
            }
        }
    }
    catch {
        /* ignore read errors */
    }
    // Sort: most recently used first. A hand-edited profile may lack a string
    // `lastUsed`; treat it as the empty string instead of throwing.
    const usedAt = (profile) => (typeof profile.lastUsed === 'string' ? profile.lastUsed : '');
    profiles.sort((a, b) => usedAt(b).localeCompare(usedAt(a)));
    return profiles;
}
|
|
99
|
+
/**
 * Delete a named profile.
 *
 * @param {string} name - Profile name.
 * @returns {boolean} `true` if the profile directory was removed; `false` if
 *   the name is invalid, the directory does not exist, or removal failed.
 */
export function deleteProfile(name) {
    // This ends in a recursive, forced removal, so the name MUST be validated
    // first: an unchecked ".." would resolve to the parent of PROFILES_DIR and
    // wipe it. Invalid names can never belong to a profile created by
    // createProfile(), so they are reported as "not found".
    if (!isValidProfileName(name)) {
        return false;
    }
    const dir = path.join(PROFILES_DIR, name);
    if (!existsSync(dir)) {
        return false;
    }
    try {
        rmSync(dir, { recursive: true, force: true });
        return true;
    }
    catch {
        return false;
    }
}
|
|
114
|
+
// ─── Interactive profile creation ─────────────────────────────────────────────
|
|
115
|
+
/**
 * Interactively create a new profile:
 * 1. Launches a VISIBLE (headed) Chromium browser
 * 2. User navigates and logs into sites
 * 3. On browser close or Ctrl+C, captures storage state and saves the profile
 *
 * While the browser is open the storage state is also snapshotted
 * periodically. Once the browser has disconnected (window closed, or the
 * browser process received the same Ctrl+C), a live capture typically
 * rejects, so the most recent snapshot is used as a fallback instead of
 * discarding the session.
 *
 * @param {string} name - Profile name; must satisfy isValidProfileName().
 * @param {string} [description] - Optional free-text description stored in metadata.json.
 * @returns {Promise<void>} Resolves once the profile was saved (or saving
 *   failed and the partial directory was removed) and the browser is closed.
 * @throws {Error} If the name is invalid, the profile already exists, or the
 *   browser cannot be launched.
 */
export async function createProfile(name, description) {
    if (!isValidProfileName(name)) {
        throw new Error(`Invalid profile name "${name}". Use only letters, numbers, and hyphens (no spaces or special characters).`);
    }
    ensureProfilesDir();
    const profileDir = path.join(PROFILES_DIR, name);
    if (existsSync(profileDir)) {
        throw new Error(`Profile "${name}" already exists. Delete it first with:\n webpeel profile delete ${name}`);
    }
    mkdirSync(profileDir, { recursive: true });
    const removeProfileDir = () => {
        try {
            rmSync(profileDir, { recursive: true, force: true });
        }
        catch {
            /* ignore */
        }
    };
    // Launch headed (visible) Chromium — no user-data-dir so we start fresh
    let browser;
    let context;
    try {
        browser = await chromium.launch({ headless: false });
        context = await browser.newContext();
        const page = await context.newPage();
        await page.goto('about:blank').catch(() => { });
    }
    catch (e) {
        // Do not leave an empty directory behind: it would make the next
        // attempt fail with "already exists" although no profile was saved.
        removeProfileDir();
        if (browser) {
            await browser.close().catch(() => { });
        }
        throw e;
    }
    console.log('');
    console.log('╔══════════════════════════════════════════════════════╗');
    console.log(`║ WebPeel Profile Setup: "${name}"`);
    console.log('║ ║');
    console.log('║ Navigate to websites and log in. ║');
    console.log('║ When done, press Ctrl+C or close this window. ║');
    console.log('╚══════════════════════════════════════════════════════╝');
    console.log('');
    // Keep a recent copy of the storage state while the context is still alive.
    const SNAPSHOT_INTERVAL_MS = 2000;
    let lastSnapshot = null;
    let snapshotInFlight = false;
    let saved = false;
    const snapshotTimer = setInterval(async () => {
        if (saved || snapshotInFlight) {
            return;
        }
        snapshotInFlight = true;
        try {
            lastSnapshot = await context.storageState();
        }
        catch {
            /* context is probably shutting down — keep the previous snapshot */
        }
        finally {
            snapshotInFlight = false;
        }
    }, SNAPSHOT_INTERVAL_MS);
    const captureAndPersist = async () => {
        console.log('\nCapturing browser session...');
        try {
            let storageState;
            try {
                storageState = await context.storageState();
            }
            catch (e) {
                // Live capture is impossible after a disconnect; fall back to
                // the last periodic snapshot when there is one.
                if (lastSnapshot === null) {
                    throw e;
                }
                storageState = lastSnapshot;
            }
            writeFileSync(path.join(profileDir, 'storage-state.json'), JSON.stringify(storageState, null, 2));
            // Extract unique domains from cookies (strip leading dot)
            const domains = [
                ...new Set((storageState.cookies ?? [])
                    .map((c) => (c.domain ?? '').replace(/^\./, ''))
                    .filter(Boolean)),
            ];
            const now = new Date().toISOString();
            const meta = {
                name,
                created: now,
                lastUsed: now,
                domains,
                ...(description ? { description } : {}),
            };
            writeFileSync(path.join(profileDir, 'metadata.json'), JSON.stringify(meta, null, 2));
            console.log(`✓ Profile "${name}" saved to ${profileDir}`);
            if (domains.length > 0) {
                console.log(` Domains: ${domains.join(', ')}`);
            }
            else {
                console.log(' No login sessions detected (no cookies).');
            }
        }
        catch (e) {
            console.error('Warning: Failed to save storage state:', e instanceof Error ? e.message : String(e));
            // Clean up partial directory
            removeProfileDir();
        }
        try {
            await browser.close();
        }
        catch {
            /* ignore — browser may already be closed */
        }
    };
    // Both triggers (disconnect and SIGINT) can fire for one session; share a
    // single promise so the save runs once and every waiter sees it finish.
    let savePromise = null;
    const saveAndClose = () => {
        if (savePromise === null) {
            saved = true;
            clearInterval(snapshotTimer);
            savePromise = captureAndPersist();
        }
        return savePromise;
    };
    // Wait for the browser to disconnect (user closed the window) OR SIGINT (Ctrl+C)
    await new Promise((resolve) => {
        const finish = async () => {
            // Whichever trigger came first, make sure the SIGINT hook is gone.
            process.removeListener('SIGINT', finish);
            await saveAndClose();
            resolve();
        };
        browser.once('disconnected', finish);
        process.once('SIGINT', finish);
    });
}
|
|
211
|
+
//# sourceMappingURL=profiles.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"profiles.js","sourceRoot":"","sources":["../../src/core/profiles.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,OAAO,EAAE,MAAM,IAAI,CAAC;AAC7B,OAAO,EACL,UAAU,EACV,SAAS,EACT,YAAY,EACZ,aAAa,EACb,MAAM,EACN,WAAW,GACZ,MAAM,IAAI,CAAC;AACZ,OAAO,IAAI,MAAM,MAAM,CAAC;AAYxB,gFAAgF;AAEhF,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,UAAU,EAAE,UAAU,CAAC,CAAC;AAElE,SAAS,iBAAiB;IACxB,IAAI,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;QAC9B,SAAS,CAAC,YAAY,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC/C,CAAC;AACH,CAAC;AAED,gFAAgF;AAEhF;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAAC,IAAY;IAC7C,OAAO,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,IAAI,IAAI,CAAC,MAAM,IAAI,EAAE,CAAC;AAC9E,CAAC;AAED,iFAAiF;AAEjF;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,CAAC,CAAC;IAC1C,IAAI,UAAU,CAAC,GAAG,CAAC,IAAI,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,eAAe,CAAC,CAAC,EAAE,CAAC;QACnE,OAAO,GAAG,CAAC;IACb,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC3C,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,EAAE,oBAAoB,CAAC,CAAC;IACtE,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;QAAE,OAAO,IAAI,CAAC;IACxC,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC;IACtD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,IAAY;IACvC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,EAAE,eAAe,CAAC,CAAC;IAChE,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;QAAE,OAAO;IAClC,IAAI,CAAC;QACH,MAAM,IAAI,GAAoB,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC;QAC1E,IAAI,CAAC,QAAQ,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QACzC,aAAa,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IACzD,CAAC;IAAC,MAAM,CAAC;QACP,YAAY;IACd,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY;IAC1B,iBAAiB,EAAE,CAAC;IACpB,MAAM,QAAQ,GAAsB,EAAE,CAAC;IACvC,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,WAAW,CAAC,YAAY,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC;Q
ACnE,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;YAC5B,IAAI,CAAC,KAAK,CAAC,WAAW,EAAE;gBAAE,SAAS;YACnC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,KAAK,CAAC,IAAI,EAAE,eAAe,CAAC,CAAC;YACtE,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;gBAAE,SAAS;YACpC,IAAI,CAAC;gBACH,MAAM,IAAI,GAAoB,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC;gBAC1E,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACtB,CAAC;YAAC,MAAM,CAAC;gBACP,0BAA0B;YAC5B,CAAC;QACH,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,wBAAwB;IAC1B,CAAC;IACD,iCAAiC;IACjC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC;IAC9D,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,CAAC,CAAC;IAC1C,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;QAAE,OAAO,KAAK,CAAC;IACnC,IAAI,CAAC;QACH,MAAM,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;QAC9C,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED,iFAAiF;AAEjF;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CAAC,IAAY,EAAE,WAAoB;IACpE,IAAI,CAAC,kBAAkB,CAAC,IAAI,CAAC,EAAE,CAAC;QAC9B,MAAM,IAAI,KAAK,CACb,yBAAyB,IAAI,8EAA8E,CAC5G,CAAC;IACJ,CAAC;IAED,iBAAiB,EAAE,CAAC;IAEpB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,CAAC,CAAC;IACjD,IAAI,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CACb,YAAY,IAAI,qEAAqE,IAAI,EAAE,CAC5F,CAAC;IACJ,CAAC;IAED,SAAS,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE3C,wEAAwE;IACxE,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;IAC3D,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC;IAC3C,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;IACrC,MAAM,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;IAE/C,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAChB,OAAO,CAAC,GAAG,CAAC,0DAA0D,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,CAAC,8BAA8B,IAAI,GAAG,CAAC,CAAC;IACnD,OAAO,CAAC,GAAG,CAAC,0DAA0D,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,CAAC,yDAAyD,CAAC,CAAC;IACvE,OAAO,CAAC,GAAG,CAAC,yDAAyD,CAAC,CAAC;IACvE,OAAO,CAAC,GAA
G,CAAC,0DAA0D,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAEhB,IAAI,KAAK,GAAG,KAAK,CAAC;IAElB,MAAM,YAAY,GAAG,KAAK,IAAmB,EAAE;QAC7C,IAAI,KAAK;YAAE,OAAO;QAClB,KAAK,GAAG,IAAI,CAAC;QAEb,OAAO,CAAC,GAAG,CAAC,gCAAgC,CAAC,CAAC;QAE9C,IAAI,CAAC;YACH,MAAM,YAAY,GAAG,MAAM,OAAO,CAAC,YAAY,EAAE,CAAC;YAElD,aAAa,CACX,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,oBAAoB,CAAC,EAC3C,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,IAAI,EAAE,CAAC,CAAC,CACtC,CAAC;YAEF,0DAA0D;YAC1D,MAAM,OAAO,GAAa;gBACxB,GAAG,IAAI,GAAG,CACR,CAAC,YAAY,CAAC,OAAO,IAAI,EAAE,CAAC;qBACzB,GAAG,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;qBACpD,MAAM,CAAC,OAAO,CAAC,CACnB;aACF,CAAC;YAEF,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;YACrC,MAAM,IAAI,GAAoB;gBAC5B,IAAI;gBACJ,OAAO,EAAE,GAAG;gBACZ,QAAQ,EAAE,GAAG;gBACb,OAAO;gBACP,GAAG,CAAC,WAAW,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;aACxC,CAAC;YAEF,aAAa,CACX,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,eAAe,CAAC,EACtC,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,CAC9B,CAAC;YAEF,OAAO,CAAC,GAAG,CAAC,cAAc,IAAI,cAAc,UAAU,EAAE,CAAC,CAAC;YAC1D,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACvB,OAAO,CAAC,GAAG,CAAC,cAAc,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YAClD,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,GAAG,CAAC,4CAA4C,CAAC,CAAC;YAC5D,CAAC;QACH,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CACX,wCAAwC,EACxC,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAC3C,CAAC;YACF,6BAA6B;YAC7B,IAAI,CAAC;gBACH,MAAM,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;YACvD,CAAC;YAAC,MAAM,CAAC;gBACP,YAAY;YACd,CAAC;QACH,CAAC;QAED,IAAI,CAAC;YACH,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;QACxB,CAAC;QAAC,MAAM,CAAC;YACP,4CAA4C;QAC9C,CAAC;IACH,CAAC,CAAC;IAEF,iFAAiF;IACjF,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE;QAClC,OAAO,CAAC,EAAE,CAAC,cAAc,EAAE,KAAK,IAAI,EAAE;YACpC,MAAM,YAAY,EAAE,CAAC;YACrB,OAAO,EAAE,CAAC;QACZ,CAAC,CAAC,CAAC;QAEH,2BAA2B;QAC3B,MAAM,aAAa,GAAG,KAAK,IAAI,EAAE;YAC/B,MAAM,YAAY,EAAE,CAAC;YACrB,OAAO,EAAE,CAAC;QA
CZ,CAAC,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;QAEtC,sDAAsD;QACtD,OAAO,CAAC,EAAE,CAAC,cAAc,EAAE,GAAG,EAAE;YAC9B,OAAO,CAAC,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;QAClD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Schema-based extraction using CSS selectors.
|
|
3
|
+
*
|
|
4
|
+
* Each schema defines how to extract listings from a specific domain,
|
|
5
|
+
* inspired by Crawl4AI's JsonCssExtractionStrategy. Unlike generic
|
|
6
|
+
* auto-detection, schemas provide exact selectors for each site's DOM.
|
|
7
|
+
*
|
|
8
|
+
* @module schema-extraction
|
|
9
|
+
*/
|
|
10
|
+
/** Describes how one output field is pulled out of a listing element. */
export interface SchemaField {
    /** Key under which the value appears in the output (e.g., "title", "price", "rating"). */
    name: string;
    /** CSS selector evaluated relative to baseSelector; an empty string targets the base element itself. */
    selector: string;
    /** Kind of value to read from the matched element. */
    type:
        | 'text'
        | 'attribute'
        | 'html'
        | 'exists';
    /** Attribute to read when type is 'attribute'. */
    attribute?: string;
    /** When set, every match is collected into an array rather than only the first. */
    multiple?: boolean;
    /** Optional post-processing step applied to the extracted value. */
    transform?:
        | 'trim'
        | 'number'
        | 'stripCurrency';
}
|
|
24
|
+
/** Per-site extraction recipe: which pages it applies to and what to read from each listing. */
export interface ExtractionSchema {
    /** Display name of the schema (e.g., "Booking.com Hotel Search"). */
    name: string;
    /** Version string of the schema. */
    version: string;
    /** Domains the schema applies to (e.g., ["booking.com", "www.booking.com"]). */
    domains: Array<string>;
    /** Optional regex strings matched against the URL to narrow where the schema applies. */
    urlPatterns?: Array<string>;
    /** CSS selector that matches one element per listing item. */
    baseSelector: string;
    /** Field definitions evaluated for every listing item. */
    fields: Array<SchemaField>;
    /** Optional pagination settings. */
    pagination?: {
        nextSelector?: string;
        pageParam?: string;
    };
}
|
|
43
|
+
/** One extracted listing: each schema field name maps to the value read for it. */
export interface ExtractedItem {
    [fieldName: string]: undefined | string | Array<string> | number | boolean;
}
|
|
47
|
+
/**
 * Return the schemas that ship with the package.
 */
export declare function loadBundledSchemas(): Array<ExtractionSchema>;
|
|
51
|
+
/**
 * Look up the schema that applies to a URL.
 *
 * The domain is matched first; URL patterns (regex), when a schema has them,
 * narrow the match further. The first schema that matches is returned, or
 * null when none does.
 */
export declare function findSchemaForUrl(url: string): null | ExtractionSchema;
|
|
58
|
+
/**
 * Run a schema's CSS selectors over an HTML document and collect the listings.
 *
 * @param html - Raw HTML string to parse
 * @param schema - Extraction schema whose selectors are applied
 * @param baseUrl - Optional base URL used to resolve relative links
 * @returns The extracted items; the array may be empty
 */
export declare function extractWithSchema(
    html: string,
    schema: ExtractionSchema,
    baseUrl?: string,
): Array<ExtractedItem>;
|
|
67
|
+
//# sourceMappingURL=schema-extraction.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"schema-extraction.d.ts","sourceRoot":"","sources":["../../src/core/schema-extraction.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AASH,MAAM,WAAW,WAAW;IAC1B,8DAA8D;IAC9D,IAAI,EAAE,MAAM,CAAC;IACb,2FAA2F;IAC3F,QAAQ,EAAE,MAAM,CAAC;IACjB,sBAAsB;IACtB,IAAI,EAAE,MAAM,GAAG,WAAW,GAAG,MAAM,GAAG,QAAQ,CAAC;IAC/C,oDAAoD;IACpD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,iEAAiE;IACjE,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,mDAAmD;IACnD,SAAS,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,eAAe,CAAC;CACjD;AAED,MAAM,WAAW,gBAAgB;IAC/B,oEAAoE;IACpE,IAAI,EAAE,MAAM,CAAC;IACb,4BAA4B;IAC5B,OAAO,EAAE,MAAM,CAAC;IAChB,kEAAkE;IAClE,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,4EAA4E;IAC5E,WAAW,CAAC,EAAE,MAAM,EAAE,CAAC;IACvB,yCAAyC;IACzC,YAAY,EAAE,MAAM,CAAC;IACrB,uCAAuC;IACvC,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,iCAAiC;IACjC,UAAU,CAAC,EAAE;QACX,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,CAAC;CACH;AAED,oEAAoE;AACpE,MAAM,WAAW,aAAa;IAC5B,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,GAAG,MAAM,EAAE,GAAG,OAAO,GAAG,MAAM,GAAG,SAAS,CAAC;CACjE;AA4PD;;GAEG;AACH,wBAAgB,kBAAkB,IAAI,gBAAgB,EAAE,CAEvD;AAED;;;;;GAKG;AACH,wBAAgB,gBAAgB,CAAC,GAAG,EAAE,MAAM,GAAG,gBAAgB,GAAG,IAAI,CAsCrE;AAED;;;;;;;GAOG;AACH,wBAAgB,iBAAiB,CAC/B,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,gBAAgB,EACxB,OAAO,CAAC,EAAE,MAAM,GACf,aAAa,EAAE,CA0DjB"}
|
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Schema-based extraction using CSS selectors.
|
|
3
|
+
*
|
|
4
|
+
* Each schema defines how to extract listings from a specific domain,
|
|
5
|
+
* inspired by Crawl4AI's JsonCssExtractionStrategy. Unlike generic
|
|
6
|
+
* auto-detection, schemas provide exact selectors for each site's DOM.
|
|
7
|
+
*
|
|
8
|
+
* @module schema-extraction
|
|
9
|
+
*/
|
|
10
|
+
import { load } from 'cheerio';
|
|
11
|
+
/* ------------------------------------------------------------------ */
|
|
12
|
+
/* Bundled schemas (hardcoded to avoid JSON import complications) */
|
|
13
|
+
/* ------------------------------------------------------------------ */
|
|
14
|
+
/**
 * Selectors for Booking.com hotel search result pages.
 * Each selector string lists comma-separated alternatives.
 */
const BOOKING_COM_SCHEMA = {
    name: 'Booking.com Hotel Search',
    version: '1.0',
    domains: [
        'booking.com',
        'www.booking.com',
    ],
    urlPatterns: [
        'searchresults',
    ],
    baseSelector: "[data-testid='property-card']",
    fields: [
        {
            name: 'title',
            selector: "[data-testid='title'], .sr-hotel__name, h3 a",
            type: 'text',
        },
        {
            name: 'price',
            selector: "[data-testid='price-and-discounted-price'], .bui-price-display__value, [data-testid='price-for-x-nights']",
            type: 'text',
            transform: 'trim',
        },
        {
            name: 'rating',
            selector: "[data-testid='review-score'] div:first-child, .bui-review-score__badge",
            type: 'text',
        },
        {
            name: 'reviewCount',
            selector: "[data-testid='review-score'] div:nth-child(2) div:nth-child(2), .bui-review-score__text",
            type: 'text',
        },
        {
            name: 'location',
            selector: "[data-testid='address'], .sr_card_address_line",
            type: 'text',
        },
        {
            name: 'link',
            selector: "a[data-testid='title-link'], h3 a, a.hotel_name_link",
            type: 'attribute',
            attribute: 'href',
        },
        {
            name: 'image',
            selector: "img[data-testid='image'], img.hotel_image",
            type: 'attribute',
            attribute: 'src',
        },
        {
            name: 'stars',
            selector: "[data-testid='rating-stars'] span, .bui-star-rating .bui-star-rating__star",
            type: 'text',
        },
    ],
};
|
|
31
|
+
/**
 * Selectors for Amazon product search result pages.
 * Each selector string lists comma-separated alternatives.
 */
const AMAZON_COM_SCHEMA = {
    name: 'Amazon Product Search',
    version: '1.0',
    domains: [
        'amazon.com',
        'www.amazon.com',
        'amazon.co.uk',
        'amazon.de',
        'amazon.fr',
        'amazon.ca',
    ],
    urlPatterns: [
        '/s\\?',
        '/s/',
    ],
    baseSelector: "[data-component-type='s-search-result']",
    fields: [
        {
            name: 'title',
            selector: 'h2 a span, h2 span a span',
            type: 'text',
        },
        {
            name: 'price',
            selector: '.a-price .a-offscreen',
            type: 'text',
        },
        {
            name: 'originalPrice',
            selector: '.a-price.a-text-price .a-offscreen',
            type: 'text',
        },
        {
            name: 'rating',
            selector: '.a-icon-star-small .a-icon-alt, .a-icon-star-mini .a-icon-alt',
            type: 'text',
        },
        {
            name: 'reviewCount',
            selector: "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span, .a-size-base.s-underline-text",
            type: 'text',
        },
        {
            name: 'link',
            selector: 'h2 a',
            type: 'attribute',
            attribute: 'href',
        },
        {
            name: 'image',
            selector: '.s-image',
            type: 'attribute',
            attribute: 'src',
        },
        {
            name: 'sponsored',
            selector: '.puis-sponsored-label-text',
            type: 'exists',
        },
        // Empty selector: the attribute is read from the listing element itself.
        {
            name: 'asin',
            selector: '',
            type: 'attribute',
            attribute: 'data-asin',
        },
    ],
};
|
|
49
|
+
/**
 * Selectors for eBay search result pages.
 * Each selector string lists comma-separated alternatives.
 */
const EBAY_COM_SCHEMA = {
    name: 'eBay Search Results',
    version: '1.0',
    domains: [
        'ebay.com',
        'www.ebay.com',
    ],
    urlPatterns: [
        '/sch/',
    ],
    baseSelector: '.s-item, [data-viewport]',
    fields: [
        {
            name: 'title',
            selector: '.s-item__title span, .s-item__title',
            type: 'text',
        },
        {
            name: 'price',
            selector: '.s-item__price',
            type: 'text',
        },
        {
            name: 'link',
            selector: '.s-item__link, a.s-item__link',
            type: 'attribute',
            attribute: 'href',
        },
        {
            name: 'image',
            selector: '.s-item__image-wrapper img, .s-item__image img',
            type: 'attribute',
            attribute: 'src',
        },
        {
            name: 'condition',
            selector: '.SECONDARY_INFO',
            type: 'text',
        },
        {
            name: 'shipping',
            selector: '.s-item__shipping, .s-item__freeXDays',
            type: 'text',
        },
        {
            name: 'seller',
            selector: '.s-item__seller-info-text',
            type: 'text',
        },
    ],
};
|
|
65
|
+
/**
 * Selectors for Yelp business search result pages.
 * Each selector string lists comma-separated alternatives.
 */
const YELP_COM_SCHEMA = {
    name: 'Yelp Business Search',
    version: '1.0',
    domains: [
        'yelp.com',
        'www.yelp.com',
    ],
    urlPatterns: [
        '/search',
    ],
    baseSelector: "[data-testid='serp-ia-card'], li.border-color--default",
    fields: [
        {
            name: 'title',
            selector: "a[href*='/biz/'] span, h3 a span",
            type: 'text',
        },
        // The rating is read from the aria-label attribute, not from text content.
        {
            name: 'rating',
            selector: "[aria-label*='star rating'], .i-stars",
            type: 'attribute',
            attribute: 'aria-label',
        },
        {
            name: 'reviewCount',
            selector: ".reviewCount, span[class*='css-']",
            type: 'text',
        },
        {
            name: 'price',
            selector: '.priceRange, span.priceRange',
            type: 'text',
        },
        {
            name: 'category',
            selector: ".priceCategory span, p[class*='css-'] a",
            type: 'text',
        },
        {
            name: 'link',
            selector: "a[href*='/biz/']",
            type: 'attribute',
            attribute: 'href',
        },
        {
            name: 'address',
            selector: "address, span[class*='css-']",
            type: 'text',
        },
    ],
};
|
|
81
|
+
/**
 * Selectors for Walmart product search result pages.
 * Each selector string lists comma-separated alternatives.
 */
const WALMART_COM_SCHEMA = {
    name: 'Walmart Product Search',
    version: '1.0',
    domains: [
        'walmart.com',
        'www.walmart.com',
    ],
    urlPatterns: [
        '/search',
    ],
    baseSelector: "[data-testid='list-view'] > div, [data-item-id]",
    fields: [
        {
            name: 'title',
            selector: "a[link-identifier] span, [data-automation-id='product-title']",
            type: 'text',
        },
        {
            name: 'price',
            selector: "[data-automation-id='product-price'] .f2, [itemprop='price']",
            type: 'text',
        },
        {
            name: 'rating',
            selector: "[data-testid='product-ratings'] .w_iUH7, .stars-reviews-count",
            type: 'text',
        },
        {
            name: 'link',
            selector: "a[link-identifier], a[href*='/ip/']",
            type: 'attribute',
            attribute: 'href',
        },
        {
            name: 'image',
            selector: "img[data-testid='productTileImage'], img[loading]",
            type: 'attribute',
            attribute: 'src',
        },
        {
            name: 'seller',
            selector: "[data-automation-id='fulfillment-badge']",
            type: 'text',
        },
    ],
};
|
|
96
|
+
/**
 * Selectors for Hacker News story rows.
 * This schema defines no urlPatterns, so only the domain list restricts it.
 */
const HACKERNEWS_SCHEMA = {
    name: 'Hacker News',
    version: '1.0',
    domains: [
        'news.ycombinator.com',
    ],
    baseSelector: 'tr.athing',
    fields: [
        {
            name: 'title',
            selector: '.titleline a',
            type: 'text',
        },
        {
            name: 'link',
            selector: '.titleline a',
            type: 'attribute',
            attribute: 'href',
        },
        {
            name: 'rank',
            selector: '.rank',
            type: 'text',
        },
        {
            name: 'site',
            selector: '.sitestr',
            type: 'text',
        },
    ],
};
|
|
108
|
+
/**
 * Selectors for Expedia hotel search result pages.
 * Each selector string lists comma-separated alternatives.
 */
const EXPEDIA_COM_SCHEMA = {
    name: 'Expedia Hotel Search',
    version: '1.0',
    domains: [
        'expedia.com',
        'www.expedia.com',
    ],
    urlPatterns: [
        'Hotel-Search',
        'hotel-search',
    ],
    baseSelector: "[data-stid='property-listing'], li.uitk-spacing[class*='uitk-spacing'], [data-stid='lodging-card-responsive']",
    fields: [
        {
            name: 'title',
            selector: "[data-stid='content-hotel-title'], .uitk-heading-5, .uitk-heading-6, h3[class*='uitk-heading']",
            type: 'text',
        },
        {
            name: 'price',
            selector: "[data-stid='price-summary'] .uitk-type-500, [data-stid='price-summary-message-total'], .uitk-type-500",
            type: 'text',
            transform: 'trim',
        },
        {
            name: 'rating',
            selector: "[data-stid='star-rating-msg'], .uitk-badge-base, [aria-label*='out of']",
            type: 'text',
        },
        {
            name: 'reviewCount',
            selector: "[data-stid='review-info-text'], .uitk-type-200",
            type: 'text',
        },
        {
            name: 'location',
            selector: "[data-stid='location-info'], [data-stid='neighborhood-name']",
            type: 'text',
        },
        {
            name: 'link',
            selector: "a[data-stid='open-hotel-information'], a[href*='/h/'], a.uitk-card-link",
            type: 'attribute',
            attribute: 'href',
        },
        {
            name: 'image',
            selector: "img[data-stid='image'], .uitk-image-media img",
            type: 'attribute',
            attribute: 'src',
        },
    ],
};
|
|
124
|
+
/**
 * Every schema shipped with the package, listed in priority order
 * (earlier entries come first).
 */
const BUNDLED_SCHEMAS = [
    // Travel and retail search pages
    BOOKING_COM_SCHEMA, AMAZON_COM_SCHEMA, EBAY_COM_SCHEMA,
    YELP_COM_SCHEMA, WALMART_COM_SCHEMA,
    // News
    HACKERNEWS_SCHEMA,
    // Travel (added after the others)
    EXPEDIA_COM_SCHEMA,
];
|
|
134
|
+
/* ------------------------------------------------------------------ */
|
|
135
|
+
/* Helpers */
|
|
136
|
+
/* ------------------------------------------------------------------ */
|
|
137
|
+
/**
 * Apply a transform to an extracted string value.
 *
 * Supported transforms:
 * - `'trim'`: strip surrounding whitespace.
 * - `'number'`: parse the FIRST numeric token in the text. Commas between
 *   digits are treated as thousands separators and a dot as the decimal
 *   point; a leading sign is not interpreted. When the text contains no
 *   number, the input is returned unchanged.
 * - `'stripCurrency'`: drop every character except digits, dots and commas.
 *
 * A missing or unknown transform returns the value untouched.
 *
 * @param {string} value - Extracted text.
 * @param {string} [transform] - Name of the transform to apply.
 * @returns {string | number} The transformed value; only `'number'` can yield a number.
 */
function applyTransform(value, transform) {
    if (!transform) {
        return value;
    }
    switch (transform) {
        case 'trim':
            return value.trim();
        case 'number': {
            // Deleting every non-digit/dot character before parsing would fuse
            // separate numbers together ("4.5 out of 5" -> "4.55"), so isolate
            // the first numeric token instead.
            const token = value.match(/\d+(?:,\d+)*(?:\.\d+)?|\.\d+/);
            if (token === null) {
                return value;
            }
            const num = Number.parseFloat(token[0].replace(/,/g, ''));
            return Number.isNaN(num) ? value : num;
        }
        case 'stripCurrency':
            // Whitespace is removed by the character class itself, so no trim is needed.
            return value.replace(/[^\d.,]/g, '');
        default:
            return value;
    }
}
|
|
156
|
+
/**
 * Resolve a potentially relative URL against a base URL.
 *
 * `data:` and `javascript:` URLs are dropped. The scheme check is done on a
 * normalised copy of the input, because the URL parser ignores leading
 * whitespace/control characters, embedded tab/newline characters and letter
 * case in the scheme — a plain `startsWith` on the raw string would let
 * values such as `" JavaScript:..."` through.
 *
 * @param {string | undefined} href - Raw attribute value.
 * @param {string} [baseUrl] - Base URL used to resolve relative references.
 * @returns {string | undefined} The absolute URL; the original `href` when no
 *   base is given or resolution fails; `undefined` for empty input or a
 *   rejected scheme.
 */
function resolveUrl(href, baseUrl) {
    if (!href) {
        return undefined;
    }
    // Mirror the parser's preprocessing before inspecting the scheme.
    const probe = href
        .replace(/[\t\n\r]/g, '')
        .replace(/^[\x00-\x20]+/, '')
        .toLowerCase();
    if (probe.startsWith('data:') || probe.startsWith('javascript:')) {
        return undefined;
    }
    if (!baseUrl) {
        return href;
    }
    try {
        return new URL(href, baseUrl).href;
    }
    catch {
        // Unparseable base or reference: hand back the raw value unchanged.
        return href;
    }
}
|
|
173
|
+
/**
 * Read one value of the field's type from an already-selected node.
 *
 * - `text`: trimmed text content, passed through the field's transform;
 *   `undefined` when empty.
 * - `attribute`: the named attribute; `href`/`src` values are resolved against
 *   `baseUrl` (and dropped when the resolver rejects them); the result is
 *   passed through the field's transform.
 * - `html`: inner HTML, no transform.
 * - anything else (including `exists`): `undefined`.
 */
function readNodeValue($node, field, baseUrl) {
    switch (field.type) {
        case 'text': {
            const text = $node.text().trim();
            return text === '' ? undefined : applyTransform(text, field.transform);
        }
        case 'attribute': {
            if (!field.attribute)
                return undefined;
            const attrVal = $node.attr(field.attribute) ?? undefined;
            if (attrVal === undefined)
                return undefined;
            if (field.attribute === 'href' || field.attribute === 'src') {
                const resolved = resolveUrl(attrVal, baseUrl);
                return resolved ? applyTransform(resolved, field.transform) : undefined;
            }
            return applyTransform(attrVal, field.transform);
        }
        case 'html':
            return $node.html() ?? undefined;
        default:
            return undefined;
    }
}
/**
 * Extract a single field value from a cheerio element.
 *
 * An empty, whitespace-only or missing selector means "read from the base
 * element itself". With `field.multiple` (and a real selector) every match is
 * read and the non-empty values are returned as an array; the field's
 * transform is applied to each entry, exactly as in single-match mode.
 *
 * @param $ - Function used to wrap raw matched nodes (called as `$(el)`).
 * @param $el - Wrapped base element for the current item.
 * @param field - Field definition; `selector`, `type`, `attribute`,
 *   `multiple` and `transform` are read.
 * @param [baseUrl] - Base URL for resolving `href`/`src` attributes.
 * @returns The extracted value, an array of values in multiple mode, a boolean
 *   for `exists` fields, or `undefined` when nothing was found.
 */
function extractFieldValue($, $el, field, baseUrl) {
    const useBaseEl = typeof field.selector !== 'string' || field.selector.trim() === '';
    if (field.multiple && !useBaseEl) {
        // Collect every non-empty match ('exists' contributes nothing here).
        const results = [];
        $el.find(field.selector).each((_, el) => {
            const val = readNodeValue($(el), field, baseUrl);
            if (val !== undefined && val !== '')
                results.push(val);
        });
        return results.length > 0 ? results : undefined;
    }
    // Single match mode
    if (field.type === 'exists')
        return useBaseEl ? true : $el.find(field.selector).length > 0;
    const $target = useBaseEl ? $el : $el.find(field.selector).first();
    if ($target.length === 0)
        return undefined;
    return readNodeValue($target, field, baseUrl);
}
|
|
244
|
+
/* ------------------------------------------------------------------ */
|
|
245
|
+
/* Public API */
|
|
246
|
+
/* ------------------------------------------------------------------ */
|
|
247
|
+
/**
 * Load all bundled schemas.
 *
 * @returns A new array holding the bundled schemas, so callers can reorder or
 *   filter the list without touching the module-level collection (the schema
 *   objects themselves are shared, not cloned).
 */
export function loadBundledSchemas() {
    return Array.from(BUNDLED_SCHEMAS);
}
|
|
253
|
+
/**
 * Find a matching schema for a given URL.
 *
 * A schema is a candidate when one of its domains equals the URL's hostname,
 * is a parent domain of it, or is a subdomain of it (comparison is
 * case-insensitive). A candidate without `urlPatterns` matches immediately;
 * otherwise at least one pattern, compiled as a regular expression, must match
 * the full URL. Patterns that fail to compile count as non-matching.
 *
 * @param url - URL to look up.
 * @returns The first matching bundled schema, or `null` when the URL cannot be
 *   parsed or nothing matches.
 */
export function findSchemaForUrl(url) {
    let host;
    try {
        host = new URL(url).hostname.toLowerCase();
    }
    catch {
        return null;
    }
    const coversHost = (domain) => {
        const d = domain.toLowerCase();
        return host === d || host.endsWith('.' + d) || d.endsWith('.' + host);
    };
    const hitsUrl = (pattern) => {
        try {
            return new RegExp(pattern).test(url);
        }
        catch {
            return false;
        }
    };
    const found = BUNDLED_SCHEMAS.find((schema) => {
        if (!schema.domains.some((domain) => coversHost(domain)))
            return false;
        const patterns = schema.urlPatterns;
        // No patterns configured: the domain match alone decides.
        if (!patterns || patterns.length === 0)
            return true;
        return patterns.some((pattern) => hitsUrl(pattern));
    });
    return found ?? null;
}
|
|
295
|
+
/** Accessibility hint appended to link text, e.g. "Opens in a new window or tab". */
const NEW_WINDOW_HINT_RE = /\s*Opens?\s+in\s+(?:a\s+)?new\s+(?:window|tab)(?:\s+or\s+(?:window|tab))?/gi;
/** Leading listing badge ("New Listing", "Sponsored", "Ad") plus optional separator. */
const BADGE_PREFIX_RE = /^(New\s+Listing|Sponsored|Ad)\s*[-–—:·]?\s*/i;
/**
 * Remove a leading listing badge from a title.
 *
 * "Ad" is only treated as a badge when it stands alone ("Ad - Shoes",
 * "Ad Shoes") or is glued to a capitalised word ("AdNike Air", which is what
 * adjacent badge/title elements produce as text). Ordinary words that merely
 * start with those letters ("Adidas", "adapter", "ADIDAS") are left intact.
 */
function stripBadgePrefix(title) {
    return title.replace(BADGE_PREFIX_RE, (matched, label) => {
        if (label.toLowerCase() !== 'ad')
            return '';
        const rest = title.slice(label.length);
        const gluedToLetters = /^\p{L}/u.test(rest);
        const badgeThenCapital = label === 'Ad' && /^\p{Lu}/u.test(rest);
        return gluedToLetters && !badgeThenCapital ? matched : '';
    });
}
/** Strip "opens in new window" hints and a leading badge, then trim. */
function cleanListingTitle(rawTitle) {
    return stripBadgePrefix(rawTitle.replace(NEW_WINDOW_HINT_RE, '')).trim();
}
/**
 * Extract listings from HTML using a schema's CSS selectors.
 *
 * Each element matching `schema.baseSelector` becomes one item whose keys are
 * the schema's field names. When the schema has a `title` or `name` field,
 * that value is cleaned of common junk, items without it are skipped, and
 * items sharing the same title and `price` are de-duplicated (first one wins).
 *
 * @param html - Raw HTML string to parse
 * @param schema - Extraction schema to use
 * @param baseUrl - Optional base URL for resolving relative links
 * @returns Array of extracted items (may be empty)
 */
export function extractWithSchema(html, schema, baseUrl) {
    if (!html || html.trim().length === 0)
        return [];
    const $ = load(html);
    const items = [];
    // Find the title/name field to use for filtering empty items
    const titleFieldName = schema.fields.find(f => f.name === 'title' || f.name === 'name')?.name;
    $(schema.baseSelector).each((_, el) => {
        const $el = $(el);
        const item = {};
        for (const field of schema.fields) {
            const value = extractFieldValue($, $el, field, baseUrl);
            if (value !== undefined) {
                item[field.name] = value;
            }
        }
        if (titleFieldName !== undefined) {
            if (typeof item[titleFieldName] === 'string') {
                item[titleFieldName] = cleanListingTitle(item[titleFieldName]);
            }
            // Skip items with no title/name (likely empty/phantom elements)
            if (!item[titleFieldName])
                return;
        }
        // Skip completely empty items
        if (Object.keys(item).length === 0)
            return;
        items.push(item);
    });
    if (!titleFieldName)
        return items;
    // Deduplicate: remove items with identical title + price (common with nested selectors)
    const seen = new Set();
    return items.filter(item => {
        const key = `${String(item[titleFieldName] ?? '')}|${String(item.price ?? '')}`;
        if (seen.has(key))
            return false;
        seen.add(key);
        return true;
    });
}
|
|
353
|
+
//# sourceMappingURL=schema-extraction.js.map
|