webpeel 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/README.md +39 -4
  2. package/dist/cli-auth.d.ts +6 -0
  3. package/dist/cli-auth.d.ts.map +1 -1
  4. package/dist/cli-auth.js.map +1 -1
  5. package/dist/cli.js +463 -22
  6. package/dist/cli.js.map +1 -1
  7. package/dist/core/challenge-detection.d.ts.map +1 -1
  8. package/dist/core/challenge-detection.js +39 -6
  9. package/dist/core/challenge-detection.js.map +1 -1
  10. package/dist/core/extract-listings.d.ts.map +1 -1
  11. package/dist/core/extract-listings.js +167 -36
  12. package/dist/core/extract-listings.js.map +1 -1
  13. package/dist/core/fetcher.d.ts +6 -0
  14. package/dist/core/fetcher.d.ts.map +1 -1
  15. package/dist/core/fetcher.js +147 -11
  16. package/dist/core/fetcher.js.map +1 -1
  17. package/dist/core/hotel-search.d.ts +121 -0
  18. package/dist/core/hotel-search.d.ts.map +1 -0
  19. package/dist/core/hotel-search.js +381 -0
  20. package/dist/core/hotel-search.js.map +1 -0
  21. package/dist/core/llm-extract.d.ts +42 -0
  22. package/dist/core/llm-extract.d.ts.map +1 -0
  23. package/dist/core/llm-extract.js +144 -0
  24. package/dist/core/llm-extract.js.map +1 -0
  25. package/dist/core/profiles.d.ts +48 -0
  26. package/dist/core/profiles.d.ts.map +1 -0
  27. package/dist/core/profiles.js +211 -0
  28. package/dist/core/profiles.js.map +1 -0
  29. package/dist/core/schema-extraction.d.ts +67 -0
  30. package/dist/core/schema-extraction.d.ts.map +1 -0
  31. package/dist/core/schema-extraction.js +353 -0
  32. package/dist/core/schema-extraction.js.map +1 -0
  33. package/dist/core/strategies.d.ts +5 -0
  34. package/dist/core/strategies.d.ts.map +1 -1
  35. package/dist/core/strategies.js +9 -2
  36. package/dist/core/strategies.js.map +1 -1
  37. package/dist/index.d.ts.map +1 -1
  38. package/dist/index.js +2 -1
  39. package/dist/index.js.map +1 -1
  40. package/dist/types.d.ts +6 -0
  41. package/dist/types.d.ts.map +1 -1
  42. package/dist/types.js.map +1 -1
  43. package/package.json +1 -1
@@ -0,0 +1,211 @@
1
+ /**
2
+ * WebPeel Profile Management
3
+ *
4
+ * Manages named browser profiles stored in ~/.webpeel/profiles/<name>/
5
+ * Each profile contains:
6
+ * - storage-state.json (Playwright storage state: cookies, localStorage, origins)
7
+ * - metadata.json (name, created, lastUsed, domains, description)
8
+ */
9
+ import { chromium } from 'playwright';
10
+ import { homedir } from 'os';
11
+ import { existsSync, mkdirSync, readFileSync, writeFileSync, rmSync, readdirSync, } from 'fs';
12
+ import path from 'path';
13
+ // ─── Paths ───────────────────────────────────────────────────────────────────
14
const PROFILES_DIR = path.join(homedir(), '.webpeel', 'profiles');

/**
 * Make sure the root directory that holds all profiles exists.
 *
 * Does nothing when the path is already present; otherwise creates it,
 * including any missing parent directories.
 */
function ensureProfilesDir() {
    const alreadyPresent = existsSync(PROFILES_DIR);
    if (alreadyPresent) {
        return;
    }
    mkdirSync(PROFILES_DIR, { recursive: true });
}
20
+ // ─── Name validation ─────────────────────────────────────────────────────────
21
/**
 * Check whether `name` is an acceptable profile name.
 *
 * A valid name is a string of 1 to 64 characters consisting only of ASCII
 * letters, digits and hyphens (no spaces, dots, slashes or other characters).
 *
 * Anything that is not a string is rejected with `false`. Without the type
 * check the regex would coerce its argument, so `null`/`undefined` would pass
 * the pattern as the words "null"/"undefined" and then throw on `.length`, and
 * a one-element array such as `['abc']` would be accepted as valid.
 *
 * @param {unknown} name - Candidate profile name.
 * @returns {boolean} `true` when the name is valid, `false` otherwise.
 */
export function isValidProfileName(name) {
    // The {1,64} quantifier enforces both the non-empty and the max-length rule.
    return typeof name === 'string' && /^[a-zA-Z0-9-]{1,64}$/.test(name);
}
27
+ // ─── Core helpers ─────────────────────────────────────────────────────────────
28
/**
 * Get the directory path for a named profile, or null if it doesn't exist.
 *
 * A profile counts as existing when its directory contains a `metadata.json`.
 * Only direct children of the profiles directory are considered: names that
 * resolve to the profiles directory itself (`''`, `'.'`), to something above
 * it (`'..'`, `'../x'`) or to a nested path (`'a/b'`) yield `null` instead of
 * a path outside the expected location.
 *
 * @param {string} name - Profile name.
 * @returns {string | null} Absolute profile directory, or `null` when there is no such profile.
 */
export function getProfilePath(name) {
    const root = path.resolve(PROFILES_DIR);
    const dir = path.resolve(root, name);
    // Reject anything that is not exactly one level below the profiles directory.
    if (dir === root || path.dirname(dir) !== root) {
        return null;
    }
    // metadata.json can only exist if the directory does, so one check is enough.
    if (existsSync(path.join(dir, 'metadata.json'))) {
        return dir;
    }
    return null;
}
38
/**
 * Load the Playwright storage state (cookies + localStorage) for a named profile.
 *
 * Returns null if the profile or its `storage-state.json` doesn't exist, if the
 * file cannot be read or parsed, or if `name` does not resolve to a direct
 * child of the profiles directory (e.g. `''`, `'..'`, `'a/b'`), so a crafted
 * name cannot make this read a file outside the profiles directory.
 *
 * @param {string} name - Profile name.
 * @returns {object | null} Parsed storage state, or `null` when unavailable.
 */
export function loadStorageState(name) {
    const root = path.resolve(PROFILES_DIR);
    const dir = path.resolve(root, name);
    // Reject anything that is not exactly one level below the profiles directory.
    if (dir === root || path.dirname(dir) !== root) {
        return null;
    }
    const statePath = path.join(dir, 'storage-state.json');
    if (!existsSync(statePath)) {
        return null;
    }
    try {
        return JSON.parse(readFileSync(statePath, 'utf-8'));
    }
    catch {
        // Unreadable or corrupt state is treated the same as a missing one.
        return null;
    }
}
53
/**
 * Update the lastUsed timestamp for a profile.
 *
 * Reads the profile's `metadata.json`, sets `lastUsed` to the current time
 * (ISO 8601) and writes the file back. This is best-effort: a missing profile,
 * an unreadable/corrupt metadata file or a failed write is silently ignored.
 *
 * Names that do not resolve to a direct child of the profiles directory
 * (e.g. `''`, `'..'`, `'a/b'`) are ignored as well, so a crafted name cannot
 * make this rewrite a `metadata.json` outside the profiles directory.
 *
 * @param {string} name - Profile name.
 * @returns {void}
 */
export function touchProfile(name) {
    const root = path.resolve(PROFILES_DIR);
    const dir = path.resolve(root, name);
    // Reject anything that is not exactly one level below the profiles directory.
    if (dir === root || path.dirname(dir) !== root) {
        return;
    }
    const metaPath = path.join(dir, 'metadata.json');
    if (!existsSync(metaPath)) {
        return;
    }
    try {
        const meta = JSON.parse(readFileSync(metaPath, 'utf-8'));
        meta.lastUsed = new Date().toISOString();
        writeFileSync(metaPath, JSON.stringify(meta, null, 2));
    }
    catch {
        /* ignore — updating the timestamp must never break the caller */
    }
}
69
/**
 * List all profiles, sorted by lastUsed descending.
 *
 * Every sub-directory of the profiles directory that contains a parseable
 * `metadata.json` holding a JSON object contributes one entry (the parsed
 * metadata). Directories without metadata, unparseable files and metadata that
 * is not an object (e.g. `null` or an array) are skipped.
 *
 * Entries whose `lastUsed` is missing or not a string sort after all dated
 * entries instead of making the sort throw.
 *
 * @returns {object[]} Profile metadata objects, most recently used first.
 */
export function listProfiles() {
    ensureProfilesDir();
    const profiles = [];
    try {
        const entries = readdirSync(PROFILES_DIR, { withFileTypes: true });
        for (const entry of entries) {
            if (!entry.isDirectory()) {
                continue;
            }
            const metaPath = path.join(PROFILES_DIR, entry.name, 'metadata.json');
            if (!existsSync(metaPath)) {
                continue;
            }
            try {
                const meta = JSON.parse(readFileSync(metaPath, 'utf-8'));
                // Valid JSON is not necessarily a metadata object; anything else counts as corrupt.
                if (meta !== null && typeof meta === 'object' && !Array.isArray(meta)) {
                    profiles.push(meta);
                }
            }
            catch {
                /* skip corrupt profile */
            }
        }
    }
    catch {
        /* ignore read errors */
    }
    // Sort: most recently used first. A missing timestamp compares as '' and therefore lands last.
    const lastUsedOf = (profile) => (typeof profile.lastUsed === 'string' ? profile.lastUsed : '');
    profiles.sort((a, b) => lastUsedOf(b).localeCompare(lastUsedOf(a)));
    return profiles;
}
99
/**
 * Delete a named profile. Returns true if deleted, false if not found.
 *
 * The profile directory is removed recursively, so the target is checked
 * first: `name` must resolve to a direct child of the profiles directory.
 * Without that check an empty name would resolve to the profiles directory
 * itself (wiping every profile) and `'..'` would resolve to its parent.
 * Such names are treated as "not found".
 *
 * @param {string} name - Profile name.
 * @returns {boolean} `true` when the directory was removed, `false` when it
 *   does not exist, is not a valid profile location, or removal failed.
 */
export function deleteProfile(name) {
    const root = path.resolve(PROFILES_DIR);
    const dir = path.resolve(root, name);
    // Never let a recursive delete escape to (or above) the profiles directory.
    if (dir === root || path.dirname(dir) !== root) {
        return false;
    }
    if (!existsSync(dir)) {
        return false;
    }
    try {
        rmSync(dir, { recursive: true, force: true });
        return true;
    }
    catch {
        return false;
    }
}
114
+ // ─── Interactive profile creation ─────────────────────────────────────────────
115
/**
 * Interactively create a new profile:
 * 1. Launches a VISIBLE (headed) Chromium browser
 * 2. User navigates and logs into sites
 * 3. On browser close or Ctrl+C, captures storage state and saves the profile
 *
 * The profile directory is created before the browser starts. If the browser
 * cannot be started, the directory is removed again so that a retry is not
 * rejected with "already exists". If capturing or writing the session fails, a
 * warning is printed, the partial directory is removed and the function still
 * resolves normally.
 *
 * NOTE(review): when the user closes the window, the `disconnected` event fires
 * after the browser is gone, so `context.storageState()` presumably fails on
 * that path and the profile is discarded — confirm against Playwright behavior.
 *
 * @param {string} name - Profile name; must satisfy `isValidProfileName`.
 * @param {string} [description] - Optional free-text description stored in the metadata.
 * @returns {Promise<void>} Resolves once the session was handled and the browser closed.
 * @throws {Error} If the name is invalid, the profile already exists, or the browser fails to start.
 */
export async function createProfile(name, description) {
    if (!isValidProfileName(name)) {
        throw new Error(`Invalid profile name "${name}". Use only letters, numbers, and hyphens (no spaces or special characters).`);
    }
    ensureProfilesDir();
    const profileDir = path.join(PROFILES_DIR, name);
    if (existsSync(profileDir)) {
        throw new Error(`Profile "${name}" already exists. Delete it first with:\n webpeel profile delete ${name}`);
    }
    mkdirSync(profileDir, { recursive: true });
    // Launch headed (visible) Chromium — no user-data-dir so we start fresh
    let browser;
    let context;
    try {
        browser = await chromium.launch({ headless: false });
        context = await browser.newContext();
        const page = await context.newPage();
        await page.goto('about:blank').catch(() => { });
    }
    catch (e) {
        // Startup failed: remove the empty profile directory, otherwise the
        // next attempt would be rejected with "already exists".
        try {
            rmSync(profileDir, { recursive: true, force: true });
        }
        catch {
            /* ignore */
        }
        if (browser) {
            try {
                await browser.close();
            }
            catch {
                /* ignore — browser may already be closed */
            }
        }
        throw e;
    }
    console.log('');
    console.log('╔══════════════════════════════════════════════════════╗');
    console.log(`║ WebPeel Profile Setup: "${name}"`);
    console.log('║ ║');
    console.log('║ Navigate to websites and log in. ║');
    console.log('║ When done, press Ctrl+C or close this window. ║');
    console.log('╚══════════════════════════════════════════════════════╝');
    console.log('');
    // Guards against saving twice (Ctrl+C followed by the resulting disconnect).
    let saved = false;
    const saveAndClose = async () => {
        if (saved) {
            return;
        }
        saved = true;
        console.log('\nCapturing browser session...');
        try {
            const storageState = await context.storageState();
            writeFileSync(path.join(profileDir, 'storage-state.json'), JSON.stringify(storageState, null, 2));
            // Extract unique domains from cookies (strip leading dot)
            const domains = [
                ...new Set((storageState.cookies ?? [])
                    .map((c) => (c.domain ?? '').replace(/^\./, ''))
                    .filter(Boolean)),
            ];
            const now = new Date().toISOString();
            const meta = {
                name,
                created: now,
                lastUsed: now,
                domains,
                ...(description ? { description } : {}),
            };
            writeFileSync(path.join(profileDir, 'metadata.json'), JSON.stringify(meta, null, 2));
            console.log(`✓ Profile "${name}" saved to ${profileDir}`);
            if (domains.length > 0) {
                console.log(` Domains: ${domains.join(', ')}`);
            }
            else {
                console.log(' No login sessions detected (no cookies).');
            }
        }
        catch (e) {
            console.error('Warning: Failed to save storage state:', e instanceof Error ? e.message : String(e));
            // Clean up partial directory
            try {
                rmSync(profileDir, { recursive: true, force: true });
            }
            catch {
                /* ignore */
            }
        }
        try {
            await browser.close();
        }
        catch {
            /* ignore — browser may already be closed */
        }
    };
    // Wait for the browser to disconnect (user closed the window) OR SIGINT (Ctrl+C)
    await new Promise((resolve) => {
        // Handle Ctrl+C gracefully
        const sigintHandler = async () => {
            await saveAndClose();
            resolve();
        };
        process.once('SIGINT', sigintHandler);
        browser.on('disconnected', async () => {
            // Clean up the SIGINT handler if browser closes first
            process.removeListener('SIGINT', sigintHandler);
            await saveAndClose();
            resolve();
        });
    });
}
211
+ //# sourceMappingURL=profiles.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"profiles.js","sourceRoot":"","sources":["../../src/core/profiles.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,OAAO,EAAE,MAAM,IAAI,CAAC;AAC7B,OAAO,EACL,UAAU,EACV,SAAS,EACT,YAAY,EACZ,aAAa,EACb,MAAM,EACN,WAAW,GACZ,MAAM,IAAI,CAAC;AACZ,OAAO,IAAI,MAAM,MAAM,CAAC;AAYxB,gFAAgF;AAEhF,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,UAAU,EAAE,UAAU,CAAC,CAAC;AAElE,SAAS,iBAAiB;IACxB,IAAI,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;QAC9B,SAAS,CAAC,YAAY,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC/C,CAAC;AACH,CAAC;AAED,gFAAgF;AAEhF;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAAC,IAAY;IAC7C,OAAO,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,IAAI,IAAI,CAAC,MAAM,IAAI,EAAE,CAAC;AAC9E,CAAC;AAED,iFAAiF;AAEjF;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,CAAC,CAAC;IAC1C,IAAI,UAAU,CAAC,GAAG,CAAC,IAAI,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,eAAe,CAAC,CAAC,EAAE,CAAC;QACnE,OAAO,GAAG,CAAC;IACb,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC3C,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,EAAE,oBAAoB,CAAC,CAAC;IACtE,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;QAAE,OAAO,IAAI,CAAC;IACxC,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC;IACtD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,IAAY;IACvC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,EAAE,eAAe,CAAC,CAAC;IAChE,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;QAAE,OAAO;IAClC,IAAI,CAAC;QACH,MAAM,IAAI,GAAoB,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC;QAC1E,IAAI,CAAC,QAAQ,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QACzC,aAAa,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IACzD,CAAC;IAAC,MAAM,CAAC;QACP,YAAY;IACd,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY;IAC1B,iBAAiB,EAAE,CAAC;IACpB,MAAM,QAAQ,GAAsB,EAAE,CAAC;IACvC,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,WAAW,CAAC,YAAY,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC
;QACnE,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;YAC5B,IAAI,CAAC,KAAK,CAAC,WAAW,EAAE;gBAAE,SAAS;YACnC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,KAAK,CAAC,IAAI,EAAE,eAAe,CAAC,CAAC;YACtE,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;gBAAE,SAAS;YACpC,IAAI,CAAC;gBACH,MAAM,IAAI,GAAoB,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC;gBAC1E,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACtB,CAAC;YAAC,MAAM,CAAC;gBACP,0BAA0B;YAC5B,CAAC;QACH,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,wBAAwB;IAC1B,CAAC;IACD,iCAAiC;IACjC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC;IAC9D,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,CAAC,CAAC;IAC1C,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;QAAE,OAAO,KAAK,CAAC;IACnC,IAAI,CAAC;QACH,MAAM,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;QAC9C,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED,iFAAiF;AAEjF;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CAAC,IAAY,EAAE,WAAoB;IACpE,IAAI,CAAC,kBAAkB,CAAC,IAAI,CAAC,EAAE,CAAC;QAC9B,MAAM,IAAI,KAAK,CACb,yBAAyB,IAAI,8EAA8E,CAC5G,CAAC;IACJ,CAAC;IAED,iBAAiB,EAAE,CAAC;IAEpB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,CAAC,CAAC;IACjD,IAAI,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CACb,YAAY,IAAI,qEAAqE,IAAI,EAAE,CAC5F,CAAC;IACJ,CAAC;IAED,SAAS,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE3C,wEAAwE;IACxE,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;IAC3D,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC;IAC3C,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;IACrC,MAAM,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;IAE/C,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAChB,OAAO,CAAC,GAAG,CAAC,0DAA0D,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,CAAC,8BAA8B,IAAI,GAAG,CAAC,CAAC;IACnD,OAAO,CAAC,GAAG,CAAC,0DAA0D,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,CAAC,yDAAyD,CAAC,CAAC;IACvE,OAAO,CAAC,GAAG,CAAC,yDAAyD,CAAC,CAAC;IACvE,OAAO,CAAC,G
AAG,CAAC,0DAA0D,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAEhB,IAAI,KAAK,GAAG,KAAK,CAAC;IAElB,MAAM,YAAY,GAAG,KAAK,IAAmB,EAAE;QAC7C,IAAI,KAAK;YAAE,OAAO;QAClB,KAAK,GAAG,IAAI,CAAC;QAEb,OAAO,CAAC,GAAG,CAAC,gCAAgC,CAAC,CAAC;QAE9C,IAAI,CAAC;YACH,MAAM,YAAY,GAAG,MAAM,OAAO,CAAC,YAAY,EAAE,CAAC;YAElD,aAAa,CACX,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,oBAAoB,CAAC,EAC3C,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,IAAI,EAAE,CAAC,CAAC,CACtC,CAAC;YAEF,0DAA0D;YAC1D,MAAM,OAAO,GAAa;gBACxB,GAAG,IAAI,GAAG,CACR,CAAC,YAAY,CAAC,OAAO,IAAI,EAAE,CAAC;qBACzB,GAAG,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;qBACpD,MAAM,CAAC,OAAO,CAAC,CACnB;aACF,CAAC;YAEF,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;YACrC,MAAM,IAAI,GAAoB;gBAC5B,IAAI;gBACJ,OAAO,EAAE,GAAG;gBACZ,QAAQ,EAAE,GAAG;gBACb,OAAO;gBACP,GAAG,CAAC,WAAW,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;aACxC,CAAC;YAEF,aAAa,CACX,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,eAAe,CAAC,EACtC,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,CAC9B,CAAC;YAEF,OAAO,CAAC,GAAG,CAAC,cAAc,IAAI,cAAc,UAAU,EAAE,CAAC,CAAC;YAC1D,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACvB,OAAO,CAAC,GAAG,CAAC,cAAc,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YAClD,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,GAAG,CAAC,4CAA4C,CAAC,CAAC;YAC5D,CAAC;QACH,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CACX,wCAAwC,EACxC,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAC3C,CAAC;YACF,6BAA6B;YAC7B,IAAI,CAAC;gBACH,MAAM,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;YACvD,CAAC;YAAC,MAAM,CAAC;gBACP,YAAY;YACd,CAAC;QACH,CAAC;QAED,IAAI,CAAC;YACH,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;QACxB,CAAC;QAAC,MAAM,CAAC;YACP,4CAA4C;QAC9C,CAAC;IACH,CAAC,CAAC;IAEF,iFAAiF;IACjF,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE;QAClC,OAAO,CAAC,EAAE,CAAC,cAAc,EAAE,KAAK,IAAI,EAAE;YACpC,MAAM,YAAY,EAAE,CAAC;YACrB,OAAO,EAAE,CAAC;QACZ,CAAC,CAAC,CAAC;QAEH,2BAA2B;QAC3B,MAAM,aAAa,GAAG,KAAK,IAAI,EAAE;YAC/B,MAAM,YAAY,EAAE,CAAC;YACrB,OAAO,EAAE,CAAC;
QACZ,CAAC,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;QAEtC,sDAAsD;QACtD,OAAO,CAAC,EAAE,CAAC,cAAc,EAAE,GAAG,EAAE;YAC9B,OAAO,CAAC,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;QAClD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,67 @@
1
+ /**
2
+ * Schema-based extraction using CSS selectors.
3
+ *
4
+ * Each schema defines how to extract listings from a specific domain,
5
+ * inspired by Crawl4AI's JsonCssExtractionStrategy. Unlike generic
6
+ * auto-detection, schemas provide exact selectors for each site's DOM.
7
+ *
8
+ * @module schema-extraction
9
+ */
10
/** Describes how one value is read from a listing element. */
export interface SchemaField {
    /** Key under which the value is stored on each extracted item (e.g., "title", "price", "rating"). */
    name: string;
    /**
     * CSS selector evaluated relative to the element matched by the schema's `baseSelector`.
     * An empty or whitespace-only string targets the base element itself.
     */
    selector: string;
    /**
     * Kind of value to read:
     * - `'text'`: trimmed text content
     * - `'attribute'`: value of the attribute named by `attribute`
     * - `'html'`: inner HTML
     * - `'exists'`: boolean telling whether the selector matched anything
     */
    type:
        | 'text'
        | 'attribute'
        | 'html'
        | 'exists';
    /** Attribute to read when `type` is `'attribute'`; `href` and `src` values are resolved against the base URL. */
    attribute?: string;
    /** Collect every non-empty match into an array instead of returning only the first match. */
    multiple?: boolean;
    /** Optional post-processing applied to a single extracted text or attribute value. */
    transform?:
        | 'trim'
        | 'number'
        | 'stripCurrency';
}
24
/** Site-specific recipe for extracting listing items from a page. */
export interface ExtractionSchema {
    /** Human-readable label for the schema (e.g., "Booking.com Hotel Search"). */
    name: string;
    /** Version string of this schema definition. */
    version: string;
    /** Hostnames the schema applies to (e.g., ["booking.com", "www.booking.com"]); compared case-insensitively. */
    domains: Array<string>;
    /**
     * Optional regular-expression source strings tested against the full URL.
     * When present and non-empty, at least one must match in addition to the domain;
     * when omitted or empty, a domain match alone selects the schema.
     */
    urlPatterns?: Array<string>;
    /** CSS selector matching each listing item on the page. */
    baseSelector: string;
    /** Values to read from every matched item. */
    fields: Array<SchemaField>;
    /** Optional pagination hints. */
    pagination?: {
        nextSelector?: string;
        pageParam?: string;
    };
}
43
/**
 * One extracted listing: each schema field name maps to the value read for it.
 * Fields that produced no value are left out of the item.
 */
export interface ExtractedItem {
    [fieldName: string]: undefined | boolean | number | string | Array<string>;
}
47
/**
 * Return the schemas that ship with the package, in priority order.
 *
 * @returns A new array holding the bundled schema objects.
 */
export declare function loadBundledSchemas(): Array<ExtractionSchema>;
51
/**
 * Look up the bundled schema that applies to a URL.
 *
 * A schema is selected when the URL's hostname matches one of its domains and,
 * if the schema defines URL patterns (regex), at least one of them matches.
 *
 * @param url - Absolute URL of the page to extract from.
 * @returns The first matching schema, or `null` when none matches or the URL cannot be parsed.
 */
export declare function findSchemaForUrl(
    url: string,
): ExtractionSchema | null;
58
/**
 * Run a schema's CSS selectors over an HTML document and collect the listings.
 *
 * @param html - Raw HTML string to parse
 * @param schema - Extraction schema providing the item selector and field definitions
 * @param baseUrl - Optional base URL used to resolve relative `href`/`src` values
 * @returns The extracted items; an empty array when nothing matched or `html` is blank
 */
export declare function extractWithSchema(
    html: string,
    schema: ExtractionSchema,
    baseUrl?: string,
): Array<ExtractedItem>;
67
+ //# sourceMappingURL=schema-extraction.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"schema-extraction.d.ts","sourceRoot":"","sources":["../../src/core/schema-extraction.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AASH,MAAM,WAAW,WAAW;IAC1B,8DAA8D;IAC9D,IAAI,EAAE,MAAM,CAAC;IACb,2FAA2F;IAC3F,QAAQ,EAAE,MAAM,CAAC;IACjB,sBAAsB;IACtB,IAAI,EAAE,MAAM,GAAG,WAAW,GAAG,MAAM,GAAG,QAAQ,CAAC;IAC/C,oDAAoD;IACpD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,iEAAiE;IACjE,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,mDAAmD;IACnD,SAAS,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,eAAe,CAAC;CACjD;AAED,MAAM,WAAW,gBAAgB;IAC/B,oEAAoE;IACpE,IAAI,EAAE,MAAM,CAAC;IACb,4BAA4B;IAC5B,OAAO,EAAE,MAAM,CAAC;IAChB,kEAAkE;IAClE,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,4EAA4E;IAC5E,WAAW,CAAC,EAAE,MAAM,EAAE,CAAC;IACvB,yCAAyC;IACzC,YAAY,EAAE,MAAM,CAAC;IACrB,uCAAuC;IACvC,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,iCAAiC;IACjC,UAAU,CAAC,EAAE;QACX,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,CAAC;CACH;AAED,oEAAoE;AACpE,MAAM,WAAW,aAAa;IAC5B,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,GAAG,MAAM,EAAE,GAAG,OAAO,GAAG,MAAM,GAAG,SAAS,CAAC;CACjE;AA4PD;;GAEG;AACH,wBAAgB,kBAAkB,IAAI,gBAAgB,EAAE,CAEvD;AAED;;;;;GAKG;AACH,wBAAgB,gBAAgB,CAAC,GAAG,EAAE,MAAM,GAAG,gBAAgB,GAAG,IAAI,CAsCrE;AAED;;;;;;;GAOG;AACH,wBAAgB,iBAAiB,CAC/B,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,gBAAgB,EACxB,OAAO,CAAC,EAAE,MAAM,GACf,aAAa,EAAE,CA0DjB"}
@@ -0,0 +1,353 @@
1
+ /**
2
+ * Schema-based extraction using CSS selectors.
3
+ *
4
+ * Each schema defines how to extract listings from a specific domain,
5
+ * inspired by Crawl4AI's JsonCssExtractionStrategy. Unlike generic
6
+ * auto-detection, schemas provide exact selectors for each site's DOM.
7
+ *
8
+ * @module schema-extraction
9
+ */
10
+ import { load } from 'cheerio';
11
+ /* ------------------------------------------------------------------ */
12
+ /* Bundled schemas (hardcoded to avoid JSON import complications) */
13
+ /* ------------------------------------------------------------------ */
14
/* Field builders — each returns a plain field literal with the same keys, in the same order, as a hand-written one. */
const textField = (name, selector, transform) => (transform === undefined
    ? { name, selector, type: 'text' }
    : { name, selector, type: 'text', transform });
const attrField = (name, selector, attribute) => ({ name, selector, type: 'attribute', attribute });
const existsField = (name, selector) => ({ name, selector, type: 'exists' });

/** Booking.com hotel search result cards. */
const BOOKING_COM_SCHEMA = {
    name: 'Booking.com Hotel Search',
    version: '1.0',
    domains: ['booking.com', 'www.booking.com'],
    urlPatterns: ['searchresults'],
    baseSelector: "[data-testid='property-card']",
    fields: [
        textField('title', "[data-testid='title'], .sr-hotel__name, h3 a"),
        textField('price', "[data-testid='price-and-discounted-price'], .bui-price-display__value, [data-testid='price-for-x-nights']", 'trim'),
        textField('rating', "[data-testid='review-score'] div:first-child, .bui-review-score__badge"),
        textField('reviewCount', "[data-testid='review-score'] div:nth-child(2) div:nth-child(2), .bui-review-score__text"),
        textField('location', "[data-testid='address'], .sr_card_address_line"),
        attrField('link', "a[data-testid='title-link'], h3 a, a.hotel_name_link", 'href'),
        attrField('image', "img[data-testid='image'], img.hotel_image", 'src'),
        textField('stars', "[data-testid='rating-stars'] span, .bui-star-rating .bui-star-rating__star"),
    ],
};

/** Amazon product search results. */
const AMAZON_COM_SCHEMA = {
    name: 'Amazon Product Search',
    version: '1.0',
    domains: ['amazon.com', 'www.amazon.com', 'amazon.co.uk', 'amazon.de', 'amazon.fr', 'amazon.ca'],
    urlPatterns: ['/s\\?', '/s/'],
    baseSelector: "[data-component-type='s-search-result']",
    fields: [
        textField('title', 'h2 a span, h2 span a span'),
        textField('price', '.a-price .a-offscreen'),
        textField('originalPrice', '.a-price.a-text-price .a-offscreen'),
        textField('rating', '.a-icon-star-small .a-icon-alt, .a-icon-star-mini .a-icon-alt'),
        textField('reviewCount', "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span, .a-size-base.s-underline-text"),
        attrField('link', 'h2 a', 'href'),
        attrField('image', '.s-image', 'src'),
        existsField('sponsored', '.puis-sponsored-label-text'),
        // Empty selector: the ASIN is read from the result element itself.
        attrField('asin', '', 'data-asin'),
    ],
};

/** eBay search results. */
const EBAY_COM_SCHEMA = {
    name: 'eBay Search Results',
    version: '1.0',
    domains: ['ebay.com', 'www.ebay.com'],
    urlPatterns: ['/sch/'],
    baseSelector: '.s-item, [data-viewport]',
    fields: [
        textField('title', '.s-item__title span, .s-item__title'),
        textField('price', '.s-item__price'),
        attrField('link', '.s-item__link, a.s-item__link', 'href'),
        attrField('image', '.s-item__image-wrapper img, .s-item__image img', 'src'),
        textField('condition', '.SECONDARY_INFO'),
        textField('shipping', '.s-item__shipping, .s-item__freeXDays'),
        textField('seller', '.s-item__seller-info-text'),
    ],
};

/** Yelp business search results. */
const YELP_COM_SCHEMA = {
    name: 'Yelp Business Search',
    version: '1.0',
    domains: ['yelp.com', 'www.yelp.com'],
    urlPatterns: ['/search'],
    baseSelector: "[data-testid='serp-ia-card'], li.border-color--default",
    fields: [
        textField('title', "a[href*='/biz/'] span, h3 a span"),
        attrField('rating', "[aria-label*='star rating'], .i-stars", 'aria-label'),
        textField('reviewCount', ".reviewCount, span[class*='css-']"),
        textField('price', '.priceRange, span.priceRange'),
        textField('category', ".priceCategory span, p[class*='css-'] a"),
        attrField('link', "a[href*='/biz/']", 'href'),
        textField('address', "address, span[class*='css-']"),
    ],
};

/** Walmart product search results. */
const WALMART_COM_SCHEMA = {
    name: 'Walmart Product Search',
    version: '1.0',
    domains: ['walmart.com', 'www.walmart.com'],
    urlPatterns: ['/search'],
    baseSelector: "[data-testid='list-view'] > div, [data-item-id]",
    fields: [
        textField('title', "a[link-identifier] span, [data-automation-id='product-title']"),
        textField('price', "[data-automation-id='product-price'] .f2, [itemprop='price']"),
        textField('rating', "[data-testid='product-ratings'] .w_iUH7, .stars-reviews-count"),
        attrField('link', "a[link-identifier], a[href*='/ip/']", 'href'),
        attrField('image', "img[data-testid='productTileImage'], img[loading]", 'src'),
        textField('seller', "[data-automation-id='fulfillment-badge']"),
    ],
};

/** Hacker News story rows; no URL patterns, so the domain alone selects it. */
const HACKERNEWS_SCHEMA = {
    name: 'Hacker News',
    version: '1.0',
    domains: ['news.ycombinator.com'],
    baseSelector: 'tr.athing',
    fields: [
        textField('title', '.titleline a'),
        attrField('link', '.titleline a', 'href'),
        textField('rank', '.rank'),
        textField('site', '.sitestr'),
    ],
};

/** Expedia hotel search results. */
const EXPEDIA_COM_SCHEMA = {
    name: 'Expedia Hotel Search',
    version: '1.0',
    domains: ['expedia.com', 'www.expedia.com'],
    urlPatterns: ['Hotel-Search', 'hotel-search'],
    baseSelector: "[data-stid='property-listing'], li.uitk-spacing[class*='uitk-spacing'], [data-stid='lodging-card-responsive']",
    fields: [
        textField('title', "[data-stid='content-hotel-title'], .uitk-heading-5, .uitk-heading-6, h3[class*='uitk-heading']"),
        textField('price', "[data-stid='price-summary'] .uitk-type-500, [data-stid='price-summary-message-total'], .uitk-type-500", 'trim'),
        textField('rating', "[data-stid='star-rating-msg'], .uitk-badge-base, [aria-label*='out of']"),
        textField('reviewCount', "[data-stid='review-info-text'], .uitk-type-200"),
        textField('location', "[data-stid='location-info'], [data-stid='neighborhood-name']"),
        attrField('link', "a[data-stid='open-hotel-information'], a[href*='/h/'], a.uitk-card-link", 'href'),
        attrField('image', "img[data-stid='image'], .uitk-image-media img", 'src'),
    ],
};

/** Every bundled schema; lookups walk this list in order and stop at the first match. */
const BUNDLED_SCHEMAS = [
    BOOKING_COM_SCHEMA,
    AMAZON_COM_SCHEMA,
    EBAY_COM_SCHEMA,
    YELP_COM_SCHEMA,
    WALMART_COM_SCHEMA,
    HACKERNEWS_SCHEMA,
    EXPEDIA_COM_SCHEMA,
];
134
+ /* ------------------------------------------------------------------ */
135
+ /* Helpers */
136
+ /* ------------------------------------------------------------------ */
137
/**
 * Apply a transform to an extracted string value.
 *
 * - `'trim'`: strips surrounding whitespace.
 * - `'number'`: parses the FIRST number found in the text; thousands separators
 *   written as commas are removed ("$1,234.56" → 1234.56). Later numbers are
 *   ignored, so "4.5 out of 5 stars" yields 4.5 rather than the digits of both
 *   numbers glued together. When the text contains no number, the original
 *   string is returned unchanged.
 * - `'stripCurrency'`: keeps only digits, dots and commas.
 * - no transform / unknown transform: the value is returned as-is.
 *
 * @param {string} value - Extracted text.
 * @param {string} [transform] - Transform name.
 * @returns {string | number} Transformed value (a number only for a successful `'number'` transform).
 */
function applyTransform(value, transform) {
    if (!transform) {
        return value;
    }
    switch (transform) {
        case 'trim':
            return value.trim();
        case 'number': {
            // First numeric token: digits with optional ",ddd" groups and an optional fraction, or a bare ".5".
            const match = /\d+(?:,\d{3})*(?:\.\d+)?|\.\d+/.exec(value);
            if (match === null) {
                return value;
            }
            const num = Number.parseFloat(match[0].replace(/,/g, ''));
            return Number.isNaN(num) ? value : num;
        }
        case 'stripCurrency':
            return value.replace(/[^\d.,]/g, '').trim();
        default:
            return value;
    }
}
156
/**
 * Resolve a potentially relative URL against a base URL.
 *
 * `data:` and `javascript:` URLs are dropped (returns `undefined`). The scheme
 * check is case-insensitive and ignores leading whitespace/control characters
 * and embedded tabs/newlines, so variants such as `JavaScript:…` or
 * ` javascript:…` are dropped as well.
 *
 * @param {string | undefined} href - Raw attribute value.
 * @param {string} [baseUrl] - Base URL for resolving relative references.
 * @returns {string | undefined} The absolute URL; the unchanged `href` when no
 *   base URL is given or resolution fails; `undefined` for empty input or a
 *   dropped scheme.
 */
function resolveUrl(href, baseUrl) {
    if (!href) {
        return undefined;
    }
    // Normalize a copy the way a URL parser would before reading the scheme.
    const probe = href
        .replace(/[\t\n\r]/g, '')
        .replace(/^[\u0000-\u0020]+/, '')
        .toLowerCase();
    if (probe.startsWith('data:') || probe.startsWith('javascript:')) {
        return undefined;
    }
    if (!baseUrl) {
        return href;
    }
    try {
        return new URL(href, baseUrl).href;
    }
    catch {
        // Unparseable href/base combination: hand back the raw value.
        return href;
    }
}
173
/**
 * Read the value of one schema field from a listing element.
 *
 * With `field.multiple` (and a non-empty selector) every match is visited and
 * the non-empty values are returned as an array, or `undefined` when there are
 * none; transforms are not applied in this mode. Otherwise only the first
 * match (or the base element itself for an empty selector) is read:
 * - `exists`: boolean (always `true` for the base element)
 * - `text`: trimmed text, transformed; `undefined` when missing or blank
 * - `attribute`: attribute value, transformed; `href`/`src` are resolved against `baseUrl`
 * - `html`: inner HTML
 *
 * @param $ - Cheerio root function used to wrap matched nodes.
 * @param $el - Cheerio selection for the listing element.
 * @param field - Field definition from the schema.
 * @param baseUrl - Optional base URL for resolving links.
 * @returns The extracted value, or `undefined` when nothing was found.
 */
function extractFieldValue($, $el, field, baseUrl) {
    const { selector, type, attribute, transform } = field;
    // A blank selector means "the listing element itself".
    const targetsBase = selector.trim() === '';
    const isUrlAttribute = attribute === 'href' || attribute === 'src';

    if (field.multiple && !targetsBase) {
        const collected = [];
        $el.find(selector).each((_, node) => {
            const $node = $(node);
            let value;
            if (type === 'text') {
                value = $node.text().trim();
            }
            else if (type === 'attribute') {
                value = attribute ? ($node.attr(attribute) ?? undefined) : undefined;
                if (isUrlAttribute) {
                    value = resolveUrl(value, baseUrl);
                }
            }
            else if (type === 'html') {
                value = $node.html() ?? undefined;
            }
            // 'exists' has no per-match value, so it never contributes here.
            if (value !== undefined && value !== '') {
                collected.push(value);
            }
        });
        return collected.length > 0 ? collected : undefined;
    }

    if (type === 'exists') {
        return targetsBase ? true : $el.find(selector).length > 0;
    }

    const $target = targetsBase ? $el : $el.find(selector).first();
    const nothingMatched = !targetsBase && $target.length === 0;

    if (type === 'text') {
        if (nothingMatched) {
            return undefined;
        }
        const text = $target.text().trim();
        return text === '' ? undefined : applyTransform(text, transform);
    }

    if (type === 'attribute') {
        if (!attribute) {
            return undefined;
        }
        const raw = $target.attr(attribute) ?? undefined;
        if (raw === undefined) {
            return undefined;
        }
        if (!isUrlAttribute) {
            return applyTransform(raw, transform);
        }
        const absolute = resolveUrl(raw, baseUrl);
        return absolute ? applyTransform(absolute, transform) : undefined;
    }

    if (type === 'html') {
        if (nothingMatched) {
            return undefined;
        }
        return $target.html() ?? undefined;
    }

    return undefined;
}
244
+ /* ------------------------------------------------------------------ */
245
+ /* Public API */
246
+ /* ------------------------------------------------------------------ */
247
/**
 * Return the schemas that ship with the package.
 *
 * The result is a fresh array, so callers may reorder or trim it without
 * affecting the internal list; the schema objects themselves are shared.
 */
export function loadBundledSchemas() {
    return Array.from(BUNDLED_SCHEMAS);
}
253
/**
 * Find a matching schema for a given URL.
 *
 * Matches by domain first, then optionally by URL patterns (regex).
 * Returns the first matching schema or null.
 *
 * A schema domain matches the URL's hostname (case-insensitively) when
 * - they are equal,
 * - the hostname is a subdomain of the schema domain, or
 * - the schema domain is the `www.` form of the hostname (so a schema listing
 *   only `www.example.com` also covers `example.com`).
 * Arbitrary parent domains do NOT match: `ycombinator.com` is not covered by a
 * schema for `news.ycombinator.com`.
 *
 * Patterns are tested against the full URL string; a pattern that is not a
 * valid regular expression is treated as non-matching.
 *
 * @param {string} url - Absolute URL of the page.
 * @returns {object | null} The matching schema, or `null` when none matches or the URL cannot be parsed.
 */
export function findSchemaForUrl(url) {
    let hostname;
    try {
        hostname = new URL(url).hostname.toLowerCase();
    }
    catch {
        return null;
    }
    for (const schema of BUNDLED_SCHEMAS) {
        // Check domain match
        const domainMatch = schema.domains.some((domain) => {
            const d = domain.toLowerCase();
            return hostname === d || hostname.endsWith(`.${d}`) || d === `www.${hostname}`;
        });
        if (!domainMatch) {
            continue;
        }
        // If no urlPatterns, domain match is enough
        if (!schema.urlPatterns || schema.urlPatterns.length === 0) {
            return schema;
        }
        // Check URL patterns against the full URL
        const patternMatch = schema.urlPatterns.some((pattern) => {
            try {
                return new RegExp(pattern).test(url);
            }
            catch {
                return false;
            }
        });
        if (patternMatch) {
            return schema;
        }
    }
    return null;
}
295
/**
 * Extract listings from HTML using a schema's CSS selectors.
 *
 * For every element matching `schema.baseSelector`, each schema field is read
 * and stored under its name (fields without a value are omitted). If the schema
 * has a `title` or `name` field, that value is cleaned of common junk
 * ("Opens in new window", "New Listing"/"Sponsored"/"Ad" prefixes), items whose
 * title is missing or blank are dropped, and items with an identical
 * title + price pair are de-duplicated (first occurrence wins).
 *
 * The "Ad" prefix is only removed when it stands alone, i.e. is not directly
 * followed by a letter, digit, underscore or hyphen, so titles such as
 * "Adidas Shoes" or "Ad-Free Reader" are left intact.
 *
 * @param html - Raw HTML string to parse
 * @param schema - Extraction schema to use
 * @param baseUrl - Optional base URL for resolving relative links
 * @returns Array of extracted items (may be empty)
 */
export function extractWithSchema(html, schema, baseUrl) {
    if (!html || html.trim().length === 0) {
        return [];
    }
    const $ = load(html);
    const items = [];
    // Find the title/name field to use for cleaning and for filtering empty items
    const titleFieldName = schema.fields.find((f) => f.name === 'title' || f.name === 'name')?.name;
    $(schema.baseSelector).each((_, el) => {
        const $el = $(el);
        const item = {};
        for (const field of schema.fields) {
            const value = extractFieldValue($, $el, field, baseUrl);
            if (value !== undefined) {
                item[field.name] = value;
            }
        }
        if (titleFieldName !== undefined) {
            if (typeof item[titleFieldName] === 'string') {
                item[titleFieldName] = item[titleFieldName]
                    // Strip "Opens in (a) new window/tab" variants
                    .replace(/\s*Opens?\s+in\s+(?:a\s+)?new\s+(?:window|tab)(?:\s+or\s+(?:window|tab))?/gi, '')
                    // Strip "New Listing", "Sponsored", "Ad" prefixes; the lookahead keeps
                    // words that merely start with "Ad" (Adidas, Adapter, Ad-Free) untouched.
                    .replace(/^(?:New\s+Listing|Sponsored|Ad(?![\p{L}\p{N}_-]))\s*[-–—:·]?\s*/iu, '')
                    .trim();
            }
            // Skip items with no title/name (likely empty/phantom elements)
            const titleVal = item[titleFieldName];
            if (!titleVal || (typeof titleVal === 'string' && titleVal.trim() === '')) {
                return; // skip
            }
        }
        // Skip completely empty items
        if (Object.keys(item).length === 0) {
            return;
        }
        items.push(item);
    });
    // Deduplicate: remove items with identical title + price (common with nested selectors)
    if (titleFieldName) {
        const seen = new Set();
        return items.filter((item) => {
            const key = `${String(item[titleFieldName] ?? '')}|${String(item.price ?? '')}`;
            if (seen.has(key)) {
                return false;
            }
            seen.add(key);
            return true;
        });
    }
    return items;
}
353
+ //# sourceMappingURL=schema-extraction.js.map