webpeel 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/README.md +39 -4
  2. package/dist/cli-auth.d.ts +6 -0
  3. package/dist/cli-auth.d.ts.map +1 -1
  4. package/dist/cli-auth.js.map +1 -1
  5. package/dist/cli.js +506 -23
  6. package/dist/cli.js.map +1 -1
  7. package/dist/core/challenge-detection.d.ts.map +1 -1
  8. package/dist/core/challenge-detection.js +39 -6
  9. package/dist/core/challenge-detection.js.map +1 -1
  10. package/dist/core/extract-listings.d.ts.map +1 -1
  11. package/dist/core/extract-listings.js +167 -36
  12. package/dist/core/extract-listings.js.map +1 -1
  13. package/dist/core/fetcher.d.ts +14 -1
  14. package/dist/core/fetcher.d.ts.map +1 -1
  15. package/dist/core/fetcher.js +176 -14
  16. package/dist/core/fetcher.js.map +1 -1
  17. package/dist/core/hotel-search.d.ts +123 -0
  18. package/dist/core/hotel-search.d.ts.map +1 -0
  19. package/dist/core/hotel-search.js +383 -0
  20. package/dist/core/hotel-search.js.map +1 -0
  21. package/dist/core/llm-extract.d.ts +56 -0
  22. package/dist/core/llm-extract.d.ts.map +1 -0
  23. package/dist/core/llm-extract.js +264 -0
  24. package/dist/core/llm-extract.js.map +1 -0
  25. package/dist/core/profiles.d.ts +48 -0
  26. package/dist/core/profiles.d.ts.map +1 -0
  27. package/dist/core/profiles.js +211 -0
  28. package/dist/core/profiles.js.map +1 -0
  29. package/dist/core/schema-extraction.d.ts +67 -0
  30. package/dist/core/schema-extraction.d.ts.map +1 -0
  31. package/dist/core/schema-extraction.js +353 -0
  32. package/dist/core/schema-extraction.js.map +1 -0
  33. package/dist/core/strategies.d.ts +11 -0
  34. package/dist/core/strategies.d.ts.map +1 -1
  35. package/dist/core/strategies.js +17 -5
  36. package/dist/core/strategies.js.map +1 -1
  37. package/dist/index.d.ts.map +1 -1
  38. package/dist/index.js +3 -1
  39. package/dist/index.js.map +1 -1
  40. package/dist/mcp/server.js +47 -3
  41. package/dist/mcp/server.js.map +1 -1
  42. package/dist/types.d.ts +16 -0
  43. package/dist/types.d.ts.map +1 -1
  44. package/dist/types.js.map +1 -1
  45. package/package.json +1 -1
@@ -0,0 +1,264 @@
1
+ /**
2
+ * LLM-based extraction: sends markdown/text content to an LLM
3
+ * with instructions to extract structured data.
4
+ *
5
+ * Supports OpenAI-compatible APIs (OpenAI, Anthropic via proxy, local models).
6
+ */
7
+ // Estimated USD price per 1M tokens for known models, stored as [inputRate, outputRate]; estimateCost() matches model names against these keys by prefix.
8
+ const MODEL_COSTS = {
9
+ 'gpt-4o-mini': [0.15, 0.60],
10
+ 'gpt-4o': [2.50, 10.0],
11
+ };
12
// System prompt used by extractWithLLM() when no schema is supplied: asks for a JSON array of item objects.
+ const GENERIC_SYSTEM_PROMPT = `You are a data extraction assistant. Extract structured data from the provided web content.
13
+ Return a JSON array of objects. Each object represents one item/listing found on the page.
14
+ Always include these fields when available: title, price, link, rating, description, image.
15
+ If the user provides additional instructions, follow them.
16
+ Return ONLY valid JSON — no markdown, no explanation, just the array.`;
// System prompt used by extractWithLLM() when a schema is supplied: asks for one JSON object matching that schema.
17
+ const SCHEMA_SYSTEM_PROMPT = `You are a data extraction assistant. Extract structured data from the web content below.
18
+ Return a JSON object that EXACTLY matches the provided schema structure.
19
+ Fill in the values from the page content. Use null for fields you can't find.
20
+ Return ONLY valid JSON matching the schema — no markdown, no explanation.`;
21
/**
 * Detect whether `schema` is a "full" JSON Schema, i.e. an object whose
 * `type` is `"object"` and whose `properties` is itself a (non-null) object.
 *
 * Anything else — `null`, `undefined`, primitives, arrays, or a plain example
 * object — is reported as not being a full schema instead of throwing.
 *
 * @param schema - Candidate schema or example value.
 * @returns `true` only for object schemas that declare `properties`.
 */
export function isFullJsonSchema(schema) {
    // Guard first: property access on null/undefined would throw a TypeError.
    if (schema === null || typeof schema !== 'object') {
        return false;
    }
    const properties = schema['properties'];
    // `typeof null` is 'object', so null must be excluded explicitly.
    return schema['type'] === 'object' && properties !== null && typeof properties === 'object';
}
28
/**
 * Turn an example value into a JSON Schema describing its shape.
 *
 * Mapping rules:
 *  - strings, `null`, `undefined` and any unrecognised value → `{ type: "string" }`
 *  - numbers → `{ type: "integer" }` for whole numbers (including 0), otherwise `{ type: "number" }`
 *  - booleans → `{ type: "boolean" }`
 *  - arrays → `{ type: "array", items }`, where `items` is derived from the first
 *    element only (`{}` for an empty array)
 *  - objects → `{ type: "object", properties }`, converted recursively
 *
 * @param example - Example value (typically an object or an array of objects).
 * @returns The derived schema.
 */
export function convertSimpleToJsonSchema(example) {
    return buildSchemaFromValue(example);
}
/**
 * Recursive worker behind convertSimpleToJsonSchema().
 */
function buildSchemaFromValue(value) {
    // Arrays are checked before the typeof dispatch because typeof [] is 'object'.
    if (Array.isArray(value)) {
        const items = value.length > 0 ? buildSchemaFromValue(value[0]) : {};
        return { type: 'array', items };
    }
    switch (typeof value) {
        case 'number':
            return { type: Number.isInteger(value) ? 'integer' : 'number' };
        case 'boolean':
            return { type: 'boolean' };
        case 'object': {
            if (value === null) {
                return { type: 'string' };
            }
            const properties = Object.entries(value).reduce((acc, [key, val]) => {
                acc[key] = buildSchemaFromValue(val);
                return acc;
            }, {});
            return { type: 'object', properties };
        }
        default:
            // Strings, undefined and anything exotic fall back to string.
            return { type: 'string' };
    }
}
73
/**
 * Build the user message sent to the LLM from the page content plus an
 * optional schema and an optional free-text instruction.
 *
 * Content longer than 100,000 characters is truncated to its first 100,000
 * characters so the limit and the amount kept agree (previously content just
 * over the limit was cut to half of it).
 *
 * @param content - Page content (markdown/text) to extract from.
 * @param instruction - Optional extra instruction appended at the end.
 * @param schema - Optional schema, embedded as pretty-printed JSON.
 * @returns The assembled message text.
 */
export function buildUserMessage(content, instruction, schema) {
    const MAX_CONTENT_CHARS = 100_000;
    const truncated = content.length > MAX_CONTENT_CHARS ? content.slice(0, MAX_CONTENT_CHARS) : content;
    let msg = `Here is the web content to extract data from:\n\n${truncated}`;
    if (schema) {
        msg += `\n\nExtract data matching this schema: ${JSON.stringify(schema, null, 2)}`;
    }
    if (instruction) {
        msg += `\n\nAdditional instruction: ${instruction}`;
    }
    return msg;
}
88
/**
 * Calculate the estimated cost in USD for a model and its token counts.
 *
 * The model name is matched against the MODEL_COSTS keys by prefix, so dated
 * variants (e.g. a `-2024-11-20` suffix) resolve to their base entry. When
 * several keys are prefixes of the model name, the longest one wins, so the
 * result does not depend on the order in which MODEL_COSTS lists its keys.
 *
 * @param model - Model identifier as reported by or sent to the API.
 * @param inputTokens - Number of prompt tokens.
 * @param outputTokens - Number of completion tokens.
 * @returns Estimated cost in USD, or `undefined` if the model is unknown.
 */
export function estimateCost(model, inputTokens, outputTokens) {
    let rates;
    let matchedLength = -1;
    for (const [prefix, modelRates] of Object.entries(MODEL_COSTS)) {
        // Prefer the most specific (longest) key that prefixes the model name.
        if (model.startsWith(prefix) && prefix.length > matchedLength) {
            rates = modelRates;
            matchedLength = prefix.length;
        }
    }
    if (!rates)
        return undefined;
    const [inputRate, outputRate] = rates;
    return (inputTokens / 1_000_000) * inputRate + (outputTokens / 1_000_000) * outputRate;
}
99
/**
 * Parse the LLM response text into an items array.
 *
 * Accepted shapes:
 *  - a bare array `[...]` → returned as-is
 *  - an object wrapping an array under `items`, `data` or `results` → that array
 *  - any other object → wrapped in a one-element array
 *  - any other JSON value (number, string, null, …) → `[]`
 *
 * If the text is not valid JSON as a whole (LLMs sometimes add a preamble or
 * code fences), the outermost embedded JSON array/object is extracted instead.
 *
 * @param text - Raw response text from the model.
 * @param _schema - Unused; kept for interface compatibility.
 * @returns The extracted items.
 * @throws Error if no JSON value can be parsed out of the text.
 */
export function parseItems(text, _schema) {
    const trimmed = text.trim();
    let parsed;
    try {
        parsed = JSON.parse(trimmed);
    }
    catch {
        parsed = extractEmbeddedJson(trimmed);
        if (parsed === undefined) {
            throw new Error(`Failed to parse LLM response as JSON: ${trimmed.slice(0, 200)}`);
        }
    }
    if (Array.isArray(parsed)) {
        return parsed;
    }
    if (parsed && typeof parsed === 'object') {
        // Unwrap the common { items | data | results: [...] } envelopes.
        for (const key of ['items', 'data', 'results']) {
            if (Array.isArray(parsed[key]))
                return parsed[key];
        }
        // Single object — wrap in array
        return [parsed];
    }
    return [];
}
/**
 * Find a JSON array or object embedded in surrounding prose.
 * Candidates are tried in order of where they start, so an object that
 * contains an array is preferred over its nested array, and a candidate that
 * fails to parse does not prevent the other one from being tried.
 * Returns `undefined` when nothing parses.
 */
function extractEmbeddedJson(text) {
    const candidates = [];
    for (const pattern of [/\[[\s\S]*\]/, /\{[\s\S]*\}/]) {
        const match = pattern.exec(text);
        if (match) {
            candidates.push({ start: match.index, json: match[0] });
        }
    }
    candidates.sort((a, b) => a.start - b.start);
    for (const candidate of candidates) {
        try {
            return JSON.parse(candidate.json);
        }
        catch {
            /* try the next candidate */
        }
    }
    return undefined;
}
149
/**
 * Check that the first parsed item roughly matches the expected schema shape.
 *
 * Only top-level keys of the first item are compared. A mismatch is reported
 * with `console.warn`; nothing is thrown and the result is never modified.
 *
 * @param result - Items returned by parseItems().
 * @param schema - Full JSON Schema, array schema, or simple example value.
 */
function validateSchemaShape(result, schema) {
    if (result.length === 0)
        return;
    const firstItem = result[0];
    if (!firstItem)
        return;
    const expectedKeys = expectedItemKeys(schema);
    if (expectedKeys.length === 0)
        return;
    const actualKeys = Object.keys(firstItem);
    const missingKeys = expectedKeys.filter(k => !actualKeys.includes(k));
    if (missingKeys.length > 0) {
        console.warn(`[webpeel] Schema validation warning: response missing expected keys: ${missingKeys.join(', ')}`);
    }
}
/**
 * Work out which top-level keys an item is expected to carry.
 *  - full object schema → keys of `properties`
 *  - array schema (`type: "array"`) → keys of `items.properties`, if `items` is a full object schema
 *  - example array → keys expected from its first element
 *  - plain example object → its own keys
 */
function expectedItemKeys(schema) {
    if (schema === null || typeof schema !== 'object') {
        return [];
    }
    if (Array.isArray(schema)) {
        return schema.length > 0 ? expectedItemKeys(schema[0]) : [];
    }
    if (isFullJsonSchema(schema)) {
        return Object.keys(schema['properties'] || {});
    }
    const items = schema['items'];
    if (schema['type'] === 'array' && items !== null && typeof items === 'object') {
        // Array examples are converted to { type: 'array', items } before validation;
        // comparing against the keys 'type'/'items' themselves would always warn.
        return isFullJsonSchema(items) ? Object.keys(items['properties'] || {}) : [];
    }
    return Object.keys(schema);
}
179
/**
 * Build the `response_format` parameter for the chat-completions request.
 *
 * - No schema, or a schema that is not a full object schema → `{ type: 'json_object' }`.
 * - Full object schema → `{ type: 'json_schema', ... }`. `strict` is enabled only
 *   when the schema satisfies the strict-mode preconditions checked below
 *   (every object node lists all of its properties in `required` and sets
 *   `additionalProperties: false`). Schemas derived from simple examples do not,
 *   and requesting strict mode for them is rejected by OpenAI's API.
 *
 * @param schema - Resolved schema, if any.
 * @returns The `response_format` payload.
 */
function buildResponseFormat(schema) {
    if (!schema || !isFullJsonSchema(schema)) {
        return { type: 'json_object' };
    }
    return {
        type: 'json_schema',
        json_schema: {
            name: 'extraction',
            strict: isStrictModeCompatible(schema),
            schema,
        },
    };
}
/**
 * Conservative recursive check of the strict structured-output preconditions.
 * Returns false as soon as an object node with `properties` is missing
 * `additionalProperties: false` or does not require every property.
 */
function isStrictModeCompatible(node) {
    if (node === null || typeof node !== 'object' || Array.isArray(node)) {
        return true;
    }
    const properties = node['properties'];
    if (properties !== null && typeof properties === 'object') {
        const keys = Object.keys(properties);
        const required = Array.isArray(node['required']) ? node['required'] : [];
        if (node['additionalProperties'] !== false)
            return false;
        if (!keys.every(k => required.includes(k)))
            return false;
        if (!keys.every(k => isStrictModeCompatible(properties[k])))
            return false;
    }
    if (!isStrictModeCompatible(node['items']))
        return false;
    if (Array.isArray(node['anyOf']) && !node['anyOf'].every(isStrictModeCompatible))
        return false;
    const defs = node['$defs'];
    if (defs !== null && typeof defs === 'object' && !Object.values(defs).every(isStrictModeCompatible))
        return false;
    return true;
}
200
/**
 * Extract structured data from content using an LLM behind an
 * OpenAI-compatible `/chat/completions` endpoint.
 *
 * @param options - `content` (required), plus optional `instruction`, `schema`,
 *   `apiKey` (falls back to the OPENAI_API_KEY environment variable),
 *   `baseUrl` (default `https://api.openai.com/v1`), `model`
 *   (default `gpt-4o-mini`) and `maxTokens` (default 4000).
 * @returns `{ items, tokensUsed: { input, output }, model, cost }`, where `cost`
 *   is `undefined` for models without a known price.
 * @throws Error if no API key is available, the API responds with a non-2xx
 *   status, the completion is empty, or the completion cannot be parsed as JSON.
 */
export async function extractWithLLM(options) {
    const { content, instruction, baseUrl = 'https://api.openai.com/v1', model = 'gpt-4o-mini', maxTokens = 4000, } = options;
    const apiKey = options.apiKey || process.env.OPENAI_API_KEY;
    if (!apiKey) {
        throw new Error('LLM extraction requires an API key.\n' +
            'Set OPENAI_API_KEY environment variable or use --llm-key <key>');
    }
    // Resolve schema: convert simple schemas to full JSON Schema if needed
    let resolvedSchema = options.schema;
    if (resolvedSchema && !isFullJsonSchema(resolvedSchema)) {
        resolvedSchema = convertSimpleToJsonSchema(resolvedSchema);
    }
    // Choose system prompt based on whether a schema is provided
    const systemPrompt = resolvedSchema ? SCHEMA_SYSTEM_PROMPT : GENERIC_SYSTEM_PROMPT;
    const userMessage = buildUserMessage(content, instruction, resolvedSchema);
    const responseFormat = buildResponseFormat(resolvedSchema);
    // Tolerate a trailing slash on baseUrl so the path never becomes "//chat/completions".
    const endpoint = `${baseUrl.replace(/\/+$/, '')}/chat/completions`;
    const response = await fetch(endpoint, {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
            'Authorization': `Bearer ${apiKey}`,
        },
        body: JSON.stringify({
            model,
            messages: [
                { role: 'system', content: systemPrompt },
                { role: 'user', content: userMessage },
            ],
            temperature: 0,
            max_tokens: maxTokens,
            response_format: responseFormat,
        }),
    });
    if (!response.ok) {
        if (response.status === 401) {
            throw new Error(`LLM API authentication failed (401). Check your API key.`);
        }
        if (response.status === 429) {
            throw new Error(`LLM API rate limit exceeded (429). Please wait and retry.`);
        }
        // Best effort: include the start of the error body when it can be read.
        const body = await response.text().catch(() => '');
        throw new Error(`LLM API error: HTTP ${response.status}${body ? ` — ${body.slice(0, 200)}` : ''}`);
    }
    const data = await response.json();
    const firstChoice = data.choices?.[0];
    const rawText = firstChoice?.message?.content ?? '';
    if (typeof rawText !== 'string' || rawText.trim() === '') {
        // Report an empty completion directly instead of as an opaque JSON parse failure.
        const finishReason = firstChoice?.finish_reason;
        throw new Error(`LLM returned an empty response${finishReason ? ` (finish_reason: ${finishReason})` : ''}`);
    }
    const items = parseItems(rawText, resolvedSchema);
    // Validate schema shape and warn if mismatch
    if (resolvedSchema) {
        validateSchemaShape(items, resolvedSchema);
    }
    const inputTokens = data.usage?.prompt_tokens ?? 0;
    const outputTokens = data.usage?.completion_tokens ?? 0;
    const resolvedModel = data.model ?? model;
    const cost = estimateCost(resolvedModel, inputTokens, outputTokens);
    return {
        items,
        tokensUsed: { input: inputTokens, output: outputTokens },
        model: resolvedModel,
        cost,
    };
}
264
+ //# sourceMappingURL=llm-extract.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"llm-extract.js","sourceRoot":"","sources":["../../src/core/llm-extract.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAmBH,sDAAsD;AACtD,MAAM,WAAW,GAAqC;IACpD,aAAa,EAAE,CAAC,IAAI,EAAE,IAAI,CAAC;IAC3B,QAAQ,EAAE,CAAC,IAAI,EAAE,IAAI,CAAC;CACvB,CAAC;AAEF,MAAM,qBAAqB,GAAG;;;;sEAIwC,CAAC;AAEvE,MAAM,oBAAoB,GAAG;;;0EAG6C,CAAC;AAE3E;;GAEG;AACH,MAAM,UAAU,gBAAgB,CAAC,MAAc;IAC7C,MAAM,CAAC,GAAG,MAA6B,CAAC;IACxC,OAAO,CAAC,CAAC,MAAM,CAAC,KAAK,QAAQ,IAAI,OAAO,CAAC,CAAC,YAAY,CAAC,KAAK,QAAQ,CAAC;AACvE,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,yBAAyB,CAAC,OAAe;IACvD,OAAO,oBAAoB,CAAC,OAAO,CAAC,CAAC;AACvC,CAAC;AAED,SAAS,oBAAoB,CAAC,KAAc;IAC1C,IAAI,KAAK,KAAK,IAAI,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;QAC1C,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAC5B,CAAC;IAED,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAC5B,CAAC;IAED,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,OAAO,MAAM,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAC5E,CAAC;IAED,IAAI,OAAO,KAAK,KAAK,SAAS,EAAE,CAAC;QAC/B,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC;IAC7B,CAAC;IAED,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;QACzB,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC;QACtC,CAAC;QACD,wDAAwD;QACxD,MAAM,UAAU,GAAG,oBAAoB,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QAClD,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,UAAU,EAAE,CAAC;IAC9C,CAAC;IAED,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,MAAM,GAAG,GAAG,KAAgC,CAAC;QAC7C,MAAM,UAAU,GAA2B,EAAE,CAAC;QAC9C,KAAK,MAAM,CAAC,GAAG,EAAE,GAAG,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;YAC7C,UAAU,CAAC,GAAG,CAAC,GAAG,oBAAoB,CAAC,GAAG,CAAC,CAAC;QAC9C,CAAC;QACD,OAAO;YACL,IAAI,EAAE,QAAQ;YACd,UAAU;SACX,CAAC;IACJ,CAAC;IAED,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;AAC5B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,gBAAgB,CAAC,OAAe,EAAE,WAAoB,EAAE,MAAe;IACrF,sCAAsC;IACtC,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC;IAEhF,IA
AI,GAAG,GAAG,oDAAoD,SAAS,EAAE,CAAC;IAE1E,IAAI,MAAM,EAAE,CAAC;QACX,GAAG,IAAI,0CAA0C,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,CAAC;IACrF,CAAC;IAED,IAAI,WAAW,EAAE,CAAC;QAChB,GAAG,IAAI,+BAA+B,WAAW,EAAE,CAAC;IACtD,CAAC;IAED,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,KAAa,EAAE,WAAmB,EAAE,YAAoB;IACnF,6EAA6E;IAC7E,MAAM,GAAG,GAAG,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,KAAK,KAAK,CAAC,CAAC,CAAC;IACnF,IAAI,CAAC,GAAG;QAAE,OAAO,SAAS,CAAC;IAC3B,MAAM,CAAC,SAAS,EAAE,UAAU,CAAC,GAAG,WAAW,CAAC,GAAG,CAAE,CAAC;IAClD,OAAO,CAAC,WAAW,GAAG,SAAS,CAAC,GAAG,SAAS,GAAG,CAAC,YAAY,GAAG,SAAS,CAAC,GAAG,UAAU,CAAC;AACzF,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,UAAU,CAAC,IAAY,EAAE,OAAgB;IACvD,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAE5B,2BAA2B;IAC3B,IAAI,MAAe,CAAC;IACpB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IAC/B,CAAC;IAAC,MAAM,CAAC;QACP,uFAAuF;QACvF,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QAChD,MAAM,QAAQ,GAAG,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QAC9C,IAAI,UAAU,EAAE,CAAC;YACf,IAAI,CAAC;gBAAC,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;YAAC,CAAC;YAAC,MAAM,CAAC,CAAC,kBAAkB,CAAC,CAAC;QAC1E,CAAC;aAAM,IAAI,QAAQ,EAAE,CAAC;YACpB,IAAI,CAAC;gBAAC,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;YAAC,CAAC;YAAC,MAAM,CAAC,CAAC,kBAAkB,CAAC,CAAC;QACxE,CAAC;QACD,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,MAAM,IAAI,KAAK,CAAC,yCAAyC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;QACpF,CAAC;IACH,CAAC;IAED,mEAAmE;IACnE,IAAI,MAAM,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QACnE,MAAM,GAAG,GAAG,MAA6B,CAAC;QAC1C,IAAI,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;YAAE,OAAO,GAAG,CAAC,OAAO,CAAC,CAAC;QACrD,IAAI,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;YAAE,OAAO,GAAG,CAAC,MAAM,CAAC,CAAC;QACnD,IAAI,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;YAAE,OAAO,GAAG,CAAC,SAAS,CAAC,CAAC;QACzD,gCAAgC;QAChC,OAAO,CAAC,GAAG,CAAC,CAAC;IACf,CAAC;IAED,oBAAoB;IACpB,IAAI,KA
AK,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAC1B,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,OAAO,EAAE,CAAC;AACZ,CAAC;AAED;;;GAGG;AACH,SAAS,mBAAmB,CAAC,MAAkC,EAAE,MAAc;IAC7E,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO;IAEhC,MAAM,SAAS,GAAG,MAA6B,CAAC;IAEhD,oFAAoF;IACpF,IAAI,gBAAgB,CAAC,MAAM,CAAC,EAAE,CAAC;QAC7B,MAAM,YAAY,GAAG,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,CAAC,IAAI,EAAE,CAAC,CAAC;QAChE,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,IAAI,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;YACzC,MAAM,UAAU,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;YAC1C,MAAM,WAAW,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;YACtE,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC3B,OAAO,CAAC,IAAI,CAAC,wEAAwE,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACjH,CAAC;QACH,CAAC;QACD,OAAO;IACT,CAAC;IAED,wDAAwD;IACxD,MAAM,oBAAoB,GAAG,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACpD,IAAI,oBAAoB,CAAC,MAAM,GAAG,CAAC,IAAI,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;QACjD,MAAM,UAAU,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;QAC1C,MAAM,WAAW,GAAG,oBAAoB,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QAC9E,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC3B,OAAO,CAAC,IAAI,CAAC,wEAAwE,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACjH,CAAC;IACH,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,MAAe;IAC1C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,OAAO,EAAE,IAAI,EAAE,aAAa,EAAE,CAAC;IACjC,CAAC;IAED,qFAAqF;IACrF,IAAI,gBAAgB,CAAC,MAAM,CAAC,EAAE,CAAC;QAC7B,OAAO;YACL,IAAI,EAAE,aAAa;YACnB,WAAW,EAAE;gBACX,IAAI,EAAE,YAAY;gBAClB,MAAM,EAAE,IAAI;gBACZ,MAAM;aACP;SACF,CAAC;IACJ,CAAC;IAED,uDAAuD;IACvD,OAAO,EAAE,IAAI,EAAE,aAAa,EAAE,CAAC;AACjC,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,OAA6B;IAChE,MAAM,EACJ,OAAO,EACP,WAAW,EACX,OAAO,GAAG,2BAA2B,EACrC,KAAK,GAAG,aAAa,EACrB,SAAS,GAAG,IAAI,GACjB,GAAG,OAAO,CAAC;IAEZ,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,IAAI,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC;IAE5D,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CACb,uCAAuC;YACvC,gEAAgE,CACjE,CAAC;IACJ,CAAC;IAED,uEAAuE;IACvE,IAAI,cAAc,GAAG,OAAO,CAAC,M
AAM,CAAC;IACpC,IAAI,cAAc,IAAI,CAAC,gBAAgB,CAAC,cAAc,CAAC,EAAE,CAAC;QACxD,cAAc,GAAG,yBAAyB,CAAC,cAAc,CAAC,CAAC;IAC7D,CAAC;IAED,6DAA6D;IAC7D,MAAM,YAAY,GAAG,cAAc,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,qBAAqB,CAAC;IAEnF,MAAM,WAAW,GAAG,gBAAgB,CAAC,OAAO,EAAE,WAAW,EAAE,cAAc,IAAI,OAAO,CAAC,MAAM,CAAC,CAAC;IAE7F,MAAM,cAAc,GAAG,mBAAmB,CAAC,cAAc,CAAC,CAAC;IAE3D,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,OAAO,mBAAmB,EAAE;QAC1D,MAAM,EAAE,MAAM;QACd,OAAO,EAAE;YACP,cAAc,EAAE,kBAAkB;YAClC,eAAe,EAAE,UAAU,MAAM,EAAE;SACpC;QACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;YACnB,KAAK;YACL,QAAQ,EAAE;gBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,EAAE;gBACzC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE;aACvC;YACD,WAAW,EAAE,CAAC;YACd,UAAU,EAAE,SAAS;YACrB,eAAe,EAAE,cAAc;SAChC,CAAC;KACH,CAAC,CAAC;IAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;QACjB,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC;QACnD,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;YAC5B,MAAM,IAAI,KAAK,CAAC,0DAA0D,CAAC,CAAC;QAC9E,CAAC;QACD,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;YAC5B,MAAM,IAAI,KAAK,CAAC,2DAA2D,CAAC,CAAC;QAC/E,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,uBAAuB,QAAQ,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IACrG,CAAC;IAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAI/B,CAAC;IAEF,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,IAAI,EAAE,CAAC;IAC1D,MAAM,KAAK,GAAG,UAAU,CAAC,OAAO,EAAE,cAAc,CAAC,CAAC;IAElD,6CAA6C;IAC7C,IAAI,cAAc,EAAE,CAAC;QACnB,mBAAmB,CAAC,KAAK,EAAE,cAAc,CAAC,CAAC;IAC7C,CAAC;IAED,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,EAAE,aAAa,IAAI,CAAC,CAAC;IACnD,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,EAAE,iBAAiB,IAAI,CAAC,CAAC;IACxD,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,IAAI,KAAK,CAAC;IAC1C,MAAM,IAAI,GAAG,YAAY,CAAC,aAAa,EAAE,WAAW,EAAE,YAAY,CAAC,CAAC;IAEpE,OAAO;QACL,KAAK;QACL,UAAU,EAAE,EAAE,KAAK,EAAE,WAAW,EAAE,MAAM,EAAE,YAAY,EAAE;QACxD,KAAK,EAAE,aAAa;QACpB,IAAI;KACL,CAAC;AACJ,CAAC"}
@@ -0,0 +1,48 @@
1
+ /**
2
+ * WebPeel Profile Management
3
+ *
4
+ * Manages named browser profiles stored in ~/.webpeel/profiles/<name>/
5
+ * Each profile contains:
6
+ * - storage-state.json (Playwright storage state: cookies, localStorage, origins)
7
+ * - metadata.json (name, created, lastUsed, domains, description)
8
+ */
9
/** Shape of a profile's metadata.json file. */
+ export interface ProfileMetadata {
10
+ name: string; // profile name; also the profile's directory name under ~/.webpeel/profiles/
11
+ created: string; // ISO timestamp written when the profile is saved
12
+ lastUsed: string; // ISO timestamp; refreshed by touchProfile() and used to order listProfiles()
13
+ domains: string[]; // unique cookie domains captured at creation, with any leading dot stripped
14
+ description?: string; // optional description supplied to createProfile()
15
+ }
16
+ /**
17
+ * Valid profile names: letters, digits, hyphens only. No spaces or special chars.
+ * Names must be between 1 and 64 characters long.
18
+ */
19
+ export declare function isValidProfileName(name: string): boolean;
20
+ /**
21
+ * Get the directory path for a named profile, or null if the directory or
+ * its metadata.json file doesn't exist.
22
+ */
23
+ export declare function getProfilePath(name: string): string | null;
24
+ /**
25
+ * Load the Playwright storage state (cookies + localStorage) for a named profile.
26
+ * Returns null if the profile or storage-state.json doesn't exist, or if the
+ * file cannot be read or parsed as JSON.
27
+ */
28
+ export declare function loadStorageState(name: string): any | null;
29
+ /**
30
+ * Update the lastUsed timestamp for a profile to the current time.
+ * Does nothing if the profile's metadata.json is missing; read, parse and
+ * write errors are ignored.
31
+ */
32
+ export declare function touchProfile(name: string): void;
33
+ /**
34
+ * List all profiles, sorted by lastUsed descending.
+ * Creates the profiles directory if it is missing. Entries that are not
+ * directories, or whose metadata.json is missing or unparseable, are skipped.
35
+ */
36
+ export declare function listProfiles(): ProfileMetadata[];
37
+ /**
38
+ * Delete a named profile by recursively removing its directory.
+ * Returns true if deleted, false if the directory was not found or removal failed.
39
+ */
40
+ export declare function deleteProfile(name: string): boolean;
41
+ /**
42
+ * Interactively create a new profile:
43
+ * 1. Launches a VISIBLE (headed) Chromium browser
44
+ * 2. User navigates and logs into sites
45
+ * 3. On browser close or Ctrl+C, captures storage state and saves the profile
+ *
+ * Throws if the name is invalid or a profile directory with that name already
+ * exists. If capturing the storage state fails, a warning is logged, the
+ * profile directory is removed, and the promise still resolves.
46
+ */
47
+ export declare function createProfile(name: string, description?: string): Promise<void>;
48
+ //# sourceMappingURL=profiles.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"profiles.d.ts","sourceRoot":"","sources":["../../src/core/profiles.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAgBH,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAcD;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAExD;AAID;;GAEG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAM1D;AAED;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,GAAG,GAAG,GAAG,IAAI,CAQzD;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI,CAU/C;AAED;;GAEG;AACH,wBAAgB,YAAY,IAAI,eAAe,EAAE,CAsBhD;AAED;;GAEG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CASnD;AAID;;;;;GAKG;AACH,wBAAsB,aAAa,CAAC,IAAI,EAAE,MAAM,EAAE,WAAW,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAqHrF"}
@@ -0,0 +1,211 @@
1
+ /**
2
+ * WebPeel Profile Management
3
+ *
4
+ * Manages named browser profiles stored in ~/.webpeel/profiles/<name>/
5
+ * Each profile contains:
6
+ * - storage-state.json (Playwright storage state: cookies, localStorage, origins)
7
+ * - metadata.json (name, created, lastUsed, domains, description)
8
+ */
9
+ import { chromium } from 'playwright';
10
+ import { homedir } from 'os';
11
+ import { existsSync, mkdirSync, readFileSync, writeFileSync, rmSync, readdirSync, } from 'fs';
12
+ import path from 'path';
13
+ // ─── Paths ───────────────────────────────────────────────────────────────────
14
/** Root directory holding one sub-directory per profile: ~/.webpeel/profiles */
const PROFILES_DIR = path.join(homedir(), '.webpeel', 'profiles');
/**
 * Create the profiles root directory (and any missing parents) unless
 * something already exists at that path.
 */
function ensureProfilesDir() {
    if (existsSync(PROFILES_DIR)) {
        return;
    }
    mkdirSync(PROFILES_DIR, { recursive: true });
}
20
+ // ─── Name validation ─────────────────────────────────────────────────────────
21
/**
 * Check whether `name` is an acceptable profile name: ASCII letters, digits
 * and hyphens only, between 1 and 64 characters long.
 *
 * @param name - Candidate profile name.
 * @returns `true` if the name is valid.
 */
export function isValidProfileName(name) {
    // Character-set check first; a failure short-circuits the length check.
    if (!/^[a-zA-Z0-9-]+$/.test(name)) {
        return false;
    }
    return name.length > 0 && name.length <= 64;
}
27
+ // ─── Core helpers ─────────────────────────────────────────────────────────────
28
/**
 * Resolve the directory of a named profile.
 *
 * @param name - Profile name.
 * @returns The profile directory path, or `null` when the directory or its
 *   metadata.json file is missing.
 */
export function getProfilePath(name) {
    const profileDir = path.join(PROFILES_DIR, name);
    const hasMetadata = existsSync(profileDir) && existsSync(path.join(profileDir, 'metadata.json'));
    return hasMetadata ? profileDir : null;
}
38
/**
 * Read the saved Playwright storage state (cookies + localStorage) of a profile.
 *
 * @param name - Profile name.
 * @returns The parsed contents of storage-state.json, or `null` when the file
 *   does not exist or cannot be read or parsed.
 */
export function loadStorageState(name) {
    const stateFile = path.join(PROFILES_DIR, name, 'storage-state.json');
    if (existsSync(stateFile)) {
        try {
            const raw = readFileSync(stateFile, 'utf-8');
            return JSON.parse(raw);
        }
        catch {
            /* unreadable or malformed file: handled like a missing one */
        }
    }
    return null;
}
53
/**
 * Stamp a profile's metadata.json with the current time as `lastUsed`.
 * Silently does nothing when the file is missing or cannot be read, parsed
 * or written.
 *
 * @param name - Profile name.
 */
export function touchProfile(name) {
    const metadataFile = path.join(PROFILES_DIR, name, 'metadata.json');
    if (existsSync(metadataFile)) {
        try {
            const metadata = JSON.parse(readFileSync(metadataFile, 'utf-8'));
            metadata.lastUsed = new Date().toISOString();
            const serialized = JSON.stringify(metadata, null, 2);
            writeFileSync(metadataFile, serialized);
        }
        catch {
            /* ignore */
        }
    }
}
69
/**
 * List all profiles, most recently used first.
 *
 * Creates the profiles directory if it is missing. An entry is skipped when it
 * is not a directory, has no metadata.json, or its metadata.json does not
 * parse to a JSON object. Profiles whose `lastUsed` is missing or not a string
 * are still listed, but sort after all profiles that have one, so a single
 * malformed metadata file can no longer make the whole listing throw.
 *
 * @returns Metadata of every readable profile.
 */
export function listProfiles() {
    ensureProfilesDir();
    const profiles = [];
    let entries;
    try {
        entries = readdirSync(PROFILES_DIR, { withFileTypes: true });
    }
    catch {
        /* ignore read errors */
        return profiles;
    }
    for (const entry of entries) {
        if (!entry.isDirectory())
            continue;
        const metaPath = path.join(PROFILES_DIR, entry.name, 'metadata.json');
        if (!existsSync(metaPath))
            continue;
        try {
            const meta = JSON.parse(readFileSync(metaPath, 'utf-8'));
            // Valid JSON is not necessarily a metadata object (could be null, a number, an array…).
            if (meta !== null && typeof meta === 'object' && !Array.isArray(meta)) {
                profiles.push(meta);
            }
        }
        catch {
            /* skip corrupt profile */
        }
    }
    // Sort: most recently used first; entries without a usable timestamp go last.
    const sortKey = (meta) => (typeof meta.lastUsed === 'string' ? meta.lastUsed : '');
    profiles.sort((a, b) => sortKey(b).localeCompare(sortKey(a)));
    return profiles;
}
99
/**
 * Delete a named profile by recursively removing its directory.
 *
 * The name is validated first: names that fail isValidProfileName() (for
 * example `..` or anything containing a path separator) are treated as
 * "not found", so the recursive delete can never escape the profiles directory.
 *
 * @param name - Profile name.
 * @returns `true` if the directory was removed, `false` if the name is
 *   invalid, the directory does not exist, or removal failed.
 */
export function deleteProfile(name) {
    // Security: `name` ends up in a recursive rmSync, so refuse path traversal.
    if (!isValidProfileName(name))
        return false;
    const dir = path.join(PROFILES_DIR, name);
    if (!existsSync(dir))
        return false;
    try {
        rmSync(dir, { recursive: true, force: true });
        return true;
    }
    catch {
        return false;
    }
}
114
+ // ─── Interactive profile creation ─────────────────────────────────────────────
115
/**
 * Interactively create a new profile:
 *  1. Launches a VISIBLE (headed) Chromium browser
 *  2. User navigates and logs into sites
 *  3. On browser close or Ctrl+C, captures storage state and saves the profile
 *
 * Changes from the previous implementation:
 *  - If the browser fails to launch, the freshly created profile directory is
 *    removed again, so a retry is not blocked by "already exists".
 *  - Playwright's own SIGINT handling is disabled (`handleSIGINT: false`) so
 *    that only the handler below reacts to Ctrl+C.
 *  - Cookies are polled while the browser is open. If the full storage state
 *    can no longer be captured (typically because the window was closed and
 *    the browser is already gone), the last cookie snapshot is saved instead
 *    (without localStorage) rather than discarding the profile.
 *  - Saving runs at most once and every trigger waits for that same run.
 *
 * @param name - Profile name; must satisfy isValidProfileName().
 * @param description - Optional description stored in metadata.json.
 * @throws Error if the name is invalid, the profile already exists, or the
 *   browser cannot be launched. A failure while saving is logged as a
 *   warning and the profile directory is removed; the promise still resolves.
 */
export async function createProfile(name, description) {
    if (!isValidProfileName(name)) {
        throw new Error(`Invalid profile name "${name}". Use only letters, numbers, and hyphens (no spaces or special characters).`);
    }
    ensureProfilesDir();
    const profileDir = path.join(PROFILES_DIR, name);
    if (existsSync(profileDir)) {
        throw new Error(`Profile "${name}" already exists. Delete it first with:\n webpeel profile delete ${name}`);
    }
    mkdirSync(profileDir, { recursive: true });
    const removeProfileDir = () => {
        try {
            rmSync(profileDir, { recursive: true, force: true });
        }
        catch {
            /* ignore */
        }
    };
    // Launch headed (visible) Chromium — no user-data-dir so we start fresh
    let browser;
    let context;
    try {
        browser = await chromium.launch({ headless: false, handleSIGINT: false });
        context = await browser.newContext();
        const page = await context.newPage();
        await page.goto('about:blank').catch(() => { });
    }
    catch (e) {
        // Do not leave an empty profile directory behind when setup fails.
        removeProfileDir();
        if (browser) {
            await browser.close().catch(() => { });
        }
        throw e;
    }
    console.log('');
    console.log('╔══════════════════════════════════════════════════════╗');
    console.log(`║ WebPeel Profile Setup: "${name}"`);
    console.log('║ ║');
    console.log('║ Navigate to websites and log in. ║');
    console.log('║ When done, press Ctrl+C or close this window. ║');
    console.log('╚══════════════════════════════════════════════════════╝');
    console.log('');
    // Rolling cookie snapshot, used only if the full capture below fails.
    const COOKIE_POLL_INTERVAL_MS = 1000;
    let latestCookies = null;
    const cookieTimer = setInterval(() => {
        context.cookies().then((cookies) => { latestCookies = cookies; }, () => { });
    }, COOKIE_POLL_INTERVAL_MS);
    // Prefer the complete storage state; fall back to the cookie snapshot.
    const captureState = async () => {
        try {
            return await context.storageState();
        }
        catch (e) {
            if (latestCookies === null)
                throw e;
            console.log('Browser already closed; saving the most recent cookie snapshot (localStorage not included).');
            return { cookies: latestCookies, origins: [] };
        }
    };
    const persist = async () => {
        clearInterval(cookieTimer);
        console.log('\nCapturing browser session...');
        try {
            const storageState = await captureState();
            writeFileSync(path.join(profileDir, 'storage-state.json'), JSON.stringify(storageState, null, 2));
            // Extract unique domains from cookies (strip leading dot)
            const domains = [
                ...new Set((storageState.cookies ?? [])
                    .map((c) => (c.domain ?? '').replace(/^\./, ''))
                    .filter(Boolean)),
            ];
            const now = new Date().toISOString();
            const meta = {
                name,
                created: now,
                lastUsed: now,
                domains,
                ...(description ? { description } : {}),
            };
            writeFileSync(path.join(profileDir, 'metadata.json'), JSON.stringify(meta, null, 2));
            console.log(`✓ Profile "${name}" saved to ${profileDir}`);
            if (domains.length > 0) {
                console.log(` Domains: ${domains.join(', ')}`);
            }
            else {
                console.log(' No login sessions detected (no cookies).');
            }
        }
        catch (e) {
            console.error('Warning: Failed to save storage state:', e instanceof Error ? e.message : String(e));
            // Clean up partial directory
            removeProfileDir();
        }
        try {
            await browser.close();
        }
        catch {
            /* ignore — browser may already be closed */
        }
    };
    // Run the save at most once; later callers wait for the same run to finish.
    let pendingSave;
    const saveAndClose = () => {
        if (!pendingSave) {
            pendingSave = persist();
        }
        return pendingSave;
    };
    // Wait for the browser to disconnect (user closed the window) OR SIGINT (Ctrl+C)
    await new Promise((resolve) => {
        const finish = () => {
            saveAndClose().then(resolve, resolve);
        };
        process.once('SIGINT', finish);
        browser.once('disconnected', () => {
            // Clean up the SIGINT handler if browser closes first
            process.removeListener('SIGINT', finish);
            finish();
        });
    });
}
211
+ //# sourceMappingURL=profiles.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"profiles.js","sourceRoot":"","sources":["../../src/core/profiles.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,OAAO,EAAE,MAAM,IAAI,CAAC;AAC7B,OAAO,EACL,UAAU,EACV,SAAS,EACT,YAAY,EACZ,aAAa,EACb,MAAM,EACN,WAAW,GACZ,MAAM,IAAI,CAAC;AACZ,OAAO,IAAI,MAAM,MAAM,CAAC;AAYxB,gFAAgF;AAEhF,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,UAAU,EAAE,UAAU,CAAC,CAAC;AAElE,SAAS,iBAAiB;IACxB,IAAI,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;QAC9B,SAAS,CAAC,YAAY,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC/C,CAAC;AACH,CAAC;AAED,gFAAgF;AAEhF;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAAC,IAAY;IAC7C,OAAO,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,IAAI,IAAI,CAAC,MAAM,IAAI,EAAE,CAAC;AAC9E,CAAC;AAED,iFAAiF;AAEjF;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,CAAC,CAAC;IAC1C,IAAI,UAAU,CAAC,GAAG,CAAC,IAAI,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,eAAe,CAAC,CAAC,EAAE,CAAC;QACnE,OAAO,GAAG,CAAC;IACb,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC3C,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,EAAE,oBAAoB,CAAC,CAAC;IACtE,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;QAAE,OAAO,IAAI,CAAC;IACxC,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC;IACtD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,IAAY;IACvC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,EAAE,eAAe,CAAC,CAAC;IAChE,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;QAAE,OAAO;IAClC,IAAI,CAAC;QACH,MAAM,IAAI,GAAoB,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC;QAC1E,IAAI,CAAC,QAAQ,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QACzC,aAAa,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IACzD,CAAC;IAAC,MAAM,CAAC;QACP,YAAY;IACd,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY;IAC1B,iBAAiB,EAAE,CAAC;IACpB,MAAM,QAAQ,GAAsB,EAAE,CAAC;IACvC,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,WAAW,CAAC,YAAY,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC
;QACnE,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;YAC5B,IAAI,CAAC,KAAK,CAAC,WAAW,EAAE;gBAAE,SAAS;YACnC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,KAAK,CAAC,IAAI,EAAE,eAAe,CAAC,CAAC;YACtE,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;gBAAE,SAAS;YACpC,IAAI,CAAC;gBACH,MAAM,IAAI,GAAoB,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC;gBAC1E,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACtB,CAAC;YAAC,MAAM,CAAC;gBACP,0BAA0B;YAC5B,CAAC;QACH,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,wBAAwB;IAC1B,CAAC;IACD,iCAAiC;IACjC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC;IAC9D,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,CAAC,CAAC;IAC1C,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;QAAE,OAAO,KAAK,CAAC;IACnC,IAAI,CAAC;QACH,MAAM,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;QAC9C,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED,iFAAiF;AAEjF;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CAAC,IAAY,EAAE,WAAoB;IACpE,IAAI,CAAC,kBAAkB,CAAC,IAAI,CAAC,EAAE,CAAC;QAC9B,MAAM,IAAI,KAAK,CACb,yBAAyB,IAAI,8EAA8E,CAC5G,CAAC;IACJ,CAAC;IAED,iBAAiB,EAAE,CAAC;IAEpB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,CAAC,CAAC;IACjD,IAAI,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CACb,YAAY,IAAI,qEAAqE,IAAI,EAAE,CAC5F,CAAC;IACJ,CAAC;IAED,SAAS,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE3C,wEAAwE;IACxE,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;IAC3D,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC;IAC3C,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;IACrC,MAAM,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;IAE/C,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAChB,OAAO,CAAC,GAAG,CAAC,0DAA0D,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,CAAC,8BAA8B,IAAI,GAAG,CAAC,CAAC;IACnD,OAAO,CAAC,GAAG,CAAC,0DAA0D,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,CAAC,yDAAyD,CAAC,CAAC;IACvE,OAAO,CAAC,GAAG,CAAC,yDAAyD,CAAC,CAAC;IACvE,OAAO,CAAC,G
AAG,CAAC,0DAA0D,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAEhB,IAAI,KAAK,GAAG,KAAK,CAAC;IAElB,MAAM,YAAY,GAAG,KAAK,IAAmB,EAAE;QAC7C,IAAI,KAAK;YAAE,OAAO;QAClB,KAAK,GAAG,IAAI,CAAC;QAEb,OAAO,CAAC,GAAG,CAAC,gCAAgC,CAAC,CAAC;QAE9C,IAAI,CAAC;YACH,MAAM,YAAY,GAAG,MAAM,OAAO,CAAC,YAAY,EAAE,CAAC;YAElD,aAAa,CACX,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,oBAAoB,CAAC,EAC3C,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,IAAI,EAAE,CAAC,CAAC,CACtC,CAAC;YAEF,0DAA0D;YAC1D,MAAM,OAAO,GAAa;gBACxB,GAAG,IAAI,GAAG,CACR,CAAC,YAAY,CAAC,OAAO,IAAI,EAAE,CAAC;qBACzB,GAAG,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;qBACpD,MAAM,CAAC,OAAO,CAAC,CACnB;aACF,CAAC;YAEF,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;YACrC,MAAM,IAAI,GAAoB;gBAC5B,IAAI;gBACJ,OAAO,EAAE,GAAG;gBACZ,QAAQ,EAAE,GAAG;gBACb,OAAO;gBACP,GAAG,CAAC,WAAW,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;aACxC,CAAC;YAEF,aAAa,CACX,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,eAAe,CAAC,EACtC,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,CAC9B,CAAC;YAEF,OAAO,CAAC,GAAG,CAAC,cAAc,IAAI,cAAc,UAAU,EAAE,CAAC,CAAC;YAC1D,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACvB,OAAO,CAAC,GAAG,CAAC,cAAc,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YAClD,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,GAAG,CAAC,4CAA4C,CAAC,CAAC;YAC5D,CAAC;QACH,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CACX,wCAAwC,EACxC,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAC3C,CAAC;YACF,6BAA6B;YAC7B,IAAI,CAAC;gBACH,MAAM,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;YACvD,CAAC;YAAC,MAAM,CAAC;gBACP,YAAY;YACd,CAAC;QACH,CAAC;QAED,IAAI,CAAC;YACH,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;QACxB,CAAC;QAAC,MAAM,CAAC;YACP,4CAA4C;QAC9C,CAAC;IACH,CAAC,CAAC;IAEF,iFAAiF;IACjF,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE;QAClC,OAAO,CAAC,EAAE,CAAC,cAAc,EAAE,KAAK,IAAI,EAAE;YACpC,MAAM,YAAY,EAAE,CAAC;YACrB,OAAO,EAAE,CAAC;QACZ,CAAC,CAAC,CAAC;QAEH,2BAA2B;QAC3B,MAAM,aAAa,GAAG,KAAK,IAAI,EAAE;YAC/B,MAAM,YAAY,EAAE,CAAC;YACrB,OAAO,EAAE,CAAC;
QACZ,CAAC,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;QAEtC,sDAAsD;QACtD,OAAO,CAAC,EAAE,CAAC,cAAc,EAAE,GAAG,EAAE;YAC9B,OAAO,CAAC,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;QAClD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,67 @@
1
+ /**
2
+ * Schema-based extraction using CSS selectors.
3
+ *
4
+ * Each schema defines how to extract listings from a specific domain,
5
+ * inspired by Crawl4AI's JsonCssExtractionStrategy. Unlike generic
6
+ * auto-detection, schemas provide exact selectors for each site's DOM.
7
+ *
8
+ * @module schema-extraction
9
+ */
10
+ export interface SchemaField {
11
+ /** Field name in output (e.g., "title", "price", "rating"); used as the key under which the value appears in an ExtractedItem */
12
+ name: string;
13
+ /** CSS selector relative to baseSelector. Empty string selects the base element itself. */
14
+ selector: string;
15
+ /** What to extract from the matched element(s). NOTE(review): per-kind semantics (e.g. whether 'exists' yields a boolean) are defined in the implementation — confirm there. */
16
+ type: 'text' | 'attribute' | 'html' | 'exists';
17
+ /** For type='attribute', which attribute to read */
18
+ attribute?: string;
19
+ /** Extract all matches (returns array instead of first match) */
20
+ multiple?: boolean;
21
+ /** Optional transform to apply after extraction. NOTE(review): presumably 'number' / 'stripCurrency' produce numeric output — confirm against the implementation. */
22
+ transform?: 'trim' | 'number' | 'stripCurrency';
23
+ }
24
+ export interface ExtractionSchema {
25
+ /** Human-readable schema name (e.g., "Booking.com Hotel Search") */
26
+ name: string;
27
+ /** Schema version string */
28
+ version: string;
29
+ /** Matching domains (e.g., ["booking.com", "www.booking.com"]) */
30
+ domains: string[];
31
+ /** Optional URL path patterns (regex strings) for more specific matching */
32
+ urlPatterns?: string[];
33
+ /** CSS selector for each listing item; each SchemaField.selector is resolved relative to it */
34
+ baseSelector: string;
35
+ /** Fields to extract from each item */
36
+ fields: SchemaField[];
37
+ /** Optional pagination config */
38
+ pagination?: {
39
+ nextSelector?: string; // presumably a CSS selector for the "next page" control — TODO confirm against implementation
40
+ pageParam?: string; // presumably the URL query parameter that carries the page number — TODO confirm
41
+ };
42
+ }
43
+ /** A single extracted item — keys are SchemaField names, values are the extracted results (string, string[], boolean, number, or undefined) */
44
+ export interface ExtractedItem {
45
+ [key: string]: string | string[] | boolean | number | undefined;
46
+ }
47
+ /**
48
+ * Load all bundled schemas and return them as an array of ExtractionSchema.
49
+ */
50
+ export declare function loadBundledSchemas(): ExtractionSchema[];
51
+ /**
52
+ * Find the extraction schema that applies to a given URL.
53
+ *
54
+ * Matches by domain first (ExtractionSchema.domains), then optionally by URL patterns (ExtractionSchema.urlPatterns, regex).
55
+ * Returns the first matching schema, or null when none matches.
56
+ */
57
+ export declare function findSchemaForUrl(url: string): ExtractionSchema | null;
58
+ /**
59
+ * Extract listings from HTML using a schema's CSS selectors.
60
+ *
61
+ * @param html - Raw HTML string to parse
62
+ * @param schema - Extraction schema to use (supplies baseSelector and the per-item fields)
63
+ * @param baseUrl - Optional base URL for resolving relative links
64
+ * @returns Array of extracted items keyed by schema field name (may be empty)
65
+ */
66
+ export declare function extractWithSchema(html: string, schema: ExtractionSchema, baseUrl?: string): ExtractedItem[];
67
+ //# sourceMappingURL=schema-extraction.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"schema-extraction.d.ts","sourceRoot":"","sources":["../../src/core/schema-extraction.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AASH,MAAM,WAAW,WAAW;IAC1B,8DAA8D;IAC9D,IAAI,EAAE,MAAM,CAAC;IACb,2FAA2F;IAC3F,QAAQ,EAAE,MAAM,CAAC;IACjB,sBAAsB;IACtB,IAAI,EAAE,MAAM,GAAG,WAAW,GAAG,MAAM,GAAG,QAAQ,CAAC;IAC/C,oDAAoD;IACpD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,iEAAiE;IACjE,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,mDAAmD;IACnD,SAAS,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,eAAe,CAAC;CACjD;AAED,MAAM,WAAW,gBAAgB;IAC/B,oEAAoE;IACpE,IAAI,EAAE,MAAM,CAAC;IACb,4BAA4B;IAC5B,OAAO,EAAE,MAAM,CAAC;IAChB,kEAAkE;IAClE,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,4EAA4E;IAC5E,WAAW,CAAC,EAAE,MAAM,EAAE,CAAC;IACvB,yCAAyC;IACzC,YAAY,EAAE,MAAM,CAAC;IACrB,uCAAuC;IACvC,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,iCAAiC;IACjC,UAAU,CAAC,EAAE;QACX,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,CAAC;CACH;AAED,oEAAoE;AACpE,MAAM,WAAW,aAAa;IAC5B,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,GAAG,MAAM,EAAE,GAAG,OAAO,GAAG,MAAM,GAAG,SAAS,CAAC;CACjE;AA4PD;;GAEG;AACH,wBAAgB,kBAAkB,IAAI,gBAAgB,EAAE,CAEvD;AAED;;;;;GAKG;AACH,wBAAgB,gBAAgB,CAAC,GAAG,EAAE,MAAM,GAAG,gBAAgB,GAAG,IAAI,CAsCrE;AAED;;;;;;;GAOG;AACH,wBAAgB,iBAAiB,CAC/B,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,gBAAgB,EACxB,OAAO,CAAC,EAAE,MAAM,GACf,aAAa,EAAE,CA0DjB"}