scrapex 0.5.3 → 1.0.0-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/LICENSE +1 -1
  2. package/README.md +392 -145
  3. package/dist/enhancer-Q6CSc1gA.mjs +220 -0
  4. package/dist/enhancer-Q6CSc1gA.mjs.map +1 -0
  5. package/dist/enhancer-oM4BhYYS.cjs +268 -0
  6. package/dist/enhancer-oM4BhYYS.cjs.map +1 -0
  7. package/dist/index.cjs +852 -0
  8. package/dist/index.cjs.map +1 -0
  9. package/dist/index.d.cts +264 -0
  10. package/dist/index.d.cts.map +1 -0
  11. package/dist/index.d.mts +264 -0
  12. package/dist/index.d.mts.map +1 -0
  13. package/dist/index.mjs +798 -0
  14. package/dist/index.mjs.map +1 -0
  15. package/dist/llm/index.cjs +316 -0
  16. package/dist/llm/index.cjs.map +1 -0
  17. package/dist/llm/index.d.cts +211 -0
  18. package/dist/llm/index.d.cts.map +1 -0
  19. package/dist/llm/index.d.mts +211 -0
  20. package/dist/llm/index.d.mts.map +1 -0
  21. package/dist/llm/index.mjs +310 -0
  22. package/dist/llm/index.mjs.map +1 -0
  23. package/dist/parsers/index.cjs +200 -0
  24. package/dist/parsers/index.cjs.map +1 -0
  25. package/dist/parsers/index.d.cts +133 -0
  26. package/dist/parsers/index.d.cts.map +1 -0
  27. package/dist/parsers/index.d.mts +133 -0
  28. package/dist/parsers/index.d.mts.map +1 -0
  29. package/dist/parsers/index.mjs +192 -0
  30. package/dist/parsers/index.mjs.map +1 -0
  31. package/dist/types-CNQZVW36.d.mts +150 -0
  32. package/dist/types-CNQZVW36.d.mts.map +1 -0
  33. package/dist/types-D0HYR95H.d.cts +150 -0
  34. package/dist/types-D0HYR95H.d.cts.map +1 -0
  35. package/package.json +80 -100
  36. package/dist/index.d.ts +0 -45
  37. package/dist/index.js +0 -8
  38. package/dist/scrapex.cjs.development.js +0 -1130
  39. package/dist/scrapex.cjs.development.js.map +0 -1
  40. package/dist/scrapex.cjs.production.min.js +0 -2
  41. package/dist/scrapex.cjs.production.min.js.map +0 -1
  42. package/dist/scrapex.esm.js +0 -1122
  43. package/dist/scrapex.esm.js.map +0 -1
@@ -0,0 +1,220 @@
1
+ import { z } from "zod";
2
+
3
+ //#region src/core/errors.ts
4
/**
 * Error raised for scraping failures. Besides the usual message it carries a
 * machine-readable `code` and, when one is supplied, a `statusCode`.
 */
var ScrapeError = class ScrapeError extends Error {
	code;
	statusCode;

	/**
	 * @param {string} message - Human-readable description of the failure.
	 * @param {string} code - Error code, e.g. "FETCH_FAILED" or "TIMEOUT".
	 * @param {number} [statusCode] - Status code associated with the failure, if any.
	 * @param {Error} [cause] - Underlying error, forwarded to `Error` as `cause`.
	 */
	constructor(message, code, statusCode, cause) {
		super(message, { cause });
		Object.assign(this, { name: "ScrapeError", code, statusCode });
		// captureStackTrace only exists on some engines (V8); skip it elsewhere.
		Error.captureStackTrace?.(this, ScrapeError);
	}

	/**
	 * Normalise any thrown value into a ScrapeError.
	 *
	 * An existing ScrapeError is returned untouched; an `Error` is wrapped and
	 * kept as the cause; anything else is stringified into the message.
	 *
	 * @param {unknown} error - The value that was thrown.
	 * @param {string} [code="FETCH_FAILED"] - Code to assign when wrapping.
	 * @returns {ScrapeError}
	 */
	static from(error, code = "FETCH_FAILED") {
		if (error instanceof ScrapeError) return error;
		return error instanceof Error
			? new ScrapeError(error.message, code, undefined, error)
			: new ScrapeError(String(error), code);
	}

	/**
	 * Whether the failure is worth retrying: true only for the
	 * "FETCH_FAILED" and "TIMEOUT" codes.
	 *
	 * @returns {boolean}
	 */
	isRetryable() {
		return ["FETCH_FAILED", "TIMEOUT"].includes(this.code);
	}

	/**
	 * Plain-object form used by `JSON.stringify`.
	 *
	 * @returns {{name: string, message: string, code: string, statusCode: (number|undefined), stack: (string|undefined)}}
	 */
	toJSON() {
		const { name, message, code, statusCode, stack } = this;
		return { name, message, code, statusCode, stack };
	}
};
44
+
45
+ //#endregion
46
+ //#region src/llm/types.ts
47
/**
 * Zod schemas for the JSON each LLM enhancement is asked to return.
 * The `.describe()` text is attached to the field it follows.
 */
const SummarySchema = z.object({
	summary: z.string().describe("A concise 2-3 sentence summary of the content"),
});

const TagsSchema = z.object({
	tags: z.array(z.string()).describe("5-10 relevant tags/keywords"),
});

// Every entity bucket is a list of strings; only the description differs.
const EntitiesSchema = z.object(
	Object.fromEntries(
		[
			["people", "People mentioned"],
			["organizations", "Organizations/companies"],
			["technologies", "Technologies/tools/frameworks"],
			["locations", "Locations/places"],
			["concepts", "Key concepts/topics"],
		].map(([field, description]) => [field, z.array(z.string()).describe(description)]),
	),
);

const ClassifySchema = z.object({
	contentType: z
		.enum(["article", "repo", "docs", "package", "video", "tool", "product", "unknown"])
		.describe("The type of content"),
	// Bounded to the closed interval [0, 1].
	confidence: z.number().min(0).max(1).describe("Confidence score 0-1"),
});
72
+
73
+ //#endregion
74
+ //#region src/llm/enhancer.ts
75
/**
 * Enhance scraped data with LLM-powered features.
 *
 * Builds one shared context (title, URL, and either the excerpt or the first
 * 10,000 characters of the text content) and runs every requested
 * enhancement against it concurrently.
 *
 * @param {object} data - Scraped page; reads `title`, `url`, `excerpt`, `textContent`.
 * @param {object} provider - LLM provider handed to each enhancement helper.
 * @param {string[]} types - Any of "summarize", "tags", "entities", "classify".
 * @returns {Promise<object>} Object with `summary`, `suggestedTags`, `entities`
 *   and/or `contentType`, depending on what was requested.
 */
async function enhance(data, provider, types) {
	const results = {};
	const content = data.excerpt || data.textContent.slice(0, 1e4);
	const context = `Title: ${data.title}\nURL: ${data.url}\n\nContent:\n${content}`;

	// [requested type, helper to run, how to record its result]
	const tasks = [
		["summarize", summarize, (summary) => {
			results.summary = summary;
		}],
		["tags", extractTags, (tags) => {
			results.suggestedTags = tags;
		}],
		["entities", extractEntities, (entities) => {
			results.entities = entities;
		}],
		["classify", classify, (verdict) => {
			// Only accept a classification the model is reasonably sure about.
			if (verdict.confidence > 0.7) results.contentType = verdict.contentType;
		}],
	];

	const pending = [];
	for (const [type, run, store] of tasks) {
		if (types.includes(type)) pending.push(run(context, provider).then(store));
	}

	// Rejects as soon as any single enhancement fails.
	await Promise.all(pending);
	return results;
}
98
/**
 * Ask a custom question about the scraped content.
 *
 * The answer is returned under `custom[key]` (key defaults to "response").
 * With `options.schema` the question is answered through `extract()` and the
 * value is the structured result; otherwise it is the provider's plain-text
 * completion.
 *
 * @param {object} data - Scraped page; reads `title`, `url`, `excerpt`, `textContent`.
 * @param {object} provider - LLM provider; `complete(prompt)` is used for plain answers.
 * @param {string} prompt - Question or template; may contain `{{...}}` placeholders.
 * @param {{key?: string, schema?: object}} [options] - Result key and optional extraction schema.
 * @returns {Promise<{custom: object}>}
 */
async function ask(data, provider, prompt, options) {
	const key = options?.key || "response";

	if (options?.schema) {
		// Hand extract() the raw template. It substitutes placeholders itself and
		// appends the page text only when the template lacks {{content}}. Passing
		// an already-substituted prompt hid that marker, so the page text was sent
		// twice and the substituted text was scanned for placeholders again.
		const result = await extract(data, provider, options.schema, prompt);
		return { custom: { [key]: result } };
	}

	const content = data.excerpt || data.textContent.slice(0, 1e4);
	const processedPrompt = applyPlaceholders(prompt, data, content);
	// If the template did not place the content itself, append title/URL/content.
	const fullPrompt = prompt.includes("{{content}}")
		? processedPrompt
		: `${processedPrompt}\n\nTitle: ${data.title}\nURL: ${data.url}\n\nContent:\n${content}`;
	const response = await provider.complete(fullPrompt);
	return { custom: { [key]: response } };
}
114
/**
 * Fill `{{title}}`, `{{url}}`, `{{content}}`, `{{description}}`,
 * `{{excerpt}}` and `{{domain}}` placeholders in a prompt template.
 *
 * Substitution happens in a single pass with a replacer function, so:
 *  - `$&`, `$'`, `$$` and similar sequences in scraped text are inserted
 *    literally instead of being treated as replacement patterns, and
 *  - placeholder-looking text inside an inserted value is not expanded again.
 * Missing values render as an empty string.
 *
 * @param {string} prompt - Template text.
 * @param {object} data - Scraped page; reads `title`, `url`, `description`, `excerpt`.
 * @param {string} content - Text substituted for `{{content}}`.
 * @returns {string} The template with all known placeholders replaced.
 */
function applyPlaceholders(prompt, data, content) {
	let domain = "";
	try {
		domain = new URL(data.url).hostname;
	} catch {
		// Best effort: an unparseable URL leaves {{domain}} empty.
	}

	const values = new Map([
		["title", data.title],
		["url", data.url],
		["content", content],
		["description", data.description],
		["excerpt", data.excerpt],
		["domain", domain],
	]);

	return prompt.replace(
		/\{\{(title|url|content|description|excerpt|domain)\}\}/g,
		(_match, name) => String(values.get(name) ?? ""),
	);
}
127
/**
 * Extract structured data using the LLM and a simple field schema.
 *
 * Each schema value is a type name ("string", "number", "boolean",
 * "string[]", "number[]"), optionally suffixed with "?" to make the field
 * optional. Unrecognised type names are treated as "string".
 *
 * @param {object} data - Scraped page; reads `title`, `url`, `textContent`.
 * @param {object} provider - LLM provider; `completeJSON(prompt, schema)` is called.
 * @param {Object<string, string>} schema - Field name to type-name mapping.
 * @param {string} [promptTemplate] - Custom prompt; placeholders are filled in,
 *   and the content is appended when the template has no `{{content}}`.
 * @returns {Promise<object>} Whatever `provider.completeJSON` resolves with.
 */
async function extract(data, provider, schema, promptTemplate) {
	const builders = new Map([
		["string", () => z.string()],
		["number", () => z.number()],
		["boolean", () => z.boolean()],
		["string[]", () => z.array(z.string())],
		["number[]", () => z.array(z.number())],
	]);

	const zodShape = {};
	for (const [field, spec] of Object.entries(schema)) {
		const optional = spec.endsWith("?");
		const baseName = optional ? spec.slice(0, -1) : spec;
		// Unknown type names fall back to a plain string field.
		const build = builders.get(baseName) ?? builders.get("string");
		const fieldType = build();
		zodShape[field] = optional ? fieldType.optional() : fieldType;
	}
	const zodSchema = z.object(zodShape);

	// Only the first 4,000 characters of the page text are sent.
	const content = data.textContent.slice(0, 4e3);

	let prompt;
	if (promptTemplate) {
		const filled = applyPlaceholders(promptTemplate, data, content);
		prompt = promptTemplate.includes("{{content}}") ? filled : `${filled}\n\nContext:\n${content}`;
	} else {
		const fieldList = Object.entries(schema)
			.map(([key, type]) => `- ${key} (${type})`)
			.join("\n");
		prompt = [
			"Extract the following information from this content:",
			"",
			`Title: ${data.title}`,
			`URL: ${data.url}`,
			"",
			"Content:",
			content,
			"",
			"Extract these fields:",
			fieldList,
		].join("\n");
	}

	return provider.completeJSON(prompt, zodSchema);
}
174
/**
 * Ask the provider for a 2-3 sentence summary of the given context.
 *
 * @param {string} context - Prepared page context (title, URL, content).
 * @param {object} provider - LLM provider; `completeJSON` is called with SummarySchema.
 * @returns {Promise<string>} The `summary` field of the provider's response.
 */
async function summarize(context, provider) {
	const instruction = "Summarize the following content in 2-3 concise sentences:";
	const { summary } = await provider.completeJSON(`${instruction}\n\n${context}`, SummarySchema);
	return summary;
}
183
/**
 * Ask the provider for 5-10 tags or keywords describing the given context.
 *
 * @param {string} context - Prepared page context (title, URL, content).
 * @param {object} provider - LLM provider; `completeJSON` is called with TagsSchema.
 * @returns {Promise<string[]>} The `tags` field of the provider's response.
 */
async function extractTags(context, provider) {
	const instruction = "Extract 5-10 relevant tags or keywords from the following content. Focus on technologies, concepts, and topics mentioned:";
	const { tags } = await provider.completeJSON(`${instruction}\n\n${context}`, TagsSchema);
	return tags;
}
192
/**
 * Ask the provider for the named entities found in the given context.
 *
 * @param {string} context - Prepared page context (title, URL, content).
 * @param {object} provider - LLM provider; `completeJSON` is called with EntitiesSchema.
 * @returns {Promise<object>} The provider's response, unchanged.
 */
async function extractEntities(context, provider) {
	const instruction = "Extract named entities from the following content. Identify people, organizations, technologies, locations, and key concepts:";
	return provider.completeJSON(`${instruction}\n\n${context}`, EntitiesSchema);
}
201
/**
 * Ask the provider to classify the given context into a content category.
 *
 * @param {string} context - Prepared page context (title, URL, content).
 * @param {object} provider - LLM provider; `completeJSON` is called with ClassifySchema.
 * @returns {Promise<object>} The provider's response, unchanged (the caller
 *   reads `contentType` and `confidence` from it).
 */
async function classify(context, provider) {
	const categories = [
		"- article: Blog post, news article, essay",
		"- repo: Code repository, open source project",
		"- docs: Documentation, API reference, guides",
		"- package: npm/pip package page",
		"- video: Video content, YouTube",
		"- tool: Software tool, web application",
		"- product: Commercial product, e-commerce",
	];
	const prompt = `Classify the following content into one of these categories:\n${categories.join("\n")}\n\n${context}`;
	return provider.completeJSON(prompt, ClassifySchema);
}
217
+
218
+ //#endregion
219
+ export { EntitiesSchema as a, ScrapeError as c, ClassifySchema as i, enhance as n, SummarySchema as o, extract as r, TagsSchema as s, ask as t };
220
+ //# sourceMappingURL=enhancer-Q6CSc1gA.mjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"enhancer-Q6CSc1gA.mjs","names":["results: Partial<ScrapedData>","promises: Promise<void>[]","zodShape: Record<string, z.ZodTypeAny>","zodType: z.ZodTypeAny","prompt: string"],"sources":["../src/core/errors.ts","../src/llm/types.ts","../src/llm/enhancer.ts"],"sourcesContent":["/**\n * Error codes for scraping failures\n */\nexport type ScrapeErrorCode =\n | 'FETCH_FAILED'\n | 'TIMEOUT'\n | 'INVALID_URL'\n | 'BLOCKED'\n | 'NOT_FOUND'\n | 'ROBOTS_BLOCKED'\n | 'PARSE_ERROR'\n | 'LLM_ERROR'\n | 'VALIDATION_ERROR';\n\n/**\n * Custom error class for scraping failures with structured error codes\n */\nexport class ScrapeError extends Error {\n public readonly code: ScrapeErrorCode;\n public readonly statusCode?: number;\n\n constructor(message: string, code: ScrapeErrorCode, statusCode?: number, cause?: Error) {\n super(message, { cause });\n this.name = 'ScrapeError';\n this.code = code;\n this.statusCode = statusCode;\n\n // Maintains proper stack trace in V8 environments\n if (Error.captureStackTrace) {\n Error.captureStackTrace(this, ScrapeError);\n }\n }\n\n /**\n * Create a ScrapeError from an unknown error\n */\n static from(error: unknown, code: ScrapeErrorCode = 'FETCH_FAILED'): ScrapeError {\n if (error instanceof ScrapeError) {\n return error;\n }\n\n if (error instanceof Error) {\n return new ScrapeError(error.message, code, undefined, error);\n }\n\n return new ScrapeError(String(error), code);\n }\n\n /**\n * Check if error is retryable (network issues, timeouts)\n */\n isRetryable(): boolean {\n return this.code === 'FETCH_FAILED' || this.code === 'TIMEOUT';\n }\n\n /**\n * Convert to a plain object for serialization\n */\n toJSON(): Record<string, unknown> {\n return {\n name: this.name,\n message: this.message,\n code: this.code,\n statusCode: this.statusCode,\n stack: this.stack,\n };\n }\n}\n","import { z } from 'zod';\n\n/**\n * LLM completion options\n */\nexport interface CompletionOptions {\n maxTokens?: number;\n temperature?: 
number;\n systemPrompt?: string;\n}\n\n/**\n * LLM Provider interface - implemented by all providers\n */\nexport interface LLMProvider {\n readonly name: string;\n\n /**\n * Generate a text completion\n */\n complete(prompt: string, options?: CompletionOptions): Promise<string>;\n\n /**\n * Generate a structured JSON completion with Zod validation\n */\n completeJSON<T>(prompt: string, schema: z.ZodType<T>, options?: CompletionOptions): Promise<T>;\n}\n\n/**\n * Provider configuration for Anthropic\n */\nexport interface AnthropicConfig {\n apiKey?: string; // Falls back to ANTHROPIC_API_KEY env var\n model?: string; // Default: claude-3-haiku-20240307\n baseUrl?: string;\n}\n\n/**\n * Provider configuration for OpenAI-compatible APIs\n * Works with: OpenAI, Ollama, LM Studio, LocalAI, vLLM, etc.\n */\nexport interface OpenAICompatibleConfig {\n apiKey?: string; // Falls back to OPENAI_API_KEY env var\n model?: string; // Default: gpt-4o-mini\n baseUrl?: string; // Default: https://api.openai.com/v1\n}\n\n/**\n * Enhancement result types\n */\nexport interface SummaryResult {\n summary: string;\n}\n\nexport interface TagsResult {\n tags: string[];\n}\n\nexport interface EntitiesResult {\n people: string[];\n organizations: string[];\n technologies: string[];\n locations: string[];\n concepts: string[];\n}\n\nexport interface ClassifyResult {\n contentType: string;\n confidence: number;\n}\n\n/**\n * Zod schemas for LLM outputs\n */\nexport const SummarySchema = z.object({\n summary: z.string().describe('A concise 2-3 sentence summary of the content'),\n});\n\nexport const TagsSchema = z.object({\n tags: z.array(z.string()).describe('5-10 relevant tags/keywords'),\n});\n\nexport const EntitiesSchema = z.object({\n people: z.array(z.string()).describe('People mentioned'),\n organizations: z.array(z.string()).describe('Organizations/companies'),\n technologies: z.array(z.string()).describe('Technologies/tools/frameworks'),\n locations: 
z.array(z.string()).describe('Locations/places'),\n concepts: z.array(z.string()).describe('Key concepts/topics'),\n});\n\nexport const ClassifySchema = z.object({\n contentType: z\n .enum(['article', 'repo', 'docs', 'package', 'video', 'tool', 'product', 'unknown'])\n .describe('The type of content'),\n confidence: z.number().min(0).max(1).describe('Confidence score 0-1'),\n});\n","import { z } from 'zod';\nimport type {\n EnhancementType,\n ExtractedEntities,\n ExtractionSchema,\n ScrapedData,\n} from '@/core/types.js';\nimport type { LLMProvider } from './types.js';\nimport { ClassifySchema, EntitiesSchema, SummarySchema, TagsSchema } from './types.js';\n\n/**\n * Enhance scraped data with LLM-powered features\n */\nexport async function enhance(\n data: ScrapedData,\n provider: LLMProvider,\n types: EnhancementType[]\n): Promise<Partial<ScrapedData>> {\n const results: Partial<ScrapedData> = {};\n\n // Prepare content for LLM (use excerpt/textContent to save tokens)\n const content = data.excerpt || data.textContent.slice(0, 10000);\n const context = `Title: ${data.title}\\nURL: ${data.url}\\n\\nContent:\\n${content}`;\n\n // Run enhancements in parallel\n const promises: Promise<void>[] = [];\n\n if (types.includes('summarize')) {\n promises.push(\n summarize(context, provider).then((summary) => {\n results.summary = summary;\n })\n );\n }\n\n if (types.includes('tags')) {\n promises.push(\n extractTags(context, provider).then((tags) => {\n results.suggestedTags = tags;\n })\n );\n }\n\n if (types.includes('entities')) {\n promises.push(\n extractEntities(context, provider).then((entities) => {\n results.entities = entities;\n })\n );\n }\n\n if (types.includes('classify')) {\n promises.push(\n classify(context, provider).then((classification) => {\n if (classification.confidence > 0.7) {\n results.contentType = classification.contentType as ScrapedData['contentType'];\n }\n })\n );\n }\n\n await Promise.all(promises);\n\n return results;\n}\n\n/**\n * Options 
for the ask() function\n */\nexport interface AskOptions {\n /** Key to store the result under in custom field */\n key?: string;\n /** Schema for structured response */\n schema?: ExtractionSchema;\n}\n\n/**\n * Ask a custom question about the scraped content\n * Results are stored in the `custom` field of ScrapedData\n */\nexport async function ask(\n data: ScrapedData,\n provider: LLMProvider,\n prompt: string,\n options?: AskOptions\n): Promise<Partial<ScrapedData>> {\n const key = options?.key || 'response';\n const content = data.excerpt || data.textContent.slice(0, 10000);\n\n // Apply placeholder replacements\n const processedPrompt = applyPlaceholders(prompt, data, content);\n\n if (options?.schema) {\n // Use structured extraction\n const result = await extract(data, provider, options.schema, processedPrompt);\n return { custom: { [key]: result } };\n }\n\n // Simple string response\n const fullPrompt = prompt.includes('{{content}}')\n ? processedPrompt\n : `${processedPrompt}\\n\\nTitle: ${data.title}\\nURL: ${data.url}\\n\\nContent:\\n${content}`;\n\n const response = await provider.complete(fullPrompt);\n return { custom: { [key]: response } };\n}\n\n/**\n * Apply placeholder replacements to a prompt template\n */\nfunction applyPlaceholders(prompt: string, data: ScrapedData, content: string): string {\n const domain = (() => {\n try {\n return new URL(data.url).hostname;\n } catch {\n return '';\n }\n })();\n\n return prompt\n .replace(/\\{\\{title\\}\\}/g, data.title)\n .replace(/\\{\\{url\\}\\}/g, data.url)\n .replace(/\\{\\{content\\}\\}/g, content)\n .replace(/\\{\\{description\\}\\}/g, data.description || '')\n .replace(/\\{\\{excerpt\\}\\}/g, data.excerpt || '')\n .replace(/\\{\\{domain\\}\\}/g, domain);\n}\n\n/**\n * Extract structured data using LLM and a custom schema\n */\nexport async function extract<T>(\n data: ScrapedData,\n provider: LLMProvider,\n schema: ExtractionSchema,\n promptTemplate?: string\n): Promise<T> {\n // Convert simple 
schema to Zod schema\n const zodShape: Record<string, z.ZodTypeAny> = {};\n\n for (const [key, type] of Object.entries(schema)) {\n const isOptional = type.endsWith('?');\n const baseType = isOptional ? type.slice(0, -1) : type;\n\n let zodType: z.ZodTypeAny;\n switch (baseType) {\n case 'string':\n zodType = z.string();\n break;\n case 'number':\n zodType = z.number();\n break;\n case 'boolean':\n zodType = z.boolean();\n break;\n case 'string[]':\n zodType = z.array(z.string());\n break;\n case 'number[]':\n zodType = z.array(z.number());\n break;\n default:\n zodType = z.string();\n }\n\n zodShape[key] = isOptional ? zodType.optional() : zodType;\n }\n\n const zodSchema = z.object(zodShape) as unknown as z.ZodType<T>;\n\n const content = data.textContent.slice(0, 4000);\n\n let prompt: string;\n\n if (promptTemplate) {\n // Apply all placeholder replacements\n prompt = applyPlaceholders(promptTemplate, data, content);\n\n // If content wasn't included via placeholder, append it\n if (!promptTemplate.includes('{{content}}')) {\n prompt += `\\n\\nContext:\\n${content}`;\n }\n } else {\n prompt = `Extract the following information from this content:\n\nTitle: ${data.title}\nURL: ${data.url}\n\nContent:\n${content}\n\nExtract these fields:\n${Object.entries(schema)\n .map(([key, type]) => `- ${key} (${type})`)\n .join('\\n')}`;\n }\n\n return provider.completeJSON<T>(prompt, zodSchema as z.ZodType<T>);\n}\n\n/**\n * Generate a summary of the content\n */\nasync function summarize(context: string, provider: LLMProvider): Promise<string> {\n const prompt = `Summarize the following content in 2-3 concise sentences:\n\n${context}`;\n\n const result = await provider.completeJSON(prompt, SummarySchema);\n return result.summary;\n}\n\n/**\n * Extract relevant tags/keywords\n */\nasync function extractTags(context: string, provider: LLMProvider): Promise<string[]> {\n const prompt = `Extract 5-10 relevant tags or keywords from the following content. 
Focus on technologies, concepts, and topics mentioned:\n\n${context}`;\n\n const result = await provider.completeJSON(prompt, TagsSchema);\n return result.tags;\n}\n\n/**\n * Extract named entities from content\n */\nasync function extractEntities(context: string, provider: LLMProvider): Promise<ExtractedEntities> {\n const prompt = `Extract named entities from the following content. Identify people, organizations, technologies, locations, and key concepts:\n\n${context}`;\n\n return provider.completeJSON(prompt, EntitiesSchema);\n}\n\n/**\n * Classify content type using LLM\n */\nasync function classify(\n context: string,\n provider: LLMProvider\n): Promise<{ contentType: string; confidence: number }> {\n const prompt = `Classify the following content into one of these categories:\n- article: Blog post, news article, essay\n- repo: Code repository, open source project\n- docs: Documentation, API reference, guides\n- package: npm/pip package page\n- video: Video content, YouTube\n- tool: Software tool, web application\n- product: Commercial product, e-commerce\n\n${context}`;\n\n return provider.completeJSON(prompt, 
ClassifySchema);\n}\n"],"mappings":";;;;;;AAiBA,IAAa,cAAb,MAAa,oBAAoB,MAAM;CACrC,AAAgB;CAChB,AAAgB;CAEhB,YAAY,SAAiB,MAAuB,YAAqB,OAAe;AACtF,QAAM,SAAS,EAAE,OAAO,CAAC;AACzB,OAAK,OAAO;AACZ,OAAK,OAAO;AACZ,OAAK,aAAa;AAGlB,MAAI,MAAM,kBACR,OAAM,kBAAkB,MAAM,YAAY;;;;;CAO9C,OAAO,KAAK,OAAgB,OAAwB,gBAA6B;AAC/E,MAAI,iBAAiB,YACnB,QAAO;AAGT,MAAI,iBAAiB,MACnB,QAAO,IAAI,YAAY,MAAM,SAAS,MAAM,QAAW,MAAM;AAG/D,SAAO,IAAI,YAAY,OAAO,MAAM,EAAE,KAAK;;;;;CAM7C,cAAuB;AACrB,SAAO,KAAK,SAAS,kBAAkB,KAAK,SAAS;;;;;CAMvD,SAAkC;AAChC,SAAO;GACL,MAAM,KAAK;GACX,SAAS,KAAK;GACd,MAAM,KAAK;GACX,YAAY,KAAK;GACjB,OAAO,KAAK;GACb;;;;;;;;;ACSL,MAAa,gBAAgB,EAAE,OAAO,EACpC,SAAS,EAAE,QAAQ,CAAC,SAAS,gDAAgD,EAC9E,CAAC;AAEF,MAAa,aAAa,EAAE,OAAO,EACjC,MAAM,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC,SAAS,8BAA8B,EAClE,CAAC;AAEF,MAAa,iBAAiB,EAAE,OAAO;CACrC,QAAQ,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC,SAAS,mBAAmB;CACxD,eAAe,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC,SAAS,0BAA0B;CACtE,cAAc,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC,SAAS,gCAAgC;CAC3E,WAAW,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC,SAAS,mBAAmB;CAC3D,UAAU,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC,SAAS,sBAAsB;CAC9D,CAAC;AAEF,MAAa,iBAAiB,EAAE,OAAO;CACrC,aAAa,EACV,KAAK;EAAC;EAAW;EAAQ;EAAQ;EAAW;EAAS;EAAQ;EAAW;EAAU,CAAC,CACnF,SAAS,sBAAsB;CAClC,YAAY,EAAE,QAAQ,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,SAAS,uBAAuB;CACtE,CAAC;;;;;;;AClFF,eAAsB,QACpB,MACA,UACA,OAC+B;CAC/B,MAAMA,UAAgC,EAAE;CAGxC,MAAM,UAAU,KAAK,WAAW,KAAK,YAAY,MAAM,GAAG,IAAM;CAChE,MAAM,UAAU,UAAU,KAAK,MAAM,SAAS,KAAK,IAAI,gBAAgB;CAGvE,MAAMC,WAA4B,EAAE;AAEpC,KAAI,MAAM,SAAS,YAAY,CAC7B,UAAS,KACP,UAAU,SAAS,SAAS,CAAC,MAAM,YAAY;AAC7C,UAAQ,UAAU;GAClB,CACH;AAGH,KAAI,MAAM,SAAS,OAAO,CACxB,UAAS,KACP,YAAY,SAAS,SAAS,CAAC,MAAM,SAAS;AAC5C,UAAQ,gBAAgB;GACxB,CACH;AAGH,KAAI,MAAM,SAAS,WAAW,CAC5B,UAAS,KACP,gBAAgB,SAAS,SAAS,CAAC,MAAM,aAAa;AACpD,UAAQ,WAAW;GACnB,CACH;AAGH,KAAI,MAAM,SAAS,WAAW,CAC5B,UAAS,KACP,SAAS,SAAS,SAAS,CAAC,MAAM,mBAAmB;AACnD,MAAI,eAAe,aAAa,GAC9B,SAAQ,cAAc,eAAe;GAEvC,CACH;AAGH,OAAM,QAAQ,IAAI,SAAS;AAE3B,QAAO;;;;;;AAiBT,eAAsB,IACpB,MACA,UACA,QACA,SAC+B;CAC/B,MAAM,MAAM,SAAS,OAAO;CAC5B,MAAM,UAAU,KAAK
,WAAW,KAAK,YAAY,MAAM,GAAG,IAAM;CAGhE,MAAM,kBAAkB,kBAAkB,QAAQ,MAAM,QAAQ;AAEhE,KAAI,SAAS,QAAQ;EAEnB,MAAM,SAAS,MAAM,QAAQ,MAAM,UAAU,QAAQ,QAAQ,gBAAgB;AAC7E,SAAO,EAAE,QAAQ,GAAG,MAAM,QAAQ,EAAE;;CAItC,MAAM,aAAa,OAAO,SAAS,cAAc,GAC7C,kBACA,GAAG,gBAAgB,aAAa,KAAK,MAAM,SAAS,KAAK,IAAI,gBAAgB;CAEjF,MAAM,WAAW,MAAM,SAAS,SAAS,WAAW;AACpD,QAAO,EAAE,QAAQ,GAAG,MAAM,UAAU,EAAE;;;;;AAMxC,SAAS,kBAAkB,QAAgB,MAAmB,SAAyB;CACrF,MAAM,gBAAgB;AACpB,MAAI;AACF,UAAO,IAAI,IAAI,KAAK,IAAI,CAAC;UACnB;AACN,UAAO;;KAEP;AAEJ,QAAO,OACJ,QAAQ,kBAAkB,KAAK,MAAM,CACrC,QAAQ,gBAAgB,KAAK,IAAI,CACjC,QAAQ,oBAAoB,QAAQ,CACpC,QAAQ,wBAAwB,KAAK,eAAe,GAAG,CACvD,QAAQ,oBAAoB,KAAK,WAAW,GAAG,CAC/C,QAAQ,mBAAmB,OAAO;;;;;AAMvC,eAAsB,QACpB,MACA,UACA,QACA,gBACY;CAEZ,MAAMC,WAAyC,EAAE;AAEjD,MAAK,MAAM,CAAC,KAAK,SAAS,OAAO,QAAQ,OAAO,EAAE;EAChD,MAAM,aAAa,KAAK,SAAS,IAAI;EACrC,MAAM,WAAW,aAAa,KAAK,MAAM,GAAG,GAAG,GAAG;EAElD,IAAIC;AACJ,UAAQ,UAAR;GACE,KAAK;AACH,cAAU,EAAE,QAAQ;AACpB;GACF,KAAK;AACH,cAAU,EAAE,QAAQ;AACpB;GACF,KAAK;AACH,cAAU,EAAE,SAAS;AACrB;GACF,KAAK;AACH,cAAU,EAAE,MAAM,EAAE,QAAQ,CAAC;AAC7B;GACF,KAAK;AACH,cAAU,EAAE,MAAM,EAAE,QAAQ,CAAC;AAC7B;GACF,QACE,WAAU,EAAE,QAAQ;;AAGxB,WAAS,OAAO,aAAa,QAAQ,UAAU,GAAG;;CAGpD,MAAM,YAAY,EAAE,OAAO,SAAS;CAEpC,MAAM,UAAU,KAAK,YAAY,MAAM,GAAG,IAAK;CAE/C,IAAIC;AAEJ,KAAI,gBAAgB;AAElB,WAAS,kBAAkB,gBAAgB,MAAM,QAAQ;AAGzD,MAAI,CAAC,eAAe,SAAS,cAAc,CACzC,WAAU,iBAAiB;OAG7B,UAAS;;SAEJ,KAAK,MAAM;OACb,KAAK,IAAI;;;EAGd,QAAQ;;;EAGR,OAAO,QAAQ,OAAO,CACrB,KAAK,CAAC,KAAK,UAAU,KAAK,IAAI,IAAI,KAAK,GAAG,CAC1C,KAAK,KAAK;AAGX,QAAO,SAAS,aAAgB,QAAQ,UAA0B;;;;;AAMpE,eAAe,UAAU,SAAiB,UAAwC;CAChF,MAAM,SAAS;;EAEf;AAGA,SADe,MAAM,SAAS,aAAa,QAAQ,cAAc,EACnD;;;;;AAMhB,eAAe,YAAY,SAAiB,UAA0C;CACpF,MAAM,SAAS;;EAEf;AAGA,SADe,MAAM,SAAS,aAAa,QAAQ,WAAW,EAChD;;;;;AAMhB,eAAe,gBAAgB,SAAiB,UAAmD;CACjG,MAAM,SAAS;;EAEf;AAEA,QAAO,SAAS,aAAa,QAAQ,eAAe;;;;;AAMtD,eAAe,SACb,SACA,UACsD;CACtD,MAAM,SAAS;;;;;;;;;EASf;AAEA,QAAO,SAAS,aAAa,QAAQ,eAAe"}
@@ -0,0 +1,268 @@
1
+ const require_index = require('./index.cjs');
2
+ let zod = require("zod");
3
+
4
+ //#region src/core/errors.ts
5
/**
 * Error raised for scraping failures. Besides the usual message it carries a
 * machine-readable `code` and, when one is supplied, a `statusCode`.
 */
var ScrapeError = class ScrapeError extends Error {
	code;
	statusCode;

	/**
	 * @param {string} message - Human-readable description of the failure.
	 * @param {string} code - Error code, e.g. "FETCH_FAILED" or "TIMEOUT".
	 * @param {number} [statusCode] - Status code associated with the failure, if any.
	 * @param {Error} [cause] - Underlying error, forwarded to `Error` as `cause`.
	 */
	constructor(message, code, statusCode, cause) {
		super(message, { cause });
		Object.assign(this, { name: "ScrapeError", code, statusCode });
		// captureStackTrace only exists on some engines (V8); skip it elsewhere.
		Error.captureStackTrace?.(this, ScrapeError);
	}

	/**
	 * Normalise any thrown value into a ScrapeError.
	 *
	 * An existing ScrapeError is returned untouched; an `Error` is wrapped and
	 * kept as the cause; anything else is stringified into the message.
	 *
	 * @param {unknown} error - The value that was thrown.
	 * @param {string} [code="FETCH_FAILED"] - Code to assign when wrapping.
	 * @returns {ScrapeError}
	 */
	static from(error, code = "FETCH_FAILED") {
		if (error instanceof ScrapeError) return error;
		return error instanceof Error
			? new ScrapeError(error.message, code, undefined, error)
			: new ScrapeError(String(error), code);
	}

	/**
	 * Whether the failure is worth retrying: true only for the
	 * "FETCH_FAILED" and "TIMEOUT" codes.
	 *
	 * @returns {boolean}
	 */
	isRetryable() {
		return ["FETCH_FAILED", "TIMEOUT"].includes(this.code);
	}

	/**
	 * Plain-object form used by `JSON.stringify`.
	 *
	 * @returns {{name: string, message: string, code: string, statusCode: (number|undefined), stack: (string|undefined)}}
	 */
	toJSON() {
		const { name, message, code, statusCode, stack } = this;
		return { name, message, code, statusCode, stack };
	}
};
45
+
46
+ //#endregion
47
+ //#region src/llm/types.ts
48
/**
 * Zod schemas for the JSON each LLM enhancement is asked to return.
 * The `.describe()` text is attached to the field it follows.
 */
const SummarySchema = zod.z.object({
	summary: zod.z.string().describe("A concise 2-3 sentence summary of the content"),
});

const TagsSchema = zod.z.object({
	tags: zod.z.array(zod.z.string()).describe("5-10 relevant tags/keywords"),
});

// Every entity bucket is a list of strings; only the description differs.
const EntitiesSchema = zod.z.object(
	Object.fromEntries(
		[
			["people", "People mentioned"],
			["organizations", "Organizations/companies"],
			["technologies", "Technologies/tools/frameworks"],
			["locations", "Locations/places"],
			["concepts", "Key concepts/topics"],
		].map(([field, description]) => [field, zod.z.array(zod.z.string()).describe(description)]),
	),
);

const ClassifySchema = zod.z.object({
	contentType: zod.z
		.enum(["article", "repo", "docs", "package", "video", "tool", "product", "unknown"])
		.describe("The type of content"),
	// Bounded to the closed interval [0, 1].
	confidence: zod.z.number().min(0).max(1).describe("Confidence score 0-1"),
});
73
+
74
+ //#endregion
75
+ //#region src/llm/enhancer.ts
76
/**
 * Enhance scraped data with LLM-powered features.
 *
 * Builds one shared context (title, URL, and either the excerpt or the first
 * 10,000 characters of the text content) and runs every requested
 * enhancement against it concurrently.
 *
 * @param {object} data - Scraped page; reads `title`, `url`, `excerpt`, `textContent`.
 * @param {object} provider - LLM provider handed to each enhancement helper.
 * @param {string[]} types - Any of "summarize", "tags", "entities", "classify".
 * @returns {Promise<object>} Object with `summary`, `suggestedTags`, `entities`
 *   and/or `contentType`, depending on what was requested.
 */
async function enhance(data, provider, types) {
	const results = {};
	const content = data.excerpt || data.textContent.slice(0, 1e4);
	const context = `Title: ${data.title}\nURL: ${data.url}\n\nContent:\n${content}`;

	// [requested type, helper to run, how to record its result]
	const tasks = [
		["summarize", summarize, (summary) => {
			results.summary = summary;
		}],
		["tags", extractTags, (tags) => {
			results.suggestedTags = tags;
		}],
		["entities", extractEntities, (entities) => {
			results.entities = entities;
		}],
		["classify", classify, (verdict) => {
			// Only accept a classification the model is reasonably sure about.
			if (verdict.confidence > 0.7) results.contentType = verdict.contentType;
		}],
	];

	const pending = [];
	for (const [type, run, store] of tasks) {
		if (types.includes(type)) pending.push(run(context, provider).then(store));
	}

	// Rejects as soon as any single enhancement fails.
	await Promise.all(pending);
	return results;
}
99
/**
 * Ask a custom question about the scraped content.
 *
 * The answer is returned under `custom[key]` (key defaults to "response").
 * With `options.schema` the question is answered through `extract()` and the
 * value is the structured result; otherwise it is the provider's plain-text
 * completion.
 *
 * @param {object} data - Scraped page; reads `title`, `url`, `excerpt`, `textContent`.
 * @param {object} provider - LLM provider; `complete(prompt)` is used for plain answers.
 * @param {string} prompt - Question or template; may contain `{{...}}` placeholders.
 * @param {{key?: string, schema?: object}} [options] - Result key and optional extraction schema.
 * @returns {Promise<{custom: object}>}
 */
async function ask(data, provider, prompt, options) {
	const key = options?.key || "response";

	if (options?.schema) {
		// Hand extract() the raw template. It substitutes placeholders itself and
		// appends the page text only when the template lacks {{content}}. Passing
		// an already-substituted prompt hid that marker, so the page text was sent
		// twice and the substituted text was scanned for placeholders again.
		const result = await extract(data, provider, options.schema, prompt);
		return { custom: { [key]: result } };
	}

	const content = data.excerpt || data.textContent.slice(0, 1e4);
	const processedPrompt = applyPlaceholders(prompt, data, content);
	// If the template did not place the content itself, append title/URL/content.
	const fullPrompt = prompt.includes("{{content}}")
		? processedPrompt
		: `${processedPrompt}\n\nTitle: ${data.title}\nURL: ${data.url}\n\nContent:\n${content}`;
	const response = await provider.complete(fullPrompt);
	return { custom: { [key]: response } };
}
115
/**
 * Fill `{{title}}`, `{{url}}`, `{{content}}`, `{{description}}`,
 * `{{excerpt}}` and `{{domain}}` placeholders in a prompt template.
 *
 * Substitution happens in a single pass with a replacer function, so:
 *  - `$&`, `$'`, `$$` and similar sequences in scraped text are inserted
 *    literally instead of being treated as replacement patterns, and
 *  - placeholder-looking text inside an inserted value is not expanded again.
 * Missing values render as an empty string.
 *
 * @param {string} prompt - Template text.
 * @param {object} data - Scraped page; reads `title`, `url`, `description`, `excerpt`.
 * @param {string} content - Text substituted for `{{content}}`.
 * @returns {string} The template with all known placeholders replaced.
 */
function applyPlaceholders(prompt, data, content) {
	let domain = "";
	try {
		domain = new URL(data.url).hostname;
	} catch {
		// Best effort: an unparseable URL leaves {{domain}} empty.
	}

	const values = new Map([
		["title", data.title],
		["url", data.url],
		["content", content],
		["description", data.description],
		["excerpt", data.excerpt],
		["domain", domain],
	]);

	return prompt.replace(
		/\{\{(title|url|content|description|excerpt|domain)\}\}/g,
		(_match, name) => String(values.get(name) ?? ""),
	);
}
128
/**
 * Extract structured data using the LLM and a simple field schema.
 *
 * Each schema value is a type name ("string", "number", "boolean",
 * "string[]", "number[]"), optionally suffixed with "?" to make the field
 * optional. Unrecognised type names are treated as "string".
 *
 * @param {object} data - Scraped page; reads `title`, `url`, `textContent`.
 * @param {object} provider - LLM provider; `completeJSON(prompt, schema)` is called.
 * @param {Object<string, string>} schema - Field name to type-name mapping.
 * @param {string} [promptTemplate] - Custom prompt; placeholders are filled in,
 *   and the content is appended when the template has no `{{content}}`.
 * @returns {Promise<object>} Whatever `provider.completeJSON` resolves with.
 */
async function extract(data, provider, schema, promptTemplate) {
	const builders = new Map([
		["string", () => zod.z.string()],
		["number", () => zod.z.number()],
		["boolean", () => zod.z.boolean()],
		["string[]", () => zod.z.array(zod.z.string())],
		["number[]", () => zod.z.array(zod.z.number())],
	]);

	const zodShape = {};
	for (const [field, spec] of Object.entries(schema)) {
		const optional = spec.endsWith("?");
		const baseName = optional ? spec.slice(0, -1) : spec;
		// Unknown type names fall back to a plain string field.
		const build = builders.get(baseName) ?? builders.get("string");
		const fieldType = build();
		zodShape[field] = optional ? fieldType.optional() : fieldType;
	}
	const zodSchema = zod.z.object(zodShape);

	// Only the first 4,000 characters of the page text are sent.
	const content = data.textContent.slice(0, 4e3);

	let prompt;
	if (promptTemplate) {
		const filled = applyPlaceholders(promptTemplate, data, content);
		prompt = promptTemplate.includes("{{content}}") ? filled : `${filled}\n\nContext:\n${content}`;
	} else {
		const fieldList = Object.entries(schema)
			.map(([key, type]) => `- ${key} (${type})`)
			.join("\n");
		prompt = [
			"Extract the following information from this content:",
			"",
			`Title: ${data.title}`,
			`URL: ${data.url}`,
			"",
			"Content:",
			content,
			"",
			"Extract these fields:",
			fieldList,
		].join("\n");
	}

	return provider.completeJSON(prompt, zodSchema);
}
175
/**
 * Ask the provider for a 2-3 sentence summary of the given context.
 *
 * @param {string} context - Prepared page context (title, URL, content).
 * @param {object} provider - LLM provider; `completeJSON` is called with SummarySchema.
 * @returns {Promise<string>} The `summary` field of the provider's response.
 */
async function summarize(context, provider) {
	const instruction = "Summarize the following content in 2-3 concise sentences:";
	const { summary } = await provider.completeJSON(`${instruction}\n\n${context}`, SummarySchema);
	return summary;
}
184
/**
 * Ask the provider for 5-10 tags/keywords describing the given context.
 *
 * @param {string} context - Text to tag; appended after the instruction.
 * @param {object} provider - LLM provider exposing `completeJSON(prompt, schema)`.
 * @returns {Promise<*>} The `tags` field of the provider's response.
 */
async function extractTags(context, provider) {
  const instruction = "Extract 5-10 relevant tags or keywords from the following content. Focus on technologies, concepts, and topics mentioned:";
  const request = `${instruction}\n\n${context}`;
  const { tags } = await provider.completeJSON(request, TagsSchema);
  return tags;
}
193
/**
 * Ask the provider to pull named entities (people, organizations,
 * technologies, locations, key concepts) out of the given context.
 *
 * @param {string} context - Text to analyse; appended after the instruction.
 * @param {object} provider - LLM provider exposing `completeJSON(prompt, schema)`.
 * @returns {Promise<*>} The provider's response, requested against `EntitiesSchema`.
 */
async function extractEntities(context, provider) {
  const instruction = "Extract named entities from the following content. Identify people, organizations, technologies, locations, and key concepts:";
  const request = `${instruction}\n\n${context}`;
  const entities = await provider.completeJSON(request, EntitiesSchema);
  return entities;
}
202
/**
 * Classify content into one of the content-type categories using the LLM.
 *
 * The prompt enumerates every category that `ClassifySchema` accepts,
 * including `unknown`, so the model has an explicit option when none of the
 * concrete categories fits instead of being pushed towards a wrong label.
 *
 * @param {string} context - Text to classify; appended after the category list.
 * @param {object} provider - LLM provider exposing `completeJSON(prompt, schema)`.
 * @returns {Promise<*>} The provider's response, requested against `ClassifySchema`.
 */
async function classify(context, provider) {
  const categories = [
    "- article: Blog post, news article, essay",
    "- repo: Code repository, open source project",
    "- docs: Documentation, API reference, guides",
    "- package: npm/pip package page",
    "- video: Video content, YouTube",
    "- tool: Software tool, web application",
    "- product: Commercial product, e-commerce",
    // Keeps the prompt in step with the schema's enum, which allows 'unknown'.
    "- unknown: None of the above, or not enough information to decide",
  ];
  const prompt = [
    "Classify the following content into one of these categories:",
    ...categories,
    "",
    `${context}`,
  ].join("\n");
  return provider.completeJSON(prompt, ClassifySchema);
}
218
+
219
+ //#endregion
220
// CommonJS export table for this chunk: each call below defines an enumerable
// getter on `exports` that returns the current value of the same-named
// binding (ClassifySchema, EntitiesSchema, ScrapeError, SummarySchema,
// TagsSchema, ask, enhance, extract).
// NOTE(review): this looks like bundler-generated interop code; presumably the
// literal one-call-per-export shape is what lets tooling detect the export
// names statically - confirm before collapsing it into a loop.
+ Object.defineProperty(exports, 'ClassifySchema', {
221
+ enumerable: true,
222
+ get: function () {
223
+ return ClassifySchema;
224
+ }
225
+ });
226
+ Object.defineProperty(exports, 'EntitiesSchema', {
227
+ enumerable: true,
228
+ get: function () {
229
+ return EntitiesSchema;
230
+ }
231
+ });
232
+ Object.defineProperty(exports, 'ScrapeError', {
233
+ enumerable: true,
234
+ get: function () {
235
+ return ScrapeError;
236
+ }
237
+ });
238
+ Object.defineProperty(exports, 'SummarySchema', {
239
+ enumerable: true,
240
+ get: function () {
241
+ return SummarySchema;
242
+ }
243
+ });
244
+ Object.defineProperty(exports, 'TagsSchema', {
245
+ enumerable: true,
246
+ get: function () {
247
+ return TagsSchema;
248
+ }
249
+ });
250
+ Object.defineProperty(exports, 'ask', {
251
+ enumerable: true,
252
+ get: function () {
253
+ return ask;
254
+ }
255
+ });
256
+ Object.defineProperty(exports, 'enhance', {
257
+ enumerable: true,
258
+ get: function () {
259
+ return enhance;
260
+ }
261
+ });
262
+ Object.defineProperty(exports, 'extract', {
263
+ enumerable: true,
264
+ get: function () {
265
+ return extract;
266
+ }
267
+ });
268
+ //# sourceMappingURL=enhancer-oM4BhYYS.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"enhancer-oM4BhYYS.cjs","names":["z","results: Partial<ScrapedData>","promises: Promise<void>[]","zodShape: Record<string, z.ZodTypeAny>","zodType: z.ZodTypeAny","z","prompt: string"],"sources":["../src/core/errors.ts","../src/llm/types.ts","../src/llm/enhancer.ts"],"sourcesContent":["/**\n * Error codes for scraping failures\n */\nexport type ScrapeErrorCode =\n | 'FETCH_FAILED'\n | 'TIMEOUT'\n | 'INVALID_URL'\n | 'BLOCKED'\n | 'NOT_FOUND'\n | 'ROBOTS_BLOCKED'\n | 'PARSE_ERROR'\n | 'LLM_ERROR'\n | 'VALIDATION_ERROR';\n\n/**\n * Custom error class for scraping failures with structured error codes\n */\nexport class ScrapeError extends Error {\n public readonly code: ScrapeErrorCode;\n public readonly statusCode?: number;\n\n constructor(message: string, code: ScrapeErrorCode, statusCode?: number, cause?: Error) {\n super(message, { cause });\n this.name = 'ScrapeError';\n this.code = code;\n this.statusCode = statusCode;\n\n // Maintains proper stack trace in V8 environments\n if (Error.captureStackTrace) {\n Error.captureStackTrace(this, ScrapeError);\n }\n }\n\n /**\n * Create a ScrapeError from an unknown error\n */\n static from(error: unknown, code: ScrapeErrorCode = 'FETCH_FAILED'): ScrapeError {\n if (error instanceof ScrapeError) {\n return error;\n }\n\n if (error instanceof Error) {\n return new ScrapeError(error.message, code, undefined, error);\n }\n\n return new ScrapeError(String(error), code);\n }\n\n /**\n * Check if error is retryable (network issues, timeouts)\n */\n isRetryable(): boolean {\n return this.code === 'FETCH_FAILED' || this.code === 'TIMEOUT';\n }\n\n /**\n * Convert to a plain object for serialization\n */\n toJSON(): Record<string, unknown> {\n return {\n name: this.name,\n message: this.message,\n code: this.code,\n statusCode: this.statusCode,\n stack: this.stack,\n };\n }\n}\n","import { z } from 'zod';\n\n/**\n * LLM completion options\n */\nexport interface CompletionOptions {\n maxTokens?: number;\n 
temperature?: number;\n systemPrompt?: string;\n}\n\n/**\n * LLM Provider interface - implemented by all providers\n */\nexport interface LLMProvider {\n readonly name: string;\n\n /**\n * Generate a text completion\n */\n complete(prompt: string, options?: CompletionOptions): Promise<string>;\n\n /**\n * Generate a structured JSON completion with Zod validation\n */\n completeJSON<T>(prompt: string, schema: z.ZodType<T>, options?: CompletionOptions): Promise<T>;\n}\n\n/**\n * Provider configuration for Anthropic\n */\nexport interface AnthropicConfig {\n apiKey?: string; // Falls back to ANTHROPIC_API_KEY env var\n model?: string; // Default: claude-3-haiku-20240307\n baseUrl?: string;\n}\n\n/**\n * Provider configuration for OpenAI-compatible APIs\n * Works with: OpenAI, Ollama, LM Studio, LocalAI, vLLM, etc.\n */\nexport interface OpenAICompatibleConfig {\n apiKey?: string; // Falls back to OPENAI_API_KEY env var\n model?: string; // Default: gpt-4o-mini\n baseUrl?: string; // Default: https://api.openai.com/v1\n}\n\n/**\n * Enhancement result types\n */\nexport interface SummaryResult {\n summary: string;\n}\n\nexport interface TagsResult {\n tags: string[];\n}\n\nexport interface EntitiesResult {\n people: string[];\n organizations: string[];\n technologies: string[];\n locations: string[];\n concepts: string[];\n}\n\nexport interface ClassifyResult {\n contentType: string;\n confidence: number;\n}\n\n/**\n * Zod schemas for LLM outputs\n */\nexport const SummarySchema = z.object({\n summary: z.string().describe('A concise 2-3 sentence summary of the content'),\n});\n\nexport const TagsSchema = z.object({\n tags: z.array(z.string()).describe('5-10 relevant tags/keywords'),\n});\n\nexport const EntitiesSchema = z.object({\n people: z.array(z.string()).describe('People mentioned'),\n organizations: z.array(z.string()).describe('Organizations/companies'),\n technologies: z.array(z.string()).describe('Technologies/tools/frameworks'),\n locations: 
z.array(z.string()).describe('Locations/places'),\n concepts: z.array(z.string()).describe('Key concepts/topics'),\n});\n\nexport const ClassifySchema = z.object({\n contentType: z\n .enum(['article', 'repo', 'docs', 'package', 'video', 'tool', 'product', 'unknown'])\n .describe('The type of content'),\n confidence: z.number().min(0).max(1).describe('Confidence score 0-1'),\n});\n","import { z } from 'zod';\nimport type {\n EnhancementType,\n ExtractedEntities,\n ExtractionSchema,\n ScrapedData,\n} from '@/core/types.js';\nimport type { LLMProvider } from './types.js';\nimport { ClassifySchema, EntitiesSchema, SummarySchema, TagsSchema } from './types.js';\n\n/**\n * Enhance scraped data with LLM-powered features\n */\nexport async function enhance(\n data: ScrapedData,\n provider: LLMProvider,\n types: EnhancementType[]\n): Promise<Partial<ScrapedData>> {\n const results: Partial<ScrapedData> = {};\n\n // Prepare content for LLM (use excerpt/textContent to save tokens)\n const content = data.excerpt || data.textContent.slice(0, 10000);\n const context = `Title: ${data.title}\\nURL: ${data.url}\\n\\nContent:\\n${content}`;\n\n // Run enhancements in parallel\n const promises: Promise<void>[] = [];\n\n if (types.includes('summarize')) {\n promises.push(\n summarize(context, provider).then((summary) => {\n results.summary = summary;\n })\n );\n }\n\n if (types.includes('tags')) {\n promises.push(\n extractTags(context, provider).then((tags) => {\n results.suggestedTags = tags;\n })\n );\n }\n\n if (types.includes('entities')) {\n promises.push(\n extractEntities(context, provider).then((entities) => {\n results.entities = entities;\n })\n );\n }\n\n if (types.includes('classify')) {\n promises.push(\n classify(context, provider).then((classification) => {\n if (classification.confidence > 0.7) {\n results.contentType = classification.contentType as ScrapedData['contentType'];\n }\n })\n );\n }\n\n await Promise.all(promises);\n\n return results;\n}\n\n/**\n * Options 
for the ask() function\n */\nexport interface AskOptions {\n /** Key to store the result under in custom field */\n key?: string;\n /** Schema for structured response */\n schema?: ExtractionSchema;\n}\n\n/**\n * Ask a custom question about the scraped content\n * Results are stored in the `custom` field of ScrapedData\n */\nexport async function ask(\n data: ScrapedData,\n provider: LLMProvider,\n prompt: string,\n options?: AskOptions\n): Promise<Partial<ScrapedData>> {\n const key = options?.key || 'response';\n const content = data.excerpt || data.textContent.slice(0, 10000);\n\n // Apply placeholder replacements\n const processedPrompt = applyPlaceholders(prompt, data, content);\n\n if (options?.schema) {\n // Use structured extraction\n const result = await extract(data, provider, options.schema, processedPrompt);\n return { custom: { [key]: result } };\n }\n\n // Simple string response\n const fullPrompt = prompt.includes('{{content}}')\n ? processedPrompt\n : `${processedPrompt}\\n\\nTitle: ${data.title}\\nURL: ${data.url}\\n\\nContent:\\n${content}`;\n\n const response = await provider.complete(fullPrompt);\n return { custom: { [key]: response } };\n}\n\n/**\n * Apply placeholder replacements to a prompt template\n */\nfunction applyPlaceholders(prompt: string, data: ScrapedData, content: string): string {\n const domain = (() => {\n try {\n return new URL(data.url).hostname;\n } catch {\n return '';\n }\n })();\n\n return prompt\n .replace(/\\{\\{title\\}\\}/g, data.title)\n .replace(/\\{\\{url\\}\\}/g, data.url)\n .replace(/\\{\\{content\\}\\}/g, content)\n .replace(/\\{\\{description\\}\\}/g, data.description || '')\n .replace(/\\{\\{excerpt\\}\\}/g, data.excerpt || '')\n .replace(/\\{\\{domain\\}\\}/g, domain);\n}\n\n/**\n * Extract structured data using LLM and a custom schema\n */\nexport async function extract<T>(\n data: ScrapedData,\n provider: LLMProvider,\n schema: ExtractionSchema,\n promptTemplate?: string\n): Promise<T> {\n // Convert simple 
schema to Zod schema\n const zodShape: Record<string, z.ZodTypeAny> = {};\n\n for (const [key, type] of Object.entries(schema)) {\n const isOptional = type.endsWith('?');\n const baseType = isOptional ? type.slice(0, -1) : type;\n\n let zodType: z.ZodTypeAny;\n switch (baseType) {\n case 'string':\n zodType = z.string();\n break;\n case 'number':\n zodType = z.number();\n break;\n case 'boolean':\n zodType = z.boolean();\n break;\n case 'string[]':\n zodType = z.array(z.string());\n break;\n case 'number[]':\n zodType = z.array(z.number());\n break;\n default:\n zodType = z.string();\n }\n\n zodShape[key] = isOptional ? zodType.optional() : zodType;\n }\n\n const zodSchema = z.object(zodShape) as unknown as z.ZodType<T>;\n\n const content = data.textContent.slice(0, 4000);\n\n let prompt: string;\n\n if (promptTemplate) {\n // Apply all placeholder replacements\n prompt = applyPlaceholders(promptTemplate, data, content);\n\n // If content wasn't included via placeholder, append it\n if (!promptTemplate.includes('{{content}}')) {\n prompt += `\\n\\nContext:\\n${content}`;\n }\n } else {\n prompt = `Extract the following information from this content:\n\nTitle: ${data.title}\nURL: ${data.url}\n\nContent:\n${content}\n\nExtract these fields:\n${Object.entries(schema)\n .map(([key, type]) => `- ${key} (${type})`)\n .join('\\n')}`;\n }\n\n return provider.completeJSON<T>(prompt, zodSchema as z.ZodType<T>);\n}\n\n/**\n * Generate a summary of the content\n */\nasync function summarize(context: string, provider: LLMProvider): Promise<string> {\n const prompt = `Summarize the following content in 2-3 concise sentences:\n\n${context}`;\n\n const result = await provider.completeJSON(prompt, SummarySchema);\n return result.summary;\n}\n\n/**\n * Extract relevant tags/keywords\n */\nasync function extractTags(context: string, provider: LLMProvider): Promise<string[]> {\n const prompt = `Extract 5-10 relevant tags or keywords from the following content. 
Focus on technologies, concepts, and topics mentioned:\n\n${context}`;\n\n const result = await provider.completeJSON(prompt, TagsSchema);\n return result.tags;\n}\n\n/**\n * Extract named entities from content\n */\nasync function extractEntities(context: string, provider: LLMProvider): Promise<ExtractedEntities> {\n const prompt = `Extract named entities from the following content. Identify people, organizations, technologies, locations, and key concepts:\n\n${context}`;\n\n return provider.completeJSON(prompt, EntitiesSchema);\n}\n\n/**\n * Classify content type using LLM\n */\nasync function classify(\n context: string,\n provider: LLMProvider\n): Promise<{ contentType: string; confidence: number }> {\n const prompt = `Classify the following content into one of these categories:\n- article: Blog post, news article, essay\n- repo: Code repository, open source project\n- docs: Documentation, API reference, guides\n- package: npm/pip package page\n- video: Video content, YouTube\n- tool: Software tool, web application\n- product: Commercial product, e-commerce\n\n${context}`;\n\n return provider.completeJSON(prompt, 
ClassifySchema);\n}\n"],"mappings":";;;;;;;AAiBA,IAAa,cAAb,MAAa,oBAAoB,MAAM;CACrC,AAAgB;CAChB,AAAgB;CAEhB,YAAY,SAAiB,MAAuB,YAAqB,OAAe;AACtF,QAAM,SAAS,EAAE,OAAO,CAAC;AACzB,OAAK,OAAO;AACZ,OAAK,OAAO;AACZ,OAAK,aAAa;AAGlB,MAAI,MAAM,kBACR,OAAM,kBAAkB,MAAM,YAAY;;;;;CAO9C,OAAO,KAAK,OAAgB,OAAwB,gBAA6B;AAC/E,MAAI,iBAAiB,YACnB,QAAO;AAGT,MAAI,iBAAiB,MACnB,QAAO,IAAI,YAAY,MAAM,SAAS,MAAM,QAAW,MAAM;AAG/D,SAAO,IAAI,YAAY,OAAO,MAAM,EAAE,KAAK;;;;;CAM7C,cAAuB;AACrB,SAAO,KAAK,SAAS,kBAAkB,KAAK,SAAS;;;;;CAMvD,SAAkC;AAChC,SAAO;GACL,MAAM,KAAK;GACX,SAAS,KAAK;GACd,MAAM,KAAK;GACX,YAAY,KAAK;GACjB,OAAO,KAAK;GACb;;;;;;;;;ACSL,MAAa,gBAAgBA,MAAE,OAAO,EACpC,SAASA,MAAE,QAAQ,CAAC,SAAS,gDAAgD,EAC9E,CAAC;AAEF,MAAa,aAAaA,MAAE,OAAO,EACjC,MAAMA,MAAE,MAAMA,MAAE,QAAQ,CAAC,CAAC,SAAS,8BAA8B,EAClE,CAAC;AAEF,MAAa,iBAAiBA,MAAE,OAAO;CACrC,QAAQA,MAAE,MAAMA,MAAE,QAAQ,CAAC,CAAC,SAAS,mBAAmB;CACxD,eAAeA,MAAE,MAAMA,MAAE,QAAQ,CAAC,CAAC,SAAS,0BAA0B;CACtE,cAAcA,MAAE,MAAMA,MAAE,QAAQ,CAAC,CAAC,SAAS,gCAAgC;CAC3E,WAAWA,MAAE,MAAMA,MAAE,QAAQ,CAAC,CAAC,SAAS,mBAAmB;CAC3D,UAAUA,MAAE,MAAMA,MAAE,QAAQ,CAAC,CAAC,SAAS,sBAAsB;CAC9D,CAAC;AAEF,MAAa,iBAAiBA,MAAE,OAAO;CACrC,aAAaA,MACV,KAAK;EAAC;EAAW;EAAQ;EAAQ;EAAW;EAAS;EAAQ;EAAW;EAAU,CAAC,CACnF,SAAS,sBAAsB;CAClC,YAAYA,MAAE,QAAQ,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,SAAS,uBAAuB;CACtE,CAAC;;;;;;;AClFF,eAAsB,QACpB,MACA,UACA,OAC+B;CAC/B,MAAMC,UAAgC,EAAE;CAGxC,MAAM,UAAU,KAAK,WAAW,KAAK,YAAY,MAAM,GAAG,IAAM;CAChE,MAAM,UAAU,UAAU,KAAK,MAAM,SAAS,KAAK,IAAI,gBAAgB;CAGvE,MAAMC,WAA4B,EAAE;AAEpC,KAAI,MAAM,SAAS,YAAY,CAC7B,UAAS,KACP,UAAU,SAAS,SAAS,CAAC,MAAM,YAAY;AAC7C,UAAQ,UAAU;GAClB,CACH;AAGH,KAAI,MAAM,SAAS,OAAO,CACxB,UAAS,KACP,YAAY,SAAS,SAAS,CAAC,MAAM,SAAS;AAC5C,UAAQ,gBAAgB;GACxB,CACH;AAGH,KAAI,MAAM,SAAS,WAAW,CAC5B,UAAS,KACP,gBAAgB,SAAS,SAAS,CAAC,MAAM,aAAa;AACpD,UAAQ,WAAW;GACnB,CACH;AAGH,KAAI,MAAM,SAAS,WAAW,CAC5B,UAAS,KACP,SAAS,SAAS,SAAS,CAAC,MAAM,mBAAmB;AACnD,MAAI,eAAe,aAAa,GAC9B,SAAQ,cAAc,eAAe;GAEvC,CACH;AAGH,OAAM,QAAQ,IAAI,SAAS;AAE3B,QAAO;;;;;;AAiBT,eAAsB,IACpB,MACA,UACA,QACA,SAC+B;CAC/B,MAAM,MAAM,SAAS,OAAO;
CAC5B,MAAM,UAAU,KAAK,WAAW,KAAK,YAAY,MAAM,GAAG,IAAM;CAGhE,MAAM,kBAAkB,kBAAkB,QAAQ,MAAM,QAAQ;AAEhE,KAAI,SAAS,QAAQ;EAEnB,MAAM,SAAS,MAAM,QAAQ,MAAM,UAAU,QAAQ,QAAQ,gBAAgB;AAC7E,SAAO,EAAE,QAAQ,GAAG,MAAM,QAAQ,EAAE;;CAItC,MAAM,aAAa,OAAO,SAAS,cAAc,GAC7C,kBACA,GAAG,gBAAgB,aAAa,KAAK,MAAM,SAAS,KAAK,IAAI,gBAAgB;CAEjF,MAAM,WAAW,MAAM,SAAS,SAAS,WAAW;AACpD,QAAO,EAAE,QAAQ,GAAG,MAAM,UAAU,EAAE;;;;;AAMxC,SAAS,kBAAkB,QAAgB,MAAmB,SAAyB;CACrF,MAAM,gBAAgB;AACpB,MAAI;AACF,UAAO,IAAI,IAAI,KAAK,IAAI,CAAC;UACnB;AACN,UAAO;;KAEP;AAEJ,QAAO,OACJ,QAAQ,kBAAkB,KAAK,MAAM,CACrC,QAAQ,gBAAgB,KAAK,IAAI,CACjC,QAAQ,oBAAoB,QAAQ,CACpC,QAAQ,wBAAwB,KAAK,eAAe,GAAG,CACvD,QAAQ,oBAAoB,KAAK,WAAW,GAAG,CAC/C,QAAQ,mBAAmB,OAAO;;;;;AAMvC,eAAsB,QACpB,MACA,UACA,QACA,gBACY;CAEZ,MAAMC,WAAyC,EAAE;AAEjD,MAAK,MAAM,CAAC,KAAK,SAAS,OAAO,QAAQ,OAAO,EAAE;EAChD,MAAM,aAAa,KAAK,SAAS,IAAI;EACrC,MAAM,WAAW,aAAa,KAAK,MAAM,GAAG,GAAG,GAAG;EAElD,IAAIC;AACJ,UAAQ,UAAR;GACE,KAAK;AACH,cAAUC,MAAE,QAAQ;AACpB;GACF,KAAK;AACH,cAAUA,MAAE,QAAQ;AACpB;GACF,KAAK;AACH,cAAUA,MAAE,SAAS;AACrB;GACF,KAAK;AACH,cAAUA,MAAE,MAAMA,MAAE,QAAQ,CAAC;AAC7B;GACF,KAAK;AACH,cAAUA,MAAE,MAAMA,MAAE,QAAQ,CAAC;AAC7B;GACF,QACE,WAAUA,MAAE,QAAQ;;AAGxB,WAAS,OAAO,aAAa,QAAQ,UAAU,GAAG;;CAGpD,MAAM,YAAYA,MAAE,OAAO,SAAS;CAEpC,MAAM,UAAU,KAAK,YAAY,MAAM,GAAG,IAAK;CAE/C,IAAIC;AAEJ,KAAI,gBAAgB;AAElB,WAAS,kBAAkB,gBAAgB,MAAM,QAAQ;AAGzD,MAAI,CAAC,eAAe,SAAS,cAAc,CACzC,WAAU,iBAAiB;OAG7B,UAAS;;SAEJ,KAAK,MAAM;OACb,KAAK,IAAI;;;EAGd,QAAQ;;;EAGR,OAAO,QAAQ,OAAO,CACrB,KAAK,CAAC,KAAK,UAAU,KAAK,IAAI,IAAI,KAAK,GAAG,CAC1C,KAAK,KAAK;AAGX,QAAO,SAAS,aAAgB,QAAQ,UAA0B;;;;;AAMpE,eAAe,UAAU,SAAiB,UAAwC;CAChF,MAAM,SAAS;;EAEf;AAGA,SADe,MAAM,SAAS,aAAa,QAAQ,cAAc,EACnD;;;;;AAMhB,eAAe,YAAY,SAAiB,UAA0C;CACpF,MAAM,SAAS;;EAEf;AAGA,SADe,MAAM,SAAS,aAAa,QAAQ,WAAW,EAChD;;;;;AAMhB,eAAe,gBAAgB,SAAiB,UAAmD;CACjG,MAAM,SAAS;;EAEf;AAEA,QAAO,SAAS,aAAa,QAAQ,eAAe;;;;;AAMtD,eAAe,SACb,SACA,UACsD;CACtD,MAAM,SAAS;;;;;;;;;EASf;AAEA,QAAO,SAAS,aAAa,QAAQ,eAAe"}