scrapex 0.5.3 → 1.0.0-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +392 -145
- package/dist/enhancer-Q6CSc1gA.mjs +220 -0
- package/dist/enhancer-Q6CSc1gA.mjs.map +1 -0
- package/dist/enhancer-oM4BhYYS.cjs +268 -0
- package/dist/enhancer-oM4BhYYS.cjs.map +1 -0
- package/dist/index.cjs +852 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +264 -0
- package/dist/index.d.cts.map +1 -0
- package/dist/index.d.mts +264 -0
- package/dist/index.d.mts.map +1 -0
- package/dist/index.mjs +798 -0
- package/dist/index.mjs.map +1 -0
- package/dist/llm/index.cjs +316 -0
- package/dist/llm/index.cjs.map +1 -0
- package/dist/llm/index.d.cts +211 -0
- package/dist/llm/index.d.cts.map +1 -0
- package/dist/llm/index.d.mts +211 -0
- package/dist/llm/index.d.mts.map +1 -0
- package/dist/llm/index.mjs +310 -0
- package/dist/llm/index.mjs.map +1 -0
- package/dist/parsers/index.cjs +200 -0
- package/dist/parsers/index.cjs.map +1 -0
- package/dist/parsers/index.d.cts +133 -0
- package/dist/parsers/index.d.cts.map +1 -0
- package/dist/parsers/index.d.mts +133 -0
- package/dist/parsers/index.d.mts.map +1 -0
- package/dist/parsers/index.mjs +192 -0
- package/dist/parsers/index.mjs.map +1 -0
- package/dist/types-CNQZVW36.d.mts +150 -0
- package/dist/types-CNQZVW36.d.mts.map +1 -0
- package/dist/types-D0HYR95H.d.cts +150 -0
- package/dist/types-D0HYR95H.d.cts.map +1 -0
- package/package.json +80 -100
- package/dist/index.d.ts +0 -45
- package/dist/index.js +0 -8
- package/dist/scrapex.cjs.development.js +0 -1130
- package/dist/scrapex.cjs.development.js.map +0 -1
- package/dist/scrapex.cjs.production.min.js +0 -2
- package/dist/scrapex.cjs.production.min.js.map +0 -1
- package/dist/scrapex.esm.js +0 -1122
- package/dist/scrapex.esm.js.map +0 -1
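Compared with 0.5.3's single tsdx-style bundles (scrapex.cjs.development.js, scrapex.esm.js), the 1.0.0-alpha.1 layout ships per-entry ESM (.mjs) and CommonJS (.cjs) builds with matching declaration files (.d.mts/.d.cts) plus dedicated llm and parsers entry points. A minimal consumption sketch, assuming package.json maps these directories to root and subpath exports (the full exports field is not reproduced in this diff):

```ts
// Sketch only: the import specifiers are inferred from the dist/ layout above,
// not confirmed by the package.json shown in this diff.
import { ScrapeError } from "scrapex";               // dist/index.mjs (or dist/index.cjs via require)
import { enhance, ask, extract } from "scrapex/llm"; // dist/llm/index.mjs
```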
package/dist/enhancer-Q6CSc1gA.mjs
@@ -0,0 +1,220 @@
+import { z } from "zod";
+
+//#region src/core/errors.ts
+/**
+ * Custom error class for scraping failures with structured error codes
+ */
+var ScrapeError = class ScrapeError extends Error {
+  code;
+  statusCode;
+  constructor(message, code, statusCode, cause) {
+    super(message, { cause });
+    this.name = "ScrapeError";
+    this.code = code;
+    this.statusCode = statusCode;
+    if (Error.captureStackTrace) Error.captureStackTrace(this, ScrapeError);
+  }
+  /**
+   * Create a ScrapeError from an unknown error
+   */
+  static from(error, code = "FETCH_FAILED") {
+    if (error instanceof ScrapeError) return error;
+    if (error instanceof Error) return new ScrapeError(error.message, code, void 0, error);
+    return new ScrapeError(String(error), code);
+  }
+  /**
+   * Check if error is retryable (network issues, timeouts)
+   */
+  isRetryable() {
+    return this.code === "FETCH_FAILED" || this.code === "TIMEOUT";
+  }
+  /**
+   * Convert to a plain object for serialization
+   */
+  toJSON() {
+    return {
+      name: this.name,
+      message: this.message,
+      code: this.code,
+      statusCode: this.statusCode,
+      stack: this.stack
+    };
+  }
+};
+
+//#endregion
+//#region src/llm/types.ts
+/**
+ * Zod schemas for LLM outputs
+ */
+const SummarySchema = z.object({ summary: z.string().describe("A concise 2-3 sentence summary of the content") });
+const TagsSchema = z.object({ tags: z.array(z.string()).describe("5-10 relevant tags/keywords") });
+const EntitiesSchema = z.object({
+  people: z.array(z.string()).describe("People mentioned"),
+  organizations: z.array(z.string()).describe("Organizations/companies"),
+  technologies: z.array(z.string()).describe("Technologies/tools/frameworks"),
+  locations: z.array(z.string()).describe("Locations/places"),
+  concepts: z.array(z.string()).describe("Key concepts/topics")
+});
+const ClassifySchema = z.object({
+  contentType: z.enum([
+    "article",
+    "repo",
+    "docs",
+    "package",
+    "video",
+    "tool",
+    "product",
+    "unknown"
+  ]).describe("The type of content"),
+  confidence: z.number().min(0).max(1).describe("Confidence score 0-1")
+});
+
+//#endregion
+//#region src/llm/enhancer.ts
+/**
+ * Enhance scraped data with LLM-powered features
+ */
+async function enhance(data, provider, types) {
+  const results = {};
+  const content = data.excerpt || data.textContent.slice(0, 1e4);
+  const context = `Title: ${data.title}\nURL: ${data.url}\n\nContent:\n${content}`;
+  const promises = [];
+  if (types.includes("summarize")) promises.push(summarize(context, provider).then((summary) => {
+    results.summary = summary;
+  }));
+  if (types.includes("tags")) promises.push(extractTags(context, provider).then((tags) => {
+    results.suggestedTags = tags;
+  }));
+  if (types.includes("entities")) promises.push(extractEntities(context, provider).then((entities) => {
+    results.entities = entities;
+  }));
+  if (types.includes("classify")) promises.push(classify(context, provider).then((classification) => {
+    if (classification.confidence > .7) results.contentType = classification.contentType;
+  }));
+  await Promise.all(promises);
+  return results;
+}
+/**
+ * Ask a custom question about the scraped content
+ * Results are stored in the `custom` field of ScrapedData
+ */
+async function ask(data, provider, prompt, options) {
+  const key = options?.key || "response";
+  const content = data.excerpt || data.textContent.slice(0, 1e4);
+  const processedPrompt = applyPlaceholders(prompt, data, content);
+  if (options?.schema) {
+    const result = await extract(data, provider, options.schema, processedPrompt);
+    return { custom: { [key]: result } };
+  }
+  const fullPrompt = prompt.includes("{{content}}") ? processedPrompt : `${processedPrompt}\n\nTitle: ${data.title}\nURL: ${data.url}\n\nContent:\n${content}`;
+  const response = await provider.complete(fullPrompt);
+  return { custom: { [key]: response } };
+}
+/**
+ * Apply placeholder replacements to a prompt template
+ */
+function applyPlaceholders(prompt, data, content) {
+  const domain = (() => {
+    try {
+      return new URL(data.url).hostname;
+    } catch {
+      return "";
+    }
+  })();
+  return prompt.replace(/\{\{title\}\}/g, data.title).replace(/\{\{url\}\}/g, data.url).replace(/\{\{content\}\}/g, content).replace(/\{\{description\}\}/g, data.description || "").replace(/\{\{excerpt\}\}/g, data.excerpt || "").replace(/\{\{domain\}\}/g, domain);
+}
+/**
+ * Extract structured data using LLM and a custom schema
+ */
+async function extract(data, provider, schema, promptTemplate) {
+  const zodShape = {};
+  for (const [key, type] of Object.entries(schema)) {
+    const isOptional = type.endsWith("?");
+    const baseType = isOptional ? type.slice(0, -1) : type;
+    let zodType;
+    switch (baseType) {
+      case "string":
+        zodType = z.string();
+        break;
+      case "number":
+        zodType = z.number();
+        break;
+      case "boolean":
+        zodType = z.boolean();
+        break;
+      case "string[]":
+        zodType = z.array(z.string());
+        break;
+      case "number[]":
+        zodType = z.array(z.number());
+        break;
+      default: zodType = z.string();
+    }
+    zodShape[key] = isOptional ? zodType.optional() : zodType;
+  }
+  const zodSchema = z.object(zodShape);
+  const content = data.textContent.slice(0, 4e3);
+  let prompt;
+  if (promptTemplate) {
+    prompt = applyPlaceholders(promptTemplate, data, content);
+    if (!promptTemplate.includes("{{content}}")) prompt += `\n\nContext:\n${content}`;
+  } else prompt = `Extract the following information from this content:
+
+Title: ${data.title}
+URL: ${data.url}
+
+Content:
+${content}
+
+Extract these fields:
+${Object.entries(schema).map(([key, type]) => `- ${key} (${type})`).join("\n")}`;
+  return provider.completeJSON(prompt, zodSchema);
+}
+/**
+ * Generate a summary of the content
+ */
+async function summarize(context, provider) {
+  const prompt = `Summarize the following content in 2-3 concise sentences:
+
+${context}`;
+  return (await provider.completeJSON(prompt, SummarySchema)).summary;
+}
+/**
+ * Extract relevant tags/keywords
+ */
+async function extractTags(context, provider) {
+  const prompt = `Extract 5-10 relevant tags or keywords from the following content. Focus on technologies, concepts, and topics mentioned:
+
+${context}`;
+  return (await provider.completeJSON(prompt, TagsSchema)).tags;
+}
+/**
+ * Extract named entities from content
+ */
+async function extractEntities(context, provider) {
+  const prompt = `Extract named entities from the following content. Identify people, organizations, technologies, locations, and key concepts:
+
+${context}`;
+  return provider.completeJSON(prompt, EntitiesSchema);
+}
+/**
+ * Classify content type using LLM
+ */
+async function classify(context, provider) {
+  const prompt = `Classify the following content into one of these categories:
+- article: Blog post, news article, essay
+- repo: Code repository, open source project
+- docs: Documentation, API reference, guides
+- package: npm/pip package page
+- video: Video content, YouTube
+- tool: Software tool, web application
+- product: Commercial product, e-commerce
+
+${context}`;
+  return provider.completeJSON(prompt, ClassifySchema);
+}
+
+//#endregion
+export { EntitiesSchema as a, ScrapeError as c, ClassifySchema as i, enhance as n, SummarySchema as o, extract as r, TagsSchema as s, ask as t };
+//# sourceMappingURL=enhancer-Q6CSc1gA.mjs.map
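Everything enhance() needs from the model goes through the provider's complete / completeJSON methods (the LLMProvider contract is embedded in the source map that follows), so a stub provider is enough to exercise the new chunk. A sketch under assumptions: the "scrapex/llm" import path, the stubProvider, and the page object are all illustrative; only the field names and enhancement kinds come from the code above.

```ts
import { z } from "zod";
import { enhance } from "scrapex/llm"; // assumed subpath export for dist/llm

// One canned reply that satisfies every built-in schema passed to completeJSON
// (zod object schemas strip unknown keys by default, so extra fields are ignored).
const canned = {
  summary: "stub summary",
  tags: ["stub"],
  people: [], organizations: [], technologies: [], locations: [], concepts: [],
  contentType: "article", confidence: 0.9,
};

const stubProvider = {
  name: "stub",
  async complete(prompt: string): Promise<string> {
    return `echo: ${prompt.slice(0, 40)}`;
  },
  async completeJSON<T>(prompt: string, schema: z.ZodType<T>): Promise<T> {
    return schema.parse(canned); // a real provider would call an LLM and validate its JSON here
  },
};

// Stand-in for a ScrapedData result; the chunk only reads these fields.
const page = {
  url: "https://example.com/post",
  title: "Example post",
  description: "",
  excerpt: "",
  textContent: "Long article text…",
} as any;

const kinds: Array<"summarize" | "tags" | "classify"> = ["summarize", "tags", "classify"];
const extra = await enhance(page, stubProvider, kinds);
// → { summary: "stub summary", suggestedTags: ["stub"], contentType: "article" }
// (contentType is only set because the stub's confidence 0.9 clears the 0.7 threshold)
```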
package/dist/enhancer-Q6CSc1gA.mjs.map
@@ -0,0 +1 @@
{"version":3,"file":"enhancer-Q6CSc1gA.mjs","names":["results: Partial<ScrapedData>","promises: Promise<void>[]","zodShape: Record<string, z.ZodTypeAny>","zodType: z.ZodTypeAny","prompt: string"],"sources":["../src/core/errors.ts","../src/llm/types.ts","../src/llm/enhancer.ts"],"sourcesContent":["/**\n * Error codes for scraping failures\n */\nexport type ScrapeErrorCode =\n | 'FETCH_FAILED'\n | 'TIMEOUT'\n | 'INVALID_URL'\n | 'BLOCKED'\n | 'NOT_FOUND'\n | 'ROBOTS_BLOCKED'\n | 'PARSE_ERROR'\n | 'LLM_ERROR'\n | 'VALIDATION_ERROR';\n\n/**\n * Custom error class for scraping failures with structured error codes\n */\nexport class ScrapeError extends Error {\n public readonly code: ScrapeErrorCode;\n public readonly statusCode?: number;\n\n constructor(message: string, code: ScrapeErrorCode, statusCode?: number, cause?: Error) {\n super(message, { cause });\n this.name = 'ScrapeError';\n this.code = code;\n this.statusCode = statusCode;\n\n // Maintains proper stack trace in V8 environments\n if (Error.captureStackTrace) {\n Error.captureStackTrace(this, ScrapeError);\n }\n }\n\n /**\n * Create a ScrapeError from an unknown error\n */\n static from(error: unknown, code: ScrapeErrorCode = 'FETCH_FAILED'): ScrapeError {\n if (error instanceof ScrapeError) {\n return error;\n }\n\n if (error instanceof Error) {\n return new ScrapeError(error.message, code, undefined, error);\n }\n\n return new ScrapeError(String(error), code);\n }\n\n /**\n * Check if error is retryable (network issues, timeouts)\n */\n isRetryable(): boolean {\n return this.code === 'FETCH_FAILED' || this.code === 'TIMEOUT';\n }\n\n /**\n * Convert to a plain object for serialization\n */\n toJSON(): Record<string, unknown> {\n return {\n name: this.name,\n message: this.message,\n code: this.code,\n statusCode: this.statusCode,\n stack: this.stack,\n };\n }\n}\n","import { z } from 'zod';\n\n/**\n * LLM completion options\n */\nexport interface CompletionOptions {\n maxTokens?: number;\n temperature?: number;\n systemPrompt?: string;\n}\n\n/**\n * LLM Provider interface - implemented by all providers\n */\nexport interface LLMProvider {\n readonly name: string;\n\n /**\n * Generate a text completion\n */\n complete(prompt: string, options?: CompletionOptions): Promise<string>;\n\n /**\n * Generate a structured JSON completion with Zod validation\n */\n completeJSON<T>(prompt: string, schema: z.ZodType<T>, options?: CompletionOptions): Promise<T>;\n}\n\n/**\n * Provider configuration for Anthropic\n */\nexport interface AnthropicConfig {\n apiKey?: string; // Falls back to ANTHROPIC_API_KEY env var\n model?: string; // Default: claude-3-haiku-20240307\n baseUrl?: string;\n}\n\n/**\n * Provider configuration for OpenAI-compatible APIs\n * Works with: OpenAI, Ollama, LM Studio, LocalAI, vLLM, etc.\n */\nexport interface OpenAICompatibleConfig {\n apiKey?: string; // Falls back to OPENAI_API_KEY env var\n model?: string; // Default: gpt-4o-mini\n baseUrl?: string; // Default: https://api.openai.com/v1\n}\n\n/**\n * Enhancement result types\n */\nexport interface SummaryResult {\n summary: string;\n}\n\nexport interface TagsResult {\n tags: string[];\n}\n\nexport interface EntitiesResult {\n people: string[];\n organizations: string[];\n technologies: string[];\n locations: string[];\n concepts: string[];\n}\n\nexport interface ClassifyResult {\n contentType: string;\n confidence: number;\n}\n\n/**\n * Zod schemas for LLM outputs\n */\nexport const SummarySchema = z.object({\n summary: z.string().describe('A concise 2-3 sentence 
summary of the content'),\n});\n\nexport const TagsSchema = z.object({\n tags: z.array(z.string()).describe('5-10 relevant tags/keywords'),\n});\n\nexport const EntitiesSchema = z.object({\n people: z.array(z.string()).describe('People mentioned'),\n organizations: z.array(z.string()).describe('Organizations/companies'),\n technologies: z.array(z.string()).describe('Technologies/tools/frameworks'),\n locations: z.array(z.string()).describe('Locations/places'),\n concepts: z.array(z.string()).describe('Key concepts/topics'),\n});\n\nexport const ClassifySchema = z.object({\n contentType: z\n .enum(['article', 'repo', 'docs', 'package', 'video', 'tool', 'product', 'unknown'])\n .describe('The type of content'),\n confidence: z.number().min(0).max(1).describe('Confidence score 0-1'),\n});\n","import { z } from 'zod';\nimport type {\n EnhancementType,\n ExtractedEntities,\n ExtractionSchema,\n ScrapedData,\n} from '@/core/types.js';\nimport type { LLMProvider } from './types.js';\nimport { ClassifySchema, EntitiesSchema, SummarySchema, TagsSchema } from './types.js';\n\n/**\n * Enhance scraped data with LLM-powered features\n */\nexport async function enhance(\n data: ScrapedData,\n provider: LLMProvider,\n types: EnhancementType[]\n): Promise<Partial<ScrapedData>> {\n const results: Partial<ScrapedData> = {};\n\n // Prepare content for LLM (use excerpt/textContent to save tokens)\n const content = data.excerpt || data.textContent.slice(0, 10000);\n const context = `Title: ${data.title}\\nURL: ${data.url}\\n\\nContent:\\n${content}`;\n\n // Run enhancements in parallel\n const promises: Promise<void>[] = [];\n\n if (types.includes('summarize')) {\n promises.push(\n summarize(context, provider).then((summary) => {\n results.summary = summary;\n })\n );\n }\n\n if (types.includes('tags')) {\n promises.push(\n extractTags(context, provider).then((tags) => {\n results.suggestedTags = tags;\n })\n );\n }\n\n if (types.includes('entities')) {\n promises.push(\n extractEntities(context, provider).then((entities) => {\n results.entities = entities;\n })\n );\n }\n\n if (types.includes('classify')) {\n promises.push(\n classify(context, provider).then((classification) => {\n if (classification.confidence > 0.7) {\n results.contentType = classification.contentType as ScrapedData['contentType'];\n }\n })\n );\n }\n\n await Promise.all(promises);\n\n return results;\n}\n\n/**\n * Options for the ask() function\n */\nexport interface AskOptions {\n /** Key to store the result under in custom field */\n key?: string;\n /** Schema for structured response */\n schema?: ExtractionSchema;\n}\n\n/**\n * Ask a custom question about the scraped content\n * Results are stored in the `custom` field of ScrapedData\n */\nexport async function ask(\n data: ScrapedData,\n provider: LLMProvider,\n prompt: string,\n options?: AskOptions\n): Promise<Partial<ScrapedData>> {\n const key = options?.key || 'response';\n const content = data.excerpt || data.textContent.slice(0, 10000);\n\n // Apply placeholder replacements\n const processedPrompt = applyPlaceholders(prompt, data, content);\n\n if (options?.schema) {\n // Use structured extraction\n const result = await extract(data, provider, options.schema, processedPrompt);\n return { custom: { [key]: result } };\n }\n\n // Simple string response\n const fullPrompt = prompt.includes('{{content}}')\n ? 
processedPrompt\n : `${processedPrompt}\\n\\nTitle: ${data.title}\\nURL: ${data.url}\\n\\nContent:\\n${content}`;\n\n const response = await provider.complete(fullPrompt);\n return { custom: { [key]: response } };\n}\n\n/**\n * Apply placeholder replacements to a prompt template\n */\nfunction applyPlaceholders(prompt: string, data: ScrapedData, content: string): string {\n const domain = (() => {\n try {\n return new URL(data.url).hostname;\n } catch {\n return '';\n }\n })();\n\n return prompt\n .replace(/\\{\\{title\\}\\}/g, data.title)\n .replace(/\\{\\{url\\}\\}/g, data.url)\n .replace(/\\{\\{content\\}\\}/g, content)\n .replace(/\\{\\{description\\}\\}/g, data.description || '')\n .replace(/\\{\\{excerpt\\}\\}/g, data.excerpt || '')\n .replace(/\\{\\{domain\\}\\}/g, domain);\n}\n\n/**\n * Extract structured data using LLM and a custom schema\n */\nexport async function extract<T>(\n data: ScrapedData,\n provider: LLMProvider,\n schema: ExtractionSchema,\n promptTemplate?: string\n): Promise<T> {\n // Convert simple schema to Zod schema\n const zodShape: Record<string, z.ZodTypeAny> = {};\n\n for (const [key, type] of Object.entries(schema)) {\n const isOptional = type.endsWith('?');\n const baseType = isOptional ? type.slice(0, -1) : type;\n\n let zodType: z.ZodTypeAny;\n switch (baseType) {\n case 'string':\n zodType = z.string();\n break;\n case 'number':\n zodType = z.number();\n break;\n case 'boolean':\n zodType = z.boolean();\n break;\n case 'string[]':\n zodType = z.array(z.string());\n break;\n case 'number[]':\n zodType = z.array(z.number());\n break;\n default:\n zodType = z.string();\n }\n\n zodShape[key] = isOptional ? zodType.optional() : zodType;\n }\n\n const zodSchema = z.object(zodShape) as unknown as z.ZodType<T>;\n\n const content = data.textContent.slice(0, 4000);\n\n let prompt: string;\n\n if (promptTemplate) {\n // Apply all placeholder replacements\n prompt = applyPlaceholders(promptTemplate, data, content);\n\n // If content wasn't included via placeholder, append it\n if (!promptTemplate.includes('{{content}}')) {\n prompt += `\\n\\nContext:\\n${content}`;\n }\n } else {\n prompt = `Extract the following information from this content:\n\nTitle: ${data.title}\nURL: ${data.url}\n\nContent:\n${content}\n\nExtract these fields:\n${Object.entries(schema)\n .map(([key, type]) => `- ${key} (${type})`)\n .join('\\n')}`;\n }\n\n return provider.completeJSON<T>(prompt, zodSchema as z.ZodType<T>);\n}\n\n/**\n * Generate a summary of the content\n */\nasync function summarize(context: string, provider: LLMProvider): Promise<string> {\n const prompt = `Summarize the following content in 2-3 concise sentences:\n\n${context}`;\n\n const result = await provider.completeJSON(prompt, SummarySchema);\n return result.summary;\n}\n\n/**\n * Extract relevant tags/keywords\n */\nasync function extractTags(context: string, provider: LLMProvider): Promise<string[]> {\n const prompt = `Extract 5-10 relevant tags or keywords from the following content. Focus on technologies, concepts, and topics mentioned:\n\n${context}`;\n\n const result = await provider.completeJSON(prompt, TagsSchema);\n return result.tags;\n}\n\n/**\n * Extract named entities from content\n */\nasync function extractEntities(context: string, provider: LLMProvider): Promise<ExtractedEntities> {\n const prompt = `Extract named entities from the following content. 
Identify people, organizations, technologies, locations, and key concepts:\n\n${context}`;\n\n return provider.completeJSON(prompt, EntitiesSchema);\n}\n\n/**\n * Classify content type using LLM\n */\nasync function classify(\n context: string,\n provider: LLMProvider\n): Promise<{ contentType: string; confidence: number }> {\n const prompt = `Classify the following content into one of these categories:\n- article: Blog post, news article, essay\n- repo: Code repository, open source project\n- docs: Documentation, API reference, guides\n- package: npm/pip package page\n- video: Video content, YouTube\n- tool: Software tool, web application\n- product: Commercial product, e-commerce\n\n${context}`;\n\n return provider.completeJSON(prompt, ClassifySchema);\n}\n"],"mappings":";;;;;;AAiBA,IAAa,cAAb,MAAa,oBAAoB,MAAM;CACrC,AAAgB;CAChB,AAAgB;CAEhB,YAAY,SAAiB,MAAuB,YAAqB,OAAe;AACtF,QAAM,SAAS,EAAE,OAAO,CAAC;AACzB,OAAK,OAAO;AACZ,OAAK,OAAO;AACZ,OAAK,aAAa;AAGlB,MAAI,MAAM,kBACR,OAAM,kBAAkB,MAAM,YAAY;;;;;CAO9C,OAAO,KAAK,OAAgB,OAAwB,gBAA6B;AAC/E,MAAI,iBAAiB,YACnB,QAAO;AAGT,MAAI,iBAAiB,MACnB,QAAO,IAAI,YAAY,MAAM,SAAS,MAAM,QAAW,MAAM;AAG/D,SAAO,IAAI,YAAY,OAAO,MAAM,EAAE,KAAK;;;;;CAM7C,cAAuB;AACrB,SAAO,KAAK,SAAS,kBAAkB,KAAK,SAAS;;;;;CAMvD,SAAkC;AAChC,SAAO;GACL,MAAM,KAAK;GACX,SAAS,KAAK;GACd,MAAM,KAAK;GACX,YAAY,KAAK;GACjB,OAAO,KAAK;GACb;;;;;;;;;ACSL,MAAa,gBAAgB,EAAE,OAAO,EACpC,SAAS,EAAE,QAAQ,CAAC,SAAS,gDAAgD,EAC9E,CAAC;AAEF,MAAa,aAAa,EAAE,OAAO,EACjC,MAAM,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC,SAAS,8BAA8B,EAClE,CAAC;AAEF,MAAa,iBAAiB,EAAE,OAAO;CACrC,QAAQ,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC,SAAS,mBAAmB;CACxD,eAAe,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC,SAAS,0BAA0B;CACtE,cAAc,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC,SAAS,gCAAgC;CAC3E,WAAW,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC,SAAS,mBAAmB;CAC3D,UAAU,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC,SAAS,sBAAsB;CAC9D,CAAC;AAEF,MAAa,iBAAiB,EAAE,OAAO;CACrC,aAAa,EACV,KAAK;EAAC;EAAW;EAAQ;EAAQ;EAAW;EAAS;EAAQ;EAAW;EAAU,CAAC,CACnF,SAAS,sBAAsB;CAClC,YAAY,EAAE,QAAQ,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,SAAS,uBAAuB;CACtE,CAAC;;;;;;;AClFF,eAAsB,QACpB,MACA,UACA,OAC+B;CAC/B,MAAMA,UAAgC,EAAE;CAGxC,MAAM,UAAU,KAAK,WAAW,KAAK,YAAY,MAAM,GAAG,IAAM;CAChE,MAAM,UAAU,UAAU,KAAK,MAAM,SAAS,KAAK,IAAI,gBAAgB;CAGvE,MAAMC,WAA4B,EAAE;AAEpC,KAAI,MAAM,SAAS,YAAY,CAC7B,UAAS,KACP,UAAU,SAAS,SAAS,CAAC,MAAM,YAAY;AAC7C,UAAQ,UAAU;GAClB,CACH;AAGH,KAAI,MAAM,SAAS,OAAO,CACxB,UAAS,KACP,YAAY,SAAS,SAAS,CAAC,MAAM,SAAS;AAC5C,UAAQ,gBAAgB;GACxB,CACH;AAGH,KAAI,MAAM,SAAS,WAAW,CAC5B,UAAS,KACP,gBAAgB,SAAS,SAAS,CAAC,MAAM,aAAa;AACpD,UAAQ,WAAW;GACnB,CACH;AAGH,KAAI,MAAM,SAAS,WAAW,CAC5B,UAAS,KACP,SAAS,SAAS,SAAS,CAAC,MAAM,mBAAmB;AACnD,MAAI,eAAe,aAAa,GAC9B,SAAQ,cAAc,eAAe;GAEvC,CACH;AAGH,OAAM,QAAQ,IAAI,SAAS;AAE3B,QAAO;;;;;;AAiBT,eAAsB,IACpB,MACA,UACA,QACA,SAC+B;CAC/B,MAAM,MAAM,SAAS,OAAO;CAC5B,MAAM,UAAU,KAAK,WAAW,KAAK,YAAY,MAAM,GAAG,IAAM;CAGhE,MAAM,kBAAkB,kBAAkB,QAAQ,MAAM,QAAQ;AAEhE,KAAI,SAAS,QAAQ;EAEnB,MAAM,SAAS,MAAM,QAAQ,MAAM,UAAU,QAAQ,QAAQ,gBAAgB;AAC7E,SAAO,EAAE,QAAQ,GAAG,MAAM,QAAQ,EAAE;;CAItC,MAAM,aAAa,OAAO,SAAS,cAAc,GAC7C,kBACA,GAAG,gBAAgB,aAAa,KAAK,MAAM,SAAS,KAAK,IAAI,gBAAgB;CAEjF,MAAM,WAAW,MAAM,SAAS,SAAS,WAAW;AACpD,QAAO,EAAE,QAAQ,GAAG,MAAM,UAAU,EAAE;;;;;AAMxC,SAAS,kBAAkB,QAAgB,MAAmB,SAAyB;CACrF,MAAM,gBAAgB;AACpB,MAAI;AACF,UAAO,IAAI,IAAI,KAAK,IAAI,CAAC;UACnB;AACN,UAAO;;KAEP;AAEJ,QAAO,OACJ,QAAQ,kBAAkB,KAAK,MAAM,CACrC,QAAQ,gBAAgB,KAAK,IAAI,CACjC,QAAQ,oBAAoB,QAAQ,CACpC,QAAQ,wBAAwB,KAAK,eAAe,GAAG,CACvD,QAAQ,oBAAoB,KAAK,WAAW,GAAG,CAC/C,QAAQ,mBAAmB,OAAO;;;;;AAMvC,eAAsB,QACpB,MACA,UACA,QACA,gBACY;CAEZ,MAAMC,WAAyC,EAAE;AAEjD,MAAK,MAAM,CAAC,KAAK,SAAS,OAAO,QAAQ,OAAO,EAAE;EAChD,MAAM,aAAa,KAAK,SAAS,IAAI;E
ACrC,MAAM,WAAW,aAAa,KAAK,MAAM,GAAG,GAAG,GAAG;EAElD,IAAIC;AACJ,UAAQ,UAAR;GACE,KAAK;AACH,cAAU,EAAE,QAAQ;AACpB;GACF,KAAK;AACH,cAAU,EAAE,QAAQ;AACpB;GACF,KAAK;AACH,cAAU,EAAE,SAAS;AACrB;GACF,KAAK;AACH,cAAU,EAAE,MAAM,EAAE,QAAQ,CAAC;AAC7B;GACF,KAAK;AACH,cAAU,EAAE,MAAM,EAAE,QAAQ,CAAC;AAC7B;GACF,QACE,WAAU,EAAE,QAAQ;;AAGxB,WAAS,OAAO,aAAa,QAAQ,UAAU,GAAG;;CAGpD,MAAM,YAAY,EAAE,OAAO,SAAS;CAEpC,MAAM,UAAU,KAAK,YAAY,MAAM,GAAG,IAAK;CAE/C,IAAIC;AAEJ,KAAI,gBAAgB;AAElB,WAAS,kBAAkB,gBAAgB,MAAM,QAAQ;AAGzD,MAAI,CAAC,eAAe,SAAS,cAAc,CACzC,WAAU,iBAAiB;OAG7B,UAAS;;SAEJ,KAAK,MAAM;OACb,KAAK,IAAI;;;EAGd,QAAQ;;;EAGR,OAAO,QAAQ,OAAO,CACrB,KAAK,CAAC,KAAK,UAAU,KAAK,IAAI,IAAI,KAAK,GAAG,CAC1C,KAAK,KAAK;AAGX,QAAO,SAAS,aAAgB,QAAQ,UAA0B;;;;;AAMpE,eAAe,UAAU,SAAiB,UAAwC;CAChF,MAAM,SAAS;;EAEf;AAGA,SADe,MAAM,SAAS,aAAa,QAAQ,cAAc,EACnD;;;;;AAMhB,eAAe,YAAY,SAAiB,UAA0C;CACpF,MAAM,SAAS;;EAEf;AAGA,SADe,MAAM,SAAS,aAAa,QAAQ,WAAW,EAChD;;;;;AAMhB,eAAe,gBAAgB,SAAiB,UAAmD;CACjG,MAAM,SAAS;;EAEf;AAEA,QAAO,SAAS,aAAa,QAAQ,eAAe;;;;;AAMtD,eAAe,SACb,SACA,UACsD;CACtD,MAAM,SAAS;;;;;;;;;EASf;AAEA,QAAO,SAAS,aAAa,QAAQ,eAAe"}
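The source map above embeds the original TypeScript, including the provider contract that the bundled chunk compiles away. Reproduced here, lightly condensed, from the embedded src/llm/types.ts for readability:

```ts
import { z } from "zod";

/** LLM completion options */
export interface CompletionOptions {
  maxTokens?: number;
  temperature?: number;
  systemPrompt?: string;
}

/** LLM Provider interface - implemented by all providers */
export interface LLMProvider {
  readonly name: string;
  /** Generate a text completion */
  complete(prompt: string, options?: CompletionOptions): Promise<string>;
  /** Generate a structured JSON completion with Zod validation */
  completeJSON<T>(prompt: string, schema: z.ZodType<T>, options?: CompletionOptions): Promise<T>;
}

/** Provider configuration for Anthropic */
export interface AnthropicConfig {
  apiKey?: string;  // Falls back to ANTHROPIC_API_KEY env var
  model?: string;   // Default: claude-3-haiku-20240307
  baseUrl?: string;
}

/** Provider configuration for OpenAI-compatible APIs (OpenAI, Ollama, LM Studio, LocalAI, vLLM, etc.) */
export interface OpenAICompatibleConfig {
  apiKey?: string;  // Falls back to OPENAI_API_KEY env var
  model?: string;   // Default: gpt-4o-mini
  baseUrl?: string; // Default: https://api.openai.com/v1
}
```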
package/dist/enhancer-oM4BhYYS.cjs
@@ -0,0 +1,268 @@
+const require_index = require('./index.cjs');
+let zod = require("zod");
+
+//#region src/core/errors.ts
+/**
+ * Custom error class for scraping failures with structured error codes
+ */
+var ScrapeError = class ScrapeError extends Error {
+  code;
+  statusCode;
+  constructor(message, code, statusCode, cause) {
+    super(message, { cause });
+    this.name = "ScrapeError";
+    this.code = code;
+    this.statusCode = statusCode;
+    if (Error.captureStackTrace) Error.captureStackTrace(this, ScrapeError);
+  }
+  /**
+   * Create a ScrapeError from an unknown error
+   */
+  static from(error, code = "FETCH_FAILED") {
+    if (error instanceof ScrapeError) return error;
+    if (error instanceof Error) return new ScrapeError(error.message, code, void 0, error);
+    return new ScrapeError(String(error), code);
+  }
+  /**
+   * Check if error is retryable (network issues, timeouts)
+   */
+  isRetryable() {
+    return this.code === "FETCH_FAILED" || this.code === "TIMEOUT";
+  }
+  /**
+   * Convert to a plain object for serialization
+   */
+  toJSON() {
+    return {
+      name: this.name,
+      message: this.message,
+      code: this.code,
+      statusCode: this.statusCode,
+      stack: this.stack
+    };
+  }
+};
+
+//#endregion
+//#region src/llm/types.ts
+/**
+ * Zod schemas for LLM outputs
+ */
+const SummarySchema = zod.z.object({ summary: zod.z.string().describe("A concise 2-3 sentence summary of the content") });
+const TagsSchema = zod.z.object({ tags: zod.z.array(zod.z.string()).describe("5-10 relevant tags/keywords") });
+const EntitiesSchema = zod.z.object({
+  people: zod.z.array(zod.z.string()).describe("People mentioned"),
+  organizations: zod.z.array(zod.z.string()).describe("Organizations/companies"),
+  technologies: zod.z.array(zod.z.string()).describe("Technologies/tools/frameworks"),
+  locations: zod.z.array(zod.z.string()).describe("Locations/places"),
+  concepts: zod.z.array(zod.z.string()).describe("Key concepts/topics")
+});
+const ClassifySchema = zod.z.object({
+  contentType: zod.z.enum([
+    "article",
+    "repo",
+    "docs",
+    "package",
+    "video",
+    "tool",
+    "product",
+    "unknown"
+  ]).describe("The type of content"),
+  confidence: zod.z.number().min(0).max(1).describe("Confidence score 0-1")
+});
+
+//#endregion
+//#region src/llm/enhancer.ts
+/**
+ * Enhance scraped data with LLM-powered features
+ */
+async function enhance(data, provider, types) {
+  const results = {};
+  const content = data.excerpt || data.textContent.slice(0, 1e4);
+  const context = `Title: ${data.title}\nURL: ${data.url}\n\nContent:\n${content}`;
+  const promises = [];
+  if (types.includes("summarize")) promises.push(summarize(context, provider).then((summary) => {
+    results.summary = summary;
+  }));
+  if (types.includes("tags")) promises.push(extractTags(context, provider).then((tags) => {
+    results.suggestedTags = tags;
+  }));
+  if (types.includes("entities")) promises.push(extractEntities(context, provider).then((entities) => {
+    results.entities = entities;
+  }));
+  if (types.includes("classify")) promises.push(classify(context, provider).then((classification) => {
+    if (classification.confidence > .7) results.contentType = classification.contentType;
+  }));
+  await Promise.all(promises);
+  return results;
+}
+/**
+ * Ask a custom question about the scraped content
+ * Results are stored in the `custom` field of ScrapedData
+ */
+async function ask(data, provider, prompt, options) {
+  const key = options?.key || "response";
+  const content = data.excerpt || data.textContent.slice(0, 1e4);
+  const processedPrompt = applyPlaceholders(prompt, data, content);
+  if (options?.schema) {
+    const result = await extract(data, provider, options.schema, processedPrompt);
+    return { custom: { [key]: result } };
+  }
+  const fullPrompt = prompt.includes("{{content}}") ? processedPrompt : `${processedPrompt}\n\nTitle: ${data.title}\nURL: ${data.url}\n\nContent:\n${content}`;
+  const response = await provider.complete(fullPrompt);
+  return { custom: { [key]: response } };
+}
+/**
+ * Apply placeholder replacements to a prompt template
+ */
+function applyPlaceholders(prompt, data, content) {
+  const domain = (() => {
+    try {
+      return new URL(data.url).hostname;
+    } catch {
+      return "";
+    }
+  })();
+  return prompt.replace(/\{\{title\}\}/g, data.title).replace(/\{\{url\}\}/g, data.url).replace(/\{\{content\}\}/g, content).replace(/\{\{description\}\}/g, data.description || "").replace(/\{\{excerpt\}\}/g, data.excerpt || "").replace(/\{\{domain\}\}/g, domain);
+}
+/**
+ * Extract structured data using LLM and a custom schema
+ */
+async function extract(data, provider, schema, promptTemplate) {
+  const zodShape = {};
+  for (const [key, type] of Object.entries(schema)) {
+    const isOptional = type.endsWith("?");
+    const baseType = isOptional ? type.slice(0, -1) : type;
+    let zodType;
+    switch (baseType) {
+      case "string":
+        zodType = zod.z.string();
+        break;
+      case "number":
+        zodType = zod.z.number();
+        break;
+      case "boolean":
+        zodType = zod.z.boolean();
+        break;
+      case "string[]":
+        zodType = zod.z.array(zod.z.string());
+        break;
+      case "number[]":
+        zodType = zod.z.array(zod.z.number());
+        break;
+      default: zodType = zod.z.string();
+    }
+    zodShape[key] = isOptional ? zodType.optional() : zodType;
+  }
+  const zodSchema = zod.z.object(zodShape);
+  const content = data.textContent.slice(0, 4e3);
+  let prompt;
+  if (promptTemplate) {
+    prompt = applyPlaceholders(promptTemplate, data, content);
+    if (!promptTemplate.includes("{{content}}")) prompt += `\n\nContext:\n${content}`;
+  } else prompt = `Extract the following information from this content:
+
+Title: ${data.title}
+URL: ${data.url}
+
+Content:
+${content}
+
+Extract these fields:
+${Object.entries(schema).map(([key, type]) => `- ${key} (${type})`).join("\n")}`;
+  return provider.completeJSON(prompt, zodSchema);
+}
+/**
+ * Generate a summary of the content
+ */
+async function summarize(context, provider) {
+  const prompt = `Summarize the following content in 2-3 concise sentences:
+
+${context}`;
+  return (await provider.completeJSON(prompt, SummarySchema)).summary;
+}
+/**
+ * Extract relevant tags/keywords
+ */
+async function extractTags(context, provider) {
+  const prompt = `Extract 5-10 relevant tags or keywords from the following content. Focus on technologies, concepts, and topics mentioned:
+
+${context}`;
+  return (await provider.completeJSON(prompt, TagsSchema)).tags;
+}
+/**
+ * Extract named entities from content
+ */
+async function extractEntities(context, provider) {
+  const prompt = `Extract named entities from the following content. Identify people, organizations, technologies, locations, and key concepts:
+
+${context}`;
+  return provider.completeJSON(prompt, EntitiesSchema);
+}
+/**
+ * Classify content type using LLM
+ */
+async function classify(context, provider) {
+  const prompt = `Classify the following content into one of these categories:
+- article: Blog post, news article, essay
+- repo: Code repository, open source project
+- docs: Documentation, API reference, guides
+- package: npm/pip package page
+- video: Video content, YouTube
+- tool: Software tool, web application
+- product: Commercial product, e-commerce
+
+${context}`;
+  return provider.completeJSON(prompt, ClassifySchema);
+}
+
+//#endregion
+Object.defineProperty(exports, 'ClassifySchema', {
+  enumerable: true,
+  get: function () {
+    return ClassifySchema;
+  }
+});
+Object.defineProperty(exports, 'EntitiesSchema', {
+  enumerable: true,
+  get: function () {
+    return EntitiesSchema;
+  }
+});
+Object.defineProperty(exports, 'ScrapeError', {
+  enumerable: true,
+  get: function () {
+    return ScrapeError;
+  }
+});
+Object.defineProperty(exports, 'SummarySchema', {
+  enumerable: true,
+  get: function () {
+    return SummarySchema;
+  }
+});
+Object.defineProperty(exports, 'TagsSchema', {
+  enumerable: true,
+  get: function () {
+    return TagsSchema;
+  }
+});
+Object.defineProperty(exports, 'ask', {
+  enumerable: true,
+  get: function () {
+    return ask;
+  }
+});
+Object.defineProperty(exports, 'enhance', {
+  enumerable: true,
+  get: function () {
+    return enhance;
+  }
+});
+Object.defineProperty(exports, 'extract', {
+  enumerable: true,
+  get: function () {
+    return extract;
+  }
+});
+//# sourceMappingURL=enhancer-oM4BhYYS.cjs.map
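Both builds ship the same extract() / ask() helpers: the schema argument is a flat record of type strings ("string", "number", "boolean", "string[]", "number[]", with a trailing "?" marking a field optional) that gets converted to a zod object, and prompt templates may use the {{title}}, {{url}}, {{content}}, {{description}}, {{excerpt}}, and {{domain}} placeholders. A usage sketch; the import path, the "repo" key, and the field names are hypothetical:

```ts
import { ask } from "scrapex/llm";               // assumed subpath export
import type { LLMProvider } from "scrapex/llm";  // assumed type re-export

declare const provider: LLMProvider; // any concrete provider implementation
declare const page: any;             // a ScrapedData result from a previous scrape

const result = await ask(
  page,
  provider,
  "From {{title}} ({{domain}}), pull the repository metadata.",
  { key: "repo", schema: { name: "string", stars: "number?", topics: "string[]" } },
);
// result.custom.repo is validated against the generated zod object:
// { name: string; stars?: number; topics: string[] }
```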
package/dist/enhancer-oM4BhYYS.cjs.map
@@ -0,0 +1 @@
{"version":3,"file":"enhancer-oM4BhYYS.cjs","names":["z","results: Partial<ScrapedData>","promises: Promise<void>[]","zodShape: Record<string, z.ZodTypeAny>","zodType: z.ZodTypeAny","z","prompt: string"],"sources":["../src/core/errors.ts","../src/llm/types.ts","../src/llm/enhancer.ts"],"sourcesContent":["/**\n * Error codes for scraping failures\n */\nexport type ScrapeErrorCode =\n | 'FETCH_FAILED'\n | 'TIMEOUT'\n | 'INVALID_URL'\n | 'BLOCKED'\n | 'NOT_FOUND'\n | 'ROBOTS_BLOCKED'\n | 'PARSE_ERROR'\n | 'LLM_ERROR'\n | 'VALIDATION_ERROR';\n\n/**\n * Custom error class for scraping failures with structured error codes\n */\nexport class ScrapeError extends Error {\n public readonly code: ScrapeErrorCode;\n public readonly statusCode?: number;\n\n constructor(message: string, code: ScrapeErrorCode, statusCode?: number, cause?: Error) {\n super(message, { cause });\n this.name = 'ScrapeError';\n this.code = code;\n this.statusCode = statusCode;\n\n // Maintains proper stack trace in V8 environments\n if (Error.captureStackTrace) {\n Error.captureStackTrace(this, ScrapeError);\n }\n }\n\n /**\n * Create a ScrapeError from an unknown error\n */\n static from(error: unknown, code: ScrapeErrorCode = 'FETCH_FAILED'): ScrapeError {\n if (error instanceof ScrapeError) {\n return error;\n }\n\n if (error instanceof Error) {\n return new ScrapeError(error.message, code, undefined, error);\n }\n\n return new ScrapeError(String(error), code);\n }\n\n /**\n * Check if error is retryable (network issues, timeouts)\n */\n isRetryable(): boolean {\n return this.code === 'FETCH_FAILED' || this.code === 'TIMEOUT';\n }\n\n /**\n * Convert to a plain object for serialization\n */\n toJSON(): Record<string, unknown> {\n return {\n name: this.name,\n message: this.message,\n code: this.code,\n statusCode: this.statusCode,\n stack: this.stack,\n };\n }\n}\n","import { z } from 'zod';\n\n/**\n * LLM completion options\n */\nexport interface CompletionOptions {\n maxTokens?: number;\n temperature?: number;\n systemPrompt?: string;\n}\n\n/**\n * LLM Provider interface - implemented by all providers\n */\nexport interface LLMProvider {\n readonly name: string;\n\n /**\n * Generate a text completion\n */\n complete(prompt: string, options?: CompletionOptions): Promise<string>;\n\n /**\n * Generate a structured JSON completion with Zod validation\n */\n completeJSON<T>(prompt: string, schema: z.ZodType<T>, options?: CompletionOptions): Promise<T>;\n}\n\n/**\n * Provider configuration for Anthropic\n */\nexport interface AnthropicConfig {\n apiKey?: string; // Falls back to ANTHROPIC_API_KEY env var\n model?: string; // Default: claude-3-haiku-20240307\n baseUrl?: string;\n}\n\n/**\n * Provider configuration for OpenAI-compatible APIs\n * Works with: OpenAI, Ollama, LM Studio, LocalAI, vLLM, etc.\n */\nexport interface OpenAICompatibleConfig {\n apiKey?: string; // Falls back to OPENAI_API_KEY env var\n model?: string; // Default: gpt-4o-mini\n baseUrl?: string; // Default: https://api.openai.com/v1\n}\n\n/**\n * Enhancement result types\n */\nexport interface SummaryResult {\n summary: string;\n}\n\nexport interface TagsResult {\n tags: string[];\n}\n\nexport interface EntitiesResult {\n people: string[];\n organizations: string[];\n technologies: string[];\n locations: string[];\n concepts: string[];\n}\n\nexport interface ClassifyResult {\n contentType: string;\n confidence: number;\n}\n\n/**\n * Zod schemas for LLM outputs\n */\nexport const SummarySchema = z.object({\n summary: z.string().describe('A concise 2-3 
sentence summary of the content'),\n});\n\nexport const TagsSchema = z.object({\n tags: z.array(z.string()).describe('5-10 relevant tags/keywords'),\n});\n\nexport const EntitiesSchema = z.object({\n people: z.array(z.string()).describe('People mentioned'),\n organizations: z.array(z.string()).describe('Organizations/companies'),\n technologies: z.array(z.string()).describe('Technologies/tools/frameworks'),\n locations: z.array(z.string()).describe('Locations/places'),\n concepts: z.array(z.string()).describe('Key concepts/topics'),\n});\n\nexport const ClassifySchema = z.object({\n contentType: z\n .enum(['article', 'repo', 'docs', 'package', 'video', 'tool', 'product', 'unknown'])\n .describe('The type of content'),\n confidence: z.number().min(0).max(1).describe('Confidence score 0-1'),\n});\n","import { z } from 'zod';\nimport type {\n EnhancementType,\n ExtractedEntities,\n ExtractionSchema,\n ScrapedData,\n} from '@/core/types.js';\nimport type { LLMProvider } from './types.js';\nimport { ClassifySchema, EntitiesSchema, SummarySchema, TagsSchema } from './types.js';\n\n/**\n * Enhance scraped data with LLM-powered features\n */\nexport async function enhance(\n data: ScrapedData,\n provider: LLMProvider,\n types: EnhancementType[]\n): Promise<Partial<ScrapedData>> {\n const results: Partial<ScrapedData> = {};\n\n // Prepare content for LLM (use excerpt/textContent to save tokens)\n const content = data.excerpt || data.textContent.slice(0, 10000);\n const context = `Title: ${data.title}\\nURL: ${data.url}\\n\\nContent:\\n${content}`;\n\n // Run enhancements in parallel\n const promises: Promise<void>[] = [];\n\n if (types.includes('summarize')) {\n promises.push(\n summarize(context, provider).then((summary) => {\n results.summary = summary;\n })\n );\n }\n\n if (types.includes('tags')) {\n promises.push(\n extractTags(context, provider).then((tags) => {\n results.suggestedTags = tags;\n })\n );\n }\n\n if (types.includes('entities')) {\n promises.push(\n extractEntities(context, provider).then((entities) => {\n results.entities = entities;\n })\n );\n }\n\n if (types.includes('classify')) {\n promises.push(\n classify(context, provider).then((classification) => {\n if (classification.confidence > 0.7) {\n results.contentType = classification.contentType as ScrapedData['contentType'];\n }\n })\n );\n }\n\n await Promise.all(promises);\n\n return results;\n}\n\n/**\n * Options for the ask() function\n */\nexport interface AskOptions {\n /** Key to store the result under in custom field */\n key?: string;\n /** Schema for structured response */\n schema?: ExtractionSchema;\n}\n\n/**\n * Ask a custom question about the scraped content\n * Results are stored in the `custom` field of ScrapedData\n */\nexport async function ask(\n data: ScrapedData,\n provider: LLMProvider,\n prompt: string,\n options?: AskOptions\n): Promise<Partial<ScrapedData>> {\n const key = options?.key || 'response';\n const content = data.excerpt || data.textContent.slice(0, 10000);\n\n // Apply placeholder replacements\n const processedPrompt = applyPlaceholders(prompt, data, content);\n\n if (options?.schema) {\n // Use structured extraction\n const result = await extract(data, provider, options.schema, processedPrompt);\n return { custom: { [key]: result } };\n }\n\n // Simple string response\n const fullPrompt = prompt.includes('{{content}}')\n ? 
processedPrompt\n : `${processedPrompt}\\n\\nTitle: ${data.title}\\nURL: ${data.url}\\n\\nContent:\\n${content}`;\n\n const response = await provider.complete(fullPrompt);\n return { custom: { [key]: response } };\n}\n\n/**\n * Apply placeholder replacements to a prompt template\n */\nfunction applyPlaceholders(prompt: string, data: ScrapedData, content: string): string {\n const domain = (() => {\n try {\n return new URL(data.url).hostname;\n } catch {\n return '';\n }\n })();\n\n return prompt\n .replace(/\\{\\{title\\}\\}/g, data.title)\n .replace(/\\{\\{url\\}\\}/g, data.url)\n .replace(/\\{\\{content\\}\\}/g, content)\n .replace(/\\{\\{description\\}\\}/g, data.description || '')\n .replace(/\\{\\{excerpt\\}\\}/g, data.excerpt || '')\n .replace(/\\{\\{domain\\}\\}/g, domain);\n}\n\n/**\n * Extract structured data using LLM and a custom schema\n */\nexport async function extract<T>(\n data: ScrapedData,\n provider: LLMProvider,\n schema: ExtractionSchema,\n promptTemplate?: string\n): Promise<T> {\n // Convert simple schema to Zod schema\n const zodShape: Record<string, z.ZodTypeAny> = {};\n\n for (const [key, type] of Object.entries(schema)) {\n const isOptional = type.endsWith('?');\n const baseType = isOptional ? type.slice(0, -1) : type;\n\n let zodType: z.ZodTypeAny;\n switch (baseType) {\n case 'string':\n zodType = z.string();\n break;\n case 'number':\n zodType = z.number();\n break;\n case 'boolean':\n zodType = z.boolean();\n break;\n case 'string[]':\n zodType = z.array(z.string());\n break;\n case 'number[]':\n zodType = z.array(z.number());\n break;\n default:\n zodType = z.string();\n }\n\n zodShape[key] = isOptional ? zodType.optional() : zodType;\n }\n\n const zodSchema = z.object(zodShape) as unknown as z.ZodType<T>;\n\n const content = data.textContent.slice(0, 4000);\n\n let prompt: string;\n\n if (promptTemplate) {\n // Apply all placeholder replacements\n prompt = applyPlaceholders(promptTemplate, data, content);\n\n // If content wasn't included via placeholder, append it\n if (!promptTemplate.includes('{{content}}')) {\n prompt += `\\n\\nContext:\\n${content}`;\n }\n } else {\n prompt = `Extract the following information from this content:\n\nTitle: ${data.title}\nURL: ${data.url}\n\nContent:\n${content}\n\nExtract these fields:\n${Object.entries(schema)\n .map(([key, type]) => `- ${key} (${type})`)\n .join('\\n')}`;\n }\n\n return provider.completeJSON<T>(prompt, zodSchema as z.ZodType<T>);\n}\n\n/**\n * Generate a summary of the content\n */\nasync function summarize(context: string, provider: LLMProvider): Promise<string> {\n const prompt = `Summarize the following content in 2-3 concise sentences:\n\n${context}`;\n\n const result = await provider.completeJSON(prompt, SummarySchema);\n return result.summary;\n}\n\n/**\n * Extract relevant tags/keywords\n */\nasync function extractTags(context: string, provider: LLMProvider): Promise<string[]> {\n const prompt = `Extract 5-10 relevant tags or keywords from the following content. Focus on technologies, concepts, and topics mentioned:\n\n${context}`;\n\n const result = await provider.completeJSON(prompt, TagsSchema);\n return result.tags;\n}\n\n/**\n * Extract named entities from content\n */\nasync function extractEntities(context: string, provider: LLMProvider): Promise<ExtractedEntities> {\n const prompt = `Extract named entities from the following content. 
Identify people, organizations, technologies, locations, and key concepts:\n\n${context}`;\n\n return provider.completeJSON(prompt, EntitiesSchema);\n}\n\n/**\n * Classify content type using LLM\n */\nasync function classify(\n context: string,\n provider: LLMProvider\n): Promise<{ contentType: string; confidence: number }> {\n const prompt = `Classify the following content into one of these categories:\n- article: Blog post, news article, essay\n- repo: Code repository, open source project\n- docs: Documentation, API reference, guides\n- package: npm/pip package page\n- video: Video content, YouTube\n- tool: Software tool, web application\n- product: Commercial product, e-commerce\n\n${context}`;\n\n return provider.completeJSON(prompt, ClassifySchema);\n}\n"],"mappings":";;;;;;;AAiBA,IAAa,cAAb,MAAa,oBAAoB,MAAM;CACrC,AAAgB;CAChB,AAAgB;CAEhB,YAAY,SAAiB,MAAuB,YAAqB,OAAe;AACtF,QAAM,SAAS,EAAE,OAAO,CAAC;AACzB,OAAK,OAAO;AACZ,OAAK,OAAO;AACZ,OAAK,aAAa;AAGlB,MAAI,MAAM,kBACR,OAAM,kBAAkB,MAAM,YAAY;;;;;CAO9C,OAAO,KAAK,OAAgB,OAAwB,gBAA6B;AAC/E,MAAI,iBAAiB,YACnB,QAAO;AAGT,MAAI,iBAAiB,MACnB,QAAO,IAAI,YAAY,MAAM,SAAS,MAAM,QAAW,MAAM;AAG/D,SAAO,IAAI,YAAY,OAAO,MAAM,EAAE,KAAK;;;;;CAM7C,cAAuB;AACrB,SAAO,KAAK,SAAS,kBAAkB,KAAK,SAAS;;;;;CAMvD,SAAkC;AAChC,SAAO;GACL,MAAM,KAAK;GACX,SAAS,KAAK;GACd,MAAM,KAAK;GACX,YAAY,KAAK;GACjB,OAAO,KAAK;GACb;;;;;;;;;ACSL,MAAa,gBAAgBA,MAAE,OAAO,EACpC,SAASA,MAAE,QAAQ,CAAC,SAAS,gDAAgD,EAC9E,CAAC;AAEF,MAAa,aAAaA,MAAE,OAAO,EACjC,MAAMA,MAAE,MAAMA,MAAE,QAAQ,CAAC,CAAC,SAAS,8BAA8B,EAClE,CAAC;AAEF,MAAa,iBAAiBA,MAAE,OAAO;CACrC,QAAQA,MAAE,MAAMA,MAAE,QAAQ,CAAC,CAAC,SAAS,mBAAmB;CACxD,eAAeA,MAAE,MAAMA,MAAE,QAAQ,CAAC,CAAC,SAAS,0BAA0B;CACtE,cAAcA,MAAE,MAAMA,MAAE,QAAQ,CAAC,CAAC,SAAS,gCAAgC;CAC3E,WAAWA,MAAE,MAAMA,MAAE,QAAQ,CAAC,CAAC,SAAS,mBAAmB;CAC3D,UAAUA,MAAE,MAAMA,MAAE,QAAQ,CAAC,CAAC,SAAS,sBAAsB;CAC9D,CAAC;AAEF,MAAa,iBAAiBA,MAAE,OAAO;CACrC,aAAaA,MACV,KAAK;EAAC;EAAW;EAAQ;EAAQ;EAAW;EAAS;EAAQ;EAAW;EAAU,CAAC,CACnF,SAAS,sBAAsB;CAClC,YAAYA,MAAE,QAAQ,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,SAAS,uBAAuB;CACtE,CAAC;;;;;;;AClFF,eAAsB,QACpB,MACA,UACA,OAC+B;CAC/B,MAAMC,UAAgC,EAAE;CAGxC,MAAM,UAAU,KAAK,WAAW,KAAK,YAAY,MAAM,GAAG,IAAM;CAChE,MAAM,UAAU,UAAU,KAAK,MAAM,SAAS,KAAK,IAAI,gBAAgB;CAGvE,MAAMC,WAA4B,EAAE;AAEpC,KAAI,MAAM,SAAS,YAAY,CAC7B,UAAS,KACP,UAAU,SAAS,SAAS,CAAC,MAAM,YAAY;AAC7C,UAAQ,UAAU;GAClB,CACH;AAGH,KAAI,MAAM,SAAS,OAAO,CACxB,UAAS,KACP,YAAY,SAAS,SAAS,CAAC,MAAM,SAAS;AAC5C,UAAQ,gBAAgB;GACxB,CACH;AAGH,KAAI,MAAM,SAAS,WAAW,CAC5B,UAAS,KACP,gBAAgB,SAAS,SAAS,CAAC,MAAM,aAAa;AACpD,UAAQ,WAAW;GACnB,CACH;AAGH,KAAI,MAAM,SAAS,WAAW,CAC5B,UAAS,KACP,SAAS,SAAS,SAAS,CAAC,MAAM,mBAAmB;AACnD,MAAI,eAAe,aAAa,GAC9B,SAAQ,cAAc,eAAe;GAEvC,CACH;AAGH,OAAM,QAAQ,IAAI,SAAS;AAE3B,QAAO;;;;;;AAiBT,eAAsB,IACpB,MACA,UACA,QACA,SAC+B;CAC/B,MAAM,MAAM,SAAS,OAAO;CAC5B,MAAM,UAAU,KAAK,WAAW,KAAK,YAAY,MAAM,GAAG,IAAM;CAGhE,MAAM,kBAAkB,kBAAkB,QAAQ,MAAM,QAAQ;AAEhE,KAAI,SAAS,QAAQ;EAEnB,MAAM,SAAS,MAAM,QAAQ,MAAM,UAAU,QAAQ,QAAQ,gBAAgB;AAC7E,SAAO,EAAE,QAAQ,GAAG,MAAM,QAAQ,EAAE;;CAItC,MAAM,aAAa,OAAO,SAAS,cAAc,GAC7C,kBACA,GAAG,gBAAgB,aAAa,KAAK,MAAM,SAAS,KAAK,IAAI,gBAAgB;CAEjF,MAAM,WAAW,MAAM,SAAS,SAAS,WAAW;AACpD,QAAO,EAAE,QAAQ,GAAG,MAAM,UAAU,EAAE;;;;;AAMxC,SAAS,kBAAkB,QAAgB,MAAmB,SAAyB;CACrF,MAAM,gBAAgB;AACpB,MAAI;AACF,UAAO,IAAI,IAAI,KAAK,IAAI,CAAC;UACnB;AACN,UAAO;;KAEP;AAEJ,QAAO,OACJ,QAAQ,kBAAkB,KAAK,MAAM,CACrC,QAAQ,gBAAgB,KAAK,IAAI,CACjC,QAAQ,oBAAoB,QAAQ,CACpC,QAAQ,wBAAwB,KAAK,eAAe,GAAG,CACvD,QAAQ,oBAAoB,KAAK,WAAW,GAAG,CAC/C,QAAQ,mBAAmB,OAAO;;;;;AAMvC,eAAsB,QACpB,MACA,UACA,QACA,gBACY;CAEZ,MAAMC,WAAyC,EAAE;AAEjD,MAAK,MAAM,CAAC,KAAK,SAAS,OAAO,QAAQ,OAAO,EAAE;EAChD,MAAM,a
AAa,KAAK,SAAS,IAAI;EACrC,MAAM,WAAW,aAAa,KAAK,MAAM,GAAG,GAAG,GAAG;EAElD,IAAIC;AACJ,UAAQ,UAAR;GACE,KAAK;AACH,cAAUC,MAAE,QAAQ;AACpB;GACF,KAAK;AACH,cAAUA,MAAE,QAAQ;AACpB;GACF,KAAK;AACH,cAAUA,MAAE,SAAS;AACrB;GACF,KAAK;AACH,cAAUA,MAAE,MAAMA,MAAE,QAAQ,CAAC;AAC7B;GACF,KAAK;AACH,cAAUA,MAAE,MAAMA,MAAE,QAAQ,CAAC;AAC7B;GACF,QACE,WAAUA,MAAE,QAAQ;;AAGxB,WAAS,OAAO,aAAa,QAAQ,UAAU,GAAG;;CAGpD,MAAM,YAAYA,MAAE,OAAO,SAAS;CAEpC,MAAM,UAAU,KAAK,YAAY,MAAM,GAAG,IAAK;CAE/C,IAAIC;AAEJ,KAAI,gBAAgB;AAElB,WAAS,kBAAkB,gBAAgB,MAAM,QAAQ;AAGzD,MAAI,CAAC,eAAe,SAAS,cAAc,CACzC,WAAU,iBAAiB;OAG7B,UAAS;;SAEJ,KAAK,MAAM;OACb,KAAK,IAAI;;;EAGd,QAAQ;;;EAGR,OAAO,QAAQ,OAAO,CACrB,KAAK,CAAC,KAAK,UAAU,KAAK,IAAI,IAAI,KAAK,GAAG,CAC1C,KAAK,KAAK;AAGX,QAAO,SAAS,aAAgB,QAAQ,UAA0B;;;;;AAMpE,eAAe,UAAU,SAAiB,UAAwC;CAChF,MAAM,SAAS;;EAEf;AAGA,SADe,MAAM,SAAS,aAAa,QAAQ,cAAc,EACnD;;;;;AAMhB,eAAe,YAAY,SAAiB,UAA0C;CACpF,MAAM,SAAS;;EAEf;AAGA,SADe,MAAM,SAAS,aAAa,QAAQ,WAAW,EAChD;;;;;AAMhB,eAAe,gBAAgB,SAAiB,UAAmD;CACjG,MAAM,SAAS;;EAEf;AAEA,QAAO,SAAS,aAAa,QAAQ,eAAe;;;;;AAMtD,eAAe,SACb,SACA,UACsD;CACtD,MAAM,SAAS;;;;;;;;;EASf;AAEA,QAAO,SAAS,aAAa,QAAQ,eAAe"}
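Finally, both entry points carry the new ScrapeError class defined at the top of each chunk. A short error-handling sketch; the import path and the failing call are placeholders, while the codes and methods come straight from the class above:

```ts
import { ScrapeError } from "scrapex"; // assumed re-export from the root entry

try {
  throw new Error("socket hang up"); // stand-in for a failed fetch/scrape call
} catch (err) {
  const e = ScrapeError.from(err, "FETCH_FAILED"); // wraps unknown errors and preserves `cause`
  if (e.isRetryable()) {
    // only FETCH_FAILED and TIMEOUT report as retryable
  }
  console.error(JSON.stringify(e)); // toJSON() → { name, message, code, statusCode, stack }
}
```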