scrapex 0.5.2 → 1.0.0-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +392 -145
- package/dist/enhancer-Q6CSc1gA.mjs +220 -0
- package/dist/enhancer-Q6CSc1gA.mjs.map +1 -0
- package/dist/enhancer-oM4BhYYS.cjs +268 -0
- package/dist/enhancer-oM4BhYYS.cjs.map +1 -0
- package/dist/index.cjs +852 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +264 -0
- package/dist/index.d.cts.map +1 -0
- package/dist/index.d.mts +264 -0
- package/dist/index.d.mts.map +1 -0
- package/dist/index.mjs +798 -0
- package/dist/index.mjs.map +1 -0
- package/dist/llm/index.cjs +316 -0
- package/dist/llm/index.cjs.map +1 -0
- package/dist/llm/index.d.cts +211 -0
- package/dist/llm/index.d.cts.map +1 -0
- package/dist/llm/index.d.mts +211 -0
- package/dist/llm/index.d.mts.map +1 -0
- package/dist/llm/index.mjs +310 -0
- package/dist/llm/index.mjs.map +1 -0
- package/dist/parsers/index.cjs +200 -0
- package/dist/parsers/index.cjs.map +1 -0
- package/dist/parsers/index.d.cts +133 -0
- package/dist/parsers/index.d.cts.map +1 -0
- package/dist/parsers/index.d.mts +133 -0
- package/dist/parsers/index.d.mts.map +1 -0
- package/dist/parsers/index.mjs +192 -0
- package/dist/parsers/index.mjs.map +1 -0
- package/dist/types-CNQZVW36.d.mts +150 -0
- package/dist/types-CNQZVW36.d.mts.map +1 -0
- package/dist/types-D0HYR95H.d.cts +150 -0
- package/dist/types-D0HYR95H.d.cts.map +1 -0
- package/package.json +80 -100
- package/dist/index.d.ts +0 -45
- package/dist/index.js +0 -8
- package/dist/scrapex.cjs.development.js +0 -1128
- package/dist/scrapex.cjs.development.js.map +0 -1
- package/dist/scrapex.cjs.production.min.js +0 -2
- package/dist/scrapex.cjs.production.min.js.map +0 -1
- package/dist/scrapex.esm.js +0 -1120
- package/dist/scrapex.esm.js.map +0 -1
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
import { a as EntitiesSchema, c as ScrapeError, i as ClassifySchema, n as enhance, o as SummarySchema, r as extract, s as TagsSchema, t as ask } from "../enhancer-Q6CSc1gA.mjs";
|
|
2
|
+
import { createRequire } from "node:module";
|
|
3
|
+
|
|
4
|
+
//#region rolldown:runtime
|
|
5
|
+
var __require = /* @__PURE__ */ createRequire(import.meta.url);
|
|
6
|
+
|
|
7
|
+
//#endregion
|
|
8
|
+
//#region src/llm/anthropic.ts
|
|
9
|
+
// Default Claude model used when config.model is not provided.
const DEFAULT_MODEL$1 = "claude-3-5-haiku-20241022";
// Default cap on completion tokens per request when options.maxTokens is not set.
const DEFAULT_MAX_TOKENS$1 = 1024;
|
|
11
|
+
/**
|
|
12
|
+
* Anthropic Claude provider
|
|
13
|
+
*
|
|
14
|
+
* Requires @anthropic-ai/sdk as a peer dependency.
|
|
15
|
+
*
|
|
16
|
+
* @example
|
|
17
|
+
* ```ts
|
|
18
|
+
* const provider = new AnthropicProvider({ apiKey: 'sk-...' });
|
|
19
|
+
* const result = await scrape(url, { llm: provider, enhance: ['summarize'] });
|
|
20
|
+
* ```
|
|
21
|
+
*/
|
|
22
|
+
var AnthropicProvider = class {
	name = "anthropic";
	client;
	model;
	/**
	 * @param config - optional { apiKey?, model?, baseUrl? }; the key falls
	 *   back to the ANTHROPIC_API_KEY environment variable.
	 * @throws ScrapeError ("LLM_ERROR") when no API key is available or when
	 *   @anthropic-ai/sdk is not installed.
	 */
	constructor(config = {}) {
		const apiKey = config.apiKey ?? process.env.ANTHROPIC_API_KEY;
		if (!apiKey) throw new ScrapeError("Anthropic API key required. Set ANTHROPIC_API_KEY env var or pass apiKey in config.", "LLM_ERROR");
		this.model = config.model ?? DEFAULT_MODEL$1;
		// Load the SDK lazily so it is only needed when this provider is used.
		// Keep the try narrow: only a failed require() should be reported as a
		// missing dependency; errors from the SDK constructor itself must not
		// be masked by the "install the SDK" message.
		let Anthropic;
		try {
			({ Anthropic } = __require("@anthropic-ai/sdk"));
		} catch {
			throw new ScrapeError("@anthropic-ai/sdk is required for Anthropic provider. Install with: npm install @anthropic-ai/sdk", "LLM_ERROR");
		}
		this.client = new Anthropic({
			apiKey,
			baseURL: config.baseUrl
		});
	}
	/**
	 * Run a plain-text completion against the Messages API.
	 * @returns the text of the first content block of the response.
	 * @throws ScrapeError ("LLM_ERROR") on an empty/non-text response or any API failure.
	 */
	async complete(prompt, options = {}) {
		try {
			const content = (await this.client.messages.create({
				model: this.model,
				max_tokens: options.maxTokens ?? DEFAULT_MAX_TOKENS$1,
				messages: [{
					role: "user",
					content: prompt
				}],
				system: options.systemPrompt,
				temperature: options.temperature
			})).content[0];
			if (content?.type === "text" && content.text) return content.text;
			throw new ScrapeError("Unexpected or empty response from Anthropic", "LLM_ERROR");
		} catch (error) {
			// Re-throw our own errors unchanged; wrap everything else with context.
			if (error instanceof ScrapeError) throw error;
			throw new ScrapeError(`Anthropic API error: ${error instanceof Error ? error.message : String(error)}`, "LLM_ERROR", void 0, error instanceof Error ? error : void 0);
		}
	}
	/**
	 * Run a completion whose output must be JSON matching `schema` (a Zod schema).
	 * The schema is rendered into the prompt as a simplified JSON Schema; the
	 * response is regex-extracted, JSON.parse'd, then validated via schema.parse.
	 * @throws ScrapeError ("VALIDATION_ERROR") when no valid JSON can be recovered.
	 */
	async completeJSON(prompt, schema, options = {}) {
		const jsonPrompt = `${prompt}

Respond ONLY with valid JSON matching this schema:
${JSON.stringify(zodToJsonSchema$1(schema), null, 2)}

Do not include any explanation or markdown formatting. Just the JSON object.`;
		const response = await this.complete(jsonPrompt, {
			...options,
			systemPrompt: options.systemPrompt ?? "You are a helpful assistant that responds only with valid JSON."
		});
		try {
			// Tolerate surrounding prose/markdown by grabbing the outermost {...} span.
			const jsonMatch = response.match(/\{[\s\S]*\}/);
			if (!jsonMatch) throw new Error("No JSON object found in response");
			const parsed = JSON.parse(jsonMatch[0]);
			return schema.parse(parsed);
		} catch (error) {
			throw new ScrapeError(`Failed to parse LLM response as JSON: ${error instanceof Error ? error.message : String(error)}`, "VALIDATION_ERROR", void 0, error instanceof Error ? error : void 0);
		}
	}
};
|
|
80
|
+
/**
|
|
81
|
+
* Convert a Zod schema to a simple JSON Schema representation
|
|
82
|
+
* (simplified version for prompt engineering)
|
|
83
|
+
*/
|
|
84
|
+
/**
 * Convert a Zod schema to a simple JSON Schema representation
 * (simplified version for prompt engineering).
 *
 * Added ZodOptional handling (unwraps to the inner type) so optional fields
 * no longer degrade to {type:"string"} — consistent with the OpenAI-side
 * zodToJsonSchema in this bundle. Unknown type names still fall back to string.
 */
function zodToJsonSchema$1(schema) {
	const def = schema._def;
	switch (def.typeName) {
		case "ZodObject": {
			const shape = schema.shape;
			const properties = {};
			for (const [key, value] of Object.entries(shape)) properties[key] = zodToJsonSchema$1(value);
			return {
				type: "object",
				properties
			};
		}
		case "ZodArray": return {
			type: "array",
			items: zodToJsonSchema$1(def.type)
		};
		case "ZodString": return { type: "string" };
		case "ZodNumber": return { type: "number" };
		case "ZodBoolean": return { type: "boolean" };
		case "ZodEnum": return {
			type: "string",
			enum: def.values
		};
		// Optional fields: describe the wrapped type, not a generic string.
		case "ZodOptional": return zodToJsonSchema$1(def.innerType);
		default: return { type: "string" };
	}
}
|
|
110
|
+
|
|
111
|
+
//#endregion
|
|
112
|
+
//#region src/llm/openai.ts
|
|
113
|
+
// Default OpenAI model used when config.model is not provided.
const DEFAULT_MODEL = "gpt-4o-mini";
// Default cap on completion tokens per request when options.maxTokens is not set.
const DEFAULT_MAX_TOKENS = 1024;
// Hosted OpenAI endpoint; an API key is only enforced when this base URL is in use.
const DEFAULT_BASE_URL = "https://api.openai.com/v1";
|
|
116
|
+
/**
|
|
117
|
+
* OpenAI-compatible provider
|
|
118
|
+
*
|
|
119
|
+
* Works with:
|
|
120
|
+
* - OpenAI API
|
|
121
|
+
* - Ollama (http://localhost:11434/v1)
|
|
122
|
+
* - LM Studio (http://localhost:1234/v1)
|
|
123
|
+
* - LocalAI
|
|
124
|
+
* - vLLM
|
|
125
|
+
* - Any OpenAI-compatible API
|
|
126
|
+
*
|
|
127
|
+
* Requires `openai` as a peer dependency.
|
|
128
|
+
*
|
|
129
|
+
* @example
|
|
130
|
+
* ```ts
|
|
131
|
+
* // OpenAI
|
|
132
|
+
* const provider = new OpenAIProvider({ apiKey: 'sk-...' });
|
|
133
|
+
*
|
|
134
|
+
* // Ollama
|
|
135
|
+
* const provider = new OpenAIProvider({
|
|
136
|
+
* baseUrl: 'http://localhost:11434/v1',
|
|
137
|
+
* model: 'llama3.2',
|
|
138
|
+
* apiKey: 'ollama' // Ollama doesn't require a real key
|
|
139
|
+
* });
|
|
140
|
+
*
|
|
141
|
+
* // LM Studio
|
|
142
|
+
* const provider = new OpenAIProvider({
|
|
143
|
+
* baseUrl: 'http://localhost:1234/v1',
|
|
144
|
+
* model: 'local-model',
|
|
145
|
+
* apiKey: 'lm-studio'
|
|
146
|
+
* });
|
|
147
|
+
* ```
|
|
148
|
+
*/
|
|
149
|
+
var OpenAIProvider = class {
	name = "openai";
	client;
	model;
	/**
	 * @param config - optional { apiKey?, model?, baseUrl? }; the key falls
	 *   back to the OPENAI_API_KEY environment variable.
	 * @throws ScrapeError ("LLM_ERROR") when targeting the hosted OpenAI
	 *   endpoint without a key, or when the `openai` package is not installed.
	 */
	constructor(config = {}) {
		const apiKey = config.apiKey ?? process.env.OPENAI_API_KEY;
		const baseUrl = config.baseUrl ?? DEFAULT_BASE_URL;
		// Only the hosted OpenAI endpoint needs a real key; local providers
		// (Ollama, LM Studio, ...) accept any placeholder value.
		if (!apiKey && baseUrl === DEFAULT_BASE_URL) throw new ScrapeError("OpenAI API key required. Set OPENAI_API_KEY env var or pass apiKey in config.", "LLM_ERROR");
		this.model = config.model ?? DEFAULT_MODEL;
		// Load the SDK lazily so it is only needed when this provider is used.
		// Keep the try narrow: only a failed require() should be reported as a
		// missing dependency; errors from the SDK constructor itself must not
		// be masked by the "install openai" message.
		let OpenAI;
		try {
			({ OpenAI } = __require("openai"));
		} catch {
			throw new ScrapeError("openai package is required for OpenAI provider. Install with: npm install openai", "LLM_ERROR");
		}
		this.client = new OpenAI({
			apiKey: apiKey ?? "local",
			baseURL: baseUrl
		});
	}
	/**
	 * Run a plain-text chat completion.
	 * @returns the content of the first choice's message.
	 * @throws ScrapeError ("LLM_ERROR") on an empty response or any API failure.
	 */
	async complete(prompt, options = {}) {
		try {
			const client = this.client;
			const messages = [];
			if (options.systemPrompt) messages.push({
				role: "system",
				content: options.systemPrompt
			});
			messages.push({
				role: "user",
				content: prompt
			});
			const content = (await client.chat.completions.create({
				model: this.model,
				max_tokens: options.maxTokens ?? DEFAULT_MAX_TOKENS,
				messages,
				temperature: options.temperature
			})).choices[0]?.message?.content;
			if (!content) throw new ScrapeError("Empty response from OpenAI", "LLM_ERROR");
			return content;
		} catch (error) {
			// Re-throw our own errors unchanged; wrap everything else with context.
			if (error instanceof ScrapeError) throw error;
			throw new ScrapeError(`OpenAI API error: ${error instanceof Error ? error.message : String(error)}`, "LLM_ERROR", void 0, error instanceof Error ? error : void 0);
		}
	}
	/**
	 * Run a completion whose output must be JSON matching `schema` (a Zod schema).
	 * First attempts native JSON mode (response_format: json_object); if that
	 * request fails (e.g. the backend does not support JSON mode), falls back
	 * to a plain completion with the schema embedded in the prompt.
	 * @throws ScrapeError ("VALIDATION_ERROR") when no valid JSON can be recovered.
	 */
	async completeJSON(prompt, schema, options = {}) {
		const client = this.client;
		try {
			const messages = [{
				role: "system",
				content: options.systemPrompt ?? "You are a helpful assistant that extracts information from content."
			}, {
				role: "user",
				content: prompt
			}];
			const content = (await client.chat.completions.create({
				model: this.model,
				max_tokens: options.maxTokens ?? DEFAULT_MAX_TOKENS,
				messages,
				temperature: options.temperature,
				response_format: { type: "json_object" }
			})).choices[0]?.message?.content;
			if (!content) throw new ScrapeError("Empty response from OpenAI", "LLM_ERROR");
			const parsed = JSON.parse(content);
			return schema.parse(parsed);
		} catch (error) {
			if (error instanceof ScrapeError) throw error;
			// Structured-output path failed; retry with a schema-in-prompt completion.
			const jsonPrompt = `${prompt}

Respond ONLY with valid JSON matching this schema:
${JSON.stringify(zodToJsonSchema(schema), null, 2)}

Do not include any explanation or markdown formatting. Just the JSON object.`;
			const response = await this.complete(jsonPrompt, {
				...options,
				systemPrompt: "You respond only with valid JSON."
			});
			try {
				// Tolerate surrounding prose/markdown by grabbing the outermost {...} span.
				const jsonMatch = response.match(/\{[\s\S]*\}/);
				if (!jsonMatch) throw new Error("No JSON object found in response");
				const parsed = JSON.parse(jsonMatch[0]);
				return schema.parse(parsed);
			} catch (parseError) {
				throw new ScrapeError(`Failed to parse LLM response as JSON: ${parseError instanceof Error ? parseError.message : String(parseError)}`, "VALIDATION_ERROR", void 0, parseError instanceof Error ? parseError : void 0);
			}
		}
	}
};
|
|
236
|
+
/**
|
|
237
|
+
* Convert a Zod schema to JSON Schema for structured outputs
|
|
238
|
+
*/
|
|
239
|
+
/**
 * Convert a Zod schema into a JSON Schema fragment for structured outputs.
 * Handles objects (with a `required` list of non-optional keys), arrays,
 * enums, optionals (unwrapped to the inner type) and the three primitive
 * kinds; anything unrecognized is described as a string.
 */
function zodToJsonSchema(schema) {
	const def = schema._def;
	const kind = def.typeName;
	// Primitive kinds map straight to a JSON Schema type keyword.
	const primitiveTypes = {
		ZodString: "string",
		ZodNumber: "number",
		ZodBoolean: "boolean"
	};
	if (kind in primitiveTypes) return { type: primitiveTypes[kind] };
	if (kind === "ZodOptional") return zodToJsonSchema(def.innerType);
	if (kind === "ZodArray") return {
		type: "array",
		items: zodToJsonSchema(def.type)
	};
	if (kind === "ZodEnum") return {
		type: "string",
		enum: def.values
	};
	if (kind === "ZodObject") {
		const properties = {};
		const required = [];
		for (const [fieldName, fieldSchema] of Object.entries(schema.shape)) {
			properties[fieldName] = zodToJsonSchema(fieldSchema);
			// A field is required unless it is wrapped in ZodOptional.
			if (fieldSchema._def.typeName !== "ZodOptional") required.push(fieldName);
		}
		return {
			type: "object",
			properties,
			required
		};
	}
	// Unknown schema kinds degrade to a plain string description.
	return { type: "string" };
}
|
|
271
|
+
/**
|
|
272
|
+
* Create an OpenAI provider with default settings
|
|
273
|
+
*/
|
|
274
|
+
/**
 * Create an OpenAI provider with default settings.
 * Thin factory over the OpenAIProvider constructor.
 */
function createOpenAI(config) {
	const provider = new OpenAIProvider(config);
	return provider;
}
|
|
277
|
+
/**
|
|
278
|
+
* Create an Ollama provider
|
|
279
|
+
*
|
|
280
|
+
* @example
|
|
281
|
+
* ```ts
|
|
282
|
+
* const provider = createOllama({ model: 'llama3.2' });
|
|
283
|
+
* ```
|
|
284
|
+
*/
|
|
285
|
+
/**
 * Create an Ollama provider (OpenAI-compatible local endpoint).
 * Ollama does not check the API key, so a fixed placeholder is used.
 *
 * @example
 * ```ts
 * const provider = createOllama({ model: 'llama3.2' });
 * ```
 */
function createOllama(config = { model: "llama3.2" }) {
	const port = config.port ?? 11434;
	const baseUrl = `http://localhost:${port}/v1`;
	return new OpenAIProvider({
		baseUrl,
		model: config.model,
		apiKey: "ollama"
	});
}
|
|
292
|
+
/**
|
|
293
|
+
* Create an LM Studio provider
|
|
294
|
+
*
|
|
295
|
+
* @example
|
|
296
|
+
* ```ts
|
|
297
|
+
* const provider = createLMStudio({ model: 'local-model' });
|
|
298
|
+
* ```
|
|
299
|
+
*/
|
|
300
|
+
/**
 * Create an LM Studio provider (OpenAI-compatible local endpoint).
 * LM Studio does not check the API key, so a fixed placeholder is used.
 *
 * @example
 * ```ts
 * const provider = createLMStudio({ model: 'local-model' });
 * ```
 */
function createLMStudio(config = { model: "local-model" }) {
	const port = config.port ?? 1234;
	const baseUrl = `http://localhost:${port}/v1`;
	return new OpenAIProvider({
		baseUrl,
		model: config.model,
		apiKey: "lm-studio"
	});
}
|
|
307
|
+
|
|
308
|
+
//#endregion
|
|
309
|
+
export { AnthropicProvider, ClassifySchema, EntitiesSchema, OpenAIProvider, SummarySchema, TagsSchema, ask, createLMStudio, createOllama, createOpenAI, enhance, extract };
|
|
310
|
+
//# sourceMappingURL=index.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.mjs","names":["DEFAULT_MODEL","DEFAULT_MAX_TOKENS","zodToJsonSchema","properties: Record<string, object>","messages: Array<{ role: 'system' | 'user'; content: string }>","properties: Record<string, object>","required: string[]"],"sources":["../../src/llm/anthropic.ts","../../src/llm/openai.ts"],"sourcesContent":["import type { z } from 'zod';\nimport { ScrapeError } from '@/core/errors.js';\nimport type { AnthropicConfig, CompletionOptions, LLMProvider } from './types.js';\n\nconst DEFAULT_MODEL = 'claude-3-5-haiku-20241022';\nconst DEFAULT_MAX_TOKENS = 1024;\n\n/**\n * Anthropic Claude provider\n *\n * Requires @anthropic-ai/sdk as a peer dependency.\n *\n * @example\n * ```ts\n * const provider = new AnthropicProvider({ apiKey: 'sk-...' });\n * const result = await scrape(url, { llm: provider, enhance: ['summarize'] });\n * ```\n */\nexport class AnthropicProvider implements LLMProvider {\n readonly name = 'anthropic';\n private client: unknown;\n private model: string;\n\n constructor(config: AnthropicConfig = {}) {\n const apiKey = config.apiKey ?? process.env.ANTHROPIC_API_KEY;\n if (!apiKey) {\n throw new ScrapeError(\n 'Anthropic API key required. Set ANTHROPIC_API_KEY env var or pass apiKey in config.',\n 'LLM_ERROR'\n );\n }\n\n this.model = config.model ?? DEFAULT_MODEL;\n\n // Dynamic import to avoid requiring the SDK if not used\n try {\n // eslint-disable-next-line @typescript-eslint/no-require-imports\n const { Anthropic } = require('@anthropic-ai/sdk') as typeof import('@anthropic-ai/sdk');\n this.client = new Anthropic({\n apiKey,\n baseURL: config.baseUrl,\n });\n } catch {\n throw new ScrapeError(\n '@anthropic-ai/sdk is required for Anthropic provider. 
Install with: npm install @anthropic-ai/sdk',\n 'LLM_ERROR'\n );\n }\n }\n\n async complete(prompt: string, options: CompletionOptions = {}): Promise<string> {\n try {\n const client = this.client as import('@anthropic-ai/sdk').Anthropic;\n const response = await client.messages.create({\n model: this.model,\n max_tokens: options.maxTokens ?? DEFAULT_MAX_TOKENS,\n messages: [{ role: 'user', content: prompt }],\n system: options.systemPrompt,\n temperature: options.temperature,\n });\n\n const content = response.content[0];\n if (content?.type === 'text' && content.text) {\n return content.text;\n }\n\n throw new ScrapeError('Unexpected or empty response from Anthropic', 'LLM_ERROR');\n } catch (error) {\n if (error instanceof ScrapeError) throw error;\n throw new ScrapeError(\n `Anthropic API error: ${error instanceof Error ? error.message : String(error)}`,\n 'LLM_ERROR',\n undefined,\n error instanceof Error ? error : undefined\n );\n }\n }\n\n async completeJSON<T>(\n prompt: string,\n schema: z.ZodType<T>,\n options: CompletionOptions = {}\n ): Promise<T> {\n const jsonPrompt = `${prompt}\n\nRespond ONLY with valid JSON matching this schema:\n${JSON.stringify(zodToJsonSchema(schema), null, 2)}\n\nDo not include any explanation or markdown formatting. Just the JSON object.`;\n\n const response = await this.complete(jsonPrompt, {\n ...options,\n systemPrompt:\n options.systemPrompt ?? 'You are a helpful assistant that responds only with valid JSON.',\n });\n\n try {\n // Try to extract JSON from the response\n const jsonMatch = response.match(/\\{[\\s\\S]*\\}/);\n if (!jsonMatch) {\n throw new Error('No JSON object found in response');\n }\n\n const parsed = JSON.parse(jsonMatch[0]);\n return schema.parse(parsed);\n } catch (error) {\n throw new ScrapeError(\n `Failed to parse LLM response as JSON: ${error instanceof Error ? error.message : String(error)}`,\n 'VALIDATION_ERROR',\n undefined,\n error instanceof Error ? 
error : undefined\n );\n }\n }\n}\n\n/**\n * Convert a Zod schema to a simple JSON Schema representation\n * (simplified version for prompt engineering)\n */\nfunction zodToJsonSchema(schema: z.ZodType<unknown>): object {\n const def = (schema as z.ZodType<unknown> & { _def: { typeName: string } })._def;\n\n switch (def.typeName) {\n case 'ZodObject': {\n const shape = (schema as z.ZodObject<z.ZodRawShape>).shape;\n const properties: Record<string, object> = {};\n for (const [key, value] of Object.entries(shape)) {\n properties[key] = zodToJsonSchema(value as z.ZodType<unknown>);\n }\n return { type: 'object', properties };\n }\n case 'ZodArray': {\n const arrayDef = def as unknown as { type: z.ZodType<unknown> };\n return { type: 'array', items: zodToJsonSchema(arrayDef.type) };\n }\n case 'ZodString':\n return { type: 'string' };\n case 'ZodNumber':\n return { type: 'number' };\n case 'ZodBoolean':\n return { type: 'boolean' };\n case 'ZodEnum': {\n const enumDef = def as unknown as { values: string[] };\n return { type: 'string', enum: enumDef.values };\n }\n default:\n return { type: 'string' };\n }\n}\n","import type { z } from 'zod';\nimport { ScrapeError } from '@/core/errors.js';\nimport type { CompletionOptions, LLMProvider, OpenAICompatibleConfig } from './types.js';\n\nconst DEFAULT_MODEL = 'gpt-4o-mini';\nconst DEFAULT_MAX_TOKENS = 1024;\nconst DEFAULT_BASE_URL = 'https://api.openai.com/v1';\n\n/**\n * OpenAI-compatible provider\n *\n * Works with:\n * - OpenAI API\n * - Ollama (http://localhost:11434/v1)\n * - LM Studio (http://localhost:1234/v1)\n * - LocalAI\n * - vLLM\n * - Any OpenAI-compatible API\n *\n * Requires `openai` as a peer dependency.\n *\n * @example\n * ```ts\n * // OpenAI\n * const provider = new OpenAIProvider({ apiKey: 'sk-...' 
});\n *\n * // Ollama\n * const provider = new OpenAIProvider({\n * baseUrl: 'http://localhost:11434/v1',\n * model: 'llama3.2',\n * apiKey: 'ollama' // Ollama doesn't require a real key\n * });\n *\n * // LM Studio\n * const provider = new OpenAIProvider({\n * baseUrl: 'http://localhost:1234/v1',\n * model: 'local-model',\n * apiKey: 'lm-studio'\n * });\n * ```\n */\nexport class OpenAIProvider implements LLMProvider {\n readonly name = 'openai';\n private client: unknown;\n private model: string;\n\n constructor(config: OpenAICompatibleConfig = {}) {\n const apiKey = config.apiKey ?? process.env.OPENAI_API_KEY;\n const baseUrl = config.baseUrl ?? DEFAULT_BASE_URL;\n\n // Only require API key for OpenAI (not for local providers)\n if (!apiKey && baseUrl === DEFAULT_BASE_URL) {\n throw new ScrapeError(\n 'OpenAI API key required. Set OPENAI_API_KEY env var or pass apiKey in config.',\n 'LLM_ERROR'\n );\n }\n\n this.model = config.model ?? DEFAULT_MODEL;\n\n // Dynamic import to avoid requiring the SDK if not used\n try {\n // eslint-disable-next-line @typescript-eslint/no-require-imports\n const { OpenAI } = require('openai') as typeof import('openai');\n this.client = new OpenAI({\n apiKey: apiKey ?? 'local', // Use 'local' as placeholder for local providers\n baseURL: baseUrl,\n });\n } catch {\n throw new ScrapeError(\n 'openai package is required for OpenAI provider. Install with: npm install openai',\n 'LLM_ERROR'\n );\n }\n }\n\n async complete(prompt: string, options: CompletionOptions = {}): Promise<string> {\n try {\n const client = this.client as import('openai').OpenAI;\n const messages: Array<{ role: 'system' | 'user'; content: string }> = [];\n\n if (options.systemPrompt) {\n messages.push({ role: 'system', content: options.systemPrompt });\n }\n messages.push({ role: 'user', content: prompt });\n\n const response = await client.chat.completions.create({\n model: this.model,\n max_tokens: options.maxTokens ?? 
DEFAULT_MAX_TOKENS,\n messages,\n temperature: options.temperature,\n });\n\n const content = response.choices[0]?.message?.content;\n if (!content) {\n throw new ScrapeError('Empty response from OpenAI', 'LLM_ERROR');\n }\n\n return content;\n } catch (error) {\n if (error instanceof ScrapeError) throw error;\n throw new ScrapeError(\n `OpenAI API error: ${error instanceof Error ? error.message : String(error)}`,\n 'LLM_ERROR',\n undefined,\n error instanceof Error ? error : undefined\n );\n }\n }\n\n async completeJSON<T>(\n prompt: string,\n schema: z.ZodType<T>,\n options: CompletionOptions = {}\n ): Promise<T> {\n const client = this.client as import('openai').OpenAI;\n\n try {\n // Use JSON mode for structured outputs\n const messages: Array<{ role: 'system' | 'user'; content: string }> = [\n {\n role: 'system',\n content:\n options.systemPrompt ??\n 'You are a helpful assistant that extracts information from content.',\n },\n { role: 'user', content: prompt },\n ];\n\n const response = await client.chat.completions.create({\n model: this.model,\n max_tokens: options.maxTokens ?? DEFAULT_MAX_TOKENS,\n messages,\n temperature: options.temperature,\n response_format: { type: 'json_object' },\n });\n\n const content = response.choices[0]?.message?.content;\n if (!content) {\n throw new ScrapeError('Empty response from OpenAI', 'LLM_ERROR');\n }\n\n const parsed = JSON.parse(content);\n return schema.parse(parsed);\n } catch (error) {\n // Fallback to regular completion with JSON instruction\n if (error instanceof ScrapeError) throw error;\n\n // If structured output failed, try regular completion\n const jsonPrompt = `${prompt}\n\nRespond ONLY with valid JSON matching this schema:\n${JSON.stringify(zodToJsonSchema(schema), null, 2)}\n\nDo not include any explanation or markdown formatting. 
Just the JSON object.`;\n\n const response = await this.complete(jsonPrompt, {\n ...options,\n systemPrompt: 'You respond only with valid JSON.',\n });\n\n try {\n const jsonMatch = response.match(/\\{[\\s\\S]*\\}/);\n if (!jsonMatch) {\n throw new Error('No JSON object found in response');\n }\n\n const parsed = JSON.parse(jsonMatch[0]);\n return schema.parse(parsed);\n } catch (parseError) {\n throw new ScrapeError(\n `Failed to parse LLM response as JSON: ${parseError instanceof Error ? parseError.message : String(parseError)}`,\n 'VALIDATION_ERROR',\n undefined,\n parseError instanceof Error ? parseError : undefined\n );\n }\n }\n }\n}\n\n/**\n * Convert a Zod schema to JSON Schema for structured outputs\n */\nfunction zodToJsonSchema(schema: z.ZodType<unknown>): object {\n const def = (schema as z.ZodType<unknown> & { _def: { typeName: string } })._def;\n\n switch (def.typeName) {\n case 'ZodObject': {\n const shape = (schema as z.ZodObject<z.ZodRawShape>).shape;\n const properties: Record<string, object> = {};\n const required: string[] = [];\n\n for (const [key, value] of Object.entries(shape)) {\n properties[key] = zodToJsonSchema(value as z.ZodType<unknown>);\n // Assume all fields are required unless wrapped in ZodOptional\n const valueDef = (value as z.ZodType<unknown> & { _def: { typeName: string } })._def;\n if (valueDef.typeName !== 'ZodOptional') {\n required.push(key);\n }\n }\n return { type: 'object', properties, required };\n }\n case 'ZodArray': {\n const arrayDef = def as unknown as { type: z.ZodType<unknown> };\n return { type: 'array', items: zodToJsonSchema(arrayDef.type) };\n }\n case 'ZodString':\n return { type: 'string' };\n case 'ZodNumber':\n return { type: 'number' };\n case 'ZodBoolean':\n return { type: 'boolean' };\n case 'ZodEnum': {\n const enumDef = def as unknown as { values: string[] };\n return { type: 'string', enum: enumDef.values };\n }\n case 'ZodOptional': {\n const optionalDef = def as unknown as { innerType: 
z.ZodType<unknown> };\n return zodToJsonSchema(optionalDef.innerType);\n }\n default:\n return { type: 'string' };\n }\n}\n\n// Convenience factory functions\n\n/**\n * Create an OpenAI provider with default settings\n */\nexport function createOpenAI(config?: OpenAICompatibleConfig): OpenAIProvider {\n return new OpenAIProvider(config);\n}\n\n/**\n * Create an Ollama provider\n *\n * @example\n * ```ts\n * const provider = createOllama({ model: 'llama3.2' });\n * ```\n */\nexport function createOllama(\n config: { model: string; port?: number } = { model: 'llama3.2' }\n): OpenAIProvider {\n return new OpenAIProvider({\n baseUrl: `http://localhost:${config.port ?? 11434}/v1`,\n model: config.model,\n apiKey: 'ollama',\n });\n}\n\n/**\n * Create an LM Studio provider\n *\n * @example\n * ```ts\n * const provider = createLMStudio({ model: 'local-model' });\n * ```\n */\nexport function createLMStudio(\n config: { model: string; port?: number } = { model: 'local-model' }\n): OpenAIProvider {\n return new OpenAIProvider({\n baseUrl: `http://localhost:${config.port ?? 
1234}/v1`,\n model: config.model,\n apiKey: 'lm-studio',\n });\n}\n"],"mappings":";;;;;;;;AAIA,MAAMA,kBAAgB;AACtB,MAAMC,uBAAqB;;;;;;;;;;;;AAa3B,IAAa,oBAAb,MAAsD;CACpD,AAAS,OAAO;CAChB,AAAQ;CACR,AAAQ;CAER,YAAY,SAA0B,EAAE,EAAE;EACxC,MAAM,SAAS,OAAO,UAAU,QAAQ,IAAI;AAC5C,MAAI,CAAC,OACH,OAAM,IAAI,YACR,uFACA,YACD;AAGH,OAAK,QAAQ,OAAO,SAASD;AAG7B,MAAI;GAEF,MAAM,EAAE,wBAAsB,oBAAoB;AAClD,QAAK,SAAS,IAAI,UAAU;IAC1B;IACA,SAAS,OAAO;IACjB,CAAC;UACI;AACN,SAAM,IAAI,YACR,qGACA,YACD;;;CAIL,MAAM,SAAS,QAAgB,UAA6B,EAAE,EAAmB;AAC/E,MAAI;GAUF,MAAM,WARW,MADF,KAAK,OACU,SAAS,OAAO;IAC5C,OAAO,KAAK;IACZ,YAAY,QAAQ,aAAaC;IACjC,UAAU,CAAC;KAAE,MAAM;KAAQ,SAAS;KAAQ,CAAC;IAC7C,QAAQ,QAAQ;IAChB,aAAa,QAAQ;IACtB,CAAC,EAEuB,QAAQ;AACjC,OAAI,SAAS,SAAS,UAAU,QAAQ,KACtC,QAAO,QAAQ;AAGjB,SAAM,IAAI,YAAY,+CAA+C,YAAY;WAC1E,OAAO;AACd,OAAI,iBAAiB,YAAa,OAAM;AACxC,SAAM,IAAI,YACR,wBAAwB,iBAAiB,QAAQ,MAAM,UAAU,OAAO,MAAM,IAC9E,aACA,QACA,iBAAiB,QAAQ,QAAQ,OAClC;;;CAIL,MAAM,aACJ,QACA,QACA,UAA6B,EAAE,EACnB;EACZ,MAAM,aAAa,GAAG,OAAO;;;EAG/B,KAAK,UAAUC,kBAAgB,OAAO,EAAE,MAAM,EAAE,CAAC;;;EAI/C,MAAM,WAAW,MAAM,KAAK,SAAS,YAAY;GAC/C,GAAG;GACH,cACE,QAAQ,gBAAgB;GAC3B,CAAC;AAEF,MAAI;GAEF,MAAM,YAAY,SAAS,MAAM,cAAc;AAC/C,OAAI,CAAC,UACH,OAAM,IAAI,MAAM,mCAAmC;GAGrD,MAAM,SAAS,KAAK,MAAM,UAAU,GAAG;AACvC,UAAO,OAAO,MAAM,OAAO;WACpB,OAAO;AACd,SAAM,IAAI,YACR,yCAAyC,iBAAiB,QAAQ,MAAM,UAAU,OAAO,MAAM,IAC/F,oBACA,QACA,iBAAiB,QAAQ,QAAQ,OAClC;;;;;;;;AASP,SAASA,kBAAgB,QAAoC;CAC3D,MAAM,MAAO,OAA+D;AAE5E,SAAQ,IAAI,UAAZ;EACE,KAAK,aAAa;GAChB,MAAM,QAAS,OAAsC;GACrD,MAAMC,aAAqC,EAAE;AAC7C,QAAK,MAAM,CAAC,KAAK,UAAU,OAAO,QAAQ,MAAM,CAC9C,YAAW,OAAOD,kBAAgB,MAA4B;AAEhE,UAAO;IAAE,MAAM;IAAU;IAAY;;EAEvC,KAAK,WAEH,QAAO;GAAE,MAAM;GAAS,OAAOA,kBADd,IACuC,KAAK;GAAE;EAEjE,KAAK,YACH,QAAO,EAAE,MAAM,UAAU;EAC3B,KAAK,YACH,QAAO,EAAE,MAAM,UAAU;EAC3B,KAAK,aACH,QAAO,EAAE,MAAM,WAAW;EAC5B,KAAK,UAEH,QAAO;GAAE,MAAM;GAAU,MADT,IACuB;GAAQ;EAEjD,QACE,QAAO,EAAE,MAAM,UAAU;;;;;;AC/I/B,MAAM,gBAAgB;AACtB,MAAM,qBAAqB;AAC3B,MAAM,mBAAmB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAmCzB,IAAa,iBAAb,MAAmD;CACjD,AAAS,OAAO;C
AChB,AAAQ;CACR,AAAQ;CAER,YAAY,SAAiC,EAAE,EAAE;EAC/C,MAAM,SAAS,OAAO,UAAU,QAAQ,IAAI;EAC5C,MAAM,UAAU,OAAO,WAAW;AAGlC,MAAI,CAAC,UAAU,YAAY,iBACzB,OAAM,IAAI,YACR,iFACA,YACD;AAGH,OAAK,QAAQ,OAAO,SAAS;AAG7B,MAAI;GAEF,MAAM,EAAE,qBAAmB,SAAS;AACpC,QAAK,SAAS,IAAI,OAAO;IACvB,QAAQ,UAAU;IAClB,SAAS;IACV,CAAC;UACI;AACN,SAAM,IAAI,YACR,oFACA,YACD;;;CAIL,MAAM,SAAS,QAAgB,UAA6B,EAAE,EAAmB;AAC/E,MAAI;GACF,MAAM,SAAS,KAAK;GACpB,MAAME,WAAgE,EAAE;AAExE,OAAI,QAAQ,aACV,UAAS,KAAK;IAAE,MAAM;IAAU,SAAS,QAAQ;IAAc,CAAC;AAElE,YAAS,KAAK;IAAE,MAAM;IAAQ,SAAS;IAAQ,CAAC;GAShD,MAAM,WAPW,MAAM,OAAO,KAAK,YAAY,OAAO;IACpD,OAAO,KAAK;IACZ,YAAY,QAAQ,aAAa;IACjC;IACA,aAAa,QAAQ;IACtB,CAAC,EAEuB,QAAQ,IAAI,SAAS;AAC9C,OAAI,CAAC,QACH,OAAM,IAAI,YAAY,8BAA8B,YAAY;AAGlE,UAAO;WACA,OAAO;AACd,OAAI,iBAAiB,YAAa,OAAM;AACxC,SAAM,IAAI,YACR,qBAAqB,iBAAiB,QAAQ,MAAM,UAAU,OAAO,MAAM,IAC3E,aACA,QACA,iBAAiB,QAAQ,QAAQ,OAClC;;;CAIL,MAAM,aACJ,QACA,QACA,UAA6B,EAAE,EACnB;EACZ,MAAM,SAAS,KAAK;AAEpB,MAAI;GAEF,MAAMA,WAAgE,CACpE;IACE,MAAM;IACN,SACE,QAAQ,gBACR;IACH,EACD;IAAE,MAAM;IAAQ,SAAS;IAAQ,CAClC;GAUD,MAAM,WARW,MAAM,OAAO,KAAK,YAAY,OAAO;IACpD,OAAO,KAAK;IACZ,YAAY,QAAQ,aAAa;IACjC;IACA,aAAa,QAAQ;IACrB,iBAAiB,EAAE,MAAM,eAAe;IACzC,CAAC,EAEuB,QAAQ,IAAI,SAAS;AAC9C,OAAI,CAAC,QACH,OAAM,IAAI,YAAY,8BAA8B,YAAY;GAGlE,MAAM,SAAS,KAAK,MAAM,QAAQ;AAClC,UAAO,OAAO,MAAM,OAAO;WACpB,OAAO;AAEd,OAAI,iBAAiB,YAAa,OAAM;GAGxC,MAAM,aAAa,GAAG,OAAO;;;EAGjC,KAAK,UAAU,gBAAgB,OAAO,EAAE,MAAM,EAAE,CAAC;;;GAI7C,MAAM,WAAW,MAAM,KAAK,SAAS,YAAY;IAC/C,GAAG;IACH,cAAc;IACf,CAAC;AAEF,OAAI;IACF,MAAM,YAAY,SAAS,MAAM,cAAc;AAC/C,QAAI,CAAC,UACH,OAAM,IAAI,MAAM,mCAAmC;IAGrD,MAAM,SAAS,KAAK,MAAM,UAAU,GAAG;AACvC,WAAO,OAAO,MAAM,OAAO;YACpB,YAAY;AACnB,UAAM,IAAI,YACR,yCAAyC,sBAAsB,QAAQ,WAAW,UAAU,OAAO,WAAW,IAC9G,oBACA,QACA,sBAAsB,QAAQ,aAAa,OAC5C;;;;;;;;AAST,SAAS,gBAAgB,QAAoC;CAC3D,MAAM,MAAO,OAA+D;AAE5E,SAAQ,IAAI,UAAZ;EACE,KAAK,aAAa;GAChB,MAAM,QAAS,OAAsC;GACrD,MAAMC,aAAqC,EAAE;GAC7C,MAAMC,WAAqB,EAAE;AAE7B,QAAK,MAAM,CAAC,KAAK,UAAU,OAAO,QAAQ,MAAM,EAAE;AAChD,eAAW,OAAO,gBAAgB,MAA4B;AAG9D,QADkB,MAA8D,KACnE,aAAa,cACx
B,UAAS,KAAK,IAAI;;AAGtB,UAAO;IAAE,MAAM;IAAU;IAAY;IAAU;;EAEjD,KAAK,WAEH,QAAO;GAAE,MAAM;GAAS,OAAO,gBADd,IACuC,KAAK;GAAE;EAEjE,KAAK,YACH,QAAO,EAAE,MAAM,UAAU;EAC3B,KAAK,YACH,QAAO,EAAE,MAAM,UAAU;EAC3B,KAAK,aACH,QAAO,EAAE,MAAM,WAAW;EAC5B,KAAK,UAEH,QAAO;GAAE,MAAM;GAAU,MADT,IACuB;GAAQ;EAEjD,KAAK,cAEH,QAAO,gBADa,IACe,UAAU;EAE/C,QACE,QAAO,EAAE,MAAM,UAAU;;;;;;AAS/B,SAAgB,aAAa,QAAiD;AAC5E,QAAO,IAAI,eAAe,OAAO;;;;;;;;;;AAWnC,SAAgB,aACd,SAA2C,EAAE,OAAO,YAAY,EAChD;AAChB,QAAO,IAAI,eAAe;EACxB,SAAS,oBAAoB,OAAO,QAAQ,MAAM;EAClD,OAAO,OAAO;EACd,QAAQ;EACT,CAAC;;;;;;;;;;AAWJ,SAAgB,eACd,SAA2C,EAAE,OAAO,eAAe,EACnD;AAChB,QAAO,IAAI,eAAe;EACxB,SAAS,oBAAoB,OAAO,QAAQ,KAAK;EACjD,OAAO,OAAO;EACd,QAAQ;EACT,CAAC"}
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
const require_index = require('../index.cjs');
|
|
2
|
+
let mdast_util_from_markdown = require("mdast-util-from-markdown");
|
|
3
|
+
let mdast_util_to_string = require("mdast-util-to-string");
|
|
4
|
+
let unist_util_visit = require("unist-util-visit");
|
|
5
|
+
|
|
6
|
+
//#region src/parsers/github.ts
|
|
7
|
+
/**
|
|
8
|
+
* GitHub-specific utilities for parsing repositories.
|
|
9
|
+
*/
|
|
10
|
+
/**
|
|
11
|
+
* Check if a URL is a GitHub repository
|
|
12
|
+
*/
|
|
13
|
+
/**
 * Determine whether a URL points at the root of a GitHub repository
 * (i.e. `https://github.com/<owner>/<repo>`, optional trailing slash).
 *
 * @param {string} url - URL to test.
 * @returns {boolean} `true` for a bare GitHub repo URL, `false` otherwise.
 */
function isGitHubRepo(url) {
	const repoRootPattern = /^https?:\/\/(www\.)?github\.com\/[^/]+\/[^/]+\/?$/;
	return repoRootPattern.test(url);
}
|
|
16
|
+
/**
|
|
17
|
+
* Extract GitHub repo info from URL
|
|
18
|
+
*/
|
|
19
|
+
/**
 * Extract the owner and repository name from a GitHub URL.
 *
 * Accepts any URL containing a `github.com/<owner>/<repo>` segment and
 * tolerates a trailing `.git` suffix, query string, or fragment on the
 * repo segment.
 *
 * @param {string} url - URL to parse.
 * @returns {{ owner: string, repo: string } | null} Repo info, or `null`
 *   when the URL does not contain an owner/repo pair.
 */
function parseGitHubUrl(url) {
	// Stop each capture at `/`, `?`, or `#` so query strings and fragments
	// (e.g. `repo?tab=readme`, `repo#section`) are not swallowed into the
	// owner or repo name. Clean URLs behave exactly as before.
	const match = url.match(/github\.com\/([^/?#]+)\/([^/?#]+)/);
	if (!match || !match[1] || !match[2]) return null;
	return {
		owner: match[1],
		repo: match[2].replace(/\.git$/, "")
	};
}
|
|
27
|
+
/**
|
|
28
|
+
* Convert a GitHub repo URL to raw content URL
|
|
29
|
+
*/
|
|
30
|
+
/**
 * Convert a GitHub repository URL into the raw-content URL for a file in
 * that repository.
 *
 * @param {string} url - GitHub repository URL.
 * @param {string} [branch="main"] - Branch name.
 * @param {string} [file="README.md"] - File path within the repository.
 * @returns {string} The `raw.githubusercontent.com` URL, or the input
 *   unchanged when the URL is not a parseable GitHub repo URL.
 */
function toRawUrl(url, branch = "main", file = "README.md") {
	const repoInfo = parseGitHubUrl(url);
	if (repoInfo === null) return url;
	const { owner, repo } = repoInfo;
	return `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${file}`;
}
|
|
35
|
+
/**
|
|
36
|
+
* Fetch GitHub API metadata for a repository
|
|
37
|
+
* Note: This is a placeholder - actual implementation would need GitHub API access
|
|
38
|
+
*/
|
|
39
|
+
/**
 * Fetch GitHub API metadata for a repository.
 *
 * Placeholder implementation: a full version would call the GitHub API
 * (authenticated via `_token`); this one simply echoes the identifiers
 * back as basic metadata.
 *
 * @param {string} owner - Repository owner.
 * @param {string} repo - Repository name.
 * @param {string} [_token] - API token (currently unused).
 * @returns {Promise<{ repoOwner: string, repoName: string }>} Basic repo info.
 */
async function fetchRepoMeta(owner, repo, _token) {
	const meta = {
		repoOwner: owner,
		repoName: repo
	};
	return meta;
}
|
|
45
|
+
/**
|
|
46
|
+
* Group links by their category/section
|
|
47
|
+
*/
|
|
48
|
+
/**
 * Group links by their `context` (the section heading they appeared
 * under), falling back to "Uncategorized" for links without one.
 *
 * @param {Array<{ context?: string }>} links - Links to group.
 * @returns {Map<string, object[]>} Category name → links in that category,
 *   in first-seen order.
 */
function groupByCategory(links) {
	const grouped = /* @__PURE__ */ new Map();
	for (const link of links) {
		const key = link.context || "Uncategorized";
		if (!grouped.has(key)) grouped.set(key, []);
		grouped.get(key).push(link);
	}
	return grouped;
}
|
|
58
|
+
|
|
59
|
+
//#endregion
|
|
60
|
+
//#region src/parsers/markdown.ts
|
|
61
|
+
/**
|
|
62
|
+
* Generic Markdown parser.
|
|
63
|
+
* Extracts structure, links, and code blocks from markdown content.
|
|
64
|
+
*
|
|
65
|
+
* @example
|
|
66
|
+
* ```ts
|
|
67
|
+
* const parser = new MarkdownParser();
|
|
68
|
+
* const result = parser.parse(markdownContent);
|
|
69
|
+
* console.log(result.data.sections);
|
|
70
|
+
* console.log(result.data.links);
|
|
71
|
+
* ```
|
|
72
|
+
*/
|
|
73
|
+
var MarkdownParser = class {
	// Parser identifier used by the SourceParser interface.
	name = "markdown";
	// Heuristic check for markdown-like content: headings, list links, or
	// fenced code blocks. Cheap substring tests only — no parsing.
	canParse(content) {
		return content.includes("# ") || content.includes("## ") || content.includes("- [") || content.includes("* [") || content.includes("```");
	}
	// Parse markdown into sections, links, code blocks, and (optional)
	// frontmatter, returning them under a `data` envelope.
	parse(content) {
		const tree = (0, mdast_util_from_markdown.fromMarkdown)(content);
		const sections = [];
		const allLinks = [];
		const codeBlocks = [];
		let frontmatter;
		// Naive frontmatter extraction: content between a leading "---" and
		// the next "---". NOTE(review): indexOf finds "---" anywhere, not
		// only at a line start — a stray "---" inside the body before the
		// closing fence would truncate the frontmatter; confirm acceptable.
		// The frontmatter text is not stripped before fromMarkdown, so the
		// AST also sees it.
		if (content.startsWith("---")) {
			const endIndex = content.indexOf("---", 3);
			if (endIndex !== -1) {
				const frontmatterContent = content.slice(3, endIndex).trim();
				frontmatter = this.parseFrontmatter(frontmatterContent);
			}
		}
		// The section currently being accumulated; each heading finalizes
		// the previous one and starts a new one.
		let currentSection = null;
		(0, unist_util_visit.visit)(tree, (node) => {
			// A heading closes the open section and opens a new one at the
			// heading's depth.
			if (node.type === "heading") {
				const heading = node;
				const title = (0, mdast_util_to_string.toString)(heading);
				if (currentSection) sections.push(currentSection);
				currentSection = {
					level: heading.depth,
					title,
					content: "",
					links: []
				};
			}
			// Every link is recorded globally; links inside a section are
			// also attached to it, with the section title as context.
			if (node.type === "link") {
				const link = node;
				const text = (0, mdast_util_to_string.toString)(link);
				const linkData = {
					url: link.url,
					text,
					title: link.title ?? void 0,
					context: currentSection?.title
				};
				allLinks.push(linkData);
				if (currentSection) currentSection.links.push(linkData);
			}
			// Fenced/indented code blocks, with language and meta when present.
			if (node.type === "code") {
				const code = node;
				codeBlocks.push({
					language: code.lang ?? void 0,
					code: code.value,
					meta: code.meta ?? void 0
				});
			}
			// Paragraph text accumulates into the open section, separated by
			// blank lines.
			if (currentSection && node.type === "paragraph") {
				const text = (0, mdast_util_to_string.toString)(node);
				currentSection.content += (currentSection.content ? "\n\n" : "") + text;
			}
		});
		// Flush the final section after traversal.
		if (currentSection) sections.push(currentSection);
		// Title/description prefer frontmatter, then fall back to the first
		// h1 / first pre-heading paragraph respectively.
		return { data: {
			title: frontmatter?.title ?? sections.find((s) => s.level === 1)?.title,
			description: frontmatter?.description ?? this.extractDescription(tree),
			sections,
			links: allLinks,
			codeBlocks,
			frontmatter
		} };
	}
	// Minimal "key: value" frontmatter parser (not full YAML): coerces
	// booleans and numbers, strips matching single/double quotes.
	parseFrontmatter(content) {
		const result = {};
		const lines = content.split("\n");
		for (const line of lines) {
			const colonIndex = line.indexOf(":");
			// colonIndex > 0 also skips lines starting with ":" (empty key).
			if (colonIndex > 0) {
				const key = line.slice(0, colonIndex).trim();
				let value = line.slice(colonIndex + 1).trim();
				if (value === "true") value = true;
				else if (value === "false") value = false;
				else if (/^-?\d+(\.\d+)?$/.test(value)) value = Number(value);
				else if (value.startsWith("\"") && value.endsWith("\"")) value = value.slice(1, -1);
				else if (value.startsWith("'") && value.endsWith("'")) value = value.slice(1, -1);
				result[key] = value;
			}
		}
		return result;
	}
	// Returns the text of the first paragraph that appears before any
	// heading, or undefined (implicitly) when none exists.
	extractDescription(tree) {
		for (const node of tree.children) {
			if (node.type === "heading") break;
			if (node.type === "paragraph") return (0, mdast_util_to_string.toString)(node);
		}
	}
};
|
|
164
|
+
/**
|
|
165
|
+
* Extract links from a list-based markdown structure (like awesome lists)
|
|
166
|
+
*/
|
|
167
|
+
/**
 * Extract links that appear inside markdown list items (e.g. awesome-list
 * entries), tagging each with the most recently seen heading as its
 * context.
 *
 * @param {string} markdown - Raw markdown source.
 * @returns {object[]} Links with `url`, `text`, and optional
 *   `title`/`context` fields, in document order.
 */
function extractListLinks(markdown) {
	const tree = (0, mdast_util_from_markdown.fromMarkdown)(markdown);
	const collected = [];
	let headingContext = "";
	(0, unist_util_visit.visit)(tree, (node) => {
		if (node.type === "heading") {
			headingContext = (0, mdast_util_to_string.toString)(node);
		}
		if (node.type === "listItem") {
			// Inner visit picks up every link nested inside this list item.
			(0, unist_util_visit.visit)(node, "link", (linkNode) => {
				collected.push({
					url: linkNode.url,
					text: (0, mdast_util_to_string.toString)(linkNode),
					title: linkNode.title ?? void 0,
					context: headingContext || void 0
				});
			});
		}
	});
	return collected;
}
|
|
184
|
+
/**
|
|
185
|
+
* Parse markdown into sections by heading level
|
|
186
|
+
*/
|
|
187
|
+
/**
 * Parse markdown into sections and keep only those whose heading level is
 * at or below the given minimum depth.
 *
 * @param {string} markdown - Raw markdown source.
 * @param {number} [minLevel=2] - Minimum heading level to keep.
 * @returns {object[]} Sections with `level >= minLevel`, in document order.
 */
function parseByHeadings(markdown, minLevel = 2) {
	const parser = new MarkdownParser();
	const { sections } = parser.parse(markdown).data;
	return sections.filter((section) => section.level >= minLevel);
}
|
|
190
|
+
|
|
191
|
+
//#endregion
|
|
192
|
+
// Public CommonJS surface of the parsers entry point.
Object.assign(exports, {
	MarkdownParser,
	extractListLinks,
	fetchRepoMeta,
	groupByCategory,
	isGitHubRepo,
	parseByHeadings,
	parseGitHubUrl,
	toRawUrl
});
//# sourceMappingURL=index.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.cjs","names":["sections: MarkdownSection[]","allLinks: MarkdownLink[]","codeBlocks: CodeBlock[]","frontmatter: Record<string, unknown> | undefined","currentSection: MarkdownSection | null","linkData: MarkdownLink","result: Record<string, unknown>","value: string | boolean | number","links: MarkdownLink[]"],"sources":["../../src/parsers/github.ts","../../src/parsers/markdown.ts"],"sourcesContent":["import type { GitHubMeta, MarkdownLink } from './types.js';\n\n/**\n * GitHub-specific utilities for parsing repositories.\n */\n\n/**\n * Check if a URL is a GitHub repository\n */\nexport function isGitHubRepo(url: string): boolean {\n return /^https?:\\/\\/(www\\.)?github\\.com\\/[^/]+\\/[^/]+\\/?$/.test(url);\n}\n\n/**\n * Extract GitHub repo info from URL\n */\nexport function parseGitHubUrl(url: string): { owner: string; repo: string } | null {\n const match = url.match(/github\\.com\\/([^/]+)\\/([^/]+)/);\n if (!match || !match[1] || !match[2]) return null;\n return {\n owner: match[1],\n repo: match[2].replace(/\\.git$/, ''),\n };\n}\n\n/**\n * Convert a GitHub repo URL to raw content URL\n */\nexport function toRawUrl(url: string, branch = 'main', file = 'README.md'): string {\n const info = parseGitHubUrl(url);\n if (!info) return url;\n return `https://raw.githubusercontent.com/${info.owner}/${info.repo}/${branch}/${file}`;\n}\n\n/**\n * Fetch GitHub API metadata for a repository\n * Note: This is a placeholder - actual implementation would need GitHub API access\n */\nexport async function fetchRepoMeta(\n owner: string,\n repo: string,\n _token?: string\n): Promise<GitHubMeta> {\n // This would make actual API calls in a full implementation\n // For now, return basic info\n return {\n repoOwner: owner,\n repoName: repo,\n };\n}\n\n/**\n * Group links by their category/section\n */\nexport function groupByCategory(links: MarkdownLink[]): Map<string, MarkdownLink[]> {\n const groups = new Map<string, MarkdownLink[]>();\n\n for (const 
link of links) {\n const category = link.context || 'Uncategorized';\n const existing = groups.get(category) || [];\n existing.push(link);\n groups.set(category, existing);\n }\n\n return groups;\n}\n","import type { Code, Heading, Link, ListItem, Root } from 'mdast';\nimport { fromMarkdown } from 'mdast-util-from-markdown';\nimport { toString as mdastToString } from 'mdast-util-to-string';\nimport { visit } from 'unist-util-visit';\nimport type {\n CodeBlock,\n MarkdownLink,\n MarkdownSection,\n ParsedMarkdown,\n ParserResult,\n SourceParser,\n} from './types.js';\n\n/**\n * Generic Markdown parser.\n * Extracts structure, links, and code blocks from markdown content.\n *\n * @example\n * ```ts\n * const parser = new MarkdownParser();\n * const result = parser.parse(markdownContent);\n * console.log(result.data.sections);\n * console.log(result.data.links);\n * ```\n */\nexport class MarkdownParser implements SourceParser<ParsedMarkdown> {\n readonly name = 'markdown';\n\n canParse(content: string): boolean {\n // Check for common markdown patterns\n return (\n content.includes('# ') ||\n content.includes('## ') ||\n content.includes('- [') ||\n content.includes('* [') ||\n content.includes('```')\n );\n }\n\n parse(content: string): ParserResult<ParsedMarkdown> {\n const tree = fromMarkdown(content);\n const sections: MarkdownSection[] = [];\n const allLinks: MarkdownLink[] = [];\n const codeBlocks: CodeBlock[] = [];\n let frontmatter: Record<string, unknown> | undefined;\n\n // Extract frontmatter if present\n if (content.startsWith('---')) {\n const endIndex = content.indexOf('---', 3);\n if (endIndex !== -1) {\n const frontmatterContent = content.slice(3, endIndex).trim();\n frontmatter = this.parseFrontmatter(frontmatterContent);\n }\n }\n\n // Track current section\n let currentSection: MarkdownSection | null = null;\n\n // Process the AST\n visit(tree, (node) => {\n // Handle headings\n if (node.type === 'heading') {\n const heading = node as Heading;\n 
const title = mdastToString(heading);\n\n // Finalize previous section\n if (currentSection) {\n sections.push(currentSection);\n }\n\n currentSection = {\n level: heading.depth,\n title,\n content: '',\n links: [],\n };\n }\n\n // Handle links\n if (node.type === 'link') {\n const link = node as Link;\n const text = mdastToString(link);\n const linkData: MarkdownLink = {\n url: link.url,\n text,\n title: link.title ?? undefined,\n context: currentSection?.title,\n };\n\n allLinks.push(linkData);\n if (currentSection) {\n currentSection.links.push(linkData);\n }\n }\n\n // Handle code blocks\n if (node.type === 'code') {\n const code = node as Code;\n codeBlocks.push({\n language: code.lang ?? undefined,\n code: code.value,\n meta: code.meta ?? undefined,\n });\n }\n\n // Accumulate content for current section\n if (currentSection && node.type === 'paragraph') {\n const text = mdastToString(node);\n currentSection.content += (currentSection.content ? '\\n\\n' : '') + text;\n }\n });\n\n // Finalize last section\n if (currentSection) {\n sections.push(currentSection);\n }\n\n // Extract title from first h1 or frontmatter\n const title = (frontmatter?.title as string) ?? sections.find((s) => s.level === 1)?.title;\n\n // Extract description from frontmatter or first paragraph before any heading\n const description = (frontmatter?.description as string) ?? 
this.extractDescription(tree);\n\n return {\n data: {\n title,\n description,\n sections,\n links: allLinks,\n codeBlocks,\n frontmatter,\n },\n };\n }\n\n private parseFrontmatter(content: string): Record<string, unknown> {\n const result: Record<string, unknown> = {};\n const lines = content.split('\\n');\n\n for (const line of lines) {\n const colonIndex = line.indexOf(':');\n if (colonIndex > 0) {\n const key = line.slice(0, colonIndex).trim();\n let value: string | boolean | number = line.slice(colonIndex + 1).trim();\n\n // Parse simple types\n if (value === 'true') value = true;\n else if (value === 'false') value = false;\n else if (/^-?\\d+(\\.\\d+)?$/.test(value)) value = Number(value);\n else if (value.startsWith('\"') && value.endsWith('\"')) value = value.slice(1, -1);\n else if (value.startsWith(\"'\") && value.endsWith(\"'\")) value = value.slice(1, -1);\n\n result[key] = value;\n }\n }\n\n return result;\n }\n\n private extractDescription(tree: Root): string | undefined {\n // Find first paragraph before any heading\n for (const node of tree.children) {\n if (node.type === 'heading') break;\n if (node.type === 'paragraph') {\n return mdastToString(node);\n }\n }\n return undefined;\n }\n}\n\n/**\n * Extract links from a list-based markdown structure (like awesome lists)\n */\nexport function extractListLinks(markdown: string): MarkdownLink[] {\n const tree = fromMarkdown(markdown);\n const links: MarkdownLink[] = [];\n let currentHeading = '';\n\n visit(tree, (node) => {\n if (node.type === 'heading') {\n currentHeading = mdastToString(node as Heading);\n }\n\n if (node.type === 'listItem') {\n const listItem = node as ListItem;\n\n // Find links in this list item\n visit(listItem, 'link', (linkNode: Link) => {\n links.push({\n url: linkNode.url,\n text: mdastToString(linkNode),\n title: linkNode.title ?? 
undefined,\n context: currentHeading || undefined,\n });\n });\n }\n });\n\n return links;\n}\n\n/**\n * Parse markdown into sections by heading level\n */\nexport function parseByHeadings(markdown: string, minLevel = 2): MarkdownSection[] {\n const parser = new MarkdownParser();\n const result = parser.parse(markdown);\n return result.data.sections.filter((s) => s.level >= minLevel);\n}\n"],"mappings":";;;;;;;;;;;;AASA,SAAgB,aAAa,KAAsB;AACjD,QAAO,oDAAoD,KAAK,IAAI;;;;;AAMtE,SAAgB,eAAe,KAAqD;CAClF,MAAM,QAAQ,IAAI,MAAM,gCAAgC;AACxD,KAAI,CAAC,SAAS,CAAC,MAAM,MAAM,CAAC,MAAM,GAAI,QAAO;AAC7C,QAAO;EACL,OAAO,MAAM;EACb,MAAM,MAAM,GAAG,QAAQ,UAAU,GAAG;EACrC;;;;;AAMH,SAAgB,SAAS,KAAa,SAAS,QAAQ,OAAO,aAAqB;CACjF,MAAM,OAAO,eAAe,IAAI;AAChC,KAAI,CAAC,KAAM,QAAO;AAClB,QAAO,qCAAqC,KAAK,MAAM,GAAG,KAAK,KAAK,GAAG,OAAO,GAAG;;;;;;AAOnF,eAAsB,cACpB,OACA,MACA,QACqB;AAGrB,QAAO;EACL,WAAW;EACX,UAAU;EACX;;;;;AAMH,SAAgB,gBAAgB,OAAoD;CAClF,MAAM,yBAAS,IAAI,KAA6B;AAEhD,MAAK,MAAM,QAAQ,OAAO;EACxB,MAAM,WAAW,KAAK,WAAW;EACjC,MAAM,WAAW,OAAO,IAAI,SAAS,IAAI,EAAE;AAC3C,WAAS,KAAK,KAAK;AACnB,SAAO,IAAI,UAAU,SAAS;;AAGhC,QAAO;;;;;;;;;;;;;;;;;ACvCT,IAAa,iBAAb,MAAoE;CAClE,AAAS,OAAO;CAEhB,SAAS,SAA0B;AAEjC,SACE,QAAQ,SAAS,KAAK,IACtB,QAAQ,SAAS,MAAM,IACvB,QAAQ,SAAS,MAAM,IACvB,QAAQ,SAAS,MAAM,IACvB,QAAQ,SAAS,MAAM;;CAI3B,MAAM,SAA+C;EACnD,MAAM,kDAAoB,QAAQ;EAClC,MAAMA,WAA8B,EAAE;EACtC,MAAMC,WAA2B,EAAE;EACnC,MAAMC,aAA0B,EAAE;EAClC,IAAIC;AAGJ,MAAI,QAAQ,WAAW,MAAM,EAAE;GAC7B,MAAM,WAAW,QAAQ,QAAQ,OAAO,EAAE;AAC1C,OAAI,aAAa,IAAI;IACnB,MAAM,qBAAqB,QAAQ,MAAM,GAAG,SAAS,CAAC,MAAM;AAC5D,kBAAc,KAAK,iBAAiB,mBAAmB;;;EAK3D,IAAIC,iBAAyC;AAG7C,8BAAM,OAAO,SAAS;AAEpB,OAAI,KAAK,SAAS,WAAW;IAC3B,MAAM,UAAU;IAChB,MAAM,2CAAsB,QAAQ;AAGpC,QAAI,eACF,UAAS,KAAK,eAAe;AAG/B,qBAAiB;KACf,OAAO,QAAQ;KACf;KACA,SAAS;KACT,OAAO,EAAE;KACV;;AAIH,OAAI,KAAK,SAAS,QAAQ;IACxB,MAAM,OAAO;IACb,MAAM,0CAAqB,KAAK;IAChC,MAAMC,WAAyB;KAC7B,KAAK,KAAK;KACV;KACA,OAAO,KAAK,SAAS;KACrB,SAAS,gBAAgB;KAC1B;AAED,aAAS,KAAK,SAAS;AACvB,QAAI,eACF,gBAAe,MAAM,KAAK,SAAS;;AAKvC,OAAI,KAAK,SAAS,QAAQ;IACxB,
MAAM,OAAO;AACb,eAAW,KAAK;KACd,UAAU,KAAK,QAAQ;KACvB,MAAM,KAAK;KACX,MAAM,KAAK,QAAQ;KACpB,CAAC;;AAIJ,OAAI,kBAAkB,KAAK,SAAS,aAAa;IAC/C,MAAM,0CAAqB,KAAK;AAChC,mBAAe,YAAY,eAAe,UAAU,SAAS,MAAM;;IAErE;AAGF,MAAI,eACF,UAAS,KAAK,eAAe;AAS/B,SAAO,EACL,MAAM;GACJ,OAPW,aAAa,SAAoB,SAAS,MAAM,MAAM,EAAE,UAAU,EAAE,EAAE;GAQjF,aALiB,aAAa,eAA0B,KAAK,mBAAmB,KAAK;GAMrF;GACA,OAAO;GACP;GACA;GACD,EACF;;CAGH,AAAQ,iBAAiB,SAA0C;EACjE,MAAMC,SAAkC,EAAE;EAC1C,MAAM,QAAQ,QAAQ,MAAM,KAAK;AAEjC,OAAK,MAAM,QAAQ,OAAO;GACxB,MAAM,aAAa,KAAK,QAAQ,IAAI;AACpC,OAAI,aAAa,GAAG;IAClB,MAAM,MAAM,KAAK,MAAM,GAAG,WAAW,CAAC,MAAM;IAC5C,IAAIC,QAAmC,KAAK,MAAM,aAAa,EAAE,CAAC,MAAM;AAGxE,QAAI,UAAU,OAAQ,SAAQ;aACrB,UAAU,QAAS,SAAQ;aAC3B,kBAAkB,KAAK,MAAM,CAAE,SAAQ,OAAO,MAAM;aACpD,MAAM,WAAW,KAAI,IAAI,MAAM,SAAS,KAAI,CAAE,SAAQ,MAAM,MAAM,GAAG,GAAG;aACxE,MAAM,WAAW,IAAI,IAAI,MAAM,SAAS,IAAI,CAAE,SAAQ,MAAM,MAAM,GAAG,GAAG;AAEjF,WAAO,OAAO;;;AAIlB,SAAO;;CAGT,AAAQ,mBAAmB,MAAgC;AAEzD,OAAK,MAAM,QAAQ,KAAK,UAAU;AAChC,OAAI,KAAK,SAAS,UAAW;AAC7B,OAAI,KAAK,SAAS,YAChB,2CAAqB,KAAK;;;;;;;AAUlC,SAAgB,iBAAiB,UAAkC;CACjE,MAAM,kDAAoB,SAAS;CACnC,MAAMC,QAAwB,EAAE;CAChC,IAAI,iBAAiB;AAErB,6BAAM,OAAO,SAAS;AACpB,MAAI,KAAK,SAAS,UAChB,qDAA+B,KAAgB;AAGjD,MAAI,KAAK,SAAS,WAIhB,6BAHiB,MAGD,SAAS,aAAmB;AAC1C,SAAM,KAAK;IACT,KAAK,SAAS;IACd,yCAAoB,SAAS;IAC7B,OAAO,SAAS,SAAS;IACzB,SAAS,kBAAkB;IAC5B,CAAC;IACF;GAEJ;AAEF,QAAO;;;;;AAMT,SAAgB,gBAAgB,UAAkB,WAAW,GAAsB;AAGjF,QAFe,IAAI,gBAAgB,CACb,MAAM,SAAS,CACvB,KAAK,SAAS,QAAQ,MAAM,EAAE,SAAS,SAAS"}
|