scrapex 1.0.0-alpha.1 → 1.0.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/README.md +164 -5
  2. package/dist/enhancer-ByjRD-t5.mjs +769 -0
  3. package/dist/enhancer-ByjRD-t5.mjs.map +1 -0
  4. package/dist/enhancer-j0xqKDJm.cjs +847 -0
  5. package/dist/enhancer-j0xqKDJm.cjs.map +1 -0
  6. package/dist/index-CDgcRnig.d.cts +268 -0
  7. package/dist/index-CDgcRnig.d.cts.map +1 -0
  8. package/dist/index-piS5wtki.d.mts +268 -0
  9. package/dist/index-piS5wtki.d.mts.map +1 -0
  10. package/dist/index.cjs +1192 -37
  11. package/dist/index.cjs.map +1 -1
  12. package/dist/index.d.cts +318 -2
  13. package/dist/index.d.cts.map +1 -1
  14. package/dist/index.d.mts +318 -2
  15. package/dist/index.d.mts.map +1 -1
  16. package/dist/index.mjs +1164 -6
  17. package/dist/index.mjs.map +1 -1
  18. package/dist/llm/index.cjs +250 -232
  19. package/dist/llm/index.cjs.map +1 -1
  20. package/dist/llm/index.d.cts +132 -85
  21. package/dist/llm/index.d.cts.map +1 -1
  22. package/dist/llm/index.d.mts +132 -85
  23. package/dist/llm/index.d.mts.map +1 -1
  24. package/dist/llm/index.mjs +243 -236
  25. package/dist/llm/index.mjs.map +1 -1
  26. package/dist/parsers/index.cjs +10 -199
  27. package/dist/parsers/index.d.cts +2 -133
  28. package/dist/parsers/index.d.mts +2 -133
  29. package/dist/parsers/index.mjs +2 -191
  30. package/dist/parsers-Bneuws8x.cjs +569 -0
  31. package/dist/parsers-Bneuws8x.cjs.map +1 -0
  32. package/dist/parsers-CwkYnyWY.mjs +482 -0
  33. package/dist/parsers-CwkYnyWY.mjs.map +1 -0
  34. package/dist/types-CadAXrme.d.mts +674 -0
  35. package/dist/types-CadAXrme.d.mts.map +1 -0
  36. package/dist/types-DPEtPihB.d.cts +674 -0
  37. package/dist/types-DPEtPihB.d.cts.map +1 -0
  38. package/package.json +15 -16
  39. package/dist/enhancer-Q6CSc1gA.mjs +0 -220
  40. package/dist/enhancer-Q6CSc1gA.mjs.map +0 -1
  41. package/dist/enhancer-oM4BhYYS.cjs +0 -268
  42. package/dist/enhancer-oM4BhYYS.cjs.map +0 -1
  43. package/dist/parsers/index.cjs.map +0 -1
  44. package/dist/parsers/index.d.cts.map +0 -1
  45. package/dist/parsers/index.d.mts.map +0 -1
  46. package/dist/parsers/index.mjs.map +0 -1
  47. package/dist/types-CNQZVW36.d.mts +0 -150
  48. package/dist/types-CNQZVW36.d.mts.map +0 -1
  49. package/dist/types-D0HYR95H.d.cts +0 -150
  50. package/dist/types-D0HYR95H.d.cts.map +0 -1
@@ -1 +0,0 @@
1
- {"version":3,"file":"index.d.mts","names":[],"sources":["../../src/parsers/types.ts","../../src/parsers/github.ts","../../src/parsers/markdown.ts"],"sourcesContent":[],"mappings":";;AAOA;;;;;AAiBA;AAQiB,UAzBA,YAyBY,CAAA,KAAA,EAAA,QAAA,OAAA,CAAA,CAAA;EAUZ,SAAA,IAAA,EAAA,MAAe;EAUf;;;EAKH,QAAA,CAAA,OAAA,EAAA,MAAA,EAAA,GAAA,CAAA,EAAA,MAAA,CAAA,EAAA,OAAA;EACE;;AAMhB;EASiB,KAAA,CAAA,OAAU,EAAA,MAAA,EAAA,GAAA,CAAA,EAAA,MAAA,CAAA,EAvDa,YAuDb,CAvD0B,KAuD1B,EAvDiC,KAuDjC,CAAA;;;;AChE3B;AAOgB,UDQC,YCRa,CAAA,KAAA,EAAA,QAAA,OAAA,CAAA,CAAA;EAYd,IAAA,EDHR,KCGgB;EAUF,IAAA,CAAA,EDZb,KCYa;AAgBtB;;;;AAA2D,UDtB1C,YAAA,CCsB0C;;;;EC7B9C,OAAA,CAAA,EAAA,MAAA;;;;;AAAsC,UFiBlC,eAAA,CEjBkC;EAqJnC,KAAA,EAAA,MAAA;EA+BA,KAAA,EAAA,MAAA;;SF/JP;;;;;UAMQ,cAAA;;;YAGL;SACH;cACK;gBACE;;;;;UAMC,SAAA;;;;;;;;UASA,UAAA;;;;;;;;AAlEjB;;;;;AAiBA;AAQiB,iBCvBD,YAAA,CDuBa,GAAA,EAAA,MAAA,CAAA,EAAA,OAAA;AAU7B;AAUA;;AAIS,iBCxCO,cAAA,CDwCP,GAAA,EAAA,MAAA,CAAA,EAAA;EACK,KAAA,EAAA,MAAA;EACE,IAAA,EAAA,MAAA;CAAM,GAAA,IAAA;AAMtB;AASA;;iBC7CgB,QAAA;;AAnBhB;AAOA;AAYA;AAUsB,iBAAA,aAAA,CAIX,KAAA,EAAR,MAAA,EAAO,IAAA,EAAA,MAAA,EAAA,MAAA,CAAA,EAAA,MAAA,CAAA,EAAP,OAAO,CAAC,UAAD,CAAA;AAYV;;;AAAwD,iBAAxC,eAAA,CAAwC,KAAA,EAAjB,YAAiB,EAAA,CAAA,EAAA,GAAA,CAAA,MAAA,EAAY,YAAZ,EAAA,CAAA;;;AD/CxD;;;;;AAiBA;AAQA;AAUA;AAUA;;;;AAMgB,cEjCH,cAAA,YAA0B,YFiCvB,CEjCoC,cFiCpC,CAAA,CAAA;EAAM,SAAA,IAAA,GAAA,UAAA;EAML,QAAA,CAAA,OAAS,EAAA,MAAA,CAAA,EAAA,OAAA;EAST,KAAA,CAAA,OAAU,EAAA,MAAA,CAAA,EElCD,YFkCC,CElCY,cFkCZ,CAAA;;;;AChE3B;AAOA;AAYA;AAUsB,iBCwIN,gBAAA,CDpIL,QAAR,EAAA,MAAO,CAAA,ECoI0C,YDpI1C,EAAA;AAYV;;;AAAwD,iBCuJxC,eAAA,CDvJwC,QAAA,EAAA,MAAA,EAAA,QAAA,CAAA,EAAA,MAAA,CAAA,ECuJS,eDvJT,EAAA"}
@@ -1 +0,0 @@
1
- {"version":3,"file":"index.mjs","names":["sections: MarkdownSection[]","allLinks: MarkdownLink[]","codeBlocks: CodeBlock[]","frontmatter: Record<string, unknown> | undefined","currentSection: MarkdownSection | null","mdastToString","linkData: MarkdownLink","result: Record<string, unknown>","value: string | boolean | number","links: MarkdownLink[]"],"sources":["../../src/parsers/github.ts","../../src/parsers/markdown.ts"],"sourcesContent":["import type { GitHubMeta, MarkdownLink } from './types.js';\n\n/**\n * GitHub-specific utilities for parsing repositories.\n */\n\n/**\n * Check if a URL is a GitHub repository\n */\nexport function isGitHubRepo(url: string): boolean {\n return /^https?:\\/\\/(www\\.)?github\\.com\\/[^/]+\\/[^/]+\\/?$/.test(url);\n}\n\n/**\n * Extract GitHub repo info from URL\n */\nexport function parseGitHubUrl(url: string): { owner: string; repo: string } | null {\n const match = url.match(/github\\.com\\/([^/]+)\\/([^/]+)/);\n if (!match || !match[1] || !match[2]) return null;\n return {\n owner: match[1],\n repo: match[2].replace(/\\.git$/, ''),\n };\n}\n\n/**\n * Convert a GitHub repo URL to raw content URL\n */\nexport function toRawUrl(url: string, branch = 'main', file = 'README.md'): string {\n const info = parseGitHubUrl(url);\n if (!info) return url;\n return `https://raw.githubusercontent.com/${info.owner}/${info.repo}/${branch}/${file}`;\n}\n\n/**\n * Fetch GitHub API metadata for a repository\n * Note: This is a placeholder - actual implementation would need GitHub API access\n */\nexport async function fetchRepoMeta(\n owner: string,\n repo: string,\n _token?: string\n): Promise<GitHubMeta> {\n // This would make actual API calls in a full implementation\n // For now, return basic info\n return {\n repoOwner: owner,\n repoName: repo,\n };\n}\n\n/**\n * Group links by their category/section\n */\nexport function groupByCategory(links: MarkdownLink[]): Map<string, MarkdownLink[]> {\n const groups = new Map<string, MarkdownLink[]>();\n\n for (const link of links) {\n const category = link.context || 'Uncategorized';\n const existing = groups.get(category) || [];\n existing.push(link);\n groups.set(category, existing);\n }\n\n return groups;\n}\n","import type { Code, Heading, Link, ListItem, Root } from 'mdast';\nimport { fromMarkdown } from 'mdast-util-from-markdown';\nimport { toString as mdastToString } from 'mdast-util-to-string';\nimport { visit } from 'unist-util-visit';\nimport type {\n CodeBlock,\n MarkdownLink,\n MarkdownSection,\n ParsedMarkdown,\n ParserResult,\n SourceParser,\n} from './types.js';\n\n/**\n * Generic Markdown parser.\n * Extracts structure, links, and code blocks from markdown content.\n *\n * @example\n * ```ts\n * const parser = new MarkdownParser();\n * const result = parser.parse(markdownContent);\n * console.log(result.data.sections);\n * console.log(result.data.links);\n * ```\n */\nexport class MarkdownParser implements SourceParser<ParsedMarkdown> {\n readonly name = 'markdown';\n\n canParse(content: string): boolean {\n // Check for common markdown patterns\n return (\n content.includes('# ') ||\n content.includes('## ') ||\n content.includes('- [') ||\n content.includes('* [') ||\n content.includes('```')\n );\n }\n\n parse(content: string): ParserResult<ParsedMarkdown> {\n const tree = fromMarkdown(content);\n const sections: MarkdownSection[] = [];\n const allLinks: MarkdownLink[] = [];\n const codeBlocks: CodeBlock[] = [];\n let frontmatter: Record<string, unknown> | undefined;\n\n // Extract frontmatter if present\n if (content.startsWith('---')) {\n const endIndex = content.indexOf('---', 3);\n if (endIndex !== -1) {\n const frontmatterContent = content.slice(3, endIndex).trim();\n frontmatter = this.parseFrontmatter(frontmatterContent);\n }\n }\n\n // Track current section\n let currentSection: MarkdownSection | null = null;\n\n // Process the AST\n visit(tree, (node) => {\n // Handle headings\n if (node.type === 'heading') {\n const heading = node as Heading;\n const title = mdastToString(heading);\n\n // Finalize previous section\n if (currentSection) {\n sections.push(currentSection);\n }\n\n currentSection = {\n level: heading.depth,\n title,\n content: '',\n links: [],\n };\n }\n\n // Handle links\n if (node.type === 'link') {\n const link = node as Link;\n const text = mdastToString(link);\n const linkData: MarkdownLink = {\n url: link.url,\n text,\n title: link.title ?? undefined,\n context: currentSection?.title,\n };\n\n allLinks.push(linkData);\n if (currentSection) {\n currentSection.links.push(linkData);\n }\n }\n\n // Handle code blocks\n if (node.type === 'code') {\n const code = node as Code;\n codeBlocks.push({\n language: code.lang ?? undefined,\n code: code.value,\n meta: code.meta ?? undefined,\n });\n }\n\n // Accumulate content for current section\n if (currentSection && node.type === 'paragraph') {\n const text = mdastToString(node);\n currentSection.content += (currentSection.content ? '\\n\\n' : '') + text;\n }\n });\n\n // Finalize last section\n if (currentSection) {\n sections.push(currentSection);\n }\n\n // Extract title from first h1 or frontmatter\n const title = (frontmatter?.title as string) ?? sections.find((s) => s.level === 1)?.title;\n\n // Extract description from frontmatter or first paragraph before any heading\n const description = (frontmatter?.description as string) ?? this.extractDescription(tree);\n\n return {\n data: {\n title,\n description,\n sections,\n links: allLinks,\n codeBlocks,\n frontmatter,\n },\n };\n }\n\n private parseFrontmatter(content: string): Record<string, unknown> {\n const result: Record<string, unknown> = {};\n const lines = content.split('\\n');\n\n for (const line of lines) {\n const colonIndex = line.indexOf(':');\n if (colonIndex > 0) {\n const key = line.slice(0, colonIndex).trim();\n let value: string | boolean | number = line.slice(colonIndex + 1).trim();\n\n // Parse simple types\n if (value === 'true') value = true;\n else if (value === 'false') value = false;\n else if (/^-?\\d+(\\.\\d+)?$/.test(value)) value = Number(value);\n else if (value.startsWith('\"') && value.endsWith('\"')) value = value.slice(1, -1);\n else if (value.startsWith(\"'\") && value.endsWith(\"'\")) value = value.slice(1, -1);\n\n result[key] = value;\n }\n }\n\n return result;\n }\n\n private extractDescription(tree: Root): string | undefined {\n // Find first paragraph before any heading\n for (const node of tree.children) {\n if (node.type === 'heading') break;\n if (node.type === 'paragraph') {\n return mdastToString(node);\n }\n }\n return undefined;\n }\n}\n\n/**\n * Extract links from a list-based markdown structure (like awesome lists)\n */\nexport function extractListLinks(markdown: string): MarkdownLink[] {\n const tree = fromMarkdown(markdown);\n const links: MarkdownLink[] = [];\n let currentHeading = '';\n\n visit(tree, (node) => {\n if (node.type === 'heading') {\n currentHeading = mdastToString(node as Heading);\n }\n\n if (node.type === 'listItem') {\n const listItem = node as ListItem;\n\n // Find links in this list item\n visit(listItem, 'link', (linkNode: Link) => {\n links.push({\n url: linkNode.url,\n text: mdastToString(linkNode),\n title: linkNode.title ?? undefined,\n context: currentHeading || undefined,\n });\n });\n }\n });\n\n return links;\n}\n\n/**\n * Parse markdown into sections by heading level\n */\nexport function parseByHeadings(markdown: string, minLevel = 2): MarkdownSection[] {\n const parser = new MarkdownParser();\n const result = parser.parse(markdown);\n return result.data.sections.filter((s) => s.level >= minLevel);\n}\n"],"mappings":";;;;;;;;;;;AASA,SAAgB,aAAa,KAAsB;AACjD,QAAO,oDAAoD,KAAK,IAAI;;;;;AAMtE,SAAgB,eAAe,KAAqD;CAClF,MAAM,QAAQ,IAAI,MAAM,gCAAgC;AACxD,KAAI,CAAC,SAAS,CAAC,MAAM,MAAM,CAAC,MAAM,GAAI,QAAO;AAC7C,QAAO;EACL,OAAO,MAAM;EACb,MAAM,MAAM,GAAG,QAAQ,UAAU,GAAG;EACrC;;;;;AAMH,SAAgB,SAAS,KAAa,SAAS,QAAQ,OAAO,aAAqB;CACjF,MAAM,OAAO,eAAe,IAAI;AAChC,KAAI,CAAC,KAAM,QAAO;AAClB,QAAO,qCAAqC,KAAK,MAAM,GAAG,KAAK,KAAK,GAAG,OAAO,GAAG;;;;;;AAOnF,eAAsB,cACpB,OACA,MACA,QACqB;AAGrB,QAAO;EACL,WAAW;EACX,UAAU;EACX;;;;;AAMH,SAAgB,gBAAgB,OAAoD;CAClF,MAAM,yBAAS,IAAI,KAA6B;AAEhD,MAAK,MAAM,QAAQ,OAAO;EACxB,MAAM,WAAW,KAAK,WAAW;EACjC,MAAM,WAAW,OAAO,IAAI,SAAS,IAAI,EAAE;AAC3C,WAAS,KAAK,KAAK;AACnB,SAAO,IAAI,UAAU,SAAS;;AAGhC,QAAO;;;;;;;;;;;;;;;;;ACvCT,IAAa,iBAAb,MAAoE;CAClE,AAAS,OAAO;CAEhB,SAAS,SAA0B;AAEjC,SACE,QAAQ,SAAS,KAAK,IACtB,QAAQ,SAAS,MAAM,IACvB,QAAQ,SAAS,MAAM,IACvB,QAAQ,SAAS,MAAM,IACvB,QAAQ,SAAS,MAAM;;CAI3B,MAAM,SAA+C;EACnD,MAAM,OAAO,aAAa,QAAQ;EAClC,MAAMA,WAA8B,EAAE;EACtC,MAAMC,WAA2B,EAAE;EACnC,MAAMC,aAA0B,EAAE;EAClC,IAAIC;AAGJ,MAAI,QAAQ,WAAW,MAAM,EAAE;GAC7B,MAAM,WAAW,QAAQ,QAAQ,OAAO,EAAE;AAC1C,OAAI,aAAa,IAAI;IACnB,MAAM,qBAAqB,QAAQ,MAAM,GAAG,SAAS,CAAC,MAAM;AAC5D,kBAAc,KAAK,iBAAiB,mBAAmB;;;EAK3D,IAAIC,iBAAyC;AAG7C,QAAM,OAAO,SAAS;AAEpB,OAAI,KAAK,SAAS,WAAW;IAC3B,MAAM,UAAU;IAChB,MAAM,QAAQC,SAAc,QAAQ;AAGpC,QAAI,eACF,UAAS,KAAK,eAAe;AAG/B,qBAAiB;KACf,OAAO,QAAQ;KACf;KACA,SAAS;KACT,OAAO,EAAE;KACV;;AAIH,OAAI,KAAK,SAAS,QAAQ;IACxB,MAAM,OAAO;IACb,MAAM,OAAOA,SAAc,KAAK;IAChC,MAAMC,WAAyB;KAC7B,KAAK,KAAK;KACV;KACA,OAAO,KAAK,SAAS;KACrB,SAAS,gBAAgB;KAC1B;AAED,aAAS,KAAK,SAAS;AACvB,QAAI,eACF,gBAAe,MAAM,KAAK,SAAS;;AAKvC,OAAI,KAAK,SAAS,QAAQ;IACxB,MAAM,OAAO;AACb,eAAW,KAAK;KACd,UAAU,KAAK,QAAQ;KACvB,MAAM,KAAK;KACX,MAAM,KAAK,QAAQ;KACpB,CAAC;;AAIJ,OAAI,kBAAkB,KAAK,SAAS,aAAa;IAC/C,MAAM,OAAOD,SAAc,KAAK;AAChC,mBAAe,YAAY,eAAe,UAAU,SAAS,MAAM;;IAErE;AAGF,MAAI,eACF,UAAS,KAAK,eAAe;AAS/B,SAAO,EACL,MAAM;GACJ,OAPW,aAAa,SAAoB,SAAS,MAAM,MAAM,EAAE,UAAU,EAAE,EAAE;GAQjF,aALiB,aAAa,eAA0B,KAAK,mBAAmB,KAAK;GAMrF;GACA,OAAO;GACP;GACA;GACD,EACF;;CAGH,AAAQ,iBAAiB,SAA0C;EACjE,MAAME,SAAkC,EAAE;EAC1C,MAAM,QAAQ,QAAQ,MAAM,KAAK;AAEjC,OAAK,MAAM,QAAQ,OAAO;GACxB,MAAM,aAAa,KAAK,QAAQ,IAAI;AACpC,OAAI,aAAa,GAAG;IAClB,MAAM,MAAM,KAAK,MAAM,GAAG,WAAW,CAAC,MAAM;IAC5C,IAAIC,QAAmC,KAAK,MAAM,aAAa,EAAE,CAAC,MAAM;AAGxE,QAAI,UAAU,OAAQ,SAAQ;aACrB,UAAU,QAAS,SAAQ;aAC3B,kBAAkB,KAAK,MAAM,CAAE,SAAQ,OAAO,MAAM;aACpD,MAAM,WAAW,KAAI,IAAI,MAAM,SAAS,KAAI,CAAE,SAAQ,MAAM,MAAM,GAAG,GAAG;aACxE,MAAM,WAAW,IAAI,IAAI,MAAM,SAAS,IAAI,CAAE,SAAQ,MAAM,MAAM,GAAG,GAAG;AAEjF,WAAO,OAAO;;;AAIlB,SAAO;;CAGT,AAAQ,mBAAmB,MAAgC;AAEzD,OAAK,MAAM,QAAQ,KAAK,UAAU;AAChC,OAAI,KAAK,SAAS,UAAW;AAC7B,OAAI,KAAK,SAAS,YAChB,QAAOH,SAAc,KAAK;;;;;;;AAUlC,SAAgB,iBAAiB,UAAkC;CACjE,MAAM,OAAO,aAAa,SAAS;CACnC,MAAMI,QAAwB,EAAE;CAChC,IAAI,iBAAiB;AAErB,OAAM,OAAO,SAAS;AACpB,MAAI,KAAK,SAAS,UAChB,kBAAiBJ,SAAc,KAAgB;AAGjD,MAAI,KAAK,SAAS,WAIhB,OAHiB,MAGD,SAAS,aAAmB;AAC1C,SAAM,KAAK;IACT,KAAK,SAAS;IACd,MAAMA,SAAc,SAAS;IAC7B,OAAO,SAAS,SAAS;IACzB,SAAS,kBAAkB;IAC5B,CAAC;IACF;GAEJ;AAEF,QAAO;;;;;AAMT,SAAgB,gBAAgB,UAAkB,WAAW,GAAsB;AAGjF,QAFe,IAAI,gBAAgB,CACb,MAAM,SAAS,CACvB,KAAK,SAAS,QAAQ,MAAM,EAAE,SAAS,SAAS"}
@@ -1,150 +0,0 @@
1
- import { CheerioAPI } from "cheerio";
2
-
3
- //#region src/core/types.d.ts
4
-
5
- /**
6
- * Content type classification for scraped URLs
7
- */
8
- type ContentType = 'article' | 'repo' | 'docs' | 'package' | 'video' | 'tool' | 'product' | 'unknown';
9
- /**
10
- * Extracted link from content
11
- */
12
- interface ExtractedLink {
13
- url: string;
14
- text: string;
15
- isExternal: boolean;
16
- }
17
- /**
18
- * Extracted entities from LLM enhancement
19
- */
20
- interface ExtractedEntities {
21
- people: string[];
22
- organizations: string[];
23
- technologies: string[];
24
- locations: string[];
25
- concepts: string[];
26
- }
27
- /**
28
- * Main result of metadata scraping - optimized for LLM consumption
29
- */
30
- interface ScrapedData {
31
- url: string;
32
- canonicalUrl: string;
33
- domain: string;
34
- title: string;
35
- description: string;
36
- image?: string;
37
- favicon?: string;
38
- content: string;
39
- textContent: string;
40
- excerpt: string;
41
- wordCount: number;
42
- author?: string;
43
- publishedAt?: string;
44
- modifiedAt?: string;
45
- siteName?: string;
46
- language?: string;
47
- contentType: ContentType;
48
- keywords: string[];
49
- jsonLd?: Record<string, unknown>[];
50
- links?: ExtractedLink[];
51
- summary?: string;
52
- suggestedTags?: string[];
53
- entities?: ExtractedEntities;
54
- extracted?: Record<string, unknown>;
55
- custom?: Record<string, unknown>;
56
- scrapedAt: string;
57
- scrapeTimeMs: number;
58
- error?: string;
59
- }
60
- /**
61
- * LLM enhancement types
62
- */
63
- type EnhancementType = 'summarize' | 'tags' | 'entities' | 'classify';
64
- /**
65
- * Schema for structured LLM extraction
66
- */
67
- type ExtractionSchemaType = 'string' | 'number' | 'boolean' | 'string[]' | 'number[]' | `${string}?`;
68
- type ExtractionSchema = Record<string, ExtractionSchemaType>;
69
- /**
70
- * Forward declaration for LLM provider (defined in llm/types.ts)
71
- */
72
- interface LLMProvider {
73
- readonly name: string;
74
- complete(prompt: string, options?: CompletionOptions): Promise<string>;
75
- completeJSON<T>(prompt: string, schema: unknown): Promise<T>;
76
- }
77
- interface CompletionOptions {
78
- maxTokens?: number;
79
- temperature?: number;
80
- systemPrompt?: string;
81
- }
82
- /**
83
- * Forward declaration for Fetcher (defined in fetchers/types.ts)
84
- */
85
- interface Fetcher {
86
- readonly name: string;
87
- fetch(url: string, options: FetchOptions): Promise<FetchResult>;
88
- }
89
- interface FetchOptions {
90
- timeout?: number;
91
- userAgent?: string;
92
- headers?: Record<string, string>;
93
- }
94
- interface FetchResult {
95
- html: string;
96
- finalUrl: string;
97
- statusCode: number;
98
- contentType: string;
99
- headers?: Record<string, string>;
100
- }
101
- /**
102
- * Forward declaration for Extractor (defined in extractors/types.ts)
103
- */
104
- interface Extractor {
105
- readonly name: string;
106
- readonly priority?: number;
107
- extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
108
- }
109
- /**
110
- * Shared context passed to all extractors
111
- */
112
- interface ExtractionContext {
113
- url: string;
114
- finalUrl: string;
115
- html: string;
116
- $: CheerioAPI;
117
- getDocument(): Document;
118
- results: Partial<ScrapedData>;
119
- options: ScrapeOptions;
120
- }
121
- /**
122
- * Options for scraping
123
- */
124
- interface ScrapeOptions {
125
- /** Timeout in milliseconds (default: 10000) */
126
- timeout?: number;
127
- /** User agent string */
128
- userAgent?: string;
129
- /** Whether to extract full content (default: true) */
130
- extractContent?: boolean;
131
- /** Maximum content length in characters (default: 50000) */
132
- maxContentLength?: number;
133
- /** Custom fetcher (for Puppeteer/Playwright) */
134
- fetcher?: Fetcher;
135
- /** Custom extractors to run */
136
- extractors?: Extractor[];
137
- /** If true, only run custom extractors (replace defaults) */
138
- replaceDefaultExtractors?: boolean;
139
- /** Check robots.txt before scraping (default: false) */
140
- respectRobots?: boolean;
141
- /** LLM provider for enhancements */
142
- llm?: LLMProvider;
143
- /** LLM enhancement types to run */
144
- enhance?: EnhancementType[];
145
- /** Schema for structured LLM extraction */
146
- extract?: ExtractionSchema;
147
- }
148
- //#endregion
149
- export { ExtractedLink as a, ExtractionSchemaType as c, FetchResult as d, Fetcher as f, ScrapedData as h, ExtractedEntities as i, Extractor as l, ScrapeOptions as m, ContentType as n, ExtractionContext as o, LLMProvider as p, EnhancementType as r, ExtractionSchema as s, CompletionOptions as t, FetchOptions as u };
150
- //# sourceMappingURL=types-CNQZVW36.d.mts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"types-CNQZVW36.d.mts","names":[],"sources":["../src/core/types.ts"],"sourcesContent":[],"mappings":";;;;;;AAKA;AAaiB,KAbL,WAAA,GAakB,SAAA,GAAA,MAAA,GAAA,MAAA,GAAA,SAAA,GAAA,OAAA,GAAA,MAAA,GAAA,SAAA,GAAA,SAAA;AAS9B;AAWA;;AA8BW,UAlDM,aAAA,CAkDN;EAGD,GAAA,EAAA,MAAA;EAKG,IAAA,EAAA,MAAA;EACC,UAAA,EAAA,OAAA;;;AAcd;AAKA;AAQY,UA7EK,iBAAA,CA6E6B;EAK7B,MAAA,EAAA,MAAW,EAAA;EAES,aAAA,EAAA,MAAA,EAAA;EAAoB,YAAA,EAAA,MAAA,EAAA;EACG,SAAA,EAAA,MAAA,EAAA;EAAR,QAAA,EAAA,MAAA,EAAA;;AAGpD;AASA;;AAEqD,UAxFpC,WAAA,CAwFoC;EAAR,GAAA,EAAA,MAAA;EAAO,YAAA,EAAA,MAAA;EAGnC,MAAA,EAAA,MAAA;EAMA,KAAA,EAAA,MAAA;EAWA,WAAA,EAAS,MAAA;EAGP,KAAA,CAAA,EAAA,MAAA;EAAoC,OAAA,CAAA,EAAA,MAAA;EAAR,OAAA,EAAA,MAAA;EAAR,WAAA,EAAA,MAAA;EAAO,OAAA,EAAA,MAAA;EAM7B,SAAA,EAAA,MAAA;EAOZ,MAAA,CAAA,EAAA,MAAA;EAGY,WAAA,CAAA,EAAA,MAAA;EAGE,UAAA,CAAA,EAAA,MAAA;EAAR,QAAA,CAAA,EAAA,MAAA;EAGA,QAAA,CAAA,EAAA,MAAA;EAAa,WAAA,EA3GT,WA2GS;EAMP,QAAA,EAAA,MAAa,EAAA;EAclB,MAAA,CAAA,EA3HD,MA2HC,CAAA,MAAA,EAAA,OAAA,CAAA,EAAA;EAGG,KAAA,CAAA,EA3HL,aA2HK,EAAA;EASP,OAAA,CAAA,EAAA,MAAA;EAGI,aAAA,CAAA,EAAA,MAAA,EAAA;EAGA,QAAA,CAAA,EArIC,iBAqID;EAAgB,SAAA,CAAA,EApId,MAoIc,CAAA,MAAA,EAAA,OAAA,CAAA;WAjIjB;;;;;;;;KAWC,eAAA;;;;KAKA,oBAAA;KAQA,gBAAA,GAAmB,eAAe;;;;UAK7B,WAAA;;qCAEoB,oBAAoB;oDACL,QAAQ;;UAG3C,iBAAA;;;;;;;;UASA,OAAA;;8BAEa,eAAe,QAAQ;;UAGpC,YAAA;;;YAGL;;UAGK,WAAA;;;;;YAKL;;;;;UAMK,SAAA;;;mBAGE,oBAAoB,QAAQ,QAAQ;;;;;UAMtC,iBAAA;;;;KAOZ;iBAGY;WAGN,QAAQ;WAGR;;;;;UAMM,aAAA;;;;;;;;;;YAcL;;eAGG;;;;;;QASP;;YAGI;;YAGA"}
@@ -1,150 +0,0 @@
1
- import { CheerioAPI } from "cheerio";
2
-
3
- //#region src/core/types.d.ts
4
-
5
- /**
6
- * Content type classification for scraped URLs
7
- */
8
- type ContentType = 'article' | 'repo' | 'docs' | 'package' | 'video' | 'tool' | 'product' | 'unknown';
9
- /**
10
- * Extracted link from content
11
- */
12
- interface ExtractedLink {
13
- url: string;
14
- text: string;
15
- isExternal: boolean;
16
- }
17
- /**
18
- * Extracted entities from LLM enhancement
19
- */
20
- interface ExtractedEntities {
21
- people: string[];
22
- organizations: string[];
23
- technologies: string[];
24
- locations: string[];
25
- concepts: string[];
26
- }
27
- /**
28
- * Main result of metadata scraping - optimized for LLM consumption
29
- */
30
- interface ScrapedData {
31
- url: string;
32
- canonicalUrl: string;
33
- domain: string;
34
- title: string;
35
- description: string;
36
- image?: string;
37
- favicon?: string;
38
- content: string;
39
- textContent: string;
40
- excerpt: string;
41
- wordCount: number;
42
- author?: string;
43
- publishedAt?: string;
44
- modifiedAt?: string;
45
- siteName?: string;
46
- language?: string;
47
- contentType: ContentType;
48
- keywords: string[];
49
- jsonLd?: Record<string, unknown>[];
50
- links?: ExtractedLink[];
51
- summary?: string;
52
- suggestedTags?: string[];
53
- entities?: ExtractedEntities;
54
- extracted?: Record<string, unknown>;
55
- custom?: Record<string, unknown>;
56
- scrapedAt: string;
57
- scrapeTimeMs: number;
58
- error?: string;
59
- }
60
- /**
61
- * LLM enhancement types
62
- */
63
- type EnhancementType = 'summarize' | 'tags' | 'entities' | 'classify';
64
- /**
65
- * Schema for structured LLM extraction
66
- */
67
- type ExtractionSchemaType = 'string' | 'number' | 'boolean' | 'string[]' | 'number[]' | `${string}?`;
68
- type ExtractionSchema = Record<string, ExtractionSchemaType>;
69
- /**
70
- * Forward declaration for LLM provider (defined in llm/types.ts)
71
- */
72
- interface LLMProvider {
73
- readonly name: string;
74
- complete(prompt: string, options?: CompletionOptions): Promise<string>;
75
- completeJSON<T>(prompt: string, schema: unknown): Promise<T>;
76
- }
77
- interface CompletionOptions {
78
- maxTokens?: number;
79
- temperature?: number;
80
- systemPrompt?: string;
81
- }
82
- /**
83
- * Forward declaration for Fetcher (defined in fetchers/types.ts)
84
- */
85
- interface Fetcher {
86
- readonly name: string;
87
- fetch(url: string, options: FetchOptions): Promise<FetchResult>;
88
- }
89
- interface FetchOptions {
90
- timeout?: number;
91
- userAgent?: string;
92
- headers?: Record<string, string>;
93
- }
94
- interface FetchResult {
95
- html: string;
96
- finalUrl: string;
97
- statusCode: number;
98
- contentType: string;
99
- headers?: Record<string, string>;
100
- }
101
- /**
102
- * Forward declaration for Extractor (defined in extractors/types.ts)
103
- */
104
- interface Extractor {
105
- readonly name: string;
106
- readonly priority?: number;
107
- extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
108
- }
109
- /**
110
- * Shared context passed to all extractors
111
- */
112
- interface ExtractionContext {
113
- url: string;
114
- finalUrl: string;
115
- html: string;
116
- $: CheerioAPI;
117
- getDocument(): Document;
118
- results: Partial<ScrapedData>;
119
- options: ScrapeOptions;
120
- }
121
- /**
122
- * Options for scraping
123
- */
124
- interface ScrapeOptions {
125
- /** Timeout in milliseconds (default: 10000) */
126
- timeout?: number;
127
- /** User agent string */
128
- userAgent?: string;
129
- /** Whether to extract full content (default: true) */
130
- extractContent?: boolean;
131
- /** Maximum content length in characters (default: 50000) */
132
- maxContentLength?: number;
133
- /** Custom fetcher (for Puppeteer/Playwright) */
134
- fetcher?: Fetcher;
135
- /** Custom extractors to run */
136
- extractors?: Extractor[];
137
- /** If true, only run custom extractors (replace defaults) */
138
- replaceDefaultExtractors?: boolean;
139
- /** Check robots.txt before scraping (default: false) */
140
- respectRobots?: boolean;
141
- /** LLM provider for enhancements */
142
- llm?: LLMProvider;
143
- /** LLM enhancement types to run */
144
- enhance?: EnhancementType[];
145
- /** Schema for structured LLM extraction */
146
- extract?: ExtractionSchema;
147
- }
148
- //#endregion
149
- export { ExtractedLink as a, ExtractionSchemaType as c, FetchResult as d, Fetcher as f, ScrapedData as h, ExtractedEntities as i, Extractor as l, ScrapeOptions as m, ContentType as n, ExtractionContext as o, LLMProvider as p, EnhancementType as r, ExtractionSchema as s, CompletionOptions as t, FetchOptions as u };
150
- //# sourceMappingURL=types-D0HYR95H.d.cts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"types-D0HYR95H.d.cts","names":[],"sources":["../src/core/types.ts"],"sourcesContent":[],"mappings":";;;;;;AAKA;AAaiB,KAbL,WAAA,GAakB,SAAA,GAAA,MAAA,GAAA,MAAA,GAAA,SAAA,GAAA,OAAA,GAAA,MAAA,GAAA,SAAA,GAAA,SAAA;AAS9B;AAWA;;AA8BW,UAlDM,aAAA,CAkDN;EAGD,GAAA,EAAA,MAAA;EAKG,IAAA,EAAA,MAAA;EACC,UAAA,EAAA,OAAA;;;AAcd;AAKA;AAQY,UA7EK,iBAAA,CA6E6B;EAK7B,MAAA,EAAA,MAAW,EAAA;EAES,aAAA,EAAA,MAAA,EAAA;EAAoB,YAAA,EAAA,MAAA,EAAA;EACG,SAAA,EAAA,MAAA,EAAA;EAAR,QAAA,EAAA,MAAA,EAAA;;AAGpD;AASA;;AAEqD,UAxFpC,WAAA,CAwFoC;EAAR,GAAA,EAAA,MAAA;EAAO,YAAA,EAAA,MAAA;EAGnC,MAAA,EAAA,MAAA;EAMA,KAAA,EAAA,MAAA;EAWA,WAAA,EAAS,MAAA;EAGP,KAAA,CAAA,EAAA,MAAA;EAAoC,OAAA,CAAA,EAAA,MAAA;EAAR,OAAA,EAAA,MAAA;EAAR,WAAA,EAAA,MAAA;EAAO,OAAA,EAAA,MAAA;EAM7B,SAAA,EAAA,MAAA;EAOZ,MAAA,CAAA,EAAA,MAAA;EAGY,WAAA,CAAA,EAAA,MAAA;EAGE,UAAA,CAAA,EAAA,MAAA;EAAR,QAAA,CAAA,EAAA,MAAA;EAGA,QAAA,CAAA,EAAA,MAAA;EAAa,WAAA,EA3GT,WA2GS;EAMP,QAAA,EAAA,MAAa,EAAA;EAclB,MAAA,CAAA,EA3HD,MA2HC,CAAA,MAAA,EAAA,OAAA,CAAA,EAAA;EAGG,KAAA,CAAA,EA3HL,aA2HK,EAAA;EASP,OAAA,CAAA,EAAA,MAAA;EAGI,aAAA,CAAA,EAAA,MAAA,EAAA;EAGA,QAAA,CAAA,EArIC,iBAqID;EAAgB,SAAA,CAAA,EApId,MAoIc,CAAA,MAAA,EAAA,OAAA,CAAA;WAjIjB;;;;;;;;KAWC,eAAA;;;;KAKA,oBAAA;KAQA,gBAAA,GAAmB,eAAe;;;;UAK7B,WAAA;;qCAEoB,oBAAoB;oDACL,QAAQ;;UAG3C,iBAAA;;;;;;;;UASA,OAAA;;8BAEa,eAAe,QAAQ;;UAGpC,YAAA;;;YAGL;;UAGK,WAAA;;;;;YAKL;;;;;UAMK,SAAA;;;mBAGE,oBAAoB,QAAQ,QAAQ;;;;;UAMtC,iBAAA;;;;KAOZ;iBAGY;WAGN,QAAQ;WAGR;;;;;UAMM,aAAA;;;;;;;;;;YAcL;;eAGG;;;;;;QASP;;YAGI;;YAGA"}