scrapex 0.5.3 → 1.0.0-alpha.1
This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- package/LICENSE +1 -1
- package/README.md +392 -145
- package/dist/enhancer-Q6CSc1gA.mjs +220 -0
- package/dist/enhancer-Q6CSc1gA.mjs.map +1 -0
- package/dist/enhancer-oM4BhYYS.cjs +268 -0
- package/dist/enhancer-oM4BhYYS.cjs.map +1 -0
- package/dist/index.cjs +852 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +264 -0
- package/dist/index.d.cts.map +1 -0
- package/dist/index.d.mts +264 -0
- package/dist/index.d.mts.map +1 -0
- package/dist/index.mjs +798 -0
- package/dist/index.mjs.map +1 -0
- package/dist/llm/index.cjs +316 -0
- package/dist/llm/index.cjs.map +1 -0
- package/dist/llm/index.d.cts +211 -0
- package/dist/llm/index.d.cts.map +1 -0
- package/dist/llm/index.d.mts +211 -0
- package/dist/llm/index.d.mts.map +1 -0
- package/dist/llm/index.mjs +310 -0
- package/dist/llm/index.mjs.map +1 -0
- package/dist/parsers/index.cjs +200 -0
- package/dist/parsers/index.cjs.map +1 -0
- package/dist/parsers/index.d.cts +133 -0
- package/dist/parsers/index.d.cts.map +1 -0
- package/dist/parsers/index.d.mts +133 -0
- package/dist/parsers/index.d.mts.map +1 -0
- package/dist/parsers/index.mjs +192 -0
- package/dist/parsers/index.mjs.map +1 -0
- package/dist/types-CNQZVW36.d.mts +150 -0
- package/dist/types-CNQZVW36.d.mts.map +1 -0
- package/dist/types-D0HYR95H.d.cts +150 -0
- package/dist/types-D0HYR95H.d.cts.map +1 -0
- package/package.json +80 -100
- package/dist/index.d.ts +0 -45
- package/dist/index.js +0 -8
- package/dist/scrapex.cjs.development.js +0 -1130
- package/dist/scrapex.cjs.development.js.map +0 -1
- package/dist/scrapex.cjs.production.min.js +0 -2
- package/dist/scrapex.cjs.production.min.js.map +0 -1
- package/dist/scrapex.esm.js +0 -1122
- package/dist/scrapex.esm.js.map +0 -1

@@ -0,0 +1,133 @@
+//#region src/parsers/types.d.ts
+/**
+ * Generic source parser interface.
+ * Parsers transform raw content into structured data with metadata.
+ *
+ * @template TData - The main data type (e.g., array of links)
+ * @template TMeta - Optional metadata type
+ */
+interface SourceParser<TData, TMeta = unknown> {
+  readonly name: string;
+  /**
+   * Check if this parser can handle the given content
+   */
+  canParse(content: string, url?: string): boolean;
+  /**
+   * Parse the content and extract structured data
+   */
+  parse(content: string, url?: string): ParserResult<TData, TMeta>;
+}
+/**
+ * Result from a parser
+ */
+interface ParserResult<TData, TMeta = unknown> {
+  data: TData;
+  meta?: TMeta;
+}
+/**
+ * Markdown link extracted from content
+ */
+interface MarkdownLink {
+  url: string;
+  text: string;
+  title?: string;
+  context?: string;
+}
+/**
+ * Markdown section (heading + content)
+ */
+interface MarkdownSection {
+  level: number;
+  title: string;
+  content: string;
+  links: MarkdownLink[];
+}
+/**
+ * Parsed markdown structure
+ */
+interface ParsedMarkdown {
+  title?: string;
+  description?: string;
+  sections: MarkdownSection[];
+  links: MarkdownLink[];
+  codeBlocks: CodeBlock[];
+  frontmatter?: Record<string, unknown>;
+}
+/**
+ * Code block from markdown
+ */
+interface CodeBlock {
+  language?: string;
+  code: string;
+  meta?: string;
+}
+/**
+ * GitHub repository metadata
+ */
+interface GitHubMeta {
+  repoOwner?: string;
+  repoName?: string;
+  stars?: number;
+  lastUpdated?: string;
+}
+//#endregion
+//#region src/parsers/github.d.ts
+/**
+ * GitHub-specific utilities for parsing repositories.
+ */
+/**
+ * Check if a URL is a GitHub repository
+ */
+declare function isGitHubRepo(url: string): boolean;
+/**
+ * Extract GitHub repo info from URL
+ */
+declare function parseGitHubUrl(url: string): {
+  owner: string;
+  repo: string;
+} | null;
+/**
+ * Convert a GitHub repo URL to raw content URL
+ */
+declare function toRawUrl(url: string, branch?: string, file?: string): string;
+/**
+ * Fetch GitHub API metadata for a repository
+ * Note: This is a placeholder - actual implementation would need GitHub API access
+ */
+declare function fetchRepoMeta(owner: string, repo: string, _token?: string): Promise<GitHubMeta>;
+/**
+ * Group links by their category/section
+ */
+declare function groupByCategory(links: MarkdownLink[]): Map<string, MarkdownLink[]>;
+//#endregion
+//#region src/parsers/markdown.d.ts
+/**
+ * Generic Markdown parser.
+ * Extracts structure, links, and code blocks from markdown content.
+ *
+ * @example
+ * ```ts
+ * const parser = new MarkdownParser();
+ * const result = parser.parse(markdownContent);
+ * console.log(result.data.sections);
+ * console.log(result.data.links);
+ * ```
+ */
+declare class MarkdownParser implements SourceParser<ParsedMarkdown> {
+  readonly name = "markdown";
+  canParse(content: string): boolean;
+  parse(content: string): ParserResult<ParsedMarkdown>;
+  private parseFrontmatter;
+  private extractDescription;
+}
+/**
+ * Extract links from a list-based markdown structure (like awesome lists)
+ */
+declare function extractListLinks(markdown: string): MarkdownLink[];
+/**
+ * Parse markdown into sections by heading level
+ */
+declare function parseByHeadings(markdown: string, minLevel?: number): MarkdownSection[];
+//#endregion
+export { type CodeBlock, type GitHubMeta, type MarkdownLink, MarkdownParser, type MarkdownSection, type ParsedMarkdown, type ParserResult, type SourceParser, extractListLinks, fetchRepoMeta, groupByCategory, isGitHubRepo, parseByHeadings, parseGitHubUrl, toRawUrl };
+//# sourceMappingURL=index.d.cts.map

@@ -0,0 +1 @@
+[single-line source map for index.d.cts; sources: ../../src/parsers/types.ts, ../../src/parsers/github.ts, ../../src/parsers/markdown.ts; VLQ mappings omitted]

@@ -0,0 +1,133 @@
+//#region src/parsers/types.d.ts
+/**
+ * Generic source parser interface.
+ * Parsers transform raw content into structured data with metadata.
+ *
+ * @template TData - The main data type (e.g., array of links)
+ * @template TMeta - Optional metadata type
+ */
+interface SourceParser<TData, TMeta = unknown> {
+  readonly name: string;
+  /**
+   * Check if this parser can handle the given content
+   */
+  canParse(content: string, url?: string): boolean;
+  /**
+   * Parse the content and extract structured data
+   */
+  parse(content: string, url?: string): ParserResult<TData, TMeta>;
+}
+/**
+ * Result from a parser
+ */
+interface ParserResult<TData, TMeta = unknown> {
+  data: TData;
+  meta?: TMeta;
+}
+/**
+ * Markdown link extracted from content
+ */
+interface MarkdownLink {
+  url: string;
+  text: string;
+  title?: string;
+  context?: string;
+}
+/**
+ * Markdown section (heading + content)
+ */
+interface MarkdownSection {
+  level: number;
+  title: string;
+  content: string;
+  links: MarkdownLink[];
+}
+/**
+ * Parsed markdown structure
+ */
+interface ParsedMarkdown {
+  title?: string;
+  description?: string;
+  sections: MarkdownSection[];
+  links: MarkdownLink[];
+  codeBlocks: CodeBlock[];
+  frontmatter?: Record<string, unknown>;
+}
+/**
+ * Code block from markdown
+ */
+interface CodeBlock {
+  language?: string;
+  code: string;
+  meta?: string;
+}
+/**
+ * GitHub repository metadata
+ */
+interface GitHubMeta {
+  repoOwner?: string;
+  repoName?: string;
+  stars?: number;
+  lastUpdated?: string;
+}
+//#endregion
+//#region src/parsers/github.d.ts
+/**
+ * GitHub-specific utilities for parsing repositories.
+ */
+/**
+ * Check if a URL is a GitHub repository
+ */
+declare function isGitHubRepo(url: string): boolean;
+/**
+ * Extract GitHub repo info from URL
+ */
+declare function parseGitHubUrl(url: string): {
+  owner: string;
+  repo: string;
+} | null;
+/**
+ * Convert a GitHub repo URL to raw content URL
+ */
+declare function toRawUrl(url: string, branch?: string, file?: string): string;
+/**
+ * Fetch GitHub API metadata for a repository
+ * Note: This is a placeholder - actual implementation would need GitHub API access
+ */
+declare function fetchRepoMeta(owner: string, repo: string, _token?: string): Promise<GitHubMeta>;
+/**
+ * Group links by their category/section
+ */
+declare function groupByCategory(links: MarkdownLink[]): Map<string, MarkdownLink[]>;
+//#endregion
+//#region src/parsers/markdown.d.ts
+/**
+ * Generic Markdown parser.
+ * Extracts structure, links, and code blocks from markdown content.
+ *
+ * @example
+ * ```ts
+ * const parser = new MarkdownParser();
+ * const result = parser.parse(markdownContent);
+ * console.log(result.data.sections);
+ * console.log(result.data.links);
+ * ```
+ */
+declare class MarkdownParser implements SourceParser<ParsedMarkdown> {
+  readonly name = "markdown";
+  canParse(content: string): boolean;
+  parse(content: string): ParserResult<ParsedMarkdown>;
+  private parseFrontmatter;
+  private extractDescription;
+}
+/**
+ * Extract links from a list-based markdown structure (like awesome lists)
+ */
+declare function extractListLinks(markdown: string): MarkdownLink[];
+/**
+ * Parse markdown into sections by heading level
+ */
+declare function parseByHeadings(markdown: string, minLevel?: number): MarkdownSection[];
+//#endregion
+export { type CodeBlock, type GitHubMeta, type MarkdownLink, MarkdownParser, type MarkdownSection, type ParsedMarkdown, type ParserResult, type SourceParser, extractListLinks, fetchRepoMeta, groupByCategory, isGitHubRepo, parseByHeadings, parseGitHubUrl, toRawUrl };
+//# sourceMappingURL=index.d.mts.map

@@ -0,0 +1 @@
+[single-line source map for index.d.mts; sources: ../../src/parsers/types.ts, ../../src/parsers/github.ts, ../../src/parsers/markdown.ts; VLQ mappings omitted]

@@ -0,0 +1,192 @@
+import { fromMarkdown } from "mdast-util-from-markdown";
+import { toString } from "mdast-util-to-string";
+import { visit } from "unist-util-visit";
+
+//#region src/parsers/github.ts
+/**
+ * GitHub-specific utilities for parsing repositories.
+ */
+/**
+ * Check if a URL is a GitHub repository
+ */
+function isGitHubRepo(url) {
+  return /^https?:\/\/(www\.)?github\.com\/[^/]+\/[^/]+\/?$/.test(url);
+}
+/**
+ * Extract GitHub repo info from URL
+ */
+function parseGitHubUrl(url) {
+  const match = url.match(/github\.com\/([^/]+)\/([^/]+)/);
+  if (!match || !match[1] || !match[2]) return null;
+  return {
+    owner: match[1],
+    repo: match[2].replace(/\.git$/, "")
+  };
+}
+/**
+ * Convert a GitHub repo URL to raw content URL
+ */
+function toRawUrl(url, branch = "main", file = "README.md") {
+  const info = parseGitHubUrl(url);
+  if (!info) return url;
+  return `https://raw.githubusercontent.com/${info.owner}/${info.repo}/${branch}/${file}`;
+}
+/**
+ * Fetch GitHub API metadata for a repository
+ * Note: This is a placeholder - actual implementation would need GitHub API access
+ */
+async function fetchRepoMeta(owner, repo, _token) {
+  return {
+    repoOwner: owner,
+    repoName: repo
+  };
+}
+/**
+ * Group links by their category/section
+ */
+function groupByCategory(links) {
+  const groups = /* @__PURE__ */ new Map();
+  for (const link of links) {
+    const category = link.context || "Uncategorized";
+    const existing = groups.get(category) || [];
+    existing.push(link);
+    groups.set(category, existing);
+  }
+  return groups;
+}
+
+//#endregion
+//#region src/parsers/markdown.ts
+/**
+ * Generic Markdown parser.
+ * Extracts structure, links, and code blocks from markdown content.
+ *
+ * @example
+ * ```ts
+ * const parser = new MarkdownParser();
+ * const result = parser.parse(markdownContent);
+ * console.log(result.data.sections);
+ * console.log(result.data.links);
+ * ```
+ */
+var MarkdownParser = class {
+  name = "markdown";
+  canParse(content) {
+    return content.includes("# ") || content.includes("## ") || content.includes("- [") || content.includes("* [") || content.includes("```");
+  }
+  parse(content) {
+    const tree = fromMarkdown(content);
+    const sections = [];
+    const allLinks = [];
+    const codeBlocks = [];
+    let frontmatter;
+    if (content.startsWith("---")) {
+      const endIndex = content.indexOf("---", 3);
+      if (endIndex !== -1) {
+        const frontmatterContent = content.slice(3, endIndex).trim();
+        frontmatter = this.parseFrontmatter(frontmatterContent);
+      }
+    }
+    let currentSection = null;
+    visit(tree, (node) => {
+      if (node.type === "heading") {
+        const heading = node;
+        const title = toString(heading);
+        if (currentSection) sections.push(currentSection);
+        currentSection = {
+          level: heading.depth,
+          title,
+          content: "",
+          links: []
+        };
+      }
+      if (node.type === "link") {
+        const link = node;
+        const text = toString(link);
+        const linkData = {
+          url: link.url,
+          text,
+          title: link.title ?? void 0,
+          context: currentSection?.title
+        };
+        allLinks.push(linkData);
+        if (currentSection) currentSection.links.push(linkData);
+      }
+      if (node.type === "code") {
+        const code = node;
+        codeBlocks.push({
+          language: code.lang ?? void 0,
+          code: code.value,
+          meta: code.meta ?? void 0
+        });
+      }
+      if (currentSection && node.type === "paragraph") {
+        const text = toString(node);
+        currentSection.content += (currentSection.content ? "\n\n" : "") + text;
+      }
+    });
+    if (currentSection) sections.push(currentSection);
+    return { data: {
+      title: frontmatter?.title ?? sections.find((s) => s.level === 1)?.title,
+      description: frontmatter?.description ?? this.extractDescription(tree),
+      sections,
+      links: allLinks,
+      codeBlocks,
+      frontmatter
+    } };
+  }
+  parseFrontmatter(content) {
+    const result = {};
+    const lines = content.split("\n");
+    for (const line of lines) {
+      const colonIndex = line.indexOf(":");
+      if (colonIndex > 0) {
+        const key = line.slice(0, colonIndex).trim();
+        let value = line.slice(colonIndex + 1).trim();
+        if (value === "true") value = true;
+        else if (value === "false") value = false;
+        else if (/^-?\d+(\.\d+)?$/.test(value)) value = Number(value);
+        else if (value.startsWith("\"") && value.endsWith("\"")) value = value.slice(1, -1);
+        else if (value.startsWith("'") && value.endsWith("'")) value = value.slice(1, -1);
+        result[key] = value;
+      }
+    }
+    return result;
+  }
+  extractDescription(tree) {
+    for (const node of tree.children) {
+      if (node.type === "heading") break;
+      if (node.type === "paragraph") return toString(node);
+    }
+  }
+};
+/**
+ * Extract links from a list-based markdown structure (like awesome lists)
+ */
+function extractListLinks(markdown) {
+  const tree = fromMarkdown(markdown);
+  const links = [];
+  let currentHeading = "";
+  visit(tree, (node) => {
+    if (node.type === "heading") currentHeading = toString(node);
+    if (node.type === "listItem") visit(node, "link", (linkNode) => {
+      links.push({
+        url: linkNode.url,
+        text: toString(linkNode),
+        title: linkNode.title ?? void 0,
+        context: currentHeading || void 0
+      });
+    });
+  });
+  return links;
+}
+/**
+ * Parse markdown into sections by heading level
+ */
+function parseByHeadings(markdown, minLevel = 2) {
+  return new MarkdownParser().parse(markdown).data.sections.filter((s) => s.level >= minLevel);
+}
+
+//#endregion
+export { MarkdownParser, extractListLinks, fetchRepoMeta, groupByCategory, isGitHubRepo, parseByHeadings, parseGitHubUrl, toRawUrl };
+//# sourceMappingURL=index.mjs.map
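
The GitHub helpers in this module are pure string utilities, so their behavior follows directly from the regexes above. A short sketch with expected results in comments (the `scrapex/parsers` specifier and the `user`/`repo` names are placeholders; note that `toRawUrl` defaults to the `main` branch, so a repository whose default branch is `master` needs it passed explicitly):

```ts
import { isGitHubRepo, parseGitHubUrl, toRawUrl } from "scrapex/parsers"; // assumed subpath

isGitHubRepo("https://github.com/user/repo");        // true
isGitHubRepo("https://github.com/user/repo/issues"); // false: extra path segment

parseGitHubUrl("https://github.com/user/repo.git");
// { owner: "user", repo: "repo" } - the trailing ".git" is stripped

toRawUrl("https://github.com/user/repo");
// "https://raw.githubusercontent.com/user/repo/main/README.md"

toRawUrl("https://github.com/user/repo", "master", "docs/intro.md");
// "https://raw.githubusercontent.com/user/repo/master/docs/intro.md"

toRawUrl("https://example.com/not-github");
// returned unchanged: parseGitHubUrl yields null
```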

@@ -0,0 +1 @@
+[single-line source map for index.mjs; sources: ../../src/parsers/github.ts, ../../src/parsers/markdown.ts; inlined TypeScript sourcesContent and VLQ mappings omitted]

@@ -0,0 +1,150 @@
+import { CheerioAPI } from "cheerio";
+
+//#region src/core/types.d.ts
+
+/**
+ * Content type classification for scraped URLs
+ */
+type ContentType = 'article' | 'repo' | 'docs' | 'package' | 'video' | 'tool' | 'product' | 'unknown';
+/**
+ * Extracted link from content
+ */
+interface ExtractedLink {
+  url: string;
+  text: string;
+  isExternal: boolean;
+}
+/**
+ * Extracted entities from LLM enhancement
+ */
+interface ExtractedEntities {
+  people: string[];
+  organizations: string[];
+  technologies: string[];
+  locations: string[];
+  concepts: string[];
+}
+/**
+ * Main result of metadata scraping - optimized for LLM consumption
+ */
+interface ScrapedData {
+  url: string;
+  canonicalUrl: string;
+  domain: string;
+  title: string;
+  description: string;
+  image?: string;
+  favicon?: string;
+  content: string;
+  textContent: string;
+  excerpt: string;
+  wordCount: number;
+  author?: string;
+  publishedAt?: string;
+  modifiedAt?: string;
+  siteName?: string;
+  language?: string;
+  contentType: ContentType;
+  keywords: string[];
+  jsonLd?: Record<string, unknown>[];
+  links?: ExtractedLink[];
+  summary?: string;
+  suggestedTags?: string[];
+  entities?: ExtractedEntities;
+  extracted?: Record<string, unknown>;
+  custom?: Record<string, unknown>;
+  scrapedAt: string;
+  scrapeTimeMs: number;
+  error?: string;
+}
+/**
+ * LLM enhancement types
+ */
+type EnhancementType = 'summarize' | 'tags' | 'entities' | 'classify';
+/**
+ * Schema for structured LLM extraction
+ */
+type ExtractionSchemaType = 'string' | 'number' | 'boolean' | 'string[]' | 'number[]' | `${string}?`;
+type ExtractionSchema = Record<string, ExtractionSchemaType>;
+/**
+ * Forward declaration for LLM provider (defined in llm/types.ts)
+ */
+interface LLMProvider {
+  readonly name: string;
+  complete(prompt: string, options?: CompletionOptions): Promise<string>;
+  completeJSON<T>(prompt: string, schema: unknown): Promise<T>;
+}
+interface CompletionOptions {
+  maxTokens?: number;
+  temperature?: number;
+  systemPrompt?: string;
+}
+/**
+ * Forward declaration for Fetcher (defined in fetchers/types.ts)
+ */
+interface Fetcher {
+  readonly name: string;
+  fetch(url: string, options: FetchOptions): Promise<FetchResult>;
+}
+interface FetchOptions {
+  timeout?: number;
+  userAgent?: string;
+  headers?: Record<string, string>;
+}
+interface FetchResult {
+  html: string;
+  finalUrl: string;
+  statusCode: number;
+  contentType: string;
+  headers?: Record<string, string>;
+}
+/**
+ * Forward declaration for Extractor (defined in extractors/types.ts)
+ */
+interface Extractor {
+  readonly name: string;
+  readonly priority?: number;
+  extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
+}
+/**
+ * Shared context passed to all extractors
+ */
+interface ExtractionContext {
+  url: string;
+  finalUrl: string;
+  html: string;
+  $: CheerioAPI;
+  getDocument(): Document;
+  results: Partial<ScrapedData>;
+  options: ScrapeOptions;
+}
+/**
+ * Options for scraping
+ */
+interface ScrapeOptions {
+  /** Timeout in milliseconds (default: 10000) */
+  timeout?: number;
+  /** User agent string */
+  userAgent?: string;
+  /** Whether to extract full content (default: true) */
+  extractContent?: boolean;
+  /** Maximum content length in characters (default: 50000) */
+  maxContentLength?: number;
+  /** Custom fetcher (for Puppeteer/Playwright) */
+  fetcher?: Fetcher;
+  /** Custom extractors to run */
+  extractors?: Extractor[];
+  /** If true, only run custom extractors (replace defaults) */
+  replaceDefaultExtractors?: boolean;
+  /** Check robots.txt before scraping (default: false) */
+  respectRobots?: boolean;
+  /** LLM provider for enhancements */
+  llm?: LLMProvider;
+  /** LLM enhancement types to run */
+  enhance?: EnhancementType[];
+  /** Schema for structured LLM extraction */
+  extract?: ExtractionSchema;
+}
+//#endregion
+export { ExtractedLink as a, ExtractionSchemaType as c, FetchResult as d, Fetcher as f, ScrapedData as h, ExtractedEntities as i, Extractor as l, ScrapeOptions as m, ContentType as n, ExtractionContext as o, LLMProvider as p, EnhancementType as r, ExtractionSchema as s, CompletionOptions as t, FetchOptions as u };
+//# sourceMappingURL=types-CNQZVW36.d.mts.map
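
These core types make the scrape pipeline pluggable: each `Extractor` receives a shared `ExtractionContext` and contributes a `Partial<ScrapedData>` that is merged into the result. A sketch of a custom extractor wired through `ScrapeOptions`, assuming the root `scrapex` entry re-exports these types; the reading-time extractor itself is hypothetical:

```ts
import type { ExtractionContext, Extractor, ScrapedData, ScrapeOptions } from "scrapex"; // assumed root export

// Hypothetical extractor: estimate reading time from the page's visible text.
const readingTimeExtractor: Extractor = {
  name: "reading-time",
  priority: 10,
  async extract(context: ExtractionContext): Promise<Partial<ScrapedData>> {
    // context.$ is the CheerioAPI over the fetched HTML.
    const words = context.$("body").text().trim().split(/\s+/).length;
    return { custom: { readingTimeMin: Math.ceil(words / 200) } };
  },
};

const options: ScrapeOptions = {
  timeout: 15_000,
  extractors: [readingTimeExtractor],
  // Defaults still run alongside it; set replaceDefaultExtractors: true to run only custom extractors.
};
```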

@@ -0,0 +1 @@
+[single-line source map for types-CNQZVW36.d.mts; source: ../src/core/types.ts; VLQ mappings omitted]
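
On the LLM side, `ScrapeOptions.llm` accepts any object satisfying `LLMProvider`, and `extract` takes an `ExtractionSchema` keyed by output field name. A provider stub plus a schema request; the stub stands in for a real API client, the root `scrapex` import is assumed as above, and reading the trailing `?` in `"number?"` as an optional-field marker is an inference from the `ExtractionSchemaType` template literal:

```ts
import type { CompletionOptions, ExtractionSchema, LLMProvider, ScrapeOptions } from "scrapex"; // assumed root export

// Stub provider: a real implementation would call an LLM API in both methods.
const stubProvider: LLMProvider = {
  name: "stub",
  async complete(prompt: string, _options?: CompletionOptions): Promise<string> {
    return prompt.slice(0, 100); // placeholder behavior
  },
  async completeJSON<T>(_prompt: string, _schema: unknown): Promise<T> {
    return {} as T; // placeholder behavior
  },
};

const schema: ExtractionSchema = {
  productName: "string",
  price: "number?",     // "?" suffix presumably marks the field optional
  features: "string[]",
};

const options: ScrapeOptions = {
  llm: stubProvider,
  enhance: ["summarize", "tags", "entities"],
  extract: schema,
};
```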