scrapex 1.0.0-alpha.1 → 1.0.0-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +164 -5
- package/dist/embeddings/index.cjs +52 -0
- package/dist/embeddings/index.d.cts +3 -0
- package/dist/embeddings/index.d.mts +3 -0
- package/dist/embeddings/index.mjs +4 -0
- package/dist/embeddings-BjNTQSG9.cjs +1455 -0
- package/dist/embeddings-BjNTQSG9.cjs.map +1 -0
- package/dist/embeddings-Bsymy_jA.mjs +1215 -0
- package/dist/embeddings-Bsymy_jA.mjs.map +1 -0
- package/dist/{enhancer-oM4BhYYS.cjs → enhancer-Cs_WyWtJ.cjs} +2 -51
- package/dist/enhancer-Cs_WyWtJ.cjs.map +1 -0
- package/dist/{enhancer-Q6CSc1gA.mjs → enhancer-INx5NlgO.mjs} +2 -45
- package/dist/enhancer-INx5NlgO.mjs.map +1 -0
- package/dist/http-base-CHLf-Tco.cjs +684 -0
- package/dist/http-base-CHLf-Tco.cjs.map +1 -0
- package/dist/http-base-DM7YNo6X.mjs +618 -0
- package/dist/http-base-DM7YNo6X.mjs.map +1 -0
- package/dist/index-Bvseqli-.d.cts +268 -0
- package/dist/index-Bvseqli-.d.cts.map +1 -0
- package/dist/index-CIFjNySr.d.mts +268 -0
- package/dist/index-CIFjNySr.d.mts.map +1 -0
- package/dist/index-D6qfjmZQ.d.mts +401 -0
- package/dist/index-D6qfjmZQ.d.mts.map +1 -0
- package/dist/index-RFSpP5g8.d.cts +401 -0
- package/dist/index-RFSpP5g8.d.cts.map +1 -0
- package/dist/index.cjs +171 -51
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +61 -2
- package/dist/index.d.cts.map +1 -1
- package/dist/index.d.mts +61 -2
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +129 -6
- package/dist/index.mjs.map +1 -1
- package/dist/llm/index.cjs +252 -233
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +132 -85
- package/dist/llm/index.d.cts.map +1 -1
- package/dist/llm/index.d.mts +132 -85
- package/dist/llm/index.d.mts.map +1 -1
- package/dist/llm/index.mjs +244 -236
- package/dist/llm/index.mjs.map +1 -1
- package/dist/parsers/index.cjs +10 -199
- package/dist/parsers/index.d.cts +2 -133
- package/dist/parsers/index.d.mts +2 -133
- package/dist/parsers/index.mjs +2 -191
- package/dist/parsers-Bneuws8x.cjs +569 -0
- package/dist/parsers-Bneuws8x.cjs.map +1 -0
- package/dist/parsers-DsawHeo0.mjs +482 -0
- package/dist/parsers-DsawHeo0.mjs.map +1 -0
- package/dist/types-BOcHQU9s.d.mts +831 -0
- package/dist/types-BOcHQU9s.d.mts.map +1 -0
- package/dist/types-DutdBpqd.d.cts +831 -0
- package/dist/types-DutdBpqd.d.cts.map +1 -0
- package/package.json +15 -16
- package/dist/enhancer-Q6CSc1gA.mjs.map +0 -1
- package/dist/enhancer-oM4BhYYS.cjs.map +0 -1
- package/dist/parsers/index.cjs.map +0 -1
- package/dist/parsers/index.d.cts.map +0 -1
- package/dist/parsers/index.d.mts.map +0 -1
- package/dist/parsers/index.mjs.map +0 -1
- package/dist/types-CNQZVW36.d.mts +0 -150
- package/dist/types-CNQZVW36.d.mts.map +0 -1
- package/dist/types-D0HYR95H.d.cts +0 -150
- package/dist/types-D0HYR95H.d.cts.map +0 -1
package/dist/parsers/index.cjs
CHANGED
|
@@ -1,200 +1,11 @@
|
|
|
1
|
-
const
|
|
2
|
-
let mdast_util_from_markdown = require("mdast-util-from-markdown");
|
|
3
|
-
let mdast_util_to_string = require("mdast-util-to-string");
|
|
4
|
-
let unist_util_visit = require("unist-util-visit");
|
|
1
|
+
const require_parsers = require('../parsers-Bneuws8x.cjs');
|
|
5
2
|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
}
|
|
16
|
-
/**
|
|
17
|
-
* Extract GitHub repo info from URL
|
|
18
|
-
*/
|
|
19
|
-
function parseGitHubUrl(url) {
|
|
20
|
-
const match = url.match(/github\.com\/([^/]+)\/([^/]+)/);
|
|
21
|
-
if (!match || !match[1] || !match[2]) return null;
|
|
22
|
-
return {
|
|
23
|
-
owner: match[1],
|
|
24
|
-
repo: match[2].replace(/\.git$/, "")
|
|
25
|
-
};
|
|
26
|
-
}
|
|
27
|
-
/**
|
|
28
|
-
* Convert a GitHub repo URL to raw content URL
|
|
29
|
-
*/
|
|
30
|
-
function toRawUrl(url, branch = "main", file = "README.md") {
|
|
31
|
-
const info = parseGitHubUrl(url);
|
|
32
|
-
if (!info) return url;
|
|
33
|
-
return `https://raw.githubusercontent.com/${info.owner}/${info.repo}/${branch}/${file}`;
|
|
34
|
-
}
|
|
35
|
-
/**
|
|
36
|
-
* Fetch GitHub API metadata for a repository
|
|
37
|
-
* Note: This is a placeholder - actual implementation would need GitHub API access
|
|
38
|
-
*/
|
|
39
|
-
async function fetchRepoMeta(owner, repo, _token) {
|
|
40
|
-
return {
|
|
41
|
-
repoOwner: owner,
|
|
42
|
-
repoName: repo
|
|
43
|
-
};
|
|
44
|
-
}
|
|
45
|
-
/**
|
|
46
|
-
* Group links by their category/section
|
|
47
|
-
*/
|
|
48
|
-
function groupByCategory(links) {
|
|
49
|
-
const groups = /* @__PURE__ */ new Map();
|
|
50
|
-
for (const link of links) {
|
|
51
|
-
const category = link.context || "Uncategorized";
|
|
52
|
-
const existing = groups.get(category) || [];
|
|
53
|
-
existing.push(link);
|
|
54
|
-
groups.set(category, existing);
|
|
55
|
-
}
|
|
56
|
-
return groups;
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
//#endregion
|
|
60
|
-
//#region src/parsers/markdown.ts
|
|
61
|
-
/**
|
|
62
|
-
* Generic Markdown parser.
|
|
63
|
-
* Extracts structure, links, and code blocks from markdown content.
|
|
64
|
-
*
|
|
65
|
-
* @example
|
|
66
|
-
* ```ts
|
|
67
|
-
* const parser = new MarkdownParser();
|
|
68
|
-
* const result = parser.parse(markdownContent);
|
|
69
|
-
* console.log(result.data.sections);
|
|
70
|
-
* console.log(result.data.links);
|
|
71
|
-
* ```
|
|
72
|
-
*/
|
|
73
|
-
var MarkdownParser = class {
|
|
74
|
-
name = "markdown";
|
|
75
|
-
canParse(content) {
|
|
76
|
-
return content.includes("# ") || content.includes("## ") || content.includes("- [") || content.includes("* [") || content.includes("```");
|
|
77
|
-
}
|
|
78
|
-
parse(content) {
|
|
79
|
-
const tree = (0, mdast_util_from_markdown.fromMarkdown)(content);
|
|
80
|
-
const sections = [];
|
|
81
|
-
const allLinks = [];
|
|
82
|
-
const codeBlocks = [];
|
|
83
|
-
let frontmatter;
|
|
84
|
-
if (content.startsWith("---")) {
|
|
85
|
-
const endIndex = content.indexOf("---", 3);
|
|
86
|
-
if (endIndex !== -1) {
|
|
87
|
-
const frontmatterContent = content.slice(3, endIndex).trim();
|
|
88
|
-
frontmatter = this.parseFrontmatter(frontmatterContent);
|
|
89
|
-
}
|
|
90
|
-
}
|
|
91
|
-
let currentSection = null;
|
|
92
|
-
(0, unist_util_visit.visit)(tree, (node) => {
|
|
93
|
-
if (node.type === "heading") {
|
|
94
|
-
const heading = node;
|
|
95
|
-
const title = (0, mdast_util_to_string.toString)(heading);
|
|
96
|
-
if (currentSection) sections.push(currentSection);
|
|
97
|
-
currentSection = {
|
|
98
|
-
level: heading.depth,
|
|
99
|
-
title,
|
|
100
|
-
content: "",
|
|
101
|
-
links: []
|
|
102
|
-
};
|
|
103
|
-
}
|
|
104
|
-
if (node.type === "link") {
|
|
105
|
-
const link = node;
|
|
106
|
-
const text = (0, mdast_util_to_string.toString)(link);
|
|
107
|
-
const linkData = {
|
|
108
|
-
url: link.url,
|
|
109
|
-
text,
|
|
110
|
-
title: link.title ?? void 0,
|
|
111
|
-
context: currentSection?.title
|
|
112
|
-
};
|
|
113
|
-
allLinks.push(linkData);
|
|
114
|
-
if (currentSection) currentSection.links.push(linkData);
|
|
115
|
-
}
|
|
116
|
-
if (node.type === "code") {
|
|
117
|
-
const code = node;
|
|
118
|
-
codeBlocks.push({
|
|
119
|
-
language: code.lang ?? void 0,
|
|
120
|
-
code: code.value,
|
|
121
|
-
meta: code.meta ?? void 0
|
|
122
|
-
});
|
|
123
|
-
}
|
|
124
|
-
if (currentSection && node.type === "paragraph") {
|
|
125
|
-
const text = (0, mdast_util_to_string.toString)(node);
|
|
126
|
-
currentSection.content += (currentSection.content ? "\n\n" : "") + text;
|
|
127
|
-
}
|
|
128
|
-
});
|
|
129
|
-
if (currentSection) sections.push(currentSection);
|
|
130
|
-
return { data: {
|
|
131
|
-
title: frontmatter?.title ?? sections.find((s) => s.level === 1)?.title,
|
|
132
|
-
description: frontmatter?.description ?? this.extractDescription(tree),
|
|
133
|
-
sections,
|
|
134
|
-
links: allLinks,
|
|
135
|
-
codeBlocks,
|
|
136
|
-
frontmatter
|
|
137
|
-
} };
|
|
138
|
-
}
|
|
139
|
-
parseFrontmatter(content) {
|
|
140
|
-
const result = {};
|
|
141
|
-
const lines = content.split("\n");
|
|
142
|
-
for (const line of lines) {
|
|
143
|
-
const colonIndex = line.indexOf(":");
|
|
144
|
-
if (colonIndex > 0) {
|
|
145
|
-
const key = line.slice(0, colonIndex).trim();
|
|
146
|
-
let value = line.slice(colonIndex + 1).trim();
|
|
147
|
-
if (value === "true") value = true;
|
|
148
|
-
else if (value === "false") value = false;
|
|
149
|
-
else if (/^-?\d+(\.\d+)?$/.test(value)) value = Number(value);
|
|
150
|
-
else if (value.startsWith("\"") && value.endsWith("\"")) value = value.slice(1, -1);
|
|
151
|
-
else if (value.startsWith("'") && value.endsWith("'")) value = value.slice(1, -1);
|
|
152
|
-
result[key] = value;
|
|
153
|
-
}
|
|
154
|
-
}
|
|
155
|
-
return result;
|
|
156
|
-
}
|
|
157
|
-
extractDescription(tree) {
|
|
158
|
-
for (const node of tree.children) {
|
|
159
|
-
if (node.type === "heading") break;
|
|
160
|
-
if (node.type === "paragraph") return (0, mdast_util_to_string.toString)(node);
|
|
161
|
-
}
|
|
162
|
-
}
|
|
163
|
-
};
|
|
164
|
-
/**
|
|
165
|
-
* Extract links from a list-based markdown structure (like awesome lists)
|
|
166
|
-
*/
|
|
167
|
-
function extractListLinks(markdown) {
|
|
168
|
-
const tree = (0, mdast_util_from_markdown.fromMarkdown)(markdown);
|
|
169
|
-
const links = [];
|
|
170
|
-
let currentHeading = "";
|
|
171
|
-
(0, unist_util_visit.visit)(tree, (node) => {
|
|
172
|
-
if (node.type === "heading") currentHeading = (0, mdast_util_to_string.toString)(node);
|
|
173
|
-
if (node.type === "listItem") (0, unist_util_visit.visit)(node, "link", (linkNode) => {
|
|
174
|
-
links.push({
|
|
175
|
-
url: linkNode.url,
|
|
176
|
-
text: (0, mdast_util_to_string.toString)(linkNode),
|
|
177
|
-
title: linkNode.title ?? void 0,
|
|
178
|
-
context: currentHeading || void 0
|
|
179
|
-
});
|
|
180
|
-
});
|
|
181
|
-
});
|
|
182
|
-
return links;
|
|
183
|
-
}
|
|
184
|
-
/**
|
|
185
|
-
* Parse markdown into sections by heading level
|
|
186
|
-
*/
|
|
187
|
-
function parseByHeadings(markdown, minLevel = 2) {
|
|
188
|
-
return new MarkdownParser().parse(markdown).data.sections.filter((s) => s.level >= minLevel);
|
|
189
|
-
}
|
|
190
|
-
|
|
191
|
-
//#endregion
|
|
192
|
-
exports.MarkdownParser = MarkdownParser;
|
|
193
|
-
exports.extractListLinks = extractListLinks;
|
|
194
|
-
exports.fetchRepoMeta = fetchRepoMeta;
|
|
195
|
-
exports.groupByCategory = groupByCategory;
|
|
196
|
-
exports.isGitHubRepo = isGitHubRepo;
|
|
197
|
-
exports.parseByHeadings = parseByHeadings;
|
|
198
|
-
exports.parseGitHubUrl = parseGitHubUrl;
|
|
199
|
-
exports.toRawUrl = toRawUrl;
|
|
200
|
-
//# sourceMappingURL=index.cjs.map
|
|
3
|
+
exports.MarkdownParser = require_parsers.MarkdownParser;
|
|
4
|
+
exports.RSSParser = require_parsers.RSSParser;
|
|
5
|
+
exports.extractListLinks = require_parsers.extractListLinks;
|
|
6
|
+
exports.fetchRepoMeta = require_parsers.fetchRepoMeta;
|
|
7
|
+
exports.groupByCategory = require_parsers.groupByCategory;
|
|
8
|
+
exports.isGitHubRepo = require_parsers.isGitHubRepo;
|
|
9
|
+
exports.parseByHeadings = require_parsers.parseByHeadings;
|
|
10
|
+
exports.parseGitHubUrl = require_parsers.parseGitHubUrl;
|
|
11
|
+
exports.toRawUrl = require_parsers.toRawUrl;
|
package/dist/parsers/index.d.cts
CHANGED
|
@@ -1,133 +1,2 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
* Generic source parser interface.
|
|
4
|
-
* Parsers transform raw content into structured data with metadata.
|
|
5
|
-
*
|
|
6
|
-
* @template TData - The main data type (e.g., array of links)
|
|
7
|
-
* @template TMeta - Optional metadata type
|
|
8
|
-
*/
|
|
9
|
-
interface SourceParser<TData, TMeta = unknown> {
|
|
10
|
-
readonly name: string;
|
|
11
|
-
/**
|
|
12
|
-
* Check if this parser can handle the given content
|
|
13
|
-
*/
|
|
14
|
-
canParse(content: string, url?: string): boolean;
|
|
15
|
-
/**
|
|
16
|
-
* Parse the content and extract structured data
|
|
17
|
-
*/
|
|
18
|
-
parse(content: string, url?: string): ParserResult<TData, TMeta>;
|
|
19
|
-
}
|
|
20
|
-
/**
|
|
21
|
-
* Result from a parser
|
|
22
|
-
*/
|
|
23
|
-
interface ParserResult<TData, TMeta = unknown> {
|
|
24
|
-
data: TData;
|
|
25
|
-
meta?: TMeta;
|
|
26
|
-
}
|
|
27
|
-
/**
|
|
28
|
-
* Markdown link extracted from content
|
|
29
|
-
*/
|
|
30
|
-
interface MarkdownLink {
|
|
31
|
-
url: string;
|
|
32
|
-
text: string;
|
|
33
|
-
title?: string;
|
|
34
|
-
context?: string;
|
|
35
|
-
}
|
|
36
|
-
/**
|
|
37
|
-
* Markdown section (heading + content)
|
|
38
|
-
*/
|
|
39
|
-
interface MarkdownSection {
|
|
40
|
-
level: number;
|
|
41
|
-
title: string;
|
|
42
|
-
content: string;
|
|
43
|
-
links: MarkdownLink[];
|
|
44
|
-
}
|
|
45
|
-
/**
|
|
46
|
-
* Parsed markdown structure
|
|
47
|
-
*/
|
|
48
|
-
interface ParsedMarkdown {
|
|
49
|
-
title?: string;
|
|
50
|
-
description?: string;
|
|
51
|
-
sections: MarkdownSection[];
|
|
52
|
-
links: MarkdownLink[];
|
|
53
|
-
codeBlocks: CodeBlock[];
|
|
54
|
-
frontmatter?: Record<string, unknown>;
|
|
55
|
-
}
|
|
56
|
-
/**
|
|
57
|
-
* Code block from markdown
|
|
58
|
-
*/
|
|
59
|
-
interface CodeBlock {
|
|
60
|
-
language?: string;
|
|
61
|
-
code: string;
|
|
62
|
-
meta?: string;
|
|
63
|
-
}
|
|
64
|
-
/**
|
|
65
|
-
* GitHub repository metadata
|
|
66
|
-
*/
|
|
67
|
-
interface GitHubMeta {
|
|
68
|
-
repoOwner?: string;
|
|
69
|
-
repoName?: string;
|
|
70
|
-
stars?: number;
|
|
71
|
-
lastUpdated?: string;
|
|
72
|
-
}
|
|
73
|
-
//#endregion
|
|
74
|
-
//#region src/parsers/github.d.ts
|
|
75
|
-
/**
|
|
76
|
-
* GitHub-specific utilities for parsing repositories.
|
|
77
|
-
*/
|
|
78
|
-
/**
|
|
79
|
-
* Check if a URL is a GitHub repository
|
|
80
|
-
*/
|
|
81
|
-
declare function isGitHubRepo(url: string): boolean;
|
|
82
|
-
/**
|
|
83
|
-
* Extract GitHub repo info from URL
|
|
84
|
-
*/
|
|
85
|
-
declare function parseGitHubUrl(url: string): {
|
|
86
|
-
owner: string;
|
|
87
|
-
repo: string;
|
|
88
|
-
} | null;
|
|
89
|
-
/**
|
|
90
|
-
* Convert a GitHub repo URL to raw content URL
|
|
91
|
-
*/
|
|
92
|
-
declare function toRawUrl(url: string, branch?: string, file?: string): string;
|
|
93
|
-
/**
|
|
94
|
-
* Fetch GitHub API metadata for a repository
|
|
95
|
-
* Note: This is a placeholder - actual implementation would need GitHub API access
|
|
96
|
-
*/
|
|
97
|
-
declare function fetchRepoMeta(owner: string, repo: string, _token?: string): Promise<GitHubMeta>;
|
|
98
|
-
/**
|
|
99
|
-
* Group links by their category/section
|
|
100
|
-
*/
|
|
101
|
-
declare function groupByCategory(links: MarkdownLink[]): Map<string, MarkdownLink[]>;
|
|
102
|
-
//#endregion
|
|
103
|
-
//#region src/parsers/markdown.d.ts
|
|
104
|
-
/**
|
|
105
|
-
* Generic Markdown parser.
|
|
106
|
-
* Extracts structure, links, and code blocks from markdown content.
|
|
107
|
-
*
|
|
108
|
-
* @example
|
|
109
|
-
* ```ts
|
|
110
|
-
* const parser = new MarkdownParser();
|
|
111
|
-
* const result = parser.parse(markdownContent);
|
|
112
|
-
* console.log(result.data.sections);
|
|
113
|
-
* console.log(result.data.links);
|
|
114
|
-
* ```
|
|
115
|
-
*/
|
|
116
|
-
declare class MarkdownParser implements SourceParser<ParsedMarkdown> {
|
|
117
|
-
readonly name = "markdown";
|
|
118
|
-
canParse(content: string): boolean;
|
|
119
|
-
parse(content: string): ParserResult<ParsedMarkdown>;
|
|
120
|
-
private parseFrontmatter;
|
|
121
|
-
private extractDescription;
|
|
122
|
-
}
|
|
123
|
-
/**
|
|
124
|
-
* Extract links from a list-based markdown structure (like awesome lists)
|
|
125
|
-
*/
|
|
126
|
-
declare function extractListLinks(markdown: string): MarkdownLink[];
|
|
127
|
-
/**
|
|
128
|
-
* Parse markdown into sections by heading level
|
|
129
|
-
*/
|
|
130
|
-
declare function parseByHeadings(markdown: string, minLevel?: number): MarkdownSection[];
|
|
131
|
-
//#endregion
|
|
132
|
-
export { type CodeBlock, type GitHubMeta, type MarkdownLink, MarkdownParser, type MarkdownSection, type ParsedMarkdown, type ParserResult, type SourceParser, extractListLinks, fetchRepoMeta, groupByCategory, isGitHubRepo, parseByHeadings, parseGitHubUrl, toRawUrl };
|
|
133
|
-
//# sourceMappingURL=index.d.cts.map
|
|
1
|
+
import { _ as MarkdownSection, a as parseByHeadings, b as ParserResult, c as isGitHubRepo, d as CodeBlock, f as FeedEnclosure, g as MarkdownLink, h as GitHubMeta, i as extractListLinks, l as parseGitHubUrl, m as FeedMeta, n as RSSParserOptions, o as fetchRepoMeta, p as FeedItem, r as MarkdownParser, s as groupByCategory, t as RSSParser, u as toRawUrl, v as ParsedFeed, x as SourceParser, y as ParsedMarkdown } from "../index-Bvseqli-.cjs";
|
|
2
|
+
export { CodeBlock, FeedEnclosure, FeedItem, FeedMeta, GitHubMeta, MarkdownLink, MarkdownParser, MarkdownSection, ParsedFeed, ParsedMarkdown, ParserResult, RSSParser, RSSParserOptions, SourceParser, extractListLinks, fetchRepoMeta, groupByCategory, isGitHubRepo, parseByHeadings, parseGitHubUrl, toRawUrl };
|
package/dist/parsers/index.d.mts
CHANGED
|
@@ -1,133 +1,2 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
* Generic source parser interface.
|
|
4
|
-
* Parsers transform raw content into structured data with metadata.
|
|
5
|
-
*
|
|
6
|
-
* @template TData - The main data type (e.g., array of links)
|
|
7
|
-
* @template TMeta - Optional metadata type
|
|
8
|
-
*/
|
|
9
|
-
interface SourceParser<TData, TMeta = unknown> {
|
|
10
|
-
readonly name: string;
|
|
11
|
-
/**
|
|
12
|
-
* Check if this parser can handle the given content
|
|
13
|
-
*/
|
|
14
|
-
canParse(content: string, url?: string): boolean;
|
|
15
|
-
/**
|
|
16
|
-
* Parse the content and extract structured data
|
|
17
|
-
*/
|
|
18
|
-
parse(content: string, url?: string): ParserResult<TData, TMeta>;
|
|
19
|
-
}
|
|
20
|
-
/**
|
|
21
|
-
* Result from a parser
|
|
22
|
-
*/
|
|
23
|
-
interface ParserResult<TData, TMeta = unknown> {
|
|
24
|
-
data: TData;
|
|
25
|
-
meta?: TMeta;
|
|
26
|
-
}
|
|
27
|
-
/**
|
|
28
|
-
* Markdown link extracted from content
|
|
29
|
-
*/
|
|
30
|
-
interface MarkdownLink {
|
|
31
|
-
url: string;
|
|
32
|
-
text: string;
|
|
33
|
-
title?: string;
|
|
34
|
-
context?: string;
|
|
35
|
-
}
|
|
36
|
-
/**
|
|
37
|
-
* Markdown section (heading + content)
|
|
38
|
-
*/
|
|
39
|
-
interface MarkdownSection {
|
|
40
|
-
level: number;
|
|
41
|
-
title: string;
|
|
42
|
-
content: string;
|
|
43
|
-
links: MarkdownLink[];
|
|
44
|
-
}
|
|
45
|
-
/**
|
|
46
|
-
* Parsed markdown structure
|
|
47
|
-
*/
|
|
48
|
-
interface ParsedMarkdown {
|
|
49
|
-
title?: string;
|
|
50
|
-
description?: string;
|
|
51
|
-
sections: MarkdownSection[];
|
|
52
|
-
links: MarkdownLink[];
|
|
53
|
-
codeBlocks: CodeBlock[];
|
|
54
|
-
frontmatter?: Record<string, unknown>;
|
|
55
|
-
}
|
|
56
|
-
/**
|
|
57
|
-
* Code block from markdown
|
|
58
|
-
*/
|
|
59
|
-
interface CodeBlock {
|
|
60
|
-
language?: string;
|
|
61
|
-
code: string;
|
|
62
|
-
meta?: string;
|
|
63
|
-
}
|
|
64
|
-
/**
|
|
65
|
-
* GitHub repository metadata
|
|
66
|
-
*/
|
|
67
|
-
interface GitHubMeta {
|
|
68
|
-
repoOwner?: string;
|
|
69
|
-
repoName?: string;
|
|
70
|
-
stars?: number;
|
|
71
|
-
lastUpdated?: string;
|
|
72
|
-
}
|
|
73
|
-
//#endregion
|
|
74
|
-
//#region src/parsers/github.d.ts
|
|
75
|
-
/**
|
|
76
|
-
* GitHub-specific utilities for parsing repositories.
|
|
77
|
-
*/
|
|
78
|
-
/**
|
|
79
|
-
* Check if a URL is a GitHub repository
|
|
80
|
-
*/
|
|
81
|
-
declare function isGitHubRepo(url: string): boolean;
|
|
82
|
-
/**
|
|
83
|
-
* Extract GitHub repo info from URL
|
|
84
|
-
*/
|
|
85
|
-
declare function parseGitHubUrl(url: string): {
|
|
86
|
-
owner: string;
|
|
87
|
-
repo: string;
|
|
88
|
-
} | null;
|
|
89
|
-
/**
|
|
90
|
-
* Convert a GitHub repo URL to raw content URL
|
|
91
|
-
*/
|
|
92
|
-
declare function toRawUrl(url: string, branch?: string, file?: string): string;
|
|
93
|
-
/**
|
|
94
|
-
* Fetch GitHub API metadata for a repository
|
|
95
|
-
* Note: This is a placeholder - actual implementation would need GitHub API access
|
|
96
|
-
*/
|
|
97
|
-
declare function fetchRepoMeta(owner: string, repo: string, _token?: string): Promise<GitHubMeta>;
|
|
98
|
-
/**
|
|
99
|
-
* Group links by their category/section
|
|
100
|
-
*/
|
|
101
|
-
declare function groupByCategory(links: MarkdownLink[]): Map<string, MarkdownLink[]>;
|
|
102
|
-
//#endregion
|
|
103
|
-
//#region src/parsers/markdown.d.ts
|
|
104
|
-
/**
|
|
105
|
-
* Generic Markdown parser.
|
|
106
|
-
* Extracts structure, links, and code blocks from markdown content.
|
|
107
|
-
*
|
|
108
|
-
* @example
|
|
109
|
-
* ```ts
|
|
110
|
-
* const parser = new MarkdownParser();
|
|
111
|
-
* const result = parser.parse(markdownContent);
|
|
112
|
-
* console.log(result.data.sections);
|
|
113
|
-
* console.log(result.data.links);
|
|
114
|
-
* ```
|
|
115
|
-
*/
|
|
116
|
-
declare class MarkdownParser implements SourceParser<ParsedMarkdown> {
|
|
117
|
-
readonly name = "markdown";
|
|
118
|
-
canParse(content: string): boolean;
|
|
119
|
-
parse(content: string): ParserResult<ParsedMarkdown>;
|
|
120
|
-
private parseFrontmatter;
|
|
121
|
-
private extractDescription;
|
|
122
|
-
}
|
|
123
|
-
/**
|
|
124
|
-
* Extract links from a list-based markdown structure (like awesome lists)
|
|
125
|
-
*/
|
|
126
|
-
declare function extractListLinks(markdown: string): MarkdownLink[];
|
|
127
|
-
/**
|
|
128
|
-
* Parse markdown into sections by heading level
|
|
129
|
-
*/
|
|
130
|
-
declare function parseByHeadings(markdown: string, minLevel?: number): MarkdownSection[];
|
|
131
|
-
//#endregion
|
|
132
|
-
export { type CodeBlock, type GitHubMeta, type MarkdownLink, MarkdownParser, type MarkdownSection, type ParsedMarkdown, type ParserResult, type SourceParser, extractListLinks, fetchRepoMeta, groupByCategory, isGitHubRepo, parseByHeadings, parseGitHubUrl, toRawUrl };
|
|
133
|
-
//# sourceMappingURL=index.d.mts.map
|
|
1
|
+
import { _ as MarkdownSection, a as parseByHeadings, b as ParserResult, c as isGitHubRepo, d as CodeBlock, f as FeedEnclosure, g as MarkdownLink, h as GitHubMeta, i as extractListLinks, l as parseGitHubUrl, m as FeedMeta, n as RSSParserOptions, o as fetchRepoMeta, p as FeedItem, r as MarkdownParser, s as groupByCategory, t as RSSParser, u as toRawUrl, v as ParsedFeed, x as SourceParser, y as ParsedMarkdown } from "../index-CIFjNySr.mjs";
|
|
2
|
+
export { CodeBlock, FeedEnclosure, FeedItem, FeedMeta, GitHubMeta, MarkdownLink, MarkdownParser, MarkdownSection, ParsedFeed, ParsedMarkdown, ParserResult, RSSParser, RSSParserOptions, SourceParser, extractListLinks, fetchRepoMeta, groupByCategory, isGitHubRepo, parseByHeadings, parseGitHubUrl, toRawUrl };
|
package/dist/parsers/index.mjs
CHANGED
|
@@ -1,192 +1,3 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { toString } from "mdast-util-to-string";
|
|
3
|
-
import { visit } from "unist-util-visit";
|
|
1
|
+
import { a as fetchRepoMeta, c as parseGitHubUrl, i as parseByHeadings, l as toRawUrl, n as MarkdownParser, o as groupByCategory, r as extractListLinks, s as isGitHubRepo, t as RSSParser } from "../parsers-DsawHeo0.mjs";
|
|
4
2
|
|
|
5
|
-
|
|
6
|
-
/**
|
|
7
|
-
* GitHub-specific utilities for parsing repositories.
|
|
8
|
-
*/
|
|
9
|
-
/**
|
|
10
|
-
* Check if a URL is a GitHub repository
|
|
11
|
-
*/
|
|
12
|
-
function isGitHubRepo(url) {
|
|
13
|
-
return /^https?:\/\/(www\.)?github\.com\/[^/]+\/[^/]+\/?$/.test(url);
|
|
14
|
-
}
|
|
15
|
-
/**
|
|
16
|
-
* Extract GitHub repo info from URL
|
|
17
|
-
*/
|
|
18
|
-
function parseGitHubUrl(url) {
|
|
19
|
-
const match = url.match(/github\.com\/([^/]+)\/([^/]+)/);
|
|
20
|
-
if (!match || !match[1] || !match[2]) return null;
|
|
21
|
-
return {
|
|
22
|
-
owner: match[1],
|
|
23
|
-
repo: match[2].replace(/\.git$/, "")
|
|
24
|
-
};
|
|
25
|
-
}
|
|
26
|
-
/**
|
|
27
|
-
* Convert a GitHub repo URL to raw content URL
|
|
28
|
-
*/
|
|
29
|
-
function toRawUrl(url, branch = "main", file = "README.md") {
|
|
30
|
-
const info = parseGitHubUrl(url);
|
|
31
|
-
if (!info) return url;
|
|
32
|
-
return `https://raw.githubusercontent.com/${info.owner}/${info.repo}/${branch}/${file}`;
|
|
33
|
-
}
|
|
34
|
-
/**
|
|
35
|
-
* Fetch GitHub API metadata for a repository
|
|
36
|
-
* Note: This is a placeholder - actual implementation would need GitHub API access
|
|
37
|
-
*/
|
|
38
|
-
async function fetchRepoMeta(owner, repo, _token) {
|
|
39
|
-
return {
|
|
40
|
-
repoOwner: owner,
|
|
41
|
-
repoName: repo
|
|
42
|
-
};
|
|
43
|
-
}
|
|
44
|
-
/**
|
|
45
|
-
* Group links by their category/section
|
|
46
|
-
*/
|
|
47
|
-
function groupByCategory(links) {
|
|
48
|
-
const groups = /* @__PURE__ */ new Map();
|
|
49
|
-
for (const link of links) {
|
|
50
|
-
const category = link.context || "Uncategorized";
|
|
51
|
-
const existing = groups.get(category) || [];
|
|
52
|
-
existing.push(link);
|
|
53
|
-
groups.set(category, existing);
|
|
54
|
-
}
|
|
55
|
-
return groups;
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
//#endregion
|
|
59
|
-
//#region src/parsers/markdown.ts
|
|
60
|
-
/**
|
|
61
|
-
* Generic Markdown parser.
|
|
62
|
-
* Extracts structure, links, and code blocks from markdown content.
|
|
63
|
-
*
|
|
64
|
-
* @example
|
|
65
|
-
* ```ts
|
|
66
|
-
* const parser = new MarkdownParser();
|
|
67
|
-
* const result = parser.parse(markdownContent);
|
|
68
|
-
* console.log(result.data.sections);
|
|
69
|
-
* console.log(result.data.links);
|
|
70
|
-
* ```
|
|
71
|
-
*/
|
|
72
|
-
var MarkdownParser = class {
|
|
73
|
-
name = "markdown";
|
|
74
|
-
canParse(content) {
|
|
75
|
-
return content.includes("# ") || content.includes("## ") || content.includes("- [") || content.includes("* [") || content.includes("```");
|
|
76
|
-
}
|
|
77
|
-
parse(content) {
|
|
78
|
-
const tree = fromMarkdown(content);
|
|
79
|
-
const sections = [];
|
|
80
|
-
const allLinks = [];
|
|
81
|
-
const codeBlocks = [];
|
|
82
|
-
let frontmatter;
|
|
83
|
-
if (content.startsWith("---")) {
|
|
84
|
-
const endIndex = content.indexOf("---", 3);
|
|
85
|
-
if (endIndex !== -1) {
|
|
86
|
-
const frontmatterContent = content.slice(3, endIndex).trim();
|
|
87
|
-
frontmatter = this.parseFrontmatter(frontmatterContent);
|
|
88
|
-
}
|
|
89
|
-
}
|
|
90
|
-
let currentSection = null;
|
|
91
|
-
visit(tree, (node) => {
|
|
92
|
-
if (node.type === "heading") {
|
|
93
|
-
const heading = node;
|
|
94
|
-
const title = toString(heading);
|
|
95
|
-
if (currentSection) sections.push(currentSection);
|
|
96
|
-
currentSection = {
|
|
97
|
-
level: heading.depth,
|
|
98
|
-
title,
|
|
99
|
-
content: "",
|
|
100
|
-
links: []
|
|
101
|
-
};
|
|
102
|
-
}
|
|
103
|
-
if (node.type === "link") {
|
|
104
|
-
const link = node;
|
|
105
|
-
const text = toString(link);
|
|
106
|
-
const linkData = {
|
|
107
|
-
url: link.url,
|
|
108
|
-
text,
|
|
109
|
-
title: link.title ?? void 0,
|
|
110
|
-
context: currentSection?.title
|
|
111
|
-
};
|
|
112
|
-
allLinks.push(linkData);
|
|
113
|
-
if (currentSection) currentSection.links.push(linkData);
|
|
114
|
-
}
|
|
115
|
-
if (node.type === "code") {
|
|
116
|
-
const code = node;
|
|
117
|
-
codeBlocks.push({
|
|
118
|
-
language: code.lang ?? void 0,
|
|
119
|
-
code: code.value,
|
|
120
|
-
meta: code.meta ?? void 0
|
|
121
|
-
});
|
|
122
|
-
}
|
|
123
|
-
if (currentSection && node.type === "paragraph") {
|
|
124
|
-
const text = toString(node);
|
|
125
|
-
currentSection.content += (currentSection.content ? "\n\n" : "") + text;
|
|
126
|
-
}
|
|
127
|
-
});
|
|
128
|
-
if (currentSection) sections.push(currentSection);
|
|
129
|
-
return { data: {
|
|
130
|
-
title: frontmatter?.title ?? sections.find((s) => s.level === 1)?.title,
|
|
131
|
-
description: frontmatter?.description ?? this.extractDescription(tree),
|
|
132
|
-
sections,
|
|
133
|
-
links: allLinks,
|
|
134
|
-
codeBlocks,
|
|
135
|
-
frontmatter
|
|
136
|
-
} };
|
|
137
|
-
}
|
|
138
|
-
parseFrontmatter(content) {
|
|
139
|
-
const result = {};
|
|
140
|
-
const lines = content.split("\n");
|
|
141
|
-
for (const line of lines) {
|
|
142
|
-
const colonIndex = line.indexOf(":");
|
|
143
|
-
if (colonIndex > 0) {
|
|
144
|
-
const key = line.slice(0, colonIndex).trim();
|
|
145
|
-
let value = line.slice(colonIndex + 1).trim();
|
|
146
|
-
if (value === "true") value = true;
|
|
147
|
-
else if (value === "false") value = false;
|
|
148
|
-
else if (/^-?\d+(\.\d+)?$/.test(value)) value = Number(value);
|
|
149
|
-
else if (value.startsWith("\"") && value.endsWith("\"")) value = value.slice(1, -1);
|
|
150
|
-
else if (value.startsWith("'") && value.endsWith("'")) value = value.slice(1, -1);
|
|
151
|
-
result[key] = value;
|
|
152
|
-
}
|
|
153
|
-
}
|
|
154
|
-
return result;
|
|
155
|
-
}
|
|
156
|
-
extractDescription(tree) {
|
|
157
|
-
for (const node of tree.children) {
|
|
158
|
-
if (node.type === "heading") break;
|
|
159
|
-
if (node.type === "paragraph") return toString(node);
|
|
160
|
-
}
|
|
161
|
-
}
|
|
162
|
-
};
|
|
163
|
-
/**
|
|
164
|
-
* Extract links from a list-based markdown structure (like awesome lists)
|
|
165
|
-
*/
|
|
166
|
-
function extractListLinks(markdown) {
|
|
167
|
-
const tree = fromMarkdown(markdown);
|
|
168
|
-
const links = [];
|
|
169
|
-
let currentHeading = "";
|
|
170
|
-
visit(tree, (node) => {
|
|
171
|
-
if (node.type === "heading") currentHeading = toString(node);
|
|
172
|
-
if (node.type === "listItem") visit(node, "link", (linkNode) => {
|
|
173
|
-
links.push({
|
|
174
|
-
url: linkNode.url,
|
|
175
|
-
text: toString(linkNode),
|
|
176
|
-
title: linkNode.title ?? void 0,
|
|
177
|
-
context: currentHeading || void 0
|
|
178
|
-
});
|
|
179
|
-
});
|
|
180
|
-
});
|
|
181
|
-
return links;
|
|
182
|
-
}
|
|
183
|
-
/**
|
|
184
|
-
* Parse markdown into sections by heading level
|
|
185
|
-
*/
|
|
186
|
-
function parseByHeadings(markdown, minLevel = 2) {
|
|
187
|
-
return new MarkdownParser().parse(markdown).data.sections.filter((s) => s.level >= minLevel);
|
|
188
|
-
}
|
|
189
|
-
|
|
190
|
-
//#endregion
|
|
191
|
-
export { MarkdownParser, extractListLinks, fetchRepoMeta, groupByCategory, isGitHubRepo, parseByHeadings, parseGitHubUrl, toRawUrl };
|
|
192
|
-
//# sourceMappingURL=index.mjs.map
|
|
3
|
+
export { MarkdownParser, RSSParser, extractListLinks, fetchRepoMeta, groupByCategory, isGitHubRepo, parseByHeadings, parseGitHubUrl, toRawUrl };
|