@mytechtoday/url-reference-mapper 1.3.1 → 2.0.0-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
1
/**
 * Metadata collected from a single content file by `extractMetadata`.
 *
 * Every field is optional; the extractors only set a field when they found
 * something to put in it.
 */
export interface ExtractedMetadata {
    // --- Size ---------------------------------------------------------
    /** Number of whitespace-separated tokens in the body text. */
    wordCount?: number;
    /** Estimated reading time in minutes, rounded up. */
    readingTime?: number;

    // --- Descriptive text ---------------------------------------------
    /** First sentence of the first suitable paragraph, or the HTML meta description. */
    summary?: string;
    /** Leading paragraphs joined into a 200-900 character digest. */
    tldr?: string;
    /** Block-quoted passages found in the content. */
    quotes?: Array<string>;

    // --- Classification (raw, unsplit strings) --------------------------
    /** Tag list as written in front matter or the HTML keywords meta tag. */
    tags?: string;
    /** Category list as written in front matter. */
    categories?: string;

    // --- Authorship -----------------------------------------------------
    /** Author as written in front matter or the HTML author meta tag. */
    author?: string;
    /** NOTE(review): not populated by the extractor code shown with this declaration - confirm producer. */
    authorImage?: string;
    /** NOTE(review): not populated by the extractor code shown with this declaration - confirm producer. */
    authorUrl?: string;

    // --- References -----------------------------------------------------
    /** Absolute http(s) image URLs found in the content. */
    featuredImages?: Array<string>;
    /** Absolute http(s) links that start with the configured base URL. */
    internalLinks?: Array<string>;
    /** Absolute http(s) links that do not start with the configured base URL. */
    externalLinks?: Array<string>;
    /** NOTE(review): not populated by the extractor code shown with this declaration - confirm producer. */
    relatedPosts?: Array<string>;
}
17
/**
 * Tuning knobs accepted by `extractMetadata`. All fields are optional.
 */
export interface ExtractionConfig {
    /** Base URL used to classify absolute links as internal; without it every link is external. */
    baseUrl?: string;
    /** Reading speed in words per minute used for `readingTime`; the extractors fall back to 225. */
    readingSpeed?: number;
    /** Maximum number of quotes to collect; the extractors fall back to 10. */
    maxQuotes?: number;
    /** Maximum number of links per list (internal and external are capped separately); falls back to 10. */
    maxLinks?: number;
    /** NOTE(review): not read by the extractor code shown with this declaration - confirm consumer. */
    maxTags?: number;
}
24
/**
 * Reads a content file and extracts metadata from it.
 *
 * The extraction strategy is picked from the file extension: `.md`/`.markdown`
 * are treated as Markdown, `.html`/`.htm` as HTML, and everything else as
 * plain text.
 *
 * @param filePath - Path of the file to read (decoded as UTF-8).
 * @param config - Optional extraction limits and base URL.
 * @returns The metadata that could be extracted; absent fields were not found.
 * @throws Error with message `File not found: <filePath>` when the file does not exist.
 */
export declare function extractMetadata(
    filePath: string,
    config?: ExtractionConfig,
): Promise<ExtractedMetadata>;
25
+ //# sourceMappingURL=extractors.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extractors.d.ts","sourceRoot":"","sources":["../src/extractors.ts"],"names":[],"mappings":"AAMA,MAAM,WAAW,iBAAiB;IAChC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;IAC1B,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC;IAClB,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;IACzB,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;IACzB,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;CACzB;AAKD,MAAM,WAAW,gBAAgB;IAE/B,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AASD,wBAAsB,eAAe,CACnC,QAAQ,EAAE,MAAM,EAChB,MAAM,GAAE,gBAAqB,GAC5B,OAAO,CAAC,iBAAiB,CAAC,CAoB5B"}
@@ -0,0 +1,320 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || function (mod) {
19
+ if (mod && mod.__esModule) return mod;
20
+ var result = {};
21
+ if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
22
+ __setModuleDefault(result, mod);
23
+ return result;
24
+ };
25
+ Object.defineProperty(exports, "__esModule", { value: true });
26
+ exports.extractMetadata = void 0;
27
+ const fs = __importStar(require("fs"));
28
+ const path = __importStar(require("path"));
29
/**
 * Reads a content file and extracts metadata using a strategy chosen by the
 * file extension (case-insensitive): Markdown for `.md`/`.markdown`, HTML for
 * `.html`/`.htm`, plain text for everything else.
 *
 * The file is read asynchronously in a single step. The previous
 * `existsSync` + `readFileSync` pair blocked the event loop inside an async
 * function and left a window in which the file could vanish between the
 * check and the read.
 *
 * @param {string} filePath Path of the file to read (decoded as UTF-8).
 * @param {object} [config={}] Extraction options (reading speed, limits, base URL).
 * @returns {Promise<object>} The extracted metadata.
 * @throws {Error} `File not found: <filePath>` when the path does not exist;
 *   any other I/O error (e.g. EISDIR, EACCES) is rethrown unchanged.
 */
async function extractMetadata(filePath, config = {}) {
    let content;
    try {
        content = await fs.promises.readFile(filePath, 'utf-8');
    }
    catch (error) {
        // ENOENT/ENOTDIR are the cases in which existsSync() used to return false.
        if (error && (error.code === 'ENOENT' || error.code === 'ENOTDIR')) {
            throw new Error(`File not found: ${filePath}`);
        }
        throw error;
    }
    switch (path.extname(filePath).toLowerCase()) {
        case '.md':
        case '.markdown':
            return extractFromMarkdown(content, config);
        case '.html':
        case '.htm':
            return extractFromHtml(content, config);
        default:
            // '.txt' and unknown extensions are both handled as plain text.
            return extractFromText(content, config);
    }
}
48
+ exports.extractMetadata = extractMetadata;
49
/**
 * Extracts metadata from Markdown content.
 *
 * Front matter (a leading `---` fenced block) supplies `author`, `categories`
 * and `tags` as raw strings; the remaining body supplies word count, reading
 * time, quotes, links, images, summary and TL;DR.
 *
 * Fixes over the previous version:
 * - front matter is recognised with CRLF line endings and when the closing
 *   fence is the last line of the file, and it is stripped with the same
 *   match that parsed it (the two regexes used to disagree);
 * - keys are anchored to the start of a line, so `co-author:` no longer
 *   satisfies `author:`;
 * - `category:` is accepted (the old `categories?` pattern only matched
 *   `categorie:` / `categories:`);
 * - an explicit limit of 0 for `maxQuotes` / `maxLinks` is honoured instead
 *   of being replaced by the default.
 *
 * @param {string} content Raw Markdown text.
 * @param {object} config Extraction options.
 * @returns {object} Metadata object containing only the fields that were found.
 */
function extractFromMarkdown(content, config) {
    const metadata = {};
    let textContent = content;
    const frontmatterMatch = content.match(/^---\r?\n([\s\S]*?)\r?\n---[ \t]*(?:\r?\n|$)/);
    if (frontmatterMatch) {
        const frontmatter = frontmatterMatch[1];
        const authorMatch = frontmatter.match(/^[ \t]*author:\s*(.+)/im);
        if (authorMatch)
            metadata.author = authorMatch[1].trim();
        const categoriesMatch = frontmatter.match(/^[ \t]*categor(?:y|ies?):\s*(.+)/im);
        if (categoriesMatch)
            metadata.categories = categoriesMatch[1].trim();
        const tagsMatch = frontmatter.match(/^[ \t]*tags?:\s*(.+)/im);
        if (tagsMatch)
            metadata.tags = tagsMatch[1].trim();
        // Drop exactly what was parsed so body statistics never include front matter.
        textContent = content.slice(frontmatterMatch[0].length);
    }
    const wordCount = countWords(textContent);
    metadata.wordCount = wordCount;
    // `||` is deliberate here: a speed of 0 is meaningless and must fall back.
    metadata.readingTime = calculateReadingTime(wordCount, config.readingSpeed || 225);
    const quotes = extractQuotes(textContent, config.maxQuotes ?? 10);
    if (quotes.length > 0)
        metadata.quotes = quotes;
    const { internalLinks, externalLinks } = extractLinks(textContent, config.baseUrl, config.maxLinks ?? 10);
    if (internalLinks.length > 0)
        metadata.internalLinks = internalLinks;
    if (externalLinks.length > 0)
        metadata.externalLinks = externalLinks;
    const images = extractMarkdownImages(textContent);
    if (images.length > 0)
        metadata.featuredImages = images;
    const summary = extractSummary(textContent);
    if (summary)
        metadata.summary = summary;
    const tldr = extractTldr(textContent);
    if (tldr)
        metadata.tldr = tldr;
    return metadata;
}
87
/**
 * Extracts metadata from an HTML document using lightweight pattern matching.
 *
 * `author`, `summary` (meta description) and `tags` (meta keywords) come from
 * `<meta name="…" content="…">` tags; word count, reading time and TL;DR come
 * from the tag-stripped text; links, images and quotes come from the markup.
 *
 * Fixes over the previous version:
 * - meta tags are read regardless of attribute order, and a value is no
 *   longer cut at the first quote character of the other kind
 *   (`content="Don't panic"` used to yield `Don`);
 * - `<script>`, `<style>` and `<noscript>` contents and HTML comments are
 *   removed before counting words, matching what the cheerio-based parser in
 *   this package does;
 * - an explicit limit of 0 for `maxQuotes` / `maxLinks` is honoured.
 *
 * @param {string} content Raw HTML.
 * @param {object} config Extraction options.
 * @returns {object} Metadata object containing only the fields that were found.
 */
function extractFromHtml(content, config) {
    const metadata = {};
    // Returns the non-empty `content` of the first <meta> whose `name` equals `wanted` (case-insensitive).
    const readMetaContent = (wanted) => {
        const tagRegex = /<meta\b(?:[^>"']|"[^"]*"|'[^']*')*>/gi;
        let tag;
        while ((tag = tagRegex.exec(content)) !== null) {
            const attributes = {};
            const attributeRegex = /([\w:-]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;
            let attribute;
            while ((attribute = attributeRegex.exec(tag[0])) !== null) {
                attributes[attribute[1].toLowerCase()] = attribute[2] !== undefined ? attribute[2] : attribute[3];
            }
            if (attributes.name && attributes.name.toLowerCase() === wanted && attributes.content) {
                return attributes.content;
            }
        }
        return undefined;
    };
    const author = readMetaContent('author');
    if (author)
        metadata.author = author;
    const description = readMetaContent('description');
    if (description)
        metadata.summary = description;
    const keywords = readMetaContent('keywords');
    if (keywords)
        metadata.tags = keywords;
    const textContent = content
        // Non-visible content must not inflate the word count.
        .replace(/<(script|style|noscript)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, ' ')
        .replace(/<!--[\s\S]*?-->/g, ' ')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .trim();
    const wordCount = countWords(textContent);
    metadata.wordCount = wordCount;
    // `||` is deliberate here: a speed of 0 is meaningless and must fall back.
    metadata.readingTime = calculateReadingTime(wordCount, config.readingSpeed || 225);
    const { internalLinks, externalLinks } = extractHtmlLinks(content, config.baseUrl, config.maxLinks ?? 10);
    if (internalLinks.length > 0)
        metadata.internalLinks = internalLinks;
    if (externalLinks.length > 0)
        metadata.externalLinks = externalLinks;
    const images = extractHtmlImages(content);
    if (images.length > 0)
        metadata.featuredImages = images;
    const quotes = extractHtmlQuotes(content, config.maxQuotes ?? 10);
    if (quotes.length > 0)
        metadata.quotes = quotes;
    const tldr = extractTldr(textContent);
    if (tldr)
        metadata.tldr = tldr;
    return metadata;
}
118
/**
 * Extracts metadata from plain text: word count, reading time, `>`-prefixed
 * quotes, bare http(s) URLs, a one-sentence summary and a TL;DR.
 *
 * Fix over the previous version: an explicit limit of 0 for `maxQuotes` /
 * `maxLinks` is honoured (`||` used to replace it with the default of 10).
 *
 * @param {string} content Raw text.
 * @param {object} config Extraction options.
 * @returns {object} Metadata object containing only the fields that were found.
 */
function extractFromText(content, config) {
    const metadata = {};
    const wordCount = countWords(content);
    metadata.wordCount = wordCount;
    // `||` is deliberate here: a speed of 0 is meaningless and must fall back.
    metadata.readingTime = calculateReadingTime(wordCount, config.readingSpeed || 225);
    const quotes = extractQuotes(content, config.maxQuotes ?? 10);
    if (quotes.length > 0)
        metadata.quotes = quotes;
    const { internalLinks, externalLinks } = extractTextLinks(content, config.baseUrl, config.maxLinks ?? 10);
    if (internalLinks.length > 0)
        metadata.internalLinks = internalLinks;
    if (externalLinks.length > 0)
        metadata.externalLinks = externalLinks;
    const summary = extractSummary(content);
    if (summary)
        metadata.summary = summary;
    const tldr = extractTldr(content);
    if (tldr)
        metadata.tldr = tldr;
    return metadata;
}
139
/**
 * Counts the whitespace-separated tokens in a string.
 *
 * @param {string} text Text to measure.
 * @returns {number} Number of maximal runs of non-whitespace characters (0 for blank input).
 */
function countWords(text) {
    const tokens = text.match(/\S+/g);
    return tokens === null ? 0 : tokens.length;
}
143
/**
 * Estimates the reading time for a text.
 *
 * A reading speed that is zero, negative, NaN or not a number is replaced by
 * the package default of 225 words per minute; previously such values
 * produced `Infinity`, `NaN` or a negative duration.
 *
 * @param {number} wordCount Number of words in the text.
 * @param {number} wordsPerMinute Reading speed in words per minute.
 * @returns {number} Whole minutes, rounded up.
 */
function calculateReadingTime(wordCount, wordsPerMinute) {
    const DEFAULT_WORDS_PER_MINUTE = 225;
    const speed = Number.isFinite(wordsPerMinute) && wordsPerMinute > 0
        ? wordsPerMinute
        : DEFAULT_WORDS_PER_MINUTE;
    return Math.ceil(wordCount / speed);
}
146
/**
 * Collects Markdown-style block quotes (`>`-prefixed lines).
 *
 * Consecutive quote lines are merged into one quote, joined by single spaces.
 *
 * Fixes over the previous version:
 * - the `>` marker is stripped from the trimmed line, so indented quote lines
 *   (`  > text`) no longer keep their marker in the output;
 * - a quote block consisting only of bare `>` lines no longer yields an
 *   empty-string quote;
 * - CRLF line endings are handled.
 *
 * @param {string} content Text to scan.
 * @param {number} maxQuotes Maximum number of quotes to return.
 * @returns {string[]} Quotes in document order, at most `maxQuotes` of them.
 */
function extractQuotes(content, maxQuotes) {
    const quotes = [];
    let parts = [];
    for (const rawLine of content.split(/\r?\n/)) {
        const line = rawLine.trim();
        if (line.startsWith('>')) {
            const text = line.replace(/^>\s*/, '');
            if (text.length > 0)
                parts.push(text);
            continue;
        }
        // First non-quote line after a quote block closes that block.
        if (parts.length > 0) {
            quotes.push(parts.join(' '));
            parts = [];
            if (quotes.length >= maxQuotes)
                break;
        }
    }
    // Content may end while still inside a quote block.
    if (parts.length > 0 && quotes.length < maxQuotes) {
        quotes.push(parts.join(' '));
    }
    return quotes.slice(0, maxQuotes);
}
166
/**
 * Collects the text of `<blockquote>` elements from HTML.
 *
 * Inner tags are replaced by spaces and whitespace is collapsed; block quotes
 * that end up empty are skipped.
 *
 * @param {string} content Raw HTML.
 * @param {number} maxQuotes Maximum number of quotes to return.
 * @returns {string[]} Quote texts in document order, at most `maxQuotes` of them.
 */
function extractHtmlQuotes(content, maxQuotes) {
    const collected = [];
    for (const [, inner] of content.matchAll(/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi)) {
        // Negated "<" keeps the exact stop condition for non-numeric limits.
        if (!(collected.length < maxQuotes)) {
            break;
        }
        const flattened = inner.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
        if (flattened.length > 0) {
            collected.push(flattened);
        }
    }
    return collected.slice(0, maxQuotes);
}
181
/**
 * Collects absolute http(s) links from Markdown inline links `[text](url)`
 * and splits them into internal and external lists.
 *
 * Relative links and other schemes are ignored. Each list is capped at
 * `maxLinks` independently.
 *
 * Fixes over the previous version:
 * - image syntax `![alt](url)` is no longer reported as a link;
 * - an optional link title (`[t](url "Title")`) and `<…>` wrapping are
 *   removed from the URL;
 * - a link is internal only when `baseUrl` is followed by the end of the URL
 *   or by `/`, `?` or `#`, so `https://example.com.evil.org` is no longer
 *   internal to `https://example.com`.
 *
 * @param {string} content Markdown text.
 * @param {string|undefined} baseUrl Site base URL; when falsy every link is external.
 * @param {number} maxLinks Maximum entries per list.
 * @returns {{internalLinks: string[], externalLinks: string[]}} Links in document order.
 */
function extractLinks(content, baseUrl, maxLinks) {
    const internalLinks = [];
    const externalLinks = [];
    const isInternal = (url) => {
        if (!baseUrl || !url.startsWith(baseUrl))
            return false;
        if (url.length === baseUrl.length || baseUrl.endsWith('/'))
            return true;
        return '/?#'.includes(url.charAt(baseUrl.length));
    };
    // The lookbehind rejects "![" so images are left to extractMarkdownImages().
    const linkRegex = /(?<!!)\[([^\]]+)\]\(([^)]+)\)/g;
    let match;
    while ((match = linkRegex.exec(content)) !== null) {
        // Destination is the first whitespace-delimited token; anything after it is the title.
        const url = match[2].trim().split(/\s+/)[0].replace(/^<|>$/g, '');
        if (url.startsWith('http://') || url.startsWith('https://')) {
            const bucket = isInternal(url) ? internalLinks : externalLinks;
            if (bucket.length < maxLinks)
                bucket.push(url);
        }
        if (internalLinks.length >= maxLinks && externalLinks.length >= maxLinks)
            break;
    }
    return { internalLinks, externalLinks };
}
203
/**
 * Collects absolute http(s) links from `<a href="…">` tags and splits them
 * into internal and external lists, each capped at `maxLinks`.
 *
 * Fixes over the previous version:
 * - only a real `href` attribute is read (`data-href="…"` used to match);
 * - the value ends at the matching quote, so a URL containing the other
 *   quote character is no longer truncated;
 * - a link is internal only when `baseUrl` is followed by the end of the URL
 *   or by `/`, `?` or `#`, so `https://example.com.evil.org` is no longer
 *   internal to `https://example.com`.
 *
 * @param {string} content Raw HTML.
 * @param {string|undefined} baseUrl Site base URL; when falsy every link is external.
 * @param {number} maxLinks Maximum entries per list.
 * @returns {{internalLinks: string[], externalLinks: string[]}} Links in document order.
 */
function extractHtmlLinks(content, baseUrl, maxLinks) {
    const internalLinks = [];
    const externalLinks = [];
    const isInternal = (url) => {
        if (!baseUrl || !url.startsWith(baseUrl))
            return false;
        if (url.length === baseUrl.length || baseUrl.endsWith('/'))
            return true;
        return '/?#'.includes(url.charAt(baseUrl.length));
    };
    // `href` must directly follow whitespace, which rules out `data-href` and similar.
    const linkRegex = /<a\s(?:[^>]*?\s)?href\s*=\s*(["'])(.*?)\1/gi;
    let match;
    while ((match = linkRegex.exec(content)) !== null) {
        const url = match[2];
        if (url.startsWith('http://') || url.startsWith('https://')) {
            const bucket = isInternal(url) ? internalLinks : externalLinks;
            if (bucket.length < maxLinks)
                bucket.push(url);
        }
        if (internalLinks.length >= maxLinks && externalLinks.length >= maxLinks)
            break;
    }
    return { internalLinks, externalLinks };
}
225
/**
 * Collects bare http(s) URLs from plain text and splits them into internal
 * and external lists, each capped at `maxLinks`.
 *
 * Fixes over the previous version:
 * - trailing sentence punctuation (`. , ; : ! ? ' "`) and an unbalanced
 *   closing `)` are removed, so "see https://example.com/page." yields the
 *   URL without the full stop;
 * - a link is internal only when `baseUrl` is followed by the end of the URL
 *   or by `/`, `?` or `#`, so `https://example.com.evil.org` is no longer
 *   internal to `https://example.com`.
 *
 * @param {string} content Plain text.
 * @param {string|undefined} baseUrl Site base URL; when falsy every link is external.
 * @param {number} maxLinks Maximum entries per list.
 * @returns {{internalLinks: string[], externalLinks: string[]}} Links in document order.
 */
function extractTextLinks(content, baseUrl, maxLinks) {
    const internalLinks = [];
    const externalLinks = [];
    const isInternal = (url) => {
        if (!baseUrl || !url.startsWith(baseUrl))
            return false;
        if (url.length === baseUrl.length || baseUrl.endsWith('/'))
            return true;
        return '/?#'.includes(url.charAt(baseUrl.length));
    };
    // Peels punctuation that belongs to the surrounding sentence, not the URL.
    const trimTrailingPunctuation = (raw) => {
        let url = raw;
        let previous;
        do {
            previous = url;
            url = url.replace(/[.,;:!?'"]+$/, '');
            // Keep ")" when the URL itself contains "(" (e.g. wiki-style paths).
            if (url.endsWith(')') && !url.includes('('))
                url = url.slice(0, -1);
        } while (url !== previous);
        return url;
    };
    const urlRegex = /https?:\/\/[^\s<>"]+/g;
    let match;
    while ((match = urlRegex.exec(content)) !== null) {
        const url = trimTrailingPunctuation(match[0]);
        // Nothing but the scheme left: not a usable link.
        if (/^https?:\/\/$/.test(url))
            continue;
        const bucket = isInternal(url) ? internalLinks : externalLinks;
        if (bucket.length < maxLinks)
            bucket.push(url);
        if (internalLinks.length >= maxLinks && externalLinks.length >= maxLinks)
            break;
    }
    return { internalLinks, externalLinks };
}
245
/**
 * Collects absolute http(s) image URLs from Markdown image syntax
 * `![alt](url)`. Relative image paths are ignored.
 *
 * Fix over the previous version: an optional image title
 * (`![alt](url "Title")`) and `<…>` wrapping are removed, so only the
 * destination URL is returned.
 *
 * @param {string} content Markdown text.
 * @returns {string[]} Image URLs in document order.
 */
function extractMarkdownImages(content) {
    const images = [];
    const imageRegex = /!\[([^\]]*)\]\(([^)]+)\)/g;
    let match;
    while ((match = imageRegex.exec(content)) !== null) {
        // Destination is the first whitespace-delimited token; anything after it is the title.
        const url = match[2].trim().split(/\s+/)[0].replace(/^<|>$/g, '');
        if (url.startsWith('http://') || url.startsWith('https://')) {
            images.push(url);
        }
    }
    return images;
}
257
/**
 * Collects absolute http(s) image URLs from `<img src="…">` tags. Relative
 * sources are ignored.
 *
 * Fixes over the previous version:
 * - only a real `src` attribute is read; the old pattern also matched
 *   `data-src="…"` and, scanning greedily, picked whichever `src=` came last
 *   in the tag;
 * - the value ends at the matching quote, so a URL containing the other
 *   quote character is no longer truncated.
 *
 * @param {string} content Raw HTML.
 * @returns {string[]} Image URLs in document order.
 */
function extractHtmlImages(content) {
    const images = [];
    // `src` must directly follow whitespace, which rules out `data-src` and similar.
    const imageRegex = /<img\s(?:[^>]*?\s)?src\s*=\s*(["'])(.*?)\1/gi;
    let match;
    while ((match = imageRegex.exec(content)) !== null) {
        const url = match[2];
        if (url.startsWith('http://') || url.startsWith('https://')) {
            images.push(url);
        }
    }
    return images;
}
269
/**
 * Produces a one-sentence summary: the first sentence of the first paragraph
 * that is not a heading and is between 51 and 499 characters long, provided
 * the sentence body is longer than 30 characters.
 *
 * Markdown images are removed, links are reduced to their text, emphasis and
 * code markers and block-quote prefixes are stripped, and whitespace is
 * collapsed.
 *
 * Fixes over the previous version:
 * - the sentence keeps its own terminator; a paragraph that is a single
 *   sentence ending in "." no longer comes back with a doubled period, and
 *   `!` / `?` are no longer replaced by ".";
 * - images are removed before links are rewritten (otherwise `![alt](u)`
 *   leaked into the summary as `!alt`);
 * - CRLF line endings are handled when stripping front matter and splitting
 *   paragraphs.
 *
 * @param {string} content Markdown or plain text, optionally with front matter.
 * @returns {string|undefined} The summary sentence, or `undefined` if none qualifies.
 */
function extractSummary(content) {
    const textContent = content.replace(/^---\r?\n[\s\S]*?\r?\n---\r?\n/, '');
    const paragraphs = textContent
        .split(/(?:\r?\n){2,}/)
        .map(p => p.trim())
        .filter(p => p.length > 0);
    for (const para of paragraphs) {
        if (para.startsWith('#') || para.length <= 50 || para.length >= 500)
            continue;
        const cleaned = para
            .replace(/!\[[^\]]*\]\([^)]+\)/g, '')
            .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
            .replace(/[*_`]/g, '')
            .replace(/^>\s*/gm, '')
            .replace(/\s+/g, ' ')
            .trim();
        // A sentence ends at the first . ! or ? that is followed by whitespace or the end.
        const sentenceMatch = cleaned.match(/^.*?[.!?](?=\s|$)/);
        const sentence = sentenceMatch ? sentenceMatch[0] : cleaned;
        const terminated = /[.!?]$/.test(sentence);
        const bodyLength = terminated ? sentence.length - 1 : sentence.length;
        if (bodyLength > 30) {
            return terminated ? sentence : sentence + '.';
        }
    }
    return undefined;
}
287
/**
 * Builds a TL;DR by concatenating leading non-heading paragraphs until at
 * least 200 characters have been gathered; the result is cut to 900
 * characters (897 plus "...") when longer.
 *
 * Markdown images are removed, links are reduced to their text, and emphasis
 * and code markers and block-quote prefixes are stripped from each paragraph.
 *
 * Fixes over the previous version:
 * - images are removed before links are rewritten; in the old order the
 *   link rule had already turned `![alt](url)` into `!alt`, so the image
 *   rule never matched and `!alt` leaked into the text;
 * - CRLF line endings are handled when stripping front matter and splitting
 *   paragraphs.
 *
 * @param {string} content Markdown or plain text, optionally with front matter.
 * @returns {string|undefined} The TL;DR, or `undefined` when fewer than 200
 *   characters of usable text exist.
 */
function extractTldr(content) {
    const textContent = content.replace(/^---\r?\n[\s\S]*?\r?\n---\r?\n/, '');
    const paragraphs = textContent
        .split(/(?:\r?\n){2,}/)
        .map(p => p.trim())
        .filter(p => p.length > 0 && !p.startsWith('#'));
    if (paragraphs.length === 0)
        return undefined;
    let tldr = '';
    for (const para of paragraphs) {
        const cleaned = para
            .replace(/!\[([^\]]*)\]\([^)]+\)/g, '')
            .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
            .replace(/[*_`]/g, '')
            .replace(/^>\s*/gm, '')
            .trim();
        if (cleaned.length === 0)
            continue;
        tldr = tldr.length === 0 ? cleaned : `${tldr} ${cleaned}`;
        if (tldr.length >= 200)
            break;
    }
    if (tldr.length > 900) {
        tldr = tldr.substring(0, 897) + '...';
    }
    return tldr.length >= 200 ? tldr : undefined;
}
320
+ //# sourceMappingURL=extractors.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extractors.js","sourceRoot":"","sources":["../src/extractors.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,uCAAyB;AACzB,2CAA6B;AA6CtB,KAAK,UAAU,eAAe,CACnC,QAAgB,EAChB,SAA2B,EAAE;IAE7B,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC7B,MAAM,IAAI,KAAK,CAAC,mBAAmB,QAAQ,EAAE,CAAC,CAAC;IACjD,CAAC;IAED,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACnD,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IAEjD,QAAQ,GAAG,EAAE,CAAC;QACZ,KAAK,KAAK,CAAC;QACX,KAAK,WAAW;YACd,OAAO,mBAAmB,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;QAC9C,KAAK,OAAO,CAAC;QACb,KAAK,MAAM;YACT,OAAO,eAAe,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;QAC1C,KAAK,MAAM;YACT,OAAO,eAAe,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;QAC1C;YACE,OAAO,eAAe,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;IAC5C,CAAC;AACH,CAAC;AAvBD,0CAuBC;AAKD,SAAS,mBAAmB,CAAC,OAAe,EAAE,MAAwB;IACpE,MAAM,QAAQ,GAAsB,EAAE,CAAC;IAGvC,MAAM,gBAAgB,GAAG,OAAO,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAC;IAChE,IAAI,gBAAgB,EAAE,CAAC;QACrB,MAAM,WAAW,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAC;QAGxC,MAAM,WAAW,GAAG,WAAW,CAAC,KAAK,CAAC,iBAAiB,CAAC,CAAC;QACzD,IAAI,WAAW;YAAE,QAAQ,CAAC,MAAM,GAAG,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAGzD,MAAM,eAAe,GAAG,WAAW,CAAC,KAAK,CAAC,sBAAsB,CAAC,CAAC;QAClE,IAAI,eAAe;YAAE,QAAQ,CAAC,UAAU,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAGrE,MAAM,SAAS,GAAG,WAAW,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;QACtD,IAAI,SAAS;YAAE,QAAQ,CAAC,IAAI,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACrD,CAAC;IAGD,MAAM,WAAW,GAAG,OAAO,CAAC,OAAO,CAAC,uBAAuB,EAAE,EAAE,CAAC,CAAC;IAGjE,MAAM,SAAS,GAAG,UAAU,CAAC,WAAW,CAAC,CAAC;IAC1C,QAAQ,CAAC,SAAS,GAAG,SAAS,CAAC;IAC/B,QAAQ,CAAC,WAAW,GAAG,oBAAoB,CAAC,SAAS,EAAE,MAAM,CAAC,YAAY,IAAI,GAAG,CAAC,CAAC;IAGnF,MAAM,MAAM,GAAG,aAAa,CAAC,WAAW,EAAE,MAAM,CAAC,SAAS,IAAI,EAAE,CAAC,CAAC;IAClE,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC;QAAE,QAAQ,CAAC,MAAM,GAAG,MAAM,CAAC;IAGhD,MAAM,EAAE,aAAa,EAAE,aAAa,EAAE,GAAG,YAAY,CAAC,WAAW,EAAE,MAAM,CAAC,OAAO,EAAE,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC;IAC1G,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC;QAAE,QAAQ,CAAC,aAAa,GAAG,aAAa,
CAAC;IACrE,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC;QAAE,QAAQ,CAAC,aAAa,GAAG,aAAa,CAAC;IAGrE,MAAM,MAAM,GAAG,qBAAqB,CAAC,WAAW,CAAC,CAAC;IAClD,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC;QAAE,QAAQ,CAAC,cAAc,GAAG,MAAM,CAAC;IAGxD,MAAM,OAAO,GAAG,cAAc,CAAC,WAAW,CAAC,CAAC;IAC5C,IAAI,OAAO;QAAE,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;IAGxC,MAAM,IAAI,GAAG,WAAW,CAAC,WAAW,CAAC,CAAC;IACtC,IAAI,IAAI;QAAE,QAAQ,CAAC,IAAI,GAAG,IAAI,CAAC;IAE/B,OAAO,QAAQ,CAAC;AAClB,CAAC;AAKD,SAAS,eAAe,CAAC,OAAe,EAAE,MAAwB;IAChE,MAAM,QAAQ,GAAsB,EAAE,CAAC;IAGvC,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,yDAAyD,CAAC,CAAC;IAC7F,IAAI,WAAW;QAAE,QAAQ,CAAC,MAAM,GAAG,WAAW,CAAC,CAAC,CAAC,CAAC;IAElD,MAAM,gBAAgB,GAAG,OAAO,CAAC,KAAK,CAAC,8DAA8D,CAAC,CAAC;IACvG,IAAI,gBAAgB;QAAE,QAAQ,CAAC,OAAO,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAC;IAE7D,MAAM,aAAa,GAAG,OAAO,CAAC,KAAK,CAAC,2DAA2D,CAAC,CAAC;IACjG,IAAI,aAAa;QAAE,QAAQ,CAAC,IAAI,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;IAGpD,MAAM,WAAW,GAAG,OAAO,CAAC,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IAGjF,MAAM,SAAS,GAAG,UAAU,CAAC,WAAW,CAAC,CAAC;IAC1C,QAAQ,CAAC,SAAS,GAAG,SAAS,CAAC;IAC/B,QAAQ,CAAC,WAAW,GAAG,oBAAoB,CAAC,SAAS,EAAE,MAAM,CAAC,YAAY,IAAI,GAAG,CAAC,CAAC;IAGnF,MAAM,EAAE,aAAa,EAAE,aAAa,EAAE,GAAG,gBAAgB,CAAC,OAAO,EAAE,MAAM,CAAC,OAAO,EAAE,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC;IAC1G,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC;QAAE,QAAQ,CAAC,aAAa,GAAG,aAAa,CAAC;IACrE,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC;QAAE,QAAQ,CAAC,aAAa,GAAG,aAAa,CAAC;IAGrE,MAAM,MAAM,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAC;IAC1C,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC;QAAE,QAAQ,CAAC,cAAc,GAAG,MAAM,CAAC;IAGxD,MAAM,MAAM,GAAG,iBAAiB,CAAC,OAAO,EAAE,MAAM,CAAC,SAAS,IAAI,EAAE,CAAC,CAAC;IAClE,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC;QAAE,QAAQ,CAAC,MAAM,GAAG,MAAM,CAAC;IAGhD,MAAM,IAAI,GAAG,WAAW,CAAC,WAAW,CAAC,CAAC;IACtC,IAAI,IAAI;QAAE,QAAQ,CAAC,IAAI,GAAG,IAAI,CAAC;IAE/B,OAAO,QAAQ,CAAC;AAClB,CAAC;AAKD,SAAS,eAAe,CAAC,OAAe,EAAE,MAAwB;IAChE,MAAM,QAAQ,GAAsB,EAAE,CAAC;IAGvC,MAAM,SAAS,GAAG,UAAU,CAAC,OAAO,CAAC,CAAC;IACtC,QAAQ,CAAC,SAAS,GAAG,SAAS,CAAC;IAC/B,QAAQ,CAAC,WAAW,GAAG,oBAAoB,CAAC,SAAS,EA
AE,MAAM,CAAC,YAAY,IAAI,GAAG,CAAC,CAAC;IAGnF,MAAM,MAAM,GAAG,aAAa,CAAC,OAAO,EAAE,MAAM,CAAC,SAAS,IAAI,EAAE,CAAC,CAAC;IAC9D,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC;QAAE,QAAQ,CAAC,MAAM,GAAG,MAAM,CAAC;IAGhD,MAAM,EAAE,aAAa,EAAE,aAAa,EAAE,GAAG,gBAAgB,CAAC,OAAO,EAAE,MAAM,CAAC,OAAO,EAAE,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC;IAC1G,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC;QAAE,QAAQ,CAAC,aAAa,GAAG,aAAa,CAAC;IACrE,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC;QAAE,QAAQ,CAAC,aAAa,GAAG,aAAa,CAAC;IAGrE,MAAM,OAAO,GAAG,cAAc,CAAC,OAAO,CAAC,CAAC;IACxC,IAAI,OAAO;QAAE,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;IAExC,MAAM,IAAI,GAAG,WAAW,CAAC,OAAO,CAAC,CAAC;IAClC,IAAI,IAAI;QAAE,QAAQ,CAAC,IAAI,GAAG,IAAI,CAAC;IAE/B,OAAO,QAAQ,CAAC;AAClB,CAAC;AAKD,SAAS,UAAU,CAAC,IAAY;IAE9B,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IACvE,OAAO,KAAK,CAAC,MAAM,CAAC;AACtB,CAAC;AAKD,SAAS,oBAAoB,CAAC,SAAiB,EAAE,cAAsB;IACrE,OAAO,IAAI,CAAC,IAAI,CAAC,SAAS,GAAG,cAAc,CAAC,CAAC;AAC/C,CAAC;AAKD,SAAS,aAAa,CAAC,OAAe,EAAE,SAAiB;IACvD,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAElC,IAAI,YAAY,GAAG,EAAE,CAAC;IACtB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YAChC,YAAY,IAAI,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC;QAClD,CAAC;aAAM,IAAI,YAAY,EAAE,CAAC;YACxB,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,CAAC,CAAC;YACjC,YAAY,GAAG,EAAE,CAAC;YAClB,IAAI,MAAM,CAAC,MAAM,IAAI,SAAS;gBAAE,MAAM;QACxC,CAAC;IACH,CAAC;IAED,IAAI,YAAY,IAAI,MAAM,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;QAC9C,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,CAAC,CAAC;IACnC,CAAC;IAED,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;AACpC,CAAC;AAKD,SAAS,iBAAiB,CAAC,OAAe,EAAE,SAAiB;IAC3D,MAAM,MAAM,GAAa,EAAE,CAAC;IAG5B,MAAM,eAAe,GAAG,6CAA6C,CAAC;IACtE,IAAI,KAAK,CAAC;IAEV,OAAO,CAAC,KAAK,GAAG,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,IAAI,MAAM,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;QAErF,MAAM,SAAS,GAAG,KAAK,CAAC,CAAC,CAAC;aACvB,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC;aACx
B,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;aACpB,IAAI,EAAE,CAAC;QAEV,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzB,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QACzB,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;AACpC,CAAC;AAKD,SAAS,YAAY,CACnB,OAAe,EACf,OAA2B,EAC3B,QAAgB;IAEhB,MAAM,aAAa,GAAa,EAAE,CAAC;IACnC,MAAM,aAAa,GAAa,EAAE,CAAC;IAGnC,MAAM,SAAS,GAAG,0BAA0B,CAAC;IAC7C,IAAI,KAAK,CAAC;IAEV,OAAO,CAAC,KAAK,GAAG,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAClD,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAErB,IAAI,GAAG,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YAC5D,IAAI,OAAO,IAAI,GAAG,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;gBACvC,IAAI,aAAa,CAAC,MAAM,GAAG,QAAQ;oBAAE,aAAa,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAC/D,CAAC;iBAAM,CAAC;gBACN,IAAI,aAAa,CAAC,MAAM,GAAG,QAAQ;oBAAE,aAAa,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAC/D,CAAC;QACH,CAAC;QAED,IAAI,aAAa,CAAC,MAAM,IAAI,QAAQ,IAAI,aAAa,CAAC,MAAM,IAAI,QAAQ;YAAE,MAAM;IAClF,CAAC;IAED,OAAO,EAAE,aAAa,EAAE,aAAa,EAAE,CAAC;AAC1C,CAAC;AAKD,SAAS,gBAAgB,CACvB,OAAe,EACf,OAA2B,EAC3B,QAAgB;IAEhB,MAAM,aAAa,GAAa,EAAE,CAAC;IACnC,MAAM,aAAa,GAAa,EAAE,CAAC;IAGnC,MAAM,SAAS,GAAG,mCAAmC,CAAC;IACtD,IAAI,KAAK,CAAC;IAEV,OAAO,CAAC,KAAK,GAAG,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAClD,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAErB,IAAI,GAAG,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YAC5D,IAAI,OAAO,IAAI,GAAG,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;gBACvC,IAAI,aAAa,CAAC,MAAM,GAAG,QAAQ;oBAAE,aAAa,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAC/D,CAAC;iBAAM,CAAC;gBACN,IAAI,aAAa,CAAC,MAAM,GAAG,QAAQ;oBAAE,aAAa,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAC/D,CAAC;QACH,CAAC;QAED,IAAI,aAAa,CAAC,MAAM,IAAI,QAAQ,IAAI,aAAa,CAAC,MAAM,IAAI,QAAQ;YAAE,MAAM;IAClF,CAAC;IAED,OAAO,EAAE,aAAa,EAAE,aAAa,EAAE,CAAC;AAC1C,CAAC;AAKD,SAAS,gBAAgB,CACvB,OAAe,EACf,OAA2B,EAC3B,QAAgB;IAEhB,MAAM,aAAa,GAAa,EAAE,CAAC;IACnC,MAAM,aAAa,GAAa,EAAE,CAAC;IAGnC,MAAM,QAAQ,GAAG,uBAAuB,CAAC;IACzC,IAAI,KAAK,CAAC;IAEV,OAAO,CAAC,KAAK,GAAG,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC
,KAAK,IAAI,EAAE,CAAC;QACjD,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAErB,IAAI,OAAO,IAAI,GAAG,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;YACvC,IAAI,aAAa,CAAC,MAAM,GAAG,QAAQ;gBAAE,aAAa,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC/D,CAAC;aAAM,CAAC;YACN,IAAI,aAAa,CAAC,MAAM,GAAG,QAAQ;gBAAE,aAAa,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC/D,CAAC;QAED,IAAI,aAAa,CAAC,MAAM,IAAI,QAAQ,IAAI,aAAa,CAAC,MAAM,IAAI,QAAQ;YAAE,MAAM;IAClF,CAAC;IAED,OAAO,EAAE,aAAa,EAAE,aAAa,EAAE,CAAC;AAC1C,CAAC;AAKD,SAAS,qBAAqB,CAAC,OAAe;IAC5C,MAAM,MAAM,GAAa,EAAE,CAAC;IAG5B,MAAM,UAAU,GAAG,2BAA2B,CAAC;IAC/C,IAAI,KAAK,CAAC;IAEV,OAAO,CAAC,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACnD,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACrB,IAAI,GAAG,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YAC5D,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACnB,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAKD,SAAS,iBAAiB,CAAC,OAAe;IACxC,MAAM,MAAM,GAAa,EAAE,CAAC;IAG5B,MAAM,UAAU,GAAG,oCAAoC,CAAC;IACxD,IAAI,KAAK,CAAC;IAEV,OAAO,CAAC,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACnD,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACrB,IAAI,GAAG,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YAC5D,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACnB,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAKD,SAAS,cAAc,CAAC,OAAe;IAErC,MAAM,WAAW,GAAG,OAAO,CAAC,OAAO,CAAC,uBAAuB,EAAE,EAAE,CAAC,CAAC;IAGjE,MAAM,UAAU,GAAG,WAAW,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAG3F,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAC9B,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,MAAM,GAAG,EAAE,IAAI,IAAI,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YAEnE,MAAM,OAAO,GAAG,IAAI;iBACjB,OAAO,CAAC,wBAAwB,EAAE,IAAI,CAAC;iBACvC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC;iBACrB,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC;iBACtB,IAAI,EAAE,CAAC;YAGV,MAAM,aAAa,GAAG,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC;YAClD,IAAI,aAAa,IAAI,aAAa,CAAC,MAAM,GAAG,EAAE,EAAE,CAA
C;gBAC/C,OAAO,aAAa,GAAG,GAAG,CAAC;YAC7B,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAMD,SAAS,WAAW,CAAC,OAAe;IAElC,MAAM,WAAW,GAAG,OAAO,CAAC,OAAO,CAAC,uBAAuB,EAAE,EAAE,CAAC,CAAC;IAGjE,MAAM,UAAU,GAAG,WAAW;SAC3B,KAAK,CAAC,OAAO,CAAC;SACd,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;SAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;IAEnD,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,SAAS,CAAC;IAG9C,IAAI,IAAI,GAAG,EAAE,CAAC;IACd,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAE9B,MAAM,OAAO,GAAG,IAAI;aACjB,OAAO,CAAC,wBAAwB,EAAE,IAAI,CAAC;aACvC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC;aACrB,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC;aACtB,OAAO,CAAC,yBAAyB,EAAE,EAAE,CAAC;aACtC,IAAI,EAAE,CAAC;QAEV,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvB,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACtB,IAAI,GAAG,OAAO,CAAC;YACjB,CAAC;iBAAM,CAAC;gBACN,IAAI,IAAI,GAAG,GAAG,OAAO,CAAC;YACxB,CAAC;YAGD,IAAI,IAAI,CAAC,MAAM,IAAI,GAAG,EAAE,CAAC;gBACvB,MAAM;YACR,CAAC;QACH,CAAC;IACH,CAAC;IAGD,IAAI,IAAI,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;QACtB,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,KAAK,CAAC;IACxC,CAAC;IAGD,OAAO,IAAI,CAAC,MAAM,IAAI,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC;AAC/C,CAAC"}
@@ -0,0 +1,30 @@
1
/**
 * Result of parsing an HTML document with `parseHtml`.
 */
export interface HtmlParseResult {
    /** Text content of the document with whitespace collapsed to single spaces. */
    text: string;

    /**
     * Values read from `<title>` and `<meta>` tags. A key is present only when
     * the corresponding tag yielded a value.
     */
    meta: {
        /** Additional, non-standard entries. */
        [key: string]: any;

        // Standard document metadata.
        /** `<title>` text, falling back to the Open Graph title. */
        title?: string;
        description?: string;
        /** Keywords meta content split on commas and trimmed. */
        keywords?: string[];
        author?: string;

        // Open Graph (`og:*`).
        ogTitle?: string;
        ogDescription?: string;
        ogImage?: string;
        ogUrl?: string;

        // Twitter card (`twitter:*`).
        twitterCard?: string;
        twitterTitle?: string;
        twitterDescription?: string;
        twitterImage?: string;
    };

    /** One entry per `<a>` element that has a non-empty `href`. */
    links: Array<{
        /** The `href` value exactly as written (may be relative). */
        url: string;
        /** Trimmed anchor text, when not empty. */
        text?: string;
        /** The `title` attribute, when not empty. */
        title?: string;
    }>;

    /** One entry per `<img>` element that has a non-empty `src`. */
    images: Array<{
        /** The `src` value exactly as written (may be relative). */
        src: string;
        /** The `alt` attribute, when not empty. */
        alt?: string;
        /** The `title` attribute, when not empty. */
        title?: string;
    }>;
}
29
/**
 * Parses an HTML string into plain text, meta-tag values, links and images.
 *
 * Parser failures are not propagated: a warning is written with
 * `console.warn` and a fallback result is returned that contains the
 * tag-stripped input as `text` and empty `meta`, `links` and `images`.
 *
 * @param html - HTML source to parse.
 * @returns The parsed representation of the document.
 */
export declare function parseHtml(
    html: string,
): HtmlParseResult;
30
+ //# sourceMappingURL=html-parser.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"html-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/html-parser.ts"],"names":[],"mappings":"AAaA,MAAM,WAAW,eAAe;IAE9B,IAAI,EAAE,MAAM,CAAC;IAEb,IAAI,EAAE;QACJ,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,QAAQ,CAAC,EAAE,MAAM,EAAE,CAAC;QACpB,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,kBAAkB,CAAC,EAAE,MAAM,CAAC;QAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,CAAC,GAAG,EAAE,MAAM,GAAG,GAAG,CAAC;KACpB,CAAC;IAEF,KAAK,EAAE,KAAK,CAAC;QACX,GAAG,EAAE,MAAM,CAAC;QACZ,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,KAAK,CAAC,EAAE,MAAM,CAAC;KAChB,CAAC,CAAC;IAEH,MAAM,EAAE,KAAK,CAAC;QACZ,GAAG,EAAE,MAAM,CAAC;QACZ,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,KAAK,CAAC,EAAE,MAAM,CAAC;KAChB,CAAC,CAAC;CACJ;AAQD,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,eAAe,CAsCvD"}
@@ -0,0 +1,116 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || function (mod) {
19
+ if (mod && mod.__esModule) return mod;
20
+ var result = {};
21
+ if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
22
+ __setModuleDefault(result, mod);
23
+ return result;
24
+ };
25
+ Object.defineProperty(exports, "__esModule", { value: true });
26
+ exports.parseHtml = void 0;
27
+ const cheerio = __importStar(require("cheerio"));
28
/**
 * Parses an HTML string with cheerio and returns its text, meta-tag values,
 * links and images.
 *
 * Text is extracted first; that step removes script/style/noscript elements
 * from the loaded document, so the later steps operate on the pruned tree.
 * If anything throws, a warning is logged and a fallback result built from
 * the tag-stripped input is returned instead.
 *
 * @param {string} html HTML source.
 * @returns {{text: string, meta: object, links: object[], images: object[]}} Parse result.
 */
function parseHtml(html) {
    const parserOptions = { xml: { xmlMode: false } };
    try {
        const $ = cheerio.load(html, parserOptions);
        // Property initialisers run top to bottom, preserving the text -> meta -> links -> images order.
        return {
            text: extractPlainText($),
            meta: extractMetaTags($),
            links: extractLinks($),
            images: extractImages($),
        };
    }
    catch (error) {
        console.warn('HTML parsing error:', error);
        const strippedText = html.replace(/<[^>]*>/g, '');
        return { text: strippedText, meta: {}, links: [], images: [] };
    }
}
56
+ exports.parseHtml = parseHtml;
57
/**
 * Returns the document's text with all whitespace runs collapsed to single
 * spaces and the ends trimmed.
 *
 * Side effect: `script`, `style` and `noscript` elements are removed from the
 * document held by `$`, so later queries on the same `$` no longer see them
 * (including any `<img>` or `<a>` inside `<noscript>`).
 *
 * The former second step, `.replace(/\n{3,}/g, '\n\n')`, was removed: it ran
 * after every whitespace run (newlines included) had already been replaced by
 * a space, so it could never match. Output is unchanged.
 *
 * @param {Function} $ Loaded cheerio document.
 * @returns {string} Text of `<body>`, or of the whole document when the body text is empty.
 */
function extractPlainText($) {
    $('script, style, noscript').remove();
    const rawText = $('body').text() || $.text();
    return rawText.replace(/\s+/g, ' ').trim();
}
65
/**
 * Reads the document title and the standard, Open Graph and Twitter-card
 * `<meta>` values. Only keys that produced a value are present in the result.
 *
 * Fixes over the previous version:
 * - the title is taken from the first `<title>` element only and trimmed;
 *   `$('title').text()` concatenated the text of every `<title>` in the
 *   document, which includes `<title>` elements inside inline SVG;
 * - empty entries in the keywords list (`"a, ,b,"`) are dropped.
 *
 * @param {Function} $ Loaded cheerio document.
 * @returns {object} Map of meta key to value (`keywords` is a string array).
 */
function extractMetaTags($) {
    const contentOf = (selector) => $(selector).attr('content');
    const rawKeywords = contentOf('meta[name="keywords"]');
    const candidates = {
        // Fall back to the Open Graph title when <title> is missing or blank.
        title: $('title').first().text().trim() || contentOf('meta[property="og:title"]'),
        description: contentOf('meta[name="description"]'),
        keywords: rawKeywords === undefined
            ? undefined
            : rawKeywords.split(',').map(k => k.trim()).filter(k => k.length > 0),
        author: contentOf('meta[name="author"]'),
        ogTitle: contentOf('meta[property="og:title"]'),
        ogDescription: contentOf('meta[property="og:description"]'),
        ogImage: contentOf('meta[property="og:image"]'),
        ogUrl: contentOf('meta[property="og:url"]'),
        twitterCard: contentOf('meta[name="twitter:card"]'),
        twitterTitle: contentOf('meta[name="twitter:title"]'),
        twitterDescription: contentOf('meta[name="twitter:description"]'),
        twitterImage: contentOf('meta[name="twitter:image"]'),
    };
    const meta = {};
    for (const [key, value] of Object.entries(candidates)) {
        if (value !== undefined) {
            meta[key] = value;
        }
    }
    return meta;
}
86
/**
 * Lists every `<a>` element that carries a non-empty `href`.
 *
 * @param {Function} $ Loaded cheerio document.
 * @returns {Array<{url: string, text: (string|undefined), title: (string|undefined)}>}
 *   Links in document order; `text` is the trimmed anchor text and `title`
 *   the title attribute, each `undefined` when empty.
 */
function extractLinks($) {
    const collected = [];
    $('a[href]').each((_index, anchor) => {
        const node = $(anchor);
        const url = node.attr('href');
        if (!url) {
            return;
        }
        const label = node.text().trim();
        const tooltip = node.attr('title');
        collected.push({
            url,
            text: label || undefined,
            title: tooltip || undefined,
        });
    });
    return collected;
}
101
/**
 * Lists every `<img>` element that carries a non-empty `src`.
 *
 * @param {Function} $ Loaded cheerio document.
 * @returns {Array<{src: string, alt: (string|undefined), title: (string|undefined)}>}
 *   Images in document order; `alt` and `title` are `undefined` when empty.
 */
function extractImages($) {
    const collected = [];
    $('img[src]').each((_index, image) => {
        const node = $(image);
        const src = node.attr('src');
        if (!src) {
            return;
        }
        const altText = node.attr('alt');
        const tooltip = node.attr('title');
        collected.push({
            src,
            alt: altText || undefined,
            title: tooltip || undefined,
        });
    });
    return collected;
}
116
+ //# sourceMappingURL=html-parser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"html-parser.js","sourceRoot":"","sources":["../../src/parsers/html-parser.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAWA,iDAAmC;AAyCnC,SAAgB,SAAS,CAAC,IAAY;IACpC,IAAI,CAAC;QACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,EAAE;YAC3B,GAAG,EAAE;gBACH,OAAO,EAAE,KAAK;aACf;SACF,CAAC,CAAC;QAGH,MAAM,IAAI,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAC;QAGjC,MAAM,IAAI,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC;QAGhC,MAAM,KAAK,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC;QAG9B,MAAM,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;QAEhC,OAAO;YACL,IAAI;YACJ,IAAI;YACJ,KAAK;YACL,MAAM;SACP,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,OAAO,CAAC,IAAI,CAAC,qBAAqB,EAAE,KAAK,CAAC,CAAC;QAG3C,OAAO;YACL,IAAI,EAAE,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC;YAClC,IAAI,EAAE,EAAE;YACR,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;SACX,CAAC;IACJ,CAAC;AACH,CAAC;AAtCD,8BAsCC;AAKD,SAAS,gBAAgB,CAAC,CAAqB;IAE7C,CAAC,CAAC,yBAAyB,CAAC,CAAC,MAAM,EAAE,CAAC;IAGtC,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;IAG1C,OAAO,IAAI;SACR,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,IAAI,EAAE,CAAC;AACZ,CAAC;AAKD,SAAS,eAAe,CAAC,CAAqB;IAC5C,MAAM,IAAI,GAA4B,EAAE,CAAC;IAGzC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACjF,IAAI,CAAC,WAAW,GAAG,CAAC,CAAC,0BAA0B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACjE,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC,uBAAuB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;IAC1F,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,qBAAqB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAGvD,IAAI,CAAC,OAAO,GAAG,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC9D,IAAI,CAAC,aAAa,GAAG,CAAC,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC1E,IAAI,CAAC,OAAO,GAAG,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC9D,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,yBAAyB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAG1D,IAAI,CAAC,WAAW,GAAG,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAClE,IAAI,CAAC,YA
AY,GAAG,CAAC,CAAC,4BAA4B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACpE,IAAI,CAAC,kBAAkB,GAAG,CAAC,CAAC,kCAAkC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAChF,IAAI,CAAC,YAAY,GAAG,CAAC,CAAC,4BAA4B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAGpE,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE;QAC9B,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,SAAS,EAAE,CAAC;YAC5B,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC;QACnB,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,IAAI,CAAC;AACd,CAAC;AAKD,SAAS,YAAY,CAAC,CAAqB;IACzC,MAAM,KAAK,GAA6B,EAAE,CAAC;IAE3C,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;QAC/B,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;QACvB,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAE9B,IAAI,IAAI,EAAE,CAAC;YACT,KAAK,CAAC,IAAI,CAAC;gBACT,GAAG,EAAE,IAAI;gBACT,IAAI,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,SAAS;gBACpC,KAAK,EAAE,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,SAAS;aACtC,CAAC,CAAC;QACL,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,KAAK,CAAC;AACf,CAAC;AAKD,SAAS,aAAa,CAAC,CAAqB;IAC1C,MAAM,MAAM,GAA8B,EAAE,CAAC;IAE7C,CAAC,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;QAChC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;QACvB,MAAM,GAAG,GAAG,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAE5B,IAAI,GAAG,EAAE,CAAC;YACR,MAAM,CAAC,IAAI,CAAC;gBACV,GAAG;gBACH,GAAG,EAAE,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,SAAS;gBACjC,KAAK,EAAE,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,SAAS;aACtC,CAAC,CAAC;QACL,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,13 @@
1
/**
 * Shape of the value resolved by `parseMarkdown`.
 */
export interface MarkdownParseResult {
    /** Text of the document body (without the frontmatter block). */
    text: string;
    /** Parsed frontmatter data, or `null` when there is none. */
    frontmatter: Record<string, any> | null;
    /** Links found in the document body. */
    links: {
        /** Link destination. */
        url: string;
        /** Optional link title. */
        title?: string;
        /** Optional visible link text. */
        text?: string;
    }[];
    /** Unparsed frontmatter source, when present. */
    rawFrontmatter?: string;
}
11
+ export declare function parseMarkdown(content: string): Promise<MarkdownParseResult>;
12
+ export declare function isValidMarkdown(content: string): boolean;
13
+ //# sourceMappingURL=markdown-parser.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdown-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/markdown-parser.ts"],"names":[],"mappings":"AAkBA,MAAM,WAAW,mBAAmB;IAElC,IAAI,EAAE,MAAM,CAAC;IAEb,WAAW,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,IAAI,CAAC;IAExC,KAAK,EAAE,KAAK,CAAC;QACX,GAAG,EAAE,MAAM,CAAC;QACZ,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,IAAI,CAAC,EAAE,MAAM,CAAC;KACf,CAAC,CAAC;IAEH,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAQD,wBAAsB,aAAa,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,mBAAmB,CAAC,CA4CjF;AAqED,wBAAgB,eAAe,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAQxD"}
@@ -0,0 +1,84 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.isValidMarkdown = exports.parseMarkdown = void 0;
7
+ const unified_1 = require("unified");
8
+ const remark_parse_1 = __importDefault(require("remark-parse"));
9
+ const remark_frontmatter_1 = __importDefault(require("remark-frontmatter"));
10
+ const remark_stringify_1 = __importDefault(require("remark-stringify"));
11
+ const gray_matter_1 = __importDefault(require("gray-matter"));
12
+ const unist_util_visit_1 = require("unist-util-visit");
13
/**
 * Parse a Markdown document into plain text, frontmatter and links.
 *
 * The frontmatter is split off with gray-matter; the remaining body is parsed
 * with unified/remark so that every `link` node can be collected. Any error
 * raised along the way is logged as a warning and a fallback result is
 * returned instead of rejecting.
 *
 * @param content Raw Markdown source, optionally starting with frontmatter.
 * @returns An object with `text`, `frontmatter` (or `null` when empty),
 *   `links` and, when present, `rawFrontmatter`. On failure: the untouched
 *   `content` as `text`, `null` frontmatter and no links.
 */
async function parseMarkdown(content) {
    try {
        const split = (0, gray_matter_1.default)(content);
        const { data: meta, content: body, matter: rawBlock } = split;
        // Only the parser is exercised here; the tree is never stringified.
        const tree = (0, unified_1.unified)()
            .use(remark_parse_1.default)
            .use(remark_frontmatter_1.default, ['yaml', 'toml'])
            .use(remark_stringify_1.default)
            .parse(body);
        const collected = [];
        (0, unist_util_visit_1.visit)(tree, 'link', (linkNode) => {
            const entry = {
                url: linkNode.url,
                title: linkNode.title || undefined,
                text: extractTextFromNode(linkNode),
            };
            collected.push(entry);
        });
        const plain = extractPlainText(body);
        // An empty data object is reported as "no frontmatter".
        const hasMeta = Object.keys(meta).length > 0;
        return {
            text: plain,
            frontmatter: hasMeta ? meta : null,
            links: collected,
            rawFrontmatter: rawBlock || undefined,
        };
    }
    catch (error) {
        console.warn('Markdown parsing error:', error);
        const fallback = {
            text: content,
            frontmatter: null,
            links: [],
        };
        return fallback;
    }
}
46
+ exports.parseMarkdown = parseMarkdown;
47
/**
 * Strip common Markdown syntax from a string using regular expressions.
 *
 * This is a best-effort, line/regex based cleaner rather than a full
 * Markdown renderer. The order of the passes matters:
 *
 *  1. Code (fenced, then inline) is dropped entirely.
 *  2. Block-level markers (headings, blockquotes, horizontal rules, list
 *     bullets and numbers) are removed BEFORE inline emphasis. Otherwise the
 *     single-`*`/`_` emphasis pass eats two characters of a `***` / `___`
 *     rule, or pairs a `* ` list bullet with a later `*` on the same line.
 *  3. Emphasis markers are unwrapped.
 *  4. Images are removed BEFORE links. The link pattern also matches the
 *     `[alt](url)` tail of `![alt](url)`, which would leave a stray `!alt`.
 *  5. Runs of three or more newlines collapse to two and the result is
 *     trimmed.
 *
 * @param markdown Markdown source (frontmatter already removed).
 * @returns The text with the recognised Markdown syntax removed.
 */
function extractPlainText(markdown) {
    let text = markdown;
    // 1. Code blocks and inline code are removed together with their content.
    text = text.replace(/```[\s\S]*?```/g, '');
    text = text.replace(/`[^`]+`/g, '');
    // 2. Block-level markers.
    text = text.replace(/^#{1,6}\s+/gm, '');
    text = text.replace(/^>\s+/gm, '');
    text = text.replace(/^(-{3,}|\*{3,}|_{3,})$/gm, '');
    text = text.replace(/^[\s]*[-*+]\s+/gm, '');
    text = text.replace(/^[\s]*\d+\.\s+/gm, '');
    // 3. Bold first, then italic, keeping the wrapped text.
    text = text.replace(/(\*\*|__)(.*?)\1/g, '$2');
    text = text.replace(/(\*|_)(.*?)\1/g, '$2');
    // 4. Images are dropped entirely; links keep only their visible text.
    text = text.replace(/!\[([^\]]*)\]\([^)]+\)/g, '');
    text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');
    // 5. Normalise blank lines and surrounding whitespace.
    text = text.replace(/\n{3,}/g, '\n\n');
    return text.trim();
}
64
/**
 * Collect the text contained in a syntax-tree node.
 *
 * A `text` node yields its own `value`. A node with `children` yields the
 * concatenation of its children's text, computed recursively. Any other node
 * yields the empty string.
 *
 * @param node Tree node with a `type` and optionally `value` / `children`.
 * @returns The text found under `node`.
 */
function extractTextFromNode(node) {
    if (node.type !== 'text') {
        const kids = node.children;
        // Leaf nodes that are not text (no children) contribute nothing.
        return kids ? kids.map((child) => extractTextFromNode(child)).join('') : '';
    }
    return node.value;
}
73
/**
 * Report whether `content` can be parsed by remark without throwing.
 *
 * @param content Candidate Markdown source.
 * @returns `true` when parsing completes, `false` when the parser throws.
 */
function isValidMarkdown(content) {
    let parsedCleanly = true;
    try {
        // The resulting tree is discarded; only success or failure matters.
        (0, unified_1.unified)().use(remark_parse_1.default).parse(content);
    }
    catch {
        parsedCleanly = false;
    }
    return parsedCleanly;
}
83
+ exports.isValidMarkdown = isValidMarkdown;
84
+ //# sourceMappingURL=markdown-parser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdown-parser.js","sourceRoot":"","sources":["../../src/parsers/markdown-parser.ts"],"names":[],"mappings":";;;;;;AAUA,qCAAkC;AAClC,gEAAuC;AACvC,4EAAmD;AACnD,wEAA+C;AAC/C,8DAAiC;AACjC,uDAAyC;AAwBlC,KAAK,UAAU,aAAa,CAAC,OAAe;IACjD,IAAI,CAAC;QAEH,MAAM,EAAE,IAAI,EAAE,WAAW,EAAE,OAAO,EAAE,eAAe,EAAE,MAAM,EAAE,cAAc,EAAE,GAAG,IAAA,qBAAM,EAAC,OAAO,CAAC,CAAC;QAGhG,MAAM,SAAS,GAAG,IAAA,iBAAO,GAAE;aACxB,GAAG,CAAC,sBAAW,CAAC;aAChB,GAAG,CAAC,4BAAiB,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;aACxC,GAAG,CAAC,0BAAe,CAAC,CAAC;QAExB,MAAM,GAAG,GAAG,SAAS,CAAC,KAAK,CAAC,eAAe,CAAS,CAAC;QAGrD,MAAM,KAAK,GAA0D,EAAE,CAAC;QAExE,IAAA,wBAAK,EAAC,GAAG,EAAE,MAAM,EAAE,CAAC,IAAU,EAAE,EAAE;YAChC,KAAK,CAAC,IAAI,CAAC;gBACT,GAAG,EAAE,IAAI,CAAC,GAAG;gBACb,KAAK,EAAE,IAAI,CAAC,KAAK,IAAI,SAAS;gBAC9B,IAAI,EAAE,mBAAmB,CAAC,IAAI,CAAC;aAChC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAGH,MAAM,IAAI,GAAG,gBAAgB,CAAC,eAAe,CAAC,CAAC;QAE/C,OAAO;YACL,IAAI;YACJ,WAAW,EAAE,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI;YACrE,KAAK;YACL,cAAc,EAAE,cAAc,IAAI,SAAS;SAC5C,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,OAAO,CAAC,IAAI,CAAC,yBAAyB,EAAE,KAAK,CAAC,CAAC;QAG/C,OAAO;YACL,IAAI,EAAE,OAAO;YACb,WAAW,EAAE,IAAI;YACjB,KAAK,EAAE,EAAE;SACV,CAAC;IACJ,CAAC;AACH,CAAC;AA5CD,sCA4CC;AAQD,SAAS,gBAAgB,CAAC,QAAgB;IACxC,IAAI,IAAI,GAAG,QAAQ,CAAC;IAGpB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;IAC3C,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;IAGpC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC,CAAC;IAGxC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,mBAAmB,EAAE,IAAI,CAAC,CAAC;IAC/C,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,gBAAgB,EAAE,IAAI,CAAC,CAAC;IAG5C,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,wBAAwB,EAAE,IAAI,CAAC,CAAC;IAGpD,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,yBAAyB,EAAE,EAAE,CAAC,CAAC;IAGnD,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;IAGnC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,0BAA0B,EAAE,EAAE,CAAC,CAAC;IAGpD,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;IAC5C,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,kB
AAkB,EAAE,EAAE,CAAC,CAAC;IAG5C,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IACvC,IAAI,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAEnB,OAAO,IAAI,CAAC;AACd,CAAC;AAQD,SAAS,mBAAmB,CAAC,IAAS;IACpC,IAAI,IAAI,CAAC,IAAI,KAAK,MAAM,EAAE,CAAC;QACzB,OAAO,IAAI,CAAC,KAAK,CAAC;IACpB,CAAC;IAED,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;QAClB,OAAO,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACzD,CAAC;IAED,OAAO,EAAE,CAAC;AACZ,CAAC;AAQD,SAAgB,eAAe,CAAC,OAAe;IAC7C,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAA,iBAAO,GAAE,CAAC,GAAG,CAAC,sBAAW,CAAC,CAAC;QAC7C,SAAS,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QACzB,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AARD,0CAQC"}
@@ -0,0 +1,18 @@
1
/**
 * Shape of the value returned by `parseText`.
 *
 * NOTE(review): the implementation is not visible in this declaration file;
 * the field descriptions below are inferred from names and should be confirmed.
 */
export interface TextParseResult {
    /** Text content of the parsed input. */
    text: string;
    /** Presumably the URLs found in the text — confirm against `extractUrls`. */
    urls: Array<string>;
    /** Presumably the number of lines in the text — confirm. */
    lineCount: number;
    /** Presumably the number of characters in the text — confirm. */
    charCount: number;
}
7
+ export declare function parseText(content: string): TextParseResult;
8
+ export declare function normalizeLineEndings(text: string): string;
9
+ export declare function extractUrls(text: string): string[];
10
+ export declare function isPlainText(content: string): boolean;
11
+ export declare function getTextStats(text: string): {
12
+ lines: number;
13
+ words: number;
14
+ chars: number;
15
+ urls: number;
16
+ };
17
+ export declare function truncateText(text: string, maxLength: number, suffix?: string): string;
18
+ //# sourceMappingURL=text-parser.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"text-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/text-parser.ts"],"names":[],"mappings":"AAWA,MAAM,WAAW,eAAe;IAE9B,IAAI,EAAE,MAAM,CAAC;IAEb,IAAI,EAAE,MAAM,EAAE,CAAC;IAEf,SAAS,EAAE,MAAM,CAAC;IAElB,SAAS,EAAE,MAAM,CAAC;CACnB;AAcD,wBAAgB,SAAS,CAAC,OAAO,EAAE,MAAM,GAAG,eAAe,CAgC1D;AASD,wBAAgB,oBAAoB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAIzD;AAQD,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAUlD;AAQD,wBAAgB,WAAW,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAkBpD;AAQD,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG;IAC1C,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;CACd,CAYA;AAUD,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,GAAE,MAAc,GAAG,MAAM,CAM5F"}