webcontext-ai 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +583 -0
- package/dist/browser/manager.d.ts +47 -0
- package/dist/browser/manager.d.ts.map +1 -0
- package/dist/browser/manager.js +215 -0
- package/dist/browser/manager.js.map +1 -0
- package/dist/cache/cache.d.ts +22 -0
- package/dist/cache/cache.d.ts.map +1 -0
- package/dist/cache/cache.js +150 -0
- package/dist/cache/cache.js.map +1 -0
- package/dist/chunking/chunker.d.ts +26 -0
- package/dist/chunking/chunker.d.ts.map +1 -0
- package/dist/chunking/chunker.js +208 -0
- package/dist/chunking/chunker.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +406 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/core/pipeline.d.ts +35 -0
- package/dist/core/pipeline.d.ts.map +1 -0
- package/dist/core/pipeline.js +476 -0
- package/dist/core/pipeline.js.map +1 -0
- package/dist/core/stream.d.ts +48 -0
- package/dist/core/stream.d.ts.map +1 -0
- package/dist/core/stream.js +72 -0
- package/dist/core/stream.js.map +1 -0
- package/dist/core/types.d.ts +259 -0
- package/dist/core/types.d.ts.map +1 -0
- package/dist/core/types.js +4 -0
- package/dist/core/types.js.map +1 -0
- package/dist/export/index.d.ts +3 -0
- package/dist/export/index.d.ts.map +1 -0
- package/dist/export/index.js +8 -0
- package/dist/export/index.js.map +1 -0
- package/dist/export/templates.d.ts +25 -0
- package/dist/export/templates.d.ts.map +1 -0
- package/dist/export/templates.js +76 -0
- package/dist/export/templates.js.map +1 -0
- package/dist/export/vectordb.d.ts +21 -0
- package/dist/export/vectordb.d.ts.map +1 -0
- package/dist/export/vectordb.js +101 -0
- package/dist/export/vectordb.js.map +1 -0
- package/dist/extractors/content.d.ts +23 -0
- package/dist/extractors/content.d.ts.map +1 -0
- package/dist/extractors/content.js +328 -0
- package/dist/extractors/content.js.map +1 -0
- package/dist/extractors/github.d.ts +19 -0
- package/dist/extractors/github.d.ts.map +1 -0
- package/dist/extractors/github.js +150 -0
- package/dist/extractors/github.js.map +1 -0
- package/dist/extractors/images.d.ts +20 -0
- package/dist/extractors/images.d.ts.map +1 -0
- package/dist/extractors/images.js +73 -0
- package/dist/extractors/images.js.map +1 -0
- package/dist/extractors/pdf.d.ts +11 -0
- package/dist/extractors/pdf.d.ts.map +1 -0
- package/dist/extractors/pdf.js +107 -0
- package/dist/extractors/pdf.js.map +1 -0
- package/dist/extractors/screenshot.d.ts +21 -0
- package/dist/extractors/screenshot.d.ts.map +1 -0
- package/dist/extractors/screenshot.js +85 -0
- package/dist/extractors/screenshot.js.map +1 -0
- package/dist/index.d.ts +70 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +206 -0
- package/dist/index.js.map +1 -0
- package/dist/mcp-server.d.ts +3 -0
- package/dist/mcp-server.d.ts.map +1 -0
- package/dist/mcp-server.js +108 -0
- package/dist/mcp-server.js.map +1 -0
- package/dist/sdk/client.d.ts +48 -0
- package/dist/sdk/client.d.ts.map +1 -0
- package/dist/sdk/client.js +120 -0
- package/dist/sdk/client.js.map +1 -0
- package/dist/sdk/mcp.d.ts +12 -0
- package/dist/sdk/mcp.d.ts.map +1 -0
- package/dist/sdk/mcp.js +146 -0
- package/dist/sdk/mcp.js.map +1 -0
- package/dist/sdk/server.d.ts +5 -0
- package/dist/sdk/server.d.ts.map +1 -0
- package/dist/sdk/server.js +158 -0
- package/dist/sdk/server.js.map +1 -0
- package/dist/search/vector.d.ts +26 -0
- package/dist/search/vector.d.ts.map +1 -0
- package/dist/search/vector.js +142 -0
- package/dist/search/vector.js.map +1 -0
- package/dist/transformers/markdown.d.ts +21 -0
- package/dist/transformers/markdown.d.ts.map +1 -0
- package/dist/transformers/markdown.js +242 -0
- package/dist/transformers/markdown.js.map +1 -0
- package/dist/utils/dedup.d.ts +20 -0
- package/dist/utils/dedup.d.ts.map +1 -0
- package/dist/utils/dedup.js +61 -0
- package/dist/utils/dedup.js.map +1 -0
- package/dist/utils/index.d.ts +6 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +15 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/metrics.d.ts +16 -0
- package/dist/utils/metrics.d.ts.map +1 -0
- package/dist/utils/metrics.js +28 -0
- package/dist/utils/metrics.js.map +1 -0
- package/dist/utils/scheduler.d.ts +19 -0
- package/dist/utils/scheduler.d.ts.map +1 -0
- package/dist/utils/scheduler.js +63 -0
- package/dist/utils/scheduler.js.map +1 -0
- package/dist/utils/sitemap.d.ts +17 -0
- package/dist/utils/sitemap.d.ts.map +1 -0
- package/dist/utils/sitemap.js +118 -0
- package/dist/utils/sitemap.js.map +1 -0
- package/dist/utils/validation.d.ts +142 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +35 -0
- package/dist/utils/validation.js.map +1 -0
- package/dist/utils/webhook.d.ts +21 -0
- package/dist/utils/webhook.d.ts.map +1 -0
- package/dist/utils/webhook.js +108 -0
- package/dist/utils/webhook.js.map +1 -0
- package/package.json +109 -0
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
// Compiler-emitted helper: turns a CommonJS module into a namespace-like object.
// ES modules are returned untouched; otherwise every own key except "default"
// is re-bound onto a fresh object and the module itself becomes its `default`.
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    var namespace = {};
    if (mod != null) {
        for (var key in mod) {
            if (key === "default") {
                continue;
            }
            if (!Object.prototype.hasOwnProperty.call(mod, key)) {
                continue;
            }
            __createBinding(namespace, mod, key);
        }
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
|
|
25
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
26
|
+
exports.ContentExtractor = void 0;
|
|
27
|
+
const cheerio = __importStar(require("cheerio"));
|
|
28
|
+
/**
|
|
29
|
+
* Content extractor that cleans HTML and extracts structured content.
|
|
30
|
+
* Uses Readability algorithm + custom heuristics for developer content.
|
|
31
|
+
*/
|
|
32
|
+
class ContentExtractor {
|
|
33
|
+
// Selectors for noise elements to remove
|
|
34
|
+
static NOISE_SELECTORS = [
|
|
35
|
+
'nav', 'header', 'footer', 'aside',
|
|
36
|
+
'.sidebar', '.navigation', '.nav',
|
|
37
|
+
'.cookie-banner', '.cookie-consent', '.gdpr',
|
|
38
|
+
'.advertisement', '.ad', '.ads', '[class*="ad-"]',
|
|
39
|
+
'.social-share', '.share-buttons',
|
|
40
|
+
'.comments', '#comments', '.disqus',
|
|
41
|
+
'.newsletter', '.subscribe',
|
|
42
|
+
'.popup', '.modal', '.overlay',
|
|
43
|
+
'.breadcrumb', '.pagination',
|
|
44
|
+
'script', 'style', 'noscript', 'iframe',
|
|
45
|
+
'[role="banner"]', '[role="navigation"]', '[role="complementary"]',
|
|
46
|
+
];
|
|
47
|
+
// Content-likely selectors (priority order)
|
|
48
|
+
static CONTENT_SELECTORS = [
|
|
49
|
+
'article', 'main', '[role="main"]',
|
|
50
|
+
'.markdown-body', '.documentation', '.doc-content',
|
|
51
|
+
'.post-content', '.article-content', '.entry-content',
|
|
52
|
+
'.readme', '#readme',
|
|
53
|
+
'.content', '#content',
|
|
54
|
+
];
|
|
55
|
+
extract(html, url, focusMode = 'full') {
|
|
56
|
+
const $ = cheerio.load(html);
|
|
57
|
+
// Remove noise
|
|
58
|
+
ContentExtractor.NOISE_SELECTORS.forEach(sel => $(sel).remove());
|
|
59
|
+
// Find main content area
|
|
60
|
+
const contentEl = this.findContentElement($, focusMode);
|
|
61
|
+
const title = this.extractTitle($);
|
|
62
|
+
const description = $('meta[name="description"]').attr('content') ||
|
|
63
|
+
$('meta[property="og:description"]').attr('content') || '';
|
|
64
|
+
// Extract structured data before converting
|
|
65
|
+
let codeBlocks = this.extractCodeBlocks($, contentEl);
|
|
66
|
+
const headings = this.extractHeadings($, contentEl);
|
|
67
|
+
const links = this.extractLinks($, contentEl, url);
|
|
68
|
+
const metadata = this.extractMetadata($, url);
|
|
69
|
+
// Extract OpenAPI endpoints in API focus mode
|
|
70
|
+
if (focusMode === 'api') {
|
|
71
|
+
const apiBlocks = this.extractOpenAPIEndpoints($, contentEl);
|
|
72
|
+
codeBlocks = codeBlocks.concat(apiBlocks);
|
|
73
|
+
}
|
|
74
|
+
// Get clean text
|
|
75
|
+
const text = contentEl.text().replace(/\s+/g, ' ').trim();
|
|
76
|
+
// Get clean HTML for markdown conversion
|
|
77
|
+
const cleanHtml = contentEl.html() || '';
|
|
78
|
+
return {
|
|
79
|
+
url,
|
|
80
|
+
title,
|
|
81
|
+
description,
|
|
82
|
+
markdown: '', // Filled by transformer pipeline
|
|
83
|
+
html: cleanHtml,
|
|
84
|
+
text,
|
|
85
|
+
codeBlocks,
|
|
86
|
+
headings,
|
|
87
|
+
links,
|
|
88
|
+
metadata,
|
|
89
|
+
timestamp: new Date().toISOString(),
|
|
90
|
+
};
|
|
91
|
+
}
|
|
92
|
+
findContentElement($, focusMode) {
|
|
93
|
+
if (focusMode === 'code') {
|
|
94
|
+
const codeContainer = $('pre, .highlight, .code-block').parent();
|
|
95
|
+
if (codeContainer.length)
|
|
96
|
+
return codeContainer;
|
|
97
|
+
}
|
|
98
|
+
if (focusMode === 'api') {
|
|
99
|
+
const apiContent = $('.api-content, .endpoint, .method-section, .operation').first();
|
|
100
|
+
if (apiContent.length)
|
|
101
|
+
return apiContent;
|
|
102
|
+
}
|
|
103
|
+
for (const selector of ContentExtractor.CONTENT_SELECTORS) {
|
|
104
|
+
const el = $(selector).first();
|
|
105
|
+
if (el.length && (el.text().length > 200)) {
|
|
106
|
+
return el;
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
// Score remaining candidates by content density
|
|
110
|
+
const body = $('body');
|
|
111
|
+
const candidates = body.find('div, section').toArray();
|
|
112
|
+
let best = null;
|
|
113
|
+
let bestScore = 0;
|
|
114
|
+
for (const candidate of candidates) {
|
|
115
|
+
const $c = $(candidate);
|
|
116
|
+
const textLen = $c.text().trim().length;
|
|
117
|
+
if (textLen < 200)
|
|
118
|
+
continue;
|
|
119
|
+
const htmlLen = ($c.html() || '').length;
|
|
120
|
+
if (htmlLen === 0)
|
|
121
|
+
continue;
|
|
122
|
+
const density = textLen / htmlLen;
|
|
123
|
+
const paragraphs = $c.find('p').length;
|
|
124
|
+
const codeEls = $c.find('pre, code').length;
|
|
125
|
+
const score = density * (1 + paragraphs * 0.1 + codeEls * 0.2);
|
|
126
|
+
if (score > bestScore) {
|
|
127
|
+
bestScore = score;
|
|
128
|
+
best = $c;
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
return best || body;
|
|
132
|
+
}
|
|
133
|
+
extractTitle($) {
|
|
134
|
+
return $('h1').first().text().trim() ||
|
|
135
|
+
$('title').text().trim() ||
|
|
136
|
+
$('meta[property="og:title"]').attr('content') || 'Untitled';
|
|
137
|
+
}
|
|
138
|
+
extractCodeBlocks($, container) {
|
|
139
|
+
const blocks = [];
|
|
140
|
+
container.find('pre code, pre').each((_, el) => {
|
|
141
|
+
const $el = $(el);
|
|
142
|
+
const code = $el.text().trim();
|
|
143
|
+
if (!code)
|
|
144
|
+
return;
|
|
145
|
+
// Detect language from class
|
|
146
|
+
const classes = ($el.attr('class') || '') + ' ' + ($el.parent().attr('class') || '');
|
|
147
|
+
const langMatch = classes.match(/(?:language-|lang-|highlight-)(\w+)/);
|
|
148
|
+
const language = langMatch?.[1] || this.detectLanguage(code);
|
|
149
|
+
// Get surrounding context (previous heading or paragraph)
|
|
150
|
+
const prevHeading = $el.closest('section, div').find('h1,h2,h3,h4').last().text().trim();
|
|
151
|
+
blocks.push({ language, code, context: prevHeading || undefined });
|
|
152
|
+
});
|
|
153
|
+
return blocks;
|
|
154
|
+
}
|
|
155
|
+
extractHeadings($, container) {
|
|
156
|
+
const headings = [];
|
|
157
|
+
container.find('h1, h2, h3, h4, h5, h6').each((_, el) => {
|
|
158
|
+
const $el = $(el);
|
|
159
|
+
headings.push({
|
|
160
|
+
level: parseInt(el.tagName?.[1] || '1'),
|
|
161
|
+
text: $el.text().trim(),
|
|
162
|
+
id: $el.attr('id') || undefined,
|
|
163
|
+
});
|
|
164
|
+
});
|
|
165
|
+
return headings;
|
|
166
|
+
}
|
|
167
|
+
extractLinks($, container, baseUrl) {
|
|
168
|
+
const links = [];
|
|
169
|
+
const baseHost = new URL(baseUrl).hostname;
|
|
170
|
+
container.find('a[href]').each((_, el) => {
|
|
171
|
+
const $el = $(el);
|
|
172
|
+
const href = $el.attr('href') || '';
|
|
173
|
+
const text = $el.text().trim();
|
|
174
|
+
if (!href || !text || href.startsWith('#'))
|
|
175
|
+
return;
|
|
176
|
+
try {
|
|
177
|
+
const resolved = new URL(href, baseUrl).href;
|
|
178
|
+
links.push({
|
|
179
|
+
href: resolved,
|
|
180
|
+
text,
|
|
181
|
+
isInternal: new URL(resolved).hostname === baseHost,
|
|
182
|
+
});
|
|
183
|
+
}
|
|
184
|
+
catch { }
|
|
185
|
+
});
|
|
186
|
+
return links;
|
|
187
|
+
}
|
|
188
|
+
extractMetadata($, url) {
|
|
189
|
+
const meta = {};
|
|
190
|
+
meta.author = $('meta[name="author"]').attr('content') ||
|
|
191
|
+
$('[rel="author"]').text().trim() || undefined;
|
|
192
|
+
meta.publishedDate = $('meta[property="article:published_time"]').attr('content') ||
|
|
193
|
+
$('time[datetime]').first().attr('datetime') || undefined;
|
|
194
|
+
meta.language = $('html').attr('lang') || undefined;
|
|
195
|
+
meta.ogImage = $('meta[property="og:image"]').attr('content') || undefined;
|
|
196
|
+
meta.canonical = $('link[rel="canonical"]').attr('href') || undefined;
|
|
197
|
+
meta.siteName = $('meta[property="og:site_name"]').attr('content') || undefined;
|
|
198
|
+
meta.type = this.detectContentType($, url);
|
|
199
|
+
meta.framework = this.detectFramework($, url);
|
|
200
|
+
meta.version = this.detectVersion($);
|
|
201
|
+
return meta;
|
|
202
|
+
}
|
|
203
|
+
detectContentType($, url) {
|
|
204
|
+
const urlLower = url.toLowerCase();
|
|
205
|
+
if (urlLower.includes('/api') || urlLower.includes('/reference'))
|
|
206
|
+
return 'api-reference';
|
|
207
|
+
if (urlLower.includes('/docs') || urlLower.includes('/documentation'))
|
|
208
|
+
return 'documentation';
|
|
209
|
+
if (urlLower.includes('/blog') || $('article').length)
|
|
210
|
+
return 'blog-post';
|
|
211
|
+
if (urlLower.includes('github.com') && urlLower.includes('/readme'))
|
|
212
|
+
return 'readme';
|
|
213
|
+
if ($('#readme').length || $('.markdown-body').length)
|
|
214
|
+
return 'readme';
|
|
215
|
+
if (urlLower.includes('/tutorial') || urlLower.includes('/guide'))
|
|
216
|
+
return 'tutorial';
|
|
217
|
+
if (urlLower.includes('/changelog') || urlLower.includes('/releases'))
|
|
218
|
+
return 'changelog';
|
|
219
|
+
return 'unknown';
|
|
220
|
+
}
|
|
221
|
+
detectFramework($, url) {
|
|
222
|
+
const text = $('body').text().toLowerCase();
|
|
223
|
+
const frameworks = [
|
|
224
|
+
'react', 'vue', 'angular', 'svelte', 'next.js', 'nuxt',
|
|
225
|
+
'express', 'fastapi', 'django', 'flask', 'spring', 'rails',
|
|
226
|
+
'tailwind', 'bootstrap', 'pytorch', 'tensorflow',
|
|
227
|
+
];
|
|
228
|
+
return frameworks.find(f => text.includes(f) || url.includes(f));
|
|
229
|
+
}
|
|
230
|
+
detectVersion($) {
|
|
231
|
+
// Check meta tags first
|
|
232
|
+
const metaVersion = $('meta[name="version"]').attr('content') ||
|
|
233
|
+
$('meta[name="doc-version"]').attr('content');
|
|
234
|
+
if (metaVersion)
|
|
235
|
+
return metaVersion;
|
|
236
|
+
// Search visible text for version patterns
|
|
237
|
+
const text = $('body').text();
|
|
238
|
+
const match = text.match(/(?:v|version\s*)(\d+\.\d+(?:\.\d+)?(?:-[\w.]+)?)/i);
|
|
239
|
+
return match ? match[0].trim() : undefined;
|
|
240
|
+
}
|
|
241
|
+
extractOpenAPIEndpoints($, container) {
|
|
242
|
+
const blocks = [];
|
|
243
|
+
const methods = ['GET', 'POST', 'PUT', 'DELETE', 'PATCH', 'HEAD', 'OPTIONS'];
|
|
244
|
+
// Look for structured endpoint elements (Swagger UI, Redoc, etc.)
|
|
245
|
+
container.find('.opblock, .operation, [class*="endpoint"], [class*="method"]').each((_, el) => {
|
|
246
|
+
const $el = $(el);
|
|
247
|
+
const text = $el.text();
|
|
248
|
+
const methodMatch = text.match(new RegExp(`\\b(${methods.join('|')})\\b`));
|
|
249
|
+
const pathMatch = text.match(/\/[\w\-\/.{}:]+/);
|
|
250
|
+
if (methodMatch && pathMatch) {
|
|
251
|
+
const description = $el.find('.opblock-summary-description, .description, p').first().text().trim();
|
|
252
|
+
const endpoint = `${methodMatch[1]} ${pathMatch[0]}`;
|
|
253
|
+
blocks.push({
|
|
254
|
+
language: 'http',
|
|
255
|
+
code: description ? `${endpoint}\n# ${description}` : endpoint,
|
|
256
|
+
context: 'API Endpoint',
|
|
257
|
+
});
|
|
258
|
+
}
|
|
259
|
+
});
|
|
260
|
+
// Fallback: scan for HTTP method + path patterns in text nodes
|
|
261
|
+
if (!blocks.length) {
|
|
262
|
+
const text = container.text();
|
|
263
|
+
const endpointRegex = new RegExp(`\\b(${methods.join('|')})\\s+(\/[\\w\\-\\/.{}:?&=]+)`, 'g');
|
|
264
|
+
let match;
|
|
265
|
+
while ((match = endpointRegex.exec(text)) !== null) {
|
|
266
|
+
blocks.push({
|
|
267
|
+
language: 'http',
|
|
268
|
+
code: `${match[1]} ${match[2]}`,
|
|
269
|
+
context: 'API Endpoint',
|
|
270
|
+
});
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
return blocks;
|
|
274
|
+
}
|
|
275
|
+
detectLanguage(code) {
|
|
276
|
+
// Python
|
|
277
|
+
if (code.includes('import ') && code.includes('from '))
|
|
278
|
+
return 'python';
|
|
279
|
+
// TypeScript (check before JS due to overlap)
|
|
280
|
+
if (code.includes('interface ') || code.includes(': string'))
|
|
281
|
+
return 'typescript';
|
|
282
|
+
// JavaScript
|
|
283
|
+
if (code.includes('const ') || code.includes('let ') || code.includes('=>'))
|
|
284
|
+
return 'javascript';
|
|
285
|
+
// Go
|
|
286
|
+
if (code.includes('func ') && code.includes(':='))
|
|
287
|
+
return 'go';
|
|
288
|
+
// Rust
|
|
289
|
+
if (code.includes('fn ') && code.includes('->'))
|
|
290
|
+
return 'rust';
|
|
291
|
+
// Java
|
|
292
|
+
if (code.includes('public class') || code.includes('System.out'))
|
|
293
|
+
return 'java';
|
|
294
|
+
// C#
|
|
295
|
+
if (code.includes('using System') || (code.includes('namespace') && code.includes('public static')))
|
|
296
|
+
return 'csharp';
|
|
297
|
+
// Kotlin
|
|
298
|
+
if (code.includes('fun ') && (code.includes('val ') || code.includes('package')))
|
|
299
|
+
return 'kotlin';
|
|
300
|
+
// Swift
|
|
301
|
+
if (code.includes('import Foundation') || (code.includes('func ') && code.includes('let ')))
|
|
302
|
+
return 'swift';
|
|
303
|
+
// Ruby
|
|
304
|
+
if (code.includes('def ') && code.includes('end'))
|
|
305
|
+
return 'ruby';
|
|
306
|
+
// PHP
|
|
307
|
+
if (code.includes('<?php'))
|
|
308
|
+
return 'php';
|
|
309
|
+
// SQL
|
|
310
|
+
if (/\b(SELECT|INSERT|UPDATE|CREATE TABLE)\b/.test(code))
|
|
311
|
+
return 'sql';
|
|
312
|
+
// Shell
|
|
313
|
+
if (code.startsWith('#!/bin/bash') || /^\s*\$/m.test(code))
|
|
314
|
+
return 'bash';
|
|
315
|
+
// HTML
|
|
316
|
+
if (code.trimStart().startsWith('<') && /<\w+[\s>]/.test(code))
|
|
317
|
+
return 'html';
|
|
318
|
+
// CSS
|
|
319
|
+
if (/\{[^}]*(color|margin|padding|display|font)\s*:/.test(code))
|
|
320
|
+
return 'css';
|
|
321
|
+
// YAML
|
|
322
|
+
if (/^[\w-]+\s*:(?:\s|$)/m.test(code) && !code.includes('{'))
|
|
323
|
+
return 'yaml';
|
|
324
|
+
return 'text';
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
exports.ContentExtractor = ContentExtractor;
|
|
328
|
+
//# sourceMappingURL=content.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"content.js","sourceRoot":"","sources":["../../src/extractors/content.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,iDAAmC;AAGnC;;;GAGG;AACH,MAAa,gBAAgB;IAC3B,yCAAyC;IACjC,MAAM,CAAC,eAAe,GAAG;QAC/B,KAAK,EAAE,QAAQ,EAAE,QAAQ,EAAE,OAAO;QAClC,UAAU,EAAE,aAAa,EAAE,MAAM;QACjC,gBAAgB,EAAE,iBAAiB,EAAE,OAAO;QAC5C,gBAAgB,EAAE,KAAK,EAAE,MAAM,EAAE,gBAAgB;QACjD,eAAe,EAAE,gBAAgB;QACjC,WAAW,EAAE,WAAW,EAAE,SAAS;QACnC,aAAa,EAAE,YAAY;QAC3B,QAAQ,EAAE,QAAQ,EAAE,UAAU;QAC9B,aAAa,EAAE,aAAa;QAC5B,QAAQ,EAAE,OAAO,EAAE,UAAU,EAAE,QAAQ;QACvC,iBAAiB,EAAE,qBAAqB,EAAE,wBAAwB;KACnE,CAAC;IAEF,4CAA4C;IACpC,MAAM,CAAC,iBAAiB,GAAG;QACjC,SAAS,EAAE,MAAM,EAAE,eAAe;QAClC,gBAAgB,EAAE,gBAAgB,EAAE,cAAc;QAClD,eAAe,EAAE,kBAAkB,EAAE,gBAAgB;QACrD,SAAS,EAAE,SAAS;QACpB,UAAU,EAAE,UAAU;KACvB,CAAC;IAEF,OAAO,CAAC,IAAY,EAAE,GAAW,EAAE,YAAuB,MAAM;QAC9D,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE7B,eAAe;QACf,gBAAgB,CAAC,eAAe,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;QAEjE,yBAAyB;QACzB,MAAM,SAAS,GAAG,IAAI,CAAC,kBAAkB,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QACxD,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACnC,MAAM,WAAW,GAAG,CAAC,CAAC,0BAA0B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;YAC9C,CAAC,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;QAE9E,4CAA4C;QAC5C,IAAI,UAAU,GAAG,IAAI,CAAC,iBAAiB,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QACtD,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QACpD,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC,EAAE,SAAS,EAAE,GAAG,CAAC,CAAC;QACnD,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;QAE9C,8CAA8C;QAC9C,IAAI,SAAS,KAAK,KAAK,EAAE,CAAC;YACxB,MAAM,SAAS,GAAG,IAAI,CAAC,uBAAuB,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;YAC7D,UAAU,GAAG,UAAU,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAC5C,CAAC;QAED,iBAAiB;QACjB,MAAM,IAAI,GAAG,SAAS,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QAE1D,yCAAyC;QACzC,MAAM,SAAS,GAAG,SAAS,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC;QAEzC,OAAO;YACL,GAAG;YACH,KAAK;YACL,WAAW;YACX,QAAQ,EAAE,EAAE,EAAE,iCAAiC;YAC/C,I
AAI,EAAE,SAAS;YACf,IAAI;YACJ,UAAU;YACV,QAAQ;YACR,KAAK;YACL,QAAQ;YACR,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC,CAAC;IACJ,CAAC;IAEO,kBAAkB,CAAC,CAAqB,EAAE,SAAoB;QACpE,IAAI,SAAS,KAAK,MAAM,EAAE,CAAC;YACzB,MAAM,aAAa,GAAG,CAAC,CAAC,8BAA8B,CAAC,CAAC,MAAM,EAAE,CAAC;YACjE,IAAI,aAAa,CAAC,MAAM;gBAAE,OAAO,aAAa,CAAC;QACjD,CAAC;QAED,IAAI,SAAS,KAAK,KAAK,EAAE,CAAC;YACxB,MAAM,UAAU,GAAG,CAAC,CAAC,sDAAsD,CAAC,CAAC,KAAK,EAAE,CAAC;YACrF,IAAI,UAAU,CAAC,MAAM;gBAAE,OAAO,UAAU,CAAC;QAC3C,CAAC;QAED,KAAK,MAAM,QAAQ,IAAI,gBAAgB,CAAC,iBAAiB,EAAE,CAAC;YAC1D,MAAM,EAAE,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,KAAK,EAAE,CAAC;YAC/B,IAAI,EAAE,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,GAAG,CAAC,EAAE,CAAC;gBAC1C,OAAO,EAAE,CAAC;YACZ,CAAC;QACH,CAAC;QAED,gDAAgD;QAChD,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC;QACvB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,OAAO,EAAE,CAAC;QACvD,IAAI,IAAI,GAAgC,IAAI,CAAC;QAC7C,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;YACnC,MAAM,EAAE,GAAG,CAAC,CAAC,SAAS,CAAC,CAAC;YACxB,MAAM,OAAO,GAAG,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,MAAM,CAAC;YACxC,IAAI,OAAO,GAAG,GAAG;gBAAE,SAAS;YAE5B,MAAM,OAAO,GAAG,CAAC,EAAE,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;YACzC,IAAI,OAAO,KAAK,CAAC;gBAAE,SAAS;YAE5B,MAAM,OAAO,GAAG,OAAO,GAAG,OAAO,CAAC;YAClC,MAAM,UAAU,GAAG,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC;YACvC,MAAM,OAAO,GAAG,EAAE,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,MAAM,CAAC;YAC5C,MAAM,KAAK,GAAG,OAAO,GAAG,CAAC,CAAC,GAAG,UAAU,GAAG,GAAG,GAAG,OAAO,GAAG,GAAG,CAAC,CAAC;YAE/D,IAAI,KAAK,GAAG,SAAS,EAAE,CAAC;gBACtB,SAAS,GAAG,KAAK,CAAC;gBAClB,IAAI,GAAG,EAAE,CAAC;YACZ,CAAC;QACH,CAAC;QAED,OAAO,IAAI,IAAI,IAAI,CAAC;IACtB,CAAC;IAEO,YAAY,CAAC,CAAqB;QACxC,OAAO,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE;YAC7B,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE;YACxB,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,UAAU,CAAC;IACtE,CAAC;IAEO,iBAAiB,CAAC,CAAqB,EAAE,SAA+B;QAC9E,MAAM,MAAM,GAAgB,EAAE,CAAC;QAC/B,SAAS,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,E
AAE,EAAE,EAAE,EAAE;YAC7C,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;YAClB,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAC/B,IAAI,CAAC,IAAI;gBAAE,OAAO;YAElB,6BAA6B;YAC7B,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,GAAG,GAAG,GAAG,CAAC,GAAG,CAAC,MAAM,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC;YACrF,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,qCAAqC,CAAC,CAAC;YACvE,MAAM,QAAQ,GAAG,SAAS,EAAE,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;YAE7D,0DAA0D;YAC1D,MAAM,WAAW,GAAG,GAAG,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAEzF,MAAM,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,WAAW,IAAI,SAAS,EAAE,CAAC,CAAC;QACrE,CAAC,CAAC,CAAC;QACH,OAAO,MAAM,CAAC;IAChB,CAAC;IAEO,eAAe,CAAC,CAAqB,EAAE,SAA+B;QAC5E,MAAM,QAAQ,GAAc,EAAE,CAAC;QAC/B,SAAS,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;YACtD,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;YAClB,QAAQ,CAAC,IAAI,CAAC;gBACZ,KAAK,EAAE,QAAQ,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC;gBACvC,IAAI,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE;gBACvB,EAAE,EAAE,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,SAAS;aAChC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QACH,OAAO,QAAQ,CAAC;IAClB,CAAC;IAEO,YAAY,CAAC,CAAqB,EAAE,SAA+B,EAAE,OAAe;QAC1F,MAAM,KAAK,GAAe,EAAE,CAAC;QAC7B,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC;QAE3C,SAAS,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;YACvC,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;YAClB,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;YACpC,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAC/B,IAAI,CAAC,IAAI,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,OAAO;YAEnD,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;gBAC7C,KAAK,CAAC,IAAI,CAAC;oBACT,IAAI,EAAE,QAAQ;oBACd,IAAI;oBACJ,UAAU,EAAE,IAAI,GAAG,CAAC,QAAQ,CAAC,CAAC,QAAQ,KAAK,QAAQ;iBACpD,CAAC,CAAC;YACL,CAAC;YAAC,MAAM,CAAC,CAAA,CAAC;QACZ,CAAC,CAAC,CAAC;QACH,OAAO,KAAK,CAAC;IACf,CAA
C;IAEO,eAAe,CAAC,CAAqB,EAAE,GAAW;QACxD,MAAM,IAAI,GAAiB,EAAE,CAAC;QAE9B,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,qBAAqB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;YACxC,CAAC,CAAC,gBAAgB,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,SAAS,CAAC;QAC7D,IAAI,CAAC,aAAa,GAAG,CAAC,CAAC,yCAAyC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;YAC5D,CAAC,CAAC,gBAAgB,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,SAAS,CAAC;QAC/E,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,SAAS,CAAC;QACpD,IAAI,CAAC,OAAO,GAAG,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,SAAS,CAAC;QAC3E,IAAI,CAAC,SAAS,GAAG,CAAC,CAAC,uBAAuB,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,SAAS,CAAC;QACtE,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC,+BAA+B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,SAAS,CAAC;QAChF,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC,iBAAiB,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;QAC3C,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,eAAe,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;QAC9C,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC;QAErC,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,iBAAiB,CAAC,CAAqB,EAAE,GAAW;QAC1D,MAAM,QAAQ,GAAG,GAAG,CAAC,WAAW,EAAE,CAAC;QACnC,IAAI,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,YAAY,CAAC;YAAE,OAAO,eAAe,CAAC;QACzF,IAAI,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,gBAAgB,CAAC;YAAE,OAAO,eAAe,CAAC;QAC9F,IAAI,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,CAAC,MAAM;YAAE,OAAO,WAAW,CAAC;QAC1E,IAAI,QAAQ,CAAC,QAAQ,CAAC,YAAY,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC;YAAE,OAAO,QAAQ,CAAC;QACrF,IAAI,CAAC,CAAC,SAAS,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,gBAAgB,CAAC,CAAC,MAAM;YAAE,OAAO,QAAQ,CAAC;QACvE,IAAI,QAAQ,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,QAAQ,CAAC;YAAE,OAAO,UAAU,CAAC;QACrF,IAAI,QAAQ,CAAC,QAAQ,CAAC,YAAY,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,WAAW,CAAC;YAAE,OAAO,WAAW,CAAC;QAC1F,OAAO,SAAS,CAAC;IACnB,CAAC;IAEO,eAAe,CAAC,CAAqB,EAAE,GAAW;QACxD,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAC5C,MAAM,UAAU,GAAG;YACjB,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM;YACtD,SAAS,EAAE,SAAS,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO;YAC1D,UAAU
,EAAE,WAAW,EAAE,SAAS,EAAE,YAAY;SACjD,CAAC;QACF,OAAO,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;IACnE,CAAC;IAEO,aAAa,CAAC,CAAqB;QACzC,wBAAwB;QACxB,MAAM,WAAW,GAAG,CAAC,CAAC,sBAAsB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;YACzC,CAAC,CAAC,0BAA0B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QAClE,IAAI,WAAW;YAAE,OAAO,WAAW,CAAC;QAEpC,2CAA2C;QAC3C,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;QAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,mDAAmD,CAAC,CAAC;QAC9E,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;IAC7C,CAAC;IAED,uBAAuB,CAAC,CAAqB,EAAE,SAA+B;QAC5E,MAAM,MAAM,GAAgB,EAAE,CAAC;QAC/B,MAAM,OAAO,GAAG,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,CAAC,CAAC;QAE7E,kEAAkE;QAClE,SAAS,CAAC,IAAI,CAAC,8DAA8D,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;YAC5F,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;YAClB,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;YACxB,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC;YAC3E,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,iBAAiB,CAAC,CAAC;YAChD,IAAI,WAAW,IAAI,SAAS,EAAE,CAAC;gBAC7B,MAAM,WAAW,GAAG,GAAG,CAAC,IAAI,CAAC,+CAA+C,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;gBACpG,MAAM,QAAQ,GAAG,GAAG,WAAW,CAAC,CAAC,CAAC,IAAI,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC;gBACrD,MAAM,CAAC,IAAI,CAAC;oBACV,QAAQ,EAAE,MAAM;oBAChB,IAAI,EAAE,WAAW,CAAC,CAAC,CAAC,GAAG,QAAQ,OAAO,WAAW,EAAE,CAAC,CAAC,CAAC,QAAQ;oBAC9D,OAAO,EAAE,cAAc;iBACxB,CAAC,CAAC;YACL,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,+DAA+D;QAC/D,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YACnB,MAAM,IAAI,GAAG,SAAS,CAAC,IAAI,EAAE,CAAC;YAC9B,MAAM,aAAa,GAAG,IAAI,MAAM,CAAC,OAAO,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,8BAA8B,EAAE,GAAG,CAAC,CAAC;YAC9F,IAAI,KAA6B,CAAC;YAClC,OAAO,CAAC,KAAK,GAAG,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;gBACnD,MAAM,CAAC,IAAI,CAAC;oBACV,QAAQ,EAAE,MAAM;oBAChB,IAAI,EAAE,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE;oBAC/B,OAAO,EAAE,cAAc;iBACx
B,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAEO,cAAc,CAAC,IAAY;QACjC,SAAS;QACT,IAAI,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC;YAAE,OAAO,QAAQ,CAAC;QACxE,8CAA8C;QAC9C,IAAI,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC;YAAE,OAAO,YAAY,CAAC;QAClF,aAAa;QACb,IAAI,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;YAAE,OAAO,YAAY,CAAC;QACjG,KAAK;QACL,IAAI,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;YAAE,OAAO,IAAI,CAAC;QAC/D,OAAO;QACP,IAAI,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;YAAE,OAAO,MAAM,CAAC;QAC/D,OAAO;QACP,IAAI,IAAI,CAAC,QAAQ,CAAC,cAAc,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC;YAAE,OAAO,MAAM,CAAC;QAChF,KAAK;QACL,IAAI,IAAI,CAAC,QAAQ,CAAC,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC;YAAE,OAAO,QAAQ,CAAC;QACrH,SAAS;QACT,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC;YAAE,OAAO,QAAQ,CAAC;QAClG,QAAQ;QACR,IAAI,IAAI,CAAC,QAAQ,CAAC,mBAAmB,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;YAAE,OAAO,OAAO,CAAC;QAC5G,OAAO;QACP,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC;YAAE,OAAO,MAAM,CAAC;QACjE,MAAM;QACN,IAAI,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC;YAAE,OAAO,KAAK,CAAC;QACzC,MAAM;QACN,IAAI,yCAAyC,CAAC,IAAI,CAAC,IAAI,CAAC;YAAE,OAAO,KAAK,CAAC;QACvE,QAAQ;QACR,IAAI,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC,IAAI,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC;YAAE,OAAO,MAAM,CAAC;QAC1E,OAAO;QACP,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC;YAAE,OAAO,MAAM,CAAC;QAC9E,MAAM;QACN,IAAI,gDAAgD,CAAC,IAAI,CAAC,IAAI,CAAC;YAAE,OAAO,KAAK,CAAC;QAC9E,OAAO;QACP,IAAI,sBAAsB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC;YAAE,OAAO,MAAM,CAAC;QAC5E,OAAO,MAAM,CAAC;IAChB,CAAC;;AA7SH,4CA8SC"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { ExtractedContent } from '../core/types';
|
|
2
|
+
/**
|
|
3
|
+
* GitHub repository extractor.
|
|
4
|
+
* Fetches README and optionally docs folder from GitHub repos.
|
|
5
|
+
*/
|
|
6
|
+
export declare class GitHubExtractor {
|
|
7
|
+
constructor();
|
|
8
|
+
/** True when `url` starts with `http(s)://[www.]github.com/<owner>/<repo>`. */
isGitHubUrl(url: string): boolean;
|
|
9
|
+
/**
 * Splits a GitHub URL into owner and repo, plus the optional branch and path
 * found after a `/tree/` or `/blob/` segment. Returns null when the URL has no
 * `github.com/<owner>/<repo>` part.
 */
parseRepoUrl(url: string): {
|
|
10
|
+
owner: string;
|
|
11
|
+
repo: string;
|
|
12
|
+
branch?: string;
|
|
13
|
+
path?: string;
|
|
14
|
+
} | null;
|
|
15
|
+
/**
 * Downloads the repository README from raw.githubusercontent.com and returns it
 * with headings, links, code blocks and repository metadata.
 * Rejects when `url` cannot be parsed or no README can be fetched.
 */
extractReadme(url: string): Promise<ExtractedContent>;
|
|
16
|
+
/**
 * Lists `docsPath` (default `'docs'`) through the GitHub contents API and returns
 * one entry per `.md` file in that directory. Resolves to an empty array when the
 * listing request is not successful; rejects when `url` cannot be parsed.
 */
extractDocs(url: string, docsPath?: string): Promise<ExtractedContent[]>;
|
|
17
|
+
/** Collects the fenced code blocks of a markdown string (language and trimmed code). */
private extractCodeBlocks;
|
|
18
|
+
}
|
|
19
|
+
//# sourceMappingURL=github.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"github.d.ts","sourceRoot":"","sources":["../../src/extractors/github.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAqB,MAAM,eAAe,CAAC;AAEpE;;;GAGG;AACH,qBAAa,eAAe;;IAG1B,WAAW,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAIjC,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAC;QAAC,IAAI,CAAC,EAAE,MAAM,CAAA;KAAE,GAAG,IAAI;IAM3F,aAAa,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC;IAgFrD,WAAW,CAAC,GAAG,EAAE,MAAM,EAAE,QAAQ,GAAE,MAAe,GAAG,OAAO,CAAC,gBAAgB,EAAE,CAAC;IAiDtF,OAAO,CAAC,iBAAiB;CAS1B"}
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.GitHubExtractor = void 0;
|
|
4
|
+
/**
|
|
5
|
+
* GitHub repository extractor.
|
|
6
|
+
* Fetches README and optionally docs folder from GitHub repos.
|
|
7
|
+
*/
|
|
8
|
+
class GitHubExtractor {
    constructor() { }
    /**
     * Reports whether `url` looks like a GitHub repository URL, i.e. starts with
     * `http(s)://[www.]github.com/<owner>/<repo>`.
     */
    isGitHubUrl(url) {
        return /^https?:\/\/(www\.)?github\.com\/[\w.-]+\/[\w.-]+/.test(url);
    }
    /**
     * Splits a GitHub URL into its parts.
     *
     * @param url Repository, tree or blob URL.
     * @returns `{ owner, repo, branch, path }` (branch/path are undefined unless the
     *          URL has a `/tree/` or `/blob/` segment), or null when the URL does not
     *          contain `github.com/<owner>/<repo>`.
     */
    parseRepoUrl(url) {
        const match = url.match(/github\.com\/([\w.-]+)\/([\w.-]+)(?:\/(?:tree|blob)\/([\w.-]+)(?:\/(.+))?)?/);
        if (!match)
            return null;
        // Clone URLs end in ".git"; that suffix is not part of the repository name
        // and would make every raw/API request below miss.
        const repo = match[2].replace(/(?<=.)\.git$/, '');
        return { owner: match[1], repo, branch: match[3], path: match[4] };
    }
    /**
     * Fetches the repository README and returns it as extracted content.
     *
     * Tries README.md / readme.md / Readme.md on the requested branch (default
     * `main`), then README.md on `master`.
     *
     * @param url GitHub repository URL.
     * @returns Extracted README content with headings, links, code blocks and
     *          repository metadata (description/topics when the API call succeeds).
     * @throws Error when the URL is not a GitHub URL or no README could be fetched.
     */
    async extractReadme(url) {
        const parsed = this.parseRepoUrl(url);
        if (!parsed)
            throw new Error(`Invalid GitHub URL: ${url}`);
        const { owner, repo, branch } = parsed;
        const ref = branch || 'main';
        const rawBase = `https://raw.githubusercontent.com/${owner}/${repo}`;
        const candidates = [
            { ref, file: 'README.md' },
            { ref, file: 'readme.md' },
            { ref, file: 'Readme.md' },
            { ref: 'master', file: 'README.md' },
        ];
        let markdown = '';
        let found = null;
        for (const candidate of candidates) {
            try {
                const res = await fetch(`${rawBase}/${candidate.ref}/${candidate.file}`, { signal: AbortSignal.timeout(10000) });
                if (res.ok) {
                    markdown = await res.text();
                    found = candidate;
                    break;
                }
            }
            catch {
                // Best effort: a failed or timed-out candidate just means "try the next one".
            }
        }
        if (!markdown || !found)
            throw new Error(`Could not find README for ${owner}/${repo}`);
        const headings = this.extractHeadings(markdown);
        // Relative links are resolved against the ref/file that actually served the
        // README, so the `master` fallback does not produce links into `main`.
        const links = [];
        const linkRegex = /\[([^\]]+)\]\(([^)]+)\)/g;
        let match;
        while ((match = linkRegex.exec(markdown)) !== null) {
            links.push(this.resolveLink(match[1], match[2], owner, repo, found.ref, found.file));
        }
        // Fetch repo metadata from API (no auth needed for public repos)
        let description = '';
        let tags = [];
        try {
            const apiRes = await fetch(`https://api.github.com/repos/${owner}/${repo}`, {
                headers: { 'Accept': 'application/vnd.github.v3+json' },
                signal: AbortSignal.timeout(5000),
            });
            if (apiRes.ok) {
                const repoData = await apiRes.json();
                description = (repoData && typeof repoData.description === 'string') ? repoData.description : '';
                tags = (repoData && Array.isArray(repoData.topics)) ? repoData.topics : [];
            }
        }
        catch {
            // Metadata is optional (rate limits, network errors): keep the defaults.
        }
        return {
            url,
            title: `${owner}/${repo}`,
            description,
            markdown,
            text: this.toPlainText(markdown),
            codeBlocks: this.extractCodeBlocks(markdown),
            headings,
            links,
            metadata: {
                author: owner,
                type: 'readme',
                tags,
                siteName: 'GitHub',
            },
            timestamp: new Date().toISOString(),
        };
    }
    /**
     * Fetches every markdown file directly inside a repository directory.
     *
     * @param url GitHub repository URL; a branch in the URL selects the ref,
     *            otherwise the repository's default branch is used.
     * @param docsPath Directory inside the repository (default `docs`).
     * @returns One extracted-content entry per `.md` file; an empty array when the
     *          listing request fails or the path is not a directory.
     * @throws Error when the URL is not a GitHub URL.
     */
    async extractDocs(url, docsPath = 'docs') {
        const parsed = this.parseRepoUrl(url);
        if (!parsed)
            throw new Error(`Invalid GitHub URL: ${url}`);
        const { owner, repo, branch } = parsed;
        const encodedPath = docsPath.split('/').filter(Boolean).map(encodeURIComponent).join('/');
        // Omitting `ref` lets the API use the repository's default branch instead of
        // assuming it is called "main".
        const query = branch ? `?ref=${encodeURIComponent(branch)}` : '';
        // Fetch directory listing from GitHub API
        const apiUrl = `https://api.github.com/repos/${owner}/${repo}/contents/${encodedPath}${query}`;
        const res = await fetch(apiUrl, {
            headers: { 'Accept': 'application/vnd.github.v3+json' },
            signal: AbortSignal.timeout(10000),
        });
        if (!res.ok)
            return [];
        const files = await res.json();
        // The contents API returns a single object (not an array) when the path is a file.
        if (!Array.isArray(files))
            return [];
        const mdFiles = files.filter((f) => f
            && f.type === 'file'
            && typeof f.name === 'string'
            && f.name.endsWith('.md')
            && typeof f.download_url === 'string');
        const results = [];
        for (const file of mdFiles) {
            try {
                const contentRes = await fetch(file.download_url, { signal: AbortSignal.timeout(10000) });
                if (!contentRes.ok)
                    continue;
                const markdown = await contentRes.text();
                const headings = this.extractHeadings(markdown);
                results.push({
                    url: file.html_url,
                    // Strip only the extension, not the first ".md" found in the name.
                    title: headings[0]?.text || file.name.replace(/\.md$/, ''),
                    markdown,
                    text: this.toPlainText(markdown),
                    codeBlocks: this.extractCodeBlocks(markdown),
                    headings,
                    links: [],
                    metadata: { type: 'documentation', siteName: 'GitHub' },
                    timestamp: new Date().toISOString(),
                });
            }
            catch {
                // Best effort: skip a file that cannot be downloaded, keep the rest.
            }
        }
        return results;
    }
    /**
     * Collects fenced code blocks from markdown.
     *
     * The language is the first word of the fence info string (so `c++`,
     * `objective-c` and `js title="x"` work) and defaults to `text`.
     */
    extractCodeBlocks(markdown) {
        const blocks = [];
        const regex = /```([^\n`]*)\r?\n([\s\S]*?)```/g;
        let match;
        while ((match = regex.exec(markdown)) !== null) {
            const language = match[1].trim().split(/\s+/)[0] || 'text';
            blocks.push({ language, code: match[2].trim() });
        }
        return blocks;
    }
    /** Collects ATX headings (`#` .. `######`), ignoring lines inside fenced code blocks. */
    extractHeadings(markdown) {
        // Shell/Python comments inside code fences start with "#" too; drop fences first.
        const prose = markdown.replace(/```[\s\S]*?```/g, '');
        const headings = [];
        const headingRegex = /^(#{1,6})[ \t]+(.+)$/gm;
        let match;
        while ((match = headingRegex.exec(prose)) !== null) {
            headings.push({ level: match[1].length, text: match[2].trim() });
        }
        return headings;
    }
    /** Flattens markdown to plain text by blanking markup characters and collapsing whitespace. */
    toPlainText(markdown) {
        return markdown.replace(/[#*`\[\]()>-]/g, ' ').replace(/\s+/g, ' ').trim();
    }
    /**
     * Turns a markdown link target into a link record.
     *
     * Targets with a URL scheme (http, mailto, ...) or a protocol-relative `//host`
     * are kept as-is and marked external; everything else is resolved against the
     * repository blob view at `ref` (anchors point into `readmeFile`).
     */
    resolveLink(text, target, owner, repo, ref, readmeFile) {
        // `[text](href "title")`: the href is the first whitespace-delimited token.
        const href = target.trim().split(/\s+/)[0];
        if (/^(?:[a-z][a-z0-9+.-]*:|\/\/)/i.test(href)) {
            return { href, text, isInternal: false };
        }
        const blobBase = `https://github.com/${owner}/${repo}/blob/${ref}`;
        if (href.startsWith('#')) {
            return { href: `${blobBase}/${readmeFile}${href}`, text, isInternal: true };
        }
        const relative = href.replace(/^(?:\.\/)+|^\/+/, '');
        return { href: `${blobBase}/${relative}`, text, isInternal: true };
    }
}
|
|
149
|
+
exports.GitHubExtractor = GitHubExtractor;
|
|
150
|
+
//# sourceMappingURL=github.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"github.js","sourceRoot":"","sources":["../../src/extractors/github.ts"],"names":[],"mappings":";;;AAEA;;;GAGG;AACH,MAAa,eAAe;IAC1B,gBAAe,CAAC;IAEhB,WAAW,CAAC,GAAW;QACrB,OAAO,mDAAmD,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACvE,CAAC;IAED,YAAY,CAAC,GAAW;QACtB,MAAM,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC,6EAA6E,CAAC,CAAC;QACvG,IAAI,CAAC,KAAK;YAAE,OAAO,IAAI,CAAC;QACxB,OAAO,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;IAC/E,CAAC;IAED,KAAK,CAAC,aAAa,CAAC,GAAW;QAC7B,MAAM,MAAM,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QACtC,IAAI,CAAC,MAAM;YAAE,MAAM,IAAI,KAAK,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAE3D,MAAM,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,MAAM,CAAC;QACvC,MAAM,GAAG,GAAG,MAAM,IAAI,MAAM,CAAC;QAE7B,sBAAsB;QACtB,MAAM,UAAU,GAAG;YACjB,qCAAqC,KAAK,IAAI,IAAI,IAAI,GAAG,YAAY;YACrE,qCAAqC,KAAK,IAAI,IAAI,IAAI,GAAG,YAAY;YACrE,qCAAqC,KAAK,IAAI,IAAI,IAAI,GAAG,YAAY;YACrE,qCAAqC,KAAK,IAAI,IAAI,mBAAmB;SACtE,CAAC;QAEF,IAAI,QAAQ,GAAG,EAAE,CAAC;QAClB,IAAI,UAAU,GAAG,EAAE,CAAC;QACpB,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;YACnC,IAAI,CAAC;gBACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,SAAS,EAAE,EAAE,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;gBAC3E,IAAI,GAAG,CAAC,EAAE,EAAE,CAAC;oBACX,QAAQ,GAAG,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;oBAC5B,UAAU,GAAG,SAAS,CAAC;oBACvB,MAAM;gBACR,CAAC;YACH,CAAC;YAAC,MAAM,CAAC,CAAA,CAAC;QACZ,CAAC;QAED,IAAI,CAAC,QAAQ;YAAE,MAAM,IAAI,KAAK,CAAC,6BAA6B,KAAK,IAAI,IAAI,EAAE,CAAC,CAAC;QAE7E,iCAAiC;QACjC,MAAM,QAAQ,GAAc,EAAE,CAAC;QAC/B,MAAM,YAAY,GAAG,qBAAqB,CAAC;QAC3C,IAAI,KAA6B,CAAC;QAClC,OAAO,CAAC,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACtD,QAAQ,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QACnE,CAAC;QAED,gBAAgB;QAChB,MAAM,KAAK,GAAe,EAAE,CAAC;QAC7B,MAAM,SAAS,GAAG,0BAA0B,CAAC;QAC7C,OAAO,CAAC,KAAK,GAAG,SAAS,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACnD,MAAM,IAAI,GAAG,KAAK,CA
AC,CAAC,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,sBAAsB,KAAK,IAAI,IAAI,SAAS,GAAG,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;YACpH,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,UAAU,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QACjF,CAAC;QAED,iEAAiE;QACjE,IAAI,WAAW,GAAG,EAAE,CAAC;QACrB,IAAI,IAAI,GAAa,EAAE,CAAC;QACxB,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,gCAAgC,KAAK,IAAI,IAAI,EAAE,EAAE;gBAC1E,OAAO,EAAE,EAAE,QAAQ,EAAE,gCAAgC,EAAE;gBACvD,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,IAAI,CAAC;aAClC,CAAC,CAAC;YACH,IAAI,MAAM,CAAC,EAAE,EAAE,CAAC;gBACd,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,CAAC;gBACrC,WAAW,GAAG,QAAQ,CAAC,WAAW,IAAI,EAAE,CAAC;gBACzC,IAAI,GAAG,QAAQ,CAAC,MAAM,IAAI,EAAE,CAAC;YAC/B,CAAC;QACH,CAAC;QAAC,MAAM,CAAC,CAAA,CAAC;QAEV,OAAO;YACL,GAAG;YACH,KAAK,EAAE,GAAG,KAAK,IAAI,IAAI,EAAE;YACzB,WAAW;YACX,QAAQ;YACR,IAAI,EAAE,QAAQ,CAAC,OAAO,CAAC,gBAAgB,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE;YACzE,UAAU,EAAE,IAAI,CAAC,iBAAiB,CAAC,QAAQ,CAAC;YAC5C,QAAQ;YACR,KAAK;YACL,QAAQ,EAAE;gBACR,MAAM,EAAE,KAAK;gBACb,IAAI,EAAE,QAAQ;gBACd,IAAI;gBACJ,QAAQ,EAAE,QAAQ;aACnB;YACD,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,WAAW,CAAC,GAAW,EAAE,WAAmB,MAAM;QACtD,MAAM,MAAM,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QACtC,IAAI,CAAC,MAAM;YAAE,MAAM,IAAI,KAAK,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAE3D,MAAM,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,MAAM,CAAC;QACvC,MAAM,GAAG,GAAG,MAAM,IAAI,MAAM,CAAC;QAE7B,0CAA0C;QAC1C,MAAM,MAAM,GAAG,gCAAgC,KAAK,IAAI,IAAI,aAAa,QAAQ,QAAQ,GAAG,EAAE,CAAC;QAC/F,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,MAAM,EAAE;YAC9B,OAAO,EAAE,EAAE,QAAQ,EAAE,gCAAgC,EAAE;YACvD,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,KAAK,CAAC;SACnC,CAAC,CAAC;QAEH,IAAI,CAAC,GAAG,CAAC,EAAE;YAAE,OAAO,EAAE,CAAC;QACvB,MAAM,KAAK,GAAU,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC;QACtC,MAAM,OAAO,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,KAAK,MAAM,CAAC,CAAC;QAEtF,MAAM,OAA
O,GAAuB,EAAE,CAAC;QACvC,KAAK,MAAM,IAAI,IAAI,OAAO,EAAE,CAAC;YAC3B,IAAI,CAAC;gBACH,MAAM,UAAU,GAAG,MAAM,KAAK,CAAC,IAAI,CAAC,YAAY,EAAE,EAAE,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;gBAC1F,IAAI,CAAC,UAAU,CAAC,EAAE;oBAAE,SAAS;gBAC7B,MAAM,QAAQ,GAAG,MAAM,UAAU,CAAC,IAAI,EAAE,CAAC;gBAEzC,MAAM,QAAQ,GAAc,EAAE,CAAC;gBAC/B,MAAM,YAAY,GAAG,qBAAqB,CAAC;gBAC3C,IAAI,KAA6B,CAAC;gBAClC,OAAO,CAAC,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;oBACtD,QAAQ,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;gBACnE,CAAC;gBAED,OAAO,CAAC,IAAI,CAAC;oBACX,GAAG,EAAE,IAAI,CAAC,QAAQ;oBAClB,KAAK,EAAE,QAAQ,CAAC,CAAC,CAAC,EAAE,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC;oBACxD,QAAQ;oBACR,IAAI,EAAE,QAAQ,CAAC,OAAO,CAAC,gBAAgB,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE;oBACzE,UAAU,EAAE,IAAI,CAAC,iBAAiB,CAAC,QAAQ,CAAC;oBAC5C,QAAQ;oBACR,KAAK,EAAE,EAAE;oBACT,QAAQ,EAAE,EAAE,IAAI,EAAE,eAAe,EAAE,QAAQ,EAAE,QAAQ,EAAE;oBACvD,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;iBACpC,CAAC,CAAC;YACL,CAAC;YAAC,MAAM,CAAC,CAAA,CAAC;QACZ,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAEO,iBAAiB,CAAC,QAAgB;QACxC,MAAM,MAAM,GAA2D,EAAE,CAAC;QAC1E,MAAM,KAAK,GAAG,0BAA0B,CAAC;QACzC,IAAI,KAA6B,CAAC;QAClC,OAAO,CAAC,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAC/C,MAAM,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC,IAAI,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QACvE,CAAC;QACD,OAAO,MAAM,CAAC;IAChB,CAAC;CACF;AAvJD,0CAuJC"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/** One `<img>` element found by `ImageExtractor.extract`. */
export interface ExtractedImage {
|
|
2
|
+
/** Image URL; non-absolute `src` values are resolved against the page's base URL. */
src: string;
|
|
3
|
+
/** The `alt` attribute, or an empty string when the element has none. */
alt: string;
|
|
4
|
+
/** The `title` attribute, when present. */
title?: string;
|
|
5
|
+
/** Integer parsed from the `width` attribute; undefined when missing, zero or non-numeric. */
width?: number;
|
|
6
|
+
/** Integer parsed from the `height` attribute; undefined when missing, zero or non-numeric. */
height?: number;
|
|
7
|
+
/** Nearby `<figcaption>` text, otherwise heading text from the enclosing section/div, when found. */
context?: string;
|
|
8
|
+
}
|
|
9
|
+
/**
|
|
10
|
+
* Extracts images and their alt text/context from HTML.
|
|
11
|
+
* Useful for understanding diagrams, charts, and visual documentation.
|
|
12
|
+
*/
|
|
13
|
+
export declare class ImageExtractor {
|
|
14
|
+
/**
 * Parses `html` and returns one entry per `<img>` element, skipping elements
 * with no `src` or a `data:` URI. `baseUrl` is used to resolve relative `src` values.
 */
extract(html: string, baseUrl: string): ExtractedImage[];
|
|
15
|
+
/** Convert extracted images to markdown references */
|
|
16
|
+
toMarkdown(images: ExtractedImage[]): string;
|
|
17
|
+
/** Get image descriptions as plain text (for LLM context) */
|
|
18
|
+
toDescriptions(images: ExtractedImage[]): string[];
|
|
19
|
+
}
|
|
20
|
+
//# sourceMappingURL=images.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"images.d.ts","sourceRoot":"","sources":["../../src/extractors/images.ts"],"names":[],"mappings":"AAEA,MAAM,WAAW,cAAc;IAC7B,GAAG,EAAE,MAAM,CAAC;IACZ,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED;;;GAGG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,cAAc,EAAE;IA2BxD,sDAAsD;IACtD,UAAU,CAAC,MAAM,EAAE,cAAc,EAAE,GAAG,MAAM;IAU5C,6DAA6D;IAC7D,cAAc,CAAC,MAAM,EAAE,cAAc,EAAE,GAAG,MAAM,EAAE;CAKnD"}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || function (mod) {
|
|
19
|
+
if (mod && mod.__esModule) return mod;
|
|
20
|
+
var result = {};
|
|
21
|
+
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
|
|
22
|
+
__setModuleDefault(result, mod);
|
|
23
|
+
return result;
|
|
24
|
+
};
|
|
25
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
26
|
+
exports.ImageExtractor = void 0;
|
|
27
|
+
const cheerio = __importStar(require("cheerio"));
|
|
28
|
+
/**
|
|
29
|
+
* Extracts images and their alt text/context from HTML.
|
|
30
|
+
* Useful for understanding diagrams, charts, and visual documentation.
|
|
31
|
+
*/
|
|
32
|
+
class ImageExtractor {
    /**
     * Parses `html` and collects every `<img>` element with a usable source.
     *
     * @param html Page markup.
     * @param baseUrl URL used to resolve relative `src` values.
     * @returns One record per image; elements without `src`, with a `data:` URI,
     *          or whose `src` cannot be resolved against `baseUrl` are skipped.
     */
    extract(html, baseUrl) {
        const $ = cheerio.load(html);
        const images = [];
        $('img').each((_, el) => {
            const $el = $(el);
            const src = $el.attr('src') || '';
            if (!src || src.startsWith('data:'))
                return;
            let resolvedSrc;
            if (/^https?:\/\//i.test(src)) {
                resolvedSrc = src;
            }
            else {
                try {
                    resolvedSrc = new URL(src, baseUrl).href;
                }
                catch {
                    // A malformed src/baseUrl should drop this image, not abort the extraction.
                    return;
                }
            }
            const alt = $el.attr('alt') || '';
            const title = $el.attr('title');
            const width = parseInt($el.attr('width') || '0', 10) || undefined;
            const height = parseInt($el.attr('height') || '0', 10) || undefined;
            // Get surrounding context
            const parent = $el.closest('figure, p, section, div');
            const caption = parent.find('figcaption').text().trim();
            // NOTE(review): this takes the LAST heading inside the enclosing section/div,
            // which may come after the image rather than before it; confirm intent.
            const prevHeading = $el.closest('section, div').find('h1,h2,h3,h4').last().text().trim();
            const context = caption || prevHeading || undefined;
            images.push({ src: resolvedSrc, alt, title, width, height, context });
        });
        return images;
    }
    /**
     * Convert extracted images to markdown references.
     *
     * Images with neither alt text nor context are omitted. Each remaining image
     * becomes `![description](src "title")` (title only when present), and the
     * references are separated by blank lines.
     */
    toMarkdown(images) {
        return images
            .filter(img => img.alt || img.context)
            .map(img => {
            const desc = img.alt || img.context || 'Image';
            const title = img.title ? ` "${img.title}"` : '';
            return `![${desc}](${img.src}${title})`;
        })
            .join('\n\n');
    }
    /** Get image descriptions as plain text (for LLM context) */
    toDescriptions(images) {
        return images
            .filter(img => img.alt || img.context)
            .map(img => `[Image: ${img.alt || img.context || 'untitled'}] (${img.src})`);
    }
}
|
|
72
|
+
exports.ImageExtractor = ImageExtractor;
|
|
73
|
+
//# sourceMappingURL=images.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"images.js","sourceRoot":"","sources":["../../src/extractors/images.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,iDAAmC;AAWnC;;;GAGG;AACH,MAAa,cAAc;IACzB,OAAO,CAAC,IAAY,EAAE,OAAe;QACnC,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7B,MAAM,MAAM,GAAqB,EAAE,CAAC;QAEpC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;YACtB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;YAClB,MAAM,GAAG,GAAG,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;YAClC,IAAI,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,OAAO,CAAC;gBAAE,OAAO;YAE5C,MAAM,WAAW,GAAG,GAAG,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;YAC9E,MAAM,GAAG,GAAG,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;YAClC,MAAM,KAAK,GAAG,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YAChC,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC,IAAI,SAAS,CAAC;YAC9D,MAAM,MAAM,GAAG,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,IAAI,SAAS,CAAC;YAEhE,0BAA0B;YAC1B,MAAM,MAAM,GAAG,GAAG,CAAC,OAAO,CAAC,yBAAyB,CAAC,CAAC;YACtD,MAAM,OAAO,GAAG,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YACxD,MAAM,WAAW,GAAG,GAAG,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YACzF,MAAM,OAAO,GAAG,OAAO,IAAI,WAAW,IAAI,SAAS,CAAC;YAEpD,MAAM,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,WAAW,EAAE,GAAG,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,CAAC,CAAC;QACxE,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,sDAAsD;IACtD,UAAU,CAAC,MAAwB;QACjC,OAAO,MAAM;aACV,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC;aACrC,GAAG,CAAC,GAAG,CAAC,EAAE;YACT,MAAM,IAAI,GAAG,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,OAAO,IAAI,OAAO,CAAC;YAC/C,OAAO,KAAK,IAAI,KAAK,GAAG,CAAC,GAAG,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC;QACvE,CAAC,CAAC;aACD,IAAI,CAAC,MAAM,CAAC,CAAC;IAClB,CAAC;IAED,6DAA6D;IAC7D,cAAc,CAAC,MAAwB;QACrC,OAAO,MAAM;aACV,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,GAA
G,IAAI,GAAG,CAAC,OAAO,CAAC;aACrC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,WAAW,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,OAAO,IAAI,UAAU,MAAM,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC;IACjF,CAAC;CACF;AA7CD,wCA6CC"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { ExtractedContent } from '../core/types';
|
|
2
|
+
/**
|
|
3
|
+
* PDF content extractor. Requires optional dependency: npm install pdf-parse
|
|
4
|
+
*/
|
|
5
|
+
export declare class PdfExtractor {
|
|
6
|
+
/** NOTE(review): presumably caches the loaded `pdf-parse` module; implementation not visible here, confirm. */
private pdfParse;
|
|
7
|
+
/** NOTE(review): presumably loads the optional `pdf-parse` dependency on demand; confirm against the implementation. */
private loadParser;
|
|
8
|
+
/**
 * Extracts the content of a PDF as `ExtractedContent`.
 * NOTE(review): whether `source` may be a URL, a local path, or both is not visible from this declaration.
 */
extract(source: string): Promise<ExtractedContent>;
|
|
9
|
+
/** Reports whether `url` should be treated as a PDF; the exact criterion is not visible from this declaration. */
isPdf(url: string): boolean;
|
|
10
|
+
}
|
|
11
|
+
//# sourceMappingURL=pdf.d.ts.map
|