@chatbot-packages/rag 0.1.0

This diff represents the content of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry.
@@ -0,0 +1,95 @@
+ import { m as ExtractorOptions, n as ExtractorType, j as ExtractedDocument } from '../types-CjnplPJD.js';
+
+ /**
+  * Base Extractor
+  */
+
+ declare abstract class BaseExtractor {
+     protected options: ExtractorOptions;
+     constructor(options: ExtractorOptions);
+     /** Get the extractor type */
+     abstract getType(): ExtractorType;
+     /** Check if this extractor can handle the given path */
+     abstract canHandle(path: string): boolean;
+     /** Extract documents from the source */
+     abstract extract(): Promise<ExtractedDocument[]>;
+     /** Get file extension */
+     protected getExtension(path: string): string;
+     /** Normalize line endings */
+     protected normalizeLineEndings(text: string): string;
+     /** Clean excessive whitespace */
+     protected cleanWhitespace(text: string): string;
+ }
+
+ /**
+  * CHM (Compiled HTML Help) Extractor
+  *
+  * Extracts HTML content from .chm files using 7z.
+  * CHM files are Microsoft's compiled HTML help format.
+  */
+
+ declare class CHMExtractor extends BaseExtractor {
+     private tempDir;
+     constructor(options: {
+         sourcePath: string;
+         outputDir?: string;
+     });
+     getType(): ExtractorType;
+     canHandle(path: string): boolean;
+     extract(): Promise<ExtractedDocument[]>;
+     private extractWithSevenZip;
+     private findHTMLFiles;
+ }
+
+ /**
+  * HTML Extractor
+  *
+  * Extracts text content from HTML files, preserving structure.
+  */
+
+ interface HTMLExtractorOptions {
+     sourcePath: string;
+     /** Whether to preserve headings structure */
+     preserveHeadings?: boolean;
+     /** Whether to include links */
+     includeLinks?: boolean;
+     /** Tags to remove */
+     removeTags?: string[];
+ }
+ declare class HTMLExtractor extends BaseExtractor {
+     private htmlOptions;
+     constructor(options: HTMLExtractorOptions);
+     getType(): ExtractorType;
+     canHandle(path: string): boolean;
+     extract(): Promise<ExtractedDocument[]>;
+     extractFromString(html: string, filePath: string): Promise<ExtractedDocument>;
+     private extractFromDirectory;
+ }
+
+ /**
+  * Markdown Extractor
+  *
+  * Extracts content from Markdown files.
+  */
+
+ declare class MarkdownExtractor extends BaseExtractor {
+     getType(): ExtractorType;
+     canHandle(path: string): boolean;
+     extract(): Promise<ExtractedDocument[]>;
+     private extractFromFile;
+     private extractTitle;
+     private extractFromDirectory;
+ }
+
+ /**
+  * Document Extractors
+  *
+  * Extract content from various file formats (CHM, HTML, Markdown)
+  */
+
+ /**
+  * Auto-detect and extract documents from a path
+  */
+ declare function extractDocuments(sourcePath: string): Promise<ExtractedDocument[]>;
+
+ export { BaseExtractor, CHMExtractor, HTMLExtractor, type HTMLExtractorOptions, MarkdownExtractor, extractDocuments };
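Taken together, this declaration file exposes the package's extraction surface: one `extractDocuments` entry point plus per-format extractor classes. A minimal usage sketch, assuming the package is installed and that `./docs` and `./docs/guide.html` are hypothetical paths:

```ts
import { extractDocuments, HTMLExtractor } from '@chatbot-packages/rag';

// Auto-detect: picks CHMExtractor, HTMLExtractor, or MarkdownExtractor by extension,
// or walks a directory for .html/.htm and .md/.markdown files.
const docs = await extractDocuments('./docs');
console.log(`extracted ${docs.length} documents`);

// Direct use of one extractor for finer control over HTML conversion.
const extractor = new HTMLExtractor({
  sourcePath: './docs/guide.html',
  includeLinks: true,              // keep hrefs instead of dropping them
  removeTags: ['script', 'style'], // replaces the default removal list
});
const [doc] = await extractor.extract();
console.log(doc.title, doc.content.length);
```

Note that `removeTags` replaces the defaults rather than extending them: the constructor (see the compiled source below) spreads the caller's options over the default object.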
@@ -0,0 +1,343 @@
+ // src/extractors/base.ts
+ var BaseExtractor = class {
+   options;
+   constructor(options) {
+     this.options = options;
+   }
+   /** Get file extension */
+   getExtension(path) {
+     const parts = path.split(".");
+     return parts.length > 1 ? parts[parts.length - 1].toLowerCase() : "";
+   }
+   /** Normalize line endings */
+   normalizeLineEndings(text) {
+     return text.replace(/\r\n/g, "\n").replace(/\r/g, "\n");
+   }
+   /** Clean excessive whitespace */
+   cleanWhitespace(text) {
+     return text.replace(/\n{3,}/g, "\n\n").replace(/[ \t]+/g, " ").trim();
+   }
+ };
+
+ // src/extractors/chm.ts
+ import { exec } from "child_process";
+ import { promisify } from "util";
+ import { readdir as readdir2, readFile as readFile2, mkdir, rm } from "fs/promises";
+ import { existsSync } from "fs";
+ import { join as join2, basename as basename2, dirname } from "path";
+
+ // src/extractors/html.ts
+ import { readFile, readdir, stat } from "fs/promises";
+ import { join, basename } from "path";
+ import * as cheerio from "cheerio";
+ import { convert } from "html-to-text";
+ var HTMLExtractor = class extends BaseExtractor {
+   htmlOptions;
+   constructor(options) {
+     super(options);
+     this.htmlOptions = {
+       preserveHeadings: true,
+       includeLinks: false,
+       removeTags: ["script", "style", "nav", "footer", "header", "aside", "meta", "link"],
+       ...options
+     };
+   }
+   getType() {
+     return "html";
+   }
+   canHandle(path) {
+     const ext = this.getExtension(path);
+     return ext === "html" || ext === "htm";
+   }
+   async extract() {
+     const { sourcePath } = this.options;
+     const stats = await stat(sourcePath);
+     if (stats.isFile()) {
+       const content = await readFile(sourcePath, "utf-8");
+       return [await this.extractFromString(content, sourcePath)];
+     }
+     if (stats.isDirectory()) {
+       return this.extractFromDirectory(sourcePath);
+     }
+     throw new Error(`Invalid path: ${sourcePath}`);
+   }
+   async extractFromString(html, filePath) {
+     const $ = cheerio.load(html);
+     for (const tag of this.htmlOptions.removeTags || []) {
+       $(tag).remove();
+     }
+     let title = $("title").text().trim();
+     if (!title) {
+       title = $("h1").first().text().trim();
+     }
+     if (!title) {
+       title = basename(filePath, ".html").replace(/-|_/g, " ");
+     }
+     const text = convert($.html(), {
+       wordwrap: false,
+       preserveNewlines: true,
+       selectors: [
+         { selector: "h1", options: { uppercase: false, prefix: "\n# " } },
+         { selector: "h2", options: { uppercase: false, prefix: "\n## " } },
+         { selector: "h3", options: { uppercase: false, prefix: "\n### " } },
+         { selector: "h4", options: { uppercase: false, prefix: "\n#### " } },
+         { selector: "h5", options: { uppercase: false, prefix: "\n##### " } },
+         { selector: "h6", options: { uppercase: false, prefix: "\n###### " } },
+         { selector: "ul", options: { itemPrefix: " - " } },
+         { selector: "ol", options: { itemPrefix: " 1. " } },
+         { selector: "table", format: "dataTable" },
+         { selector: "a", options: { ignoreHref: !this.htmlOptions.includeLinks } },
+         { selector: "img", format: "skip" }
+       ]
+     });
+     const content = this.cleanWhitespace(this.normalizeLineEndings(text));
+     return {
+       path: filePath,
+       title,
+       content,
+       format: "html",
+       metadata: {
+         originalLength: html.length,
+         extractedLength: content.length
+       }
+     };
+   }
+   async extractFromDirectory(dir) {
+     const documents = [];
+     const scanDir = async (currentDir) => {
+       const entries = await readdir(currentDir, { withFileTypes: true });
+       for (const entry of entries) {
+         const fullPath = join(currentDir, entry.name);
+         if (entry.isDirectory()) {
+           await scanDir(fullPath);
+         } else if (entry.isFile() && this.canHandle(entry.name)) {
+           try {
+             const content = await readFile(fullPath, "utf-8");
+             const doc = await this.extractFromString(content, fullPath);
+             if (doc.content.trim()) {
+               documents.push(doc);
+             }
+           } catch (error) {
+             console.warn(`Failed to extract ${fullPath}:`, error);
+           }
+         }
+       }
+     };
+     await scanDir(dir);
+     return documents;
+   }
+ };
+
+ // src/extractors/chm.ts
+ var execAsync = promisify(exec);
+ var CHMExtractor = class extends BaseExtractor {
+   tempDir;
+   constructor(options) {
+     super(options);
+     this.tempDir = options.outputDir || join2(dirname(options.sourcePath), ".chm-extract-temp");
+   }
+   getType() {
+     return "chm";
+   }
+   canHandle(path) {
+     return this.getExtension(path) === "chm";
+   }
+   async extract() {
+     const { sourcePath } = this.options;
+     if (!existsSync(sourcePath)) {
+       throw new Error(`CHM file not found: ${sourcePath}`);
+     }
+     await mkdir(this.tempDir, { recursive: true });
+     try {
+       await this.extractWithSevenZip(sourcePath, this.tempDir);
+       const htmlFiles = await this.findHTMLFiles(this.tempDir);
+       const documents = [];
+       for (const htmlFile of htmlFiles) {
+         try {
+           const content = await readFile2(htmlFile, "utf-8");
+           const htmlExtractor = new HTMLExtractor({ sourcePath: htmlFile });
+           const extracted = await htmlExtractor.extractFromString(content, htmlFile);
+           if (extracted.content.trim()) {
+             documents.push({
+               ...extracted,
+               format: "chm",
+               metadata: {
+                 ...extracted.metadata,
+                 sourceChm: basename2(sourcePath),
+                 originalPath: htmlFile.replace(this.tempDir, "")
+               }
+             });
+           }
+         } catch (error) {
+           console.warn(`Failed to parse HTML file: ${htmlFile}`, error);
+         }
+       }
+       return documents;
+     } finally {
+       if (existsSync(this.tempDir)) {
+         await rm(this.tempDir, { recursive: true, force: true });
+       }
+     }
+   }
+   async extractWithSevenZip(chmPath, outputDir) {
+     try {
+       await execAsync(`7z x "${chmPath}" -o"${outputDir}" -y`, {
+         maxBuffer: 50 * 1024 * 1024
+         // 50MB buffer
+       });
+     } catch (error) {
+       try {
+         await execAsync(`7za x "${chmPath}" -o"${outputDir}" -y`, {
+           maxBuffer: 50 * 1024 * 1024
+         });
+       } catch {
+         throw new Error(
+           `Failed to extract CHM file. Please ensure 7z is installed.
+  - On Ubuntu/Debian: sudo apt-get install p7zip-full
+  - On macOS: brew install p7zip
+  - On Windows: Install 7-Zip from https://7-zip.org/
+ Original error: ${error}`
+         );
+       }
+     }
+   }
+   async findHTMLFiles(dir) {
+     const htmlFiles = [];
+     const scanDir = async (currentDir) => {
+       const entries = await readdir2(currentDir, { withFileTypes: true });
+       for (const entry of entries) {
+         const fullPath = join2(currentDir, entry.name);
+         if (entry.isDirectory()) {
+           if (!entry.name.startsWith("$") && !entry.name.startsWith("#")) {
+             await scanDir(fullPath);
+           }
+         } else if (entry.isFile()) {
+           const ext = this.getExtension(entry.name);
+           if (ext === "html" || ext === "htm") {
+             htmlFiles.push(fullPath);
+           }
+         }
+       }
+     };
+     await scanDir(dir);
+     return htmlFiles.sort();
+   }
+ };
+
+ // src/extractors/markdown.ts
+ import { readFile as readFile3, readdir as readdir3, stat as stat2 } from "fs/promises";
+ import { join as join3, basename as basename3 } from "path";
+ var MarkdownExtractor = class extends BaseExtractor {
+   getType() {
+     return "markdown";
+   }
+   canHandle(path) {
+     const ext = this.getExtension(path);
+     return ext === "md" || ext === "markdown";
+   }
+   async extract() {
+     const { sourcePath } = this.options;
+     const stats = await stat2(sourcePath);
+     if (stats.isFile()) {
+       return [await this.extractFromFile(sourcePath)];
+     }
+     if (stats.isDirectory()) {
+       return this.extractFromDirectory(sourcePath);
+     }
+     throw new Error(`Invalid path: ${sourcePath}`);
+   }
+   async extractFromFile(filePath) {
+     const content = await readFile3(filePath, "utf-8");
+     const normalizedContent = this.normalizeLineEndings(content);
+     let title = this.extractTitle(normalizedContent);
+     if (!title) {
+       title = basename3(filePath, ".md").replace(/-|_/g, " ");
+     }
+     return {
+       path: filePath,
+       title,
+       content: this.cleanWhitespace(normalizedContent),
+       format: "markdown",
+       metadata: {
+         originalLength: content.length
+       }
+     };
+   }
+   extractTitle(content) {
+     const h1Match = content.match(/^#\s+(.+)$/m);
+     if (h1Match) {
+       return h1Match[1].trim();
+     }
+     const underlineMatch = content.match(/^(.+)\n=+\s*$/m);
+     if (underlineMatch) {
+       return underlineMatch[1].trim();
+     }
+     return void 0;
+   }
+   async extractFromDirectory(dir) {
+     const documents = [];
+     const scanDir = async (currentDir) => {
+       const entries = await readdir3(currentDir, { withFileTypes: true });
+       for (const entry of entries) {
+         const fullPath = join3(currentDir, entry.name);
+         if (entry.isDirectory()) {
+           if (!entry.name.startsWith(".") && entry.name !== "node_modules") {
+             await scanDir(fullPath);
+           }
+         } else if (entry.isFile() && this.canHandle(entry.name)) {
+           try {
+             const doc = await this.extractFromFile(fullPath);
+             if (doc.content.trim()) {
+               documents.push(doc);
+             }
+           } catch (error) {
+             console.warn(`Failed to extract ${fullPath}:`, error);
+           }
+         }
+       }
+     };
+     await scanDir(dir);
+     return documents;
+   }
+ };
+
+ // src/extractors/index.ts
+ import { stat as stat3 } from "fs/promises";
+ async function extractDocuments(sourcePath) {
+   const stats = await stat3(sourcePath);
+   const ext = sourcePath.split(".").pop()?.toLowerCase() || "";
+   if (stats.isFile()) {
+     if (ext === "chm") {
+       const extractor = new CHMExtractor({ sourcePath });
+       return extractor.extract();
+     }
+     if (ext === "html" || ext === "htm") {
+       const extractor = new HTMLExtractor({ sourcePath });
+       return extractor.extract();
+     }
+     if (ext === "md" || ext === "markdown") {
+       const extractor = new MarkdownExtractor({ sourcePath });
+       return extractor.extract();
+     }
+     throw new Error(`Unsupported file format: ${ext}`);
+   }
+   if (stats.isDirectory()) {
+     const allDocuments = [];
+     const htmlExtractor = new HTMLExtractor({ sourcePath });
+     const htmlDocs = await htmlExtractor.extract();
+     allDocuments.push(...htmlDocs);
+     const mdExtractor = new MarkdownExtractor({ sourcePath });
+     const mdDocs = await mdExtractor.extract();
+     allDocuments.push(...mdDocs);
+     return allDocuments;
+   }
+   throw new Error(`Invalid path: ${sourcePath}`);
+ }
+ export {
+   BaseExtractor,
+   CHMExtractor,
+   HTMLExtractor,
+   MarkdownExtractor,
+   extractDocuments
+ };
+ //# sourceMappingURL=index.js.map
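The CHM path above shells out to `7z` and falls back to `7za`, so extraction requires a 7-Zip binary on `PATH`. A sketch of driving it directly, with hypothetical paths:

```ts
import { CHMExtractor } from '@chatbot-packages/rag';

const chm = new CHMExtractor({
  sourcePath: './help/manual.chm',
  // Optional: defaults to a .chm-extract-temp directory beside the .chm file.
  outputDir: './tmp/chm-extract',
});

// Unpacks the archive, skips CHM system directories ($..., #...),
// runs each .html/.htm page through HTMLExtractor, tags results with
// format "chm", and deletes the temp directory in a finally block.
const pages = await chm.extract();
```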
@@ -0,0 +1 @@
+ [index.js.map: version 3 source map covering ../../src/extractors/{base,chm,html,markdown,index}.ts; generated mappings and embedded source text omitted]
@@ -0,0 +1,78 @@
+ import { R as RAGService, a as RAGOptions, b as RetrievalOptions, c as RAGResponse } from './types-CjnplPJD.js';
+ export { C as ChunkMetadata, d as ChunkResult, e as ChunkingOptions, f as Citation, D as Document, g as DocumentChunk, E as EmbeddingBackend, h as EmbeddingOptions, i as EmbeddingResult, j as ExtractedDocument, k as RetrievalResult, S as SearchResult, V as VectorStore, l as VectorStoreOptions } from './types-CjnplPJD.js';
+ export { BaseExtractor, CHMExtractor, HTMLExtractor, MarkdownExtractor, extractDocuments } from './extractors/index.js';
+ export { HeaderAwareChunker } from './chunking/index.js';
+ export { LOCAL_MODELS, LocalEmbeddingBackend, OPENAI_MODELS, OpenAIEmbeddingBackend, createEmbeddingBackend } from './embeddings/index.js';
+ export { PostgresVectorStore, PostgresVectorStoreOptions, SQLiteVectorStore, SQLiteVectorStoreOptions, createVectorStore } from './vectorstore/index.js';
+ export { DenseRetriever, HybridRetriever, HybridRetrieverOptions } from './retrieval/index.js';
+ import 'pg';
+
+ /**
+  * RAG Service
+  *
+  * Main service that orchestrates document indexing and question answering.
+  */
+
+ declare class RAG implements RAGService {
+     private options;
+     private vectorStore;
+     private embeddings;
+     private retriever;
+     private llm;
+     private chunker;
+     private systemPrompt;
+     private initialized;
+     constructor(options: RAGOptions);
+     private createLLM;
+     /**
+      * Initialize the RAG service
+      */
+     initialize(): Promise<void>;
+     /**
+      * Index documents from a path
+      */
+     index(path: string, options?: {
+         sourceId?: string;
+     }): Promise<{
+         documentsIndexed: number;
+         chunksCreated: number;
+     }>;
+     /**
+      * Ask a question and get an answer with citations
+      */
+     ask(question: string, options?: RetrievalOptions): Promise<RAGResponse>;
+     /**
+      * Build context text from chunks
+      */
+     private buildContextText;
+     /**
+      * Generate answer using LLM
+      */
+     private generateAnswer;
+     /**
+      * Extract citations from answer text
+      */
+     private extractCitations;
+     /**
+      * Get document count
+      */
+     getDocumentCount(): Promise<number>;
+     /**
+      * Get chunk count
+      */
+     getChunkCount(): Promise<number>;
+     /**
+      * Clear all indexed data
+      */
+     clear(): Promise<void>;
+     /**
+      * Close connections
+      */
+     close(): Promise<void>;
+ }
+ /**
+  * Create a RAG service instance
+  */
+ declare function createRAG(options: RAGOptions): Promise<RAG>;
+
+ export { RAG, RAGOptions, RAGResponse, RAGService, RetrievalOptions, createRAG };
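A sketch of the end-to-end flow, heavily hedged: `RAGOptions` and `RAGResponse` live in the hashed `types-CjnplPJD` module, which this diff does not show, so both are treated as opaque; whether `createRAG` already runs `initialize()` is likewise not visible from these declarations, so the sketch calls it explicitly (the private `initialized` field suggests the call is guarded).

```ts
import { createRAG, type RAGOptions } from '@chatbot-packages/rag';

// RAGOptions' fields are not shown in this diff; treat the value as opaque.
declare const options: RAGOptions;

const rag = await createRAG(options);
await rag.initialize(); // defensive; may be redundant if createRAG initializes

// './docs' and 'manual' are hypothetical.
const { documentsIndexed, chunksCreated } = await rag.index('./docs', { sourceId: 'manual' });
console.log(documentsIndexed, chunksCreated);

const response = await rag.ask('How do I rebuild the index?');
console.log(response); // per the doc comments, an answer with citations

await rag.close();
```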