@chatbot-packages/rag 0.1.0
- package/dist/chunking/index.d.ts +51 -0
- package/dist/chunking/index.js +248 -0
- package/dist/chunking/index.js.map +1 -0
- package/dist/embeddings/index.d.ts +103 -0
- package/dist/embeddings/index.js +195 -0
- package/dist/embeddings/index.js.map +1 -0
- package/dist/extractors/index.d.ts +95 -0
- package/dist/extractors/index.js +343 -0
- package/dist/extractors/index.js.map +1 -0
- package/dist/index.d.ts +78 -0
- package/dist/index.js +1576 -0
- package/dist/index.js.map +1 -0
- package/dist/retrieval/index.d.ts +65 -0
- package/dist/retrieval/index.js +144 -0
- package/dist/retrieval/index.js.map +1 -0
- package/dist/types-CjnplPJD.d.ts +242 -0
- package/dist/vectorstore/index.d.ts +109 -0
- package/dist/vectorstore/index.js +422 -0
- package/dist/vectorstore/index.js.map +1 -0
- package/package.json +83 -0
package/dist/extractors/index.d.ts
ADDED

@@ -0,0 +1,95 @@
import { m as ExtractorOptions, n as ExtractorType, j as ExtractedDocument } from '../types-CjnplPJD.js';

/**
 * Base Extractor
 */

declare abstract class BaseExtractor {
    protected options: ExtractorOptions;
    constructor(options: ExtractorOptions);
    /** Get the extractor type */
    abstract getType(): ExtractorType;
    /** Check if this extractor can handle the given path */
    abstract canHandle(path: string): boolean;
    /** Extract documents from the source */
    abstract extract(): Promise<ExtractedDocument[]>;
    /** Get file extension */
    protected getExtension(path: string): string;
    /** Normalize line endings */
    protected normalizeLineEndings(text: string): string;
    /** Clean excessive whitespace */
    protected cleanWhitespace(text: string): string;
}

/**
 * CHM (Compiled HTML Help) Extractor
 *
 * Extracts HTML content from .chm files using 7z.
 * CHM files are Microsoft's compiled HTML help format.
 */

declare class CHMExtractor extends BaseExtractor {
    private tempDir;
    constructor(options: {
        sourcePath: string;
        outputDir?: string;
    });
    getType(): ExtractorType;
    canHandle(path: string): boolean;
    extract(): Promise<ExtractedDocument[]>;
    private extractWithSevenZip;
    private findHTMLFiles;
}

/**
 * HTML Extractor
 *
 * Extracts text content from HTML files, preserving structure.
 */

interface HTMLExtractorOptions {
    sourcePath: string;
    /** Whether to preserve headings structure */
    preserveHeadings?: boolean;
    /** Whether to include links */
    includeLinks?: boolean;
    /** Tags to remove */
    removeTags?: string[];
}
declare class HTMLExtractor extends BaseExtractor {
    private htmlOptions;
    constructor(options: HTMLExtractorOptions);
    getType(): ExtractorType;
    canHandle(path: string): boolean;
    extract(): Promise<ExtractedDocument[]>;
    extractFromString(html: string, filePath: string): Promise<ExtractedDocument>;
    private extractFromDirectory;
}

/**
 * Markdown Extractor
 *
 * Extracts content from Markdown files.
 */

declare class MarkdownExtractor extends BaseExtractor {
    getType(): ExtractorType;
    canHandle(path: string): boolean;
    extract(): Promise<ExtractedDocument[]>;
    private extractFromFile;
    private extractTitle;
    private extractFromDirectory;
}

/**
 * Document Extractors
 *
 * Extract content from various file formats (CHM, HTML, Markdown)
 */

/**
 * Auto-detect and extract documents from a path
 */
declare function extractDocuments(sourcePath: string): Promise<ExtractedDocument[]>;

export { BaseExtractor, CHMExtractor, HTMLExtractor, type HTMLExtractorOptions, MarkdownExtractor, extractDocuments };
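
Editorial note: the declarations above form a small extraction API. A minimal usage sketch, not part of the package: it assumes the package root re-exports HTMLExtractor (dist/index.d.ts further down confirms the re-export), an ES-module context (top-level await), and a hypothetical ./docs path; the option defaults noted in comments come from the compiled source that follows.

// Hedged usage sketch for the extractor API above (editorial, not package code).
// Assumptions: root import specifier, ES-module context, hypothetical "./docs" path.
import { HTMLExtractor } from '@chatbot-packages/rag';

const extractor = new HTMLExtractor({
  sourcePath: './docs',                    // hypothetical file or directory of .html/.htm
  includeLinks: false,                     // default per the compiled source below
  removeTags: ['script', 'style', 'nav'],  // overrides the default removal list
});
const docs = await extractor.extract();
console.log(docs.map((d) => d.title));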
package/dist/extractors/index.js
ADDED

@@ -0,0 +1,343 @@
// src/extractors/base.ts
var BaseExtractor = class {
  options;
  constructor(options) {
    this.options = options;
  }
  /** Get file extension */
  getExtension(path) {
    const parts = path.split(".");
    return parts.length > 1 ? parts[parts.length - 1].toLowerCase() : "";
  }
  /** Normalize line endings */
  normalizeLineEndings(text) {
    return text.replace(/\r\n/g, "\n").replace(/\r/g, "\n");
  }
  /** Clean excessive whitespace */
  cleanWhitespace(text) {
    return text.replace(/\n{3,}/g, "\n\n").replace(/[ \t]+/g, " ").trim();
  }
};

// src/extractors/chm.ts
import { exec } from "child_process";
import { promisify } from "util";
import { readdir as readdir2, readFile as readFile2, mkdir, rm } from "fs/promises";
import { existsSync } from "fs";
import { join as join2, basename as basename2, dirname } from "path";

// src/extractors/html.ts
import { readFile, readdir, stat } from "fs/promises";
import { join, basename } from "path";
import * as cheerio from "cheerio";
import { convert } from "html-to-text";
var HTMLExtractor = class extends BaseExtractor {
  htmlOptions;
  constructor(options) {
    super(options);
    this.htmlOptions = {
      preserveHeadings: true,
      includeLinks: false,
      removeTags: ["script", "style", "nav", "footer", "header", "aside", "meta", "link"],
      ...options
    };
  }
  getType() {
    return "html";
  }
  canHandle(path) {
    const ext = this.getExtension(path);
    return ext === "html" || ext === "htm";
  }
  async extract() {
    const { sourcePath } = this.options;
    const stats = await stat(sourcePath);
    if (stats.isFile()) {
      const content = await readFile(sourcePath, "utf-8");
      return [await this.extractFromString(content, sourcePath)];
    }
    if (stats.isDirectory()) {
      return this.extractFromDirectory(sourcePath);
    }
    throw new Error(`Invalid path: ${sourcePath}`);
  }
  async extractFromString(html, filePath) {
    const $ = cheerio.load(html);
    for (const tag of this.htmlOptions.removeTags || []) {
      $(tag).remove();
    }
    let title = $("title").text().trim();
    if (!title) {
      title = $("h1").first().text().trim();
    }
    if (!title) {
      title = basename(filePath, ".html").replace(/-|_/g, " ");
    }
    const text = convert($.html(), {
      wordwrap: false,
      preserveNewlines: true,
      selectors: [
        { selector: "h1", options: { uppercase: false, prefix: "\n# " } },
        { selector: "h2", options: { uppercase: false, prefix: "\n## " } },
        { selector: "h3", options: { uppercase: false, prefix: "\n### " } },
        { selector: "h4", options: { uppercase: false, prefix: "\n#### " } },
        { selector: "h5", options: { uppercase: false, prefix: "\n##### " } },
        { selector: "h6", options: { uppercase: false, prefix: "\n###### " } },
        { selector: "ul", options: { itemPrefix: " - " } },
        { selector: "ol", options: { itemPrefix: " 1. " } },
        { selector: "table", format: "dataTable" },
        { selector: "a", options: { ignoreHref: !this.htmlOptions.includeLinks } },
        { selector: "img", format: "skip" }
      ]
    });
    const content = this.cleanWhitespace(this.normalizeLineEndings(text));
    return {
      path: filePath,
      title,
      content,
      format: "html",
      metadata: {
        originalLength: html.length,
        extractedLength: content.length
      }
    };
  }
  async extractFromDirectory(dir) {
    const documents = [];
    const scanDir = async (currentDir) => {
      const entries = await readdir(currentDir, { withFileTypes: true });
      for (const entry of entries) {
        const fullPath = join(currentDir, entry.name);
        if (entry.isDirectory()) {
          await scanDir(fullPath);
        } else if (entry.isFile() && this.canHandle(entry.name)) {
          try {
            const content = await readFile(fullPath, "utf-8");
            const doc = await this.extractFromString(content, fullPath);
            if (doc.content.trim()) {
              documents.push(doc);
            }
          } catch (error) {
            console.warn(`Failed to extract ${fullPath}:`, error);
          }
        }
      }
    };
    await scanDir(dir);
    return documents;
  }
};

// src/extractors/chm.ts
var execAsync = promisify(exec);
var CHMExtractor = class extends BaseExtractor {
  tempDir;
  constructor(options) {
    super(options);
    this.tempDir = options.outputDir || join2(dirname(options.sourcePath), ".chm-extract-temp");
  }
  getType() {
    return "chm";
  }
  canHandle(path) {
    return this.getExtension(path) === "chm";
  }
  async extract() {
    const { sourcePath } = this.options;
    if (!existsSync(sourcePath)) {
      throw new Error(`CHM file not found: ${sourcePath}`);
    }
    await mkdir(this.tempDir, { recursive: true });
    try {
      await this.extractWithSevenZip(sourcePath, this.tempDir);
      const htmlFiles = await this.findHTMLFiles(this.tempDir);
      const documents = [];
      for (const htmlFile of htmlFiles) {
        try {
          const content = await readFile2(htmlFile, "utf-8");
          const htmlExtractor = new HTMLExtractor({ sourcePath: htmlFile });
          const extracted = await htmlExtractor.extractFromString(content, htmlFile);
          if (extracted.content.trim()) {
            documents.push({
              ...extracted,
              format: "chm",
              metadata: {
                ...extracted.metadata,
                sourceChm: basename2(sourcePath),
                originalPath: htmlFile.replace(this.tempDir, "")
              }
            });
          }
        } catch (error) {
          console.warn(`Failed to parse HTML file: ${htmlFile}`, error);
        }
      }
      return documents;
    } finally {
      if (existsSync(this.tempDir)) {
        await rm(this.tempDir, { recursive: true, force: true });
      }
    }
  }
  async extractWithSevenZip(chmPath, outputDir) {
    try {
      await execAsync(`7z x "${chmPath}" -o"${outputDir}" -y`, {
        maxBuffer: 50 * 1024 * 1024
        // 50MB buffer
      });
    } catch (error) {
      try {
        await execAsync(`7za x "${chmPath}" -o"${outputDir}" -y`, {
          maxBuffer: 50 * 1024 * 1024
        });
      } catch {
        throw new Error(
          `Failed to extract CHM file. Please ensure 7z is installed.
 - On Ubuntu/Debian: sudo apt-get install p7zip-full
 - On macOS: brew install p7zip
 - On Windows: Install 7-Zip from https://7-zip.org/
Original error: ${error}`
        );
      }
    }
  }
  async findHTMLFiles(dir) {
    const htmlFiles = [];
    const scanDir = async (currentDir) => {
      const entries = await readdir2(currentDir, { withFileTypes: true });
      for (const entry of entries) {
        const fullPath = join2(currentDir, entry.name);
        if (entry.isDirectory()) {
          if (!entry.name.startsWith("$") && !entry.name.startsWith("#")) {
            await scanDir(fullPath);
          }
        } else if (entry.isFile()) {
          const ext = this.getExtension(entry.name);
          if (ext === "html" || ext === "htm") {
            htmlFiles.push(fullPath);
          }
        }
      }
    };
    await scanDir(dir);
    return htmlFiles.sort();
  }
};

// src/extractors/markdown.ts
import { readFile as readFile3, readdir as readdir3, stat as stat2 } from "fs/promises";
import { join as join3, basename as basename3 } from "path";
var MarkdownExtractor = class extends BaseExtractor {
  getType() {
    return "markdown";
  }
  canHandle(path) {
    const ext = this.getExtension(path);
    return ext === "md" || ext === "markdown";
  }
  async extract() {
    const { sourcePath } = this.options;
    const stats = await stat2(sourcePath);
    if (stats.isFile()) {
      return [await this.extractFromFile(sourcePath)];
    }
    if (stats.isDirectory()) {
      return this.extractFromDirectory(sourcePath);
    }
    throw new Error(`Invalid path: ${sourcePath}`);
  }
  async extractFromFile(filePath) {
    const content = await readFile3(filePath, "utf-8");
    const normalizedContent = this.normalizeLineEndings(content);
    let title = this.extractTitle(normalizedContent);
    if (!title) {
      title = basename3(filePath, ".md").replace(/-|_/g, " ");
    }
    return {
      path: filePath,
      title,
      content: this.cleanWhitespace(normalizedContent),
      format: "markdown",
      metadata: {
        originalLength: content.length
      }
    };
  }
  extractTitle(content) {
    const h1Match = content.match(/^#\s+(.+)$/m);
    if (h1Match) {
      return h1Match[1].trim();
    }
    const underlineMatch = content.match(/^(.+)\n=+\s*$/m);
    if (underlineMatch) {
      return underlineMatch[1].trim();
    }
    return void 0;
  }
  async extractFromDirectory(dir) {
    const documents = [];
    const scanDir = async (currentDir) => {
      const entries = await readdir3(currentDir, { withFileTypes: true });
      for (const entry of entries) {
        const fullPath = join3(currentDir, entry.name);
        if (entry.isDirectory()) {
          if (!entry.name.startsWith(".") && entry.name !== "node_modules") {
            await scanDir(fullPath);
          }
        } else if (entry.isFile() && this.canHandle(entry.name)) {
          try {
            const doc = await this.extractFromFile(fullPath);
            if (doc.content.trim()) {
              documents.push(doc);
            }
          } catch (error) {
            console.warn(`Failed to extract ${fullPath}:`, error);
          }
        }
      }
    };
    await scanDir(dir);
    return documents;
  }
};

// src/extractors/index.ts
import { stat as stat3 } from "fs/promises";
async function extractDocuments(sourcePath) {
  const stats = await stat3(sourcePath);
  const ext = sourcePath.split(".").pop()?.toLowerCase() || "";
  if (stats.isFile()) {
    if (ext === "chm") {
      const extractor = new CHMExtractor({ sourcePath });
      return extractor.extract();
    }
    if (ext === "html" || ext === "htm") {
      const extractor = new HTMLExtractor({ sourcePath });
      return extractor.extract();
    }
    if (ext === "md" || ext === "markdown") {
      const extractor = new MarkdownExtractor({ sourcePath });
      return extractor.extract();
    }
    throw new Error(`Unsupported file format: ${ext}`);
  }
  if (stats.isDirectory()) {
    const allDocuments = [];
    const htmlExtractor = new HTMLExtractor({ sourcePath });
    const htmlDocs = await htmlExtractor.extract();
    allDocuments.push(...htmlDocs);
    const mdExtractor = new MarkdownExtractor({ sourcePath });
    const mdDocs = await mdExtractor.extract();
    allDocuments.push(...mdDocs);
    return allDocuments;
  }
  throw new Error(`Invalid path: ${sourcePath}`);
}
export {
  BaseExtractor,
  CHMExtractor,
  HTMLExtractor,
  MarkdownExtractor,
  extractDocuments
};
//# sourceMappingURL=index.js.map
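
Editorial note: the compiled module above shows the behavior behind the declarations. extractDocuments dispatches on the file extension, and CHMExtractor shells out to `7z x <file> -o<dir> -y` (retrying with `7za`) before running every recovered .html/.htm page through HTMLExtractor and deleting the temp directory. A hedged sketch of that path, assuming a hypothetical ./manuals/product-help.chm and a 7-Zip binary on PATH:

// Editorial sketch, not package code. Requires 7z or 7za on PATH
// (p7zip-full on Ubuntu/Debian, p7zip via Homebrew, 7-Zip on Windows).
import { extractDocuments } from '@chatbot-packages/rag';

const docs = await extractDocuments('./manuals/product-help.chm'); // hypothetical path
for (const doc of docs) {
  // format is "chm"; metadata carries sourceChm plus the page's path inside the archive
  console.log(doc.title, doc.metadata.originalPath);
}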
package/dist/extractors/index.js.map
ADDED

@@ -0,0 +1 @@
{"version":3,"sources":["../../src/extractors/base.ts","../../src/extractors/chm.ts","../../src/extractors/html.ts","../../src/extractors/markdown.ts","../../src/extractors/index.ts"],"sourcesContent":["/**\n * Base Extractor\n */\n\nimport type { ExtractedDocument, ExtractorOptions, ExtractorType } from '../types.js';\n\nexport abstract class BaseExtractor {\n protected options: ExtractorOptions;\n\n constructor(options: ExtractorOptions) {\n this.options = options;\n }\n\n /** Get the extractor type */\n abstract getType(): ExtractorType;\n\n /** Check if this extractor can handle the given path */\n abstract canHandle(path: string): boolean;\n\n /** Extract documents from the source */\n abstract extract(): Promise<ExtractedDocument[]>;\n\n /** Get file extension */\n protected getExtension(path: string): string {\n const parts = path.split('.');\n return parts.length > 1 ? parts[parts.length - 1].toLowerCase() : '';\n }\n\n /** Normalize line endings */\n protected normalizeLineEndings(text: string): string {\n return text.replace(/\\r\\n/g, '\\n').replace(/\\r/g, '\\n');\n }\n\n /** Clean excessive whitespace */\n protected cleanWhitespace(text: string): string {\n return text\n .replace(/\\n{3,}/g, '\\n\\n')\n .replace(/[ \\t]+/g, ' ')\n .trim();\n }\n}\n","/**\n * CHM (Compiled HTML Help) Extractor\n *\n * Extracts HTML content from .chm files using 7z.\n * CHM files are Microsoft's compiled HTML help format.\n */\n\nimport { exec } from 'child_process';\nimport { promisify } from 'util';\nimport { readdir, readFile, mkdir, rm } from 'fs/promises';\nimport { existsSync } from 'fs';\nimport { join, basename, dirname } from 'path';\nimport { BaseExtractor } from './base.js';\nimport { HTMLExtractor } from './html.js';\nimport type { ExtractedDocument, ExtractorType } from '../types.js';\n\nconst execAsync = promisify(exec);\n\nexport class CHMExtractor extends BaseExtractor {\n private tempDir: string;\n\n constructor(options: { sourcePath: string; outputDir?: string }) {\n super(options);\n this.tempDir = options.outputDir || join(dirname(options.sourcePath), '.chm-extract-temp');\n }\n\n getType(): ExtractorType {\n return 'chm';\n }\n\n canHandle(path: string): boolean {\n return this.getExtension(path) === 'chm';\n }\n\n async extract(): Promise<ExtractedDocument[]> {\n const { sourcePath } = this.options;\n\n if (!existsSync(sourcePath)) {\n throw new Error(`CHM file not found: ${sourcePath}`);\n }\n\n // Create temp directory\n await mkdir(this.tempDir, { recursive: true });\n\n try {\n // Extract CHM using 7z\n await this.extractWithSevenZip(sourcePath, this.tempDir);\n\n // Find and parse all HTML files\n const htmlFiles = await this.findHTMLFiles(this.tempDir);\n const documents: ExtractedDocument[] = [];\n\n for (const htmlFile of htmlFiles) {\n try {\n const content = await readFile(htmlFile, 'utf-8');\n const htmlExtractor = new HTMLExtractor({ sourcePath: htmlFile });\n const extracted = await htmlExtractor.extractFromString(content, htmlFile);\n\n if (extracted.content.trim()) {\n documents.push({\n ...extracted,\n format: 'chm',\n metadata: {\n ...extracted.metadata,\n sourceChm: basename(sourcePath),\n originalPath: htmlFile.replace(this.tempDir, ''),\n },\n });\n }\n } catch (error) {\n // Skip files that can't be parsed\n console.warn(`Failed to parse HTML file: ${htmlFile}`, error);\n }\n }\n\n return documents;\n } finally {\n // Cleanup temp directory\n if (existsSync(this.tempDir)) {\n await rm(this.tempDir, { recursive: true, force: true });\n }\n }\n }\n\n 
private async extractWithSevenZip(chmPath: string, outputDir: string): Promise<void> {\n try {\n // Try 7z first\n await execAsync(`7z x \"${chmPath}\" -o\"${outputDir}\" -y`, {\n maxBuffer: 50 * 1024 * 1024, // 50MB buffer\n });\n } catch (error) {\n // Try 7za as fallback (common on some systems)\n try {\n await execAsync(`7za x \"${chmPath}\" -o\"${outputDir}\" -y`, {\n maxBuffer: 50 * 1024 * 1024,\n });\n } catch {\n throw new Error(\n `Failed to extract CHM file. Please ensure 7z is installed.\\n` +\n ` - On Ubuntu/Debian: sudo apt-get install p7zip-full\\n` +\n ` - On macOS: brew install p7zip\\n` +\n ` - On Windows: Install 7-Zip from https://7-zip.org/\\n` +\n `Original error: ${error}`\n );\n }\n }\n }\n\n private async findHTMLFiles(dir: string): Promise<string[]> {\n const htmlFiles: string[] = [];\n\n const scanDir = async (currentDir: string) => {\n const entries = await readdir(currentDir, { withFileTypes: true });\n\n for (const entry of entries) {\n const fullPath = join(currentDir, entry.name);\n\n if (entry.isDirectory()) {\n // Skip system directories\n if (!entry.name.startsWith('$') && !entry.name.startsWith('#')) {\n await scanDir(fullPath);\n }\n } else if (entry.isFile()) {\n const ext = this.getExtension(entry.name);\n if (ext === 'html' || ext === 'htm') {\n htmlFiles.push(fullPath);\n }\n }\n }\n };\n\n await scanDir(dir);\n return htmlFiles.sort();\n }\n}\n","/**\n * HTML Extractor\n *\n * Extracts text content from HTML files, preserving structure.\n */\n\nimport { readFile, readdir, stat } from 'fs/promises';\nimport { join, basename } from 'path';\nimport * as cheerio from 'cheerio';\nimport { convert } from 'html-to-text';\nimport { BaseExtractor } from './base.js';\nimport type { ExtractedDocument, ExtractorType } from '../types.js';\n\nexport interface HTMLExtractorOptions {\n sourcePath: string;\n /** Whether to preserve headings structure */\n preserveHeadings?: boolean;\n /** Whether to include links */\n includeLinks?: boolean;\n /** Tags to remove */\n removeTags?: string[];\n}\n\nexport class HTMLExtractor extends BaseExtractor {\n private htmlOptions: HTMLExtractorOptions;\n\n constructor(options: HTMLExtractorOptions) {\n super(options);\n this.htmlOptions = {\n preserveHeadings: true,\n includeLinks: false,\n removeTags: ['script', 'style', 'nav', 'footer', 'header', 'aside', 'meta', 'link'],\n ...options,\n };\n }\n\n getType(): ExtractorType {\n return 'html';\n }\n\n canHandle(path: string): boolean {\n const ext = this.getExtension(path);\n return ext === 'html' || ext === 'htm';\n }\n\n async extract(): Promise<ExtractedDocument[]> {\n const { sourcePath } = this.options;\n const stats = await stat(sourcePath);\n\n if (stats.isFile()) {\n const content = await readFile(sourcePath, 'utf-8');\n return [await this.extractFromString(content, sourcePath)];\n }\n\n if (stats.isDirectory()) {\n return this.extractFromDirectory(sourcePath);\n }\n\n throw new Error(`Invalid path: ${sourcePath}`);\n }\n\n async extractFromString(html: string, filePath: string): Promise<ExtractedDocument> {\n const $ = cheerio.load(html);\n\n // Remove unwanted tags\n for (const tag of this.htmlOptions.removeTags || []) {\n $(tag).remove();\n }\n\n // Extract title\n let title = $('title').text().trim();\n if (!title) {\n title = $('h1').first().text().trim();\n }\n if (!title) {\n title = basename(filePath, '.html').replace(/-|_/g, ' ');\n }\n\n // Convert to text with structure preservation\n const text = convert($.html(), {\n wordwrap: false,\n preserveNewlines: 
true,\n selectors: [\n { selector: 'h1', options: { uppercase: false, prefix: '\\n# ' } },\n { selector: 'h2', options: { uppercase: false, prefix: '\\n## ' } },\n { selector: 'h3', options: { uppercase: false, prefix: '\\n### ' } },\n { selector: 'h4', options: { uppercase: false, prefix: '\\n#### ' } },\n { selector: 'h5', options: { uppercase: false, prefix: '\\n##### ' } },\n { selector: 'h6', options: { uppercase: false, prefix: '\\n###### ' } },\n { selector: 'ul', options: { itemPrefix: ' - ' } },\n { selector: 'ol', options: { itemPrefix: ' 1. ' } },\n { selector: 'table', format: 'dataTable' },\n { selector: 'a', options: { ignoreHref: !this.htmlOptions.includeLinks } },\n { selector: 'img', format: 'skip' },\n ],\n });\n\n const content = this.cleanWhitespace(this.normalizeLineEndings(text));\n\n return {\n path: filePath,\n title,\n content,\n format: 'html',\n metadata: {\n originalLength: html.length,\n extractedLength: content.length,\n },\n };\n }\n\n private async extractFromDirectory(dir: string): Promise<ExtractedDocument[]> {\n const documents: ExtractedDocument[] = [];\n\n const scanDir = async (currentDir: string) => {\n const entries = await readdir(currentDir, { withFileTypes: true });\n\n for (const entry of entries) {\n const fullPath = join(currentDir, entry.name);\n\n if (entry.isDirectory()) {\n await scanDir(fullPath);\n } else if (entry.isFile() && this.canHandle(entry.name)) {\n try {\n const content = await readFile(fullPath, 'utf-8');\n const doc = await this.extractFromString(content, fullPath);\n if (doc.content.trim()) {\n documents.push(doc);\n }\n } catch (error) {\n console.warn(`Failed to extract ${fullPath}:`, error);\n }\n }\n }\n };\n\n await scanDir(dir);\n return documents;\n }\n}\n","/**\n * Markdown Extractor\n *\n * Extracts content from Markdown files.\n */\n\nimport { readFile, readdir, stat } from 'fs/promises';\nimport { join, basename } from 'path';\nimport { BaseExtractor } from './base.js';\nimport type { ExtractedDocument, ExtractorType } from '../types.js';\n\nexport class MarkdownExtractor extends BaseExtractor {\n getType(): ExtractorType {\n return 'markdown';\n }\n\n canHandle(path: string): boolean {\n const ext = this.getExtension(path);\n return ext === 'md' || ext === 'markdown';\n }\n\n async extract(): Promise<ExtractedDocument[]> {\n const { sourcePath } = this.options;\n const stats = await stat(sourcePath);\n\n if (stats.isFile()) {\n return [await this.extractFromFile(sourcePath)];\n }\n\n if (stats.isDirectory()) {\n return this.extractFromDirectory(sourcePath);\n }\n\n throw new Error(`Invalid path: ${sourcePath}`);\n }\n\n private async extractFromFile(filePath: string): Promise<ExtractedDocument> {\n const content = await readFile(filePath, 'utf-8');\n const normalizedContent = this.normalizeLineEndings(content);\n\n // Extract title from first H1 or filename\n let title = this.extractTitle(normalizedContent);\n if (!title) {\n title = basename(filePath, '.md').replace(/-|_/g, ' ');\n }\n\n return {\n path: filePath,\n title,\n content: this.cleanWhitespace(normalizedContent),\n format: 'markdown',\n metadata: {\n originalLength: content.length,\n },\n };\n }\n\n private extractTitle(content: string): string | undefined {\n // Look for first H1 heading\n const h1Match = content.match(/^#\\s+(.+)$/m);\n if (h1Match) {\n return h1Match[1].trim();\n }\n\n // Look for underline-style H1\n const underlineMatch = content.match(/^(.+)\\n=+\\s*$/m);\n if (underlineMatch) {\n return underlineMatch[1].trim();\n }\n\n return 
undefined;\n }\n\n private async extractFromDirectory(dir: string): Promise<ExtractedDocument[]> {\n const documents: ExtractedDocument[] = [];\n\n const scanDir = async (currentDir: string) => {\n const entries = await readdir(currentDir, { withFileTypes: true });\n\n for (const entry of entries) {\n const fullPath = join(currentDir, entry.name);\n\n if (entry.isDirectory()) {\n // Skip hidden and common non-content directories\n if (!entry.name.startsWith('.') && entry.name !== 'node_modules') {\n await scanDir(fullPath);\n }\n } else if (entry.isFile() && this.canHandle(entry.name)) {\n try {\n const doc = await this.extractFromFile(fullPath);\n if (doc.content.trim()) {\n documents.push(doc);\n }\n } catch (error) {\n console.warn(`Failed to extract ${fullPath}:`, error);\n }\n }\n }\n };\n\n await scanDir(dir);\n return documents;\n }\n}\n","/**\n * Document Extractors\n *\n * Extract content from various file formats (CHM, HTML, Markdown)\n */\n\nexport { BaseExtractor } from './base.js';\nexport { CHMExtractor } from './chm.js';\nexport { HTMLExtractor, type HTMLExtractorOptions } from './html.js';\nexport { MarkdownExtractor } from './markdown.js';\n\nimport { stat } from 'fs/promises';\nimport { CHMExtractor } from './chm.js';\nimport { HTMLExtractor } from './html.js';\nimport { MarkdownExtractor } from './markdown.js';\nimport type { ExtractedDocument } from '../types.js';\n\n/**\n * Auto-detect and extract documents from a path\n */\nexport async function extractDocuments(sourcePath: string): Promise<ExtractedDocument[]> {\n const stats = await stat(sourcePath);\n const ext = sourcePath.split('.').pop()?.toLowerCase() || '';\n\n // Single file\n if (stats.isFile()) {\n if (ext === 'chm') {\n const extractor = new CHMExtractor({ sourcePath });\n return extractor.extract();\n }\n if (ext === 'html' || ext === 'htm') {\n const extractor = new HTMLExtractor({ sourcePath });\n return extractor.extract();\n }\n if (ext === 'md' || ext === 'markdown') {\n const extractor = new MarkdownExtractor({ sourcePath });\n return extractor.extract();\n }\n throw new Error(`Unsupported file format: ${ext}`);\n }\n\n // Directory - extract all supported files\n if (stats.isDirectory()) {\n const allDocuments: ExtractedDocument[] = [];\n\n // Extract HTML files\n const htmlExtractor = new HTMLExtractor({ sourcePath });\n const htmlDocs = await htmlExtractor.extract();\n allDocuments.push(...htmlDocs);\n\n // Extract Markdown files\n const mdExtractor = new MarkdownExtractor({ sourcePath });\n const mdDocs = await mdExtractor.extract();\n allDocuments.push(...mdDocs);\n\n return allDocuments;\n }\n\n throw new Error(`Invalid path: 
${sourcePath}`);\n}\n"],"mappings":";AAMO,IAAe,gBAAf,MAA6B;AAAA,EACxB;AAAA,EAEV,YAAY,SAA2B;AACrC,SAAK,UAAU;AAAA,EACjB;AAAA;AAAA,EAYU,aAAa,MAAsB;AAC3C,UAAM,QAAQ,KAAK,MAAM,GAAG;AAC5B,WAAO,MAAM,SAAS,IAAI,MAAM,MAAM,SAAS,CAAC,EAAE,YAAY,IAAI;AAAA,EACpE;AAAA;AAAA,EAGU,qBAAqB,MAAsB;AACnD,WAAO,KAAK,QAAQ,SAAS,IAAI,EAAE,QAAQ,OAAO,IAAI;AAAA,EACxD;AAAA;AAAA,EAGU,gBAAgB,MAAsB;AAC9C,WAAO,KACJ,QAAQ,WAAW,MAAM,EACzB,QAAQ,WAAW,GAAG,EACtB,KAAK;AAAA,EACV;AACF;;;ACjCA,SAAS,YAAY;AACrB,SAAS,iBAAiB;AAC1B,SAAS,WAAAA,UAAS,YAAAC,WAAU,OAAO,UAAU;AAC7C,SAAS,kBAAkB;AAC3B,SAAS,QAAAC,OAAM,YAAAC,WAAU,eAAe;;;ACLxC,SAAS,UAAU,SAAS,YAAY;AACxC,SAAS,MAAM,gBAAgB;AAC/B,YAAY,aAAa;AACzB,SAAS,eAAe;AAcjB,IAAM,gBAAN,cAA4B,cAAc;AAAA,EACvC;AAAA,EAER,YAAY,SAA+B;AACzC,UAAM,OAAO;AACb,SAAK,cAAc;AAAA,MACjB,kBAAkB;AAAA,MAClB,cAAc;AAAA,MACd,YAAY,CAAC,UAAU,SAAS,OAAO,UAAU,UAAU,SAAS,QAAQ,MAAM;AAAA,MAClF,GAAG;AAAA,IACL;AAAA,EACF;AAAA,EAEA,UAAyB;AACvB,WAAO;AAAA,EACT;AAAA,EAEA,UAAU,MAAuB;AAC/B,UAAM,MAAM,KAAK,aAAa,IAAI;AAClC,WAAO,QAAQ,UAAU,QAAQ;AAAA,EACnC;AAAA,EAEA,MAAM,UAAwC;AAC5C,UAAM,EAAE,WAAW,IAAI,KAAK;AAC5B,UAAM,QAAQ,MAAM,KAAK,UAAU;AAEnC,QAAI,MAAM,OAAO,GAAG;AAClB,YAAM,UAAU,MAAM,SAAS,YAAY,OAAO;AAClD,aAAO,CAAC,MAAM,KAAK,kBAAkB,SAAS,UAAU,CAAC;AAAA,IAC3D;AAEA,QAAI,MAAM,YAAY,GAAG;AACvB,aAAO,KAAK,qBAAqB,UAAU;AAAA,IAC7C;AAEA,UAAM,IAAI,MAAM,iBAAiB,UAAU,EAAE;AAAA,EAC/C;AAAA,EAEA,MAAM,kBAAkB,MAAc,UAA8C;AAClF,UAAM,IAAY,aAAK,IAAI;AAG3B,eAAW,OAAO,KAAK,YAAY,cAAc,CAAC,GAAG;AACnD,QAAE,GAAG,EAAE,OAAO;AAAA,IAChB;AAGA,QAAI,QAAQ,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK;AACnC,QAAI,CAAC,OAAO;AACV,cAAQ,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK;AAAA,IACtC;AACA,QAAI,CAAC,OAAO;AACV,cAAQ,SAAS,UAAU,OAAO,EAAE,QAAQ,QAAQ,GAAG;AAAA,IACzD;AAGA,UAAM,OAAO,QAAQ,EAAE,KAAK,GAAG;AAAA,MAC7B,UAAU;AAAA,MACV,kBAAkB;AAAA,MAClB,WAAW;AAAA,QACT,EAAE,UAAU,MAAM,SAAS,EAAE,WAAW,OAAO,QAAQ,OAAO,EAAE;AAAA,QAChE,EAAE,UAAU,MAAM,SAAS,EAAE,WAAW,OAAO,QAAQ,QAAQ,EAAE;AAAA,QACjE,EAAE,UAAU,MAAM,SAAS,EAAE,WAAW,OAAO,QAAQ,SAAS,EAAE;AAAA,QAClE,EAAE,UAAU,MAAM,SAAS,EAAE,WAAW,OAAO,QAAQ,UAAU,EAAE;AAAA,QACnE,EAAE,UAAU,MAAM,SAAS,EAAE,WAAW,OAAO,QAAQ,WAAW,EAAE;AAAA,QACpE,EAAE,UAAU,MAAM,SAAS,EAAE,WAAW,OAAO,QAAQ,YAAY,EAAE;AAAA,QACrE,EAAE,UAAU,MAAM,SAAS,EAAE,YAAY,OAAO,EAAE;AAAA,QAClD,EAAE,UAAU,MAAM,SAAS,EAAE,YAAY,QAAQ,EAAE;AAAA,QACnD,EAAE,UAAU,SAAS,QAAQ,YAAY;AAAA,QACzC,EAAE,UAAU,KAAK,SAAS,EAAE,YAAY,CAAC,KAAK,YAAY,aAAa,EAAE;AAAA,QACzE,EAAE,UAAU,OAAO,QAAQ,OAAO;AAAA,MACpC;AAAA,IACF,CAAC;AAED,UAAM,UAAU,KAAK,gBAAgB,KAAK,qBAAqB,IAAI,CAAC;AAEpE,WAAO;AAAA,MACL,MAAM;AAAA,MACN;AAAA,MACA;AAAA,MACA,QAAQ;AAAA,MACR,UAAU;AAAA,QACR,gBAAgB,KAAK;AAAA,QACrB,iBAAiB,QAAQ;AAAA,MAC3B;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAc,qBAAqB,KAA2C;AAC5E,UAAM,YAAiC,CAAC;AAExC,UAAM,UAAU,OAAO,eAAuB;AAC5C,YAAM,UAAU,MAAM,QAAQ,YAAY,EAAE,eAAe,KAAK,CAAC;AAEjE,iBAAW,SAAS,SAAS;AAC3B,cAAM,WAAW,KAAK,YAAY,MAAM,IAAI;AAE5C,YAAI,MAAM,YAAY,GAAG;AACvB,gBAAM,QAAQ,QAAQ;AAAA,QACxB,WAAW,MAAM,OAAO,KAAK,KAAK,UAAU,MAAM,IAAI,GAAG;AACvD,cAAI;AACF,kBAAM,UAAU,MAAM,SAAS,UAAU,OAAO;AAChD,kBAAM,MAAM,MAAM,KAAK,kBAAkB,SAAS,QAAQ;AAC1D,gBAAI,IAAI,QAAQ,KAAK,GAAG;AACtB,wBAAU,KAAK,GAAG;AAAA,YACpB;AAAA,UACF,SAAS,OAAO;AACd,oBAAQ,KAAK,qBAAqB,QAAQ,KAAK,KAAK;AAAA,UACtD;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,UAAM,QAAQ,GAAG;AACjB,WAAO;AAAA,EACT;AACF;;;AD3HA,IAAM,YAAY,UAAU,IAAI;AAEzB,IAAM,eAAN,cAA2B,cAAc;AAAA,EACtC;AAAA,EAER,YAAY,SAAqD;AAC/D,UAAM,OAAO;AACb,SAAK,UAAU,QAAQ,aAAaC,MAAK,QAAQ,QAAQ,UAAU,GAAG,mBAAmB;AAAA,EAC3F;AAAA,EAEA,UAAyB;AACvB,WAAO;AAAA,EACT;AAAA,EAEA,UAAU,MAAuB;AAC/B,WAAO,KAAK,aAAa,IAAI,MAAM;AAAA,EACrC;AAAA,EAEA,MAAM,UAAwC;AAC5C,UAAM,EAAE,WAAW,IAAI,KAAK;AAE5B,QAAI,CAAC,WAAW,UAAU,GAAG;AAC3B,YAAM,IAAI,MAAM,uBAAuB,U
AAU,EAAE;AAAA,IACrD;AAGA,UAAM,MAAM,KAAK,SAAS,EAAE,WAAW,KAAK,CAAC;AAE7C,QAAI;AAEF,YAAM,KAAK,oBAAoB,YAAY,KAAK,OAAO;AAGvD,YAAM,YAAY,MAAM,KAAK,cAAc,KAAK,OAAO;AACvD,YAAM,YAAiC,CAAC;AAExC,iBAAW,YAAY,WAAW;AAChC,YAAI;AACF,gBAAM,UAAU,MAAMC,UAAS,UAAU,OAAO;AAChD,gBAAM,gBAAgB,IAAI,cAAc,EAAE,YAAY,SAAS,CAAC;AAChE,gBAAM,YAAY,MAAM,cAAc,kBAAkB,SAAS,QAAQ;AAEzE,cAAI,UAAU,QAAQ,KAAK,GAAG;AAC5B,sBAAU,KAAK;AAAA,cACb,GAAG;AAAA,cACH,QAAQ;AAAA,cACR,UAAU;AAAA,gBACR,GAAG,UAAU;AAAA,gBACb,WAAWC,UAAS,UAAU;AAAA,gBAC9B,cAAc,SAAS,QAAQ,KAAK,SAAS,EAAE;AAAA,cACjD;AAAA,YACF,CAAC;AAAA,UACH;AAAA,QACF,SAAS,OAAO;AAEd,kBAAQ,KAAK,8BAA8B,QAAQ,IAAI,KAAK;AAAA,QAC9D;AAAA,MACF;AAEA,aAAO;AAAA,IACT,UAAE;AAEA,UAAI,WAAW,KAAK,OAAO,GAAG;AAC5B,cAAM,GAAG,KAAK,SAAS,EAAE,WAAW,MAAM,OAAO,KAAK,CAAC;AAAA,MACzD;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAc,oBAAoB,SAAiB,WAAkC;AACnF,QAAI;AAEF,YAAM,UAAU,SAAS,OAAO,QAAQ,SAAS,QAAQ;AAAA,QACvD,WAAW,KAAK,OAAO;AAAA;AAAA,MACzB,CAAC;AAAA,IACH,SAAS,OAAO;AAEd,UAAI;AACF,cAAM,UAAU,UAAU,OAAO,QAAQ,SAAS,QAAQ;AAAA,UACxD,WAAW,KAAK,OAAO;AAAA,QACzB,CAAC;AAAA,MACH,QAAQ;AACN,cAAM,IAAI;AAAA,UACR;AAAA;AAAA;AAAA;AAAA,kBAIqB,KAAK;AAAA,QAC5B;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAc,cAAc,KAAgC;AAC1D,UAAM,YAAsB,CAAC;AAE7B,UAAM,UAAU,OAAO,eAAuB;AAC5C,YAAM,UAAU,MAAMC,SAAQ,YAAY,EAAE,eAAe,KAAK,CAAC;AAEjE,iBAAW,SAAS,SAAS;AAC3B,cAAM,WAAWH,MAAK,YAAY,MAAM,IAAI;AAE5C,YAAI,MAAM,YAAY,GAAG;AAEvB,cAAI,CAAC,MAAM,KAAK,WAAW,GAAG,KAAK,CAAC,MAAM,KAAK,WAAW,GAAG,GAAG;AAC9D,kBAAM,QAAQ,QAAQ;AAAA,UACxB;AAAA,QACF,WAAW,MAAM,OAAO,GAAG;AACzB,gBAAM,MAAM,KAAK,aAAa,MAAM,IAAI;AACxC,cAAI,QAAQ,UAAU,QAAQ,OAAO;AACnC,sBAAU,KAAK,QAAQ;AAAA,UACzB;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,UAAM,QAAQ,GAAG;AACjB,WAAO,UAAU,KAAK;AAAA,EACxB;AACF;;;AEhIA,SAAS,YAAAI,WAAU,WAAAC,UAAS,QAAAC,aAAY;AACxC,SAAS,QAAAC,OAAM,YAAAC,iBAAgB;AAIxB,IAAM,oBAAN,cAAgC,cAAc;AAAA,EACnD,UAAyB;AACvB,WAAO;AAAA,EACT;AAAA,EAEA,UAAU,MAAuB;AAC/B,UAAM,MAAM,KAAK,aAAa,IAAI;AAClC,WAAO,QAAQ,QAAQ,QAAQ;AAAA,EACjC;AAAA,EAEA,MAAM,UAAwC;AAC5C,UAAM,EAAE,WAAW,IAAI,KAAK;AAC5B,UAAM,QAAQ,MAAMC,MAAK,UAAU;AAEnC,QAAI,MAAM,OAAO,GAAG;AAClB,aAAO,CAAC,MAAM,KAAK,gBAAgB,UAAU,CAAC;AAAA,IAChD;AAEA,QAAI,MAAM,YAAY,GAAG;AACvB,aAAO,KAAK,qBAAqB,UAAU;AAAA,IAC7C;AAEA,UAAM,IAAI,MAAM,iBAAiB,UAAU,EAAE;AAAA,EAC/C;AAAA,EAEA,MAAc,gBAAgB,UAA8C;AAC1E,UAAM,UAAU,MAAMC,UAAS,UAAU,OAAO;AAChD,UAAM,oBAAoB,KAAK,qBAAqB,OAAO;AAG3D,QAAI,QAAQ,KAAK,aAAa,iBAAiB;AAC/C,QAAI,CAAC,OAAO;AACV,cAAQC,UAAS,UAAU,KAAK,EAAE,QAAQ,QAAQ,GAAG;AAAA,IACvD;AAEA,WAAO;AAAA,MACL,MAAM;AAAA,MACN;AAAA,MACA,SAAS,KAAK,gBAAgB,iBAAiB;AAAA,MAC/C,QAAQ;AAAA,MACR,UAAU;AAAA,QACR,gBAAgB,QAAQ;AAAA,MAC1B;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,aAAa,SAAqC;AAExD,UAAM,UAAU,QAAQ,MAAM,aAAa;AAC3C,QAAI,SAAS;AACX,aAAO,QAAQ,CAAC,EAAE,KAAK;AAAA,IACzB;AAGA,UAAM,iBAAiB,QAAQ,MAAM,gBAAgB;AACrD,QAAI,gBAAgB;AAClB,aAAO,eAAe,CAAC,EAAE,KAAK;AAAA,IAChC;AAEA,WAAO;AAAA,EACT;AAAA,EAEA,MAAc,qBAAqB,KAA2C;AAC5E,UAAM,YAAiC,CAAC;AAExC,UAAM,UAAU,OAAO,eAAuB;AAC5C,YAAM,UAAU,MAAMC,SAAQ,YAAY,EAAE,eAAe,KAAK,CAAC;AAEjE,iBAAW,SAAS,SAAS;AAC3B,cAAM,WAAWC,MAAK,YAAY,MAAM,IAAI;AAE5C,YAAI,MAAM,YAAY,GAAG;AAEvB,cAAI,CAAC,MAAM,KAAK,WAAW,GAAG,KAAK,MAAM,SAAS,gBAAgB;AAChE,kBAAM,QAAQ,QAAQ;AAAA,UACxB;AAAA,QACF,WAAW,MAAM,OAAO,KAAK,KAAK,UAAU,MAAM,IAAI,GAAG;AACvD,cAAI;AACF,kBAAM,MAAM,MAAM,KAAK,gBAAgB,QAAQ;AAC/C,gBAAI,IAAI,QAAQ,KAAK,GAAG;AACtB,wBAAU,KAAK,GAAG;AAAA,YACpB;AAAA,UACF,SAAS,OAAO;AACd,oBAAQ,KAAK,qBAAqB,QAAQ,KAAK,KAAK;AAAA,UACtD;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,UAAM,QAAQ,GAAG;AACjB,WAAO;AAAA,EACT;AACF;;;AC5FA,SAAS,QAAAC,aAAY;AASrB,eAAsB,iBAAiB,YAAkD;AACvF,QAAM,QAAQ,MAAMC,MAAK,UAAU;AACnC,QAAM,MAAM,WAAW,MAAM,GAAG,EAAE,IAAI,GAAG,YAAY,KAAK;AAG1D,MAAI,MAAM,OAAO,GAAG;AACl
B,QAAI,QAAQ,OAAO;AACjB,YAAM,YAAY,IAAI,aAAa,EAAE,WAAW,CAAC;AACjD,aAAO,UAAU,QAAQ;AAAA,IAC3B;AACA,QAAI,QAAQ,UAAU,QAAQ,OAAO;AACnC,YAAM,YAAY,IAAI,cAAc,EAAE,WAAW,CAAC;AAClD,aAAO,UAAU,QAAQ;AAAA,IAC3B;AACA,QAAI,QAAQ,QAAQ,QAAQ,YAAY;AACtC,YAAM,YAAY,IAAI,kBAAkB,EAAE,WAAW,CAAC;AACtD,aAAO,UAAU,QAAQ;AAAA,IAC3B;AACA,UAAM,IAAI,MAAM,4BAA4B,GAAG,EAAE;AAAA,EACnD;AAGA,MAAI,MAAM,YAAY,GAAG;AACvB,UAAM,eAAoC,CAAC;AAG3C,UAAM,gBAAgB,IAAI,cAAc,EAAE,WAAW,CAAC;AACtD,UAAM,WAAW,MAAM,cAAc,QAAQ;AAC7C,iBAAa,KAAK,GAAG,QAAQ;AAG7B,UAAM,cAAc,IAAI,kBAAkB,EAAE,WAAW,CAAC;AACxD,UAAM,SAAS,MAAM,YAAY,QAAQ;AACzC,iBAAa,KAAK,GAAG,MAAM;AAE3B,WAAO;AAAA,EACT;AAEA,QAAM,IAAI,MAAM,iBAAiB,UAAU,EAAE;AAC/C;","names":["readdir","readFile","join","basename","join","readFile","basename","readdir","readFile","readdir","stat","join","basename","stat","readFile","basename","readdir","join","stat","stat"]}

package/dist/index.d.ts
ADDED

@@ -0,0 +1,78 @@
import { R as RAGService, a as RAGOptions, b as RetrievalOptions, c as RAGResponse } from './types-CjnplPJD.js';
export { C as ChunkMetadata, d as ChunkResult, e as ChunkingOptions, f as Citation, D as Document, g as DocumentChunk, E as EmbeddingBackend, h as EmbeddingOptions, i as EmbeddingResult, j as ExtractedDocument, k as RetrievalResult, S as SearchResult, V as VectorStore, l as VectorStoreOptions } from './types-CjnplPJD.js';
export { BaseExtractor, CHMExtractor, HTMLExtractor, MarkdownExtractor, extractDocuments } from './extractors/index.js';
export { HeaderAwareChunker } from './chunking/index.js';
export { LOCAL_MODELS, LocalEmbeddingBackend, OPENAI_MODELS, OpenAIEmbeddingBackend, createEmbeddingBackend } from './embeddings/index.js';
export { PostgresVectorStore, PostgresVectorStoreOptions, SQLiteVectorStore, SQLiteVectorStoreOptions, createVectorStore } from './vectorstore/index.js';
export { DenseRetriever, HybridRetriever, HybridRetrieverOptions } from './retrieval/index.js';
import 'pg';

/**
 * RAG Service
 *
 * Main service that orchestrates document indexing and question answering.
 */

declare class RAG implements RAGService {
    private options;
    private vectorStore;
    private embeddings;
    private retriever;
    private llm;
    private chunker;
    private systemPrompt;
    private initialized;
    constructor(options: RAGOptions);
    private createLLM;
    /**
     * Initialize the RAG service
     */
    initialize(): Promise<void>;
    /**
     * Index documents from a path
     */
    index(path: string, options?: {
        sourceId?: string;
    }): Promise<{
        documentsIndexed: number;
        chunksCreated: number;
    }>;
    /**
     * Ask a question and get an answer with citations
     */
    ask(question: string, options?: RetrievalOptions): Promise<RAGResponse>;
    /**
     * Build context text from chunks
     */
    private buildContextText;
    /**
     * Generate answer using LLM
     */
    private generateAnswer;
    /**
     * Extract citations from answer text
     */
    private extractCitations;
    /**
     * Get document count
     */
    getDocumentCount(): Promise<number>;
    /**
     * Get chunk count
     */
    getChunkCount(): Promise<number>;
    /**
     * Clear all indexed data
     */
    clear(): Promise<void>;
    /**
     * Close connections
     */
    close(): Promise<void>;
}
/**
 * Create a RAG service instance
 */
declare function createRAG(options: RAGOptions): Promise<RAG>;

export { RAG, RAGOptions, RAGResponse, RAGService, RetrievalOptions, createRAG };
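
Editorial note: the RAG facade above ties the extractors, chunker, embeddings, vector store, and retriever together. A hedged end-to-end sketch: the RAGOptions shape lives in types-CjnplPJD.d.ts, which this diff does not include, so the options object is left as a labeled placeholder rather than guessed; the method signatures follow the declarations above.

// Editorial sketch, not package code. RAGOptions fields are not visible in this
// diff, so the placeholder cast stands in for whatever configuration is required.
import { createRAG, type RAGOptions } from '@chatbot-packages/rag';

const rag = await createRAG({ /* RAGOptions: shape not shown in this diff */ } as RAGOptions);

const { documentsIndexed, chunksCreated } = await rag.index('./docs', { sourceId: 'docs-v1' }); // hypothetical path and id
console.log(`indexed ${documentsIndexed} documents as ${chunksCreated} chunks`);

const response = await rag.ask('How do I install the product?'); // RAGResponse with answer and citations
console.log(response);
await rag.close();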