@dcyfr/ai-rag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +588 -0
- package/dist/index.d.ts +8 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +10 -0
- package/dist/index.js.map +1 -0
- package/dist/loaders/html/index.d.ts +26 -0
- package/dist/loaders/html/index.d.ts.map +1 -0
- package/dist/loaders/html/index.js +106 -0
- package/dist/loaders/html/index.js.map +1 -0
- package/dist/loaders/index.d.ts +8 -0
- package/dist/loaders/index.d.ts.map +1 -0
- package/dist/loaders/index.js +7 -0
- package/dist/loaders/index.js.map +1 -0
- package/dist/loaders/markdown/index.d.ts +33 -0
- package/dist/loaders/markdown/index.d.ts.map +1 -0
- package/dist/loaders/markdown/index.js +150 -0
- package/dist/loaders/markdown/index.js.map +1 -0
- package/dist/loaders/text/index.d.ts +21 -0
- package/dist/loaders/text/index.d.ts.map +1 -0
- package/dist/loaders/text/index.js +78 -0
- package/dist/loaders/text/index.js.map +1 -0
- package/dist/pipeline/embedding/generator.d.ts +24 -0
- package/dist/pipeline/embedding/generator.d.ts.map +1 -0
- package/dist/pipeline/embedding/generator.js +42 -0
- package/dist/pipeline/embedding/generator.js.map +1 -0
- package/dist/pipeline/embedding/index.d.ts +8 -0
- package/dist/pipeline/embedding/index.d.ts.map +1 -0
- package/dist/pipeline/embedding/index.js +6 -0
- package/dist/pipeline/embedding/index.js.map +1 -0
- package/dist/pipeline/embedding/pipeline.d.ts +26 -0
- package/dist/pipeline/embedding/pipeline.d.ts.map +1 -0
- package/dist/pipeline/embedding/pipeline.js +59 -0
- package/dist/pipeline/embedding/pipeline.js.map +1 -0
- package/dist/pipeline/index.d.ts +7 -0
- package/dist/pipeline/index.d.ts.map +1 -0
- package/dist/pipeline/index.js +7 -0
- package/dist/pipeline/index.js.map +1 -0
- package/dist/pipeline/ingestion/index.d.ts +5 -0
- package/dist/pipeline/ingestion/index.d.ts.map +1 -0
- package/dist/pipeline/ingestion/index.js +5 -0
- package/dist/pipeline/ingestion/index.js.map +1 -0
- package/dist/pipeline/ingestion/pipeline.d.ts +27 -0
- package/dist/pipeline/ingestion/pipeline.d.ts.map +1 -0
- package/dist/pipeline/ingestion/pipeline.js +118 -0
- package/dist/pipeline/ingestion/pipeline.js.map +1 -0
- package/dist/pipeline/retrieval/index.d.ts +5 -0
- package/dist/pipeline/retrieval/index.d.ts.map +1 -0
- package/dist/pipeline/retrieval/index.js +5 -0
- package/dist/pipeline/retrieval/index.js.map +1 -0
- package/dist/pipeline/retrieval/pipeline.d.ts +29 -0
- package/dist/pipeline/retrieval/pipeline.d.ts.map +1 -0
- package/dist/pipeline/retrieval/pipeline.js +109 -0
- package/dist/pipeline/retrieval/pipeline.js.map +1 -0
- package/dist/stores/index.d.ts +5 -0
- package/dist/stores/index.d.ts.map +1 -0
- package/dist/stores/index.js +5 -0
- package/dist/stores/index.js.map +1 -0
- package/dist/stores/vector/in-memory.d.ts +52 -0
- package/dist/stores/vector/in-memory.d.ts.map +1 -0
- package/dist/stores/vector/in-memory.js +172 -0
- package/dist/stores/vector/in-memory.js.map +1 -0
- package/dist/stores/vector/index.d.ts +6 -0
- package/dist/stores/vector/index.d.ts.map +1 -0
- package/dist/stores/vector/index.js +5 -0
- package/dist/stores/vector/index.js.map +1 -0
- package/dist/types/index.d.ts +259 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +5 -0
- package/dist/types/index.js.map +1 -0
- package/docs/DOCUMENT_LOADERS.md +621 -0
- package/docs/EMBEDDINGS.md +733 -0
- package/docs/PIPELINES.md +771 -0
- package/docs/VECTOR_STORES.md +754 -0
- package/package.json +100 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML document loader
|
|
3
|
+
* Handles HTML files (.html, .htm)
|
|
4
|
+
*/
|
|
5
|
+
import { promises as fs } from 'node:fs';
|
|
6
|
+
import { basename } from 'node:path';
|
|
7
|
+
/**
|
|
8
|
+
* Load HTML documents
|
|
9
|
+
*/
|
|
10
|
+
export class HTMLLoader {
    /** File extensions this loader handles. */
    supportedExtensions = ['.html', '.htm'];

    /**
     * Read an HTML file and turn it into one or more documents.
     *
     * @param {string} source - Path of the HTML file to read.
     * @param {object} [config] - Optional loader settings. The fields read here are
     *   `preserveFormatting` (keep the raw markup as content), `metadata` (merged
     *   last, so it can override the generated fields) and `chunkSize` /
     *   `chunkOverlap`.
     * @returns {Promise<object[]>} A single document, or its chunks when
     *   `config.chunkSize` is set.
     * @throws {Error} When the file cannot be read or chunked; the underlying
     *   error is attached as `cause`.
     */
    async load(source, config) {
        try {
            // Reading and stat-ing are independent, so run them concurrently.
            const [content, stats] = await Promise.all([
                fs.readFile(source, 'utf-8'),
                fs.stat(source),
            ]);
            const title = this.extractTitle(content, basename(source));
            const textContent = config?.preserveFormatting
                ? content
                : this.extractText(content);
            const document = {
                id: this.generateId(source),
                content: textContent,
                metadata: {
                    source,
                    type: 'html',
                    createdAt: stats.birthtime,
                    updatedAt: stats.mtime,
                    title,
                    ...config?.metadata,
                },
            };
            // Apply chunking if configured
            if (config?.chunkSize) {
                return this.chunkDocument(document, config);
            }
            return [document];
        }
        catch (error) {
            // Keep the original error reachable for callers that need code/stack.
            throw new Error(`Failed to load HTML file ${source}: ${error}`, { cause: error });
        }
    }

    /**
     * Pull the page title out of the first <title> element.
     * Attributes on the tag and line breaks inside it are tolerated; runs of
     * whitespace are collapsed to a single space.
     *
     * @param {string} html - Raw HTML markup.
     * @param {string} fallback - Value returned when no non-empty title exists.
     * @returns {string} The normalized title, or `fallback`.
     */
    extractTitle(html, fallback) {
        const match = html.match(/<title\b[^>]*>([\s\S]*?)<\/title>/i);
        const title = match ? match[1].replace(/\s+/g, ' ').trim() : '';
        return title || fallback;
    }

    /**
     * Extract text content from HTML.
     * This is a simple regex-based implementation - for production use a proper
     * HTML parser.
     *
     * @param {string} html - Raw HTML markup.
     * @returns {string} Text with scripts, styles, comments and tags removed,
     *   the basic named entities decoded and whitespace collapsed.
     */
    extractText(html) {
        const entities = { nbsp: ' ', lt: '<', gt: '>', amp: '&', quot: '"', '#39': "'" };
        return html
            // Remove script and style elements together with their content
            .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
            .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
            // Remove HTML comments
            .replace(/<!--[\s\S]*?-->/g, '')
            // Remove the remaining tags; a space keeps adjacent words apart
            .replace(/<[^>]+>/g, ' ')
            // Decode the basic entities in ONE pass so that an escaped entity
            // (an ampersand entity followed by "quot;") is decoded once, not twice.
            .replace(/&(nbsp|lt|gt|amp|quot|#39);/g, (_match, name) => entities[name])
            // Collapse every whitespace run (including newlines) to one space
            .replace(/\s+/g, ' ')
            .trim();
    }

    /**
     * Split a document into fixed-size, overlapping chunks.
     *
     * @param {object} document - Document with `id`, `content` and `metadata`.
     * @param {object} config - `chunkSize` (default 1000) and `chunkOverlap`
     *   (default 200). The overlap is clamped to `[0, chunkSize - 1]` so the
     *   window always advances.
     * @returns {object[]} Chunk documents carrying `chunkIndex`, `startChar`,
     *   `endChar` and `parentDocumentId` in their metadata.
     * @throws {RangeError} When `chunkSize` is not a positive number.
     */
    chunkDocument(document, config) {
        const chunkSize = config.chunkSize ?? 1000;
        if (!(chunkSize > 0)) {
            // A zero, negative or NaN size would never advance past the content.
            throw new RangeError(`chunkSize must be a positive number, received ${chunkSize}`);
        }
        // Same clamp TextLoader applies; without it an overlap >= chunkSize loops forever.
        const chunkOverlap = Math.max(0, Math.min(config.chunkOverlap ?? 200, chunkSize - 1));
        const step = chunkSize - chunkOverlap;
        const content = document.content;
        const chunks = [];
        for (let start = 0, chunkIndex = 0; start < content.length; start += step, chunkIndex++) {
            const end = Math.min(start + chunkSize, content.length);
            chunks.push({
                id: `${document.id}-chunk-${chunkIndex}`,
                content: content.slice(start, end),
                metadata: {
                    ...document.metadata,
                    chunkIndex,
                    startChar: start,
                    endChar: end,
                    parentDocumentId: document.id,
                },
            });
        }
        return chunks;
    }

    /**
     * Generate a document ID from the source path.
     * The whole path is encoded: truncating the base64 text to 16 characters
     * kept only the first 12 bytes of the path, so every file below the same
     * long directory prefix received the same ID. For paths of up to 12 bytes
     * the result is unchanged.
     *
     * @param {string} source - Source path of the document.
     * @returns {string} `html-` followed by the base64 encoding of `source`.
     */
    generateId(source) {
        return `html-${Buffer.from(source).toString('base64')}`;
    }
}
|
|
106
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/loaders/html/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,EAAE,QAAQ,IAAI,EAAE,EAAE,MAAM,SAAS,CAAC;AACzC,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAErC;;GAEG;AACH,MAAM,OAAO,UAAU;IACrB,mBAAmB,GAAG,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;IAExC,KAAK,CAAC,IAAI,CAAC,MAAc,EAAE,MAAqB;QAC9C,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;YACnD,MAAM,KAAK,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YAEpC,4CAA4C;YAC5C,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,wBAAwB,CAAC,CAAC;YAC3D,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;YAE5D,uBAAuB;YACvB,MAAM,WAAW,GAAG,MAAM,EAAE,kBAAkB;gBAC5C,CAAC,CAAC,OAAO;gBACT,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;YAE9B,MAAM,QAAQ,GAAa;gBACzB,EAAE,EAAE,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC;gBAC3B,OAAO,EAAE,WAAW;gBACpB,QAAQ,EAAE;oBACR,MAAM;oBACN,IAAI,EAAE,MAAM;oBACZ,SAAS,EAAE,KAAK,CAAC,SAAS;oBAC1B,SAAS,EAAE,KAAK,CAAC,KAAK;oBACtB,KAAK;oBACL,GAAG,MAAM,EAAE,QAAQ;iBACpB;aACF,CAAC;YAEF,+BAA+B;YAC/B,IAAI,MAAM,EAAE,SAAS,EAAE,CAAC;gBACtB,OAAO,IAAI,CAAC,aAAa,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;YAC9C,CAAC;YAED,OAAO,CAAC,QAAQ,CAAC,CAAC;QACpB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,KAAK,CAAC,4BAA4B,MAAM,KAAK,KAAK,EAAE,CAAC,CAAC;QAClE,CAAC;IACH,CAAC;IAED;;;OAGG;IACK,WAAW,CAAC,IAAY;QAC9B,OAAO,IAAI;YACT,+BAA+B;aAC9B,OAAO,CAAC,qDAAqD,EAAE,EAAE,CAAC;aAClE,OAAO,CAAC,kDAAkD,EAAE,EAAE,CAAC;YAChE,uBAAuB;aACtB,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC;YAChC,mBAAmB;aAClB,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC;YACzB,+BAA+B;aAC9B,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;aACvB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;aACrB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;aACrB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;aACtB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;aACvB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;YACvB,sBAAsB;aACrB,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;aACpB,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;aAC1B,IAAI,EAAE,CAAC;IACZ,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,QAAkB,EAAE,MAAoB;QAC5D,MAAM,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,IAAI,CAAC;QAC3C,MAAM,YAAY,
GAAG,MAAM,CAAC,YAAY,IAAI,GAAG,CAAC;QAChD,MAAM,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC;QACjC,MAAM,MAAM,GAAe,EAAE,CAAC;QAE9B,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,UAAU,GAAG,CAAC,CAAC;QAEnB,OAAO,KAAK,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;YAC9B,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,SAAS,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;YACxD,MAAM,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;YAE/C,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,GAAG,QAAQ,CAAC,EAAE,UAAU,UAAU,EAAE;gBACxC,OAAO,EAAE,YAAY;gBACrB,QAAQ,EAAE;oBACR,GAAG,QAAQ,CAAC,QAAQ;oBACpB,UAAU;oBACV,SAAS,EAAE,KAAK;oBAChB,OAAO,EAAE,GAAG;oBACZ,gBAAgB,EAAE,QAAQ,CAAC,EAAE;iBAC9B;aACF,CAAC,CAAC;YAEH,KAAK,IAAI,SAAS,GAAG,YAAY,CAAC;YAClC,UAAU,EAAE,CAAC;QACf,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,UAAU,CAAC,MAAc;QAC/B,OAAO,QAAQ,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC;IACvE,CAAC;CACF"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document loaders
|
|
3
|
+
*/
|
|
4
|
+
export { TextLoader } from './text/index.js';
|
|
5
|
+
export { MarkdownLoader } from './markdown/index.js';
|
|
6
|
+
export { HTMLLoader } from './html/index.js';
|
|
7
|
+
export type { Document, DocumentLoader, LoaderConfig } from '../types/index.js';
|
|
8
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/loaders/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAC7C,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAG7C,YAAY,EAAE,QAAQ,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/loaders/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAC7C,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC"}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown document loader
|
|
3
|
+
* Handles Markdown files (.md, .markdown)
|
|
4
|
+
*/
|
|
5
|
+
import type { Document, DocumentLoader, LoaderConfig } from '../../types/index.js';
|
|
6
|
+
/**
|
|
7
|
+
* Load Markdown documents
|
|
8
|
+
*/
|
|
9
|
+
export declare class MarkdownLoader implements DocumentLoader {
|
|
10
|
+
/** File extensions this loader declares support for. */
supportedExtensions: string[];
|
|
11
|
+
/**
 * Read the Markdown file at `source` and resolve to the resulting documents.
 * @param source Path of the file to read.
 * @param config Optional loader settings (formatting, extra metadata, chunking).
 * @returns A promise of the loaded document(s).
 */
load(source: string, config?: LoaderConfig): Promise<Document[]>;
|
|
12
|
+
/**
|
|
13
|
+
* Remove Markdown formatting to leave plain text: fenced code blocks, images and
* horizontal rules are dropped; inline-code, emphasis, link and heading markers are stripped
|
|
14
|
+
*/
|
|
15
|
+
private removeFormatting;
|
|
16
|
+
/**
|
|
17
|
+
* Chunk a document by sections (headings); sections longer than the configured
* chunk size are split further by size
|
|
18
|
+
*/
|
|
19
|
+
private chunkBySection;
|
|
20
|
+
/**
|
|
21
|
+
* Split content into titled sections at heading lines; text before the first
* heading is collected under the title 'Introduction'
|
|
22
|
+
*/
|
|
23
|
+
private splitByHeadings;
|
|
24
|
+
/**
|
|
25
|
+
* Chunk text by size with overlap (defaults: size 1000, overlap 200)
|
|
26
|
+
*/
|
|
27
|
+
private chunkText;
|
|
28
|
+
/**
|
|
29
|
+
* Generate a document ID string from the source path
|
|
30
|
+
*/
|
|
31
|
+
private generateId;
|
|
32
|
+
}
|
|
33
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/loaders/markdown/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AAInF;;GAEG;AACH,qBAAa,cAAe,YAAW,cAAc;IACnD,mBAAmB,WAAwB;IAErC,IAAI,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;IAiCtE;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAsBxB;;OAEG;IACH,OAAO,CAAC,cAAc;IAsCtB;;OAEG;IACH,OAAO,CAAC,eAAe;IAwBvB;;OAEG;IACH,OAAO,CAAC,SAAS;IAejB;;OAEG;IACH,OAAO,CAAC,UAAU;CAGnB"}
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown document loader
|
|
3
|
+
* Handles Markdown files (.md, .markdown)
|
|
4
|
+
*/
|
|
5
|
+
import { promises as fs } from 'node:fs';
|
|
6
|
+
import { basename } from 'node:path';
|
|
7
|
+
/**
|
|
8
|
+
* Load Markdown documents
|
|
9
|
+
*/
|
|
10
|
+
export class MarkdownLoader {
    /** File extensions this loader handles. */
    supportedExtensions = ['.md', '.markdown'];

    /**
     * Read a Markdown file and turn it into one or more documents.
     *
     * @param {string} source - Path of the Markdown file to read.
     * @param {object} [config] - Optional loader settings. The fields read here are
     *   `preserveFormatting` (keep the raw Markdown as content), `metadata`
     *   (merged last, so it can override the generated fields) and
     *   `chunkSize` / `chunkOverlap`.
     * @returns {Promise<object[]>} A single document, or one chunk per section
     *   (large sections are split further) when `config.chunkSize` is set.
     * @throws {Error} When the file cannot be read or chunked; the underlying
     *   error is attached as `cause`.
     */
    async load(source, config) {
        try {
            // Reading and stat-ing are independent, so run them concurrently.
            const [content, stats] = await Promise.all([
                fs.readFile(source, 'utf-8'),
                fs.stat(source),
            ]);
            // Extract title from first level-1 heading if present
            const titleMatch = content.match(/^#\s+(.+)$/m);
            const title = titleMatch ? titleMatch[1] : basename(source);
            const document = {
                id: this.generateId(source),
                content: config?.preserveFormatting ? content : this.removeFormatting(content),
                metadata: {
                    source,
                    type: 'markdown',
                    createdAt: stats.birthtime,
                    updatedAt: stats.mtime,
                    title,
                    ...config?.metadata,
                },
            };
            // Apply chunking if configured
            if (config?.chunkSize) {
                // Hand over the raw Markdown: document.content may already have its
                // heading markers stripped, which would leave nothing to split on.
                return this.chunkBySection(document, config, content);
            }
            return [document];
        }
        catch (error) {
            // Keep the original error reachable for callers that need code/stack.
            throw new Error(`Failed to load markdown file ${source}: ${error}`, { cause: error });
        }
    }

    /**
     * Remove Markdown formatting to leave plain text.
     * Order matters: images are removed before links (an image is a link
     * prefixed with "!"), and horizontal rules before emphasis (a rule made of
     * asterisks or underscores would otherwise be half-eaten by the emphasis
     * patterns).
     *
     * @param {string} content - Markdown text.
     * @returns {string} Text without the Markdown markers, trimmed.
     */
    removeFormatting(content) {
        return content
            // Remove fenced code blocks including their content
            .replace(/```[\s\S]*?```/g, '')
            // Remove inline code markers, keep the code text
            .replace(/`([^`]+)`/g, '$1')
            // Remove images
            .replace(/!\[([^\]]*)\]\([^)]+\)/g, '')
            // Remove links but keep text (empty text is left by a linked image)
            .replace(/\[([^\]]*)\]\([^)]+\)/g, '$1')
            // Remove horizontal rules
            .replace(/^[-*_]{3,}$/gm, '')
            // Remove heading markers
            .replace(/^#{1,6}\s+/gm, '')
            // Remove bold/italic markers
            .replace(/(\*\*|__)(.*?)\1/g, '$2')
            .replace(/(\*|_)(.*?)\1/g, '$2')
            // Clean up extra blank lines
            .replace(/\n{3,}/g, '\n\n')
            .trim();
    }

    /**
     * Chunk a document by sections (headings), splitting oversized sections
     * further by size.
     *
     * @param {object} document - Document with `id`, `content` and `metadata`.
     * @param {object} config - Loader settings; `chunkSize`, `chunkOverlap` and
     *   `preserveFormatting` are read.
     * @param {string} [rawContent] - Unprocessed Markdown to split on. When
     *   given and `config.preserveFormatting` is not set, each section is
     *   cleaned with `removeFormatting` after splitting. When omitted,
     *   `document.content` is split as-is.
     * @returns {object[]} Chunk documents; `metadata.chunkIndex` is the running
     *   position of the chunk in the returned array.
     */
    chunkBySection(document, config, rawContent) {
        const cleanSections = rawContent !== undefined && !config.preserveFormatting;
        const sections = this.splitByHeadings(rawContent ?? document.content)
            .map(({ title, content }) => ({
                title,
                content: cleanSections ? this.removeFormatting(content) : content,
            }))
            // Cleaning can empty a section (e.g. one holding only a code block).
            .filter((section) => section.content.trim());
        const chunks = [];
        sections.forEach((section, index) => {
            const sharedMetadata = {
                ...document.metadata,
                section: section.title,
                parentDocumentId: document.id,
            };
            // Further chunk if section is too large
            if (config.chunkSize && section.content.length > config.chunkSize) {
                this.chunkText(section.content, config).forEach((chunk, subIndex) => {
                    chunks.push({
                        id: `${document.id}-${index}-${subIndex}`,
                        content: chunk,
                        metadata: { ...sharedMetadata, chunkIndex: chunks.length },
                    });
                });
                return;
            }
            chunks.push({
                id: `${document.id}-${index}`,
                content: section.content,
                // Running count, not the section index: after a split section the
                // section index would repeat an already-used chunkIndex.
                metadata: { ...sharedMetadata, chunkIndex: chunks.length },
            });
        });
        return chunks;
    }

    /**
     * Split content into titled sections at heading lines.
     * Text before the first heading goes into a section titled 'Introduction'.
     * Heading-like lines inside fenced code blocks are treated as content, and
     * both LF and CRLF line endings are accepted (output uses LF).
     *
     * @param {string} content - Markdown text.
     * @returns {{title: string, content: string}[]} Sections with non-blank content.
     */
    splitByHeadings(content) {
        const sections = [];
        let currentSection = { title: 'Introduction', content: '' };
        let insideFence = false;
        const flush = () => {
            if (currentSection.content.trim()) {
                sections.push(currentSection);
            }
        };
        for (const line of content.split(/\r?\n/)) {
            const trimmed = line.trimStart();
            if (trimmed.startsWith('```')) {
                // An odd number of markers opens or closes a fence; an even number
                // (a fence opened and closed on one line) leaves the state alone.
                const markers = trimmed.match(/```/g).length;
                if (markers % 2 === 1) {
                    insideFence = !insideFence;
                }
            }
            const headingMatch = insideFence ? null : line.match(/^(#{1,6})\s+(.+)$/);
            if (headingMatch) {
                flush();
                currentSection = { title: headingMatch[2], content: '' };
            }
            else {
                currentSection.content += line + '\n';
            }
        }
        flush();
        return sections;
    }

    /**
     * Chunk text by size with overlap.
     *
     * @param {string} text - Text to split.
     * @param {object} config - `chunkSize` (default 1000) and `chunkOverlap`
     *   (default 200). The overlap is clamped to `[0, chunkSize - 1]` so the
     *   window always advances.
     * @returns {string[]} The chunk strings in order.
     * @throws {RangeError} When `chunkSize` is not a positive number.
     */
    chunkText(text, config) {
        const chunkSize = config.chunkSize ?? 1000;
        if (!(chunkSize > 0)) {
            // A zero, negative or NaN size would never advance past the text.
            throw new RangeError(`chunkSize must be a positive number, received ${chunkSize}`);
        }
        // Same clamp TextLoader applies; without it an overlap >= chunkSize loops forever.
        const chunkOverlap = Math.max(0, Math.min(config.chunkOverlap ?? 200, chunkSize - 1));
        const step = chunkSize - chunkOverlap;
        const chunks = [];
        for (let start = 0; start < text.length; start += step) {
            chunks.push(text.slice(start, Math.min(start + chunkSize, text.length)));
        }
        return chunks;
    }

    /**
     * Generate a document ID from the source path.
     * The whole path is encoded: truncating the base64 text to 16 characters
     * kept only the first 12 bytes of the path, so every file below the same
     * long directory prefix received the same ID. For paths of up to 12 bytes
     * the result is unchanged.
     *
     * @param {string} source - Source path of the document.
     * @returns {string} `md-` followed by the base64 encoding of `source`.
     */
    generateId(source) {
        return `md-${Buffer.from(source).toString('base64')}`;
    }
}
|
|
150
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/loaders/markdown/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,EAAE,QAAQ,IAAI,EAAE,EAAE,MAAM,SAAS,CAAC;AACzC,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAErC;;GAEG;AACH,MAAM,OAAO,cAAc;IACzB,mBAAmB,GAAG,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;IAE3C,KAAK,CAAC,IAAI,CAAC,MAAc,EAAE,MAAqB;QAC9C,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;YACnD,MAAM,KAAK,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YAEpC,8CAA8C;YAC9C,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;YAChD,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;YAE5D,MAAM,QAAQ,GAAa;gBACzB,EAAE,EAAE,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC;gBAC3B,OAAO,EAAE,MAAM,EAAE,kBAAkB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC;gBAC9E,QAAQ,EAAE;oBACR,MAAM;oBACN,IAAI,EAAE,UAAU;oBAChB,SAAS,EAAE,KAAK,CAAC,SAAS;oBAC1B,SAAS,EAAE,KAAK,CAAC,KAAK;oBACtB,KAAK;oBACL,GAAG,MAAM,EAAE,QAAQ;iBACpB;aACF,CAAC;YAEF,+BAA+B;YAC/B,IAAI,MAAM,EAAE,SAAS,EAAE,CAAC;gBACtB,OAAO,IAAI,CAAC,cAAc,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;YAC/C,CAAC;YAED,OAAO,CAAC,QAAQ,CAAC,CAAC;QACpB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,KAAK,CAAC,gCAAgC,MAAM,KAAK,KAAK,EAAE,CAAC,CAAC;QACtE,CAAC;IACH,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,OAAe;QACtC,OAAO,OAAO;YACZ,qBAAqB;aACpB,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC;YAC/B,qBAAqB;aACpB,OAAO,CAAC,YAAY,EAAE,IAAI,CAAC;YAC5B,qBAAqB;aACpB,OAAO,CAAC,mBAAmB,EAAE,IAAI,CAAC;aAClC,OAAO,CAAC,gBAAgB,EAAE,IAAI,CAAC;YAChC,6BAA6B;aAC5B,OAAO,CAAC,wBAAwB,EAAE,IAAI,CAAC;YACxC,gBAAgB;aACf,OAAO,CAAC,yBAAyB,EAAE,EAAE,CAAC;YACvC,0BAA0B;aACzB,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC;YAC5B,0BAA0B;aACzB,OAAO,CAAC,eAAe,EAAE,EAAE,CAAC;YAC7B,4BAA4B;aAC3B,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;aAC1B,IAAI,EAAE,CAAC;IACZ,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,QAAkB,EAAE,MAAoB;QAC7D,MAAM,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC;QACjC,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,MAAM,GAAe,EAAE,CAAC;QAE9B,QAAQ,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,KAAK,
EAAE,EAAE;YAClC,wCAAwC;YACxC,IAAI,MAAM,CAAC,SAAS,IAAI,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,MAAM,CAAC,SAAS,EAAE,CAAC;gBAClE,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;gBAC1D,SAAS,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,QAAQ,EAAE,EAAE;oBACpC,MAAM,CAAC,IAAI,CAAC;wBACV,EAAE,EAAE,GAAG,QAAQ,CAAC,EAAE,IAAI,KAAK,IAAI,QAAQ,EAAE;wBACzC,OAAO,EAAE,KAAK;wBACd,QAAQ,EAAE;4BACR,GAAG,QAAQ,CAAC,QAAQ;4BACpB,OAAO,EAAE,OAAO,CAAC,KAAK;4BACtB,UAAU,EAAE,MAAM,CAAC,MAAM;4BACzB,gBAAgB,EAAE,QAAQ,CAAC,EAAE;yBAC9B;qBACF,CAAC,CAAC;gBACL,CAAC,CAAC,CAAC;YACL,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,GAAG,QAAQ,CAAC,EAAE,IAAI,KAAK,EAAE;oBAC7B,OAAO,EAAE,OAAO,CAAC,OAAO;oBACxB,QAAQ,EAAE;wBACR,GAAG,QAAQ,CAAC,QAAQ;wBACpB,OAAO,EAAE,OAAO,CAAC,KAAK;wBACtB,UAAU,EAAE,KAAK;wBACjB,gBAAgB,EAAE,QAAQ,CAAC,EAAE;qBAC9B;iBACF,CAAC,CAAC;YACL,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,eAAe,CAAC,OAAe;QACrC,MAAM,QAAQ,GAA8C,EAAE,CAAC;QAC/D,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAClC,IAAI,cAAc,GAAG,EAAE,KAAK,EAAE,cAAc,EAAE,OAAO,EAAE,EAAE,EAAE,CAAC;QAE5D,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;YACrD,IAAI,YAAY,EAAE,CAAC;gBACjB,IAAI,cAAc,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;oBAClC,QAAQ,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAChC,CAAC;gBACD,cAAc,GAAG,EAAE,KAAK,EAAE,YAAY,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE,EAAE,CAAC;YAC3D,CAAC;iBAAM,CAAC;gBACN,cAAc,CAAC,OAAO,IAAI,IAAI,GAAG,IAAI,CAAC;YACxC,CAAC;QACH,CAAC;QAED,IAAI,cAAc,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;YAClC,QAAQ,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAChC,CAAC;QAED,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED;;OAEG;IACK,SAAS,CAAC,IAAY,EAAE,MAAoB;QAClD,MAAM,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,IAAI,CAAC;QAC3C,MAAM,YAAY,GAAG,MAAM,CAAC,YAAY,IAAI,GAAG,CAAC;QAChD,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,KAAK,GAAG,CAAC,CAAC;QAEd,OAAO,KAAK,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;YAC3B,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;YACrD,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,G
AAG,CAAC,CAAC,CAAC;YACpC,KAAK,IAAI,SAAS,GAAG,YAAY,CAAC;QACpC,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,UAAU,CAAC,MAAc;QAC/B,OAAO,MAAM,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC;IACrE,CAAC;CACF"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text document loader
|
|
3
|
+
* Handles plain text files (.txt)
|
|
4
|
+
*/
|
|
5
|
+
import type { Document, DocumentLoader, LoaderConfig } from '../../types/index.js';
|
|
6
|
+
/**
|
|
7
|
+
* Load plain text documents
|
|
8
|
+
*/
|
|
9
|
+
export declare class TextLoader implements DocumentLoader {
|
|
10
|
+
/** File extensions this loader declares support for. */
supportedExtensions: string[];
|
|
11
|
+
/**
 * Read the plain-text file at `source` and resolve to the resulting documents.
 * @param source Path of the file to read.
 * @param config Optional loader settings (extra metadata, chunking).
 * @returns A promise of the loaded document(s); the compiled implementation
 * resolves to an empty array for a whitespace-only file.
 */
load(source: string, config?: LoaderConfig): Promise<Document[]>;
|
|
12
|
+
/**
|
|
13
|
+
* Split a document into fixed-size chunks with overlap (defaults: size 1000, overlap 200)
|
|
14
|
+
*/
|
|
15
|
+
private chunkDocument;
|
|
16
|
+
/**
|
|
17
|
+
* Generate a document ID string from the source path
|
|
18
|
+
*/
|
|
19
|
+
private generateId;
|
|
20
|
+
}
|
|
21
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/loaders/text/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AAInF;;GAEG;AACH,qBAAa,UAAW,YAAW,cAAc;IAC/C,mBAAmB,WAAqB;IAElC,IAAI,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;IAkCtE;;OAEG;IACH,OAAO,CAAC,aAAa;IAgCrB;;OAEG;IACH,OAAO,CAAC,UAAU;CAGnB"}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text document loader
|
|
3
|
+
* Handles plain text files (.txt)
|
|
4
|
+
*/
|
|
5
|
+
import { promises as fs } from 'node:fs';
|
|
6
|
+
import { basename } from 'node:path';
|
|
7
|
+
/**
|
|
8
|
+
* Load plain text documents
|
|
9
|
+
*/
|
|
10
|
+
export class TextLoader {
    /** File extensions this loader handles. */
    supportedExtensions = ['.txt', '.text'];

    /**
     * Read a plain-text file and turn it into one or more documents.
     *
     * @param {string} source - Path of the text file to read.
     * @param {object} [config] - Optional loader settings. The fields read here
     *   are `metadata` (merged last, so it can override the generated fields)
     *   and `chunkSize` / `chunkOverlap`.
     * @returns {Promise<object[]>} An empty array for a whitespace-only file,
     *   otherwise a single document or its chunks when `config.chunkSize` is set.
     * @throws {Error} When the file cannot be read or chunked; the underlying
     *   error is attached as `cause`.
     */
    async load(source, config) {
        try {
            // Reading and stat-ing are independent, so run them concurrently.
            const [content, stats] = await Promise.all([
                fs.readFile(source, 'utf-8'),
                fs.stat(source),
            ]);
            // Skip empty documents
            if (!content.trim()) {
                return [];
            }
            const document = {
                id: this.generateId(source),
                content,
                metadata: {
                    source,
                    type: 'text',
                    createdAt: stats.birthtime,
                    updatedAt: stats.mtime,
                    title: basename(source),
                    ...config?.metadata,
                },
            };
            // Apply chunking if configured
            if (config?.chunkSize) {
                return this.chunkDocument(document, config);
            }
            return [document];
        }
        catch (error) {
            // Keep the original error reachable for callers that need code/stack.
            throw new Error(`Failed to load text file ${source}: ${error}`, { cause: error });
        }
    }

    /**
     * Split a document into fixed-size, overlapping chunks.
     *
     * @param {object} document - Document with `id`, `content` and `metadata`.
     * @param {object} config - `chunkSize` (default 1000) and `chunkOverlap`
     *   (default 200). The overlap is clamped to `[0, chunkSize - 1]`: a larger
     *   value would stop the window from advancing, a negative one would
     *   silently skip characters between chunks.
     * @returns {object[]} Chunk documents carrying `chunkIndex`, `startChar`,
     *   `endChar` and `parentDocumentId` in their metadata.
     * @throws {RangeError} When `chunkSize` is not a positive number.
     */
    chunkDocument(document, config) {
        const chunkSize = config.chunkSize ?? 1000;
        if (!(chunkSize > 0)) {
            // A zero, negative or NaN size would never advance past the content.
            throw new RangeError(`chunkSize must be a positive number, received ${chunkSize}`);
        }
        const chunkOverlap = Math.max(0, Math.min(config.chunkOverlap ?? 200, chunkSize - 1));
        const step = chunkSize - chunkOverlap;
        const content = document.content;
        const chunks = [];
        for (let start = 0, chunkIndex = 0; start < content.length; start += step, chunkIndex++) {
            const end = Math.min(start + chunkSize, content.length);
            chunks.push({
                id: `${document.id}-chunk-${chunkIndex}`,
                content: content.slice(start, end),
                metadata: {
                    ...document.metadata,
                    chunkIndex,
                    startChar: start,
                    endChar: end,
                    parentDocumentId: document.id,
                },
            });
        }
        return chunks;
    }

    /**
     * Generate a document ID from the source path.
     * The whole path is encoded: truncating the base64 text to 16 characters
     * kept only the first 12 bytes of the path, so every file below the same
     * long directory prefix received the same ID. For paths of up to 12 bytes
     * the result is unchanged.
     *
     * @param {string} source - Source path of the document.
     * @returns {string} `text-` followed by the base64 encoding of `source`.
     */
    generateId(source) {
        return `text-${Buffer.from(source).toString('base64')}`;
    }
}
|
|
78
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/loaders/text/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,EAAE,QAAQ,IAAI,EAAE,EAAE,MAAM,SAAS,CAAC;AACzC,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAErC;;GAEG;AACH,MAAM,OAAO,UAAU;IACrB,mBAAmB,GAAG,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAExC,KAAK,CAAC,IAAI,CAAC,MAAc,EAAE,MAAqB;QAC9C,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;YACnD,MAAM,KAAK,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YAEpC,MAAM,QAAQ,GAAa;gBACzB,EAAE,EAAE,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC;gBAC3B,OAAO;gBACP,QAAQ,EAAE;oBACR,MAAM;oBACN,IAAI,EAAE,MAAM;oBACZ,SAAS,EAAE,KAAK,CAAC,SAAS;oBAC1B,SAAS,EAAE,KAAK,CAAC,KAAK;oBACtB,KAAK,EAAE,QAAQ,CAAC,MAAM,CAAC;oBACvB,GAAG,MAAM,EAAE,QAAQ;iBACpB;aACF,CAAC;YAEF,uBAAuB;YACvB,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;gBACpB,OAAO,EAAE,CAAC;YACZ,CAAC;YAED,+BAA+B;YAC/B,IAAI,MAAM,EAAE,SAAS,EAAE,CAAC;gBACtB,OAAO,IAAI,CAAC,aAAa,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;YAC9C,CAAC;YAED,OAAO,CAAC,QAAQ,CAAC,CAAC;QACpB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,KAAK,CAAC,4BAA4B,MAAM,KAAK,KAAK,EAAE,CAAC,CAAC;QAClE,CAAC;IACH,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,QAAkB,EAAE,MAAoB;QAC5D,MAAM,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,IAAI,CAAC;QAC3C,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,YAAY,IAAI,GAAG,EAAE,SAAS,GAAG,CAAC,CAAC,CAAC;QACzE,MAAM,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC;QACjC,MAAM,MAAM,GAAe,EAAE,CAAC;QAE9B,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,UAAU,GAAG,CAAC,CAAC;QAEnB,OAAO,KAAK,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;YAC9B,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,SAAS,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;YACxD,MAAM,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;YAE/C,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,GAAG,QAAQ,CAAC,EAAE,UAAU,UAAU,EAAE;gBACxC,OAAO,EAAE,YAAY;gBACrB,QAAQ,EAAE;oBACR,GAAG,QAAQ,CAAC,QAAQ;oBACpB,UAAU;oBACV,SAAS,EAAE,KAAK;oBAChB,OAAO,EAAE,GAAG;oBACZ,gBAAgB,EAAE,QAAQ,CAAC,EAAE;iBAC9B;aACF,CAAC,CAAC;YAEH,KAAK,IAAI,SAAS,GAAG,YAAY,CAAC;YAClC,UAAU,EAAE,CAAC;QACf,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;I
ACK,UAAU,CAAC,MAAc;QAC/B,OAAO,QAAQ,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC;IACvE,CAAC;CACF"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Simple embedding generator (placeholder implementation)
|
|
3
|
+
* In production, integrate with OpenAI, Anthropic, or local models
|
|
4
|
+
*/
|
|
5
|
+
import type { EmbeddingGenerator, EmbeddingConfig } from '../../types/index.js';
|
|
6
|
+
/** Options accepted by the SimpleEmbeddingGenerator constructor. */
export interface SimpleEmbeddingOptions {
|
|
7
|
+
/** Length of each generated vector; the compiled generator falls back to 384 when omitted. */
dimensions?: number;
|
|
8
|
+
}
|
|
9
|
+
/**
|
|
10
|
+
* Simple embedding generator using text hashing
|
|
11
|
+
* For demonstration purposes - use real embeddings in production
|
|
12
|
+
*/
|
|
13
|
+
export declare class SimpleEmbeddingGenerator implements EmbeddingGenerator {
|
|
14
|
+
/** Length of every vector this generator produces. */
private dimensions;
|
|
15
|
+
/**
 * @param options Optional settings; `options.dimensions` sets the vector length.
 */
constructor(options?: SimpleEmbeddingOptions);
|
|
16
|
+
/**
 * Produce one vector per input text, in input order.
 * @param texts Texts to embed.
 * @param _config Accepted for interface compatibility; the compiled implementation does not read it.
 * @returns A promise of the vectors.
 */
embed(texts: string[], _config?: EmbeddingConfig): Promise<number[][]>;
|
|
17
|
+
/** Return the configured vector length. */
getDimensions(): number;
|
|
18
|
+
/**
|
|
19
|
+
* Generate a simple embedding by counting characters into buckets
* (character code modulo the vector length) and normalizing the result
|
|
20
|
+
* This is NOT a real embedding - use OpenAI/Anthropic/local models in production
|
|
21
|
+
*/
|
|
22
|
+
private generateEmbedding;
|
|
23
|
+
}
|
|
24
|
+
//# sourceMappingURL=generator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"generator.d.ts","sourceRoot":"","sources":["../../../src/pipeline/embedding/generator.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,kBAAkB,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;AAEhF,MAAM,WAAW,sBAAsB;IACrC,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;;GAGG;AACH,qBAAa,wBAAyB,YAAW,kBAAkB;IACjE,OAAO,CAAC,UAAU,CAAS;gBAEf,OAAO,GAAE,sBAA2B;IAI1C,KAAK,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,OAAO,CAAC,EAAE,eAAe,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;IAI5E,aAAa,IAAI,MAAM;IAIvB;;;OAGG;IACH,OAAO,CAAC,iBAAiB;CAoB1B"}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Simple embedding generator (placeholder implementation)
|
|
3
|
+
* In production, integrate with OpenAI, Anthropic, or local models
|
|
4
|
+
*/
|
|
5
|
+
/**
|
|
6
|
+
* Simple embedding generator using text hashing
|
|
7
|
+
* For demonstration purposes - use real embeddings in production
|
|
8
|
+
*/
|
|
9
|
+
export class SimpleEmbeddingGenerator {
    /** Length of every vector produced by this generator. */
    dimensions;

    /**
     * @param {object} [options] - Generator settings.
     * @param {number} [options.dimensions=384] - Vector length (384 is a common
     *   embedding size). Must be a positive integer.
     * @throws {RangeError} When `dimensions` is not a positive integer. Failing
     *   here is deliberate: a zero length produced empty vectors with a stray
     *   `NaN` property, and fractional or negative lengths only failed later
     *   inside `embed`.
     */
    constructor(options = {}) {
        const dimensions = options.dimensions ?? 384;
        if (!Number.isInteger(dimensions) || dimensions < 1) {
            throw new RangeError(`dimensions must be a positive integer, received ${dimensions}`);
        }
        this.dimensions = dimensions;
    }

    /**
     * Embed a batch of texts.
     *
     * @param {string[]} texts - Texts to embed.
     * @param {object} [_config] - Accepted for interface compatibility; unused.
     * @returns {Promise<number[][]>} One vector per text, in input order.
     */
    async embed(texts, _config) {
        return texts.map((text) => this.generateEmbedding(text));
    }

    /**
     * @returns {number} The configured vector length.
     */
    getDimensions() {
        return this.dimensions;
    }

    /**
     * Generate a simple embedding using character-based hashing.
     * This is NOT a real embedding - use OpenAI/Anthropic/local models in production.
     *
     * Each UTF-16 code unit increments the bucket `charCode % dimensions`; the
     * bucket counts are then scaled to unit length.
     *
     * @param {string} text - Text to embed.
     * @returns {number[]} Vector of length `dimensions`; all zeros for empty text.
     */
    generateEmbedding(text) {
        const counts = new Array(this.dimensions).fill(0);
        // Simple character-based features
        for (let i = 0; i < text.length; i++) {
            counts[text.charCodeAt(i) % this.dimensions] += 1;
        }
        // Normalize vector; an all-zero vector is returned as-is to avoid 0/0.
        const magnitude = Math.sqrt(counts.reduce((sum, value) => sum + value * value, 0));
        if (magnitude === 0) {
            return counts;
        }
        return counts.map((value) => value / magnitude);
    }
}
|
|
42
|
+
//# sourceMappingURL=generator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"generator.js","sourceRoot":"","sources":["../../../src/pipeline/embedding/generator.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAQH;;;GAGG;AACH,MAAM,OAAO,wBAAwB;IAC3B,UAAU,CAAS;IAE3B,YAAY,UAAkC,EAAE;QAC9C,IAAI,CAAC,UAAU,GAAG,OAAO,CAAC,UAAU,IAAI,GAAG,CAAC,CAAC,wBAAwB;IACvE,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,KAAe,EAAE,OAAyB;QACpD,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC;IAC3D,CAAC;IAED,aAAa;QACX,OAAO,IAAI,CAAC,UAAU,CAAC;IACzB,CAAC;IAED;;;OAGG;IACK,iBAAiB,CAAC,IAAY;QACpC,MAAM,SAAS,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAErD,kCAAkC;QAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YACpC,MAAM,KAAK,GAAG,QAAQ,GAAG,IAAI,CAAC,UAAU,CAAC;YACzC,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QACxB,CAAC;QAED,mBAAmB;QACnB,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE,CAAC,GAAG,GAAI,GAAG,GAAG,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QACjF,IAAI,SAAS,GAAG,CAAC,EAAE,CAAC;YAClB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC1C,SAAS,CAAC,CAAC,CAAC,IAAI,SAAS,CAAC;YAC5B,CAAC;QACH,CAAC;QAED,OAAO,SAAS,CAAC;IACnB,CAAC;CACF"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedding module exports
|
|
3
|
+
*/
|
|
4
|
+
export { SimpleEmbeddingGenerator } from './generator.js';
|
|
5
|
+
export { EmbeddingPipeline } from './pipeline.js';
|
|
6
|
+
export type { SimpleEmbeddingOptions } from './generator.js';
|
|
7
|
+
export type { EmbeddingPipelineOptions } from './pipeline.js';
|
|
8
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/pipeline/embedding/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,wBAAwB,EAAE,MAAM,gBAAgB,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAElD,YAAY,EAAE,sBAAsB,EAAE,MAAM,gBAAgB,CAAC;AAC7D,YAAY,EAAE,wBAAwB,EAAE,MAAM,eAAe,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/pipeline/embedding/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,wBAAwB,EAAE,MAAM,gBAAgB,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC"}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedding pipeline for processing documents
|
|
3
|
+
*/
|
|
4
|
+
import type { Document, DocumentChunk, EmbeddingGenerator } from '../../types/index.js';
|
|
5
|
+
export interface EmbeddingPipelineOptions {
|
|
6
|
+
/** Number of chunks passed to the embedder per embed() call; process() falls back to 32 when omitted */
|
|
7
|
+
batchSize?: number;
|
|
8
|
+
/** Progress callback, invoked after each batch with the number of chunks handled so far and the total chunk count */
|
|
9
|
+
onProgress?: (processed: number, total: number) => void;
|
|
10
|
+
}
|
|
11
|
+
/**
|
|
12
|
+
* Pipeline that turns documents into chunks and attaches embeddings produced by the supplied EmbeddingGenerator
|
|
13
|
+
*/
|
|
14
|
+
export declare class EmbeddingPipeline {
|
|
15
|
+
private embedder;
|
|
16
|
+
constructor(embedder: EmbeddingGenerator);
|
|
17
|
+
/**
|
|
18
|
+
* Process documents and generate embeddings: resolves to one chunk per input document, embedded in batches of options.batchSize
|
|
19
|
+
*/
|
|
20
|
+
process(documents: Document[], options?: EmbeddingPipelineOptions): Promise<DocumentChunk[]>;
|
|
21
|
+
/**
|
|
22
|
+
* Generate embedding for a single query by embedding it as a one-element batch and returning the first vector
|
|
23
|
+
*/
|
|
24
|
+
embedQuery(query: string): Promise<number[]>;
|
|
25
|
+
}
|
|
26
|
+
//# sourceMappingURL=pipeline.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../../src/pipeline/embedding/pipeline.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,sBAAsB,CAAC;AAExF,MAAM,WAAW,wBAAwB;IACvC,0CAA0C;IAC1C,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,wBAAwB;IACxB,UAAU,CAAC,EAAE,CAAC,SAAS,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;CACzD;AAED;;GAEG;AACH,qBAAa,iBAAiB;IAChB,OAAO,CAAC,QAAQ;gBAAR,QAAQ,EAAE,kBAAkB;IAEhD;;OAEG;IACG,OAAO,CACX,SAAS,EAAE,QAAQ,EAAE,EACrB,OAAO,GAAE,wBAA6B,GACrC,OAAO,CAAC,aAAa,EAAE,CAAC;IA0C3B;;OAEG;IACG,UAAU,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;CAInD"}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedding pipeline for processing documents
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Pipeline for generating embeddings from documents.
 *
 * Wraps an embedder (any object exposing `embed(texts)`) and converts each
 * input document into a single chunk carrying the embedding of its content.
 */
export class EmbeddingPipeline {
    embedder;
    /**
     * @param {{ embed(texts: string[]): Promise<number[][]> }} embedder -
     *   Generator used for every embedding request.
     */
    constructor(embedder) {
        this.embedder = embedder;
    }
    /**
     * Process documents and generate embeddings
     *
     * @param {Array<{ id: string, content: string, metadata?: object }>} documents -
     *   Documents to embed; each becomes exactly one chunk.
     * @param {{ batchSize?: number, onProgress?: (processed: number, total: number) => void }} [options] -
     *   `batchSize` (default 32) caps how many texts go into one `embed` call;
     *   `onProgress` is invoked after every batch.
     * @returns {Promise<object[]>} The chunks, in document order, each with an
     *   `embedding` property.
     * @throws {RangeError} If `batchSize` is not a positive integer (or Infinity).
     * @throws {Error} If the embedder returns fewer vectors than texts supplied.
     */
    async process(documents, options = {}) {
        const batchSize = options.batchSize ?? 32;
        // A zero or negative step would never advance the batch loop (infinite
        // loop) and NaN would end it before anything is embedded, so reject
        // those up front. Infinity keeps working as "everything in one batch".
        const isValidBatchSize = batchSize === Infinity || (Number.isInteger(batchSize) && batchSize > 0);
        if (!isValidBatchSize) {
            throw new RangeError(`batchSize must be a positive integer, received ${String(batchSize)}`);
        }
        const chunks = [];
        // Convert documents to chunks (one chunk per document)
        for (const [position, doc] of documents.entries()) {
            chunks.push({
                id: doc.id,
                documentId: doc.id,
                content: doc.content,
                index: position,
                metadata: {
                    chunkIndex: position,
                    chunkCount: documents.length,
                    startChar: 0,
                    endChar: doc.content.length,
                    // Spread last so document metadata overrides the defaults above
                    ...doc.metadata,
                },
            });
        }
        // Generate embeddings in batches
        for (let start = 0; start < chunks.length; start += batchSize) {
            const batch = chunks.slice(start, start + batchSize);
            const texts = batch.map((chunk) => chunk.content);
            const embeddings = await this.embedder.embed(texts);
            // Guard against silently leaving chunks with `embedding: undefined`
            if (!embeddings || embeddings.length < batch.length) {
                throw new Error(`Embedder returned ${embeddings ? embeddings.length : 0} embeddings for a batch of ${batch.length} texts`);
            }
            // Assign embeddings to chunks
            batch.forEach((chunk, offset) => {
                chunk.embedding = embeddings[offset];
            });
            if (options.onProgress) {
                options.onProgress(Math.min(start + batchSize, chunks.length), chunks.length);
            }
        }
        return chunks;
    }
    /**
     * Generate embedding for a single query
     *
     * @param {string} query - Text to embed.
     * @returns {Promise<number[]>} First vector returned by the embedder.
     */
    async embedQuery(query) {
        const embeddings = await this.embedder.embed([query]);
        return embeddings[0];
    }
}
|
|
59
|
+
//# sourceMappingURL=pipeline.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../../../src/pipeline/embedding/pipeline.ts"],"names":[],"mappings":"AAAA;;GAEG;AAWH;;GAEG;AACH,MAAM,OAAO,iBAAiB;IACR;IAApB,YAAoB,QAA4B;QAA5B,aAAQ,GAAR,QAAQ,CAAoB;IAAG,CAAC;IAEpD;;OAEG;IACH,KAAK,CAAC,OAAO,CACX,SAAqB,EACrB,UAAoC,EAAE;QAEtC,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,EAAE,CAAC;QAC1C,MAAM,MAAM,GAAoB,EAAE,CAAC;QAEnC,8BAA8B;QAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,MAAM,GAAG,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;YACzB,MAAM,KAAK,GAAkB;gBAC3B,EAAE,EAAE,GAAG,CAAC,EAAE;gBACV,UAAU,EAAE,GAAG,CAAC,EAAE;gBAClB,OAAO,EAAE,GAAG,CAAC,OAAO;gBACpB,KAAK,EAAE,CAAC;gBACR,QAAQ,EAAE;oBACR,UAAU,EAAE,CAAC;oBACb,UAAU,EAAE,SAAS,CAAC,MAAM;oBAC5B,SAAS,EAAE,CAAC;oBACZ,OAAO,EAAE,GAAG,CAAC,OAAO,CAAC,MAAM;oBAC3B,GAAG,GAAG,CAAC,QAAQ;iBAChB;aACF,CAAC;YACF,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACrB,CAAC;QAED,iCAAiC;QACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,IAAI,SAAS,EAAE,CAAC;YAClD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,CAAC;YAC7C,MAAM,KAAK,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;YAClD,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;YAEpD,8BAA8B;YAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,KAAK,CAAC,CAAC,CAAC,CAAC,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YACrC,CAAC;YAED,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;gBACvB,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,SAAS,EAAE,MAAM,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC;YAC5E,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,UAAU,CAAC,KAAa;QAC5B,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC;QACtD,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC;IACvB,CAAC;CACF"}
|