opencode-rag-plugin 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ReadMe.md +463 -424
- package/dist/__tests__/chunker/c.test.d.ts +1 -0
- package/dist/__tests__/chunker/c.test.js +82 -0
- package/dist/__tests__/chunker/c.test.js.map +1 -0
- package/dist/__tests__/chunker/cpp.test.d.ts +1 -0
- package/dist/__tests__/chunker/cpp.test.js +96 -0
- package/dist/__tests__/chunker/cpp.test.js.map +1 -0
- package/dist/__tests__/chunker/csharp.test.d.ts +1 -0
- package/dist/__tests__/chunker/csharp.test.js +85 -0
- package/dist/__tests__/chunker/csharp.test.js.map +1 -0
- package/dist/__tests__/chunker/css.test.d.ts +1 -0
- package/dist/__tests__/chunker/css.test.js +64 -0
- package/dist/__tests__/chunker/css.test.js.map +1 -0
- package/dist/__tests__/chunker/factory.test.d.ts +1 -0
- package/dist/__tests__/chunker/factory.test.js +202 -0
- package/dist/__tests__/chunker/factory.test.js.map +1 -0
- package/dist/__tests__/chunker/fallback.test.d.ts +1 -0
- package/dist/__tests__/chunker/fallback.test.js +67 -0
- package/dist/__tests__/chunker/fallback.test.js.map +1 -0
- package/dist/__tests__/chunker/go.test.d.ts +1 -0
- package/dist/__tests__/chunker/go.test.js +90 -0
- package/dist/__tests__/chunker/go.test.js.map +1 -0
- package/dist/__tests__/chunker/grammar.test.d.ts +1 -0
- package/dist/__tests__/chunker/grammar.test.js +108 -0
- package/dist/__tests__/chunker/grammar.test.js.map +1 -0
- package/dist/__tests__/chunker/html.test.d.ts +1 -0
- package/dist/__tests__/chunker/html.test.js +64 -0
- package/dist/__tests__/chunker/html.test.js.map +1 -0
- package/dist/__tests__/chunker/java.test.d.ts +1 -0
- package/dist/__tests__/chunker/java.test.js +83 -0
- package/dist/__tests__/chunker/java.test.js.map +1 -0
- package/dist/__tests__/chunker/javascript.test.d.ts +1 -0
- package/dist/__tests__/chunker/javascript.test.js +88 -0
- package/dist/__tests__/chunker/javascript.test.js.map +1 -0
- package/dist/__tests__/chunker/json.test.d.ts +1 -0
- package/dist/__tests__/chunker/json.test.js +54 -0
- package/dist/__tests__/chunker/json.test.js.map +1 -0
- package/dist/__tests__/chunker/kotlin.test.d.ts +1 -0
- package/dist/__tests__/chunker/kotlin.test.js +42 -0
- package/dist/__tests__/chunker/kotlin.test.js.map +1 -0
- package/dist/__tests__/chunker/markdown.test.d.ts +1 -0
- package/dist/__tests__/chunker/markdown.test.js +76 -0
- package/dist/__tests__/chunker/markdown.test.js.map +1 -0
- package/dist/__tests__/chunker/python.test.d.ts +1 -0
- package/dist/__tests__/chunker/python.test.js +81 -0
- package/dist/__tests__/chunker/python.test.js.map +1 -0
- package/dist/__tests__/chunker/razor.test.d.ts +1 -0
- package/dist/__tests__/chunker/razor.test.js +73 -0
- package/dist/__tests__/chunker/razor.test.js.map +1 -0
- package/dist/__tests__/chunker/register.test.d.ts +1 -0
- package/dist/__tests__/chunker/register.test.js +93 -0
- package/dist/__tests__/chunker/register.test.js.map +1 -0
- package/dist/__tests__/chunker/ruby.test.d.ts +1 -0
- package/dist/__tests__/chunker/ruby.test.js +40 -0
- package/dist/__tests__/chunker/ruby.test.js.map +1 -0
- package/dist/__tests__/chunker/rust.test.d.ts +1 -0
- package/dist/__tests__/chunker/rust.test.js +42 -0
- package/dist/__tests__/chunker/rust.test.js.map +1 -0
- package/dist/__tests__/chunker/sln.test.d.ts +1 -0
- package/dist/__tests__/chunker/sln.test.js +45 -0
- package/dist/__tests__/chunker/sln.test.js.map +1 -0
- package/dist/__tests__/chunker/swift.test.d.ts +1 -0
- package/dist/__tests__/chunker/swift.test.js +49 -0
- package/dist/__tests__/chunker/swift.test.js.map +1 -0
- package/dist/__tests__/chunker/typescript.test.d.ts +1 -0
- package/dist/__tests__/chunker/typescript.test.js +92 -0
- package/dist/__tests__/chunker/typescript.test.js.map +1 -0
- package/dist/__tests__/chunker/uuid.test.d.ts +1 -0
- package/dist/__tests__/chunker/uuid.test.js +19 -0
- package/dist/__tests__/chunker/uuid.test.js.map +1 -0
- package/dist/__tests__/chunker/xml.test.d.ts +1 -0
- package/dist/__tests__/chunker/xml.test.js +50 -0
- package/dist/__tests__/chunker/xml.test.js.map +1 -0
- package/dist/__tests__/core/config.test.d.ts +1 -0
- package/dist/__tests__/core/config.test.js +75 -0
- package/dist/__tests__/core/config.test.js.map +1 -0
- package/dist/__tests__/core/fileLogger.test.d.ts +1 -0
- package/dist/__tests__/core/fileLogger.test.js +34 -0
- package/dist/__tests__/core/fileLogger.test.js.map +1 -0
- package/dist/__tests__/core/manifest.test.d.ts +1 -0
- package/dist/__tests__/core/manifest.test.js +56 -0
- package/dist/__tests__/core/manifest.test.js.map +1 -0
- package/dist/__tests__/embedder/embedBatch.test.d.ts +1 -0
- package/dist/__tests__/embedder/embedBatch.test.js +88 -0
- package/dist/__tests__/embedder/embedBatch.test.js.map +1 -0
- package/dist/__tests__/embedder/factory.test.d.ts +1 -0
- package/dist/__tests__/embedder/factory.test.js +71 -0
- package/dist/__tests__/embedder/factory.test.js.map +1 -0
- package/dist/__tests__/embedder/ollama.test.d.ts +1 -0
- package/dist/__tests__/embedder/ollama.test.js +106 -0
- package/dist/__tests__/embedder/ollama.test.js.map +1 -0
- package/dist/__tests__/embedder/openai.test.d.ts +1 -0
- package/dist/__tests__/embedder/openai.test.js +94 -0
- package/dist/__tests__/embedder/openai.test.js.map +1 -0
- package/dist/__tests__/indexer/indexer.test.d.ts +1 -0
- package/dist/__tests__/indexer/indexer.test.js +176 -0
- package/dist/__tests__/indexer/indexer.test.js.map +1 -0
- package/dist/__tests__/plugin.test.d.ts +1 -0
- package/dist/__tests__/plugin.test.js +77 -0
- package/dist/__tests__/plugin.test.js.map +1 -0
- package/dist/__tests__/retriever/retriever.test.d.ts +1 -0
- package/dist/__tests__/retriever/retriever.test.js +97 -0
- package/dist/__tests__/retriever/retriever.test.js.map +1 -0
- package/dist/__tests__/vectorstore/lancedb.test.d.ts +1 -0
- package/dist/__tests__/vectorstore/lancedb.test.js +159 -0
- package/dist/__tests__/vectorstore/lancedb.test.js.map +1 -0
- package/dist/chunker/doc.d.ts +8 -0
- package/dist/chunker/doc.js +79 -0
- package/dist/chunker/doc.js.map +1 -0
- package/dist/chunker/docx.d.ts +8 -0
- package/dist/chunker/docx.js +78 -0
- package/dist/chunker/docx.js.map +1 -0
- package/dist/chunker/excel.d.ts +8 -0
- package/dist/chunker/excel.js +78 -0
- package/dist/chunker/excel.js.map +1 -0
- package/dist/chunker/factory.d.ts +1 -0
- package/dist/chunker/factory.js +9 -0
- package/dist/chunker/factory.js.map +1 -1
- package/dist/cli.js +272 -1
- package/dist/cli.js.map +1 -1
- package/dist/core/config.d.ts +4 -0
- package/dist/core/config.js +6 -1
- package/dist/core/config.js.map +1 -1
- package/dist/indexer.js +18 -0
- package/dist/indexer.js.map +1 -1
- package/dist/opencode/create-read-tool.d.ts +30 -0
- package/dist/opencode/create-read-tool.js +248 -0
- package/dist/opencode/create-read-tool.js.map +1 -0
- package/dist/opencode/read-fallback.d.ts +21 -0
- package/dist/opencode/read-fallback.js +90 -0
- package/dist/opencode/read-fallback.js.map +1 -0
- package/dist/opencode/read-format.d.ts +40 -0
- package/dist/opencode/read-format.js +86 -0
- package/dist/opencode/read-format.js.map +1 -0
- package/dist/opencode/read-query.d.ts +26 -0
- package/dist/opencode/read-query.js +39 -0
- package/dist/opencode/read-query.js.map +1 -0
- package/dist/opencode/tool-args.d.ts +51 -0
- package/dist/opencode/tool-args.js +70 -0
- package/dist/opencode/tool-args.js.map +1 -0
- package/dist/plugin.js +1 -0
- package/dist/plugin.js.map +1 -1
- package/package.json +89 -82
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { describe, it } from "node:test";
|
|
2
|
+
import assert from "node:assert/strict";
|
|
3
|
+
import { retrieve } from "../../retriever/retriever.js";
|
|
4
|
+
/**
 * Builds a stub embedder for the retriever tests.
 *
 * @param {number[][]} vectors - Value returned, unchanged, by every `embed` call.
 * @returns {{name: string, embed: Function}} Object named "mock" exposing an async `embed`.
 */
function makeEmbedder(vectors) {
    // The texts argument is intentionally ignored: the stub always answers with the canned vectors.
    const embed = async (_texts) => vectors;
    return { name: "mock", embed };
}
|
|
12
|
+
/**
 * Builds a stub vector store for the retriever tests.
 *
 * @param {Array} results - Canned hits; `search` resolves to this array and `count` to its length.
 * @returns {object} Store double with async `addChunks`, `search`, `count`, `clear` and `deleteByFilePath`.
 */
function makeStore(results) {
    const store = {};
    // Mutating operations are no-ops; the canned results never change.
    store.addChunks = async (_chunks) => { };
    store.search = async (_embedding, _topK) => results;
    store.count = async () => results.length;
    store.clear = async () => { };
    store.deleteByFilePath = async (_filePath) => { };
    return store;
}
|
|
25
|
+
// Test suite for retrieve(): results pass-through, empty-embedding handling, and topK forwarding.
describe("retrieve", () => {
    // Store double that records the topK value handed to search().
    const makeRecordingStore = () => {
        const seen = { topK: 0 };
        const store = {
            async addChunks() { },
            async search(_embedding, topK) {
                seen.topK = topK;
                return [];
            },
            async count() {
                return 0;
            },
            async clear() { },
            async deleteByFilePath(_filePath) { },
        };
        return { store, seen };
    };

    it("returns search results from store", async () => {
        const hit = {
            score: 0.95,
            chunk: {
                id: "chunk-1",
                content: "test content",
                metadata: {
                    filePath: "test.ts",
                    startLine: 1,
                    endLine: 10,
                    language: "typescript",
                },
            },
        };
        const embedder = makeEmbedder([[0.1, 0.2, 0.3]]);
        const store = makeStore([hit]);
        const found = await retrieve("test query", embedder, store);
        assert.equal(found.length, 1);
        assert.equal(found[0].score, 0.95);
        assert.equal(found[0].chunk.id, "chunk-1");
    });

    it("returns empty array when embedding is empty", async () => {
        // The embedder yields one vector, but that vector has no dimensions.
        const found = await retrieve("test query", makeEmbedder([[]]), makeStore([]));
        assert.deepStrictEqual(found, []);
    });

    it("returns empty array when embeddings are empty array", async () => {
        // The embedder yields no vectors at all.
        const found = await retrieve("test query", makeEmbedder([]), makeStore([]));
        assert.deepStrictEqual(found, []);
    });

    it("passes custom topK to store", async () => {
        const { store, seen } = makeRecordingStore();
        const embedder = makeEmbedder([[0.1, 0.2]]);
        await retrieve("query", embedder, store, { topK: 5 });
        assert.equal(seen.topK, 5);
    });

    it("uses default topK of 10", async () => {
        const { store, seen } = makeRecordingStore();
        const embedder = makeEmbedder([[0.1, 0.2]]);
        await retrieve("query", embedder, store);
        assert.equal(seen.topK, 10);
    });
});
|
|
97
|
+
//# sourceMappingURL=retriever.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"retriever.test.js","sourceRoot":"","sources":["../../../src/__tests__/retriever/retriever.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,WAAW,CAAC;AACzC,OAAO,MAAM,MAAM,oBAAoB,CAAC;AACxC,OAAO,EAAE,QAAQ,EAAE,MAAM,8BAA8B,CAAC;AAQxD,SAAS,YAAY,CAAC,OAAmB;IACvC,OAAO;QACL,IAAI,EAAE,MAAM;QACZ,KAAK,CAAC,KAAK,CAAC,MAAgB;YAC1B,OAAO,OAAO,CAAC;QACjB,CAAC;KACF,CAAC;AACJ,CAAC;AAED,SAAS,SAAS,CAAC,OAAuB;IACxC,OAAO;QACL,KAAK,CAAC,SAAS,CAAC,OAAgB,IAAkB,CAAC;QACnD,KAAK,CAAC,MAAM,CAAC,UAAoB,EAAE,KAAa;YAC9C,OAAO,OAAO,CAAC;QACjB,CAAC;QACD,KAAK,CAAC,KAAK;YACT,OAAO,OAAO,CAAC,MAAM,CAAC;QACxB,CAAC;QACD,KAAK,CAAC,KAAK,KAAmB,CAAC;QAC/B,KAAK,CAAC,gBAAgB,CAAC,SAAiB,IAAkB,CAAC;KAC5D,CAAC;AACJ,CAAC;AAED,QAAQ,CAAC,UAAU,EAAE,GAAG,EAAE;IACxB,EAAE,CAAC,mCAAmC,EAAE,KAAK,IAAI,EAAE;QACjD,MAAM,QAAQ,GAAG,YAAY,CAAC,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC;QACjD,MAAM,KAAK,GAAG,SAAS,CAAC;YACtB;gBACE,KAAK,EAAE,IAAI;gBACX,KAAK,EAAE;oBACL,EAAE,EAAE,SAAS;oBACb,OAAO,EAAE,cAAc;oBACvB,QAAQ,EAAE;wBACR,QAAQ,EAAE,SAAS;wBACnB,SAAS,EAAE,CAAC;wBACZ,OAAO,EAAE,EAAE;wBACX,QAAQ,EAAE,YAAY;qBACvB;iBACF;aACF;SACF,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,YAAY,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC;QAC9D,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;QAChC,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAE,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QACtC,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,EAAE,EAAE,SAAS,CAAC,CAAC;IAChD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;QAC3D,MAAM,QAAQ,GAAG,YAAY,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QACpC,MAAM,KAAK,GAAG,SAAS,CAAC,EAAE,CAAC,CAAC;QAE5B,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,YAAY,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC;QAC9D,MAAM,CAAC,eAAe,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IACtC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,qDAAqD,EAAE,KAAK,IAAI,EAAE;QACnE,MAAM,QAAQ,GAAG,YAAY,CAAC,EAAE,CAAC,CAAC;QAClC,MAAM,KAAK,GAAG,SAAS,CAAC,EAAE,CAAC,CAAC;QAE5B,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,YAAY,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC;QAC9D,MAAM,CAAC,eAAe,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IACtC,
CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6BAA6B,EAAE,KAAK,IAAI,EAAE;QAC3C,IAAI,YAAY,GAAG,CAAC,CAAC;QACrB,MAAM,QAAQ,GAAG,YAAY,CAAC,CAAC,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC;QAC5C,MAAM,KAAK,GAAgB;YACzB,KAAK,CAAC,SAAS,KAAmB,CAAC;YACnC,KAAK,CAAC,MAAM,CAAC,UAAoB,EAAE,IAAY;gBAC7C,YAAY,GAAG,IAAI,CAAC;gBACpB,OAAO,EAAE,CAAC;YACZ,CAAC;YACD,KAAK,CAAC,KAAK;gBACT,OAAO,CAAC,CAAC;YACX,CAAC;YACD,KAAK,CAAC,KAAK,KAAmB,CAAC;YAC/B,KAAK,CAAC,gBAAgB,CAAC,SAAiB,IAAkB,CAAC;SAC5D,CAAC;QAEF,MAAM,QAAQ,CAAC,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,CAAC,EAAE,CAAC,CAAC;QACtD,MAAM,CAAC,KAAK,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC;IAChC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yBAAyB,EAAE,KAAK,IAAI,EAAE;QACvC,IAAI,YAAY,GAAG,CAAC,CAAC;QACrB,MAAM,QAAQ,GAAG,YAAY,CAAC,CAAC,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC;QAC5C,MAAM,KAAK,GAAgB;YACzB,KAAK,CAAC,SAAS,KAAmB,CAAC;YACnC,KAAK,CAAC,MAAM,CAAC,UAAoB,EAAE,IAAY;gBAC7C,YAAY,GAAG,IAAI,CAAC;gBACpB,OAAO,EAAE,CAAC;YACZ,CAAC;YACD,KAAK,CAAC,KAAK;gBACT,OAAO,CAAC,CAAC;YACX,CAAC;YACD,KAAK,CAAC,KAAK,KAAmB,CAAC;YAC/B,KAAK,CAAC,gBAAgB,CAAC,SAAiB,IAAkB,CAAC;SAC5D,CAAC;QAEF,MAAM,QAAQ,CAAC,OAAO,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC;QACzC,MAAM,CAAC,KAAK,CAAC,YAAY,EAAE,EAAE,CAAC,CAAC;IACjC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
import { describe, it, before, after } from "node:test";
|
|
2
|
+
import assert from "node:assert/strict";
|
|
3
|
+
import { LanceDBStore } from "../../vectorstore/lancedb.js";
|
|
4
|
+
import { normalizeFilePath } from "../../core/manifest.js";
|
|
5
|
+
// Test suite for LanceDBStore against the "memory://" location.
// The cases share one store instance and run in order: the search cases rely on the
// chunks inserted by the "adds chunks" case.
describe("LanceDBStore (memory)", () => {
    const DIMENSIONS = 384;

    // Vector whose every component equals `value`.
    const constantVector = (value) => Array.from({ length: DIMENSIONS }, () => value);

    // Vector with `evenValue` at even indices and its negation at odd indices.
    const alternatingVector = (evenValue) => Array.from({ length: DIMENSIONS }, (_, i) => (i % 2 === 0 ? evenValue : -evenValue));

    // Builds a TypeScript chunk fixture.
    const tsChunk = (id, content, embedding, filePath, startLine, endLine) => ({
        id,
        content,
        embedding,
        metadata: { filePath, startLine, endLine, language: "typescript" },
    });

    let db;

    before(async () => {
        db = new LanceDBStore("memory://");
    });

    after(async () => {
        await db.clear();
    });

    it("starts with zero count", async () => {
        assert.equal(await db.count(), 0);
    });

    it("adds chunks and returns correct count", async () => {
        await db.addChunks([
            tsChunk("chunk-1", "function hello() { return 'world'; }", alternatingVector(0.1), "src/hello.ts", 1, 5),
            tsChunk("chunk-2", "function goodbye() { return 'farewell'; }", alternatingVector(-0.1), "src/goodbye.ts", 1, 5),
        ]);
        assert.equal(await db.count(), 2);
    });

    it("searches and returns results with scores", async () => {
        // Query with the same alternating pattern that chunk-1 was stored with.
        const hits = await db.search(alternatingVector(0.1), 2);
        assert.ok(hits.length > 0, "Should return at least one result");
        const best = hits[0];
        assert.ok(best.score > 0, "Score should be positive");
        assert.ok(best.score <= 1, "Score should be <= 1");
        assert.equal(typeof best.chunk.id, "string");
        assert.equal(typeof best.chunk.content, "string");
    });

    it("respects topK parameter", async () => {
        const hits = await db.search(constantVector(0.1), 1);
        assert.equal(hits.length, 1);
    });

    it("clears all chunks", async () => {
        await db.clear();
        assert.equal(await db.count(), 0);
    });

    it("can re-add chunks after clear", async () => {
        await db.addChunks([
            tsChunk("chunk-3", "new content", constantVector(0.05), "src/new.ts", 1, 3),
        ]);
        assert.equal(await db.count(), 1);
    });

    it("filters out chunks without embeddings in addChunks", async () => {
        await db.clear();
        await db.addChunks([
            tsChunk("no-embed", "no embedding", undefined, "test.ts", 1, 1),
            tsChunk("empty-embed", "empty embedding", [], "test.ts", 2, 2),
        ]);
        assert.equal(await db.count(), 0, "Chunks without embeddings should not be stored");
    });

    it("deletes all chunks for a specific file path", async () => {
        await db.clear();
        await db.addChunks([
            tsChunk("delete-1", "alpha", constantVector(0.1), "src/delete-me.ts", 1, 1),
            tsChunk("delete-2", "beta", constantVector(0.2), "src/keep-me.ts", 1, 1),
            tsChunk("delete-3", "gamma", constantVector(0.3), "src/delete-me.ts", 2, 2),
        ]);
        await db.deleteByFilePath("src/delete-me.ts");
        assert.equal(await db.count(), 1);
        // Only the chunk from the untouched file should still be searchable.
        const hits = await db.search(constantVector(0.2), 5);
        assert.equal(hits.length, 1);
        assert.equal(hits[0].chunk.metadata.filePath, normalizeFilePath("src/keep-me.ts"));
    });
});
|
|
159
|
+
//# sourceMappingURL=lancedb.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"lancedb.test.js","sourceRoot":"","sources":["../../../src/__tests__/vectorstore/lancedb.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,WAAW,CAAC;AACxD,OAAO,MAAM,MAAM,oBAAoB,CAAC;AACxC,OAAO,EAAE,YAAY,EAAE,MAAM,8BAA8B,CAAC;AAC5D,OAAO,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAC;AAE3D,QAAQ,CAAC,uBAAuB,EAAE,GAAG,EAAE;IACrC,IAAI,KAAmB,CAAC;IAExB,MAAM,CAAC,KAAK,IAAI,EAAE;QAChB,KAAK,GAAG,IAAI,YAAY,CAAC,WAAW,CAAC,CAAC;IACxC,CAAC,CAAC,CAAC;IAEH,KAAK,CAAC,KAAK,IAAI,EAAE;QACf,MAAM,KAAK,CAAC,KAAK,EAAE,CAAC;IACtB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wBAAwB,EAAE,KAAK,IAAI,EAAE;QACtC,MAAM,KAAK,GAAG,MAAM,KAAK,CAAC,KAAK,EAAE,CAAC;QAClC,MAAM,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IACzB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;QACrD,MAAM,MAAM,GAAG;YACb;gBACE,EAAE,EAAE,SAAS;gBACb,OAAO,EAAE,sCAAsC;gBAC/C,SAAS,EAAE,IAAI,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;gBAC3E,QAAQ,EAAE;oBACR,QAAQ,EAAE,cAAc;oBACxB,SAAS,EAAE,CAAC;oBACZ,OAAO,EAAE,CAAC;oBACV,QAAQ,EAAE,YAAY;iBACvB;aACF;YACD;gBACE,EAAE,EAAE,SAAS;gBACb,OAAO,EAAE,2CAA2C;gBACpD,SAAS,EAAE,IAAI,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;gBAC3E,QAAQ,EAAE;oBACR,QAAQ,EAAE,gBAAgB;oBAC1B,SAAS,EAAE,CAAC;oBACZ,OAAO,EAAE,CAAC;oBACV,QAAQ,EAAE,YAAY;iBACvB;aACF;SACF,CAAC;QAEF,MAAM,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QAC9B,MAAM,KAAK,GAAG,MAAM,KAAK,CAAC,KAAK,EAAE,CAAC;QAClC,MAAM,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IACzB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;QACxD,0CAA0C;QAC1C,MAAM,WAAW,GAAG,IAAI,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CACtD,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CACzB,CAAC;QAEF,MAAM,OAAO,GAAG,MAAM,KA
AK,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC;QACnD,MAAM,CAAC,EAAE,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,mCAAmC,CAAC,CAAC;QACnE,MAAM,CAAC,EAAE,CAAC,OAAO,CAAC,CAAC,CAAE,CAAC,KAAK,GAAG,CAAC,EAAE,0BAA0B,CAAC,CAAC;QAC7D,MAAM,CAAC,EAAE,CAAC,OAAO,CAAC,CAAC,CAAE,CAAC,KAAK,IAAI,CAAC,EAAE,sBAAsB,CAAC,CAAC;QAC1D,MAAM,CAAC,KAAK,CAAC,OAAO,OAAO,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,EAAE,EAAE,QAAQ,CAAC,CAAC;QACpD,MAAM,CAAC,KAAK,CAAC,OAAO,OAAO,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC;IAC3D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yBAAyB,EAAE,KAAK,IAAI,EAAE;QACvC,MAAM,WAAW,GAAG,IAAI,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC7C,MAAM,OAAO,GAAG,MAAM,KAAK,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC;QACnD,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IAClC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,mBAAmB,EAAE,KAAK,IAAI,EAAE;QACjC,MAAM,KAAK,CAAC,KAAK,EAAE,CAAC;QACpB,MAAM,KAAK,GAAG,MAAM,KAAK,CAAC,KAAK,EAAE,CAAC;QAClC,MAAM,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IACzB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+BAA+B,EAAE,KAAK,IAAI,EAAE;QAC7C,MAAM,MAAM,GAAG;YACb;gBACE,EAAE,EAAE,SAAS;gBACb,OAAO,EAAE,aAAa;gBACtB,SAAS,EAAE,IAAI,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;gBACpC,QAAQ,EAAE;oBACR,QAAQ,EAAE,YAAY;oBACtB,SAAS,EAAE,CAAC;oBACZ,OAAO,EAAE,CAAC;oBACV,QAAQ,EAAE,YAAY;iBACvB;aACF;SACF,CAAC;QAEF,MAAM,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QAC9B,MAAM,KAAK,GAAG,MAAM,KAAK,CAAC,KAAK,EAAE,CAAC;QAClC,MAAM,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IACzB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oDAAoD,EAAE,KAAK,IAAI,EAAE;QAClE,MAAM,KAAK,CAAC,KAAK,EAAE,CAAC;QAEpB,MAAM,MAAM,GAAG;YACb;gBACE,EAAE,EAAE,UAAU;gBACd,OAAO,EAAE,cAAc;gBACvB,SAAS,EAAE,SAAgC;gBAC3C,QAAQ,EAAE;oBACR,QAAQ,EAAE,SAAS;oBACnB,SAAS,EAAE,CAAC;oBACZ,OAAO,EAAE,CAAC;oBACV,QAAQ,EAAE,YAAY;iBACvB;aACF;YACD;gBACE,EAAE,EAAE,aAAa;gBACjB,OAAO,EAAE,iBAAiB;gBAC1B,SAAS,EAAE,EAAE;gBACb,QAAQ,EAAE;oBACR,QAAQ,EAAE,SAAS;oBACnB,SAAS,EAAE,CAAC;oBACZ,OAAO,EAAE,CAAC;oBACV,QAAQ,EAAE,YAAY;iBACvB;aACF;SACF,CAAC;QAEF,MAAM,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QAC9B,MAAM,KAAK,GAAG,MAAM,KAAK,CAAC,KAAK,
EAAE,CAAC;QAClC,MAAM,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC,EAAE,gDAAgD,CAAC,CAAC;IAC3E,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;QAC3D,MAAM,KAAK,CAAC,KAAK,EAAE,CAAC;QAEpB,MAAM,KAAK,CAAC,SAAS,CAAC;YACpB;gBACE,EAAE,EAAE,UAAU;gBACd,OAAO,EAAE,OAAO;gBAChB,SAAS,EAAE,IAAI,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC;gBACnC,QAAQ,EAAE;oBACR,QAAQ,EAAE,kBAAkB;oBAC5B,SAAS,EAAE,CAAC;oBACZ,OAAO,EAAE,CAAC;oBACV,QAAQ,EAAE,YAAY;iBACvB;aACF;YACD;gBACE,EAAE,EAAE,UAAU;gBACd,OAAO,EAAE,MAAM;gBACf,SAAS,EAAE,IAAI,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC;gBACnC,QAAQ,EAAE;oBACR,QAAQ,EAAE,gBAAgB;oBAC1B,SAAS,EAAE,CAAC;oBACZ,OAAO,EAAE,CAAC;oBACV,QAAQ,EAAE,YAAY;iBACvB;aACF;YACD;gBACE,EAAE,EAAE,UAAU;gBACd,OAAO,EAAE,OAAO;gBAChB,SAAS,EAAE,IAAI,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC;gBACnC,QAAQ,EAAE;oBACR,QAAQ,EAAE,kBAAkB;oBAC5B,SAAS,EAAE,CAAC;oBACZ,OAAO,EAAE,CAAC;oBACV,QAAQ,EAAE,YAAY;iBACvB;aACF;SACF,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC,gBAAgB,CAAC,kBAAkB,CAAC,CAAC;QAEjD,MAAM,KAAK,GAAG,MAAM,KAAK,CAAC,KAAK,EAAE,CAAC;QAClC,MAAM,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QAEvB,MAAM,OAAO,GAAG,MAAM,KAAK,CAAC,MAAM,CAAC,IAAI,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;QAChE,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;QAChC,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,QAAQ,EAAE,iBAAiB,CAAC,gBAAgB,CAAC,CAAC,CAAC;IACzF,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { Chunker, Chunk } from "../core/interfaces.js";
|
|
2
|
+
/** Extracts text from an in-memory `.doc` file; takes the file bytes as a Buffer and resolves to a string. */
export declare function extractDocText(buffer: Buffer): Promise<string>;
|
|
3
|
+
/**
 * Chunker implementation whose `language` is "doc".
 * `fileExtensions` lists the extensions it handles, and `chunk(filePath, content)`
 * takes already-extracted text content and resolves to an array of Chunk objects.
 */
export declare class DocChunker implements Chunker {
|
|
4
|
+
readonly language = "doc";
|
|
5
|
+
readonly fileExtensions: string[];
|
|
6
|
+
chunk(filePath: string, content: string): Promise<Chunk[]>;
|
|
7
|
+
}
|
|
8
|
+
/** Module-level DocChunker instance exported for consumers. */
export declare const docChunker: DocChunker;
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import { uuid } from "./uuid.js";
|
|
2
|
+
// Sizing constants used by DocChunker.chunk (lengths are string lengths of paragraphs):
//  - MAX_CHUNK_CHARS: a paragraph longer than this is emitted as its own chunk, and the current
//    group is flushed before adding a paragraph that would push its summed length past this.
//  - MIN_GROUP_CHARS: once a group reaches this size it is flushed unless the next paragraph is
//    shorter than this value.
//  - PARAGRAPH_SPLIT: paragraph delimiter - a newline, optional whitespace, then another newline.
const MAX_CHUNK_CHARS = 4000;
|
|
3
|
+
const MIN_GROUP_CHARS = 300;
|
|
4
|
+
const PARAGRAPH_SPLIT = /\n\s*\n/;
|
|
5
|
+
/**
 * Extracts the body text of a `.doc` file using the "word-extractor" package.
 *
 * @param {Buffer} buffer - Raw bytes of the document.
 * @returns {Promise<string>} Whatever `getBody()` returns for the parsed document.
 */
export async function extractDocText(buffer) {
    // Loaded on demand so the dependency is only resolved when a .doc file is actually processed.
    const wordExtractorModule = await import("word-extractor");
    const WordExtractor = wordExtractorModule.default;
    const parsed = await new WordExtractor().extract(buffer);
    return parsed.getBody();
}
|
|
11
|
+
/**
 * Chunker for text extracted from legacy Word (`.doc`) files.
 *
 * The text is split into paragraphs on blank lines and the paragraphs are packed
 * into chunks governed by MAX_CHUNK_CHARS and MIN_GROUP_CHARS.
 */
export class DocChunker {
    language = "doc";
    fileExtensions = [".doc"];

    /**
     * Splits `content` into paragraph-based chunks.
     *
     * `metadata.startLine` / `metadata.endLine` hold 1-based paragraph ordinals
     * (positions in the list of non-blank paragraphs), not physical line numbers.
     *
     * @param {string} filePath - Path recorded in every chunk's metadata.
     * @param {string} content - Plain text of the document.
     * @returns {Promise<Array>} Chunks in document order; empty when the text is blank.
     */
    async chunk(filePath, content) {
        if (content.trim().length === 0)
            return [];
        const paragraphs = content.split(PARAGRAPH_SPLIT).filter((p) => p.trim().length > 0);
        if (paragraphs.length === 0)
            return [];

        const chunks = [];
        let currentGroup = [];
        let currentSize = 0;
        // Ordinal of the first paragraph in currentGroup. Tracking it explicitly keeps the
        // reported range correct even when the group is flushed *before* the paragraph
        // currently being visited is added to it.
        let groupStart = 0;

        const pushChunk = (text, startLine, endLine) => {
            chunks.push({
                id: uuid(),
                content: text,
                metadata: { filePath, startLine, endLine, language: "doc" },
            });
        };

        const flush = () => {
            const text = currentGroup.join("\n\n").trim();
            if (text.length === 0)
                return;
            pushChunk(text, groupStart, groupStart + currentGroup.length - 1);
            currentGroup = [];
            currentSize = 0;
        };

        for (const [index, para] of paragraphs.entries()) {
            const ordinal = index + 1;
            const paraLen = para.length;

            if (paraLen > MAX_CHUNK_CHARS) {
                // Oversized paragraphs are emitted whole as their own chunk (they are not split).
                if (currentGroup.length > 0)
                    flush();
                pushChunk(para, ordinal, ordinal);
                continue;
            }

            // Close the running group first if this paragraph would push it past the cap.
            if (currentGroup.length > 0 && currentSize + paraLen > MAX_CHUNK_CHARS)
                flush();

            if (currentGroup.length === 0)
                groupStart = ordinal;
            currentGroup.push(para);
            currentSize += paraLen;

            if (currentSize >= MIN_GROUP_CHARS) {
                // Keep absorbing while the following paragraph is small; otherwise close the group.
                const next = paragraphs[ordinal];
                const nextParaStillSmall = next !== undefined && next.length < MIN_GROUP_CHARS;
                if (!nextParaStillSmall)
                    flush();
            }
        }

        if (currentGroup.length > 0)
            flush();
        return chunks;
    }
}

// Shared instance exported for consumers of this module.
export const docChunker = new DocChunker();
|
|
79
|
+
//# sourceMappingURL=doc.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"doc.js","sourceRoot":"","sources":["../../src/chunker/doc.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,MAAM,eAAe,GAAG,IAAI,CAAC;AAC7B,MAAM,eAAe,GAAG,GAAG,CAAC;AAE5B,MAAM,eAAe,GAAG,SAAS,CAAC;AAElC,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,MAAc;IACjD,MAAM,aAAa,GAAG,CAAC,MAAM,MAAM,CAAC,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC;IAC/D,MAAM,SAAS,GAAG,IAAI,aAAa,EAAE,CAAC;IACtC,MAAM,GAAG,GAAG,MAAM,SAAS,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;IAC5C,OAAO,GAAG,CAAC,OAAO,EAAE,CAAC;AACvB,CAAC;AAED,MAAM,OAAO,UAAU;IACZ,QAAQ,GAAG,KAAK,CAAC;IACjB,cAAc,GAAG,CAAC,MAAM,CAAC,CAAC;IAEnC,KAAK,CAAC,KAAK,CAAC,QAAgB,EAAE,OAAe;QAC3C,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAE3C,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACrF,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAEvC,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,IAAI,YAAY,GAAa,EAAE,CAAC;QAChC,IAAI,WAAW,GAAG,CAAC,CAAC;QACpB,IAAI,cAAc,GAAG,CAAC,CAAC;QAEvB,SAAS,KAAK;YACZ,MAAM,IAAI,GAAG,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;YAC9C,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;gBAAE,OAAO;YAC9B,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAI,EAAE;gBACV,OAAO,EAAE,IAAI;gBACb,QAAQ,EAAE;oBACR,QAAQ;oBACR,SAAS,EAAE,cAAc,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC;oBACnD,OAAO,EAAE,cAAc;oBACvB,QAAQ,EAAE,KAAK;iBAChB;aACF,CAAC,CAAC;YACH,YAAY,GAAG,EAAE,CAAC;YAClB,WAAW,GAAG,CAAC,CAAC;QAClB,CAAC;QAED,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;YAC9B,cAAc,EAAE,CAAC;YACjB,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC;YAE5B,IAAI,OAAO,GAAG,eAAe,EAAE,CAAC;gBAC9B,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC;oBAAE,KAAK,EAAE,CAAC;gBACrC,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,IAAI,EAAE;oBACV,OAAO,EAAE,IAAI;oBACb,QAAQ,EAAE;wBACR,QAAQ;wBACR,SAAS,EAAE,cAAc;wBACzB,OAAO,EAAE,cAAc;wBACvB,QAAQ,EAAE,KAAK;qBAChB;iBACF,CAAC,CAAC;gBACH,SAAS;YACX,CAAC;YAED,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,IAAI,WAAW,GAAG,OAAO,GAAG,eAAe,EAAE,CAAC;gBACvE,KAAK,EAAE,CAAC;YACV,CAAC;YAED,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACxB,WAAW,IAAI,O
AAO,CAAC;YAEvB,IAAI,WAAW,IAAI,eAAe,IAAI,YAAY,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;gBAC/D,MAAM,kBAAkB,GACtB,cAAc,GAAG,UAAU,CAAC,MAAM;oBAClC,UAAU,CAAC,cAAc,CAAE,CAAC,MAAM,GAAG,eAAe,CAAC;gBACvD,IAAI,CAAC,kBAAkB,EAAE,CAAC;oBACxB,KAAK,EAAE,CAAC;gBACV,CAAC;YACH,CAAC;QACH,CAAC;QAED,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,KAAK,EAAE,CAAC;QACV,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;CACF;AAED,MAAM,CAAC,MAAM,UAAU,GAAG,IAAI,UAAU,EAAE,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { Chunker, Chunk } from "../core/interfaces.js";
|
|
2
|
+
/** Extracts text from an in-memory `.docx` file; takes the file bytes as a Buffer and resolves to a string. */
export declare function extractDocxText(buffer: Buffer): Promise<string>;
|
|
3
|
+
/**
 * Chunker implementation whose `language` is "docx".
 * `fileExtensions` lists the extensions it handles, and `chunk(filePath, content)`
 * takes already-extracted text content and resolves to an array of Chunk objects.
 */
export declare class DocxChunker implements Chunker {
|
|
4
|
+
readonly language = "docx";
|
|
5
|
+
readonly fileExtensions: string[];
|
|
6
|
+
chunk(filePath: string, content: string): Promise<Chunk[]>;
|
|
7
|
+
}
|
|
8
|
+
/** Module-level DocxChunker constant exported for consumers. */
export declare const docxChunker: DocxChunker;
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { uuid } from "./uuid.js";
|
|
2
|
+
// Sizing constants used by DocxChunker.chunk (lengths are string lengths of paragraphs):
//  - MAX_CHUNK_CHARS: a paragraph longer than this is emitted as its own chunk, and the current
//    group is flushed before adding a paragraph that would push its summed length past this.
//  - MIN_GROUP_CHARS: compared against the running group size and the next paragraph's length
//    when deciding whether a group is large enough.
//  - PARAGRAPH_SPLIT: paragraph delimiter - a newline, optional whitespace, then another newline.
const MAX_CHUNK_CHARS = 4000;
|
|
3
|
+
const MIN_GROUP_CHARS = 300;
|
|
4
|
+
const PARAGRAPH_SPLIT = /\n\s*\n/;
|
|
5
|
+
/**
 * Extracts the raw text of a `.docx` file using the "mammoth" package.
 *
 * @param {Buffer} buffer - Raw bytes of the document.
 * @returns {Promise<string>} The `value` field of mammoth's raw-text extraction result.
 */
export async function extractDocxText(buffer) {
    // Loaded on demand so the dependency is only resolved when a .docx file is actually processed.
    const docxLib = await import("mammoth");
    const extraction = await docxLib.extractRawText({ buffer: buffer });
    return extraction.value;
}
|
|
10
|
+
/**
 * Chunker for plain text extracted from .docx documents.
 *
 * The text is split into paragraphs on blank lines, and consecutive
 * paragraphs are merged into chunks bounded by MAX_CHUNK_CHARS.
 */
export class DocxChunker {
    language = "docx";
    fileExtensions = [".docx"];
    /**
     * Split extracted document text into paragraph-group chunks.
     *
     * `metadata.startLine` and `metadata.endLine` are 1-based paragraph
     * indices (positions among the non-blank paragraphs), not physical line
     * numbers. Group size is measured as the sum of paragraph lengths, so the
     * "\n\n" joiners may push a chunk slightly past MAX_CHUNK_CHARS.
     *
     * @param {string} filePath - Path recorded in every chunk's metadata.
     * @param {string} content - Plain text of the document.
     * @returns {Promise<Array<object>>} Chunks in document order; empty when
     *   the content is blank.
     */
    async chunk(filePath, content) {
        if (content.trim().length === 0)
            return [];
        const paragraphs = content.split(PARAGRAPH_SPLIT).filter((p) => p.trim().length > 0);
        if (paragraphs.length === 0)
            return [];
        const chunks = [];
        let currentGroup = [];
        let currentSize = 0;
        const emit = (text, startLine, endLine) => {
            chunks.push({
                id: uuid(),
                content: text,
                metadata: {
                    filePath,
                    startLine,
                    endLine,
                    language: "docx",
                },
            });
        };
        // `lastIndex` is the 1-based index of the LAST paragraph held in
        // currentGroup. It is passed explicitly because some flushes happen
        // before the paragraph currently being visited has joined the group;
        // deriving the range from the loop counter there would shift the
        // reported range by one and make it overlap the next chunk.
        const flush = (lastIndex) => {
            const text = currentGroup.join("\n\n").trim();
            if (text.length > 0) {
                emit(text, lastIndex - currentGroup.length + 1, lastIndex);
            }
            currentGroup = [];
            currentSize = 0;
        };
        for (const [offset, para] of paragraphs.entries()) {
            const paragraphIndex = offset + 1;
            const paraLen = para.length;
            if (paraLen > MAX_CHUNK_CHARS) {
                // Oversized paragraph: close the pending group (which ends at
                // the previous paragraph) and emit this one on its own.
                if (currentGroup.length > 0) {
                    flush(paragraphIndex - 1);
                }
                emit(para, paragraphIndex, paragraphIndex);
                continue;
            }
            if (currentGroup.length > 0 && currentSize + paraLen > MAX_CHUNK_CHARS) {
                // Adding this paragraph would overflow; the group still ends
                // at the previous paragraph.
                flush(paragraphIndex - 1);
            }
            currentGroup.push(para);
            currentSize += paraLen;
            if (currentSize >= MIN_GROUP_CHARS) {
                // paragraphs[paragraphIndex] is the NEXT paragraph (0-based).
                // Keep absorbing while the upcoming paragraph is short.
                const upcoming = paragraphs[paragraphIndex];
                const nextParaStillSmall = upcoming !== undefined && upcoming.length < MIN_GROUP_CHARS;
                if (!nextParaStillSmall) {
                    flush(paragraphIndex);
                }
            }
        }
        if (currentGroup.length > 0) {
            flush(paragraphs.length);
        }
        return chunks;
    }
}
|
|
77
|
+
// Module-level DocxChunker instance exported for reuse (the chunker factory imports it).
export const docxChunker = new DocxChunker();
|
|
78
|
+
//# sourceMappingURL=docx.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"docx.js","sourceRoot":"","sources":["../../src/chunker/docx.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,MAAM,eAAe,GAAG,IAAI,CAAC;AAC7B,MAAM,eAAe,GAAG,GAAG,CAAC;AAE5B,MAAM,eAAe,GAAG,SAAS,CAAC;AAElC,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,MAAc;IAClD,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,SAAS,CAAC,CAAC;IACxC,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;IACxD,OAAO,MAAM,CAAC,KAAK,CAAC;AACtB,CAAC;AAED,MAAM,OAAO,WAAW;IACb,QAAQ,GAAG,MAAM,CAAC;IAClB,cAAc,GAAG,CAAC,OAAO,CAAC,CAAC;IAEpC,KAAK,CAAC,KAAK,CAAC,QAAgB,EAAE,OAAe;QAC3C,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAE3C,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACrF,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAEvC,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,IAAI,YAAY,GAAa,EAAE,CAAC;QAChC,IAAI,WAAW,GAAG,CAAC,CAAC;QACpB,IAAI,cAAc,GAAG,CAAC,CAAC;QAEvB,SAAS,KAAK;YACZ,MAAM,IAAI,GAAG,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;YAC9C,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;gBAAE,OAAO;YAC9B,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAI,EAAE;gBACV,OAAO,EAAE,IAAI;gBACb,QAAQ,EAAE;oBACR,QAAQ;oBACR,SAAS,EAAE,cAAc,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC;oBACnD,OAAO,EAAE,cAAc;oBACvB,QAAQ,EAAE,MAAM;iBACjB;aACF,CAAC,CAAC;YACH,YAAY,GAAG,EAAE,CAAC;YAClB,WAAW,GAAG,CAAC,CAAC;QAClB,CAAC;QAED,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;YAC9B,cAAc,EAAE,CAAC;YACjB,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC;YAE5B,IAAI,OAAO,GAAG,eAAe,EAAE,CAAC;gBAC9B,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC;oBAAE,KAAK,EAAE,CAAC;gBACrC,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,IAAI,EAAE;oBACV,OAAO,EAAE,IAAI;oBACb,QAAQ,EAAE;wBACR,QAAQ;wBACR,SAAS,EAAE,cAAc;wBACzB,OAAO,EAAE,cAAc;wBACvB,QAAQ,EAAE,MAAM;qBACjB;iBACF,CAAC,CAAC;gBACH,SAAS;YACX,CAAC;YAED,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,IAAI,WAAW,GAAG,OAAO,GAAG,eAAe,EAAE,CAAC;gBACvE,KAAK,EAAE,CAAC;YACV,CAAC;YAED,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACxB,WAAW,IAAI,OAAO,CAAC;YAEvB,IAAI,WAAW,IAAI,eAAe,IAAI,YAAY,CAAC,MAAM,I
AAI,CAAC,EAAE,CAAC;gBAC/D,MAAM,kBAAkB,GACtB,cAAc,GAAG,UAAU,CAAC,MAAM;oBAClC,UAAU,CAAC,cAAc,CAAE,CAAC,MAAM,GAAG,eAAe,CAAC;gBACvD,IAAI,CAAC,kBAAkB,EAAE,CAAC;oBACxB,KAAK,EAAE,CAAC;gBACV,CAAC;YACH,CAAC;QACH,CAAC;QAED,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,KAAK,EAAE,CAAC;QACV,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;CACF;AAED,MAAM,CAAC,MAAM,WAAW,GAAG,IAAI,WAAW,EAAE,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { Chunker, Chunk } from "../core/interfaces.js";
|
|
2
|
+
/**
 * Resolves with a text rendering of the workbook in the given buffer: one
 * "[Sheet: name]" header per sheet followed by that sheet's CSV.
 */
export declare function extractExcelText(buffer: Buffer): Promise<string>;
|
|
3
|
+
/** Chunker that turns extracted spreadsheet text into per-sheet chunks. */
export declare class ExcelChunker implements Chunker {
|
|
4
|
+
/** Language tag written into every chunk's metadata. */
readonly language = "excel";
|
|
5
|
+
/** File extensions this chunker is registered for. */
readonly fileExtensions: string[];
|
|
6
|
+
/**
 * Splits already-extracted workbook text into chunks, one per
 * "[Sheet: ...]" section, with oversized sections split into row batches.
 * Resolves with an empty array for blank content.
 */
chunk(filePath: string, content: string): Promise<Chunk[]>;
|
|
7
|
+
}
|
|
8
|
+
/** Module-level ExcelChunker instance exported for reuse. */
export declare const excelChunker: ExcelChunker;
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { uuid } from "./uuid.js";
|
|
2
|
+
// Sheet sections longer than this many characters are split into row batches.
const MAX_CHUNK_CHARS = 4000;
|
|
3
|
+
/**
 * Render a workbook as plain text.
 *
 * Each sheet contributes a "[Sheet: <name>]" header and, when the sheet has
 * any non-blank CSV output, the CSV itself. All parts are separated by a
 * blank line. The "@e965/xlsx" package is imported on demand.
 *
 * @param {Buffer} buffer - Contents of the spreadsheet file.
 * @returns {Promise<string>} Concatenated headers and CSV text.
 */
export async function extractExcelText(buffer) {
    const XLSX = await import("@e965/xlsx");
    const workbook = XLSX.read(buffer, { type: "buffer" });
    const parts = workbook.SheetNames.flatMap((sheetName) => {
        const sheet = workbook.Sheets[sheetName];
        if (!sheet) {
            return [];
        }
        const header = `[Sheet: ${sheetName}]`;
        const csv = XLSX.utils.sheet_to_csv(sheet, { blankrows: false });
        // A sheet with no CSV content still contributes its header.
        return csv.trim().length > 0 ? [header, csv] : [header];
    });
    return parts.join("\n\n");
}
|
|
19
|
+
/**
 * Chunker for workbook text made of "[Sheet: name]" sections separated by
 * blank lines.
 */
export class ExcelChunker {
    language = "excel";
    fileExtensions = [".xls", ".xlsx"];
    /**
     * Split workbook text into one chunk per sheet section, breaking sections
     * longer than MAX_CHUNK_CHARS into batches of whole rows.
     *
     * `metadata.startLine` and `metadata.endLine` are 1-based line numbers
     * within `content`. A single row longer than MAX_CHUNK_CHARS is kept
     * intact as its own batch.
     *
     * @param {string} filePath - Path recorded in every chunk's metadata.
     * @param {string} content - Workbook text to chunk.
     * @returns {Promise<Array<object>>} Chunks in document order; empty when
     *   the content is blank.
     */
    async chunk(filePath, content) {
        if (content.trim().length === 0)
            return [];
        const chunks = [];
        const emit = (text, startLine, endLine) => {
            chunks.push({
                id: uuid(),
                content: text,
                metadata: { filePath, startLine, endLine, language: "excel" },
            });
        };
        // Split by sheet sections (separated by double newlines)
        const sections = content.split(/\n\n(?=\[Sheet:)/);
        let nextLine = 1;
        for (const section of sections) {
            const rows = section.split("\n");
            const startLine = nextLine;
            const endLine = startLine + rows.length - 1;
            // The "\n\n" separator removed by split() ends the section's last
            // line AND contributes one blank line, so the next section starts
            // two lines after this one ends.
            nextLine = endLine + 2;
            if (section.trim().length === 0)
                continue;
            if (section.length <= MAX_CHUNK_CHARS) {
                emit(section.trim(), startLine, endLine);
                continue;
            }
            // Split oversized sheet content into row batches
            let batch = [];
            let batchSize = 0;
            let batchStart = startLine;
            for (const [offset, row] of rows.entries()) {
                // Line number of this row; the first row sits on startLine.
                const rowLine = startLine + offset;
                const rowLen = row.length + 1; // +1 for the newline joiner
                if (batch.length > 0 && batchSize + rowLen > MAX_CHUNK_CHARS) {
                    emit(batch.join("\n").trim(), batchStart, rowLine - 1);
                    batch = [];
                    batchSize = 0;
                    batchStart = rowLine;
                }
                batch.push(row);
                batchSize += rowLen;
            }
            if (batch.length > 0) {
                emit(batch.join("\n").trim(), batchStart, endLine);
            }
        }
        return chunks;
    }
}
|
|
77
|
+
// Module-level ExcelChunker instance exported for reuse (the chunker factory imports it).
export const excelChunker = new ExcelChunker();
|
|
78
|
+
//# sourceMappingURL=excel.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"excel.js","sourceRoot":"","sources":["../../src/chunker/excel.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,MAAM,eAAe,GAAG,IAAI,CAAC;AAE7B,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,MAAc;IACnD,MAAM,IAAI,GAAG,MAAM,MAAM,CAAC,YAAY,CAAC,CAAC;IACxC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC,CAAC;IACvD,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,KAAK,MAAM,SAAS,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC;QAC5C,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QACzC,IAAI,CAAC,KAAK;YAAE,SAAS;QACrB,KAAK,CAAC,IAAI,CAAC,WAAW,SAAS,GAAG,CAAC,CAAC;QACpC,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC,CAAC;QACjE,IAAI,GAAG,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC1B,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC5B,CAAC;AAED,MAAM,OAAO,YAAY;IACd,QAAQ,GAAG,OAAO,CAAC;IACnB,cAAc,GAAG,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAE5C,KAAK,CAAC,KAAK,CAAC,QAAgB,EAAE,OAAe;QAC3C,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAE3C,yDAAyD;QACzD,MAAM,QAAQ,GAAG,OAAO,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACtF,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAErC,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,IAAI,WAAW,GAAG,CAAC,CAAC;QAEpB,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YACzC,MAAM,SAAS,GAAG,WAAW,GAAG,CAAC,CAAC;YAClC,WAAW,IAAI,YAAY,CAAC,MAAM,CAAC;YACnC,MAAM,OAAO,GAAG,WAAW,CAAC;YAE5B,IAAI,OAAO,CAAC,MAAM,IAAI,eAAe,EAAE,CAAC;gBACtC,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,IAAI,EAAE;oBACV,OAAO,EAAE,OAAO,CAAC,IAAI,EAAE;oBACvB,QAAQ,EAAE,EAAE,QAAQ,EAAE,SAAS,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE;iBAC9D,CAAC,CAAC;gBACH,SAAS;YACX,CAAC;YAED,iDAAiD;YACjD,MAAM,IAAI,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YACjC,IAAI,KAAK,GAAa,EAAE,CAAC;YACzB,IAAI,SAAS,GAAG,CAAC,CAAC;YAClB,IAAI,UAAU,GAAG,SAAS,CAAC;YAC3B,IAAI,OAAO,GAAG,SAAS,CAAC;YAExB,KAAK,MAAM,GAAG,IA
AI,IAAI,EAAE,CAAC;gBACvB,OAAO,EAAE,CAAC;gBACV,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC;gBAC9B,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,SAAS,GAAG,MAAM,GAAG,eAAe,EAAE,CAAC;oBAC7D,MAAM,CAAC,IAAI,CAAC;wBACV,EAAE,EAAE,IAAI,EAAE;wBACV,OAAO,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE;wBAChC,QAAQ,EAAE,EAAE,QAAQ,EAAE,SAAS,EAAE,UAAU,EAAE,OAAO,EAAE,OAAO,GAAG,CAAC,EAAE,QAAQ,EAAE,OAAO,EAAE;qBACvF,CAAC,CAAC;oBACH,KAAK,GAAG,EAAE,CAAC;oBACX,SAAS,GAAG,CAAC,CAAC;oBACd,UAAU,GAAG,OAAO,CAAC;gBACvB,CAAC;gBACD,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBAChB,SAAS,IAAI,MAAM,CAAC;YACtB,CAAC;YAED,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACrB,MAAM,CAAC,IAAI,CAAC;oBACV,EAAE,EAAE,IAAI,EAAE;oBACV,OAAO,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE;oBAChC,QAAQ,EAAE,EAAE,QAAQ,EAAE,SAAS,EAAE,UAAU,EAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE;iBACnF,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;CACF;AAED,MAAM,CAAC,MAAM,YAAY,GAAG,IAAI,YAAY,EAAE,CAAC"}
|
|
@@ -23,5 +23,6 @@ import { fallbackChunker } from "./fallback.js";
|
|
|
23
23
|
import { pdfChunker } from "./pdf.js";
|
|
24
24
|
export declare function registerChunker(chunker: Chunker, extensions?: string[]): void;
|
|
25
25
|
export declare function getChunker(filePath: string): Chunker;
|
|
26
|
+
/** Returns the extensions that currently have a chunker registered, in sorted order. */
export declare function getRegisteredExtensions(): string[];
|
|
26
27
|
export declare function chunkFile(filePath: string, content: string): Promise<Chunk[]>;
|
|
27
28
|
export { typescriptChunker, pythonChunker, javaChunker, goChunker, markdownChunker, cChunker, cppChunker, csharpChunker, javascriptChunker, razorChunker, jsonChunker, htmlChunker, cssChunker, xmlChunker, slnChunker, rustChunker, rubyChunker, kotlinChunker, swiftChunker, texChunker, pdfChunker, fallbackChunker };
|
package/dist/chunker/factory.js
CHANGED
|
@@ -20,6 +20,9 @@ import { swiftChunker } from "./swift.js";
|
|
|
20
20
|
import { texChunker } from "./tex.js";
|
|
21
21
|
import { fallbackChunker } from "./fallback.js";
|
|
22
22
|
import { pdfChunker } from "./pdf.js";
|
|
23
|
+
import { docxChunker } from "./docx.js";
|
|
24
|
+
import { docChunker } from "./doc.js";
|
|
25
|
+
import { excelChunker } from "./excel.js";
|
|
23
26
|
import { uuid } from "./uuid.js";
|
|
24
27
|
const chunkers = [
|
|
25
28
|
typescriptChunker,
|
|
@@ -43,6 +46,9 @@ const chunkers = [
|
|
|
43
46
|
swiftChunker,
|
|
44
47
|
texChunker,
|
|
45
48
|
pdfChunker,
|
|
49
|
+
docxChunker,
|
|
50
|
+
docChunker,
|
|
51
|
+
excelChunker,
|
|
46
52
|
];
|
|
47
53
|
const extensionMap = new Map();
|
|
48
54
|
for (const chunker of chunkers) {
|
|
@@ -70,6 +76,9 @@ export function getChunker(filePath) {
|
|
|
70
76
|
const ext = filePath.slice(filePath.lastIndexOf(".")).toLowerCase();
|
|
71
77
|
return extensionMap.get(ext) ?? fallbackChunker;
|
|
72
78
|
}
|
|
79
|
+
/**
 * List every extension key present in the chunker extension map.
 *
 * @returns {string[]} A new array of the map's keys, sorted with the default
 *   (code-unit) string ordering.
 */
export function getRegisteredExtensions() {
    const extensions = Array.from(extensionMap.keys());
    extensions.sort();
    return extensions;
}
|
|
73
82
|
const MAX_CHUNK_LINES = 100;
|
|
74
83
|
const MAX_CHUNK_CHARS = 8000;
|
|
75
84
|
function splitOversized(chunks, filePath) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"factory.js","sourceRoot":"","sources":["../../src/chunker/factory.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AACpD,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAC5C,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AACpC,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,QAAQ,EAAE,MAAM,QAAQ,CAAC;AAClC,OAAO,EAAE,UAAU,EAAE,MAAM,UAAU,CAAC;AACtC,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAC5C,OAAO,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AACpD,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,UAAU,EAAE,MAAM,UAAU,CAAC;AACtC,OAAO,EAAE,UAAU,EAAE,MAAM,UAAU,CAAC;AACtC,OAAO,EAAE,UAAU,EAAE,MAAM,UAAU,CAAC;AACtC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAC5C,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,EAAE,UAAU,EAAE,MAAM,UAAU,CAAC;AACtC,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,UAAU,EAAE,MAAM,UAAU,CAAC;AACtC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,MAAM,QAAQ,GAAc;IAC1B,iBAAiB;IACjB,aAAa;IACb,WAAW;IACX,SAAS;IACT,eAAe;IACf,QAAQ;IACR,UAAU;IACV,aAAa;IACb,iBAAiB;IACjB,YAAY;IACZ,WAAW;IACX,WAAW;IACX,UAAU;IACV,UAAU;IACV,UAAU;IACV,WAAW;IACX,WAAW;IACX,aAAa;IACb,YAAY;IACZ,UAAU;IACV,UAAU;
|
|
1
|
+
{"version":3,"file":"factory.js","sourceRoot":"","sources":["../../src/chunker/factory.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AACpD,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAC5C,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AACpC,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,QAAQ,EAAE,MAAM,QAAQ,CAAC;AAClC,OAAO,EAAE,UAAU,EAAE,MAAM,UAAU,CAAC;AACtC,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAC5C,OAAO,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AACpD,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,UAAU,EAAE,MAAM,UAAU,CAAC;AACtC,OAAO,EAAE,UAAU,EAAE,MAAM,UAAU,CAAC;AACtC,OAAO,EAAE,UAAU,EAAE,MAAM,UAAU,CAAC;AACtC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAC5C,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,EAAE,UAAU,EAAE,MAAM,UAAU,CAAC;AACtC,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,UAAU,EAAE,MAAM,UAAU,CAAC;AACtC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,UAAU,EAAE,MAAM,UAAU,CAAC;AACtC,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,MAAM,QAAQ,GAAc;IAC1B,iBAAiB;IACjB,aAAa;IACb,WAAW;IACX,SAAS;IACT,eAAe;IACf,QAAQ;IACR,UAAU;IACV,aAAa;IACb,iBAAiB;IACjB,YAAY;IACZ,WAAW;IACX,WAAW;IACX,UAAU;IACV,UAAU;IACV,UAAU;IACV,WAAW;IACX,WAAW;IACX,aAAa;IACb,YAAY;IACZ,UAAU;IACV,UAAU;IACV,WAAW;IACX,UAAU;IACV,YAAY;CACb,CAAC;AAEF,MAAM,YAAY,GAAG,IAAI,GAAG,EAAmB,CAAC;AAEhD,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;IAC/B,IAAI,gBAAgB,IAAI,OAAO,EAAE,CAAC;QAChC,MAAM,EAAE,GAAG,OAAwD,CAAC;QACpE,KAAK,MAAM,GAAG,IAAI,EAAE,CAAC,cAAc,EAAE,CAAC;YACpC,YAAY,CAAC,GAAG,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;QACjC,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,UAAU,eAAe,CAC7B,OAAgB,EAChB,UAAqB;IAErB,MAAM,IAAI,GAAG,UAAU,IAAI,CAAC,gBAAgB,IAAI,OAAO;QACrD,CAAC,CAAE,OAAyD,CAAC,cAAc;QAC3E,CAAC,CAAC,EAAE,CAAC,CAAC;IAER,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,KAAK,GAAG,GAAG,CAAC,WAAW,EAAE,CAAC;QAC
hC,IAAI,YAAY,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;YAC5B,OAAO,CAAC,IAAI,CACV,+BAA+B,KAAK,sDAAsD,OAAO,CAAC,QAAQ,GAAG,CAC9G,CAAC;YACF,SAAS;QACX,CAAC;QACD,YAAY,CAAC,GAAG,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IACnC,CAAC;AACH,CAAC;AAED,MAAM,UAAU,UAAU,CAAC,QAAgB;IACzC,MAAM,GAAG,GAAG,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC;IACpE,OAAO,YAAY,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,eAAe,CAAC;AAClD,CAAC;AAED,MAAM,UAAU,uBAAuB;IACrC,OAAO,CAAC,GAAG,YAAY,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;AACzC,CAAC;AAED,MAAM,eAAe,GAAG,GAAG,CAAC;AAC5B,MAAM,eAAe,GAAG,IAAI,CAAC;AAE7B,SAAS,cAAc,CAAC,MAAe,EAAE,QAAgB;IACvD,MAAM,MAAM,GAAY,EAAE,CAAC;IAE3B,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,MAAM,KAAK,GAAG,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QACxC,IAAI,KAAK,CAAC,MAAM,IAAI,eAAe,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,IAAI,eAAe,EAAE,CAAC;YAC/E,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACnB,SAAS;QACX,CAAC;QAED,MAAM,SAAS,GAAY,EAAE,CAAC;QAC9B,IAAI,YAAY,GAAa,EAAE,CAAC;QAChC,IAAI,gBAAgB,GAAG,CAAC,CAAC;QACzB,IAAI,UAAU,GAAG,CAAC,CAAC;QAEnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC;YACvB,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC;YAEhC,IACE,YAAY,CAAC,MAAM,GAAG,CAAC;gBACvB,CAAC,YAAY,CAAC,MAAM,IAAI,eAAe,IAAI,gBAAgB,GAAG,OAAO,GAAG,eAAe,CAAC,EACxF,CAAC;gBACD,SAAS,CAAC,IAAI,CAAC;oBACb,EAAE,EAAE,IAAI,EAAE;oBACV,OAAO,EAAE,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC;oBAChC,QAAQ,EAAE;wBACR,QAAQ;wBACR,SAAS,EAAE,KAAK,CAAC,QAAQ,CAAC,SAAS,GAAG,UAAU;wBAChD,OAAO,EAAE,KAAK,CAAC,QAAQ,CAAC,SAAS,GAAG,CAAC,GAAG,CAAC;wBACzC,QAAQ,EAAE,KAAK,CAAC,QAAQ,CAAC,QAAQ;qBAClC;iBACF,CAAC,CAAC;gBACH,YAAY,GAAG,EAAE,CAAC;gBAClB,gBAAgB,GAAG,CAAC,CAAC;gBACrB,UAAU,GAAG,CAAC,CAAC;YACjB,CAAC;YAED,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACxB,gBAAgB,IAAI,OAAO,CAAC;QAC9B,CAAC;QAED,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,SAAS,CAAC,IAAI,CAAC;gBACb,EAAE,EAAE,IAAI,EAAE;gBACV,OAAO,EAAE,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC;gBAChC,QAAQ,EAAE;oBACR,QAAQ;oBACR,SAAS,EAAE,KAAK,CAAC,QAAQ,C
AAC,SAAS,GAAG,UAAU;oBAChD,OAAO,EAAE,KAAK,CAAC,QAAQ,CAAC,SAAS,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC;oBACpD,QAAQ,EAAE,KAAK,CAAC,QAAQ,CAAC,QAAQ;iBAClC;aACF,CAAC,CAAC;QACL,CAAC;QAED,KAAK,MAAM,GAAG,IAAI,SAAS,EAAE,CAAC;YAC5B,IAAI,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAClC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACnB,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,QAAgB,EAChB,OAAe;IAEf,MAAM,OAAO,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC;IACrC,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,KAAK,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAEtD,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxB,OAAO,eAAe,CAAC,KAAK,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAClD,CAAC;IAED,OAAO,cAAc,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC;AAC1C,CAAC;AAED,OAAO,EAAE,iBAAiB,EAAE,aAAa,EAAE,WAAW,EAAE,SAAS,EAAE,eAAe,EAAE,QAAQ,EAAE,UAAU,EAAE,aAAa,EAAE,iBAAiB,EAAE,YAAY,EAAE,WAAW,EAAE,WAAW,EAAE,UAAU,EAAE,UAAU,EAAE,UAAU,EAAE,WAAW,EAAE,WAAW,EAAE,aAAa,EAAE,YAAY,EAAE,UAAU,EAAE,UAAU,EAAE,eAAe,EAAE,CAAC"}
|