@meaningfully/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.nvmrc +1 -0
- package/LICENSE +7 -0
- package/README.md +3 -0
- package/dist/DocumentSetManager.d.ts +28 -0
- package/dist/DocumentSetManager.d.ts.map +1 -0
- package/dist/DocumentSetManager.js +134 -0
- package/dist/DocumentSetManager.js.map +1 -0
- package/dist/Meaningfully.d.ts +52 -0
- package/dist/Meaningfully.d.ts.map +1 -0
- package/dist/Meaningfully.js +206 -0
- package/dist/Meaningfully.js.map +1 -0
- package/dist/MetadataManager.d.ts +32 -0
- package/dist/MetadataManager.d.ts.map +1 -0
- package/dist/MetadataManager.js +115 -0
- package/dist/MetadataManager.js.map +1 -0
- package/dist/api/embedding.d.ts +7 -0
- package/dist/api/embedding.d.ts.map +1 -0
- package/dist/api/embedding.js +94 -0
- package/dist/api/embedding.js.map +1 -0
- package/dist/api/embedding.test.d.ts +2 -0
- package/dist/api/embedding.test.d.ts.map +1 -0
- package/dist/api/embedding.test.js +340 -0
- package/dist/api/embedding.test.js.map +1 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -0
- package/dist/services/batchingWeaviateVectorStore.d.ts +6 -0
- package/dist/services/batchingWeaviateVectorStore.d.ts.map +1 -0
- package/dist/services/batchingWeaviateVectorStore.js +21 -0
- package/dist/services/batchingWeaviateVectorStore.js.map +1 -0
- package/dist/services/csvLoader.d.ts +3 -0
- package/dist/services/csvLoader.d.ts.map +1 -0
- package/dist/services/csvLoader.js +18 -0
- package/dist/services/csvLoader.js.map +1 -0
- package/dist/services/csvLoader.test.d.ts +2 -0
- package/dist/services/csvLoader.test.d.ts.map +1 -0
- package/dist/services/csvLoader.test.js +75 -0
- package/dist/services/csvLoader.test.js.map +1 -0
- package/dist/services/embeddings.d.ts +22 -0
- package/dist/services/embeddings.d.ts.map +1 -0
- package/dist/services/embeddings.js +314 -0
- package/dist/services/embeddings.js.map +1 -0
- package/dist/services/embeddings.test.d.ts +2 -0
- package/dist/services/embeddings.test.d.ts.map +1 -0
- package/dist/services/embeddings.test.js +115 -0
- package/dist/services/embeddings.test.js.map +1 -0
- package/dist/services/loggingOpenAIEmbedding.d.ts +2 -0
- package/dist/services/loggingOpenAIEmbedding.d.ts.map +1 -0
- package/dist/services/loggingOpenAIEmbedding.js +41 -0
- package/dist/services/loggingOpenAIEmbedding.js.map +1 -0
- package/dist/services/mockEmbedding.d.ts +6 -0
- package/dist/services/mockEmbedding.d.ts.map +1 -0
- package/dist/services/mockEmbedding.js +14 -0
- package/dist/services/mockEmbedding.js.map +1 -0
- package/dist/services/progressManager.d.ts +21 -0
- package/dist/services/progressManager.d.ts.map +1 -0
- package/dist/services/progressManager.js +76 -0
- package/dist/services/progressManager.js.map +1 -0
- package/dist/services/progressVectorStoreIndex.d.ts +21 -0
- package/dist/services/progressVectorStoreIndex.d.ts.map +1 -0
- package/dist/services/progressVectorStoreIndex.js +60 -0
- package/dist/services/progressVectorStoreIndex.js.map +1 -0
- package/dist/services/sentenceSplitter.d.ts +17 -0
- package/dist/services/sentenceSplitter.d.ts.map +1 -0
- package/dist/services/sentenceSplitter.js +207 -0
- package/dist/services/sentenceSplitter.js.map +1 -0
- package/dist/services/sentenceSplitter.test.d.ts +2 -0
- package/dist/services/sentenceSplitter.test.d.ts.map +1 -0
- package/dist/services/sentenceSplitter.test.js +68 -0
- package/dist/services/sentenceSplitter.test.js.map +1 -0
- package/dist/services/sploder.d.ts +13 -0
- package/dist/services/sploder.d.ts.map +1 -0
- package/dist/services/sploder.js +45 -0
- package/dist/services/sploder.js.map +1 -0
- package/dist/types/index.d.ts +77 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +2 -0
- package/dist/types/index.js.map +1 -0
- package/dist/utils.d.ts +3 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +7 -0
- package/dist/utils.js.map +1 -0
- package/package.json +43 -0
- package/src/Meaningfully.d.ts +57 -0
- package/src/Meaningfully.ts +228 -0
- package/src/MetadataManager.d.ts +27 -0
- package/src/MetadataManager.ts +145 -0
- package/src/api/embedding.d.ts +6 -0
- package/src/api/embedding.ts +122 -0
- package/src/index.ts +5 -0
- package/src/services/batchingWeaviateVectorStore.d.ts +5 -0
- package/src/services/batchingWeaviateVectorStore.ts +23 -0
- package/src/services/csvLoader.d.ts +2 -0
- package/src/services/csvLoader.ts +24 -0
- package/src/services/embeddings.d.ts +21 -0
- package/src/services/embeddings.ts +374 -0
- package/src/services/loggingOpenAIEmbedding.d.ts +0 -0
- package/src/services/loggingOpenAIEmbedding.ts +46 -0
- package/src/services/mockEmbedding.d.ts +5 -0
- package/src/services/mockEmbedding.ts +13 -0
- package/src/services/progressManager.d.ts +20 -0
- package/src/services/progressManager.ts +88 -0
- package/src/services/progressVectorStoreIndex.d.ts +20 -0
- package/src/services/progressVectorStoreIndex.ts +95 -0
- package/src/services/sentenceSplitter.d.ts +16 -0
- package/src/services/sentenceSplitter.ts +243 -0
- package/src/services/sploder.d.ts +12 -0
- package/src/services/sploder.ts +62 -0
- package/src/types/index.d.ts +71 -0
- package/src/types/index.ts +89 -0
- package/src/utils.d.ts +2 -0
- package/src/utils.ts +6 -0
- package/tests/MetadataManager.test.ts +120 -0
- package/tests/csvLoader.test.d.ts +1 -0
- package/tests/csvLoader.test.ts +88 -0
- package/tests/embedding.test.d.ts +1 -0
- package/tests/embedding.test.ts +425 -0
- package/tests/embeddings.test.d.ts +1 -0
- package/tests/embeddings.test.ts +144 -0
- package/tests/sentenceSplitter.test.d.ts +1 -0
- package/tests/sentenceSplitter.test.ts +81 -0
- package/tsconfig.json +31 -0
- package/tsconfig.tsbuildinfo +1 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mockEmbedding.d.ts","sourceRoot":"","sources":["../../src/services/mockEmbedding.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAE3C,qBAAa,aAAc,SAAQ,aAAa;;IAItC,gBAAgB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;CAK1D"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
//@ts-nocheck
|
|
2
|
+
import { BaseEmbedding } from "llamaindex";
|
|
3
|
+
/**
 * Embedding model stand-in that maps every input text to the same fixed
 * six-dimensional vector.
 */
export class MockEmbedding extends BaseEmbedding {
  constructor() {
    super();
  }

  /**
   * Returns the fixed mock vector; the input text is not inspected.
   *
   * @param {string} text - Text to "embed" (unused).
   * @returns {Promise<number[]>} Resolves to `[1, 0, 0, 0, 0, 0]`.
   */
  async getTextEmbedding(text) {
    // An async method already wraps its return value in a promise, so there
    // is no need to construct one by hand.
    return [1, 0, 0, 0, 0, 0];
  }
}
|
|
13
|
+
;
|
|
14
|
+
//# sourceMappingURL=mockEmbedding.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mockEmbedding.js","sourceRoot":"","sources":["../../src/services/mockEmbedding.ts"],"names":[],"mappings":"AAAA,aAAa;AACb,OAAO,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAE3C,MAAM,OAAO,aAAc,SAAQ,aAAa;IAC5C;QACI,KAAK,EAAE,CAAC;IACZ,CAAC;IACD,KAAK,CAAC,gBAAgB,CAAC,IAAY;QAC/B,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;YAC3B,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAChC,CAAC,CAAC,CAAC;IACP,CAAC;CACJ;AAAA,CAAC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* A simple manager to track progress of various operations
|
|
3
|
+
*/
|
|
4
|
+
export declare class ProgressManager {
|
|
5
|
+
/** Shared instance handed out by getInstance(). */
private static instance;
|
|
6
|
+
/** Per-operation progress records, keyed by operation id. */
private progressMap;
|
|
7
|
+
/** Id of the most recently started operation, or null once that operation is completed or cleared. */
private currentOperation;
|
|
8
|
+
private constructor();
|
|
9
|
+
/** Returns the shared instance, creating it on the first call. */
static getInstance(): ProgressManager;
|
|
10
|
+
/** Registers `operationId` with zero progress and makes it the current operation; the implementation defaults `total` to 100. */
startOperation(operationId: string, total?: number): void;
|
|
11
|
+
/** Stores a new progress value for a registered operation; ids that were never started are ignored. */
updateProgress(operationId: string, progress: number): void;
|
|
12
|
+
/** Sets a registered operation's progress to its total and, if it is the current operation, clears the current-operation marker. */
completeOperation(operationId: string): void;
|
|
13
|
+
/** Snapshot of the current operation; when there is none, the implementation returns progress 0, total 100, elapsedTimeMs 0 and a null estimate. */
getCurrentProgress(): {
|
|
14
|
+
progress: number;
|
|
15
|
+
total: number;
|
|
16
|
+
/** Milliseconds between the operation's start and this call. */
elapsedTimeMs: number;
|
|
17
|
+
/** Null unless more than 5% of `total` is done and more than 1000 ms have elapsed. */
estimatedTimeRemainingMs: number | null;
|
|
18
|
+
};
|
|
19
|
+
/** Deletes the operation's record and clears the current-operation marker if it refers to this id. */
clearOperation(operationId: string): void;
|
|
20
|
+
}
|
|
21
|
+
//# sourceMappingURL=progressManager.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"progressManager.d.ts","sourceRoot":"","sources":["../../src/services/progressManager.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,qBAAa,eAAe;IAC1B,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAkB;IACzC,OAAO,CAAC,WAAW,CAA0G;IAC7H,OAAO,CAAC,gBAAgB,CAAuB;IAG/C,OAAO;WAEO,WAAW,IAAI,eAAe;IAOrC,cAAc,CAAC,WAAW,EAAE,MAAM,EAAE,KAAK,GAAE,MAAY,GAAG,IAAI;IAM9D,cAAc,CAAC,WAAW,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,IAAI;IAY3D,iBAAiB,CAAC,WAAW,EAAE,MAAM,GAAG,IAAI;IAgB5C,kBAAkB,IAAI;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAC;QAAC,aAAa,EAAE,MAAM,CAAC;QAAC,wBAAwB,EAAE,MAAM,GAAG,IAAI,CAAA;KAAE;IA0BzH,cAAc,CAAC,WAAW,EAAE,MAAM,GAAG,IAAI;CAMjD"}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* A simple manager to track progress of various operations
|
|
3
|
+
*/
|
|
4
|
+
// Multiplier applied to the projected total duration when estimating the
// time remaining. A seat-of-the-pants guess.
const FUDGE_FACTOR = 1.2;

/**
 * Singleton that records progress for named operations and reports on the
 * most recently started one.
 */
export class ProgressManager {
  static instance;
  progressMap = new Map();
  currentOperation = null;

  constructor() {}

  /**
   * Returns the shared instance, creating it on the first call.
   *
   * @returns {ProgressManager}
   */
  static getInstance() {
    if (!ProgressManager.instance) {
      ProgressManager.instance = new ProgressManager();
    }
    return ProgressManager.instance;
  }

  /**
   * Registers an operation with zero progress and makes it the current one.
   *
   * @param {string} operationId - Key under which the operation is tracked.
   * @param {number} [total=100] - Progress value that counts as complete.
   */
  startOperation(operationId, total = 100) {
    const now = Date.now();
    this.progressMap.set(operationId, {
      progress: 0,
      total,
      startTime: now,
      lastUpdateTime: now,
    });
    this.currentOperation = operationId;
  }

  /**
   * Stores a new progress value for a registered operation. Ids that were
   * never started are ignored.
   *
   * @param {string} operationId
   * @param {number} progress
   */
  updateProgress(operationId, progress) {
    const record = this.progressMap.get(operationId);
    if (!record) {
      return;
    }
    this.progressMap.set(operationId, {
      ...record,
      progress,
      lastUpdateTime: Date.now(),
    });
  }

  /**
   * Marks a registered operation as finished (progress === total) and, if it
   * is the current operation, clears the current-operation marker. The record
   * itself stays in the map until `clearOperation` is called.
   *
   * @param {string} operationId
   */
  completeOperation(operationId) {
    const record = this.progressMap.get(operationId);
    if (record) {
      this.progressMap.set(operationId, {
        ...record,
        progress: record.total,
        lastUpdateTime: Date.now(),
      });
    }
    if (this.currentOperation === operationId) {
      this.currentOperation = null;
    }
  }

  /**
   * Reports on the current operation.
   *
   * @returns {{progress: number, total: number, elapsedTimeMs: number, estimatedTimeRemainingMs: (number|null)}}
   *   Snapshot of the current operation, or an idle placeholder
   *   (progress 0, total 100, elapsed 0, estimate null) when none is active.
   */
  getCurrentProgress() {
    const idle = { progress: 0, total: 100, elapsedTimeMs: 0, estimatedTimeRemainingMs: null };

    // Explicit null check rather than truthiness: an operation registered
    // under the empty-string id is still a real operation.
    if (this.currentOperation == null) {
      return idle;
    }
    const record = this.progressMap.get(this.currentOperation);
    if (!record) {
      return idle;
    }

    const elapsedTimeMs = Date.now() - record.startTime;
    let estimatedTimeRemainingMs = null;

    // Only estimate once there is meaningful progress (more than 5% done)
    // and more than a second has elapsed.
    if (record.progress > 0.05 * record.total && elapsedTimeMs > 1000) {
      const fractionDone = record.progress / record.total;
      const estimatedTotalTime = (elapsedTimeMs / fractionDone) * FUDGE_FACTOR;
      estimatedTimeRemainingMs = Math.max(0, estimatedTotalTime - elapsedTimeMs);
    }

    return {
      progress: record.progress,
      total: record.total,
      elapsedTimeMs,
      estimatedTimeRemainingMs,
    };
  }

  /**
   * Deletes an operation's record and clears the current-operation marker if
   * it refers to this id.
   *
   * @param {string} operationId
   */
  clearOperation(operationId) {
    this.progressMap.delete(operationId);
    if (this.currentOperation === operationId) {
      this.currentOperation = null;
    }
  }
}
|
|
76
|
+
//# sourceMappingURL=progressManager.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"progressManager.js","sourceRoot":"","sources":["../../src/services/progressManager.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,MAAM,YAAY,GAAG,GAAG,CAAC,CAAC,yBAAyB;AAEnD,MAAM,OAAO,eAAe;IAClB,MAAM,CAAC,QAAQ,CAAkB;IACjC,WAAW,GAAgG,IAAI,GAAG,EAAE,CAAC;IACrH,gBAAgB,GAAkB,IAAI,CAAC;IAG/C,gBAAuB,CAAC;IAEjB,MAAM,CAAC,WAAW;QACvB,IAAI,CAAC,eAAe,CAAC,QAAQ,EAAE,CAAC;YAC9B,eAAe,CAAC,QAAQ,GAAG,IAAI,eAAe,EAAE,CAAC;QACnD,CAAC;QACD,OAAO,eAAe,CAAC,QAAQ,CAAC;IAClC,CAAC;IAEM,cAAc,CAAC,WAAmB,EAAE,QAAgB,GAAG;QAC5D,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACvB,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,WAAW,EAAE,EAAE,QAAQ,EAAE,CAAC,EAAE,KAAK,EAAE,SAAS,EAAE,GAAG,EAAE,cAAc,EAAE,GAAG,EAAE,CAAC,CAAC;QAC/F,IAAI,CAAC,gBAAgB,GAAG,WAAW,CAAC;IACtC,CAAC;IAEM,cAAc,CAAC,WAAmB,EAAE,QAAgB;QACzD,MAAM,eAAe,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC;QAC1D,IAAI,eAAe,EAAE,CAAC;YACpB,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,WAAW,EAAE;gBAChC,QAAQ;gBACR,KAAK,EAAE,eAAe,CAAC,KAAK;gBAC5B,SAAS,EAAE,eAAe,CAAC,SAAS;gBACpC,cAAc,EAAE,IAAI,CAAC,GAAG,EAAE;aAC3B,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAEM,iBAAiB,CAAC,WAAmB;QAC1C,MAAM,eAAe,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC;QAC1D,IAAI,eAAe,EAAE,CAAC;YACpB,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,WAAW,EAAE;gBAChC,QAAQ,EAAE,eAAe,CAAC,KAAK;gBAC/B,KAAK,EAAE,eAAe,CAAC,KAAK;gBAC5B,SAAS,EAAE,eAAe,CAAC,SAAS;gBACpC,cAAc,EAAE,IAAI,CAAC,GAAG,EAAE;aAC3B,CAAC,CAAC;QACL,CAAC;QAED,IAAI,IAAI,CAAC,gBAAgB,KAAK,WAAW,EAAE,CAAC;YAC1C,IAAI,CAAC,gBAAgB,GAAG,IAAI,CAAC;QAC/B,CAAC;IACH,CAAC;IAEM,kBAAkB;QACvB,IAAI,IAAI,CAAC,gBAAgB,EAAE,CAAC;YAC1B,MAAM,aAAa,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;YAClE,IAAI,aAAa,EAAE,CAAC;gBAClB,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;gBACvB,MAAM,aAAa,GAAG,GAAG,GAAG,aAAa,CAAC,SAAS,CAAC;gBACpD,IAAI,wBAAwB,GAAkB,IAAI,CAAC;gBAEnD,4FAA4F;gBAC5F,IAAI,aAAa,CAAC,QAAQ,GAAG,IAAI,GAAG,aAAa,CAAC,KAAK,IAAI,aAAa,GAAG,IAAI,EAAE,CAAC;oBAChF,MAAM,kBAAkB,GAAG,aAAa,CAAC,QAAQ,GAAG,aAAa,CAAC,KAAK,CAAC;oBACxE,MAAM,kBAAkB,GAAG,CAAC,aAAa,GAAG,kBAAkB,CAAC,GAAG,YAAY,CA
AC;oBAC/E,wBAAwB,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,kBAAkB,GAAG,aAAa,CAAC,CAAC;gBAC7E,CAAC;gBAED,OAAO;oBACL,QAAQ,EAAE,aAAa,CAAC,QAAQ;oBAChC,KAAK,EAAE,aAAa,CAAC,KAAK;oBAC1B,aAAa;oBACb,wBAAwB;iBACzB,CAAC;YACJ,CAAC;QACH,CAAC;QACD,OAAO,EAAE,QAAQ,EAAE,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,aAAa,EAAE,CAAC,EAAE,wBAAwB,EAAE,IAAI,EAAE,CAAC;IACvF,CAAC;IAEM,cAAc,CAAC,WAAmB;QACvC,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC;QACrC,IAAI,IAAI,CAAC,gBAAgB,KAAK,WAAW,EAAE,CAAC;YAC1C,IAAI,CAAC,gBAAgB,GAAG,IAAI,CAAC;QAC/B,CAAC;IACH,CAAC;CACF"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { VectorStoreIndex, type VectorIndexOptions as BaseVectorIndexOptions } from "llamaindex";
|
|
2
|
+
import { BaseNode } from "llamaindex";
|
|
3
|
+
/** Base vector index options plus an optional progress callback. */
export interface VectorIndexOptions extends BaseVectorIndexOptions {
|
|
4
|
+
/** Optional callback taking (progress, total); the implementation passes it through to the embedding call. */
progressCallback?: (progress: number, total: number) => void;
|
|
5
|
+
}
|
|
6
|
+
/** VectorStoreIndex subclass whose node-insertion methods accept a progressCallback option. */
export declare class ProgressVectorStoreIndex extends VectorStoreIndex {
|
|
7
|
+
/**
 * Creates an index from `options.storageContext` (a default context is created when it is absent)
 * and, when `options.nodes` is given, inserts those nodes.
 * The implementation throws when there are neither nodes nor a stored index struct.
 */
static init(options: VectorIndexOptions): Promise<VectorStoreIndex>;
|
|
8
|
+
/** Delegates to insertNodes with the same arguments. */
buildIndexFromNodes(nodes: BaseNode[], options?: {
|
|
9
|
+
logProgress?: boolean;
|
|
10
|
+
progressCallback?: (progress: number, total: number) => void;
|
|
11
|
+
}): Promise<void>;
|
|
12
|
+
/** Does nothing for a missing or empty node list; otherwise embeds the nodes, adds them to the vector stores, and saves the index struct to the index store. */
insertNodes(nodes: BaseNode[], options?: {
|
|
13
|
+
logProgress?: boolean;
|
|
14
|
+
progressCallback?: (progress: number, total: number) => void;
|
|
15
|
+
}): Promise<void>;
|
|
16
|
+
/** Calls the embed model once per node type, forwarding logProgress and progressCallback, and resolves to the `nodes` array it was given. */
getNodeEmbeddingResults(nodes: BaseNode[], options?: {
|
|
17
|
+
logProgress?: boolean;
|
|
18
|
+
progressCallback?: (progress: number, total: number) => void;
|
|
19
|
+
}): Promise<BaseNode[]>;
|
|
20
|
+
}
|
|
21
|
+
//# sourceMappingURL=progressVectorStoreIndex.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"progressVectorStoreIndex.d.ts","sourceRoot":"","sources":["../../src/services/progressVectorStoreIndex.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,KAAK,kBAAkB,IAAI,sBAAsB,EAAyC,MAAM,YAAY,CAAC;AACxI,OAAO,EAAE,QAAQ,EAAkC,MAAM,YAAY,CAAC;AAItE,MAAM,WAAW,kBAAmB,SAAQ,sBAAsB;IAChE,gBAAgB,CAAC,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;CAC9D;AAID,qBAAa,wBAAyB,SAAQ,gBAAgB;WACxC,IAAI,CACtB,OAAO,EAAE,kBAAkB,GAC1B,OAAO,CAAC,gBAAgB,CAAC;IAuCtB,mBAAmB,CACvB,KAAK,EAAE,QAAQ,EAAE,EACjB,OAAO,CAAC,EAAE;QAAE,WAAW,CAAC,EAAE,OAAO,CAAC;QAAC,gBAAgB,CAAC,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAA;KAAE;IAK7F,WAAW,CACf,KAAK,EAAE,QAAQ,EAAE,EACjB,OAAO,CAAC,EAAE;QAAE,WAAW,CAAC,EAAE,OAAO,CAAC;QAAC,gBAAgB,CAAC,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAA;KAAE;IAe7F,uBAAuB,CAC3B,KAAK,EAAE,QAAQ,EAAE,EACjB,OAAO,CAAC,EAAE;QAAE,WAAW,CAAC,EAAE,OAAO,CAAC;QAAC,gBAAgB,CAAC,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAA;KAAE,GAChG,OAAO,CAAC,QAAQ,EAAE,CAAC;CAcvB"}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import { VectorStoreIndex, storageContextFromDefaults, IndexDict } from "llamaindex";
|
|
2
|
+
import { splitNodesByType } from "llamaindex";
|
|
3
|
+
import { addNodesToVectorStores } from "llamaindex";
|
|
4
|
+
// Subclass VectorStoreIndex to handle progressCallback
|
|
5
|
+
// @ts-ignore
|
|
6
|
+
/**
 * VectorStoreIndex subclass whose insertion path forwards an optional
 * `progressCallback` (together with `logProgress`) to the embedding call.
 */
export class ProgressVectorStoreIndex extends VectorStoreIndex {
  /**
   * Creates an index and, if `options.nodes` is supplied, inserts them.
   *
   * @param options - Index options; `storageContext`, `nodes`, `vectorStores`,
   *   `logProgress` and `progressCallback` are read here.
   * @returns The newly constructed index.
   * @throws {Error} When there are neither nodes nor a stored index struct.
   */
  static async init(options) {
    const storageContext =
      options.storageContext ?? (await storageContextFromDefaults({}));
    const { indexStore, docStore } = storageContext;

    // @ts-ignore
    const storedStruct = await VectorStoreIndex.setupIndexStructFromStorage(indexStore, options);
    if (!options.nodes && !storedStruct) {
      throw new Error("Cannot initialize VectorStoreIndex without nodes or indexStruct");
    }
    const indexStruct = storedStruct ?? new IndexDict();

    // @ts-ignore
    const index = new this({
      storageContext,
      docStore,
      indexStruct,
      indexStore,
      vectorStores: options.vectorStores,
    });

    if (!options.nodes) {
      return index;
    }
    // Nodes were passed in, so the index has to be populated with them.
    await index.buildIndexFromNodes(options.nodes, {
      logProgress: options.logProgress,
      progressCallback: options.progressCallback,
    });
    return index;
  }

  /**
   * Populates the index from `nodes` by delegating to `insertNodes`.
   */
  async buildIndexFromNodes(nodes, options) {
    await this.insertNodes(nodes, options);
  }

  /**
   * Embeds `nodes`, adds them to the vector stores, and saves the index
   * struct to the index store. A missing or empty list is a no-op.
   */
  async insertNodes(nodes, options) {
    if (!nodes) {
      return;
    }
    if (nodes.length === 0) {
      return;
    }
    const embeddedNodes = await this.getNodeEmbeddingResults(nodes, options);
    await addNodesToVectorStores(
      embeddedNodes,
      this.vectorStores,
      this.insertNodesToStore.bind(this),
    );
    await this.indexStore.addIndexStruct(this.indexStruct);
  }

  /**
   * Calls the embed model once per node type, forwarding `logProgress` and
   * `progressCallback`, then returns the `nodes` array that was passed in.
   * Presumably the embed model attaches embeddings to the nodes in place --
   * confirm against llamaindex.
   */
  async getNodeEmbeddingResults(nodes, options) {
    const nodesByType = splitNodesByType(nodes);
    for (const type in nodesByType) {
      const typedNodes = nodesByType[type];
      // A store-specific embed model wins over the index-wide one.
      const embedModel = this.vectorStores[type]?.embedModel ?? this.embedModel;
      if (!embedModel || !typedNodes) {
        continue;
      }
      await embedModel(typedNodes, {
        logProgress: options?.logProgress,
        progressCallback: options?.progressCallback,
      });
    }
    return nodes;
  }
}
|
|
60
|
+
//# sourceMappingURL=progressVectorStoreIndex.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"progressVectorStoreIndex.js","sourceRoot":"","sources":["../../src/services/progressVectorStoreIndex.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAqD,0BAA0B,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AACxI,OAAO,EAA0B,gBAAgB,EAAE,MAAM,YAAY,CAAC;AACtE,OAAO,EAAE,sBAAsB,EAAE,MAAM,YAAY,CAAC;AAOpD,uDAAuD;AACvD,aAAa;AACb,MAAM,OAAO,wBAAyB,SAAQ,gBAAgB;IACrD,MAAM,CAAC,KAAK,CAAC,IAAI,CACtB,OAA2B;QAE3B,MAAM,cAAc,GAClB,OAAO,CAAC,cAAc,IAAI,CAAC,MAAM,0BAA0B,CAAC,EAAE,CAAC,CAAC,CAAC;QACnE,MAAM,UAAU,GAAG,cAAc,CAAC,UAAU,CAAC;QAC7C,MAAM,QAAQ,GAAG,cAAc,CAAC,QAAQ,CAAC;QAEzC,cAAc;QACd,IAAI,WAAW,GAAG,MAAM,gBAAgB,CAAC,2BAA2B,CAClE,UAAU,EACV,OAAO,CACR,CAAC;QAEF,IAAI,CAAC,OAAO,CAAC,KAAK,IAAI,CAAC,WAAW,EAAE,CAAC;YACnC,MAAM,IAAI,KAAK,CACb,iEAAiE,CAClE,CAAC;QACJ,CAAC;QAED,WAAW,GAAG,WAAW,IAAI,IAAI,SAAS,EAAE,CAAC;QAE7C,aAAa;QACb,MAAM,KAAK,GAAG,IAAI,IAAI,CAAC;YACrB,cAAc;YACd,QAAQ;YACR,WAAW;YACX,UAAU;YACV,YAAY,EAAE,OAAO,CAAC,YAAY;SACnC,CAAC,CAAC;QAEH,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;YAClB,2DAA2D;YAC3D,MAAM,KAAK,CAAC,mBAAmB,CAAC,OAAO,CAAC,KAAK,EAAE;gBAC7C,WAAW,EAAE,OAAO,CAAC,WAAW;gBAChC,gBAAgB,EAAE,OAAO,CAAC,gBAAgB;aAC3C,CAAC,CAAC;QACL,CAAC;QACD,OAAO,KAAK,CAAC;IACf,CAAC;IAED,KAAK,CAAC,mBAAmB,CACvB,KAAiB,EACjB,OAAiG;QAEjG,MAAM,IAAI,CAAC,WAAW,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IACzC,CAAC;IAED,KAAK,CAAC,WAAW,CACf,KAAiB,EACjB,OAAiG;QAEjG,IAAI,CAAC,KAAK,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACjC,OAAO;QACT,CAAC;QAED,KAAK,GAAG,MAAM,IAAI,CAAC,uBAAuB,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;QAC3D,MAAM,sBAAsB,CAC1B,KAAK,EACL,IAAI,CAAC,YAAY,EACjB,IAAI,CAAC,kBAAkB,CAAC,IAAI,CAAC,IAAI,CAAC,CACnC,CAAC;QACF,MAAM,IAAI,CAAC,UAAU,CAAC,cAAc,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IACzD,CAAC;IAED,KAAK,CAAC,uBAAuB,CAC3B,KAAiB,EACjB,OAAiG;QAEjG,MAAM,OAAO,GAAG,gBAAgB,CAAC,KAAK,CAAC,CAAC;QACxC,KAAK,MAAM,IAAI,IAAI,OAAO,EAAE,CAAC;YAC3B,MAAM,KAAK,GAAG,OAAO,CAAC,IAAoB,CAAC,CAAC;YAC5C,MAAM,UAAU,GAAG,IAAI,CAAC,YAAY,CAAC,IAAoB,CAAC,EAAE,UAAU,IAAI,IAAI,CAAC,UAAU,CAAC;YAC1F,IAAI,UAAU,IAAI,KAAK,EAAE,CAAC;gBACxB,MAAM,UAAU,CAAC,KAAK,EAAE;oBACtB,
WAAW,EAAE,OAAO,EAAE,WAAW;oBACjC,gBAAgB,EAAE,OAAO,EAAE,gBAAgB,EAAE,sCAAsC;iBACpF,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QACD,OAAO,KAAK,CAAC;IACf,CAAC;CACF"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { SentenceSplitter } from "llamaindex";
|
|
2
|
+
import natural from "natural";
|
|
3
|
+
/** Function that splits a text into an array of pieces. */
type TextSplitterFn = (text: string) => string[];
|
|
4
|
+
/** SentenceSplitter subclass that tokenizes sentences with a natural.SentenceTokenizer built from a configurable abbreviation list. */
export declare class CustomSentenceSplitter extends SentenceSplitter {
|
|
5
|
+
#private;
|
|
6
|
+
/** Returns a split function that tokenizes text with `tokenizer` and falls back to the whole text as a single piece if tokenizing throws. */
chunkingTokenizerFn: () => TextSplitterFn;
|
|
7
|
+
/** Abbreviation list given to the sentence tokenizer; the implementation falls back to a built-in list when none is supplied. */
abbreviations: string[];
|
|
8
|
+
/** Sentence tokenizer; the implementation constructs it with sentence trimming turned off. */
tokenizer: natural.SentenceTokenizer;
|
|
9
|
+
/** All parameters are optional; they are forwarded to the SentenceSplitter constructor and `abbreviations` is also read here. */
constructor(params?: {
|
|
10
|
+
chunkSize?: number;
|
|
11
|
+
chunkOverlap?: number;
|
|
12
|
+
abbreviations?: string[];
|
|
13
|
+
});
|
|
14
|
+
/** Splits `text` into pieces no larger than `chunkSize` tokens and merges them into trimmed, non-empty chunks; an empty string is returned as `[""]`. */
_splitText(text: string, chunkSize: number): string[];
|
|
15
|
+
}
|
|
16
|
+
export {};
|
|
17
|
+
//# sourceMappingURL=sentenceSplitter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sentenceSplitter.d.ts","sourceRoot":"","sources":["../../src/services/sentenceSplitter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAmD,MAAM,YAAY,CAAC;AAC/F,OAAO,OAAO,MAAM,SAAS,CAAA;AA2C7B,KAAK,cAAc,GAAG,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,EAAE,CAAC;AAejD,qBAAa,sBAAuB,SAAQ,gBAAgB;;IAG1D,mBAAmB,QAAO,cAAc,CAQtC;IAGF,aAAa,EAAE,MAAM,EAAE,CAAC;IACxB,SAAS,EAAE,OAAO,CAAC,iBAAiB,CAAC;gBAEzB,MAAM,GAAE;QAAE,SAAS,CAAC,EAAE,MAAM,CAAC;QAAC,YAAY,CAAC,EAAE,MAAM,CAAC;QAAC,aAAa,CAAC,EAAE,MAAM,EAAE,CAAA;KAAO;IAoChG,UAAU,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,MAAM,EAAE;CAkItD"}
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
import { SentenceSplitter, splitBySep, splitByRegex, splitByChar, Settings } from "llamaindex";
|
|
2
|
+
import natural from "natural";
|
|
3
|
+
/*
|
|
4
|
+
LlamaIndex includes the length of the metadata as part of the size of the chunk when splitting by sentences.
|
|
5
|
+
This produces very unintuitive behavior: e.g. when the user specifies a chunk-size of 50 and nodes have metadata of length 40,
|
|
6
|
+
the resulting split sentences are about 10 tokens long -- as opposed to the specified 50.
|
|
7
|
+
|
|
8
|
+
This modified SentenceSplitter adds a `include_metadata_in_chunksize` flag that disables the above behavior,
|
|
9
|
+
ignoring metadata when calculating chunksize (i.e. only including the size of the text data when calculating chunksize.)
|
|
10
|
+
|
|
11
|
+
Additionally, splitTextMetadataAware does some bizarre stuff where it will split sentences at abbreviations -- even if the
|
|
12
|
+
underlying tokenizer knows about the abbreviations, I think due to some weird sub-sentence splitting. It also sews sentence
|
|
13
|
+
chunks back together in a way that eliminates spaces, e.g. `JPMorgan Chase & Co.elected Mark Weinberger` and `Mr.Weinberger was Global Chairman`.
|
|
14
|
+
|
|
15
|
+
I also tried making SentenceSplitter just split on sentences (with Natural) but this misbehaved by splitting TOO much. I do need short sentences grouped
|
|
16
|
+
together (whether they are true short sentences, or false-positives like "USA v. one 12 ft. I.B.M. mainframe").
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
*/
|
|
20
|
+
// TODO: make this configurable
|
|
21
|
+
// When false, the metadata's token count is NOT subtracted from the chunk
// size, so chunks are sized by their text alone.
const INCLUDE_METADATA_IN_CHUNKSIZE = false;

/**
 * Replacement for SentenceSplitter's metadata-aware split: works out the
 * effective chunk size (optionally reduced by the metadata's token count)
 * and splits `text` with it.
 *
 * @param {string} text - Text to split.
 * @param {string} metadata - Metadata string whose token count is measured.
 * @returns {string[]} Result of `this._splitText(text, effectiveChunkSize)`.
 * @throws {Error} When the effective chunk size is zero or negative.
 */
SentenceSplitter.prototype.splitTextMetadataAware = function (text, metadata) {
  const metadataLength = this.tokenSize(metadata);

  let effectiveChunkSize = this.chunkSize;
  if (INCLUDE_METADATA_IN_CHUNKSIZE) {
    effectiveChunkSize -= metadataLength;
  }

  if (effectiveChunkSize <= 0) {
    throw new Error(`Metadata length (${metadataLength}) is longer than chunk size (${this.chunkSize}). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.`);
  }
  if (effectiveChunkSize < 50) {
    console.log(`Metadata length (${metadataLength}) is close to chunk size (${this.chunkSize}). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.`);
  }

  return this._splitText(text, effectiveChunkSize);
};
|
|
33
|
+
// Fallback abbreviation list handed to natural.SentenceTokenizer when the
// caller supplies none. Each entry appears exactly once.
const default_abbreviations = [
  "dr.",
  "vs.",
  "mr.",
  "ms.",
  "mx.",
  "mrs.",
  "prof.",
  "inc.",
  "corp.",
  "co.",
  "llc.",
  "ltd.",
  "etc.",
  "i.e.",
  "A.S.A.P.",
];
|
|
38
|
+
// This varies from SentenceSplitter in two ways:
|
|
39
|
+
// 1. it uses abbreviations set here.
|
|
40
|
+
// 2. it uses a custom SentenceTokenizer with a second trimSentences argument that controls
|
|
41
|
+
// whether or not leading/trailing whitespace is preserved.
|
|
42
|
+
// We want to preserve it, so that when sentences are merged back again, we don't end up with
|
|
43
|
+
// sentences that are not separated by spaces.
|
|
44
|
+
// Because JavaScript is stupid, we have to copy over almost the whole SentenceSplitter just to make those few small changes.
|
|
45
|
+
/**
 * SentenceSplitter variant that tokenizes sentences with a
 * `natural.SentenceTokenizer` built from a configurable abbreviation list and
 * with sentence trimming turned off, so whitespace between sentences is still
 * there when they are joined back into chunks.
 *
 * The split/merge pipeline is re-implemented here with this class's own
 * private fields so that the custom tokenizer can be plugged into it.
 */
export class CustomSentenceSplitter extends SentenceSplitter {
  /**
   * Builds the sentence-level split function used by the pipeline.
   *
   * @returns {(text: string) => string[]} Function returning the tokenizer's
   *   output for `text`.
   */
  chunkingTokenizerFn = () => {
    return (text) => {
      try {
        return this.tokenizer.tokenize(text);
      } catch {
        // Best-effort: if the tokenizer throws, treat the text as one piece.
        return [text];
      }
    };
  };

  // Split functions tried first; their output is flagged as sentences.
  #splitFns = new Set();
  // Fallback split functions; their output is flagged as sub-sentence pieces.
  #subSentenceSplitFns = new Set();
  abbreviations;
  tokenizer;

  /**
   * @param {{chunkSize?: number, chunkOverlap?: number, abbreviations?: string[]}} [params]
   *   Forwarded to SentenceSplitter; `abbreviations` falls back to the
   *   module's default list.
   */
  constructor(params = {}) {
    super(params);
    this.abbreviations = params.abbreviations || default_abbreviations;

    // The second constructor argument was missing from natural's typings when
    // this was written; remove the ts-ignore once the typings include it.
    // @ts-ignore
    this.tokenizer = new natural.SentenceTokenizer(this.abbreviations, false); // false: don't trim sentences

    this.#splitFns.add(splitBySep(this.paragraphSeparator));
    this.#splitFns.add(this.chunkingTokenizerFn()); // the custom sentence tokenizer

    this.#subSentenceSplitFns.add(splitByRegex(this.secondaryChunkingRegex));
    this.#subSentenceSplitFns.add(splitBySep(this.separator));
    this.#subSentenceSplitFns.add(splitByChar());
  }

  /**
   * Splits `text` into chunks, dispatching "chunking-start" and
   * "chunking-end" events on the global callback manager.
   *
   * @param {string} text - Text to split; an empty string yields `[""]`.
   * @param {number} chunkSize - Token budget per split piece.
   * @returns {string[]} Trimmed, non-empty chunks.
   * @throws {Error} When a piece that cannot be split further exceeds `chunkSize`.
   */
  _splitText(text, chunkSize) {
    if (text === "") {
      return [text];
    }
    const callbackManager = Settings.callbackManager;
    callbackManager.dispatchEvent("chunking-start", { text: [text] });
    const splits = this.#split(text, chunkSize);
    const chunks = this.#merge(splits, chunkSize);
    callbackManager.dispatchEvent("chunking-end", { chunks });
    return chunks;
  }

  /**
   * Recursively breaks `text` into pieces, each tagged with its token size
   * and whether it came from a sentence-level split.
   */
  #split(text, chunkSize) {
    const tokenSize = this.tokenSize(text);
    if (tokenSize <= chunkSize) {
      return [{ text, isSentence: true, tokenSize }];
    }

    const [pieces, isSentence] = this.#getSplitsByFns(text);
    const textSplits = [];
    for (const piece of pieces) {
      const pieceSize = this.tokenSize(piece);
      if (pieceSize <= chunkSize || piece === text) {
        // A piece identical to the input cannot be broken down any further;
        // recursing on it would never terminate. Emit it as-is so #merge can
        // report the oversized piece.
        textSplits.push({ text: piece, isSentence, tokenSize: pieceSize });
      } else {
        textSplits.push(...this.#split(piece, chunkSize));
      }
    }
    return textSplits;
  }

  /**
   * Applies the first split function that yields more than one piece.
   *
   * @returns {[string[], boolean]} The pieces, and whether they came from a
   *   sentence-level function; `[[text], true]` when nothing splits the text.
   */
  #getSplitsByFns(text) {
    for (const splitFn of this.#splitFns) {
      const splits = splitFn(text);
      if (splits.length > 1) {
        return [splits, true];
      }
    }
    for (const splitFn of this.#subSentenceSplitFns) {
      const splits = splitFn(text);
      if (splits.length > 1) {
        return [splits, false];
      }
    }
    return [[text], true];
  }

  /**
   * Joins consecutive pieces into chunks bounded by `chunkSize`, carrying up
   * to `this.chunkOverlap` tokens of trailing pieces into the next chunk.
   * `splits` is read by index and left unmodified.
   */
  #merge(splits, chunkSize) {
    const chunks = [];
    let currentChunk = [];
    let lastChunk = [];
    let currentChunkLength = 0;
    let newChunk = true;

    const closeChunk = () => {
      chunks.push(currentChunk.map(([text]) => text).join(""));
      lastChunk = currentChunk;
      currentChunk = [];
      currentChunkLength = 0;
      newChunk = true;

      // Seed the next chunk with trailing pieces of the one just closed, as
      // long as they fit within the overlap budget.
      let lastIndex = lastChunk.length - 1;
      while (
        lastIndex >= 0 &&
        currentChunkLength + lastChunk[lastIndex][1] <= this.chunkOverlap
      ) {
        const [text, length] = lastChunk[lastIndex];
        currentChunkLength += length;
        currentChunk.unshift([text, length]);
        lastIndex -= 1;
      }
    };

    let cursor = 0;
    while (cursor < splits.length) {
      const curSplit = splits[cursor];
      if (curSplit.tokenSize > chunkSize) {
        throw new Error("Single token exceeded chunk size");
      }
      const wouldOverflow = currentChunkLength + curSplit.tokenSize > chunkSize;
      if (wouldOverflow && !newChunk) {
        // The piece is retried against the fresh chunk on the next pass.
        closeChunk();
      } else if (curSplit.isSentence || !wouldOverflow || newChunk) {
        currentChunkLength += curSplit.tokenSize;
        currentChunk.push([curSplit.text, curSplit.tokenSize]);
        cursor += 1;
        newChunk = false;
      } else {
        closeChunk();
      }
    }

    // Flush whatever is left in the chunk being built.
    if (!newChunk) {
      chunks.push(currentChunk.map(([text]) => text).join(""));
    }
    return this.#postprocessChunks(chunks);
  }

  /** Trims every chunk and drops the ones that end up empty. */
  #postprocessChunks(chunks) {
    const newChunks = [];
    for (const chunk of chunks) {
      const trimmedChunk = chunk.trim();
      if (trimmedChunk !== "") {
        newChunks.push(trimmedChunk);
      }
    }
    return newChunks;
  }
}
|
|
207
|
+
//# sourceMappingURL=sentenceSplitter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sentenceSplitter.js","sourceRoot":"","sources":["../../src/services/sentenceSplitter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,UAAU,EAAE,YAAY,EAAE,WAAW,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAC/F,OAAO,OAAO,MAAM,SAAS,CAAA;AAE7B;;;;;;;;;;;;;;;;EAgBE;AACF,+BAA+B;AAC/B,MAAM,6BAA6B,GAAG,KAAK,CAAC;AAC5C,gBAAgB,CAAC,SAAS,CAAC,sBAAsB,GAAG,UAAS,IAAY,EAAE,QAAgB;IACzF,MAAM,cAAc,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;IAChD,MAAM,kBAAkB,GAAG,6BAA6B,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,GAAG,cAAc,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;IAC5G,IAAI,kBAAkB,IAAI,CAAC,EAAE,CAAC;QAC5B,MAAM,IAAI,KAAK,CACb,oBAAoB,cAAc,gCAAgC,IAAI,CAAC,SAAS,8FAA8F,CAC/K,CAAC;IACJ,CAAC;SAAM,IAAI,kBAAkB,GAAG,EAAE,EAAE,CAAC;QACnC,OAAO,CAAC,GAAG,CACT,oBAAoB,cAAc,6BAA6B,IAAI,CAAC,SAAS,wIAAwI,CACtN,CAAC;IACJ,CAAC;IACD,OAAO,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,kBAAkB,CAAC,CAAC;AACnD,CAAC,CAAA;AAED,MAAM,qBAAqB,GAAE,CAAC,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM;IACtI,MAAM;IACN,KAAK;IACL,UAAU;CACX,CAAC;AAWF,iDAAiD;AACjD,qCAAqC;AACrC,2FAA2F;AAC3F,8DAA8D;AAC9D,iGAAiG;AACjG,iDAAiD;AACjD,6HAA6H;AAC7H,MAAM,OAAO,sBAAuB,SAAQ,gBAAgB;IAE1D,wBAAwB;IACxB,mBAAmB,GAAG,GAAmB,EAAE;QACzC,OAAO,CAAC,IAAY,EAAE,EAAE;YACtB,IAAI,CAAC;gBACH,OAAO,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;YACvC,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,CAAC,IAAI,CAAC,CAAC;YAChB,CAAC;QACH,CAAC,CAAC;IACJ,CAAC,CAAC;IACF,SAAS,GAAwB,IAAI,GAAG,EAAE,CAAC;IAC3C,oBAAoB,GAAwB,IAAI,GAAG,EAAE,CAAC;IACtD,aAAa,CAAW;IACxB,SAAS,CAA4B;IAErC,YAAY,SAAkF,EAAE;QAC9F,KAAK,CAAC,MAAM,CAAC,CAAC;QACd,6CAA6C;QAC7C,IAAI,CAAC,aAAa,GAAG,MAAM,CAAC,aAAa,IAAI,qBAAqB,CAAC;QAEnE,sJAAsJ;QACtJ,iFAAiF;QACjF,aAAa;QACb,IAAI,CAAC,SAAS,GAAG,IAAI,OAAO,CAAC,iBAAiB,CAAC,IAAI,CAAC,aAAa,EAAE,KAAK,CAAC,CAAC,CAAC,gCAAgC;QAE3G,8BAA8B;QAC9B,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,UAAU,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC,CAAC;QAExD,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,IAAI,CAAC,mBAAmB,EAAE,CAAC,CAAC,CAAC,2CAA2C;QAE3F,8BAA8B;QAC9B,IAAI,CA
AC,oBAAoB,CAAC,GAAG,CAAC,YAAY,CAAC,IAAI,CAAC,sBAAsB,CAAC,CAAC,CAAC;QACzE,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC;QAC1D,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,CAAC;QAE7C,wEAAwE;QACxE,yCAAyC;QACzC,qBAAqB;QACrB,wCAAwC;QACxC,iDAAiD;QACjD,qCAAqC;QACrC,KAAK;QACL,0CAA0C;QAC1C,gFAAgF;QAChF,qCAAqC;QACrC,IAAI;IACN,CAAC;IAGD,0CAA0C;IAE1C,UAAU,CAAC,IAAY,EAAE,SAAiB;QACxC,IAAI,IAAI,KAAK,EAAE;YAAE,OAAO,CAAC,IAAI,CAAC,CAAC;QAE/B,MAAM,eAAe,GAAG,QAAQ,CAAC,eAAe,CAAC;QAEjD,eAAe,CAAC,aAAa,CAAC,gBAAgB,EAAE;YAC9C,IAAI,EAAE,CAAC,IAAI,CAAC;SACb,CAAC,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;QAC5C,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;QAE9C,eAAe,CAAC,aAAa,CAAC,cAAc,EAAE;YAC5C,MAAM;SACP,CAAC,CAAC;QACH,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,MAAM,CAAC,IAAY,EAAE,SAAiB;QACpC,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;QACvC,IAAI,SAAS,IAAI,SAAS,EAAE,CAAC;YAC3B,OAAO;gBACL;oBACE,IAAI;oBACJ,UAAU,EAAE,IAAI;oBAChB,SAAS;iBACV;aACF,CAAC;QACJ,CAAC;QACD,MAAM,CAAC,eAAe,EAAE,UAAU,CAAC,GAAG,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC;QACjE,MAAM,UAAU,GAAa,EAAE,CAAC;QAEhC,KAAK,MAAM,SAAS,IAAI,eAAe,EAAE,CAAC;YACxC,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC;YAC5C,IAAI,SAAS,IAAI,SAAS,EAAE,CAAC;gBAC3B,UAAU,CAAC,IAAI,CAAC;oBACd,IAAI,EAAE,SAAS;oBACf,UAAU;oBACV,SAAS;iBACV,CAAC,CAAC;YACL,CAAC;iBAAM,CAAC;gBACN,MAAM,mBAAmB,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,SAAS,CAAC,CAAC;gBAC9D,UAAU,CAAC,IAAI,CAAC,GAAG,mBAAmB,CAAC,CAAC;YAC1C,CAAC;QACH,CAAC;QACD,OAAO,UAAU,CAAC;IACpB,CAAC;IAED,eAAe,CAAC,IAAY;QAC1B,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;YACrC,MAAM,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;YAC7B,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACtB,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;QACD,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,oBAAoB,EAAE,CAAC;YAChD,MAAM,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;YAC7B,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACtB,OAAO,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;YACzB,CAAC;QACH,CAAC;QACD,OAAO,CAAC,CAAC,IAAI,CAAC,EAAE,IAAI,CA
AC,CAAC;IACxB,CAAC;IAED,MAAM,CAAC,MAAgB,EAAE,SAAiB;QACxC,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,YAAY,GAAuB,EAAE,CAAC;QAC1C,IAAI,SAAS,GAAuB,EAAE,CAAC;QACvC,IAAI,kBAAkB,GAAG,CAAC,CAAC;QAC3B,IAAI,QAAQ,GAAG,IAAI,CAAC;QAEpB,MAAM,UAAU,GAAG,GAAS,EAAE;YAC5B,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;YACzD,SAAS,GAAG,YAAY,CAAC;YACzB,YAAY,GAAG,EAAE,CAAC;YAClB,kBAAkB,GAAG,CAAC,CAAC;YACvB,QAAQ,GAAG,IAAI,CAAC;YAEhB,IAAI,SAAS,GAAG,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC;YACrC,OACE,SAAS,IAAI,CAAC;gBACd,kBAAkB,GAAG,SAAS,CAAC,SAAS,CAAE,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,YAAY,EAClE,CAAC;gBACD,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,GAAG,SAAS,CAAC,SAAS,CAAE,CAAC;gBAC7C,kBAAkB,IAAI,MAAM,CAAC;gBAC7B,YAAY,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC,CAAC;gBACrC,SAAS,IAAI,CAAC,CAAC;YACjB,CAAC;QACH,CAAC,CAAC;QAEF,OAAO,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzB,MAAM,QAAQ,GAAG,MAAM,CAAC,CAAC,CAAE,CAAC;YAC5B,IAAI,QAAQ,CAAC,SAAS,GAAG,SAAS,EAAE,CAAC;gBACnC,MAAM,IAAI,KAAK,CAAC,kCAAkC,CAAC,CAAC;YACtD,CAAC;YACD,IAAI,kBAAkB,GAAG,QAAQ,CAAC,SAAS,GAAG,SAAS,IAAI,CAAC,QAAQ,EAAE,CAAC;gBACrE,UAAU,EAAE,CAAC;YACf,CAAC;iBAAM,CAAC;gBACN,IACE,QAAQ,CAAC,UAAU;oBACnB,kBAAkB,GAAG,QAAQ,CAAC,SAAS,IAAI,SAAS;oBACpD,QAAQ,EACR,CAAC;oBACD,kBAAkB,IAAI,QAAQ,CAAC,SAAS,CAAC;oBACzC,YAAY,CAAC,IAAI,CAAC,CAAC,QAAQ,CAAC,IAAI,EAAE,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC;oBACvD,MAAM,CAAC,KAAK,EAAE,CAAC;oBACf,QAAQ,GAAG,KAAK,CAAC;gBACnB,CAAC;qBAAM,CAAC;oBACN,UAAU,EAAE,CAAC;gBACf,CAAC;YACH,CAAC;QACH,CAAC;QAED,wBAAwB;QACxB,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;QAC3D,CAAC;QAED,OAAO,IAAI,CAAC,kBAAkB,CAAC,MAAM,CAAC,CAAC;IACzC,CAAC;IAED,kBAAkB,CAAC,MAAgB;QACjC,MAAM,SAAS,GAAa,EAAE,CAAC;QAC/B,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,MAAM,YAAY,GAAG,KAAK,CAAC,IAAI,EAAE,CAAC;YAClC,IAAI,YAAY,KAAK,EAAE,EAAE,CAAC;gBACxB,SAAS,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YAC/B,CAAC;QACH,CAAC;QACD,OAAO,SAAS,CAAC;IACnB,C
AAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sentenceSplitter.test.d.ts","sourceRoot":"","sources":["../../src/services/sentenceSplitter.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
//@ts-nocheck
|
|
2
|
+
import { expect, test } from 'vitest';
|
|
3
|
+
import { CustomSentenceSplitter } from './sentenceSplitter';
|
|
4
|
+
import { SentenceSplitter, IngestionPipeline, Document } from "llamaindex";
|
|
5
|
+
// do these tests just to make sure that we can factor out my hacky fixes when llamaindex is fixed.
|
|
6
|
+
// test that original sentenceSplitter splits on abbreviations
|
|
7
|
+
// test that original sentenceSplitter splits on abbreviations even when specified
|
|
8
|
+
// test that my modified sentenceSplitter excludes metadata when arg is specified
|
|
9
|
+
// test that my modified sentenceSplitter includes metadata when arg is specified the other way
|
|
10
|
+
// Shared fixture: one paragraph containing the abbreviations "Co.", "Mr." and "N.A.".
const documents = [
    new Document({
        text: "JPMorgan Chase & Co. elected Mark Weinberger as a director, effective January 16, 2024, and the Board of Directors appointed him as a member of the Audit Committee. Mr. Weinberger was Global Chairman and Chief Executive Officer of Ernst & Young from 2013 to 2019. He was also elected a director of JPMorgan Chase Bank, N.A. and a manager of JPMorgan Chase Holdings LLC, and may be elected a director of such other subsidiary or subsidiaries as may be determined from time to time.",
    }),
];
// Pipeline built on the stock llamaindex splitter (only used by the commented-out test below).
const originalSentenceSplitterPipeline = new IngestionPipeline({
    transformations: [new SentenceSplitter({ chunkSize: 50, chunkOverlap: 10 })],
});
// Pipeline built on the custom splitter with its default abbreviation list.
const customSentenceSplitterPipeline = new IngestionPipeline({
    transformations: [new CustomSentenceSplitter({ chunkSize: 50, chunkOverlap: 10 })],
});
|
|
23
|
+
// The pipeline run is awaited so that a failing expectation actually fails the
// test; previously the promise was left floating and the test always passed.
test("my modified sentenceSplitter doesn't eliminate spaces", async () => {
    const nodes = await customSentenceSplitterPipeline.run({ documents: documents });
    // None of the chunks may contain an abbreviation glued to the following word.
    for (const fused of ["Co.elected", "Mr.Weinberger", "A.and"]) {
        expect(nodes.some((node) => node["text"].indexOf(fused) > -1)).toEqual(false);
    }
});
|
|
30
|
+
// test("original sentenceSplitter does eliminate spaces", () => {
|
|
31
|
+
// originalSentenceSplitterPipeline.run({documents: documents}).then((nodes) => {
|
|
32
|
+
// expect(nodes.some((node) => node["text"].indexOf("Co.elected") > -1)).toEqual(true);
|
|
33
|
+
// expect(nodes.some((node) => node["text"].indexOf("Mr.Weinberger") > -1)).toEqual(true);
|
|
34
|
+
// expect(nodes.some((node) => node["text"].indexOf("A.and") > -1)).toEqual(true);
|
|
35
|
+
// });
|
|
36
|
+
// });
|
|
37
|
+
// Custom splitter configured with an empty abbreviation list.
const noAbbrevsCustomSentenceSplitterPipeline = new IngestionPipeline({
    transformations: [
        new CustomSentenceSplitter({
            chunkSize: 50,
            chunkOverlap: 10,
            abbreviations: [],
        }),
    ],
});
|
|
42
|
+
// Awaiting the run makes the expectation part of the test; it used to execute
// after the test had already been reported as passed.
test("my modified sentenceSplitter doesn't split on specified abbreviations", async () => {
    const nodes = await customSentenceSplitterPipeline.run({ documents: documents });
    // No chunk may end right after "Mr."
    expect(nodes.map((node) => !!node["text"].match(/Mr\.$/))).not.toContainEqual(true);
});
|
|
47
|
+
// this is only a problem on branch fix/sentence-splitter-spaces
|
|
48
|
+
// where the chunker is eliminated entirely in favor of just splitting by sentences with natural.
|
|
49
|
+
// Awaiting the run makes the expectation part of the test; it used to execute
// after the test had already been reported as passed.
test("original sentenceSplitter splits in silly places, like Mr", async () => {
    const nodes = await noAbbrevsCustomSentenceSplitterPipeline.run({ documents: documents });
    // With no abbreviations configured, at least one chunk ends right after "Mr."
    expect(nodes.map((node) => !!node["text"].match(/Mr\.$/))).toContainEqual(true);
});
|
|
54
|
+
// [input, expected output] pairs: each short sentence must come back as a single, unchanged node.
const testcases = [
    ["USA v. 4227 JENIFER STREET N.W. WASHINGTON, D.C., AND ELECTRONIC DEVICES THEREIN UNDER RULE 41", "USA v. 4227 JENIFER STREET N.W. WASHINGTON, D.C., AND ELECTRONIC DEVICES THEREIN UNDER RULE 41"],
    ["JPMorgan Chase & Co. elected Mark Weinberger as a director, effective January 16, 2024, and the Board of Directors appointed him as a member of the Audit Committee.", "JPMorgan Chase & Co. elected Mark Weinberger as a director, effective January 16, 2024, and the Board of Directors appointed him as a member of the Audit Committee."],
    ["Mr. Weinberger was Global Chairman and Chief Executive Officer of Ernst & Young from 2013 to 2019.", "Mr. Weinberger was Global Chairman and Chief Executive Officer of Ernst & Young from 2013 to 2019."],
    ["He was also elected a director of JPMorgan Chase Bank, N.A. and a manager of JPMorgan Chase Holdings LLC, and may be elected a director of such other subsidiary or subsidiaries as may be determined from time to time.", "He was also elected a director of JPMorgan Chase Bank, N.A. and a manager of JPMorgan Chase Holdings LLC, and may be elected a director of such other subsidiary or subsidiaries as may be determined from time to time."],
];
for (const [testcase_input, testcase_expected_output] of testcases) {
    // Each generated test awaits the pipeline so its expectations can fail the
    // test; previously they ran in a floating promise after the test had ended.
    test(`my sentenceSplitter correctly handles short sentence ${testcase_input}`, async () => {
        const nodes = await customSentenceSplitterPipeline.run({
            documents: [new Document({ text: testcase_input })],
        });
        expect(nodes.length).toEqual(1);
        expect(nodes[0]["text"]).toEqual(testcase_expected_output);
    });
}
|
|
68
|
+
//# sourceMappingURL=sentenceSplitter.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sentenceSplitter.test.js","sourceRoot":"","sources":["../../src/services/sentenceSplitter.test.ts"],"names":[],"mappings":"AAAA,aAAa;AACb,OAAO,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,QAAQ,CAAA;AACrC,OAAO,EAAE,sBAAsB,EAAE,MAAM,oBAAoB,CAAA;AAC3D,OAAO,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAE3E,mGAAmG;AACnG,8DAA8D;AAC9D,kFAAkF;AAElF,iFAAiF;AACjF,+FAA+F;AAI/F,IAAI,SAAS,GAAG;IACZ,IAAI,QAAQ,CAAC,EAAE,IAAI,EAAE,oeAAoe,EAAE,CAAC;CAC/f,CAAC;AAEF,IAAI,gCAAgC,GAAG,IAAI,iBAAiB,CAAC;IACzD,eAAe,EAAE;QACb,IAAI,gBAAgB,CAAC,EAAE,SAAS,EAAE,EAAE,EAAE,YAAY,EAAE,EAAE,EAAE,CAAC;KACxD;CACJ,CAAC,CAAC;AACP,IAAI,8BAA8B,GAAG,IAAI,iBAAiB,CAAC;IACvD,eAAe,EAAE;QACf,IAAI,sBAAsB,CAAC,EAAE,SAAS,EAAE,EAAE,EAAE,YAAY,EAAE,EAAE,EAAE,CAAC;KAChE;CACF,CAAC,CAAC;AAEL,IAAI,CAAC,uDAAuD,EAAE,GAAG,EAAE;IAC/D,8BAA8B,CAAC,GAAG,CAAC,EAAC,SAAS,EAAE,SAAS,EAAC,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,EAAE;QACtE,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;QACrF,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;QACxF,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;IACpF,CAAC,CAAC,CAAC;AACP,CAAC,CAAC,CAAC;AAEH,kEAAkE;AAClE,qFAAqF;AACrF,+FAA+F;AAC/F,kGAAkG;AAClG,0FAA0F;AAC1F,UAAU;AACV,MAAM;AAEN,IAAI,uCAAuC,GAAG,IAAI,iBAAiB,CAAC;IAChE,eAAe,EAAE;QACf,IAAI,sBAAsB,CAAC,EAAE,SAAS,EAAE,EAAE,EAAE,YAAY,EAAE,EAAE,EAAE,aAAa,EAAE,EAAE,EAAC,CAAC;KAClF;CACF,CAAC,CAAC;AAGH,IAAI,CAAC,uEAAuE,EAAE,GAAG,EAAE;IACjF,8BAA8B,CAAC,GAAG,CAAC,EAAC,SAAS,EAAE,SAAS,EAAC,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,EAAE;QACtE,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;IACxF,CAAC,CAAC,CAAC;AACP,CAAC,CAAC,CA
AC;AAEH,gEAAgE;AAChE,iGAAiG;AACjG,IAAI,CAAC,2DAA2D,EAAE,GAAG,EAAE;IACnE,uCAAuC,CAAC,GAAG,CAAC,EAAC,SAAS,EAAE,SAAS,EAAC,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,EAAE;QAC/E,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;IACpF,CAAC,CAAC,CAAC;AACP,CAAC,CAAC,CAAC;AAEH,MAAM,SAAS,GAAG;IACd,CAAC,gGAAgG,EAAE,gGAAgG,CAAC;IACpM,CAAC,sKAAsK,EAAE,sKAAsK,CAAC;IAChV,CAAC,oGAAoG,EAAE,oGAAoG,CAAC;IAC5M,CAAC,0NAA0N,EAAE,0NAA0N,CAAC;CAE3b,CAAC;AACF,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,EAAE,wBAAwB,CAAC,EAAE,EAAE;IAC7D,IAAI,CAAC,wDAAwD,cAAc,EAAE,EAAE,GAAG,EAAE;QAChF,8BAA8B,CAAC,GAAG,CAAC,EAAC,SAAS,EAAE,CAAC,IAAI,QAAQ,CAAC,EAAC,IAAI,EAAE,cAAc,EAAC,CAAC,CAAC,EAAC,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,EAAE;YACnG,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;YAChC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,wBAAwB,CAAC,CAAC;QAC/D,CAAC,CAAC,CAAC;IACP,CAAC,CAAC,CAAA;AACN,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { TextNode, TransformComponent } from "llamaindex";
|
|
2
|
+
/**
 * Options accepted by the Sploder constructor.
 * `maxStringTokenCount` is the only field declared here.
 */
interface SploderConfig {
|
|
3
|
+
maxStringTokenCount: number;
|
|
4
|
+
}
|
|
5
|
+
/**
 * Type declaration for the Sploder transform component.
 * Publicly it exposes a constructor taking a SploderConfig and an async
 * `transform` that maps an array of TextNode to an array of TextNode;
 * the remaining members are private.
 */
export declare class Sploder extends TransformComponent {
|
|
6
|
+
private maxTokenCount;
|
|
7
|
+
private tokenizer;
|
|
8
|
+
constructor(config: SploderConfig);
|
|
9
|
+
private getTokenCount;
|
|
10
|
+
transform(nodes: TextNode[]): Promise<TextNode[]>;
|
|
11
|
+
}
|
|
12
|
+
export {};
|
|
13
|
+
//# sourceMappingURL=sploder.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sploder.d.ts","sourceRoot":"","sources":["../../src/services/sploder.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAY,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAGpE,UAAU,aAAa;IACrB,mBAAmB,EAAE,MAAM,CAAC;CAC7B;AAED,qBAAa,OAAQ,SAAQ,kBAAkB;IAC7C,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,SAAS,CAAM;gBAIX,MAAM,EAAE,aAAa;IAMjC,OAAO,CAAC,aAAa;IAIf,SAAS,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;CAsCxD"}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { TextNode, TransformComponent } from "llamaindex";
|
|
2
|
+
import { encodingForModel } from "js-tiktoken";
|
|
3
|
+
/**
 * Transform component that keeps every input node and, for nodes whose text is
 * within the configured token limit, also emits extra nodes whose text joins
 * the node with its neighbours (current + next, and previous + current + next).
 */
export class Sploder extends TransformComponent {
    maxTokenCount;
    tokenizer; // js-tiktoken encoder
    // TODO: this is a hack to get the tokenizer for the embedding model
    // TODO: this should be a singleton
    /**
     * @param config - Object whose `maxStringTokenCount` is stored as the token limit.
     */
    constructor(config) {
        super(async (nodes) => nodes); // no-op, to be replaced later
        this.maxTokenCount = config.maxStringTokenCount;
        this.tokenizer = encodingForModel("text-embedding-3-small");
    }
    /**
     * @param text - String to measure.
     * @returns Number of tokens the tokenizer produces for `text`.
     */
    getTokenCount(text) {
        const encoded = this.tokenizer.encode(text);
        return encoded.length;
    }
    /**
     * @param nodes - Input nodes, in order.
     * @returns For each input node: the node itself, followed by its expanded
     *   variants (if any), in input order.
     */
    async transform(nodes) {
        return nodes.flatMap((node, index) => {
            // The original node is always kept.
            const produced = [node];
            // Nodes over the token limit are passed through without expansion.
            if (this.getTokenCount(node.text) > this.maxTokenCount) {
                return produced;
            }
            const prevNode = index > 0 ? nodes[index - 1] : null;
            const nextNode = index < nodes.length - 1 ? nodes[index + 1] : null;
            // Both expanded variants need a following node.
            if (!nextNode) {
                return produced;
            }
            // current + next
            produced.push(new TextNode({
                text: `${node.text} ${nextNode.text}`,
                metadata: { ...node.metadata, isExpanded: true },
            }));
            // previous + current + next
            if (prevNode) {
                produced.push(new TextNode({
                    text: `${prevNode.text} ${node.text} ${nextNode.text}`,
                    metadata: { ...node.metadata, isExpanded: true },
                }));
            }
            return produced;
        });
    }
}
|
|
45
|
+
//# sourceMappingURL=sploder.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sploder.js","sourceRoot":"","sources":["../../src/services/sploder.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAY,kBAAkB,EAAE,MAAM,YAAY,CAAC;AACpE,OAAO,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAM/C,MAAM,OAAO,OAAQ,SAAQ,kBAAkB;IACrC,aAAa,CAAS;IACtB,SAAS,CAAM,CAAC,sBAAsB;IAE9C,oEAAoE;IACpE,mCAAmC;IACnC,YAAY,MAAqB;QAC/B,KAAK,CAAC,KAAK,EAAE,KAAiB,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,8BAA8B;QACzE,IAAI,CAAC,aAAa,GAAG,MAAM,CAAC,mBAAmB,CAAC;QAChD,IAAI,CAAC,SAAS,GAAG,gBAAgB,CAAC,wBAAwB,CAAC,CAAC;IAC9D,CAAC;IAEO,aAAa,CAAC,IAAY;QAChC,OAAO,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;IAC5C,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,KAAiB;QAC/B,MAAM,QAAQ,GAAe,EAAE,CAAC;QAEhC,KAAK,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE;YAC5B,qBAAqB;YACrB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAEpB,2BAA2B;YAC3B,IAAI,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;gBACvD,OAAO;YACT,CAAC;YAED,MAAM,QAAQ,GAAG,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;YACrD,MAAM,QAAQ,GAAG,KAAK,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;YAEpE,+CAA+C;YAC/C,IAAI,QAAQ,EAAE,CAAC;gBACb,QAAQ,CAAC,IAAI,CACX,IAAI,QAAQ,CAAC;oBACX,IAAI,EAAE,IAAI,CAAC,IAAI,GAAG,GAAG,GAAG,QAAQ,CAAC,IAAI;oBACrC,QAAQ,EAAE,EAAE,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,EAAE,IAAI,EAAE;iBACjD,CAAC,CACH,CAAC;YACJ,CAAC;YAED,2DAA2D;YAC3D,IAAI,QAAQ,IAAI,QAAQ,EAAE,CAAC;gBACzB,QAAQ,CAAC,IAAI,CACX,IAAI,QAAQ,CAAC;oBACX,IAAI,EAAE,QAAQ,CAAC,IAAI,GAAG,GAAG,GAAG,IAAI,CAAC,IAAI,GAAG,GAAG,GAAG,QAAQ,CAAC,IAAI;oBAC3D,QAAQ,EAAE,EAAE,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,EAAE,IAAI,EAAE;iBACjD,CAAC,CACH,CAAC;YACJ,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,OAAO,QAAQ,CAAC;IAClB,CAAC;CACF"}
|