@meaningfully/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/.nvmrc +1 -0
  2. package/LICENSE +7 -0
  3. package/README.md +3 -0
  4. package/dist/DocumentSetManager.d.ts +28 -0
  5. package/dist/DocumentSetManager.d.ts.map +1 -0
  6. package/dist/DocumentSetManager.js +134 -0
  7. package/dist/DocumentSetManager.js.map +1 -0
  8. package/dist/Meaningfully.d.ts +52 -0
  9. package/dist/Meaningfully.d.ts.map +1 -0
  10. package/dist/Meaningfully.js +206 -0
  11. package/dist/Meaningfully.js.map +1 -0
  12. package/dist/MetadataManager.d.ts +32 -0
  13. package/dist/MetadataManager.d.ts.map +1 -0
  14. package/dist/MetadataManager.js +115 -0
  15. package/dist/MetadataManager.js.map +1 -0
  16. package/dist/api/embedding.d.ts +7 -0
  17. package/dist/api/embedding.d.ts.map +1 -0
  18. package/dist/api/embedding.js +94 -0
  19. package/dist/api/embedding.js.map +1 -0
  20. package/dist/api/embedding.test.d.ts +2 -0
  21. package/dist/api/embedding.test.d.ts.map +1 -0
  22. package/dist/api/embedding.test.js +340 -0
  23. package/dist/api/embedding.test.js.map +1 -0
  24. package/dist/index.d.ts +5 -0
  25. package/dist/index.d.ts.map +1 -0
  26. package/dist/index.js +6 -0
  27. package/dist/index.js.map +1 -0
  28. package/dist/services/batchingWeaviateVectorStore.d.ts +6 -0
  29. package/dist/services/batchingWeaviateVectorStore.d.ts.map +1 -0
  30. package/dist/services/batchingWeaviateVectorStore.js +21 -0
  31. package/dist/services/batchingWeaviateVectorStore.js.map +1 -0
  32. package/dist/services/csvLoader.d.ts +3 -0
  33. package/dist/services/csvLoader.d.ts.map +1 -0
  34. package/dist/services/csvLoader.js +18 -0
  35. package/dist/services/csvLoader.js.map +1 -0
  36. package/dist/services/csvLoader.test.d.ts +2 -0
  37. package/dist/services/csvLoader.test.d.ts.map +1 -0
  38. package/dist/services/csvLoader.test.js +75 -0
  39. package/dist/services/csvLoader.test.js.map +1 -0
  40. package/dist/services/embeddings.d.ts +22 -0
  41. package/dist/services/embeddings.d.ts.map +1 -0
  42. package/dist/services/embeddings.js +314 -0
  43. package/dist/services/embeddings.js.map +1 -0
  44. package/dist/services/embeddings.test.d.ts +2 -0
  45. package/dist/services/embeddings.test.d.ts.map +1 -0
  46. package/dist/services/embeddings.test.js +115 -0
  47. package/dist/services/embeddings.test.js.map +1 -0
  48. package/dist/services/loggingOpenAIEmbedding.d.ts +2 -0
  49. package/dist/services/loggingOpenAIEmbedding.d.ts.map +1 -0
  50. package/dist/services/loggingOpenAIEmbedding.js +41 -0
  51. package/dist/services/loggingOpenAIEmbedding.js.map +1 -0
  52. package/dist/services/mockEmbedding.d.ts +6 -0
  53. package/dist/services/mockEmbedding.d.ts.map +1 -0
  54. package/dist/services/mockEmbedding.js +14 -0
  55. package/dist/services/mockEmbedding.js.map +1 -0
  56. package/dist/services/progressManager.d.ts +21 -0
  57. package/dist/services/progressManager.d.ts.map +1 -0
  58. package/dist/services/progressManager.js +76 -0
  59. package/dist/services/progressManager.js.map +1 -0
  60. package/dist/services/progressVectorStoreIndex.d.ts +21 -0
  61. package/dist/services/progressVectorStoreIndex.d.ts.map +1 -0
  62. package/dist/services/progressVectorStoreIndex.js +60 -0
  63. package/dist/services/progressVectorStoreIndex.js.map +1 -0
  64. package/dist/services/sentenceSplitter.d.ts +17 -0
  65. package/dist/services/sentenceSplitter.d.ts.map +1 -0
  66. package/dist/services/sentenceSplitter.js +207 -0
  67. package/dist/services/sentenceSplitter.js.map +1 -0
  68. package/dist/services/sentenceSplitter.test.d.ts +2 -0
  69. package/dist/services/sentenceSplitter.test.d.ts.map +1 -0
  70. package/dist/services/sentenceSplitter.test.js +68 -0
  71. package/dist/services/sentenceSplitter.test.js.map +1 -0
  72. package/dist/services/sploder.d.ts +13 -0
  73. package/dist/services/sploder.d.ts.map +1 -0
  74. package/dist/services/sploder.js +45 -0
  75. package/dist/services/sploder.js.map +1 -0
  76. package/dist/types/index.d.ts +77 -0
  77. package/dist/types/index.d.ts.map +1 -0
  78. package/dist/types/index.js +2 -0
  79. package/dist/types/index.js.map +1 -0
  80. package/dist/utils.d.ts +3 -0
  81. package/dist/utils.d.ts.map +1 -0
  82. package/dist/utils.js +7 -0
  83. package/dist/utils.js.map +1 -0
  84. package/package.json +43 -0
  85. package/src/Meaningfully.d.ts +57 -0
  86. package/src/Meaningfully.ts +228 -0
  87. package/src/MetadataManager.d.ts +27 -0
  88. package/src/MetadataManager.ts +145 -0
  89. package/src/api/embedding.d.ts +6 -0
  90. package/src/api/embedding.ts +122 -0
  91. package/src/index.ts +5 -0
  92. package/src/services/batchingWeaviateVectorStore.d.ts +5 -0
  93. package/src/services/batchingWeaviateVectorStore.ts +23 -0
  94. package/src/services/csvLoader.d.ts +2 -0
  95. package/src/services/csvLoader.ts +24 -0
  96. package/src/services/embeddings.d.ts +21 -0
  97. package/src/services/embeddings.ts +374 -0
  98. package/src/services/loggingOpenAIEmbedding.d.ts +0 -0
  99. package/src/services/loggingOpenAIEmbedding.ts +46 -0
  100. package/src/services/mockEmbedding.d.ts +5 -0
  101. package/src/services/mockEmbedding.ts +13 -0
  102. package/src/services/progressManager.d.ts +20 -0
  103. package/src/services/progressManager.ts +88 -0
  104. package/src/services/progressVectorStoreIndex.d.ts +20 -0
  105. package/src/services/progressVectorStoreIndex.ts +95 -0
  106. package/src/services/sentenceSplitter.d.ts +16 -0
  107. package/src/services/sentenceSplitter.ts +243 -0
  108. package/src/services/sploder.d.ts +12 -0
  109. package/src/services/sploder.ts +62 -0
  110. package/src/types/index.d.ts +71 -0
  111. package/src/types/index.ts +89 -0
  112. package/src/utils.d.ts +2 -0
  113. package/src/utils.ts +6 -0
  114. package/tests/MetadataManager.test.ts +120 -0
  115. package/tests/csvLoader.test.d.ts +1 -0
  116. package/tests/csvLoader.test.ts +88 -0
  117. package/tests/embedding.test.d.ts +1 -0
  118. package/tests/embedding.test.ts +425 -0
  119. package/tests/embeddings.test.d.ts +1 -0
  120. package/tests/embeddings.test.ts +144 -0
  121. package/tests/sentenceSplitter.test.d.ts +1 -0
  122. package/tests/sentenceSplitter.test.ts +81 -0
  123. package/tsconfig.json +31 -0
  124. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,88 @@
1
+ /**
2
+ * A simple manager to track progress of various operations
3
+ */
4
+
5
+ const FUDGE_FACTOR = 1.2; // seat of my pants guess
6
+
7
+ export class ProgressManager {
8
+ private static instance: ProgressManager;
9
+ private progressMap: Map<string, { progress: number; total: number; startTime: number; lastUpdateTime: number }> = new Map();
10
+ private currentOperation: string | null = null;
11
+
12
+
13
+ private constructor() {}
14
+
15
+ public static getInstance(): ProgressManager {
16
+ if (!ProgressManager.instance) {
17
+ ProgressManager.instance = new ProgressManager();
18
+ }
19
+ return ProgressManager.instance;
20
+ }
21
+
22
+ public startOperation(operationId: string, total: number = 100): void {
23
+ const now = Date.now();
24
+ this.progressMap.set(operationId, { progress: 0, total, startTime: now, lastUpdateTime: now });
25
+ this.currentOperation = operationId;
26
+ }
27
+
28
+ public updateProgress(operationId: string, progress: number): void {
29
+ const currentProgress = this.progressMap.get(operationId);
30
+ if (currentProgress) {
31
+ this.progressMap.set(operationId, {
32
+ progress,
33
+ total: currentProgress.total,
34
+ startTime: currentProgress.startTime,
35
+ lastUpdateTime: Date.now()
36
+ });
37
+ }
38
+ }
39
+
40
+ public completeOperation(operationId: string): void {
41
+ const currentProgress = this.progressMap.get(operationId);
42
+ if (currentProgress) {
43
+ this.progressMap.set(operationId, {
44
+ progress: currentProgress.total,
45
+ total: currentProgress.total,
46
+ startTime: currentProgress.startTime,
47
+ lastUpdateTime: Date.now()
48
+ });
49
+ }
50
+
51
+ if (this.currentOperation === operationId) {
52
+ this.currentOperation = null;
53
+ }
54
+ }
55
+
56
+ public getCurrentProgress(): { progress: number; total: number; elapsedTimeMs: number; estimatedTimeRemainingMs: number | null } {
57
+ if (this.currentOperation) {
58
+ const operationData = this.progressMap.get(this.currentOperation);
59
+ if (operationData) {
60
+ const now = Date.now();
61
+ const elapsedTimeMs = now - operationData.startTime;
62
+ let estimatedTimeRemainingMs: number | null = null;
63
+
64
+ // Only estimate if we have meaningful progress (at least 5% complete and some time elapsed)
65
+ if (operationData.progress > 0.05 * operationData.total && elapsedTimeMs > 1000) {
66
+ const progressPercentage = operationData.progress / operationData.total;
67
+ const estimatedTotalTime = (elapsedTimeMs / progressPercentage) * FUDGE_FACTOR;
68
+ estimatedTimeRemainingMs = Math.max(0, estimatedTotalTime - elapsedTimeMs);
69
+ }
70
+
71
+ return {
72
+ progress: operationData.progress,
73
+ total: operationData.total,
74
+ elapsedTimeMs,
75
+ estimatedTimeRemainingMs
76
+ };
77
+ }
78
+ }
79
+ return { progress: 0, total: 100, elapsedTimeMs: 0, estimatedTimeRemainingMs: null };
80
+ }
81
+
82
+ public clearOperation(operationId: string): void {
83
+ this.progressMap.delete(operationId);
84
+ if (this.currentOperation === operationId) {
85
+ this.currentOperation = null;
86
+ }
87
+ }
88
+ }
@@ -0,0 +1,20 @@
1
import { VectorStoreIndex, VectorIndexOptions as BaseVectorIndexOptions } from "llamaindex";
import { BaseNode } from "llamaindex";
/**
 * VectorStoreIndex init options extended with an optional progress callback,
 * invoked with (progress, total) as nodes are embedded.
 */
export interface VectorIndexOptions extends BaseVectorIndexOptions {
    progressCallback?: (progress: number, total: number) => void;
}
/**
 * A VectorStoreIndex subclass that threads a `progressCallback` through
 * index construction, node insertion, and embedding computation.
 */
export declare class ProgressVectorStoreIndex extends VectorStoreIndex {
    /** Builds (or restores) an index, forwarding progressCallback to buildIndexFromNodes. */
    static init(options: VectorIndexOptions): Promise<VectorStoreIndex>;
    /** Embeds and inserts the given nodes, reporting progress via the callback. */
    buildIndexFromNodes(nodes: BaseNode[], options?: {
        logProgress?: boolean;
        progressCallback?: (progress: number, total: number) => void;
    }): Promise<void>;
    /** Embeds nodes, writes them to the vector stores, and persists the index struct. */
    insertNodes(nodes: BaseNode[], options?: {
        logProgress?: boolean;
        progressCallback?: (progress: number, total: number) => void;
    }): Promise<void>;
    /** Computes embeddings per modality, passing the callback to the embed model. */
    getNodeEmbeddingResults(nodes: BaseNode[], options?: {
        logProgress?: boolean;
        progressCallback?: (progress: number, total: number) => void;
    }): Promise<BaseNode[]>;
}
@@ -0,0 +1,95 @@
1
+ import { VectorStoreIndex, type VectorIndexOptions as BaseVectorIndexOptions, storageContextFromDefaults, IndexDict } from "llamaindex";
2
+ import { BaseNode, ModalityType, splitNodesByType } from "llamaindex";
3
+ import { addNodesToVectorStores } from "llamaindex";
4
+
5
// Extend the VectorIndexOptions interface to include progressCallback
export interface VectorIndexOptions extends BaseVectorIndexOptions {
  progressCallback?: (progress: number, total: number) => void;
}

// Subclass VectorStoreIndex to handle progressCallback.
// NOTE(review): the @ts-ignore below suppresses a structural mismatch with the
// base class's declared members — presumably because we override methods whose
// upstream signatures differ; confirm against the installed llamaindex version.
// @ts-ignore
export class ProgressVectorStoreIndex extends VectorStoreIndex {
  /**
   * Creates an index, mirroring VectorStoreIndex.init but forwarding
   * `progressCallback` into the node-embedding path.
   *
   * Throws if neither `options.nodes` nor a persisted index struct is available.
   */
  public static async init(
    options: VectorIndexOptions,
  ): Promise<VectorStoreIndex> {
    const storageContext =
      options.storageContext ?? (await storageContextFromDefaults({}));
    const indexStore = storageContext.indexStore;
    const docStore = storageContext.docStore;

    // setupIndexStructFromStorage is not part of the public typings, hence the
    // suppression; it restores a previously persisted index struct if one exists.
    // @ts-ignore
    let indexStruct = await VectorStoreIndex.setupIndexStructFromStorage(
      indexStore,
      options,
    );

    if (!options.nodes && !indexStruct) {
      throw new Error(
        "Cannot initialize VectorStoreIndex without nodes or indexStruct",
      );
    }

    indexStruct = indexStruct ?? new IndexDict();

    // The constructor's options type is not publicly declared either.
    // @ts-ignore
    const index = new this({
      storageContext,
      docStore,
      indexStruct,
      indexStore,
      vectorStores: options.vectorStores,
    });

    if (options.nodes) {
      // If nodes are passed in, then we need to update the index
      await index.buildIndexFromNodes(options.nodes, {
        logProgress: options.logProgress,
        progressCallback: options.progressCallback,
      });
    }
    return index;
  }

  /** Embeds `nodes` and inserts them; thin wrapper over insertNodes. */
  async buildIndexFromNodes(
    nodes: BaseNode[],
    options?: { logProgress?: boolean; progressCallback?: (progress: number, total: number) => void }
  ) {
    await this.insertNodes(nodes, options);
  }

  /**
   * Embeds the nodes (forwarding the progress callback), writes them to the
   * configured vector stores, then persists the updated index struct.
   */
  async insertNodes(
    nodes: BaseNode[],
    options?: { logProgress?: boolean; progressCallback?: (progress: number, total: number) => void }
  ) {
    if (!nodes || nodes.length === 0) {
      return;
    }

    nodes = await this.getNodeEmbeddingResults(nodes, options);
    await addNodesToVectorStores(
      nodes,
      this.vectorStores,
      this.insertNodesToStore.bind(this),
    );
    await this.indexStore.addIndexStruct(this.indexStruct);
  }

  /**
   * Groups nodes by modality and runs each group through the matching embed
   * model, passing the progress callback along.
   */
  async getNodeEmbeddingResults(
    nodes: BaseNode[],
    options?: { logProgress?: boolean; progressCallback?: (progress: number, total: number) => void }
  ): Promise<BaseNode[]> {
    const nodeMap = splitNodesByType(nodes);
    for (const type in nodeMap) {
      // NOTE: this inner `nodes` deliberately shadows the parameter; the embed
      // model appears to attach embeddings to the nodes in place, so returning
      // the outer `nodes` at the end still yields the embedded nodes — confirm
      // against the llamaindex embedModel contract.
      const nodes = nodeMap[type as ModalityType];
      // Prefer a store-specific embed model; fall back to the index's default.
      const embedModel = this.vectorStores[type as ModalityType]?.embedModel ?? this.embedModel;
      if (embedModel && nodes) {
        await embedModel(nodes, {
          logProgress: options?.logProgress,
          progressCallback: options?.progressCallback, // Pass progressCallback to embedModel
        });
      }
    }
    return nodes;
  }
}
@@ -0,0 +1,16 @@
1
import { SentenceSplitter } from "llamaindex";
import natural from "natural";
/** A function that splits a string into smaller string pieces. */
type TextSplitterFn = (text: string) => string[];
/**
 * A SentenceSplitter variant that tokenizes sentences with natural's
 * SentenceTokenizer (configured with a custom abbreviation list) and
 * preserves inter-sentence whitespace when chunks are merged back together.
 */
export declare class CustomSentenceSplitter extends SentenceSplitter {
    #private;
    /** Produces the sentence-level splitter backed by `tokenizer`. */
    chunkingTokenizerFn: () => TextSplitterFn;
    /** Abbreviations the sentence tokenizer must not treat as sentence ends. */
    abbreviations: string[];
    tokenizer: natural.SentenceTokenizer;
    constructor(params?: {
        chunkSize?: number;
        chunkOverlap?: number;
        abbreviations?: string[];
    });
    /** Splits `text` into chunks of at most `chunkSize` tokens. */
    _splitText(text: string, chunkSize: number): string[];
}
export {};
@@ -0,0 +1,243 @@
1
+ import { SentenceSplitter, splitBySep, splitByRegex, splitByChar, Settings } from "llamaindex";
2
+ import natural from "natural"
3
+
4
+ /*
5
+ LlamaIndex's SentenceSplitter includes the length of the metadata as part of the size of the chunk when splitting by sentences.
6
+ This produces very unintuitive behavior: e.g. when the user specifies a chunk-size of 50 and nodes have metadata of length 40,
7
+ the resulting split sentences are about 10 tokens long -- as opposed to the specified 50.
8
+
9
+ This modified SentenceSplitter adds a `include_metadata_in_chunksize` flag that disables the above behavior,
10
+ ignoring metadata when calculating chunksize (i.e. only including the size of the text data when calculating chunksize.)
11
+
12
+ Additionally, splitTextMetadataAware does some bizarre stuff where it will split sentences at abbreviations -- even if the
13
+ underlying tokenizer knows about the abbreviations, I think due to some weird sub-sentence splitting. It also sews sentence
14
+ chunks back together in a way that eliminates spaces, e.g. `JPMorgan Chase & Co.elected Mark Weinberger` and `Mr.Weinberger was Global Chairman`.
15
+
16
+ I also tried making SentenceSplitter just split on sentences (with Natural) but this misbehaved by splitting TOO much. I do need short sentences grouped
17
+ together (whether they are true short sentences, or false-positives like "USA v. one 12 ft. I.B.M. mainframe").
18
+
19
+
20
+ */
21
+ // TODO: make this configurable
22
+ const INCLUDE_METADATA_IN_CHUNKSIZE = false;
23
+ SentenceSplitter.prototype.splitTextMetadataAware = function(text: string, metadata: string): string[] {
24
+ const metadataLength = this.tokenSize(metadata);
25
+ const effectiveChunkSize = INCLUDE_METADATA_IN_CHUNKSIZE ? this.chunkSize - metadataLength : this.chunkSize;
26
+ if (effectiveChunkSize <= 0) {
27
+ throw new Error(
28
+ `Metadata length (${metadataLength}) is longer than chunk size (${this.chunkSize}). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.`,
29
+ );
30
+ } else if (effectiveChunkSize < 50) {
31
+ console.log(
32
+ `Metadata length (${metadataLength}) is close to chunk size (${this.chunkSize}). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.`,
33
+ );
34
+ }
35
+ return this._splitText(text, effectiveChunkSize);
36
+ }
37
+
38
+ const default_abbreviations= ['dr.', 'vs.', 'mr.', 'ms.', 'mx.', 'mrs.', 'prof.', 'inc.', 'corp.', 'co.', 'llc.', 'ltd.', 'etc.', "i.e.",
39
+ "etc.",
40
+ "vs.",
41
+ "A.S.A.P.",
42
+ ];
43
+
44
+ // verbatim copies
45
+ type TextSplitterFn = (text: string) => string[];
46
+ type _Split = {
47
+ text: string;
48
+ isSentence: boolean;
49
+ tokenSize: number;
50
+ };
51
+
52
+
53
// This varies from SentenceSplitter in two ways:
// 1. it uses the abbreviations set here.
// 2. it uses a custom SentenceTokenizer with a second trimSentences argument that controls
//    whether or not leading/trailing whitespace is preserved.
//    We want to preserve it, so that when sentences are merged back again, we don't end up with
//    sentences that are not separated by spaces.
// Because the base class keeps its split functions in private #fields, we have to copy over
// almost the whole SentenceSplitter just to make those few small changes.
export class CustomSentenceSplitter extends SentenceSplitter {

  // This function is new: a sentence-level splitter backed by the custom
  // tokenizer; falls back to the whole text if tokenization throws.
  chunkingTokenizerFn = (): TextSplitterFn => {
    return (text: string) => {
      try {
        return this.tokenizer.tokenize(text);
      } catch {
        return [text];
      }
    };
  };
  // Sentence-level split functions, tried in insertion order (see #getSplitsByFns).
  #splitFns: Set<TextSplitterFn> = new Set();
  // Sub-sentence fallbacks, used only when no sentence-level split succeeds.
  #subSentenceSplitFns: Set<TextSplitterFn> = new Set();
  abbreviations: string[];
  tokenizer: natural.SentenceTokenizer;

  constructor(params: { chunkSize?: number; chunkOverlap?: number; abbreviations?: string[] } = {}) {
    super(params);
    // Create custom tokenizer with abbreviations
    this.abbreviations = params.abbreviations || default_abbreviations;

    // I modified my local node_modules/natural/lib/natural/tokenizers/index.d.ts to add the second argument to the natural.SentenceTokenizer constructor.
    // once that gets fixed in the next version of the library, remove the ts-ignore.
    // @ts-ignore
    this.tokenizer = new natural.SentenceTokenizer(this.abbreviations, false); // false is don't trim sentences

    // copied from the superclass.
    this.#splitFns.add(splitBySep(this.paragraphSeparator));

    this.#splitFns.add(this.chunkingTokenizerFn()); // the ONLY change here in the constructor.

    // copied from the superclass.
    this.#subSentenceSplitFns.add(splitByRegex(this.secondaryChunkingRegex));
    this.#subSentenceSplitFns.add(splitBySep(this.separator));
    this.#subSentenceSplitFns.add(splitByChar());

    // left over from a failed attempt to JUST use natural.SentenceTokenizer
    // but I DO in fact need the merge stuff.
    // const tokenizer =
    // Override the default splitText method
    // this.splitText = (text: string): string[] => {
    //   return tokenizer.tokenize(text);
    // };
    // /* tslint:disable:no-unused-variable */
    // this.splitTextMetadataAware = (text: string, metadata: string): string[] => {
    //   return tokenizer.tokenize(text);
    // }
  }


  // Below are verbatim copies of the parent class's private methods,
  // needed only because they consult the #splitFns configured above.

  /** Splits text, then re-merges the pieces into chunks of <= chunkSize tokens. */
  _splitText(text: string, chunkSize: number): string[] {
    if (text === "") return [text];

    const callbackManager = Settings.callbackManager;

    callbackManager.dispatchEvent("chunking-start", {
      text: [text],
    });
    const splits = this.#split(text, chunkSize);
    const chunks = this.#merge(splits, chunkSize);

    callbackManager.dispatchEvent("chunking-end", {
      chunks,
    });
    return chunks;
  }

  // Recursively splits `text` until every piece fits in `chunkSize` tokens.
  #split(text: string, chunkSize: number): _Split[] {
    const tokenSize = this.tokenSize(text);
    if (tokenSize <= chunkSize) {
      // Already small enough: keep as a single sentence-level split.
      return [
        {
          text,
          isSentence: true,
          tokenSize,
        },
      ];
    }
    const [textSplitsByFns, isSentence] = this.#getSplitsByFns(text);
    const textSplits: _Split[] = [];

    for (const textSplit of textSplitsByFns) {
      const tokenSize = this.tokenSize(textSplit);
      if (tokenSize <= chunkSize) {
        textSplits.push({
          text: textSplit,
          isSentence,
          tokenSize,
        });
      } else {
        // Still too big: recurse with the same split-function cascade.
        const recursiveTextSplits = this.#split(textSplit, chunkSize);
        textSplits.push(...recursiveTextSplits);
      }
    }
    return textSplits;
  }

  // Tries sentence-level splitters first, then sub-sentence fallbacks.
  // The boolean reports whether the splits are sentence boundaries.
  #getSplitsByFns(text: string): [splits: string[], isSentence: boolean] {
    for (const splitFn of this.#splitFns) {
      const splits = splitFn(text);
      if (splits.length > 1) {
        return [splits, true];
      }
    }
    for (const splitFn of this.#subSentenceSplitFns) {
      const splits = splitFn(text);
      if (splits.length > 1) {
        return [splits, false];
      }
    }
    return [[text], true];
  }

  // Greedily packs splits into chunks of <= chunkSize tokens, carrying up to
  // chunkOverlap trailing tokens of each finished chunk into the next one.
  #merge(splits: _Split[], chunkSize: number): string[] {
    const chunks: string[] = [];
    let currentChunk: [string, number][] = [];
    let lastChunk: [string, number][] = [];
    let currentChunkLength = 0;
    let newChunk = true;

    // Finalizes the current chunk, then seeds the next chunk with the tail of
    // the finished one (the overlap window).
    const closeChunk = (): void => {
      chunks.push(currentChunk.map(([text]) => text).join(""));
      lastChunk = currentChunk;
      currentChunk = [];
      currentChunkLength = 0;
      newChunk = true;

      let lastIndex = lastChunk.length - 1;
      while (
        lastIndex >= 0 &&
        currentChunkLength + lastChunk[lastIndex]![1] <= this.chunkOverlap
      ) {
        const [text, length] = lastChunk[lastIndex]!;
        currentChunkLength += length;
        currentChunk.unshift([text, length]);
        lastIndex -= 1;
      }
    };

    while (splits.length > 0) {
      const curSplit = splits[0]!;
      if (curSplit.tokenSize > chunkSize) {
        throw new Error("Single token exceeded chunk size");
      }
      if (currentChunkLength + curSplit.tokenSize > chunkSize && !newChunk) {
        closeChunk();
      } else {
        if (
          curSplit.isSentence ||
          currentChunkLength + curSplit.tokenSize <= chunkSize ||
          newChunk
        ) {
          // Consume the split into the current chunk.
          currentChunkLength += curSplit.tokenSize;
          currentChunk.push([curSplit.text, curSplit.tokenSize]);
          splits.shift();
          newChunk = false;
        } else {
          closeChunk();
        }
      }
    }

    // Handle the last chunk
    if (!newChunk) {
      chunks.push(currentChunk.map(([text]) => text).join(""));
    }

    return this.#postprocessChunks(chunks);
  }

  // Trims each chunk and drops any that are empty after trimming.
  #postprocessChunks(chunks: string[]): string[] {
    const newChunks: string[] = [];
    for (const chunk of chunks) {
      const trimmedChunk = chunk.trim();
      if (trimmedChunk !== "") {
        newChunks.push(trimmedChunk);
      }
    }
    return newChunks;
  }
}
@@ -0,0 +1,12 @@
1
import { TextNode, TransformComponent } from "llamaindex";
/** Configuration for Sploder. */
interface SploderConfig {
    /** Nodes whose token count exceeds this are kept but not expanded. */
    maxStringTokenCount: number;
}
/**
 * A transform that keeps every input node and additionally emits merged
 * neighbor nodes (current+next and prev+current+next) for short nodes,
 * marking the merged copies with metadata.isExpanded = true.
 */
export declare class Sploder extends TransformComponent {
    private maxTokenCount;
    private tokenizer;
    constructor(config: SploderConfig);
    private getTokenCount;
    transform(nodes: TextNode[]): Promise<TextNode[]>;
}
export {};
@@ -0,0 +1,62 @@
1
+ import { TextNode, BaseNode, TransformComponent } from "llamaindex";
2
+ import { encodingForModel } from "js-tiktoken";
3
+
4
+ interface SploderConfig {
5
+ maxStringTokenCount: number;
6
+ }
7
+
8
+ export class Sploder extends TransformComponent {
9
+ private maxTokenCount: number;
10
+ private tokenizer: any; // js-tiktoken encoder
11
+
12
+ // TODO: this is a hack to get the tokenizer for the embedding model
13
+ // TODO: this should be a singleton
14
+ constructor(config: SploderConfig) {
15
+ super(async (nodes: BaseNode[]) => nodes); // no-op, to be replaced later
16
+ this.maxTokenCount = config.maxStringTokenCount;
17
+ this.tokenizer = encodingForModel("text-embedding-3-small");
18
+ }
19
+
20
+ private getTokenCount(text: string): number {
21
+ return this.tokenizer.encode(text).length;
22
+ }
23
+
24
+ async transform(nodes: TextNode[]): Promise<TextNode[]> {
25
+ const newNodes: TextNode[] = [];
26
+
27
+ nodes.forEach((node, index) => {
28
+ // Keep original node
29
+ newNodes.push(node);
30
+
31
+ // Skip if text is too long
32
+ if (this.getTokenCount(node.text) > this.maxTokenCount) {
33
+ return;
34
+ }
35
+
36
+ const prevNode = index > 0 ? nodes[index - 1] : null;
37
+ const nextNode = index < nodes.length - 1 ? nodes[index + 1] : null;
38
+
39
+ // Create node with current + next if available
40
+ if (nextNode) {
41
+ newNodes.push(
42
+ new TextNode({
43
+ text: node.text + " " + nextNode.text,
44
+ metadata: { ...node.metadata, isExpanded: true }
45
+ })
46
+ );
47
+ }
48
+
49
+ // Create node with prev + current + next if both available
50
+ if (prevNode && nextNode) {
51
+ newNodes.push(
52
+ new TextNode({
53
+ text: prevNode.text + " " + node.text + " " + nextNode.text,
54
+ metadata: { ...node.metadata, isExpanded: true }
55
+ })
56
+ );
57
+ }
58
+ });
59
+
60
+ return newNodes;
61
+ }
62
+ }
@@ -0,0 +1,71 @@
1
+ export interface SearchResult {
2
+ text: string;
3
+ score: number;
4
+ metadata: Record<string, any>;
5
+ }
6
+ export interface EmbeddingResult {
7
+ success: boolean;
8
+ error?: string;
9
+ index?: any;
10
+ }
11
+ export interface PreviewResult {
12
+ success: boolean;
13
+ error?: string;
14
+ nodes?: Array<{
15
+ text: string;
16
+ metadata: Record<string, any>;
17
+ }>;
18
+ estimatedPrice?: number;
19
+ tokenCount?: number;
20
+ pricePer1M?: number;
21
+ }
22
+ export interface DocumentSetMetadata {
23
+ documentSetId: number;
24
+ name: string;
25
+ uploadDate: Date;
26
+ parameters: Record<string, unknown>;
27
+ totalDocuments: number;
28
+ }
29
+ export interface DocumentSetParams {
30
+ datasetName: string;
31
+ description: string;
32
+ textColumns: string[];
33
+ metadataColumns: string[];
34
+ splitIntoSentences: boolean;
35
+ combineSentencesIntoChunks: boolean;
36
+ sploderMaxSize: number;
37
+ chunkSize: number;
38
+ chunkOverlap: number;
39
+ modelName: string;
40
+ modelProvider: string;
41
+ }
42
+ export interface EmbeddingConfig {
43
+ modelName: string;
44
+ modelProvider: string;
45
+ vectorStoreType: "simple" | "postgres" | "weaviate";
46
+ projectName: string;
47
+ storagePath: string;
48
+ splitIntoSentences: boolean;
49
+ combineSentencesIntoChunks: boolean;
50
+ sploderMaxSize: number;
51
+ chunkSize: number;
52
+ chunkOverlap: number;
53
+ }
54
+ export interface Settings {
55
+ openAIKey: string | null;
56
+ oLlamaBaseURL: string | null;
57
+ azureOpenAIKey: string | null;
58
+ azureOpenAIEndpoint: string | null;
59
+ azureOpenAIApiVersion: string | null;
60
+ mistralApiKey: string | null;
61
+ geminiApiKey: string | null;
62
+ }
63
+ export interface MetadataFilter {
64
+ key: string;
65
+ operator: "==" | "in" | ">" | "<" | "!=" | ">=" | "<=" | "nin" | "any" | "all" | "text_match" | "contains" | "is_empty";
66
+ value: any;
67
+ }
68
+ export interface Clients {
69
+ weaviateClient: any;
70
+ postgresClient: any;
71
+ }
@@ -0,0 +1,89 @@
1
+ export interface SearchResult {
2
+ text: string;
3
+ score: number;
4
+ metadata: Record<string, any>;
5
+ }
6
+
7
+ export interface EmbeddingResult {
8
+ success: boolean;
9
+ error?: string;
10
+ index?: any;
11
+ }
12
+
13
+ export interface PreviewResult {
14
+ success: boolean;
15
+ error?: string;
16
+ nodes?: Array<{
17
+ text: string;
18
+ metadata: Record<string, any>;
19
+ }>;
20
+ estimatedPrice?: number;
21
+ tokenCount?: number;
22
+ pricePer1M?: number;
23
+ }
24
+
25
+ // Type definitions for meaningfully core
26
+ export interface SearchConfig {
27
+ modelProvider: string
28
+ modelName: string
29
+ projectName: string
30
+ }
31
+
32
+ // Define types for our document set metadata
33
+ export interface DocumentSetMetadata {
34
+ documentSetId: number;
35
+ name: string;
36
+ uploadDate: Date;
37
+ parameters: Record<string, unknown>;
38
+ totalDocuments: number;
39
+ }
40
+
41
+ export interface DocumentSetParams {
42
+ datasetName: string,
43
+ description: string,
44
+ textColumns: string[],
45
+ metadataColumns: string[],
46
+ splitIntoSentences: boolean,
47
+ combineSentencesIntoChunks: boolean,
48
+ sploderMaxSize: number,
49
+ chunkSize: number,
50
+ chunkOverlap: number,
51
+ modelName: string,
52
+ modelProvider: string
53
+ }
54
+
55
+
56
+ export interface EmbeddingConfig {
57
+ modelName: string;
58
+ modelProvider: string
59
+ vectorStoreType: "simple" | "postgres" | "weaviate";
60
+ projectName: string;
61
+ storagePath: string;
62
+ splitIntoSentences: boolean;
63
+ combineSentencesIntoChunks: boolean;
64
+ sploderMaxSize: number;
65
+ chunkSize: number;
66
+ chunkOverlap: number;
67
+ }
68
+
69
+
70
+ export interface Settings {
71
+ openAIKey: string | null;
72
+ oLlamaBaseURL: string | null;
73
+ azureOpenAIKey: string | null;
74
+ azureOpenAIEndpoint: string | null;
75
+ azureOpenAIApiVersion: string | null;
76
+ mistralApiKey: string | null;
77
+ geminiApiKey: string | null;
78
+ }
79
+
80
+ export interface MetadataFilter{
81
+ key: string,
82
+ operator: "==" | "in" | ">" | "<" | "!=" | ">=" | "<=" | "nin" | "any" | "all" | "text_match" | "contains" | "is_empty",
83
+ value: any
84
+ }
85
+
86
+ export interface Clients {
87
+ weaviateClient: any;
88
+ postgresClient: any;
89
+ }