@meaningfully/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.nvmrc +1 -0
- package/LICENSE +7 -0
- package/README.md +3 -0
- package/dist/DocumentSetManager.d.ts +28 -0
- package/dist/DocumentSetManager.d.ts.map +1 -0
- package/dist/DocumentSetManager.js +134 -0
- package/dist/DocumentSetManager.js.map +1 -0
- package/dist/Meaningfully.d.ts +52 -0
- package/dist/Meaningfully.d.ts.map +1 -0
- package/dist/Meaningfully.js +206 -0
- package/dist/Meaningfully.js.map +1 -0
- package/dist/MetadataManager.d.ts +32 -0
- package/dist/MetadataManager.d.ts.map +1 -0
- package/dist/MetadataManager.js +115 -0
- package/dist/MetadataManager.js.map +1 -0
- package/dist/api/embedding.d.ts +7 -0
- package/dist/api/embedding.d.ts.map +1 -0
- package/dist/api/embedding.js +94 -0
- package/dist/api/embedding.js.map +1 -0
- package/dist/api/embedding.test.d.ts +2 -0
- package/dist/api/embedding.test.d.ts.map +1 -0
- package/dist/api/embedding.test.js +340 -0
- package/dist/api/embedding.test.js.map +1 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -0
- package/dist/services/batchingWeaviateVectorStore.d.ts +6 -0
- package/dist/services/batchingWeaviateVectorStore.d.ts.map +1 -0
- package/dist/services/batchingWeaviateVectorStore.js +21 -0
- package/dist/services/batchingWeaviateVectorStore.js.map +1 -0
- package/dist/services/csvLoader.d.ts +3 -0
- package/dist/services/csvLoader.d.ts.map +1 -0
- package/dist/services/csvLoader.js +18 -0
- package/dist/services/csvLoader.js.map +1 -0
- package/dist/services/csvLoader.test.d.ts +2 -0
- package/dist/services/csvLoader.test.d.ts.map +1 -0
- package/dist/services/csvLoader.test.js +75 -0
- package/dist/services/csvLoader.test.js.map +1 -0
- package/dist/services/embeddings.d.ts +22 -0
- package/dist/services/embeddings.d.ts.map +1 -0
- package/dist/services/embeddings.js +314 -0
- package/dist/services/embeddings.js.map +1 -0
- package/dist/services/embeddings.test.d.ts +2 -0
- package/dist/services/embeddings.test.d.ts.map +1 -0
- package/dist/services/embeddings.test.js +115 -0
- package/dist/services/embeddings.test.js.map +1 -0
- package/dist/services/loggingOpenAIEmbedding.d.ts +2 -0
- package/dist/services/loggingOpenAIEmbedding.d.ts.map +1 -0
- package/dist/services/loggingOpenAIEmbedding.js +41 -0
- package/dist/services/loggingOpenAIEmbedding.js.map +1 -0
- package/dist/services/mockEmbedding.d.ts +6 -0
- package/dist/services/mockEmbedding.d.ts.map +1 -0
- package/dist/services/mockEmbedding.js +14 -0
- package/dist/services/mockEmbedding.js.map +1 -0
- package/dist/services/progressManager.d.ts +21 -0
- package/dist/services/progressManager.d.ts.map +1 -0
- package/dist/services/progressManager.js +76 -0
- package/dist/services/progressManager.js.map +1 -0
- package/dist/services/progressVectorStoreIndex.d.ts +21 -0
- package/dist/services/progressVectorStoreIndex.d.ts.map +1 -0
- package/dist/services/progressVectorStoreIndex.js +60 -0
- package/dist/services/progressVectorStoreIndex.js.map +1 -0
- package/dist/services/sentenceSplitter.d.ts +17 -0
- package/dist/services/sentenceSplitter.d.ts.map +1 -0
- package/dist/services/sentenceSplitter.js +207 -0
- package/dist/services/sentenceSplitter.js.map +1 -0
- package/dist/services/sentenceSplitter.test.d.ts +2 -0
- package/dist/services/sentenceSplitter.test.d.ts.map +1 -0
- package/dist/services/sentenceSplitter.test.js +68 -0
- package/dist/services/sentenceSplitter.test.js.map +1 -0
- package/dist/services/sploder.d.ts +13 -0
- package/dist/services/sploder.d.ts.map +1 -0
- package/dist/services/sploder.js +45 -0
- package/dist/services/sploder.js.map +1 -0
- package/dist/types/index.d.ts +77 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +2 -0
- package/dist/types/index.js.map +1 -0
- package/dist/utils.d.ts +3 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +7 -0
- package/dist/utils.js.map +1 -0
- package/package.json +43 -0
- package/src/Meaningfully.d.ts +57 -0
- package/src/Meaningfully.ts +228 -0
- package/src/MetadataManager.d.ts +27 -0
- package/src/MetadataManager.ts +145 -0
- package/src/api/embedding.d.ts +6 -0
- package/src/api/embedding.ts +122 -0
- package/src/index.ts +5 -0
- package/src/services/batchingWeaviateVectorStore.d.ts +5 -0
- package/src/services/batchingWeaviateVectorStore.ts +23 -0
- package/src/services/csvLoader.d.ts +2 -0
- package/src/services/csvLoader.ts +24 -0
- package/src/services/embeddings.d.ts +21 -0
- package/src/services/embeddings.ts +374 -0
- package/src/services/loggingOpenAIEmbedding.d.ts +0 -0
- package/src/services/loggingOpenAIEmbedding.ts +46 -0
- package/src/services/mockEmbedding.d.ts +5 -0
- package/src/services/mockEmbedding.ts +13 -0
- package/src/services/progressManager.d.ts +20 -0
- package/src/services/progressManager.ts +88 -0
- package/src/services/progressVectorStoreIndex.d.ts +20 -0
- package/src/services/progressVectorStoreIndex.ts +95 -0
- package/src/services/sentenceSplitter.d.ts +16 -0
- package/src/services/sentenceSplitter.ts +243 -0
- package/src/services/sploder.d.ts +12 -0
- package/src/services/sploder.ts +62 -0
- package/src/types/index.d.ts +71 -0
- package/src/types/index.ts +89 -0
- package/src/utils.d.ts +2 -0
- package/src/utils.ts +6 -0
- package/tests/MetadataManager.test.ts +120 -0
- package/tests/csvLoader.test.d.ts +1 -0
- package/tests/csvLoader.test.ts +88 -0
- package/tests/embedding.test.d.ts +1 -0
- package/tests/embedding.test.ts +425 -0
- package/tests/embeddings.test.d.ts +1 -0
- package/tests/embeddings.test.ts +144 -0
- package/tests/sentenceSplitter.test.d.ts +1 -0
- package/tests/sentenceSplitter.test.ts +81 -0
- package/tsconfig.json +31 -0
- package/tsconfig.tsbuildinfo +1 -0
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
//@ts-nocheck
|
|
2
|
+
|
|
3
|
+
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
|
4
|
+
import { Document, TextNode } from 'llamaindex';
|
|
5
|
+
|
|
6
|
+
// First, set up the mock before importing the module
|
|
7
|
+
// Partially mock the embeddings service: the original module's exports are
// spread into the result, and only the functions named below are replaced
// with bare vi.fn() stubs.
vi.mock(import("../src/services/embeddings.js"), async (importOriginal) => {
  const realExports = await importOriginal();
  const stubbedExports = {
    estimateCost: vi.fn(),
    getExistingVectorStoreIndex: vi.fn(),
    persistNodes: vi.fn(),
    persistDocuments: vi.fn(),
    getExistingDocStore: vi.fn(),
    searchDocuments: vi.fn(),
  };
  // Stubs are spread last so they override the real implementations.
  return { ...realExports, ...stubbedExports };
});
|
|
20
|
+
|
|
21
|
+
// Now import the mocked functions
|
|
22
|
+
import { transformDocumentsToNodes, getEmbedModel } from '../src/services/embeddings.js';
|
|
23
|
+
|
|
24
|
+
/**
 * Tests for transformDocumentsToNodes: turning Documents into nodes, dropping
 * documents that have no text, and excluding metadata keys from embedding.
 */
describe('transformDocumentsToNodes', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  // Pipeline configuration shared by every test in this suite
  // (note that modelProvider is 'mock').
  const mockConfig = {
    chunkSize: 100,
    chunkOverlap: 10,
    combineSentencesIntoChunks: true,
    sploderMaxSize: 500,
    modelProvider: 'mock',
    modelName: 'text-embedding-3-small',
    vectorStoreType: "simple" as "simple",
    storagePath: './storage',
    projectName: 'test_project',
    splitIntoSentences: true,
  };

  // Provider credentials/endpoints; only placeholder values are supplied.
  const mockSettings = {
    openAIKey: 'mock-api-key',
    oLlamaBaseURL: 'http://localhost',
    azureOpenAIKey: null,
    azureOpenAIEndpoint: null,
    azureOpenAIApiVersion: null,
    mistralApiKey: null,
    geminiApiKey: null,
  };

  it('should process documents and return nodes', async () => {
    const mockDocuments = [
      new Document({ text: 'Document 1', metadata: { key1: 'value1' } }),
      new Document({ text: 'Document 2', metadata: { key2: 'value2' } }),
    ];
    const mockNodes = [
      new TextNode({ text: 'Document 1' }),
      new TextNode({ text: 'Document 2' }),
    ];

    const result = await transformDocumentsToNodes(mockDocuments, mockConfig, mockSettings);

    // Only the node text is compared.
    expect(result.map((node) => node.text)).toEqual(mockNodes.map((node) => node.text));
  });

  it('should filter out documents with null, undefined, or zero-length text', async () => {
    const mockDocuments = [
      new Document({ text: 'Valid Document', metadata: { key1: 'value1' } }),
      new Document({ text: undefined, metadata: { key3: 'value3' } }),
      new Document({ text: '', metadata: { key4: 'value4' } }),
    ];
    const mockNodes = [new TextNode({ text: 'Valid Document' })];

    const result = await transformDocumentsToNodes(mockDocuments, mockConfig, mockSettings);
    expect(result.map((n) => n.text)).toEqual(mockNodes.map((n) => n.text));

    // TODO: also assert that only the non-empty documents are passed on.
    // A generated `toHaveBeenCalledWith(filteredDocuments, ...)` expectation
    // on transformDocumentsToNodes could not be made to work (apparently you
    // can't spyOn a function that is imported from the same file), so that
    // dead assertion and its unused `filteredDocuments` fixture were removed.
  });

  it('should exclude all metadata keys from embedding', async () => {
    const mockDocuments = [
      new Document({ text: 'Document 1', metadata: { key1: 'value1', key2: 'value2' } }),
    ];

    const nodes = await transformDocumentsToNodes(mockDocuments, mockConfig, mockSettings);
    expect(nodes[0].excludedEmbedMetadataKeys).toEqual(['key1', 'key2']);
  });
});
|
|
95
|
+
|
|
96
|
+
/**
 * Tests for getEmbedModel: supported providers yield a model, and an
 * unsupported provider throws.
 */
describe('getEmbedModel', () => {
  // Baseline configuration; each assertion overrides modelProvider.
  const mockConfig = {
    chunkSize: 100,
    chunkOverlap: 10,
    combineSentencesIntoChunks: true,
    sploderMaxSize: 500,
    modelProvider: 'openai',
    modelName: 'text-embedding-3-small',
    vectorStoreType: "simple" as "simple",
    storagePath: './storage',
    projectName: 'test_project',
    splitIntoSentences: true,
  };

  // Provider credentials/endpoints; only placeholder values are supplied.
  const mockSettings = {
    openAIKey: 'mock-api-key',
    oLlamaBaseURL: 'http://localhost',
    azureOpenAIKey: null,
    azureOpenAIEndpoint: null,
    azureOpenAIApiVersion: null,
    mistralApiKey: null,
    geminiApiKey: null,
  };

  it('should handle different model providers correctly', () => {
    // Requests a model for `modelProvider` on top of the baseline config.
    const buildModel = (modelProvider) =>
      getEmbedModel({ ...mockConfig, modelProvider }, mockSettings);

    // 'ollama' provider
    const ollamaModel = buildModel('ollama');
    expect(ollamaModel).toBeDefined();

    // 'mock' provider
    const mockModel = buildModel('mock');
    expect(mockModel).toBeDefined();

    // An unknown provider must be rejected with a descriptive error.
    expect(() => buildModel('invalid' as any)).toThrow(
      'Unsupported embedding model provider: invalid'
    );
  });
});
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Intentionally empty export: it declares nothing and only makes TypeScript
// treat this file as a module.
export {};
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
//@ts-nocheck
|
|
2
|
+
import { expect, test } from 'vitest'
|
|
3
|
+
import { CustomSentenceSplitter } from '../src/services/sentenceSplitter.js'
|
|
4
|
+
import { SentenceSplitter, IngestionPipeline, Document } from "llamaindex";
|
|
5
|
+
|
|
6
|
+
// do these tests just to make sure that we can factor out my hacky fixes when llamaindex is fixed.
|
|
7
|
+
// test that original sentenceSplitter splits on abbreviations
|
|
8
|
+
// test that original sentenceSplitter splits on abbreviations even when specified
|
|
9
|
+
|
|
10
|
+
// test that my modified sentenceSplitter excludes metadata when arg is specified
|
|
11
|
+
// test that my modified sentenceSplitter includes metadata when arg is specified the other way
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
// Shared fixtures. They are declared with `const` because nothing reassigns
// them, which also guards against accidental rebinding between tests.

// A single paragraph containing several abbreviations ("Co.", "Mr.", "N.A.").
const documents = [
  new Document({ text: "JPMorgan Chase & Co. elected Mark Weinberger as a director, effective January 16, 2024, and the Board of Directors appointed him as a member of the Audit Committee. Mr. Weinberger was Global Chairman and Chief Executive Officer of Ernst & Young from 2013 to 2019. He was also elected a director of JPMorgan Chase Bank, N.A. and a manager of JPMorgan Chase Holdings LLC, and may be elected a director of such other subsidiary or subsidiaries as may be determined from time to time." }),
];

// Pipeline built on llamaindex's stock SentenceSplitter.
const originalSentenceSplitterPipeline = new IngestionPipeline({
  transformations: [
    new SentenceSplitter({ chunkSize: 50, chunkOverlap: 10 }),
  ],
});

// Pipeline built on this project's CustomSentenceSplitter with its default
// abbreviation handling.
const customSentenceSplitterPipeline = new IngestionPipeline({
  transformations: [
    new CustomSentenceSplitter({ chunkSize: 50, chunkOverlap: 10 }),
  ],
});
|
|
29
|
+
|
|
30
|
+
// The callback is async and awaits the pipeline so Vitest waits for the
// assertions. A promise that is neither awaited nor returned would let the
// test finish (and pass) before any expectation ran.
test("my modified sentenceSplitter doesn't eliminate spaces", async () => {
  const nodes = await customSentenceSplitterPipeline.run({ documents: documents });

  // No chunk may contain an abbreviation glued directly to the next word.
  expect(nodes.some((node) => node["text"].includes("Co.elected"))).toEqual(false);
  expect(nodes.some((node) => node["text"].includes("Mr.Weinberger"))).toEqual(false);
  expect(nodes.some((node) => node["text"].includes("A.and"))).toEqual(false);
});
|
|
37
|
+
|
|
38
|
+
// test("original sentenceSplitter does eliminate spaces", () => {
|
|
39
|
+
// originalSentenceSplitterPipeline.run({documents: documents}).then((nodes) => {
|
|
40
|
+
// expect(nodes.some((node) => node["text"].indexOf("Co.elected") > -1)).toEqual(true);
|
|
41
|
+
// expect(nodes.some((node) => node["text"].indexOf("Mr.Weinberger") > -1)).toEqual(true);
|
|
42
|
+
// expect(nodes.some((node) => node["text"].indexOf("A.and") > -1)).toEqual(true);
|
|
43
|
+
// });
|
|
44
|
+
// });
|
|
45
|
+
|
|
46
|
+
// CustomSentenceSplitter pipeline configured with an empty abbreviation list.
// Declared with `const` because it is never reassigned.
const noAbbrevsCustomSentenceSplitterPipeline = new IngestionPipeline({
  transformations: [
    new CustomSentenceSplitter({ chunkSize: 50, chunkOverlap: 10, abbreviations: [] }),
  ],
});
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
// Async/await makes Vitest wait for the pipeline; without it the assertion
// would run after the test had already been reported as passed.
test("my modified sentenceSplitter doesn't split on specified abbreviations", async () => {
  const nodes = await customSentenceSplitterPipeline.run({ documents: documents });

  // No chunk may end in "Mr.", i.e. the splitter must not break after it.
  expect(nodes.map((node) => !!node["text"].match(/Mr\.$/))).not.toContainEqual(true);
});
|
|
58
|
+
|
|
59
|
+
// this is only a problem on branch fix/sentence-splitter-spaces
|
|
60
|
+
// where the chunker is eliminated entirely in favor of just splitting by sentences with natural.
|
|
61
|
+
// Async/await makes Vitest wait for the pipeline, so this expectation is now
// actually enforced instead of running after the test has finished.
test("original sentenceSplitter splits in silly places, like Mr", async () => {
  const nodes = await noAbbrevsCustomSentenceSplitterPipeline.run({ documents: documents });

  // With an empty abbreviation list, at least one chunk should end in "Mr.".
  expect(nodes.map((node) => !!node["text"].match(/Mr\.$/))).toContainEqual(true);
});
|
|
66
|
+
|
|
67
|
+
// [input, expected output] pairs: each input is a single short sentence that
// must come back from the splitter as exactly one, unmodified node.
const testcases = [
  ["USA v. 4227 JENIFER STREET N.W. WASHINGTON, D.C., AND ELECTRONIC DEVICES THEREIN UNDER RULE 41", "USA v. 4227 JENIFER STREET N.W. WASHINGTON, D.C., AND ELECTRONIC DEVICES THEREIN UNDER RULE 41"],
  ["JPMorgan Chase & Co. elected Mark Weinberger as a director, effective January 16, 2024, and the Board of Directors appointed him as a member of the Audit Committee.", "JPMorgan Chase & Co. elected Mark Weinberger as a director, effective January 16, 2024, and the Board of Directors appointed him as a member of the Audit Committee."],
  ["Mr. Weinberger was Global Chairman and Chief Executive Officer of Ernst & Young from 2013 to 2019.", "Mr. Weinberger was Global Chairman and Chief Executive Officer of Ernst & Young from 2013 to 2019."],
  ["He was also elected a director of JPMorgan Chase Bank, N.A. and a manager of JPMorgan Chase Holdings LLC, and may be elected a director of such other subsidiary or subsidiaries as may be determined from time to time.", "He was also elected a director of JPMorgan Chase Bank, N.A. and a manager of JPMorgan Chase Holdings LLC, and may be elected a director of such other subsidiary or subsidiaries as may be determined from time to time."],
];

// Register one test per pair. Each callback is async and awaits the pipeline
// so Vitest waits for the assertions instead of finishing before they run.
for (const [testcase_input, testcase_expected_output] of testcases) {
  test(`my sentenceSplitter correctly handles short sentence ${testcase_input}`, async () => {
    const nodes = await customSentenceSplitterPipeline.run({
      documents: [new Document({ text: testcase_input })],
    });

    expect(nodes.length).toEqual(1);
    expect(nodes[0]["text"]).toEqual(testcase_expected_output);
  });
}
|
package/tsconfig.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"compilerOptions": {
|
|
3
|
+
/* Base Options: */
|
|
4
|
+
"esModuleInterop": true,
|
|
5
|
+
"skipLibCheck": true,
|
|
6
|
+
"target": "es2022",
|
|
7
|
+
"allowJs": true,
|
|
8
|
+
"resolveJsonModule": true,
|
|
9
|
+
"moduleDetection": "force",
|
|
10
|
+
"isolatedModules": true,
|
|
11
|
+
"verbatimModuleSyntax": false,
|
|
12
|
+
|
|
13
|
+
/* Strictness */
|
|
14
|
+
"strict": true,
|
|
15
|
+
"noUncheckedIndexedAccess": true,
|
|
16
|
+
"noImplicitOverride": false,
|
|
17
|
+
|
|
18
|
+
/* If transpiling with TypeScript: */
|
|
19
|
+
"module": "NodeNext",
|
|
20
|
+
"outDir": "dist",
|
|
21
|
+
"rootDir": "src",
|
|
22
|
+
"sourceMap": true,
|
|
23
|
+
|
|
24
|
+
/* AND if you're building for a library: */
|
|
25
|
+
"declaration": true,
|
|
26
|
+
|
|
27
|
+
/* AND if you're building for a library in a monorepo: */
|
|
28
|
+
"declarationMap": true
|
|
29
|
+
},
|
|
30
|
+
"exclude": ["node_modules/natural", "dist/**"]
|
|
31
|
+
}
|