@meaningfully/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/.nvmrc +1 -0
  2. package/LICENSE +7 -0
  3. package/README.md +3 -0
  4. package/dist/DocumentSetManager.d.ts +28 -0
  5. package/dist/DocumentSetManager.d.ts.map +1 -0
  6. package/dist/DocumentSetManager.js +134 -0
  7. package/dist/DocumentSetManager.js.map +1 -0
  8. package/dist/Meaningfully.d.ts +52 -0
  9. package/dist/Meaningfully.d.ts.map +1 -0
  10. package/dist/Meaningfully.js +206 -0
  11. package/dist/Meaningfully.js.map +1 -0
  12. package/dist/MetadataManager.d.ts +32 -0
  13. package/dist/MetadataManager.d.ts.map +1 -0
  14. package/dist/MetadataManager.js +115 -0
  15. package/dist/MetadataManager.js.map +1 -0
  16. package/dist/api/embedding.d.ts +7 -0
  17. package/dist/api/embedding.d.ts.map +1 -0
  18. package/dist/api/embedding.js +94 -0
  19. package/dist/api/embedding.js.map +1 -0
  20. package/dist/api/embedding.test.d.ts +2 -0
  21. package/dist/api/embedding.test.d.ts.map +1 -0
  22. package/dist/api/embedding.test.js +340 -0
  23. package/dist/api/embedding.test.js.map +1 -0
  24. package/dist/index.d.ts +5 -0
  25. package/dist/index.d.ts.map +1 -0
  26. package/dist/index.js +6 -0
  27. package/dist/index.js.map +1 -0
  28. package/dist/services/batchingWeaviateVectorStore.d.ts +6 -0
  29. package/dist/services/batchingWeaviateVectorStore.d.ts.map +1 -0
  30. package/dist/services/batchingWeaviateVectorStore.js +21 -0
  31. package/dist/services/batchingWeaviateVectorStore.js.map +1 -0
  32. package/dist/services/csvLoader.d.ts +3 -0
  33. package/dist/services/csvLoader.d.ts.map +1 -0
  34. package/dist/services/csvLoader.js +18 -0
  35. package/dist/services/csvLoader.js.map +1 -0
  36. package/dist/services/csvLoader.test.d.ts +2 -0
  37. package/dist/services/csvLoader.test.d.ts.map +1 -0
  38. package/dist/services/csvLoader.test.js +75 -0
  39. package/dist/services/csvLoader.test.js.map +1 -0
  40. package/dist/services/embeddings.d.ts +22 -0
  41. package/dist/services/embeddings.d.ts.map +1 -0
  42. package/dist/services/embeddings.js +314 -0
  43. package/dist/services/embeddings.js.map +1 -0
  44. package/dist/services/embeddings.test.d.ts +2 -0
  45. package/dist/services/embeddings.test.d.ts.map +1 -0
  46. package/dist/services/embeddings.test.js +115 -0
  47. package/dist/services/embeddings.test.js.map +1 -0
  48. package/dist/services/loggingOpenAIEmbedding.d.ts +2 -0
  49. package/dist/services/loggingOpenAIEmbedding.d.ts.map +1 -0
  50. package/dist/services/loggingOpenAIEmbedding.js +41 -0
  51. package/dist/services/loggingOpenAIEmbedding.js.map +1 -0
  52. package/dist/services/mockEmbedding.d.ts +6 -0
  53. package/dist/services/mockEmbedding.d.ts.map +1 -0
  54. package/dist/services/mockEmbedding.js +14 -0
  55. package/dist/services/mockEmbedding.js.map +1 -0
  56. package/dist/services/progressManager.d.ts +21 -0
  57. package/dist/services/progressManager.d.ts.map +1 -0
  58. package/dist/services/progressManager.js +76 -0
  59. package/dist/services/progressManager.js.map +1 -0
  60. package/dist/services/progressVectorStoreIndex.d.ts +21 -0
  61. package/dist/services/progressVectorStoreIndex.d.ts.map +1 -0
  62. package/dist/services/progressVectorStoreIndex.js +60 -0
  63. package/dist/services/progressVectorStoreIndex.js.map +1 -0
  64. package/dist/services/sentenceSplitter.d.ts +17 -0
  65. package/dist/services/sentenceSplitter.d.ts.map +1 -0
  66. package/dist/services/sentenceSplitter.js +207 -0
  67. package/dist/services/sentenceSplitter.js.map +1 -0
  68. package/dist/services/sentenceSplitter.test.d.ts +2 -0
  69. package/dist/services/sentenceSplitter.test.d.ts.map +1 -0
  70. package/dist/services/sentenceSplitter.test.js +68 -0
  71. package/dist/services/sentenceSplitter.test.js.map +1 -0
  72. package/dist/services/sploder.d.ts +13 -0
  73. package/dist/services/sploder.d.ts.map +1 -0
  74. package/dist/services/sploder.js +45 -0
  75. package/dist/services/sploder.js.map +1 -0
  76. package/dist/types/index.d.ts +77 -0
  77. package/dist/types/index.d.ts.map +1 -0
  78. package/dist/types/index.js +2 -0
  79. package/dist/types/index.js.map +1 -0
  80. package/dist/utils.d.ts +3 -0
  81. package/dist/utils.d.ts.map +1 -0
  82. package/dist/utils.js +7 -0
  83. package/dist/utils.js.map +1 -0
  84. package/package.json +43 -0
  85. package/src/Meaningfully.d.ts +57 -0
  86. package/src/Meaningfully.ts +228 -0
  87. package/src/MetadataManager.d.ts +27 -0
  88. package/src/MetadataManager.ts +145 -0
  89. package/src/api/embedding.d.ts +6 -0
  90. package/src/api/embedding.ts +122 -0
  91. package/src/index.ts +5 -0
  92. package/src/services/batchingWeaviateVectorStore.d.ts +5 -0
  93. package/src/services/batchingWeaviateVectorStore.ts +23 -0
  94. package/src/services/csvLoader.d.ts +2 -0
  95. package/src/services/csvLoader.ts +24 -0
  96. package/src/services/embeddings.d.ts +21 -0
  97. package/src/services/embeddings.ts +374 -0
  98. package/src/services/loggingOpenAIEmbedding.d.ts +0 -0
  99. package/src/services/loggingOpenAIEmbedding.ts +46 -0
  100. package/src/services/mockEmbedding.d.ts +5 -0
  101. package/src/services/mockEmbedding.ts +13 -0
  102. package/src/services/progressManager.d.ts +20 -0
  103. package/src/services/progressManager.ts +88 -0
  104. package/src/services/progressVectorStoreIndex.d.ts +20 -0
  105. package/src/services/progressVectorStoreIndex.ts +95 -0
  106. package/src/services/sentenceSplitter.d.ts +16 -0
  107. package/src/services/sentenceSplitter.ts +243 -0
  108. package/src/services/sploder.d.ts +12 -0
  109. package/src/services/sploder.ts +62 -0
  110. package/src/types/index.d.ts +71 -0
  111. package/src/types/index.ts +89 -0
  112. package/src/utils.d.ts +2 -0
  113. package/src/utils.ts +6 -0
  114. package/tests/MetadataManager.test.ts +120 -0
  115. package/tests/csvLoader.test.d.ts +1 -0
  116. package/tests/csvLoader.test.ts +88 -0
  117. package/tests/embedding.test.d.ts +1 -0
  118. package/tests/embedding.test.ts +425 -0
  119. package/tests/embeddings.test.d.ts +1 -0
  120. package/tests/embeddings.test.ts +144 -0
  121. package/tests/sentenceSplitter.test.d.ts +1 -0
  122. package/tests/sentenceSplitter.test.ts +81 -0
  123. package/tsconfig.json +31 -0
  124. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,122 @@
1
+ import { transformDocumentsToNodes, estimateCost, searchDocuments, getExistingVectorStoreIndex, persistNodes, persistDocuments, getExistingDocStore } from "../services/embeddings.js";
2
+ import type { EmbeddingConfig, EmbeddingResult, SearchResult, PreviewResult, Settings, MetadataFilter, Clients } from "../types/index.js";
3
+ import { loadDocumentsFromCsv } from "../services/csvLoader.js";
4
+ import { MetadataMode } from "llamaindex";
5
+ import { ProgressManager } from "../services/progressManager.js";
6
+
7
/**
 * Loads a CSV, turns its text column into nodes, embeds and persists those nodes,
 * and persists the source documents alongside them.
 *
 * Progress is reported through the shared ProgressManager under an
 * `embed-<timestamp>` operation id: 5% once the CSV is loaded, then 5-95% while
 * nodes are being persisted, then the operation is marked complete.
 *
 * Failures never throw to the caller; they are reported as
 * `{ success: false, error }`. On every failure path the progress operation is
 * cleared and the run-time timer is ended, so a failed run does not leave a
 * dangling operation or an open `console.time` label behind.
 *
 * @param csvPath path of the CSV file to load
 * @param textColumnName name of the CSV column holding the text to embed
 * @param config embedding configuration (model, chunking, storage)
 * @param settings API keys / endpoints passed through to the persistence helpers
 * @param clients vector-store clients passed through to the persistence helpers
 * @returns `{ success: true, index }` on success, `{ success: false, error }` otherwise
 */
export async function createEmbeddings(
  csvPath: string,
  textColumnName: string,
  config: EmbeddingConfig,
  settings: Settings,
  clients: Clients
): Promise<EmbeddingResult> {
  const timerLabel = "createEmbeddings Run Time";
  // Set only while an operation is registered and not yet finished, so the
  // catch block knows whether there is anything to clear.
  let activeOperationId: string | undefined;

  console.time(timerLabel);
  try {
    const operationId = `embed-${Date.now()}`;
    const progressManager = ProgressManager.getInstance();
    progressManager.startOperation(operationId, 100);
    activeOperationId = operationId;

    const documents = await loadDocumentsFromCsv(csvPath, textColumnName);
    if (documents.length === 0) {
      activeOperationId = undefined;
      progressManager.clearOperation(operationId);
      return {
        success: false,
        error: "That CSV does not appear to contain any documents. Please check the file and try again.",
      };
    }

    progressManager.updateProgress(operationId, 5);

    const nodes = await transformDocumentsToNodes(documents, config);

    // Nodes (embedding + vector store) and raw documents (doc store) are persisted concurrently.
    const [index] = await Promise.all([
      persistNodes(nodes, config, settings, clients, (progress, total) => {
        const percentage = Math.floor((progress / total) * 90) + 5; // Map to 5-95% of total progress
        progressManager.updateProgress(operationId, percentage);
      }),
      persistDocuments(documents, config, settings, clients),
    ]);

    progressManager.completeOperation(operationId);
    activeOperationId = undefined;
    return {
      success: true,
      index,
    };
  } catch (error) {
    // Mirror the empty-CSV path: a failed run must not leave its operation registered.
    if (activeOperationId !== undefined) {
      ProgressManager.getInstance().clearOperation(activeOperationId);
    }
    return {
      success: false,
      error: error instanceof Error ? error.message : "Unknown error occurred",
    };
  } finally {
    // Always close the timer; otherwise the next call warns that the label already exists.
    console.timeEnd(timerLabel);
  }
}
55
+
56
+ // TODO: rename this to be parallel to createEmbeddings
57
/**
 * Produces a preview of how a CSV would be chunked, plus a cost estimate for
 * embedding the whole file. Nothing is embedded or persisted.
 *
 * The cost estimate is computed over the nodes of ALL documents; the returned
 * `nodes` come from a window of up to 10 documents.
 *
 * @param csvPath path of the CSV file to load
 * @param textColumnName name of the CSV column holding the text to embed
 * @param config embedding configuration (chunking options and model name)
 * @returns preview nodes with `estimatedPrice`, `tokenCount` and `pricePer1M`,
 *          or `{ success: false, error }` if loading/transforming fails
 */
export async function previewResults(
  csvPath: string,
  textColumnName: string,
  config: EmbeddingConfig
): Promise<PreviewResult> {
  try {
    const documents = await loadDocumentsFromCsv(csvPath, textColumnName);
    if (documents.length === 0) {
      return {
        success: false,
        error: "That CSV does not appear to contain any documents. Please check the file and try again.",
      };
    }

    // Take 10 rows from the middle of the dataset for preview.
    // We take a consistent 10 so that previews are comparable across settings
    // (a larger chunk size gives fewer, longer results; a smaller one gives more, shorter ones),
    // and we take from the middle because the initial rows may be idiosyncratic.
    // The start is clamped so that datasets with fewer than 20 rows still yield
    // 10 rows (or all rows, if there are fewer than 10) instead of only the
    // rows after the midpoint.
    const previewSize = 10;
    const midpoint = Math.floor(documents.length / 2);
    const windowStart = Math.min(midpoint, Math.max(0, documents.length - previewSize));
    const previewDocumentsSubset = documents.slice(windowStart, windowStart + previewSize);

    const previewNodes = await transformDocumentsToNodes(documents, config);
    const previewSubsetNodes = await transformDocumentsToNodes(previewDocumentsSubset, config);
    const { estimatedPrice, tokenCount, pricePer1M } = estimateCost(previewNodes, config.modelName);

    return {
      success: true,
      nodes: previewSubsetNodes.map((node: any) => ({
        text: node.text,
        metadata: node.metadata,
      })),
      estimatedPrice,
      tokenCount,
      pricePer1M,
    };
  } catch (error) {
    return {
      success: false,
      error: error instanceof Error ? error.message : "Unknown error occurred",
    };
  }
}
99
+
100
/**
 * Returns the document store for the project described by `config`.
 * Thin wrapper that delegates to getExistingDocStore.
 */
export async function getDocStore(config: EmbeddingConfig) {
  const docStore = await getExistingDocStore(config);
  return docStore;
}
103
+
104
/**
 * Returns the vector store index for the project described by `config`.
 * Thin wrapper that delegates to getExistingVectorStoreIndex.
 */
export async function getIndex(config: EmbeddingConfig, settings: Settings, clients: Clients) {
  const vectorStoreIndex = await getExistingVectorStoreIndex(config, settings, clients);
  return vectorStoreIndex;
}
107
+
108
/**
 * Runs a similarity search against `index` and flattens each hit into a SearchResult.
 *
 * @param index index to query (passed straight to searchDocuments)
 * @param query free-text query
 * @param numResults maximum number of hits to request (default 10)
 * @param filters optional metadata filters
 * @returns one entry per hit: node content without metadata, score (0 when the
 *          hit has none), node metadata, and the id of the node's SOURCE relationship if any
 */
export async function search(
  index: any,
  query: string,
  numResults: number = 10,
  filters?: MetadataFilter[]
): Promise<SearchResult[]> {
  const hits: any[] = await searchDocuments(index, query, numResults, filters);
  const formatted: SearchResult[] = [];
  for (const hit of hits) {
    formatted.push({
      text: hit.node.getContent(MetadataMode.NONE),
      score: hit.score ?? 0,
      metadata: hit.node.metadata,
      // @ts-ignore
      sourceNodeId: hit.node.relationships?.SOURCE?.nodeId
    });
  }
  return formatted;
}
package/src/index.ts ADDED
@@ -0,0 +1,5 @@
1
+ export * from './Meaningfully.js';
2
+ export * from './MetadataManager.js';
3
+ export * from './services/progressManager.js';
4
+ export * from './types/index.js';
5
+ // src/index.ts
@@ -0,0 +1,5 @@
1
+ import { WeaviateVectorStore } from '@llamaindex/weaviate';
2
+ import { BaseNode } from 'llamaindex';
3
+ export declare class BatchingWeaviateVectorStore extends WeaviateVectorStore { // WeaviateVectorStore subclass that overrides add()
4
+ add(nodes: BaseNode[]): Promise<string[]>; // takes the nodes to store and resolves to a string[] (presumably the stored node ids; confirm against WeaviateVectorStore.add)
5
+ }
@@ -0,0 +1,23 @@
1
+ import { WeaviateVectorStore } from '@llamaindex/weaviate';
2
+ import { BaseNode } from 'llamaindex';
3
+
4
+ /*
5
+
6
+ Patched version of WeaviateVectorStore to handle large batches by splitting into smaller chunks.
7
+
8
+ When I loaded a large-ish (5.4MB) spreadsheet, I got a Weaviate error about trying to load too much data at once.
9
+
10
+ */
11
+
12
/**
 * WeaviateVectorStore whose add() feeds the parent implementation at most
 * 100 nodes per call, one call after another, and returns the concatenated results.
 */
export class BatchingWeaviateVectorStore extends WeaviateVectorStore {
  async add(nodes: BaseNode[]): Promise<string[]> {
    const batchSize = 100; // maximum number of nodes handed to the parent per call
    const collected: string[] = [];
    let offset = 0;
    while (offset < nodes.length) {
      // Batches are sent sequentially; each one is awaited before the next starts.
      const chunkResults = await super.add(nodes.slice(offset, offset + batchSize));
      collected.push(...chunkResults);
      offset += batchSize;
    }
    return collected;
  }
}
@@ -0,0 +1,2 @@
1
+ import { Document } from "llamaindex";
2
+ export declare function loadDocumentsFromCsv(filePath: string, textColumnName: string): Promise<Document[]>; // filePath: CSV file to read; textColumnName: column whose value becomes each Document's text; resolves to one Document per parsed row
@@ -0,0 +1,24 @@
1
+ import { Document } from "llamaindex";
2
+ import { readFileSync } from "fs";
3
+ import Papa from "papaparse";
4
+
5
/**
 * Reads a CSV file (synchronously, as UTF-8) and converts every parsed row into a Document.
 *
 * The first row is treated as the header and empty lines are skipped.
 * The value in `textColumnName` becomes the document text; every other column
 * becomes a metadata entry, with null/undefined values replaced by "".
 *
 * @param filePath path of the CSV file
 * @param textColumnName header name of the column that holds the text
 * @returns one Document per parsed row, in file order
 */
export async function loadDocumentsFromCsv(
  filePath: string,
  textColumnName: string
): Promise<Document[]> {
  const csvText = readFileSync(filePath, "utf-8");
  const parsed = Papa.parse(csvText, { header: true, skipEmptyLines: true });

  const documents: Document[] = [];
  for (const row of parsed.data as any[]) {
    // Split the row into the text cell and "everything else".
    const { [textColumnName]: text, ...otherColumns } = row;
    const metadataEntries = Object.entries(otherColumns).map(
      ([column, value]) => [column, value ?? ""]
    );
    documents.push(
      new Document({
        text,
        metadata: Object.fromEntries(metadataEntries),
      })
    );
  }
  return documents;
}
@@ -0,0 +1,21 @@
1
+ import { Document, VectorStoreIndex, TextNode, StorageContext } from "llamaindex";
2
+ import { OllamaEmbedding } from '@llamaindex/ollama';
3
+ import { MistralAIEmbedding } from '@llamaindex/mistral';
4
+ import { GeminiEmbedding } from '@llamaindex/google';
5
+ import { MockEmbedding } from "./mockEmbedding";
6
+ import { EmbeddingConfig, Settings, MetadataFilter, Clients } from "../types";
7
+ import { OpenAIEmbedding } from "@llamaindex/openai";
8
+ import { ProgressVectorStoreIndex } from "./progressVectorStoreIndex";
9
+ export declare function estimateCost(nodes: TextNode[], modelName: string): { // estimates the embedding cost of `nodes` for the model named `modelName`
10
+ estimatedPrice: number; // tokenCount * pricePer1M / 1,000,000
11
+ tokenCount: number; // total tokens counted across all node texts
12
+ pricePer1M: number; // price per one million tokens used for the estimate (0 when the model has no known price)
13
+ };
14
+ export declare function getExistingVectorStoreIndex(config: EmbeddingConfig, settings: Settings, clients: Clients): Promise<VectorStoreIndex>; // builds an index over the store selected by config.vectorStoreType
15
+ export declare function getExistingDocStore(config: EmbeddingConfig): Promise<import("llamaindex").BaseDocumentStore>; // doc store of the storage context under the project's persist directory
16
+ export declare function transformDocumentsToNodes(documents: Document[], config: EmbeddingConfig): Promise<TextNode<import("llamaindex").Metadata>[]>; // splits documents into nodes (no embedding); documents with empty text are dropped
17
+ export declare function getEmbedModel(config: EmbeddingConfig, settings: Settings): MockEmbedding | OpenAIEmbedding | OllamaEmbedding | MistralAIEmbedding | GeminiEmbedding; // embedding model chosen by config.modelProvider
18
+ export declare function getStorageContext(config: EmbeddingConfig, settings: Settings, clients: Clients): Promise<StorageContext>; // storage context used when writing a project
19
+ export declare function persistDocuments(documents: Document[], config: EmbeddingConfig, settings: Settings, clients: Clients): Promise<void>; // adds documents to the doc store and writes it to doc_store.json
20
+ export declare function persistNodes(nodes: TextNode[], config: EmbeddingConfig, settings: Settings, clients: Clients, progressCallback?: (progress: number, total: number) => void): Promise<ProgressVectorStoreIndex>; // embeds nodes into the vector store; progressCallback is forwarded to ProgressVectorStoreIndex.init
21
+ export declare function searchDocuments(index: VectorStoreIndex, query: string, numResults?: number, filters?: MetadataFilter[]): Promise<import("llamaindex").NodeWithScore<import("llamaindex").Metadata>[]>; // retrieves up to numResults (default 10) hits for query, optionally restricted by metadata filters
@@ -0,0 +1,374 @@
1
+ import {
2
+ Document,
3
+ VectorStoreIndex,
4
+ // OpenAIEmbedding,
5
+ IngestionPipeline,
6
+ TransformComponent,
7
+ TextNode,
8
+ ModalityType,
9
+ type MetadataFilters,
10
+ storageContextFromDefaults,
11
+ SimpleVectorStore,
12
+ type StorageContext,
13
+ Settings as LlamaindexSettings,
14
+ SimpleDocumentStore
15
+ } from "llamaindex";
16
+ import { OllamaEmbedding} from '@llamaindex/ollama'
17
+ import { MistralAIEmbedding, MistralAIEmbeddingModelType } from '@llamaindex/mistral'
18
+ import { GeminiEmbedding } from '@llamaindex/google'
19
+ import { PGVectorStore } from '@llamaindex/postgres';
20
+ import { AzureOpenAIEmbedding } from "@llamaindex/azure";
21
+ import { Sploder } from "./sploder.js";
22
+ import { CustomSentenceSplitter } from "./sentenceSplitter.js";
23
+ import { MockEmbedding } from "./mockEmbedding.js";
24
+ import { encodingForModel, type TiktokenModel } from "js-tiktoken";
25
+ import { join } from "path";
26
+ import type { EmbeddingConfig, Settings, MetadataFilter, Clients } from "../types/index.js";
27
+ import { sanitizeProjectName, capitalizeFirstLetter } from "../utils.js";
28
+ import * as fs from 'fs';
29
+ import { OpenAIEmbedding } from "@llamaindex/openai";
30
+ import { BatchingWeaviateVectorStore } from "./batchingWeaviateVectorStore.js";
31
+ import { ProgressVectorStoreIndex } from "./progressVectorStoreIndex.js";
32
+
33
+ // unused, but probably eventually will be used.
34
+ // to be used by the postgres store, which it's looking increasingly like I have to enable again
35
/**
 * Embedding vector length per model name.
 *
 * Looked up when a PGVectorStore is constructed, where a model missing from
 * this table falls back to 1536. Every model that can be selected should
 * therefore be listed, otherwise the table is created with the wrong width.
 */
const MODEL_DIMENSIONS: Record<string, number> = {
  "text-embedding-3-small": 1536,
  "text-embedding-3-large": 3072,
  "mxbai-embed-large": 1024,
  "nomic-embed-text": 768, // priced in PRICE_PER_1M but previously missing here, so it silently fell back to 1536
  "mistral-embed": 1024,
  // NOTE(review): getEmbedModel does not pass config.modelName to GeminiEmbedding, so this
  // value has to match that class's default model rather than the name used as key here. Confirm.
  "gemini-embedding-001": 768, // Gemini embedding model
};
42
+
43
/**
 * Price in US dollars per one million tokens, keyed by embedding model name.
 * A model that is absent from this table is treated as free by estimateCost.
 */
const PRICE_PER_1M: Record<string, number> = {
  // hosted, paid
  "text-embedding-3-small": 0.02,
  "text-embedding-3-large": 0.13,
  "mistral-embed": 0.1,

  // hosted, currently free
  // (Gemini embedding costs $0.15/million tokens if you're on the paid tier)
  "gemini-embedding-001": 0,

  // local models, free
  "mxbai-embed-large": 0,
  "nomic-embed-text": 0,
};
51
+
52
+
53
+ /* all transformations except the embedding step (which is handled by VectorStoreIndex.init) */
54
/**
 * Builds the node transformations that run before embedding: always a
 * CustomSentenceSplitter configured from `config.chunkSize` / `config.chunkOverlap`,
 * followed by a Sploder (limited to `config.sploderMaxSize` tokens) when
 * `config.combineSentencesIntoChunks` is set.
 */
function getBaseTransformations(config: EmbeddingConfig){
  const { chunkSize, chunkOverlap } = config;
  const sentenceSplitter = new CustomSentenceSplitter({ chunkSize, chunkOverlap });

  const steps: TransformComponent[] = config.combineSentencesIntoChunks
    ? [sentenceSplitter, new Sploder({ maxStringTokenCount: config.sploderMaxSize })]
    : [sentenceSplitter];

  return steps;
}
69
+
70
/**
 * Estimates what it would cost to embed `nodes` with the model `modelName`.
 *
 * Tokens are counted with the tiktoken encoding for `modelName`. When no such
 * encoding exists (e.g. for Ollama models) the "text-embedding-3-small"
 * encoding is used instead and a warning is logged, so the count is then only
 * an approximation.
 *
 * @param nodes nodes whose `text` is tokenized
 * @param modelName embedding model name, used for both the tokenizer and the price lookup
 * @returns the total token count, the per-million-token price (0 for unknown or
 *          free models) and their product scaled to dollars
 */
export function estimateCost(nodes: TextNode[], modelName: string): {
  estimatedPrice: number;
  tokenCount: number;
  pricePer1M: number;
} {
  const pricePer1M = PRICE_PER_1M[modelName] || 0; // default to 0 if model not found or free

  let encoder: ReturnType<typeof encodingForModel>;
  try {
    encoder = encodingForModel(modelName as TiktokenModel); // This doesn't work for ollama
  } catch {
    // tiktoken does not know this model (likely a local or non-OpenAI one):
    // fall back to a known tokenizer to get a rough token count.
    encoder = encodingForModel("text-embedding-3-small");
    console.warn(`Tokenizer for model ${modelName} not found. Using fallback tokenizer.`);
  }

  let tokenCount = 0;
  for (const node of nodes) {
    tokenCount += encoder.encode(node.text).length;
  }

  const estimatedPrice = tokenCount * (pricePer1M / 1_000_000);
  return { estimatedPrice, tokenCount, pricePer1M };
}
98
+
99
/**
 * Opens a VectorStoreIndex over the store selected by `config.vectorStoreType`.
 *
 * - "simple": loads the storage context from `<storagePath>/<sanitized project name>`
 *   and attaches the embed model to the index.
 * - "postgres": requires `clients.postgresClient`; the store itself connects with
 *   the POSTGRES_CONNECTION_STRING environment variable.
 * - "weaviate": requires `clients.weaviateClient`.
 *
 * @throws Error when the required client is missing or the store type is unsupported
 */
export async function getExistingVectorStoreIndex(config: EmbeddingConfig, settings: Settings, clients: Clients) {
  const embedModel = getEmbedModel(config, settings);

  switch (config.vectorStoreType) {
    case "simple": {
      const projectDir = join(config.storagePath, sanitizeProjectName(config.projectName));
      const simpleContext = await storageContextFromDefaults({ persistDir: projectDir });
      const simpleIndex = await VectorStoreIndex.init({ storageContext: simpleContext });
      simpleIndex.embedModel = embedModel;
      return simpleIndex;
    }

    case "postgres": {
      if (!clients.postgresClient) {
        throw new Error("Postgres client required but not provided");
      }
      const postgresStore = new PGVectorStore({
        clientConfig: { connectionString: process.env.POSTGRES_CONNECTION_STRING },
        tableName: sanitizeProjectName(config.projectName),
        dimensions: MODEL_DIMENSIONS[config.modelName] || 1536, // default to 1536 if model not found
        embeddingModel: embedModel
      });
      const postgresContext = await storageContextFromDefaults({
        vectorStores: { [ModalityType.TEXT]: postgresStore },
      });
      return await VectorStoreIndex.init({ storageContext: postgresContext });
    }

    case "weaviate": {
      if (!clients.weaviateClient) {
        throw new Error("Weaviate client required but not provided");
      }
      const store = new BatchingWeaviateVectorStore({
        indexName: capitalizeFirstLetter(sanitizeProjectName(config.projectName)),
        weaviateClient: clients.weaviateClient,
        embeddingModel: embedModel
      });

      // WeaviateVectorStore's getNodeSimilarity looks for a distance, but current weaviate
      // provides a score (it would only read `score` itself for hybrid search).
      // So the private method is replaced with one that returns `score` from the entry metadata.
      // @ts-ignore
      store.getNodeSimilarity = (entry, _similarityKey = "score") => entry.metadata.score;

      return await VectorStoreIndex.fromVectorStore(store);
    }

    default:
      throw new Error(`Unsupported vector store type: ${config.vectorStoreType}`);
  }
}
153
+
154
/**
 * Returns the document store of the storage context loaded from
 * `<storagePath>/<sanitized project name>`.
 *
 * The vector store type is not consulted: the same on-disk location is used
 * for every store type.
 */
export async function getExistingDocStore(config: EmbeddingConfig) {
  const projectDir = join(config.storagePath, sanitizeProjectName(config.projectName));
  const { docStore } = await storageContextFromDefaults({ persistDir: projectDir });
  return docStore;
}
170
+
171
+
172
+
173
/**
 * Splits documents into text nodes using the base transformations for `config`
 * (sentence splitting and, optionally, the sploder). No embedding happens here.
 *
 * Side effect: every input document gets all of its metadata keys listed in
 * `excludedEmbedMetadataKeys`, so only the text is sent to the embedding model.
 * Documents with empty or missing text are left out of the result.
 *
 * @param documents documents to split (mutated as described above)
 * @param config embedding configuration providing the chunking options
 * @returns the nodes produced by the ingestion pipeline
 */
export async function transformDocumentsToNodes(
  documents: Document[],
  config: EmbeddingConfig,
) {
  const totalTimer = "transformDocumentsToNodes Run Time";
  const pipelineTimer = "transformDocumentsToNodes transformDocuments Run Time";
  console.time(totalTimer);

  const transformations = getBaseTransformations(config);

  // llama-index would otherwise include all the metadata in the embedding, which wastes tokens,
  // so everything except the text column is excluded from the embedding.
  documents.forEach((doc) => {
    doc.excludedEmbedMetadataKeys = Object.keys(doc.metadata);
  });

  console.time(pipelineTimer);

  // Empty documents can't be meaningfully embedded, so they are ignored.
  // That might not ultimately be the right solution.
  const nonEmptyDocuments = documents.filter((doc) => doc.text && doc.text.length > 0);

  // Create nodes with sentence splitting and optional sploder
  const ingestion = new IngestionPipeline({ transformations });
  const nodes = (await ingestion.run({ documents: nonEmptyDocuments })) as TextNode[];

  console.timeEnd(pipelineTimer);
  console.timeEnd(totalTimer);
  return nodes;
}
202
+
203
/**
 * Creates the embedding model selected by `config.modelProvider` and also
 * installs it as the global llamaindex `Settings.embedModel`.
 *
 * Supported providers: "openai", "ollama", "azure", "mistral", "gemini", "mock".
 *
 * @param config supplies the provider and (for openai/ollama/azure) the model name
 * @param settings supplies API keys and endpoints
 * @throws Error when a required key/endpoint is missing (azure, mistral, gemini)
 *         or the provider is not supported
 */
export function getEmbedModel(
  config: EmbeddingConfig,
  settings: Settings,
) {
  let embedModel;
  switch (config.modelProvider) {
    case "openai":
      embedModel = new OpenAIEmbedding({ model: config.modelName, apiKey: settings.openAIKey || undefined });
      // batch size kept small because requests are capped at 300,000 tokens summed across all inputs
      embedModel.embedBatchSize = 50;
      break;

    case "ollama":
      embedModel = new OllamaEmbedding({
        model: config.modelName,
        config: { host: settings.oLlamaBaseURL || undefined },
      });
      break;

    case "azure":
      if (!settings.azureOpenAIKey || !settings.azureOpenAIEndpoint) {
        throw new Error("Azure OpenAI API key and endpoint are required for Azure embedding models");
      }
      embedModel = new AzureOpenAIEmbedding({
        model: config.modelName,
        apiKey: settings.azureOpenAIKey,
        endpoint: settings.azureOpenAIEndpoint,
        apiVersion: settings.azureOpenAIApiVersion ?? undefined
      });
      break;

    case "mistral":
      if (!settings.mistralApiKey) {
        throw new Error("Mistral API key is required for Mistral embedding models");
      }
      embedModel = new MistralAIEmbedding({
        model: MistralAIEmbeddingModelType.MISTRAL_EMBED, // only one choice!
        apiKey: settings.mistralApiKey
      });
      break;

    case "gemini":
      if (!settings.geminiApiKey) {
        throw new Error("Gemini API key is required for Gemini embedding models");
      }
      embedModel = new GeminiEmbedding({ apiKey: settings.geminiApiKey });
      embedModel.embedBatchSize = 50;
      break;

    case "mock":
      embedModel = new MockEmbedding();
      break;

    default:
      throw new Error(`Unsupported embedding model provider: ${config.modelProvider}`);
  }

  LlamaindexSettings.embedModel = embedModel;
  return embedModel;
}
249
+
250
/**
 * Builds the storage context used for writing a project: the vector store from
 * createVectorStore, a persist directory of `<storagePath>/<sanitized project name>`,
 * and a fresh in-memory SimpleDocumentStore.
 *
 * `config.storagePath` is created (recursively) if it does not exist yet.
 *
 * Why the explicit doc store: a doc store created with a persist path (the
 * default in storageContextFromDefaults) writes to disk after every put(),
 * which happens 2+ times per document. It is therefore created without a
 * persist path and persisted explicitly once all documents have been added.
 * See https://github.com/jeremybmerrill/meaningfully/issues/52
 */
export async function getStorageContext(config: EmbeddingConfig, settings: Settings, clients: Clients): Promise<StorageContext> {
  const textVectorStore = await createVectorStore(config, settings, clients);
  fs.mkdirSync(config.storagePath, { recursive: true });

  const projectDir = join(config.storagePath, sanitizeProjectName(config.projectName));
  const storageContext = await storageContextFromDefaults({
    persistDir: projectDir,
    vectorStores: { [ModalityType.TEXT]: textVectorStore },
    docStore: new SimpleDocumentStore(),
  });
  return storageContext;
}
268
+
269
/**
 * Adds `documents` to a fresh doc store and writes that store to
 * `<storagePath>/<sanitized project name>/doc_store.json`.
 *
 * The doc store comes from getStorageContext, which creates it without a
 * persist path, so the single explicit persist below is the only disk write.
 */
export async function persistDocuments(documents: Document[], config: EmbeddingConfig, settings: Settings, clients: Clients): Promise<void> {
  const timerLabel = "persistDocuments Run Time";
  console.time(timerLabel);

  const { docStore } = await getStorageContext(config, settings, clients);
  await docStore.addDocuments(documents, true);

  const projectDir = join(config.storagePath, sanitizeProjectName(config.projectName));
  const docStoreFile = join(projectDir, "doc_store.json");
  // @ts-ignore
  await (docStore as SimpleDocumentStore).kvStore.persist(docStoreFile);

  console.timeEnd(timerLabel);
}
281
+
282
/**
 * Embeds `nodes` into the project's vector store and persists the store where
 * that is needed.
 *
 * The embedding itself happens inside ProgressVectorStoreIndex.init, which also
 * receives `progressCallback`. Afterwards PGVectorStore and SimpleVectorStore are
 * persisted explicitly to `<storagePath>/<sanitized project name>/vector_store.json`;
 * the Weaviate store persists on its own, so nothing is done for it.
 *
 * The run-time timer is ended in a `finally` block so that a failed run does
 * not leave the `console.time` label open.
 *
 * @param nodes nodes to embed
 * @param config embedding configuration (store type, storage location, model)
 * @param settings API keys / endpoints for the embedding model
 * @param clients vector-store clients
 * @param progressCallback optional callback forwarded to ProgressVectorStoreIndex.init
 * @returns the index built over the embedded nodes
 * @throws Error when the storage context has no text vector store, or the store
 *         is of a type this function does not know how to persist
 */
export async function persistNodes(nodes: TextNode[], config: EmbeddingConfig, settings: Settings, clients: Clients, progressCallback?: (progress: number, total: number) => void): Promise<ProgressVectorStoreIndex> {
  const timerLabel = "persistNodes Run Time";
  console.time(timerLabel);
  try {
    // Create and configure vector store based on type
    const storageContext = await getStorageContext(config, settings, clients);
    const vectorStore = storageContext.vectorStores[ModalityType.TEXT];
    if (!vectorStore) {
      throw new Error("Vector store is undefined");
    }

    // Create index and embed documents.
    // This is what actually embeds the nodes (even if they already have embeddings).
    const index = await ProgressVectorStoreIndex.init({
      nodes,
      storageContext,
      logProgress: true,
      progressCallback,
    });

    // The storage context does not persist the vector store by itself, so it is done explicitly.
    // The instanceof checks double as type narrowing. vectorStore is known to be defined here,
    // so the former second undefined check was unreachable and has been dropped.
    if (vectorStore instanceof PGVectorStore || vectorStore instanceof SimpleVectorStore) {
      await vectorStore.persist(join(config.storagePath, sanitizeProjectName(config.projectName), "vector_store.json"));
    } else if (vectorStore instanceof BatchingWeaviateVectorStore) {
      // WeaviateVectorStore does not have a persist method, it persists automatically
      console.log("Pretending to persist Weaviate vector store, but it actually persists automatically.");
    } else {
      throw new Error("Vector store does not support persist method");
    }

    return index;
  } finally {
    console.timeEnd(timerLabel);
  }
}
320
+
321
/**
 * Creates the vector store selected by `config.vectorStoreType` for writing.
 *
 * The embedding model has to be passed to each store explicitly; otherwise the
 * store falls back to its own default model.
 *
 * - "postgres": connects with the POSTGRES_CONNECTION_STRING environment variable;
 *   vector width comes from MODEL_DIMENSIONS (1536 when the model is not listed).
 * - "simple": loaded from `<storagePath>/<sanitized project name>`.
 * - "weaviate": requires `clients.weaviateClient`. The check matches the one in
 *   getExistingVectorStoreIndex, so a project cannot be written through a store
 *   that the read path would afterwards refuse to open.
 *
 * @throws Error when the weaviate client is missing or the store type is unsupported
 */
async function createVectorStore(config: EmbeddingConfig, settings: Settings, clients: Clients): Promise<PGVectorStore | SimpleVectorStore | BatchingWeaviateVectorStore> {
  const embeddingModel = getEmbedModel(config, settings);

  // Each case has its own block so its declarations do not leak into sibling cases.
  switch (config.vectorStoreType) {
    case "postgres": {
      return new PGVectorStore({
        clientConfig: { connectionString: process.env.POSTGRES_CONNECTION_STRING },
        tableName: sanitizeProjectName(config.projectName),
        dimensions: MODEL_DIMENSIONS[config.modelName] || 1536, // default to 1536 if model not found
        embeddingModel: embeddingModel
      });
    }

    case "simple": {
      const persistDir = join(config.storagePath, sanitizeProjectName(config.projectName));
      return SimpleVectorStore.fromPersistDir(persistDir, embeddingModel);
    }

    case "weaviate": {
      if (!clients.weaviateClient) {
        throw new Error("Weaviate client required but not provided");
      }
      const vectorStore = new BatchingWeaviateVectorStore({
        indexName: capitalizeFirstLetter(sanitizeProjectName(config.projectName)),
        weaviateClient: clients.weaviateClient,
        embeddingModel: embeddingModel
      });

      // WeaviateVectorStore's getNodeSimilarity looks for a distance, but current weaviate
      // provides a score (it would only read `score` itself for hybrid search).
      // So the private method is replaced with one that returns `score` from the entry metadata.
      // @ts-ignore
      vectorStore.getNodeSimilarity = (entry, _similarityKey = "score") => entry.metadata.score;

      return vectorStore;
    }

    default:
      throw new Error(`Unsupported vector store type: ${config.vectorStoreType}`);
  }
}
359
+
360
/**
 * Retrieves the nodes most similar to `query` from `index`.
 *
 * @param index index to search
 * @param query free-text query
 * @param numResults number of top matches to request (default 10)
 * @param filters optional metadata filters; an empty filter list is sent when omitted
 * @returns the retriever's scored nodes
 */
export async function searchDocuments(
  index: VectorStoreIndex,
  query: string,
  numResults: number = 10,
  filters?: MetadataFilter[]
) {
  // A filters object is always passed, even when there is nothing to filter on.
  const metadataFilters: MetadataFilters = { filters: filters ?? [] };
  const retriever = index.asRetriever({
    similarityTopK: numResults,
    filters: metadataFilters,
  });
  return await retriever.retrieve(query);
}
File without changes
@@ -0,0 +1,46 @@
1
+
2
+ // // temporary
3
+ // // this is a wrapper around OpenAIEmbedding that logs the input of the embedding
4
+ // // it's used to debug the embedding process (to make sure random metadata isn't wrongfully included)
5
+ // // it's not used in the production code
6
+
7
+ // import { OpenAIEmbedding } from "@llamaindex/openai";
8
+ // import type {
9
+ // OpenAI as OpenAILLM,
10
+ // } from "openai";
11
+ // type LLMInstance = Pick<OpenAILLM, "embeddings" | "apiKey" | "baseURL">;
12
+
13
+
14
+ // export class LoggingOpenAIEmbedding extends OpenAIEmbedding {
15
+ // constructor(
16
+ // init?: Omit<Partial<OpenAIEmbedding>, "session"> & {
17
+ // session?: LLMInstance;
18
+ // },
19
+ // ) {
20
+ // super(init);
21
+ // // overwrite private member "getMessage" 🙀
22
+ // (this as any).getOpenAIEmbedding = async function(input: string[]): Promise<number[][]> {
23
+ // // TODO: ensure this for every sub class by calling it in the base class
24
+ // input = this.truncateMaxTokens(input);
25
+
26
+ // console.log("LoggingOpenAIEmbedding input", input);
27
+
28
+ // const { data } = await (
29
+ // await this.session
30
+ // ).embeddings.create(
31
+ // this.dimensions
32
+ // ? {
33
+ // model: this.model,
34
+ // dimensions: this.dimensions, // only sent to OpenAI if set by user
35
+ // input,
36
+ // }
37
+ // : {
38
+ // model: this.model,
39
+ // input,
40
+ // },
41
+ // );
42
+
43
+ // return data.map((d) => d.embedding);
44
+ // }
45
+ // }
46
+ // }
@@ -0,0 +1,5 @@
1
+ import { BaseEmbedding } from "llamaindex";
2
+ export declare class MockEmbedding extends BaseEmbedding { // BaseEmbedding implementation selected by the "mock" model provider
3
+ constructor(); // takes no arguments
4
+ getTextEmbedding(text: string): Promise<number[]>; // resolves to the embedding vector for `text`
5
+ }
@@ -0,0 +1,13 @@
1
+ //@ts-nocheck
2
+ import { BaseEmbedding } from "llamaindex";
3
+
4
/**
 * Embedding model stand-in that performs no real work: every text maps to the
 * same fixed 6-element vector.
 */
export class MockEmbedding extends BaseEmbedding {
  constructor() {
    super();
  }

  /** Ignores `text` and resolves to the constant vector [1, 0, 0, 0, 0, 0]. */
  async getTextEmbedding(text: string): Promise<number[]> {
    const fixedVector = [1, 0, 0, 0, 0, 0];
    return fixedVector;
  }
}
@@ -0,0 +1,20 @@
1
+ /**
2
+ * A simple manager to track progress of various operations
3
+ */
4
+ export declare class ProgressManager {
5
+ private static instance; // holds the instance handed out by getInstance()
6
+ private progressMap; // presumably per-operation progress state keyed by operation id; verify in progressManager.ts
7
+ private currentOperation; // presumably the operation that getCurrentProgress() reports on; verify in progressManager.ts
8
+ private constructor(); // not constructible from outside; use getInstance()
9
+ static getInstance(): ProgressManager; // the only public way to obtain a ProgressManager
10
+ startOperation(operationId: string, total?: number): void; // registers an operation; `total` is optional (default not visible in this declaration)
11
+ updateProgress(operationId: string, progress: number): void; // records the current progress value for the operation
12
+ completeOperation(operationId: string): void; // marks the operation as finished
13
+ getCurrentProgress(): { // snapshot of progress; takes no operation id
14
+ progress: number;
15
+ total: number;
16
+ elapsedTimeMs: number; // milliseconds, per the name
17
+ estimatedTimeRemainingMs: number | null; // null when no estimate is available
18
+ };
19
+ clearOperation(operationId: string): void; // removes the operation's state
20
+ }