@byted-las/contextlake-openclaw 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82):
  1. package/README.md +64 -0
  2. package/bin/contextlake-openclaw.js +5 -0
  3. package/dist/index.d.ts +113 -0
  4. package/dist/index.js +73 -0
  5. package/dist/src/client/lancedb.d.ts +30 -0
  6. package/dist/src/client/lancedb.js +113 -0
  7. package/dist/src/client/tos.d.ts +19 -0
  8. package/dist/src/client/tos.js +81 -0
  9. package/dist/src/commands/cli.d.ts +6 -0
  10. package/dist/src/commands/cli.js +78 -0
  11. package/dist/src/commands/index.d.ts +1 -0
  12. package/dist/src/commands/index.js +139 -0
  13. package/dist/src/commands/slashcmd.d.ts +14 -0
  14. package/dist/src/commands/slashcmd.js +91 -0
  15. package/dist/src/commands/tools.d.ts +219 -0
  16. package/dist/src/commands/tools.js +286 -0
  17. package/dist/src/lib/actions/ingest.d.ts +8 -0
  18. package/dist/src/lib/actions/ingest.js +123 -0
  19. package/dist/src/lib/actions/manage.d.ts +15 -0
  20. package/dist/src/lib/actions/manage.js +91 -0
  21. package/dist/src/lib/actions/retrieve.d.ts +8 -0
  22. package/dist/src/lib/actions/retrieve.js +73 -0
  23. package/dist/src/processor/loader.d.ts +7 -0
  24. package/dist/src/processor/loader.js +83 -0
  25. package/dist/src/service/embedding/factory.d.ts +2 -0
  26. package/dist/src/service/embedding/factory.js +16 -0
  27. package/dist/src/service/embedding/interface.d.ts +18 -0
  28. package/dist/src/service/embedding/interface.js +2 -0
  29. package/dist/src/service/embedding/local.d.ts +14 -0
  30. package/dist/src/service/embedding/local.js +104 -0
  31. package/dist/src/service/embedding/remote.d.ts +9 -0
  32. package/dist/src/service/embedding/remote.js +42 -0
  33. package/dist/src/service/metadata/factory.d.ts +13 -0
  34. package/dist/src/service/metadata/factory.js +48 -0
  35. package/dist/src/service/metadata/interface.d.ts +17 -0
  36. package/dist/src/service/metadata/interface.js +2 -0
  37. package/dist/src/service/metadata/local.d.ts +13 -0
  38. package/dist/src/service/metadata/local.js +49 -0
  39. package/dist/src/service/storage/factory.d.ts +2 -0
  40. package/dist/src/service/storage/factory.js +19 -0
  41. package/dist/src/service/storage/interface.d.ts +32 -0
  42. package/dist/src/service/storage/interface.js +2 -0
  43. package/dist/src/service/storage/local.d.ts +9 -0
  44. package/dist/src/service/storage/local.js +72 -0
  45. package/dist/src/skills/las-data-profiler/index.d.ts +26 -0
  46. package/dist/src/skills/las-data-profiler/index.js +231 -0
  47. package/dist/src/skills/las-data-profiler/register.d.ts +1 -0
  48. package/dist/src/skills/las-data-profiler/register.js +19 -0
  49. package/dist/src/utils/config.d.ts +1 -0
  50. package/dist/src/utils/config.js +16 -0
  51. package/index.ts +78 -0
  52. package/openclaw.plugin.json +57 -0
  53. package/package.json +52 -0
  54. package/src/client/lancedb.ts +102 -0
  55. package/src/client/tos.ts +100 -0
  56. package/src/commands/cli.ts +77 -0
  57. package/src/commands/index.ts +156 -0
  58. package/src/commands/slashcmd.ts +95 -0
  59. package/src/commands/tools.ts +286 -0
  60. package/src/lib/actions/ingest.ts +103 -0
  61. package/src/lib/actions/manage.ts +107 -0
  62. package/src/lib/actions/retrieve.ts +90 -0
  63. package/src/processor/loader.ts +58 -0
  64. package/src/service/embedding/factory.ts +13 -0
  65. package/src/service/embedding/interface.ts +21 -0
  66. package/src/service/embedding/local.ts +118 -0
  67. package/src/service/embedding/remote.ts +45 -0
  68. package/src/service/metadata/factory.ts +52 -0
  69. package/src/service/metadata/interface.ts +19 -0
  70. package/src/service/metadata/local.ts +60 -0
  71. package/src/service/storage/factory.ts +16 -0
  72. package/src/service/storage/interface.ts +36 -0
  73. package/src/service/storage/local.ts +42 -0
  74. package/src/skills/contextlake-delete/SKILL.md +36 -0
  75. package/src/skills/contextlake-ingest/SKILL.md +40 -0
  76. package/src/skills/contextlake-list/SKILL.md +22 -0
  77. package/src/skills/contextlake-retrieve/SKILL.md +37 -0
  78. package/src/skills/las-data-profiler/SKILL.md +174 -0
  79. package/src/skills/las-data-profiler/index.ts +254 -0
  80. package/src/skills/las-data-profiler/register.ts +19 -0
  81. package/src/skills/las-data-profiler/s3_catalog.py +608 -0
  82. package/src/utils/config.ts +13 -0
@@ -0,0 +1,103 @@
1
+ import { createStorageProvider } from '../../service/storage/factory';
2
+ import { createMetadataProvider } from '../../service/metadata/factory';
3
+ import { processFile, splitText } from '../../processor/loader';
4
+ import * as path from 'path';
5
+ // @ts-ignore
6
+ import { v4 as uuidv4 } from 'uuid';
7
+
8
+ interface IngestParams {
9
+ files: string[];
10
+ metadata?: Record<string, any>;
11
+ chunkSize?: number;
12
+ overlap?: number;
13
+ }
14
+
15
+ export async function ingestAssets(params: IngestParams, config: any, logger?: any) {
16
+ if (logger) {
17
+ logger.info(`[ContextLake-Action] Calling ingestAssets with params: ${JSON.stringify(params)}`);
18
+ } else {
19
+ // eslint-disable-next-line no-console
20
+ console.log(`[ContextLake-Action] Calling ingestAssets with params: ${JSON.stringify(params)}`);
21
+ }
22
+
23
+ const storageConfig = config.file_storage || { type: 'local', local_base_dir: './data/files' };
24
+ const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
25
+
26
+ const storageProvider = createStorageProvider(storageConfig);
27
+ const metadataProvider = createMetadataProvider(metaConfig);
28
+
29
+ await metadataProvider.connect();
30
+ const maxInlineSize = (config.storage_policy?.max_inline_size_kb || 1024) * 1024;
31
+
32
+ const results = [];
33
+
34
+ for (const filePath of params.files) {
35
+ try {
36
+ const fileName = path.basename(filePath);
37
+ const { buffer, text, type } = await processFile(filePath);
38
+ const createdAt = Date.now(); // Current timestamp
39
+
40
+ let fileUrl = '';
41
+ let storageType = '';
42
+ let binaryData: Buffer = Buffer.alloc(0);
43
+
44
+ // Decide storage strategy
45
+ if (buffer.length <= maxInlineSize) {
46
+ binaryData = buffer;
47
+ fileUrl = `inline://${fileName}`; // Virtual URL for inline
48
+ storageType = 'inline';
49
+ } else {
50
+ fileUrl = await storageProvider.uploadFile(fileName, buffer);
51
+ storageType = storageConfig.type;
52
+ }
53
+
54
+ const chunks = splitText(text, params.chunkSize || 500, params.overlap || 50);
55
+ const docs = [];
56
+
57
+ // If no text extracted (e.g. image), store one entry with empty text but with metadata/binary
58
+ if (chunks.length === 0) {
59
+ const vector = await metadataProvider.generateEmbedding(fileName); // Embed filename as fallback
60
+ docs.push({
61
+ id: uuidv4(),
62
+ vector,
63
+ text: '',
64
+ source: fileName,
65
+ file_type: type,
66
+ storage_type: storageType,
67
+ url: fileUrl,
68
+ metadata: JSON.stringify(params.metadata || {}),
69
+ created_at: createdAt,
70
+ binary_data: binaryData
71
+ });
72
+ } else {
73
+ for (const chunk of chunks) {
74
+ const vector = await metadataProvider.generateEmbedding(chunk);
75
+ docs.push({
76
+ id: uuidv4(),
77
+ vector,
78
+ text: chunk,
79
+ source: fileName,
80
+ file_type: type,
81
+ storage_type: storageType,
82
+ url: fileUrl,
83
+ metadata: JSON.stringify(params.metadata || {}),
84
+ created_at: createdAt,
85
+ binary_data: binaryData // Only attach to first chunk
86
+ });
87
+ // Clear binary data for subsequent chunks of the same file to avoid duplication
88
+ binaryData = Buffer.alloc(0);
89
+ }
90
+ }
91
+
92
+ await metadataProvider.addAssets(docs);
93
+ results.push({ file: fileName, status: 'success', chunks: docs.length });
94
+
95
+ } catch (error: any) {
96
+ // @ts-ignore
97
+ results.push({ file: filePath, status: 'error', message: error.message });
98
+ }
99
+ }
100
+
101
+ // Ensure plain JSON serialization
102
+ return JSON.parse(JSON.stringify(results));
103
+ }
@@ -0,0 +1,107 @@
1
+ import { createMetadataProvider } from '../../service/metadata/factory';
2
+ import { createStorageProvider } from '../../service/storage/factory';
3
+
4
+ interface ListParams {
5
+ limit?: number;
6
+ }
7
+
8
+ interface DeleteParams {
9
+ file_ids?: string[];
10
+ filter?: string;
11
+ }
12
+
13
+ export async function listAssets(params: ListParams, config: any, logger?: any) {
14
+ if (logger) {
15
+ logger.info(`[ContextLake-Action] Calling listAssets with params: ${JSON.stringify(params)}`);
16
+ } else {
17
+ // eslint-disable-next-line no-console
18
+ console.log(`[ContextLake-Action] Calling listAssets with params: ${JSON.stringify(params)}`);
19
+ }
20
+
21
+ // Ensure config has default if not provided
22
+ const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
23
+ const metadataProvider = createMetadataProvider(metaConfig);
24
+
25
+ await metadataProvider.connect();
26
+ const docs = await metadataProvider.list(params.limit || 100);
27
+
28
+ // Group by file (since we store chunks)
29
+ const fileMap = new Map();
30
+ docs.forEach((doc: any) => {
31
+ // Basic grouping by source filename.
32
+ const key = doc.source;
33
+ if (!fileMap.has(key)) {
34
+ fileMap.set(key, {
35
+ source: doc.source,
36
+ file_type: doc.file_type,
37
+ storage_type: doc.storage_type,
38
+ url: doc.url,
39
+ chunks_count: 0,
40
+ metadata: doc.metadata
41
+ });
42
+ }
43
+ fileMap.get(key).chunks_count++;
44
+ });
45
+
46
+ // Cleanly serialize before sending back
47
+ return JSON.parse(JSON.stringify(Array.from(fileMap.values())));
48
+ }
49
+
50
+ export async function deleteAssets(params: DeleteParams, config: any, logger?: any) {
51
+ if (logger) {
52
+ logger.info(`[ContextLake-Action] Calling deleteAssets with params: ${JSON.stringify(params)}`);
53
+ } else {
54
+ // eslint-disable-next-line no-console
55
+ console.log(`[ContextLake-Action] Calling deleteAssets with params: ${JSON.stringify(params)}`);
56
+ }
57
+
58
+ const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
59
+ const metadataProvider = createMetadataProvider(metaConfig);
60
+
61
+ // file_storage config is optional for deletion (we might not need to delete from storage if inline)
62
+ // Check if file_storage config exists before creating provider
63
+ let storageProvider;
64
+ if (config.file_storage && config.file_storage.type) {
65
+ storageProvider = createStorageProvider(config.file_storage);
66
+ }
67
+
68
+ await metadataProvider.connect();
69
+
70
+ let deleteFilter = params.filter || '';
71
+ if (params.file_ids && params.file_ids.length > 0) {
72
+ // Construct filter
73
+ deleteFilter = params.file_ids.map(id => `id = '${id}'`).join(' OR ');
74
+ }
75
+
76
+ if (!deleteFilter) throw new Error("Delete requires file_ids or filter");
77
+
78
+ // Retrieve documents to be deleted to get their URLs for storage cleanup
79
+ // Note: This might be expensive if deleting many files.
80
+ // Optimization: If filter is complex, we might skip storage cleanup or do it asynchronously.
81
+ const docsToDelete = await metadataProvider.search('', 1000, deleteFilter);
82
+
83
+ // 1. Delete from Metadata
84
+ await metadataProvider.delete(deleteFilter);
85
+
86
+ // 2. Delete from Storage (Best effort)
87
+ const deletedUrls = new Set<string>();
88
+ if (storageProvider) {
89
+ for (const doc of docsToDelete as any[]) {
90
+ if (doc.url && !deletedUrls.has(doc.url) && !doc.url.startsWith('inline://')) {
91
+ try {
92
+ await storageProvider.deleteFile(doc.url);
93
+ deletedUrls.add(doc.url);
94
+ } catch (e) {
95
+ console.error(`Failed to delete file ${doc.url}:`, e);
96
+ }
97
+ }
98
+ }
99
+ }
100
+
101
+ return {
102
+ status: 'success',
103
+ message: 'Documents deleted',
104
+ deleted_count: docsToDelete.length,
105
+ storage_cleaned: deletedUrls.size
106
+ };
107
+ }
@@ -0,0 +1,90 @@
1
+ import { createStorageProvider } from '../../service/storage/factory';
2
+ import { createMetadataProvider } from '../../service/metadata/factory';
3
+
4
+ interface RetrieveParams {
5
+ query: string;
6
+ top_k?: number;
7
+ filter?: string; // SQL-like filter
8
+ include_binary?: boolean;
9
+ }
10
+
11
+ export async function retrieveAssets(params: RetrieveParams, config: any, logger?: any) {
12
+ if (logger) {
13
+ logger.info(`[ContextLake-Action] Calling retrieveAssets with params: ${JSON.stringify(params)}`);
14
+ } else {
15
+ // eslint-disable-next-line no-console
16
+ console.log(`[ContextLake-Action] Calling retrieveAssets with params: ${JSON.stringify(params)}`);
17
+ }
18
+
19
+ const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
20
+ const metadataProvider = createMetadataProvider(metaConfig);
21
+
22
+ let storageProvider: any;
23
+ if (config.file_storage && config.file_storage.type) {
24
+ storageProvider = createStorageProvider(config.file_storage);
25
+ }
26
+
27
+ await metadataProvider.connect();
28
+
29
+ const results = await metadataProvider.search(
30
+ params.query,
31
+ params.top_k || 5,
32
+ params.filter
33
+ );
34
+
35
+ // Post-process to handle binary data retrieval if requested
36
+
37
+ if (params.include_binary) {
38
+ const enrichedResults = await Promise.all(results.map(async (doc: any) => {
39
+ // Remove raw binary_data buffer to avoid clone errors in postMessage
40
+ const { binary_data, ...rest } = doc;
41
+
42
+ if (rest.vector) {
43
+ rest.vector = Array.from(rest.vector);
44
+ }
45
+
46
+ try {
47
+ if (rest.storage_type === 'inline') {
48
+ // Return inline binary data (Base64)
49
+ return {
50
+ ...rest,
51
+ binary_content: binary_data ? binary_data.toString('base64') : null
52
+ };
53
+ } else if (rest.storage_type === 'local' || rest.storage_type === 'tos') {
54
+ if (storageProvider) {
55
+ // Download from storage provider
56
+ const buffer = await storageProvider.downloadFile(rest.url);
57
+ return {
58
+ ...rest,
59
+ binary_content: buffer.toString('base64')
60
+ };
61
+ } else {
62
+ return { ...rest, binary_error: 'Storage provider not configured' };
63
+ }
64
+ }
65
+ return rest;
66
+ } catch (e) {
67
+ return { ...rest, binary_error: 'Failed to fetch content' };
68
+ }
69
+ }));
70
+ return enrichedResults;
71
+ }
72
+
73
+ // Remove binary_data from default response to keep it light and serializable
74
+ return results.map((doc: any) => {
75
+ const { binary_data, ...rest } = doc;
76
+
77
+ // Ensure all fields are standard serializable types (remove complex objects/buffers)
78
+ if (rest.vector) {
79
+ // Keep vector as standard array of numbers if it exists, but some clients might choke on large arrays.
80
+ // Usually vector is fine, but LanceDB might return custom objects like Float32Array which fail structuredClone in postMessage
81
+ rest.vector = Array.from(rest.vector);
82
+ }
83
+
84
+ // LanceDB records might be Proxy objects or contain non-clonable getters.
85
+ // We stringify and parse to get a clean, plain Javascript Object.
86
+ const safeDoc = JSON.parse(JSON.stringify(rest));
87
+
88
+ return safeDoc;
89
+ });
90
+ }
@@ -0,0 +1,58 @@
1
+ import * as fs from 'fs';
2
+ import * as path from 'path';
3
+ // @ts-ignore
4
+ import * as pdf from 'pdf-parse';
5
+ import * as mammoth from 'mammoth';
6
+
7
+ export interface FileData {
8
+ buffer: Buffer;
9
+ text: string;
10
+ type: string;
11
+ }
12
+
13
+ export async function processFile(filePath: string): Promise<FileData> {
14
+ const buffer = fs.readFileSync(filePath);
15
+ const ext = path.extname(filePath).toLowerCase();
16
+ let text = '';
17
+
18
+ switch (ext) {
19
+ case '.pdf':
20
+ const pdfData = await (pdf as any)(buffer);
21
+ text = pdfData.text;
22
+ break;
23
+ case '.docx':
24
+ const result = await mammoth.extractRawText({ buffer });
25
+ text = result.value;
26
+ break;
27
+ case '.txt':
28
+ case '.md':
29
+ text = buffer.toString('utf-8');
30
+ break;
31
+ default:
32
+ text = buffer.toString('utf-8'); // Fallback to text
33
+ }
34
+
35
+ // Basic cleaning
36
+ text = text.replace(/\s+/g, ' ').trim();
37
+
38
+ return {
39
+ buffer,
40
+ text,
41
+ type: ext.replace('.', '')
42
+ };
43
+ }
44
+
45
+ export function splitText(text: string, chunkSize: number = 500, overlap: number = 50): string[] {
46
+ const chunks: string[] = [];
47
+ if (!text) return chunks;
48
+
49
+ let start = 0;
50
+ while (start < text.length) {
51
+ const end = Math.min(start + chunkSize, text.length);
52
+ const chunk = text.slice(start, end);
53
+ chunks.push(chunk);
54
+ start += chunkSize - overlap;
55
+ }
56
+
57
+ return chunks;
58
+ }
@@ -0,0 +1,13 @@
1
+ import { EmbeddingProvider, EmbeddingConfig } from './interface';
2
+ import { LocalEmbeddingProvider } from './local';
3
+ import { RemoteEmbeddingProvider } from './remote';
4
+
5
+ export function createEmbeddingProvider(config: EmbeddingConfig): EmbeddingProvider {
6
+ if (config.provider === 'local') {
7
+ return new LocalEmbeddingProvider(config);
8
+ } else if (config.provider === 'openai' || config.provider === 'remote') {
9
+ return new RemoteEmbeddingProvider(config);
10
+ } else {
11
+ throw new Error(`Unsupported embedding provider: ${config.provider}`);
12
+ }
13
+ }
@@ -0,0 +1,21 @@
1
+
2
/** Abstraction over a text-embedding backend (local model or remote API). */
export interface EmbeddingProvider {
  /**
   * Generate embedding for text
   * @param text - Input text
   */
  generateEmbedding(text: string): Promise<number[]>;

  /**
   * Generate embeddings for multiple texts
   * @param texts - Array of input texts
   */
  generateEmbeddings(texts: string[]): Promise<number[][]>;
}

/** Configuration selecting and parameterizing an EmbeddingProvider. */
export interface EmbeddingConfig {
  provider: 'local' | 'remote' | 'openai'; // Support 'openai' for backward compatibility or clarity
  model_name: string; // model identifier (local model path/URI or remote model name)
  api_key?: string;   // bearer token for remote providers
  api_base?: string;  // API base URL override for remote providers
}
@@ -0,0 +1,118 @@
1
+ import { EmbeddingProvider, EmbeddingConfig } from './interface';
2
+ // import type { Llama, LlamaEmbeddingContext, LlamaModel } from 'node-llama-cpp';
3
+
4
+ const DEFAULT_LOCAL_MODEL = "hf:CompendiumLabs/bge-small-zh-v1.5-gguf/bge-small-zh-v1.5-f16.gguf";
5
+
6
+ let nodeLlamaImportPromise: Promise<any> | null = null;
7
+
8
+ export const setNodeLlamaCppImporter = (importer: () => Promise<any>) => {
9
+ nodeLlamaImportPromise = importer();
10
+ };
11
+
12
+ const importNodeLlamaCpp = async (): Promise<any> => {
13
+ if (!nodeLlamaImportPromise) {
14
+ nodeLlamaImportPromise = import("node-llama-cpp");
15
+ }
16
+ return nodeLlamaImportPromise;
17
+ };
18
+
19
+ export class LocalEmbeddingProvider implements EmbeddingProvider {
20
+ private llama: any = null;
21
+ private model: any = null;
22
+ private context: any = null;
23
+ private initPromise: Promise<void> | null = null;
24
+ private modelPath: string;
25
+
26
+ constructor(config: EmbeddingConfig) {
27
+ // Override transformers.js default with node-llama-cpp default
28
+ this.modelPath = config.model_name === 'Xenova/all-MiniLM-L6-v2'
29
+ ? DEFAULT_LOCAL_MODEL
30
+ : (config.model_name || DEFAULT_LOCAL_MODEL);
31
+ }
32
+
33
+ private async ensureInitialized(): Promise<void> {
34
+ if (this.context) {
35
+ return;
36
+ }
37
+ if (this.initPromise) {
38
+ return this.initPromise;
39
+ }
40
+
41
+ this.initPromise = this.doInitialize();
42
+ return this.initPromise;
43
+ }
44
+
45
+ private async doInitialize(): Promise<void> {
46
+ try {
47
+ const { getLlama, resolveModelFile, LlamaLogLevel } = await importNodeLlamaCpp();
48
+
49
+ if (!this.llama) {
50
+ this.llama = await getLlama({ logLevel: LlamaLogLevel.error });
51
+ }
52
+
53
+ if (!this.model) {
54
+ const resolved = await resolveModelFile(this.modelPath);
55
+ this.model = await this.llama.loadModel({ modelPath: resolved });
56
+ }
57
+
58
+ if (!this.context) {
59
+ this.context = await this.model.createEmbeddingContext();
60
+ }
61
+ } catch (err) {
62
+ const detail = err instanceof Error ? err.message : String(err);
63
+ throw new Error(`Local embeddings unavailable. Reason: ${detail}`, {
64
+ cause: err,
65
+ });
66
+ }
67
+ }
68
+
69
+ async generateEmbedding(text: string): Promise<number[]> {
70
+ await this.ensureInitialized();
71
+
72
+ const embedding = await this.context!.getEmbeddingFor(text);
73
+ const vector = embedding.vector; // TypedArray
74
+
75
+ // Optimized normalization loop
76
+ let sumSq = 0;
77
+ const len = vector.length;
78
+
79
+ // First pass: Calculate magnitude and sanitize (implicitly handled by JS numbers usually, but keeping finite check if needed)
80
+ // For performance, we assume node-llama-cpp returns valid floats.
81
+ // If strict sanitization is needed, it can be combined.
82
+ for (let i = 0; i < len; i++) {
83
+ const val = vector[i];
84
+ if (Number.isFinite(val)) {
85
+ sumSq += val * val;
86
+ } else {
87
+ vector[i] = 0;
88
+ }
89
+ }
90
+
91
+ const magnitude = Math.sqrt(sumSq);
92
+ const result = new Array(len);
93
+
94
+ if (magnitude > 0) {
95
+ const scale = 1.0 / magnitude;
96
+ for (let i = 0; i < len; i++) {
97
+ result[i] = vector[i] * scale;
98
+ }
99
+ } else {
100
+ // Zero vector case
101
+ for (let i = 0; i < len; i++) {
102
+ result[i] = vector[i]; // or 0
103
+ }
104
+ }
105
+
106
+ return result;
107
+ }
108
+
109
+ // Optimized batch processing for local embedding
110
+ async generateEmbeddings(texts: string[]): Promise<number[][]> {
111
+ await this.ensureInitialized();
112
+ // node-llama-cpp's createEmbeddingContext might not support batch directly yet depending on version,
113
+ // but we can at least optimize the loop.
114
+ // If newer version supports batch, we should use it. For now, we parallelize with limit.
115
+
116
+ return Promise.all(texts.map(text => this.generateEmbedding(text)));
117
+ }
118
+ }
@@ -0,0 +1,45 @@
1
+ import { EmbeddingProvider, EmbeddingConfig } from './interface';
2
+
3
+ export class RemoteEmbeddingProvider implements EmbeddingProvider {
4
+ private apiKey: string;
5
+ private modelName: string;
6
+ private apiBase: string;
7
+
8
+ constructor(config: EmbeddingConfig) {
9
+ this.apiKey = config.api_key || '';
10
+ this.modelName = config.model_name;
11
+ this.apiBase = config.api_base || 'https://api.openai.com/v1';
12
+ }
13
+
14
+ async generateEmbedding(text: string): Promise<number[]> {
15
+ const embeddings = await this.generateEmbeddings([text]);
16
+ return embeddings[0];
17
+ }
18
+
19
+ // Optimized batch embedding generation
20
+ async generateEmbeddings(texts: string[]): Promise<number[][]> {
21
+ const response = await fetch(`${this.apiBase}/embeddings`, {
22
+ method: 'POST',
23
+ headers: {
24
+ 'Content-Type': 'application/json',
25
+ 'Authorization': `Bearer ${this.apiKey}`
26
+ },
27
+ body: JSON.stringify({
28
+ input: texts,
29
+ model: this.modelName
30
+ })
31
+ });
32
+
33
+ if (!response.ok) {
34
+ const error = await response.text();
35
+ throw new Error(`Remote API error: ${response.status} ${error}`);
36
+ }
37
+
38
+ const data = await response.json();
39
+ if (data.data && Array.isArray(data.data) && data.data.length > 0) {
40
+ // Ensure order matches input
41
+ return data.data.sort((a: any, b: any) => a.index - b.index).map((item: any) => item.embedding);
42
+ }
43
+ throw new Error('Unexpected API response format');
44
+ }
45
+ }
@@ -0,0 +1,52 @@
1
+ import { MetadataProvider, MetadataConfig } from './interface';
2
+ import { DocumentSchema } from '../../client/lancedb';
3
+ import { LocalMetadataProvider } from './local';
4
+
5
+ export class RemoteMetadataProvider implements MetadataProvider {
6
+ private config: MetadataConfig;
7
+
8
+ constructor(config: MetadataConfig) {
9
+ this.config = config;
10
+ if (!config.remote_api_endpoint) {
11
+ throw new Error("Missing remote API endpoint configuration");
12
+ }
13
+ }
14
+
15
+ async connect(): Promise<void> {
16
+ // TODO: Implement health check or auth verification
17
+ console.log("Connected to remote metadata service at", this.config.remote_api_endpoint);
18
+ }
19
+
20
+ async addAssets(docs: DocumentSchema[]): Promise<void> {
21
+ throw new Error('Remote metadata service not implemented yet');
22
+ }
23
+
24
+ async search(query: string, limit?: number, filter?: string): Promise<DocumentSchema[]> {
25
+ throw new Error('Remote metadata service not implemented yet');
26
+ }
27
+
28
+ async list(limit?: number): Promise<DocumentSchema[]> {
29
+ throw new Error('Remote metadata service not implemented yet');
30
+ }
31
+
32
+ async delete(filter: string): Promise<void> {
33
+ throw new Error('Remote metadata service not implemented yet');
34
+ }
35
+
36
+ async generateEmbedding(text: string): Promise<number[]> {
37
+ throw new Error('Remote metadata service not implemented yet');
38
+ }
39
+ }
40
+
41
+ export function createMetadataProvider(config: MetadataConfig): MetadataProvider {
42
+ if (!config) {
43
+ throw new Error('Metadata configuration is missing');
44
+ }
45
+ if (config.type === 'local' || !config.type) { // Default to local if type is missing but config exists
46
+ return new LocalMetadataProvider(config);
47
+ } else if (config.type === 'remote') {
48
+ return new RemoteMetadataProvider(config);
49
+ } else {
50
+ throw new Error(`Unsupported metadata storage type: ${config.type}`);
51
+ }
52
+ }
@@ -0,0 +1,19 @@
1
+ import { DocumentSchema } from '../../client/lancedb';
2
+ import { EmbeddingConfig } from '../embedding/interface';
3
+
4
/**
 * Contract for a metadata/vector store backing ContextLake
 * (implemented by the local LanceDB provider and the remote stub).
 */
export interface MetadataProvider {
  // Opens/initializes the underlying store; call before any other method.
  connect(): Promise<void>;
  // Persists chunk documents (vectors plus optional inline binary data).
  addAssets(docs: DocumentSchema[]): Promise<void>;
  // Similarity search; `filter` is a SQL-like predicate string.
  search(query: string, limit?: number, filter?: string): Promise<DocumentSchema[]>;
  // Plain listing of stored rows, optionally filtered.
  list(limit?: number, filter?: string): Promise<DocumentSchema[]>;
  // Deletes all rows matching the SQL-like filter.
  delete(filter: string): Promise<void>;
  // Embeds a text with the provider's configured embedding backend.
  generateEmbedding(text: string): Promise<number[]>;
}

/** Configuration selecting and parameterizing a MetadataProvider. */
export interface MetadataConfig {
  type: 'local' | 'remote';
  lancedb_uri?: string;          // local mode: LanceDB database path/URI
  remote_api_endpoint?: string;  // remote mode: service base URL
  remote_api_key?: string;       // remote mode: auth key
  embedding?: EmbeddingConfig;   // embedding backend configuration
}