@byted-las/contextlake-openclaw 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +64 -0
- package/bin/contextlake-openclaw.js +5 -0
- package/dist/index.d.ts +113 -0
- package/dist/index.js +73 -0
- package/dist/src/client/lancedb.d.ts +30 -0
- package/dist/src/client/lancedb.js +113 -0
- package/dist/src/client/tos.d.ts +19 -0
- package/dist/src/client/tos.js +81 -0
- package/dist/src/commands/cli.d.ts +6 -0
- package/dist/src/commands/cli.js +78 -0
- package/dist/src/commands/index.d.ts +1 -0
- package/dist/src/commands/index.js +139 -0
- package/dist/src/commands/slashcmd.d.ts +14 -0
- package/dist/src/commands/slashcmd.js +91 -0
- package/dist/src/commands/tools.d.ts +219 -0
- package/dist/src/commands/tools.js +286 -0
- package/dist/src/lib/actions/ingest.d.ts +8 -0
- package/dist/src/lib/actions/ingest.js +123 -0
- package/dist/src/lib/actions/manage.d.ts +15 -0
- package/dist/src/lib/actions/manage.js +91 -0
- package/dist/src/lib/actions/retrieve.d.ts +8 -0
- package/dist/src/lib/actions/retrieve.js +73 -0
- package/dist/src/processor/loader.d.ts +7 -0
- package/dist/src/processor/loader.js +83 -0
- package/dist/src/service/embedding/factory.d.ts +2 -0
- package/dist/src/service/embedding/factory.js +16 -0
- package/dist/src/service/embedding/interface.d.ts +18 -0
- package/dist/src/service/embedding/interface.js +2 -0
- package/dist/src/service/embedding/local.d.ts +14 -0
- package/dist/src/service/embedding/local.js +104 -0
- package/dist/src/service/embedding/remote.d.ts +9 -0
- package/dist/src/service/embedding/remote.js +42 -0
- package/dist/src/service/metadata/factory.d.ts +13 -0
- package/dist/src/service/metadata/factory.js +48 -0
- package/dist/src/service/metadata/interface.d.ts +17 -0
- package/dist/src/service/metadata/interface.js +2 -0
- package/dist/src/service/metadata/local.d.ts +13 -0
- package/dist/src/service/metadata/local.js +49 -0
- package/dist/src/service/storage/factory.d.ts +2 -0
- package/dist/src/service/storage/factory.js +19 -0
- package/dist/src/service/storage/interface.d.ts +32 -0
- package/dist/src/service/storage/interface.js +2 -0
- package/dist/src/service/storage/local.d.ts +9 -0
- package/dist/src/service/storage/local.js +72 -0
- package/dist/src/skills/las-data-profiler/index.d.ts +26 -0
- package/dist/src/skills/las-data-profiler/index.js +231 -0
- package/dist/src/skills/las-data-profiler/register.d.ts +1 -0
- package/dist/src/skills/las-data-profiler/register.js +19 -0
- package/dist/src/utils/config.d.ts +1 -0
- package/dist/src/utils/config.js +16 -0
- package/index.ts +78 -0
- package/openclaw.plugin.json +57 -0
- package/package.json +52 -0
- package/src/client/lancedb.ts +102 -0
- package/src/client/tos.ts +100 -0
- package/src/commands/cli.ts +77 -0
- package/src/commands/index.ts +156 -0
- package/src/commands/slashcmd.ts +95 -0
- package/src/commands/tools.ts +286 -0
- package/src/lib/actions/ingest.ts +103 -0
- package/src/lib/actions/manage.ts +107 -0
- package/src/lib/actions/retrieve.ts +90 -0
- package/src/processor/loader.ts +58 -0
- package/src/service/embedding/factory.ts +13 -0
- package/src/service/embedding/interface.ts +21 -0
- package/src/service/embedding/local.ts +118 -0
- package/src/service/embedding/remote.ts +45 -0
- package/src/service/metadata/factory.ts +52 -0
- package/src/service/metadata/interface.ts +19 -0
- package/src/service/metadata/local.ts +60 -0
- package/src/service/storage/factory.ts +16 -0
- package/src/service/storage/interface.ts +36 -0
- package/src/service/storage/local.ts +42 -0
- package/src/skills/contextlake-delete/SKILL.md +36 -0
- package/src/skills/contextlake-ingest/SKILL.md +40 -0
- package/src/skills/contextlake-list/SKILL.md +22 -0
- package/src/skills/contextlake-retrieve/SKILL.md +37 -0
- package/src/skills/las-data-profiler/SKILL.md +174 -0
- package/src/skills/las-data-profiler/index.ts +254 -0
- package/src/skills/las-data-profiler/register.ts +19 -0
- package/src/skills/las-data-profiler/s3_catalog.py +608 -0
- package/src/utils/config.ts +13 -0
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import { createStorageProvider } from '../../service/storage/factory';
|
|
2
|
+
import { createMetadataProvider } from '../../service/metadata/factory';
|
|
3
|
+
import { processFile, splitText } from '../../processor/loader';
|
|
4
|
+
import * as path from 'path';
|
|
5
|
+
// @ts-ignore
|
|
6
|
+
import { v4 as uuidv4 } from 'uuid';
|
|
7
|
+
|
|
8
|
+
interface IngestParams {
|
|
9
|
+
files: string[];
|
|
10
|
+
metadata?: Record<string, any>;
|
|
11
|
+
chunkSize?: number;
|
|
12
|
+
overlap?: number;
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
export async function ingestAssets(params: IngestParams, config: any, logger?: any) {
|
|
16
|
+
if (logger) {
|
|
17
|
+
logger.info(`[ContextLake-Action] Calling ingestAssets with params: ${JSON.stringify(params)}`);
|
|
18
|
+
} else {
|
|
19
|
+
// eslint-disable-next-line no-console
|
|
20
|
+
console.log(`[ContextLake-Action] Calling ingestAssets with params: ${JSON.stringify(params)}`);
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
const storageConfig = config.file_storage || { type: 'local', local_base_dir: './data/files' };
|
|
24
|
+
const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
|
|
25
|
+
|
|
26
|
+
const storageProvider = createStorageProvider(storageConfig);
|
|
27
|
+
const metadataProvider = createMetadataProvider(metaConfig);
|
|
28
|
+
|
|
29
|
+
await metadataProvider.connect();
|
|
30
|
+
const maxInlineSize = (config.storage_policy?.max_inline_size_kb || 1024) * 1024;
|
|
31
|
+
|
|
32
|
+
const results = [];
|
|
33
|
+
|
|
34
|
+
for (const filePath of params.files) {
|
|
35
|
+
try {
|
|
36
|
+
const fileName = path.basename(filePath);
|
|
37
|
+
const { buffer, text, type } = await processFile(filePath);
|
|
38
|
+
const createdAt = Date.now(); // Current timestamp
|
|
39
|
+
|
|
40
|
+
let fileUrl = '';
|
|
41
|
+
let storageType = '';
|
|
42
|
+
let binaryData: Buffer = Buffer.alloc(0);
|
|
43
|
+
|
|
44
|
+
// Decide storage strategy
|
|
45
|
+
if (buffer.length <= maxInlineSize) {
|
|
46
|
+
binaryData = buffer;
|
|
47
|
+
fileUrl = `inline://${fileName}`; // Virtual URL for inline
|
|
48
|
+
storageType = 'inline';
|
|
49
|
+
} else {
|
|
50
|
+
fileUrl = await storageProvider.uploadFile(fileName, buffer);
|
|
51
|
+
storageType = storageConfig.type;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
const chunks = splitText(text, params.chunkSize || 500, params.overlap || 50);
|
|
55
|
+
const docs = [];
|
|
56
|
+
|
|
57
|
+
// If no text extracted (e.g. image), store one entry with empty text but with metadata/binary
|
|
58
|
+
if (chunks.length === 0) {
|
|
59
|
+
const vector = await metadataProvider.generateEmbedding(fileName); // Embed filename as fallback
|
|
60
|
+
docs.push({
|
|
61
|
+
id: uuidv4(),
|
|
62
|
+
vector,
|
|
63
|
+
text: '',
|
|
64
|
+
source: fileName,
|
|
65
|
+
file_type: type,
|
|
66
|
+
storage_type: storageType,
|
|
67
|
+
url: fileUrl,
|
|
68
|
+
metadata: JSON.stringify(params.metadata || {}),
|
|
69
|
+
created_at: createdAt,
|
|
70
|
+
binary_data: binaryData
|
|
71
|
+
});
|
|
72
|
+
} else {
|
|
73
|
+
for (const chunk of chunks) {
|
|
74
|
+
const vector = await metadataProvider.generateEmbedding(chunk);
|
|
75
|
+
docs.push({
|
|
76
|
+
id: uuidv4(),
|
|
77
|
+
vector,
|
|
78
|
+
text: chunk,
|
|
79
|
+
source: fileName,
|
|
80
|
+
file_type: type,
|
|
81
|
+
storage_type: storageType,
|
|
82
|
+
url: fileUrl,
|
|
83
|
+
metadata: JSON.stringify(params.metadata || {}),
|
|
84
|
+
created_at: createdAt,
|
|
85
|
+
binary_data: binaryData // Only attach to first chunk
|
|
86
|
+
});
|
|
87
|
+
// Clear binary data for subsequent chunks of the same file to avoid duplication
|
|
88
|
+
binaryData = Buffer.alloc(0);
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
await metadataProvider.addAssets(docs);
|
|
93
|
+
results.push({ file: fileName, status: 'success', chunks: docs.length });
|
|
94
|
+
|
|
95
|
+
} catch (error: any) {
|
|
96
|
+
// @ts-ignore
|
|
97
|
+
results.push({ file: filePath, status: 'error', message: error.message });
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
// Ensure plain JSON serialization
|
|
102
|
+
return JSON.parse(JSON.stringify(results));
|
|
103
|
+
}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import { createMetadataProvider } from '../../service/metadata/factory';
|
|
2
|
+
import { createStorageProvider } from '../../service/storage/factory';
|
|
3
|
+
|
|
4
|
+
interface ListParams {
|
|
5
|
+
limit?: number;
|
|
6
|
+
}
|
|
7
|
+
|
|
8
|
+
interface DeleteParams {
|
|
9
|
+
file_ids?: string[];
|
|
10
|
+
filter?: string;
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
export async function listAssets(params: ListParams, config: any, logger?: any) {
|
|
14
|
+
if (logger) {
|
|
15
|
+
logger.info(`[ContextLake-Action] Calling listAssets with params: ${JSON.stringify(params)}`);
|
|
16
|
+
} else {
|
|
17
|
+
// eslint-disable-next-line no-console
|
|
18
|
+
console.log(`[ContextLake-Action] Calling listAssets with params: ${JSON.stringify(params)}`);
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
// Ensure config has default if not provided
|
|
22
|
+
const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
|
|
23
|
+
const metadataProvider = createMetadataProvider(metaConfig);
|
|
24
|
+
|
|
25
|
+
await metadataProvider.connect();
|
|
26
|
+
const docs = await metadataProvider.list(params.limit || 100);
|
|
27
|
+
|
|
28
|
+
// Group by file (since we store chunks)
|
|
29
|
+
const fileMap = new Map();
|
|
30
|
+
docs.forEach((doc: any) => {
|
|
31
|
+
// Basic grouping by source filename.
|
|
32
|
+
const key = doc.source;
|
|
33
|
+
if (!fileMap.has(key)) {
|
|
34
|
+
fileMap.set(key, {
|
|
35
|
+
source: doc.source,
|
|
36
|
+
file_type: doc.file_type,
|
|
37
|
+
storage_type: doc.storage_type,
|
|
38
|
+
url: doc.url,
|
|
39
|
+
chunks_count: 0,
|
|
40
|
+
metadata: doc.metadata
|
|
41
|
+
});
|
|
42
|
+
}
|
|
43
|
+
fileMap.get(key).chunks_count++;
|
|
44
|
+
});
|
|
45
|
+
|
|
46
|
+
// Cleanly serialize before sending back
|
|
47
|
+
return JSON.parse(JSON.stringify(Array.from(fileMap.values())));
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
export async function deleteAssets(params: DeleteParams, config: any, logger?: any) {
|
|
51
|
+
if (logger) {
|
|
52
|
+
logger.info(`[ContextLake-Action] Calling deleteAssets with params: ${JSON.stringify(params)}`);
|
|
53
|
+
} else {
|
|
54
|
+
// eslint-disable-next-line no-console
|
|
55
|
+
console.log(`[ContextLake-Action] Calling deleteAssets with params: ${JSON.stringify(params)}`);
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
|
|
59
|
+
const metadataProvider = createMetadataProvider(metaConfig);
|
|
60
|
+
|
|
61
|
+
// file_storage config is optional for deletion (we might not need to delete from storage if inline)
|
|
62
|
+
// Check if file_storage config exists before creating provider
|
|
63
|
+
let storageProvider;
|
|
64
|
+
if (config.file_storage && config.file_storage.type) {
|
|
65
|
+
storageProvider = createStorageProvider(config.file_storage);
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
await metadataProvider.connect();
|
|
69
|
+
|
|
70
|
+
let deleteFilter = params.filter || '';
|
|
71
|
+
if (params.file_ids && params.file_ids.length > 0) {
|
|
72
|
+
// Construct filter
|
|
73
|
+
deleteFilter = params.file_ids.map(id => `id = '${id}'`).join(' OR ');
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
if (!deleteFilter) throw new Error("Delete requires file_ids or filter");
|
|
77
|
+
|
|
78
|
+
// Retrieve documents to be deleted to get their URLs for storage cleanup
|
|
79
|
+
// Note: This might be expensive if deleting many files.
|
|
80
|
+
// Optimization: If filter is complex, we might skip storage cleanup or do it asynchronously.
|
|
81
|
+
const docsToDelete = await metadataProvider.search('', 1000, deleteFilter);
|
|
82
|
+
|
|
83
|
+
// 1. Delete from Metadata
|
|
84
|
+
await metadataProvider.delete(deleteFilter);
|
|
85
|
+
|
|
86
|
+
// 2. Delete from Storage (Best effort)
|
|
87
|
+
const deletedUrls = new Set<string>();
|
|
88
|
+
if (storageProvider) {
|
|
89
|
+
for (const doc of docsToDelete as any[]) {
|
|
90
|
+
if (doc.url && !deletedUrls.has(doc.url) && !doc.url.startsWith('inline://')) {
|
|
91
|
+
try {
|
|
92
|
+
await storageProvider.deleteFile(doc.url);
|
|
93
|
+
deletedUrls.add(doc.url);
|
|
94
|
+
} catch (e) {
|
|
95
|
+
console.error(`Failed to delete file ${doc.url}:`, e);
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
return {
|
|
102
|
+
status: 'success',
|
|
103
|
+
message: 'Documents deleted',
|
|
104
|
+
deleted_count: docsToDelete.length,
|
|
105
|
+
storage_cleaned: deletedUrls.size
|
|
106
|
+
};
|
|
107
|
+
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import { createStorageProvider } from '../../service/storage/factory';
|
|
2
|
+
import { createMetadataProvider } from '../../service/metadata/factory';
|
|
3
|
+
|
|
4
|
+
interface RetrieveParams {
|
|
5
|
+
query: string;
|
|
6
|
+
top_k?: number;
|
|
7
|
+
filter?: string; // SQL-like filter
|
|
8
|
+
include_binary?: boolean;
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
export async function retrieveAssets(params: RetrieveParams, config: any, logger?: any) {
|
|
12
|
+
if (logger) {
|
|
13
|
+
logger.info(`[ContextLake-Action] Calling retrieveAssets with params: ${JSON.stringify(params)}`);
|
|
14
|
+
} else {
|
|
15
|
+
// eslint-disable-next-line no-console
|
|
16
|
+
console.log(`[ContextLake-Action] Calling retrieveAssets with params: ${JSON.stringify(params)}`);
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
|
|
20
|
+
const metadataProvider = createMetadataProvider(metaConfig);
|
|
21
|
+
|
|
22
|
+
let storageProvider: any;
|
|
23
|
+
if (config.file_storage && config.file_storage.type) {
|
|
24
|
+
storageProvider = createStorageProvider(config.file_storage);
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
await metadataProvider.connect();
|
|
28
|
+
|
|
29
|
+
const results = await metadataProvider.search(
|
|
30
|
+
params.query,
|
|
31
|
+
params.top_k || 5,
|
|
32
|
+
params.filter
|
|
33
|
+
);
|
|
34
|
+
|
|
35
|
+
// Post-process to handle binary data retrieval if requested
|
|
36
|
+
|
|
37
|
+
if (params.include_binary) {
|
|
38
|
+
const enrichedResults = await Promise.all(results.map(async (doc: any) => {
|
|
39
|
+
// Remove raw binary_data buffer to avoid clone errors in postMessage
|
|
40
|
+
const { binary_data, ...rest } = doc;
|
|
41
|
+
|
|
42
|
+
if (rest.vector) {
|
|
43
|
+
rest.vector = Array.from(rest.vector);
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
try {
|
|
47
|
+
if (rest.storage_type === 'inline') {
|
|
48
|
+
// Return inline binary data (Base64)
|
|
49
|
+
return {
|
|
50
|
+
...rest,
|
|
51
|
+
binary_content: binary_data ? binary_data.toString('base64') : null
|
|
52
|
+
};
|
|
53
|
+
} else if (rest.storage_type === 'local' || rest.storage_type === 'tos') {
|
|
54
|
+
if (storageProvider) {
|
|
55
|
+
// Download from storage provider
|
|
56
|
+
const buffer = await storageProvider.downloadFile(rest.url);
|
|
57
|
+
return {
|
|
58
|
+
...rest,
|
|
59
|
+
binary_content: buffer.toString('base64')
|
|
60
|
+
};
|
|
61
|
+
} else {
|
|
62
|
+
return { ...rest, binary_error: 'Storage provider not configured' };
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
return rest;
|
|
66
|
+
} catch (e) {
|
|
67
|
+
return { ...rest, binary_error: 'Failed to fetch content' };
|
|
68
|
+
}
|
|
69
|
+
}));
|
|
70
|
+
return enrichedResults;
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
// Remove binary_data from default response to keep it light and serializable
|
|
74
|
+
return results.map((doc: any) => {
|
|
75
|
+
const { binary_data, ...rest } = doc;
|
|
76
|
+
|
|
77
|
+
// Ensure all fields are standard serializable types (remove complex objects/buffers)
|
|
78
|
+
if (rest.vector) {
|
|
79
|
+
// Keep vector as standard array of numbers if it exists, but some clients might choke on large arrays.
|
|
80
|
+
// Usually vector is fine, but LanceDB might return custom objects like Float32Array which fail structuredClone in postMessage
|
|
81
|
+
rest.vector = Array.from(rest.vector);
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
// LanceDB records might be Proxy objects or contain non-clonable getters.
|
|
85
|
+
// We stringify and parse to get a clean, plain Javascript Object.
|
|
86
|
+
const safeDoc = JSON.parse(JSON.stringify(rest));
|
|
87
|
+
|
|
88
|
+
return safeDoc;
|
|
89
|
+
});
|
|
90
|
+
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import * as fs from 'fs';
|
|
2
|
+
import * as path from 'path';
|
|
3
|
+
// @ts-ignore
|
|
4
|
+
import * as pdf from 'pdf-parse';
|
|
5
|
+
import * as mammoth from 'mammoth';
|
|
6
|
+
|
|
7
|
+
export interface FileData {
|
|
8
|
+
buffer: Buffer;
|
|
9
|
+
text: string;
|
|
10
|
+
type: string;
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
export async function processFile(filePath: string): Promise<FileData> {
|
|
14
|
+
const buffer = fs.readFileSync(filePath);
|
|
15
|
+
const ext = path.extname(filePath).toLowerCase();
|
|
16
|
+
let text = '';
|
|
17
|
+
|
|
18
|
+
switch (ext) {
|
|
19
|
+
case '.pdf':
|
|
20
|
+
const pdfData = await (pdf as any)(buffer);
|
|
21
|
+
text = pdfData.text;
|
|
22
|
+
break;
|
|
23
|
+
case '.docx':
|
|
24
|
+
const result = await mammoth.extractRawText({ buffer });
|
|
25
|
+
text = result.value;
|
|
26
|
+
break;
|
|
27
|
+
case '.txt':
|
|
28
|
+
case '.md':
|
|
29
|
+
text = buffer.toString('utf-8');
|
|
30
|
+
break;
|
|
31
|
+
default:
|
|
32
|
+
text = buffer.toString('utf-8'); // Fallback to text
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
// Basic cleaning
|
|
36
|
+
text = text.replace(/\s+/g, ' ').trim();
|
|
37
|
+
|
|
38
|
+
return {
|
|
39
|
+
buffer,
|
|
40
|
+
text,
|
|
41
|
+
type: ext.replace('.', '')
|
|
42
|
+
};
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
export function splitText(text: string, chunkSize: number = 500, overlap: number = 50): string[] {
|
|
46
|
+
const chunks: string[] = [];
|
|
47
|
+
if (!text) return chunks;
|
|
48
|
+
|
|
49
|
+
let start = 0;
|
|
50
|
+
while (start < text.length) {
|
|
51
|
+
const end = Math.min(start + chunkSize, text.length);
|
|
52
|
+
const chunk = text.slice(start, end);
|
|
53
|
+
chunks.push(chunk);
|
|
54
|
+
start += chunkSize - overlap;
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
return chunks;
|
|
58
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { EmbeddingProvider, EmbeddingConfig } from './interface';
|
|
2
|
+
import { LocalEmbeddingProvider } from './local';
|
|
3
|
+
import { RemoteEmbeddingProvider } from './remote';
|
|
4
|
+
|
|
5
|
+
export function createEmbeddingProvider(config: EmbeddingConfig): EmbeddingProvider {
|
|
6
|
+
if (config.provider === 'local') {
|
|
7
|
+
return new LocalEmbeddingProvider(config);
|
|
8
|
+
} else if (config.provider === 'openai' || config.provider === 'remote') {
|
|
9
|
+
return new RemoteEmbeddingProvider(config);
|
|
10
|
+
} else {
|
|
11
|
+
throw new Error(`Unsupported embedding provider: ${config.provider}`);
|
|
12
|
+
}
|
|
13
|
+
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
|
|
2
|
+
/**
 * Contract for services that turn text into dense numeric embedding vectors.
 * Implemented by the local (node-llama-cpp) and remote (HTTP) providers.
 */
export interface EmbeddingProvider {
  /**
   * Generate embedding for text
   * @param text - Input text
   */
  generateEmbedding(text: string): Promise<number[]>;

  /**
   * Generate embeddings for multiple texts
   * @param texts - Array of input texts
   */
  generateEmbeddings(texts: string[]): Promise<number[][]>;
}

/** Configuration used by the factory to select and set up a provider. */
export interface EmbeddingConfig {
  provider: 'local' | 'remote' | 'openai'; // Support 'openai' for backward compatibility or clarity
  model_name: string; // Model identifier (local model path/URI or remote model name)
  api_key?: string; // Bearer token for the remote/OpenAI-compatible endpoint
  api_base?: string; // Base URL of the remote API (defaults to the OpenAI endpoint)
}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import { EmbeddingProvider, EmbeddingConfig } from './interface';
|
|
2
|
+
// import type { Llama, LlamaEmbeddingContext, LlamaModel } from 'node-llama-cpp';
|
|
3
|
+
|
|
4
|
+
const DEFAULT_LOCAL_MODEL = "hf:CompendiumLabs/bge-small-zh-v1.5-gguf/bge-small-zh-v1.5-f16.gguf";
|
|
5
|
+
|
|
6
|
+
let nodeLlamaImportPromise: Promise<any> | null = null;
|
|
7
|
+
|
|
8
|
+
export const setNodeLlamaCppImporter = (importer: () => Promise<any>) => {
|
|
9
|
+
nodeLlamaImportPromise = importer();
|
|
10
|
+
};
|
|
11
|
+
|
|
12
|
+
const importNodeLlamaCpp = async (): Promise<any> => {
|
|
13
|
+
if (!nodeLlamaImportPromise) {
|
|
14
|
+
nodeLlamaImportPromise = import("node-llama-cpp");
|
|
15
|
+
}
|
|
16
|
+
return nodeLlamaImportPromise;
|
|
17
|
+
};
|
|
18
|
+
|
|
19
|
+
export class LocalEmbeddingProvider implements EmbeddingProvider {
|
|
20
|
+
private llama: any = null;
|
|
21
|
+
private model: any = null;
|
|
22
|
+
private context: any = null;
|
|
23
|
+
private initPromise: Promise<void> | null = null;
|
|
24
|
+
private modelPath: string;
|
|
25
|
+
|
|
26
|
+
constructor(config: EmbeddingConfig) {
|
|
27
|
+
// Override transformers.js default with node-llama-cpp default
|
|
28
|
+
this.modelPath = config.model_name === 'Xenova/all-MiniLM-L6-v2'
|
|
29
|
+
? DEFAULT_LOCAL_MODEL
|
|
30
|
+
: (config.model_name || DEFAULT_LOCAL_MODEL);
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
private async ensureInitialized(): Promise<void> {
|
|
34
|
+
if (this.context) {
|
|
35
|
+
return;
|
|
36
|
+
}
|
|
37
|
+
if (this.initPromise) {
|
|
38
|
+
return this.initPromise;
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
this.initPromise = this.doInitialize();
|
|
42
|
+
return this.initPromise;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
private async doInitialize(): Promise<void> {
|
|
46
|
+
try {
|
|
47
|
+
const { getLlama, resolveModelFile, LlamaLogLevel } = await importNodeLlamaCpp();
|
|
48
|
+
|
|
49
|
+
if (!this.llama) {
|
|
50
|
+
this.llama = await getLlama({ logLevel: LlamaLogLevel.error });
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
if (!this.model) {
|
|
54
|
+
const resolved = await resolveModelFile(this.modelPath);
|
|
55
|
+
this.model = await this.llama.loadModel({ modelPath: resolved });
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
if (!this.context) {
|
|
59
|
+
this.context = await this.model.createEmbeddingContext();
|
|
60
|
+
}
|
|
61
|
+
} catch (err) {
|
|
62
|
+
const detail = err instanceof Error ? err.message : String(err);
|
|
63
|
+
throw new Error(`Local embeddings unavailable. Reason: ${detail}`, {
|
|
64
|
+
cause: err,
|
|
65
|
+
});
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
async generateEmbedding(text: string): Promise<number[]> {
|
|
70
|
+
await this.ensureInitialized();
|
|
71
|
+
|
|
72
|
+
const embedding = await this.context!.getEmbeddingFor(text);
|
|
73
|
+
const vector = embedding.vector; // TypedArray
|
|
74
|
+
|
|
75
|
+
// Optimized normalization loop
|
|
76
|
+
let sumSq = 0;
|
|
77
|
+
const len = vector.length;
|
|
78
|
+
|
|
79
|
+
// First pass: Calculate magnitude and sanitize (implicitly handled by JS numbers usually, but keeping finite check if needed)
|
|
80
|
+
// For performance, we assume node-llama-cpp returns valid floats.
|
|
81
|
+
// If strict sanitization is needed, it can be combined.
|
|
82
|
+
for (let i = 0; i < len; i++) {
|
|
83
|
+
const val = vector[i];
|
|
84
|
+
if (Number.isFinite(val)) {
|
|
85
|
+
sumSq += val * val;
|
|
86
|
+
} else {
|
|
87
|
+
vector[i] = 0;
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
const magnitude = Math.sqrt(sumSq);
|
|
92
|
+
const result = new Array(len);
|
|
93
|
+
|
|
94
|
+
if (magnitude > 0) {
|
|
95
|
+
const scale = 1.0 / magnitude;
|
|
96
|
+
for (let i = 0; i < len; i++) {
|
|
97
|
+
result[i] = vector[i] * scale;
|
|
98
|
+
}
|
|
99
|
+
} else {
|
|
100
|
+
// Zero vector case
|
|
101
|
+
for (let i = 0; i < len; i++) {
|
|
102
|
+
result[i] = vector[i]; // or 0
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
return result;
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
// Optimized batch processing for local embedding
|
|
110
|
+
async generateEmbeddings(texts: string[]): Promise<number[][]> {
|
|
111
|
+
await this.ensureInitialized();
|
|
112
|
+
// node-llama-cpp's createEmbeddingContext might not support batch directly yet depending on version,
|
|
113
|
+
// but we can at least optimize the loop.
|
|
114
|
+
// If newer version supports batch, we should use it. For now, we parallelize with limit.
|
|
115
|
+
|
|
116
|
+
return Promise.all(texts.map(text => this.generateEmbedding(text)));
|
|
117
|
+
}
|
|
118
|
+
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { EmbeddingProvider, EmbeddingConfig } from './interface';
|
|
2
|
+
|
|
3
|
+
export class RemoteEmbeddingProvider implements EmbeddingProvider {
|
|
4
|
+
private apiKey: string;
|
|
5
|
+
private modelName: string;
|
|
6
|
+
private apiBase: string;
|
|
7
|
+
|
|
8
|
+
constructor(config: EmbeddingConfig) {
|
|
9
|
+
this.apiKey = config.api_key || '';
|
|
10
|
+
this.modelName = config.model_name;
|
|
11
|
+
this.apiBase = config.api_base || 'https://api.openai.com/v1';
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
async generateEmbedding(text: string): Promise<number[]> {
|
|
15
|
+
const embeddings = await this.generateEmbeddings([text]);
|
|
16
|
+
return embeddings[0];
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
// Optimized batch embedding generation
|
|
20
|
+
async generateEmbeddings(texts: string[]): Promise<number[][]> {
|
|
21
|
+
const response = await fetch(`${this.apiBase}/embeddings`, {
|
|
22
|
+
method: 'POST',
|
|
23
|
+
headers: {
|
|
24
|
+
'Content-Type': 'application/json',
|
|
25
|
+
'Authorization': `Bearer ${this.apiKey}`
|
|
26
|
+
},
|
|
27
|
+
body: JSON.stringify({
|
|
28
|
+
input: texts,
|
|
29
|
+
model: this.modelName
|
|
30
|
+
})
|
|
31
|
+
});
|
|
32
|
+
|
|
33
|
+
if (!response.ok) {
|
|
34
|
+
const error = await response.text();
|
|
35
|
+
throw new Error(`Remote API error: ${response.status} ${error}`);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
const data = await response.json();
|
|
39
|
+
if (data.data && Array.isArray(data.data) && data.data.length > 0) {
|
|
40
|
+
// Ensure order matches input
|
|
41
|
+
return data.data.sort((a: any, b: any) => a.index - b.index).map((item: any) => item.embedding);
|
|
42
|
+
}
|
|
43
|
+
throw new Error('Unexpected API response format');
|
|
44
|
+
}
|
|
45
|
+
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import { MetadataProvider, MetadataConfig } from './interface';
|
|
2
|
+
import { DocumentSchema } from '../../client/lancedb';
|
|
3
|
+
import { LocalMetadataProvider } from './local';
|
|
4
|
+
|
|
5
|
+
export class RemoteMetadataProvider implements MetadataProvider {
|
|
6
|
+
private config: MetadataConfig;
|
|
7
|
+
|
|
8
|
+
constructor(config: MetadataConfig) {
|
|
9
|
+
this.config = config;
|
|
10
|
+
if (!config.remote_api_endpoint) {
|
|
11
|
+
throw new Error("Missing remote API endpoint configuration");
|
|
12
|
+
}
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
async connect(): Promise<void> {
|
|
16
|
+
// TODO: Implement health check or auth verification
|
|
17
|
+
console.log("Connected to remote metadata service at", this.config.remote_api_endpoint);
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
async addAssets(docs: DocumentSchema[]): Promise<void> {
|
|
21
|
+
throw new Error('Remote metadata service not implemented yet');
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
async search(query: string, limit?: number, filter?: string): Promise<DocumentSchema[]> {
|
|
25
|
+
throw new Error('Remote metadata service not implemented yet');
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
async list(limit?: number): Promise<DocumentSchema[]> {
|
|
29
|
+
throw new Error('Remote metadata service not implemented yet');
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
async delete(filter: string): Promise<void> {
|
|
33
|
+
throw new Error('Remote metadata service not implemented yet');
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
async generateEmbedding(text: string): Promise<number[]> {
|
|
37
|
+
throw new Error('Remote metadata service not implemented yet');
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
export function createMetadataProvider(config: MetadataConfig): MetadataProvider {
|
|
42
|
+
if (!config) {
|
|
43
|
+
throw new Error('Metadata configuration is missing');
|
|
44
|
+
}
|
|
45
|
+
if (config.type === 'local' || !config.type) { // Default to local if type is missing but config exists
|
|
46
|
+
return new LocalMetadataProvider(config);
|
|
47
|
+
} else if (config.type === 'remote') {
|
|
48
|
+
return new RemoteMetadataProvider(config);
|
|
49
|
+
} else {
|
|
50
|
+
throw new Error(`Unsupported metadata storage type: ${config.type}`);
|
|
51
|
+
}
|
|
52
|
+
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { DocumentSchema } from '../../client/lancedb';
|
|
2
|
+
import { EmbeddingConfig } from '../embedding/interface';
|
|
3
|
+
|
|
4
|
+
/**
 * Abstraction over the metadata/vector store that holds ingested documents.
 * Callers in this package invoke connect() before any other method.
 */
export interface MetadataProvider {
  // Open or verify the connection to the underlying store.
  connect(): Promise<void>;
  // Persist a batch of chunk-level documents.
  addAssets(docs: DocumentSchema[]): Promise<void>;
  // Vector search for `query`; `filter` is a SQL-like predicate string.
  search(query: string, limit?: number, filter?: string): Promise<DocumentSchema[]>;
  // Enumerate stored documents, optionally limited and filtered.
  list(limit?: number, filter?: string): Promise<DocumentSchema[]>;
  // Remove every document matching the SQL-like `filter` predicate.
  delete(filter: string): Promise<void>;
  // Produce the embedding vector used for indexing and search.
  generateEmbedding(text: string): Promise<number[]>;
}

/** Configuration consumed by createMetadataProvider. */
export interface MetadataConfig {
  type: 'local' | 'remote'; // Backend selector; falsy defaults to 'local'
  lancedb_uri?: string; // LanceDB path for the local backend
  remote_api_endpoint?: string; // Hosted service endpoint (remote backend)
  remote_api_key?: string; // Auth key for the hosted service
  embedding?: EmbeddingConfig; // Embedding backend used by the provider
}
|