@byted-las/contextlake-openclaw 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/README.md +64 -0
  2. package/bin/contextlake-openclaw.js +5 -0
  3. package/dist/index.d.ts +113 -0
  4. package/dist/index.js +73 -0
  5. package/dist/src/client/lancedb.d.ts +30 -0
  6. package/dist/src/client/lancedb.js +113 -0
  7. package/dist/src/client/tos.d.ts +19 -0
  8. package/dist/src/client/tos.js +81 -0
  9. package/dist/src/commands/cli.d.ts +6 -0
  10. package/dist/src/commands/cli.js +78 -0
  11. package/dist/src/commands/index.d.ts +1 -0
  12. package/dist/src/commands/index.js +139 -0
  13. package/dist/src/commands/slashcmd.d.ts +14 -0
  14. package/dist/src/commands/slashcmd.js +91 -0
  15. package/dist/src/commands/tools.d.ts +219 -0
  16. package/dist/src/commands/tools.js +286 -0
  17. package/dist/src/lib/actions/ingest.d.ts +8 -0
  18. package/dist/src/lib/actions/ingest.js +123 -0
  19. package/dist/src/lib/actions/manage.d.ts +15 -0
  20. package/dist/src/lib/actions/manage.js +91 -0
  21. package/dist/src/lib/actions/retrieve.d.ts +8 -0
  22. package/dist/src/lib/actions/retrieve.js +73 -0
  23. package/dist/src/processor/loader.d.ts +7 -0
  24. package/dist/src/processor/loader.js +83 -0
  25. package/dist/src/service/embedding/factory.d.ts +2 -0
  26. package/dist/src/service/embedding/factory.js +16 -0
  27. package/dist/src/service/embedding/interface.d.ts +18 -0
  28. package/dist/src/service/embedding/interface.js +2 -0
  29. package/dist/src/service/embedding/local.d.ts +14 -0
  30. package/dist/src/service/embedding/local.js +104 -0
  31. package/dist/src/service/embedding/remote.d.ts +9 -0
  32. package/dist/src/service/embedding/remote.js +42 -0
  33. package/dist/src/service/metadata/factory.d.ts +13 -0
  34. package/dist/src/service/metadata/factory.js +48 -0
  35. package/dist/src/service/metadata/interface.d.ts +17 -0
  36. package/dist/src/service/metadata/interface.js +2 -0
  37. package/dist/src/service/metadata/local.d.ts +13 -0
  38. package/dist/src/service/metadata/local.js +49 -0
  39. package/dist/src/service/storage/factory.d.ts +2 -0
  40. package/dist/src/service/storage/factory.js +19 -0
  41. package/dist/src/service/storage/interface.d.ts +32 -0
  42. package/dist/src/service/storage/interface.js +2 -0
  43. package/dist/src/service/storage/local.d.ts +9 -0
  44. package/dist/src/service/storage/local.js +72 -0
  45. package/dist/src/skills/las-data-profiler/index.d.ts +26 -0
  46. package/dist/src/skills/las-data-profiler/index.js +231 -0
  47. package/dist/src/skills/las-data-profiler/register.d.ts +1 -0
  48. package/dist/src/skills/las-data-profiler/register.js +19 -0
  49. package/dist/src/utils/config.d.ts +1 -0
  50. package/dist/src/utils/config.js +16 -0
  51. package/index.ts +78 -0
  52. package/openclaw.plugin.json +57 -0
  53. package/package.json +52 -0
  54. package/src/client/lancedb.ts +102 -0
  55. package/src/client/tos.ts +100 -0
  56. package/src/commands/cli.ts +77 -0
  57. package/src/commands/index.ts +156 -0
  58. package/src/commands/slashcmd.ts +95 -0
  59. package/src/commands/tools.ts +286 -0
  60. package/src/lib/actions/ingest.ts +103 -0
  61. package/src/lib/actions/manage.ts +107 -0
  62. package/src/lib/actions/retrieve.ts +90 -0
  63. package/src/processor/loader.ts +58 -0
  64. package/src/service/embedding/factory.ts +13 -0
  65. package/src/service/embedding/interface.ts +21 -0
  66. package/src/service/embedding/local.ts +118 -0
  67. package/src/service/embedding/remote.ts +45 -0
  68. package/src/service/metadata/factory.ts +52 -0
  69. package/src/service/metadata/interface.ts +19 -0
  70. package/src/service/metadata/local.ts +60 -0
  71. package/src/service/storage/factory.ts +16 -0
  72. package/src/service/storage/interface.ts +36 -0
  73. package/src/service/storage/local.ts +42 -0
  74. package/src/skills/contextlake-delete/SKILL.md +36 -0
  75. package/src/skills/contextlake-ingest/SKILL.md +40 -0
  76. package/src/skills/contextlake-list/SKILL.md +22 -0
  77. package/src/skills/contextlake-retrieve/SKILL.md +37 -0
  78. package/src/skills/las-data-profiler/SKILL.md +174 -0
  79. package/src/skills/las-data-profiler/index.ts +254 -0
  80. package/src/skills/las-data-profiler/register.ts +19 -0
  81. package/src/skills/las-data-profiler/s3_catalog.py +608 -0
  82. package/src/utils/config.ts +13 -0
@@ -0,0 +1,286 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.getAgentTools = getAgentTools;
4
+ const ingest_1 = require("../lib/actions/ingest");
5
+ const retrieve_1 = require("../lib/actions/retrieve");
6
+ const manage_1 = require("../lib/actions/manage");
7
/**
 * Coerces a raw tool-call payload into a plain params object.
 *
 * Strings are treated as JSON. Anything that does not end up as a non-array
 * object (unparseable text, numbers, `null`, arrays, `undefined`) yields
 * `undefined` so callers can decide on a per-tool fallback.
 */
function coerceToolParams(raw) {
    let value = raw;
    if (typeof value === 'string') {
        try {
            value = JSON.parse(value);
        }
        catch (e) {
            return undefined;
        }
    }
    if (value === null || typeof value !== 'object' || Array.isArray(value)) {
        return undefined;
    }
    return value;
}
/**
 * Produces a printable message for any thrown value, including non-Error
 * throws such as strings or `null`.
 */
function describeToolError(error) {
    if (error && typeof error.message === 'string') {
        return error.message;
    }
    return String(error);
}
/**
 * Logs a failed tool execution and builds the `{ success: false }` envelope
 * returned to the agent.
 */
function reportToolFailure(logger, skillLabel, error) {
    const message = describeToolError(error);
    logger.error(`[${new Date().toISOString()}] [ContextLake] ${skillLabel} skill failed`, { error: message, stack: error ? error.stack : undefined });
    return {
        success: false,
        error: message
    };
}
/**
 * Builds a JSON-schema object node. `strict` adds `additionalProperties: false`.
 */
function buildObjectSchema(properties, required, strict) {
    const schema = { type: 'object', properties, required };
    return strict ? { ...schema, additionalProperties: false } : schema;
}
/**
 * Creates the four ContextLake agent tools (ingest, retrieve, list, delete).
 *
 * Each tool exposes `name`, `label`, `description`, two copies of its input
 * schema (`parameters` and `schema`) and an async `execute(toolCallId, params)`
 * that never rejects for action failures: it resolves to
 * `{ success: true, result }` or `{ success: false, error }`.
 *
 * @param pluginConfig Plugin configuration forwarded untouched to the actions.
 * @param logger Logger providing `info`, `warn` and `error`.
 * @returns Object with `ingestTool`, `retrieveTool`, `listTool`, `deleteTool`.
 */
function getAgentTools(pluginConfig, logger) {
    // Property maps are factories so `parameters` and `schema` never share
    // object identity (a host mutating one must not affect the other).
    const ingestProperties = () => ({
        files: {
            type: 'array',
            items: { type: 'string' },
            description: 'List of file paths to ingest'
        },
        metadata: {
            type: 'object',
            description: 'Optional JSON metadata to attach to documents',
            additionalProperties: true
        },
        chunkSize: {
            type: 'integer',
            description: 'Chunk size for text splitting'
        },
        overlap: {
            type: 'integer',
            description: 'Overlap size for text splitting'
        }
    });
    const retrieveProperties = () => ({
        query: { type: 'string', description: 'Search query' },
        top_k: { type: 'integer', description: 'Number of results to return' },
        filter: { type: 'string', description: 'Filter string' },
        include_binary: { type: 'boolean', description: 'Whether to include binary content' }
    });
    const listProperties = () => ({
        limit: { type: 'integer', description: 'Limit for list action' }
    });
    const deleteProperties = () => ({
        file_ids: { type: 'array', items: { type: 'string' }, description: 'File IDs to delete' },
        filter: { type: 'string', description: 'Filter string for deletion' }
    });
    // list/delete accept a missing payload or a JSON string, and unwrap a
    // `{ params: {...} }` envelope when present.
    const normalizeOptionalParams = (params) => {
        const coerced = coerceToolParams(params) || {};
        return coerced.params ? coerced.params : coerced;
    };
    return {
        ingestTool: {
            name: 'contextlake-ingest',
            label: 'ContextLake Ingest',
            description: `Upload, ingest, and index documents into the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
Use this tool when the user wants to "将知识注入", "上传文件", "入库", "添加文档", "ingest files", or "add knowledge".
Supports processing of various file types including PDF, Word, Markdown, and Text.
Automatically handles text extraction, cleaning, chunking, embedding generation, and storage.

Example User Queries:
- "帮我把这个文档注入到知识湖中"
- "上传这份 PDF 到知识库"
- "Please ingest these documents into ContextLake"
- "将 /path/to/doc.txt 添加到知识库"`,
            parameters: buildObjectSchema(ingestProperties(), ['files'], true),
            schema: buildObjectSchema(ingestProperties(), ['files'], true),
            async execute(toolCallId, params) {
                logger.info(`[${new Date().toISOString()}] [ContextLake] Executing ingest skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
                try {
                    let actualParams = coerceToolParams(params);
                    if (actualParams === undefined && typeof params === 'string') {
                        logger.warn(`[ContextLake] Received string params, possibly toolCallId?`, { params });
                        return {
                            success: false,
                            error: `Invalid params format: received string "${params}", expected object with 'files' array.`
                        };
                    }
                    // Some hosts wrap the real arguments as { params: {...} }.
                    if (actualParams && !actualParams.files && actualParams.params && actualParams.params.files) {
                        actualParams = actualParams.params;
                    }
                    if (!actualParams || !Array.isArray(actualParams.files)) {
                        return {
                            success: false,
                            error: `Invalid params: 'files' must be an array. Received keys: ${Object.keys(actualParams || {})}`
                        };
                    }
                    const result = await (0, ingest_1.ingestAssets)(actualParams, pluginConfig, logger);
                    logger.info(`[${new Date().toISOString()}] [ContextLake] Ingest skill completed successfully`, { resultSummary: Array.isArray(result) ? `Processed ${result.length} items` : 'Success' });
                    return {
                        success: true,
                        result
                    };
                }
                catch (error) {
                    return reportToolFailure(logger, 'Ingest', error);
                }
            }
        },
        retrieveTool: {
            name: 'contextlake-retrieve',
            label: 'ContextLake Retrieve',
            description: `Search, query, and retrieve relevant information from the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
Use this tool when the user wants to "搜索知识", "获取信息", "召回文档", "查询知识库", "search knowledge base", or "retrieve documents".
Uses vector similarity search to find semantically related document chunks.

Example User Queries:
- "知识库里有关于产品安装的文档吗?"
- "帮我从知识湖中召回关于财务报表的资料"
- "Search the knowledge base for deployment guides"
- "根据知识库内容,回答如何配置网关"`,
            parameters: buildObjectSchema(retrieveProperties(), ['query'], true),
            schema: buildObjectSchema(retrieveProperties(), ['query'], true),
            async execute(toolCallId, params) {
                logger.info(`[${new Date().toISOString()}] [ContextLake] Executing retrieve skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
                try {
                    let actualParams = params;
                    if (typeof params === 'string') {
                        const parsed = coerceToolParams(params);
                        if (parsed === undefined) {
                            // Not a JSON object: either free text or a JSON scalar such as
                            // "2024" or "true". Treat the raw string as the query itself.
                            logger.warn(`[ContextLake] Received string params for retrieve, attempting fallback parsing`, { params });
                            actualParams = { query: params };
                        }
                        else {
                            actualParams = parsed;
                        }
                    }
                    else if (params && params.params) {
                        actualParams = params.params;
                    }
                    if (!actualParams || typeof actualParams.query !== 'string') {
                        return {
                            success: false,
                            error: `Invalid params: 'query' is required and must be a string. Received: ${JSON.stringify(actualParams)}`
                        };
                    }
                    const result = await (0, retrieve_1.retrieveAssets)(actualParams, pluginConfig, logger);
                    logger.info(`[${new Date().toISOString()}] [ContextLake] Retrieve skill completed`, { resultCount: Array.isArray(result) ? result.length : 0 });
                    return {
                        success: true,
                        result
                    };
                }
                catch (error) {
                    return reportToolFailure(logger, 'Retrieve', error);
                }
            }
        },
        listTool: {
            name: 'contextlake-list',
            label: 'ContextLake List',
            description: `List documents and assets currently in the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
Use this tool when the user wants to "列出所有知识", "查看知识库文件", "显示文档列表", "list knowledge", or "show documents".

Example User Queries:
- "知识湖里目前有哪些文件?"
- "列出前10个知识库文档"
- "Show me all documents in the knowledge base"`,
            parameters: buildObjectSchema(listProperties(), [], true),
            schema: buildObjectSchema(listProperties(), [], false),
            async execute(toolCallId, params) {
                logger.info(`[${new Date().toISOString()}] [ContextLake] Executing list skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
                try {
                    // Every argument is optional, so a missing payload means "use defaults".
                    const actualParams = normalizeOptionalParams(params);
                    const result = await (0, manage_1.listAssets)(actualParams, pluginConfig, logger);
                    logger.info(`[${new Date().toISOString()}] [ContextLake] List skill completed`, { count: Array.isArray(result) ? result.length : 0 });
                    return {
                        success: true,
                        result
                    };
                }
                catch (error) {
                    return reportToolFailure(logger, 'List', error);
                }
            }
        },
        deleteTool: {
            name: 'contextlake-delete',
            label: 'ContextLake Delete',
            description: `Delete documents and assets from the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
Use this tool when the user wants to "删除某个文档", "清理知识库", "移除文件", "delete knowledge", or "remove documents".
Supports deleting documents by their specific IDs or by applying a SQL-like filter.

Example User Queries:
- "删除 ID 为 12345 的文档"
- "Please delete the old architecture document from the knowledge base"
- "从知识湖中移除 category 是 finance 的所有记录"`,
            parameters: buildObjectSchema(deleteProperties(), [], true),
            schema: buildObjectSchema(deleteProperties(), [], false),
            async execute(toolCallId, params) {
                logger.info(`[${new Date().toISOString()}] [ContextLake] Executing delete skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
                try {
                    const actualParams = normalizeOptionalParams(params);
                    const result = await (0, manage_1.deleteAssets)(actualParams, pluginConfig, logger);
                    logger.info(`[${new Date().toISOString()}] [ContextLake] Delete skill completed`, { result });
                    return {
                        success: true,
                        result
                    };
                }
                catch (error) {
                    return reportToolFailure(logger, 'Delete', error);
                }
            }
        }
    };
}
@@ -0,0 +1,8 @@
1
/** Arguments accepted by `ingestAssets`. */
+ interface IngestParams {
2
/** Paths of the files to ingest; each one is loaded and processed in turn. */
+ files: string[];
3
/** Optional metadata; it is JSON-stringified and stored on every record created for the file. */
+ metadata?: Record<string, any>;
4
/** Chunk size handed to the text splitter. Defaults to 500 when omitted. */
+ chunkSize?: number;
5
/** Overlap handed to the text splitter. Defaults to 50 when omitted. */
+ overlap?: number;
6
+ }
7
/**
 * Ingests the given files into the metadata store and resolves to one summary entry per file:
 * `{ file, status: 'success', chunks }` or `{ file, status: 'error', message }`.
 * Per-file failures are reported in the result rather than thrown.
 */
+ export declare function ingestAssets(params: IngestParams, config: any, logger?: any): Promise<any>;
8
+ export {};
@@ -0,0 +1,123 @@
1
+ "use strict";
2
// Compiler-emitted CommonJS interop helpers. Each one reuses a definition already
// present on `this` before falling back to the local implementation.
//
// __createBinding(o, m, k, k2): exposes m[k] on o under the name k2 (defaults to k).
// With Object.create available it defines a property, substituting an enumerable
// getter that reads m[k] when the descriptor check on m fails; otherwise it copies the value.
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
// __setModuleDefault(o, v): stores v as the enumerable "default" member of o.
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
// __importStar(mod): returns mod unchanged when it is flagged __esModule; otherwise
// builds a new object that re-exposes every own key of mod except "default" and
// sets "default" to mod itself.
+ var __importStar = (this && this.__importStar) || (function () {
19
// ownKeys replaces itself on first call with Object.getOwnPropertyNames, or a for-in fallback.
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.ingestAssets = ingestAssets;
37
+ const factory_1 = require("../../service/storage/factory");
38
+ const factory_2 = require("../../service/metadata/factory");
39
+ const loader_1 = require("../../processor/loader");
40
+ const path = __importStar(require("path"));
41
+ // @ts-ignore
42
+ const uuid_1 = require("uuid");
43
/**
 * Loads each file, stores its bytes (inline or through the storage provider),
 * splits its text into chunks, embeds every chunk and writes the resulting
 * records to the metadata store.
 *
 * @param params `{ files, metadata?, chunkSize?, overlap? }`. `chunkSize`
 *   falls back to 500 when falsy; `overlap` falls back to 50 only when it is
 *   `null`/`undefined`, so an explicit `0` (no overlap) is honoured.
 * @param config Plugin configuration; `file_storage`, `metadata_storage` and
 *   `storage_policy.max_inline_size_kb` are read, each with a local default.
 * @param logger Optional logger; `console.log` is used when absent.
 * @returns Plain-JSON array with one entry per file:
 *   `{ file, status: 'success', chunks }` or `{ file, status: 'error', message }`.
 * @throws TypeError when `params.files` is not an array (checked before any
 *   provider is created or connected).
 */
async function ingestAssets(params, config, logger) {
    const callMessage = `[ContextLake-Action] Calling ingestAssets with params: ${JSON.stringify(params)}`;
    if (logger) {
        logger.info(callMessage);
    }
    else {
        // eslint-disable-next-line no-console
        console.log(callMessage);
    }
    if (!params || !Array.isArray(params.files)) {
        throw new TypeError("ingestAssets requires 'files' to be an array of file paths");
    }
    const storageConfig = config.file_storage || { type: 'local', local_base_dir: './data/files' };
    const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
    const storageProvider = (0, factory_1.createStorageProvider)(storageConfig);
    const metadataProvider = (0, factory_2.createMetadataProvider)(metaConfig);
    await metadataProvider.connect();
    const maxInlineSize = (config.storage_policy?.max_inline_size_kb || 1024) * 1024;
    const chunkSize = params.chunkSize || 500;
    // `??` rather than `||`: an overlap of 0 is a legitimate request.
    const overlap = params.overlap ?? 50;
    const results = [];
    for (const filePath of params.files) {
        try {
            const fileName = path.basename(filePath);
            const { buffer, text, type } = await (0, loader_1.processFile)(filePath);
            const createdAt = Date.now();
            // Small files travel with the record itself; larger ones go to the storage provider.
            const storedInline = buffer.length <= maxInlineSize;
            const fileUrl = storedInline
                ? `inline://${fileName}` // Virtual URL for inline content
                : await storageProvider.uploadFile(fileName, buffer);
            const storageType = storedInline ? 'inline' : storageConfig.type;
            const metadataJson = JSON.stringify(params.metadata || {});
            const chunks = (0, loader_1.splitText)(text, chunkSize, overlap);
            const hasText = chunks.length > 0;
            // No extractable text (e.g. an image): keep one record with empty text
            // and embed the file name as a fallback.
            const pieces = hasText ? chunks : [''];
            const docs = [];
            for (let index = 0; index < pieces.length; index++) {
                const piece = pieces[index];
                const vector = await metadataProvider.generateEmbedding(hasText ? piece : fileName);
                docs.push({
                    id: (0, uuid_1.v4)(),
                    vector,
                    text: piece,
                    source: fileName,
                    file_type: type,
                    storage_type: storageType,
                    url: fileUrl,
                    metadata: metadataJson,
                    created_at: createdAt,
                    // Inline bytes are attached to the first record only to avoid duplication.
                    binary_data: storedInline && index === 0 ? buffer : Buffer.alloc(0)
                });
            }
            await metadataProvider.addAssets(docs);
            results.push({ file: fileName, status: 'success', chunks: docs.length });
        }
        catch (error) {
            // One bad file must not abort the batch; report it and continue.
            const message = error && typeof error.message === 'string' ? error.message : String(error);
            results.push({ file: filePath, status: 'error', message });
        }
    }
    // Ensure plain JSON serialization
    return JSON.parse(JSON.stringify(results));
}
@@ -0,0 +1,15 @@
1
/** Arguments accepted by `listAssets`. */
+ interface ListParams {
2
/** Maximum number of stored records requested from the metadata store (100 when omitted); records are then grouped per source file. */
+ limit?: number;
3
+ }
/** Arguments accepted by `deleteAssets`. At least one of the two fields must select something. */
4
+ interface DeleteParams {
5
/** Record IDs to delete, matched against the `id` column. When non-empty, `filter` is ignored. */
+ file_ids?: string[];
6
/** SQL-like filter expression forwarded to the metadata store; used only when no `file_ids` are given. */
+ filter?: string;
7
+ }
8
/** Lists stored assets grouped by source file name, with a per-file count of the records seen. */
+ export declare function listAssets(params: ListParams, config: any, logger?: any): Promise<any>;
9
/**
 * Deletes the matching records from the metadata store, then removes their non-inline files
 * from the storage provider on a best-effort basis (only when `file_storage` is configured).
 * Rejects when neither `file_ids` nor `filter` selects anything.
 */
+ export declare function deleteAssets(params: DeleteParams, config: any, logger?: any): Promise<{
10
+ status: string;
11
+ message: string;
12
/** Number of records found by the pre-delete lookup (that lookup requests at most 1000). */
+ deleted_count: number;
13
/** Number of distinct stored files removed from the storage provider. */
+ storage_cleaned: number;
14
+ }>;
15
+ export {};
@@ -0,0 +1,91 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.listAssets = listAssets;
4
+ exports.deleteAssets = deleteAssets;
5
+ const factory_1 = require("../../service/metadata/factory");
6
+ const factory_2 = require("../../service/storage/factory");
7
/**
 * Lists stored assets, grouped by source file name.
 *
 * The metadata store holds one record per chunk, so the records returned by
 * the provider are collapsed into one entry per `source` with a running
 * `chunks_count`. Note that `limit` caps the number of records fetched, not
 * the number of files returned, and that files sharing a base name are merged
 * into a single entry.
 *
 * @param params Optional `{ limit }`; a missing/`null` object or a falsy
 *   `limit` means 100.
 * @param config Plugin configuration; `metadata_storage` is read, with a
 *   local default.
 * @param logger Optional logger; `console.log` is used when absent.
 * @returns Plain-JSON array of `{ source, file_type, storage_type, url,
 *   chunks_count, metadata }`, taken from the first record seen per source.
 */
async function listAssets(params, config, logger) {
    const callMessage = `[ContextLake-Action] Calling listAssets with params: ${JSON.stringify(params)}`;
    if (logger) {
        logger.info(callMessage);
    }
    else {
        // eslint-disable-next-line no-console
        console.log(callMessage);
    }
    // Ensure config has default if not provided
    const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
    const metadataProvider = (0, factory_1.createMetadataProvider)(metaConfig);
    await metadataProvider.connect();
    // Every argument is optional, so tolerate a missing params object.
    const limit = params?.limit || 100;
    const docs = await metadataProvider.list(limit);
    const filesBySource = new Map();
    for (const doc of docs) {
        let entry = filesBySource.get(doc.source);
        if (entry === undefined) {
            entry = {
                source: doc.source,
                file_type: doc.file_type,
                storage_type: doc.storage_type,
                url: doc.url,
                chunks_count: 0,
                metadata: doc.metadata
            };
            filesBySource.set(doc.source, entry);
        }
        entry.chunks_count++;
    }
    // Cleanly serialize before sending back
    return JSON.parse(JSON.stringify(Array.from(filesBySource.values())));
}
40
/**
 * Renders a value as a single-quoted SQL string literal, doubling embedded
 * single quotes so the value cannot terminate the literal early.
 */
function quoteSqlString(value) {
    return `'${String(value).replace(/'/g, "''")}'`;
}
/**
 * Deletes records from the metadata store and, best effort, their files from
 * the storage provider.
 *
 * Selection: a non-empty `file_ids` wins and is turned into an
 * `id = '...' OR id = '...'` filter (IDs are quoted and escaped); otherwise
 * `filter` is forwarded as-is. The matching records are looked up first (at
 * most 1000 are requested) so their URLs are known for storage cleanup;
 * `deleted_count` reflects that lookup.
 *
 * @param params `{ file_ids?, filter? }`.
 * @param config Plugin configuration; `metadata_storage` (with a local
 *   default) and the optional `file_storage` are read. Without
 *   `file_storage.type` no storage cleanup is attempted.
 * @param logger Optional logger; `console` is used when absent.
 * @returns `{ status, message, deleted_count, storage_cleaned }`.
 * @throws Error "Delete requires file_ids or filter" when nothing is
 *   selected; this is checked before connecting to the store.
 */
async function deleteAssets(params, config, logger) {
    const callMessage = `[ContextLake-Action] Calling deleteAssets with params: ${JSON.stringify(params)}`;
    if (logger) {
        logger.info(callMessage);
    }
    else {
        // eslint-disable-next-line no-console
        console.log(callMessage);
    }
    const request = params || {};
    // Accept a lone ID string as a one-element list instead of crashing on `.map`.
    let ids = [];
    if (Array.isArray(request.file_ids)) {
        ids = request.file_ids;
    }
    else if (typeof request.file_ids === 'string' && request.file_ids) {
        ids = [request.file_ids];
    }
    const deleteFilter = ids.length > 0
        ? ids.map((id) => `id = ${quoteSqlString(id)}`).join(' OR ')
        : (request.filter || '');
    if (!deleteFilter)
        throw new Error("Delete requires file_ids or filter");
    const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
    const metadataProvider = (0, factory_1.createMetadataProvider)(metaConfig);
    // file_storage config is optional for deletion (inline content needs no storage cleanup).
    let storageProvider;
    if (config.file_storage && config.file_storage.type) {
        storageProvider = (0, factory_2.createStorageProvider)(config.file_storage);
    }
    await metadataProvider.connect();
    // Look the records up before deleting them so their URLs are still available.
    const docsToDelete = await metadataProvider.search('', 1000, deleteFilter);
    // 1. Delete from Metadata
    await metadataProvider.delete(deleteFilter);
    // 2. Delete from Storage (Best effort)
    const deletedUrls = new Set();
    if (storageProvider) {
        for (const doc of docsToDelete) {
            const url = doc.url;
            if (!url || deletedUrls.has(url) || url.startsWith('inline://')) {
                continue;
            }
            try {
                await storageProvider.deleteFile(url);
                deletedUrls.add(url);
            }
            catch (e) {
                // A failed cleanup must not undo or fail the metadata deletion.
                if (logger && typeof logger.error === 'function') {
                    logger.error(`Failed to delete file ${url}:`, e);
                }
                else {
                    console.error(`Failed to delete file ${url}:`, e);
                }
            }
        }
    }
    return {
        status: 'success',
        message: 'Documents deleted',
        deleted_count: docsToDelete.length,
        storage_cleaned: deletedUrls.size
    };
}
@@ -0,0 +1,8 @@
1
/** Arguments accepted by `retrieveAssets`. */
+ interface RetrieveParams {
2
/** Search text forwarded to the metadata store's search. */
+ query: string;
3
/** Number of results requested from the search. Defaults to 5 when omitted. */
+ top_k?: number;
4
/** Optional filter expression forwarded to the search unchanged. */
+ filter?: string;
5
/** When true, each result additionally carries the file bytes as Base64 in `binary_content` (or a `binary_error` note). */
+ include_binary?: boolean;
6
+ }
7
/** Searches the metadata store and resolves to the matching records with the raw `binary_data` field removed. */
+ export declare function retrieveAssets(params: RetrieveParams, config: any, logger?: any): Promise<any[]>;
8
+ export {};
@@ -0,0 +1,73 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.retrieveAssets = retrieveAssets;
4
+ const factory_1 = require("../../service/storage/factory");
5
+ const factory_2 = require("../../service/metadata/factory");
6
/**
 * Encodes binary content as Base64. Accepts a Buffer or anything
 * `Buffer.from` understands (e.g. a plain `Uint8Array`, whose own
 * `toString` ignores the encoding argument and would yield comma-joined
 * numbers). `null`/`undefined` map to `null`.
 */
function toBase64(data) {
    if (data === null || data === undefined) {
        return null;
    }
    const bytes = Buffer.isBuffer(data) ? data : Buffer.from(data);
    return bytes.toString('base64');
}
/**
 * Returns a plain, JSON-safe copy of a search record without its raw
 * `binary_data` field; typed-array vectors become ordinary number arrays.
 */
function toPlainRecord(doc) {
    const { binary_data, ...rest } = doc;
    if (rest.vector) {
        // The store may hand back a typed array such as Float32Array.
        rest.vector = Array.from(rest.vector);
    }
    // Records might be proxies or carry non-clonable getters; a JSON round trip
    // leaves a plain object that survives structured cloning.
    return JSON.parse(JSON.stringify(rest));
}
/**
 * Runs a search against the metadata store and returns serializable results.
 *
 * By default every result is the stored record minus `binary_data`. With
 * `include_binary`, each result additionally gets `binary_content` (Base64):
 * taken from the record itself for `inline` storage, or downloaded through
 * the storage provider for `local`/`tos` storage. A result whose content
 * cannot be fetched carries `binary_error` instead; other storage types are
 * returned without binary fields.
 *
 * @param params `{ query, top_k?, filter?, include_binary? }`; `top_k`
 *   falls back to 5 when falsy.
 * @param config Plugin configuration; `metadata_storage` (with a local
 *   default) and the optional `file_storage` are read.
 * @param logger Optional logger; `console.log` is used when absent.
 * @returns Array of plain result objects.
 */
async function retrieveAssets(params, config, logger) {
    const callMessage = `[ContextLake-Action] Calling retrieveAssets with params: ${JSON.stringify(params)}`;
    if (logger) {
        logger.info(callMessage);
    }
    else {
        // eslint-disable-next-line no-console
        console.log(callMessage);
    }
    const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
    const metadataProvider = (0, factory_2.createMetadataProvider)(metaConfig);
    let storageProvider;
    if (config.file_storage && config.file_storage.type) {
        storageProvider = (0, factory_1.createStorageProvider)(config.file_storage);
    }
    await metadataProvider.connect();
    const results = await metadataProvider.search(params.query, params.top_k || 5, params.filter);
    if (!params.include_binary) {
        // Keep the default response light: no binary payload at all.
        return results.map((doc) => toPlainRecord(doc));
    }
    return Promise.all(results.map(async (doc) => {
        const record = toPlainRecord(doc);
        try {
            if (record.storage_type === 'inline') {
                return { ...record, binary_content: toBase64(doc.binary_data) };
            }
            if (record.storage_type === 'local' || record.storage_type === 'tos') {
                if (!storageProvider) {
                    return { ...record, binary_error: 'Storage provider not configured' };
                }
                const buffer = await storageProvider.downloadFile(record.url);
                return { ...record, binary_content: toBase64(buffer) };
            }
            return record;
        }
        catch (e) {
            // One unreadable file must not fail the whole search; note it on the result.
            if (logger && typeof logger.warn === 'function') {
                logger.warn(`[ContextLake-Action] Failed to fetch binary content for ${record.url}`, { error: e && e.message ? e.message : String(e) });
            }
            return { ...record, binary_error: 'Failed to fetch content' };
        }
    }));
}
@@ -0,0 +1,7 @@
1
/** Result of loading one file, as consumed by the ingest action. */
+ export interface FileData {
2
/** Raw file bytes; the ingest action uses its length to choose inline vs. external storage. */
+ buffer: Buffer;
3
/** Text content handed to `splitText` by the ingest action. NOTE(review): presumably empty for files with no extractable text; confirm in the loader. */
+ text: string;
4
/** File type label; the ingest action stores it as `file_type` on each record. */
+ type: string;
5
+ }
6
/** Loads the file at `filePath` and resolves to its bytes, text and type. */
+ export declare function processFile(filePath: string): Promise<FileData>;
7
/** Splits `text` into chunks. NOTE(review): defaults for `chunkSize`/`overlap` are not visible here; the ingest action always passes both. */
+ export declare function splitText(text: string, chunkSize?: number, overlap?: number): string[];