@byted-las/contextlake-openclaw 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/dist/src/client/lancedb.js +1 -1
  2. package/dist/src/commands/cli.d.ts +2 -1
  3. package/dist/src/commands/cli.js +31 -8
  4. package/dist/src/commands/index.js +25 -6
  5. package/dist/src/commands/slashcmd.d.ts +6 -0
  6. package/dist/src/commands/slashcmd.js +90 -6
  7. package/dist/src/commands/tools.d.ts +2 -0
  8. package/dist/src/commands/tools.js +44 -37
  9. package/dist/src/lib/actions/ingest-source.d.ts +15 -0
  10. package/dist/src/lib/actions/ingest-source.js +193 -0
  11. package/dist/src/lib/actions/ingest.d.ts +13 -7
  12. package/dist/src/lib/actions/ingest.js +133 -58
  13. package/dist/src/lib/actions/las-api.d.ts +13 -0
  14. package/dist/src/lib/actions/las-api.js +105 -0
  15. package/dist/src/lib/actions/las-tools.d.ts +3 -0
  16. package/dist/src/lib/actions/las-tools.js +194 -0
  17. package/dist/src/lib/actions/las.d.ts +64 -0
  18. package/dist/src/lib/actions/las.js +72 -0
  19. package/dist/src/lib/actions/profiler.d.ts +3 -0
  20. package/dist/src/lib/actions/profiler.js +17 -1
  21. package/dist/src/lib/actions/retrieve.js +2 -8
  22. package/dist/src/lib/scripts/s3_catalog.py +10 -1
  23. package/dist/src/service/embedding/factory.js +1 -10
  24. package/dist/src/service/embedding/interface.d.ts +5 -0
  25. package/dist/src/service/embedding/remote.d.ts +1 -0
  26. package/dist/src/service/embedding/remote.js +31 -0
  27. package/dist/src/service/metadata/interface.d.ts +1 -0
  28. package/dist/src/service/metadata/local.d.ts +1 -0
  29. package/dist/src/service/metadata/local.js +6 -0
  30. package/dist/src/utils/config.js +11 -2
  31. package/dist/src/utils/credentials.d.ts +8 -0
  32. package/dist/src/utils/credentials.js +77 -0
  33. package/openclaw.plugin.json +1 -1
  34. package/package.json +1 -5
  35. package/src/client/lancedb.ts +1 -1
  36. package/src/commands/cli.ts +35 -9
  37. package/src/commands/index.ts +30 -6
  38. package/src/commands/slashcmd.ts +67 -7
  39. package/src/commands/tools.ts +49 -41
  40. package/src/lib/actions/ingest.ts +151 -71
  41. package/src/lib/actions/las-api.ts +119 -0
  42. package/src/lib/actions/las-tools.ts +196 -0
  43. package/src/lib/actions/profiler.ts +18 -1
  44. package/src/lib/actions/retrieve.ts +2 -10
  45. package/src/lib/scripts/s3_catalog.py +10 -1
  46. package/src/service/embedding/factory.ts +1 -8
  47. package/src/service/embedding/interface.ts +6 -0
  48. package/src/service/embedding/remote.ts +36 -0
  49. package/src/service/metadata/interface.ts +1 -0
  50. package/src/service/metadata/local.ts +7 -0
  51. package/src/utils/config.ts +13 -2
  52. package/src/utils/credentials.ts +50 -0
  53. package/bin/contextlake-openclaw.js +0 -5
  54. package/src/service/embedding/local.ts +0 -121
@@ -97,7 +97,7 @@ class ContextLakeLanceDBClient {
97
97
  }
98
98
  return await fallbackQuery.toArray();
99
99
  }
100
- const vector = await this.embeddingProvider.generateEmbedding(query);
100
+ const vector = await this.embeddingProvider.generateMultimodalEmbedding([{ type: 'text', text: query }]);
101
101
  // @ts-ignore
102
102
  let search = table.vectorSearch(vector).limit(normalizedLimit);
103
103
  if (filter) {
@@ -1,8 +1,9 @@
1
1
  import { ContextLakeConfig } from '../utils/config';
2
2
  export declare function getCliCommands(pluginConfig: ContextLakeConfig, logger: any): {
3
3
  connectAction: (datasource_name: string, options: any) => Promise<void>;
4
- ingestAction: (files: any, options: any) => Promise<void>;
4
+ ingestAction: (datasource_name: string) => Promise<void>;
5
5
  searchAction: (query: any, options: any) => Promise<void>;
6
6
  listAction: (options: any) => Promise<void>;
7
7
  deleteAction: (options: any) => Promise<void>;
8
+ onboardAction: () => Promise<void>;
8
9
  };
@@ -6,6 +6,7 @@ const ingest_1 = require("../lib/actions/ingest");
6
6
  const retrieve_1 = require("../lib/actions/retrieve");
7
7
  const manage_1 = require("../lib/actions/manage");
8
8
  const profiler_1 = require("../lib/actions/profiler");
9
+ const credentials_1 = require("../utils/credentials");
9
10
  function parseOptionalInt(value, fallback) {
10
11
  const parsed = Number.parseInt(String(value), 10);
11
12
  return Number.isFinite(parsed) ? parsed : fallback;
@@ -67,15 +68,11 @@ function getCliCommands(pluginConfig, logger) {
67
68
  process.exitCode = 1;
68
69
  }
69
70
  },
70
- ingestAction: async (files, options) => {
71
- logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest started`, { files, options });
71
+ ingestAction: async (datasource_name) => {
72
+ logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest started`, { datasource_name });
72
73
  try {
73
- const metadata = parseMetadata(options.metadata);
74
- const result = await (0, ingest_1.ingestAssets)({
75
- files,
76
- metadata,
77
- chunkSize: parseOptionalInt(options.chunkSize, 500),
78
- overlap: parseOptionalInt(options.overlap, 50)
74
+ const result = await (0, ingest_1.ingestSource)({
75
+ datasource_name
79
76
  }, pluginConfig, logger);
80
77
  // eslint-disable-next-line no-console
81
78
  console.log(JSON.stringify(result, null, 2));
@@ -134,6 +131,32 @@ function getCliCommands(pluginConfig, logger) {
134
131
  console.error('Error:', e.message);
135
132
  logger.error(`[${new Date().toISOString()}] [ContextLake] CLI delete failed`, { error: e.message, stack: e.stack });
136
133
  }
134
+ },
135
+ onboardAction: async () => {
136
+ logger.info(`[${new Date().toISOString()}] [ContextLake] CLI onboard started`);
137
+ try {
138
+ const currentCreds = (0, credentials_1.loadCredentials)();
139
+ // eslint-disable-next-line no-console
140
+ console.log('Welcome to ContextLake Onboarding!');
141
+ // eslint-disable-next-line no-console
142
+ console.log('Please provide your credentials below. Press enter to keep the current value.');
143
+ const lasApiKey = await (0, credentials_1.promptForInput)('LAS_API_KEY', currentCreds.LAS_API_KEY);
144
+ const volcengineAccessKey = await (0, credentials_1.promptForInput)('VOLCENGINE_ACCESS_KEY', currentCreds.VOLCENGINE_ACCESS_KEY);
145
+ const volcengineSecretKey = await (0, credentials_1.promptForInput)('VOLCENGINE_SECRET_KEY', currentCreds.VOLCENGINE_SECRET_KEY);
146
+ const newCreds = {
147
+ LAS_API_KEY: lasApiKey,
148
+ VOLCENGINE_ACCESS_KEY: volcengineAccessKey,
149
+ VOLCENGINE_SECRET_KEY: volcengineSecretKey
150
+ };
151
+ (0, credentials_1.saveCredentials)(newCreds);
152
+ // eslint-disable-next-line no-console
153
+ console.log('Credentials saved successfully!');
154
+ logger.info(`[${new Date().toISOString()}] [ContextLake] CLI onboard success`);
155
+ }
156
+ catch (e) {
157
+ console.error('Error during onboarding:', e.message);
158
+ logger.error(`[${new Date().toISOString()}] [ContextLake] CLI onboard failed`, { error: e.message, stack: e.stack });
159
+ }
137
160
  }
138
161
  };
139
162
  }
@@ -20,6 +20,12 @@ function registerAll(ctx, logger) {
20
20
  logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.deleteTool.name}`);
21
21
  ctx.registerTool(tools.lasDataProfilerTool);
22
22
  logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.lasDataProfilerTool.name}`);
23
+ ctx.registerTool(tools.listDatasourceTool);
24
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listDatasourceTool.name}`);
25
+ for (const lasTool of tools.lasTools) {
26
+ ctx.registerTool(lasTool);
27
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${lasTool.name}`);
28
+ }
23
29
  }
24
30
  catch (error) {
25
31
  logger.error(`[${new Date().toISOString()}] [ContextLake] Error registering agent tools: ${error.message}${error.stack ? '\\n' + error.stack : ''}`);
@@ -46,11 +52,8 @@ function registerAll(ctx, logger) {
46
52
  .option('--sample-rows <number>', 'Number of rows to sample per structured file', '100')
47
53
  .action(commands.connectAction);
48
54
  // Ingest
49
- contextlake.command('ingest <files...>')
50
- .description('Ingest one or more files into the knowledge base')
51
- .option('-c, --chunk-size <number>', 'Chunk size for text splitting', '500')
52
- .option('-o, --overlap <number>', 'Chunk overlap size', '50')
53
- .option('-m, --metadata <json>', 'JSON metadata to attach to the documents')
55
+ contextlake.command('ingest <datasource_name>')
56
+ .description('Process and ingest all files from a connected data source into the knowledge base')
54
57
  .action(commands.ingestAction);
55
58
  // Search
56
59
  contextlake.command('search <query>')
@@ -70,6 +73,10 @@ function registerAll(ctx, logger) {
70
73
  .option('--ids <ids...>', 'List of specific file IDs to delete')
71
74
  .option('-f, --filter <string>', 'Filter string to match documents for deletion')
72
75
  .action(commands.deleteAction);
76
+ // Onboard
77
+ contextlake.command('onboard')
78
+ .description('Configure credentials for ContextLake')
79
+ .action(commands.onboardAction);
73
80
  }, { commands: ['contextlake'] });
74
81
  logger.info(`[${new Date().toISOString()}] [ContextLake] CLI commands registered`);
75
82
  }
@@ -86,7 +93,7 @@ function registerAll(ctx, logger) {
86
93
  const slashCommands = (0, slashcmd_1.getSlashCommands)(pluginConfig, logger);
87
94
  ctx.registerCommand({
88
95
  name: 'contextlake-ingest',
89
- description: 'Ingest files into the knowledge base (usage: /contextlake-ingest file1 file2)',
96
+ description: 'Process and ingest all files from a connected data source (usage: /contextlake-ingest <datasource_name>)',
90
97
  acceptsArgs: true,
91
98
  handler: slashCommands.ingestHandler
92
99
  });
@@ -108,6 +115,18 @@ function registerAll(ctx, logger) {
108
115
  acceptsArgs: true,
109
116
  handler: slashCommands.deleteHandler
110
117
  });
118
+ ctx.registerCommand({
119
+ name: 'contextlake-profiler',
120
+ description: 'Connect to a data source and profile its structure (usage: /contextlake-profiler <datasource_name> <vendor> <bucket> <prefix>)',
121
+ acceptsArgs: true,
122
+ handler: slashCommands.profilerHandler
123
+ });
124
+ ctx.registerCommand({
125
+ name: 'contextlake-list-datasource',
126
+ description: 'List all connected and profiled data sources (usage: /contextlake-list-datasource)',
127
+ acceptsArgs: false,
128
+ handler: slashCommands.listDatasourceHandler
129
+ });
111
130
  logger.info(`[${new Date().toISOString()}] [ContextLake] Slash commands registered`);
112
131
  }
113
132
  catch (error) {
@@ -12,4 +12,10 @@ export declare function getSlashCommands(pluginConfig: ContextLakeConfig, logger
12
12
  deleteHandler: (commandCtx: any) => Promise<{
13
13
  text: string;
14
14
  }>;
15
+ profilerHandler: (commandCtx: any) => Promise<{
16
+ text: string;
17
+ }>;
18
+ listDatasourceHandler: (commandCtx: any) => Promise<{
19
+ text: string;
20
+ }>;
15
21
  };
@@ -1,9 +1,46 @@
1
1
  "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
2
35
  Object.defineProperty(exports, "__esModule", { value: true });
3
36
  exports.getSlashCommands = getSlashCommands;
4
37
  const ingest_1 = require("../lib/actions/ingest");
5
38
  const retrieve_1 = require("../lib/actions/retrieve");
6
39
  const manage_1 = require("../lib/actions/manage");
40
+ const profiler_1 = require("../lib/actions/profiler");
41
+ const fs = __importStar(require("fs"));
42
+ const path = __importStar(require("path"));
43
+ const os = __importStar(require("os"));
7
44
  function getSlashCommands(pluginConfig, logger) {
8
45
  return {
9
46
  ingestHandler: async (commandCtx) => {
@@ -12,12 +49,16 @@ function getSlashCommands(pluginConfig, logger) {
12
49
  logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest started`, { args });
13
50
  try {
14
51
  if (args.length === 0) {
15
- return { text: `**Error:** Missing files. Usage: /contextlake-ingest /path/to/file1 /path/to/file2` };
52
+ return { text: `**Error:** Missing datasource_name. Usage: /contextlake-ingest <datasource_name>` };
16
53
  }
17
- const result = await (0, ingest_1.ingestAssets)({
18
- files: args,
19
- metadata: {}
20
- }, pluginConfig, logger);
54
+ const datasource_name = args[0];
55
+ const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
56
+ const dsDir = path.join(BASE_DIR, datasource_name);
57
+ const dbPath = path.join(dsDir, 'catalog_db');
58
+ if (!fs.existsSync(dbPath)) {
59
+ return { text: `**Error:** Data source "${datasource_name}" has not been profiled yet.\n\nPlease run the profiler first using:\n\`/contextlake-profiler <datasource_name> <vendor> <bucket> <prefix> [endpoint] [ak] [sk] [region]\`` };
60
+ }
61
+ const result = await (0, ingest_1.ingestSource)({ datasource_name }, pluginConfig, logger);
21
62
  logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest completed`, { resultCount: result.length });
22
63
  return { text: `**Ingest Results (${result.length} files processed):**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
23
64
  }
@@ -86,6 +127,49 @@ function getSlashCommands(pluginConfig, logger) {
86
127
  logger.error(`[ContextLake] Slash delete failed`, { error: e.message });
87
128
  return { text: `**Error executing delete:** ${e.message}` };
88
129
  }
89
- }
130
+ },
131
+ profilerHandler: async (commandCtx) => {
132
+ const rawArgs = commandCtx.args || "";
133
+ const args = rawArgs.split(' ').filter((arg) => arg.trim() !== '');
134
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler started`, { args });
135
+ try {
136
+ if (args.length < 4) {
137
+ return { text: `**Error:** Missing arguments. Usage: /contextlake-profiler <datasource_name> <vendor> <bucket> <prefix> [endpoint] [ak] [sk] [region]` };
138
+ }
139
+ const [datasource_name, vendor, bucket, prefix, endpoint, access_key, secret_key, region] = args;
140
+ if (!['volcengine', 'alibaba', 'tencent', 'aws', 'local'].includes(vendor)) {
141
+ return { text: `**Error:** Invalid vendor. Must be one of: volcengine, alibaba, tencent, aws, local` };
142
+ }
143
+ const params = {
144
+ datasource_name,
145
+ vendor: vendor,
146
+ bucket,
147
+ prefix,
148
+ endpoint,
149
+ access_key,
150
+ secret_key,
151
+ region,
152
+ };
153
+ const result = await (0, profiler_1.connectDataSource)(params);
154
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler completed`, { result });
155
+ return { text: `**Profiler Results:**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
156
+ }
157
+ catch (e) {
158
+ logger.error(`[ContextLake] Slash profiler failed`, { error: e.message });
159
+ return { text: `**Error executing profiler:** ${e.message}` };
160
+ }
161
+ },
162
+ listDatasourceHandler: async (commandCtx) => {
163
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command list-datasource started`);
164
+ try {
165
+ const result = await (0, profiler_1.listDataSources)();
166
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command list-datasource completed`, { result });
167
+ return { text: `**Data Sources:**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
168
+ }
169
+ catch (e) {
170
+ logger.error(`[ContextLake] Slash list-datasource failed`, { error: e.message });
171
+ return { text: `**Error executing list-datasource:** ${e.message}` };
172
+ }
173
+ },
90
174
  };
91
175
  }
@@ -6,4 +6,6 @@ export declare function getAgentTools(pluginConfig: ContextLakeConfig, logger: a
6
6
  listTool: AnyAgentTool;
7
7
  deleteTool: AnyAgentTool;
8
8
  lasDataProfilerTool: AnyAgentTool;
9
+ listDatasourceTool: AnyAgentTool;
10
+ lasTools: AnyAgentTool[];
9
11
  };
@@ -5,44 +5,52 @@ const ingest_1 = require("../lib/actions/ingest");
5
5
  const retrieve_1 = require("../lib/actions/retrieve");
6
6
  const manage_1 = require("../lib/actions/manage");
7
7
  const profiler_1 = require("../lib/actions/profiler");
8
+ const las_tools_1 = require("../lib/actions/las-tools");
8
9
  function getAgentTools(pluginConfig, logger) {
10
+ const lasTools = (0, las_tools_1.getLasTools)(pluginConfig, logger);
9
11
  return {
12
+ lasTools,
13
+ listDatasourceTool: {
14
+ name: 'contextlake-list-datasource',
15
+ label: 'ContextLake List Datasources',
16
+ description: `List all connected and profiled data sources.`,
17
+ parameters: {
18
+ type: 'object',
19
+ properties: {},
20
+ required: [],
21
+ additionalProperties: false
22
+ },
23
+ async execute(toolCallId, params) {
24
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Executing list-datasource skill, toolCallId: ${toolCallId}`);
25
+ try {
26
+ const result = await (0, profiler_1.listDataSources)();
27
+ return {
28
+ content: [{ type: "text", text: JSON.stringify(result) }],
29
+ details: result
30
+ };
31
+ }
32
+ catch (error) {
33
+ logger.error(`[${new Date().toISOString()}] [ContextLake] list-datasource skill failed`, { error: error.message });
34
+ return {
35
+ content: [{ type: "text", text: String(error.message) }],
36
+ details: { error: error.message }
37
+ };
38
+ }
39
+ }
40
+ },
10
41
  ingestTool: {
11
42
  name: 'contextlake-ingest',
12
43
  label: 'ContextLake Ingest',
13
- description: `Upload, ingest, and index documents into the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
44
+ description: `Process and ingest all files from a connected data source into the knowledge base.
14
45
  Use this tool when the user wants to "将知识注入", "上传文件", "入库", "添加文档", "ingest files", or "add knowledge".
15
- Supports processing of various file types including PDF, Word, Markdown, and Text.
16
- Automatically handles text extraction, cleaning, chunking, embedding generation, and storage.
17
-
18
- Example User Queries:
19
- - "帮我把这个文档注入到知识湖中"
20
- - "上传这份 PDF 到知识库"
21
- - "Please ingest these documents into ContextLake"
22
- - "将 /path/to/doc.txt 添加到知识库"`,
46
+ Supports multimodal files (text, images, audio, video, pdf) by using LAS models to understand and embed them.
47
+ Must be called after a data source has been successfully profiled via \`las-data-profiler\`.`,
23
48
  parameters: {
24
49
  type: 'object',
25
50
  properties: {
26
- files: {
27
- type: 'array',
28
- items: { type: 'string' },
29
- description: 'List of file paths to ingest'
30
- },
31
- metadata: {
32
- type: 'object',
33
- description: 'Optional JSON metadata to attach to documents',
34
- additionalProperties: true
35
- },
36
- chunkSize: {
37
- type: 'integer',
38
- description: 'Chunk size for text splitting'
39
- },
40
- overlap: {
41
- type: 'integer',
42
- description: 'Overlap size for text splitting'
43
- }
51
+ datasource_name: { type: 'string', description: 'Name of the data source previously profiled' }
44
52
  },
45
- required: ['files'],
53
+ required: ['datasource_name'],
46
54
  additionalProperties: false
47
55
  },
48
56
  async execute(toolCallId, params) {
@@ -56,21 +64,21 @@ Example User Queries:
56
64
  catch (e) {
57
65
  logger.warn(`[ContextLake] Received string params, possibly toolCallId?`, { params });
58
66
  return {
59
- content: [{ type: "text", text: `Invalid params format: received string "${params}", expected object with 'files' array.` }],
60
- details: { error: `Invalid params format: received string "${params}", expected object with 'files' array.` }
67
+ content: [{ type: "text", text: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }],
68
+ details: { error: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }
61
69
  };
62
70
  }
63
71
  }
64
- if (!actualParams.files && actualParams.params && actualParams.params.files) {
72
+ if (!actualParams.datasource_name && actualParams.params && actualParams.params.datasource_name) {
65
73
  actualParams = actualParams.params;
66
74
  }
67
- if (!actualParams.files || !Array.isArray(actualParams.files)) {
75
+ if (!actualParams.datasource_name) {
68
76
  return {
69
- content: [{ type: "text", text: `Invalid params: 'files' must be an array. Received keys: ${Object.keys(actualParams)}` }],
70
- details: { error: `Invalid params: 'files' must be an array. Received keys: ${Object.keys(actualParams)}` }
77
+ content: [{ type: "text", text: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }],
78
+ details: { error: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }
71
79
  };
72
80
  }
73
- const result = await (0, ingest_1.ingestAssets)(actualParams, pluginConfig, logger);
81
+ const result = await (0, ingest_1.ingestSource)(actualParams, pluginConfig, logger);
74
82
  logger.info(`[${new Date().toISOString()}] [ContextLake] Ingest skill completed successfully`, { resultSummary: Array.isArray(result) ? `Processed ${result.length} items` : 'Success' });
75
83
  return {
76
84
  content: [{ type: "text", text: JSON.stringify(result) }],
@@ -81,8 +89,7 @@ Example User Queries:
81
89
  logger.error(`[${new Date().toISOString()}] [ContextLake] Ingest skill failed`, { error: error.message, stack: error.stack });
82
90
  return {
83
91
  content: [{ type: "text", text: String(error.message) }],
84
- details: { error: error.message
85
- }
92
+ details: { error: error.message }
86
93
  };
87
94
  }
88
95
  }
@@ -0,0 +1,15 @@
1
+ import { ContextLakeConfig } from '../../utils/config';
2
+ export interface IngestSourceParams {
3
+ datasource_name: string;
4
+ }
5
+ export declare function ingestSource(params: IngestSourceParams, config: ContextLakeConfig, logger?: any): Promise<({
6
+ file: any;
7
+ status: string;
8
+ chunks: number;
9
+ message?: undefined;
10
+ } | {
11
+ file: any;
12
+ status: string;
13
+ message: any;
14
+ chunks?: undefined;
15
+ })[]>;
@@ -0,0 +1,193 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.ingestSource = ingestSource;
37
+ const factory_1 = require("../../service/metadata/factory");
38
+ const las_api_1 = require("./las-api");
39
+ const lancedb = __importStar(require("@lancedb/lancedb"));
40
+ const path = __importStar(require("path"));
41
+ const fs = __importStar(require("fs"));
42
+ const os = __importStar(require("os"));
43
+ // @ts-ignore
44
+ const uuid_1 = require("uuid");
45
+ const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
46
+ async function ingestSource(params, config, logger) {
47
+ if (logger) {
48
+ logger.info(`[ContextLake-Action] Calling ingestSource with params: ${JSON.stringify(params)}`);
49
+ }
50
+ else {
51
+ // eslint-disable-next-line no-console
52
+ console.log(`[ContextLake-Action] Calling ingestSource with params: ${JSON.stringify(params)}`);
53
+ }
54
+ const dsDir = path.join(BASE_DIR, params.datasource_name);
55
+ const dbPath = path.join(dsDir, 'catalog_db');
56
+ if (!fs.existsSync(dbPath)) {
57
+ throw new Error(`Data source database not found at ${dbPath}. Please run profiler connect first.`);
58
+ }
59
+ const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
60
+ const metadataProvider = (0, factory_1.createMetadataProvider)(metaConfig);
61
+ await metadataProvider.connect();
62
+ const lasClient = new las_api_1.LasApiClient(config, logger);
63
+ const results = [];
64
+ // Connect to the profiler LanceDB to read the file catalog
65
+ const profilerDb = await lancedb.connect(dbPath);
66
+ const tableNames = await profilerDb.tableNames();
67
+ if (!tableNames.includes('file_catalog')) {
68
+ throw new Error(`table 'file_catalog' not found in ${dbPath}`);
69
+ }
70
+ const catalogTable = await profilerDb.openTable('file_catalog');
71
+ const files = await catalogTable.query().toArray();
72
+ logger?.info(`[ContextLake-Action] Found ${files.length} files in catalog`);
73
+ // Simple chunking for text
74
+ const splitText = (text, chunkSize = 500, overlap = 50) => {
75
+ const chunks = [];
76
+ if (!text)
77
+ return chunks;
78
+ let i = 0;
79
+ while (i < text.length) {
80
+ chunks.push(text.slice(i, i + chunkSize));
81
+ i += chunkSize - overlap;
82
+ }
83
+ return chunks;
84
+ };
85
+ const processText = async (text, fileInfo) => {
86
+ const chunks = splitText(text);
87
+ const docs = [];
88
+ for (const chunk of chunks) {
89
+ const vector = await metadataProvider.generateMultimodalEmbedding([{ type: 'text', text: chunk }]);
90
+ docs.push({
91
+ id: (0, uuid_1.v4)(),
92
+ vector,
93
+ text: chunk,
94
+ source: fileInfo.key,
95
+ file_type: fileInfo.category,
96
+ storage_type: 'source',
97
+ url: fileInfo.url || `tos://${fileInfo.bucket}/${fileInfo.key}`,
98
+ metadata: JSON.stringify({ datasource: params.datasource_name }),
99
+ created_at: Date.now(),
100
+ binary_data: Buffer.from('')
101
+ });
102
+ }
103
+ return docs;
104
+ };
105
+ for (const file of files) {
106
+ try {
107
+ logger?.info(`[ContextLake-Action] Processing file: ${file.key}, type: ${file.media_type}`);
108
+ let docs = [];
109
+ const fileUrl = file.url || `tos://${file.bucket}/${file.key}`;
110
+ if (file.media_type === 'pdf') {
111
+ // PDF Parse
112
+ const result = await lasClient.submitAndPoll('las_pdf_parse_doubao', {
113
+ url: fileUrl
114
+ });
115
+ const markdown = result.data?.markdown || '';
116
+ docs = await processText(markdown, file);
117
+ }
118
+ else if (file.media_type === 'image') {
119
+ // Multimodal Embedding directly
120
+ const vector = await metadataProvider.generateMultimodalEmbedding([
121
+ { type: 'image_url', image_url: { url: fileUrl } },
122
+ { type: 'text', text: 'This is an image from the dataset.' }
123
+ ]);
124
+ docs.push({
125
+ id: (0, uuid_1.v4)(),
126
+ vector,
127
+ text: 'Image from dataset',
128
+ source: file.key,
129
+ file_type: 'image',
130
+ storage_type: 'source',
131
+ url: fileUrl,
132
+ metadata: JSON.stringify({ datasource: params.datasource_name }),
133
+ created_at: Date.now(),
134
+ binary_data: Buffer.from('')
135
+ });
136
+ }
137
+ else if (file.media_type === 'audio') {
138
+ // ASR
139
+ const result = await lasClient.submitAndPoll('las_asr_pro', {
140
+ audio: { url: fileUrl, format: file.key.split('.').pop() || 'wav' },
141
+ request: { model_name: 'bigmodel' }
142
+ });
143
+ const text = result.data?.result?.text || '';
144
+ docs = await processText(text, file);
145
+ }
146
+ else if (file.media_type === 'video') {
147
+ // Video understanding -> text -> embedding
148
+ const result = await lasClient.submitAndPoll('las_long_video_understand', {
149
+ video_url: fileUrl,
150
+ query: "详细描述这个视频的内容",
151
+ model_name: "doubao-seed-2-0-lite-260215"
152
+ });
153
+ // Assuming video output is a text description somewhere in the response.
154
+ // Note: the exact structure depends on the API return, adjusting to generic text.
155
+ const text = JSON.stringify(result.data || '');
156
+ // Also need audio extract and ASR for video
157
+ // 1. Extract audio
158
+ // The output_path_template needs a unique path per video
159
+ const audioOutputPath = `tos://${file.bucket}/.tmp/audio/${(0, uuid_1.v4)()}.wav`;
160
+ await lasClient.process('las_audio_extract_and_split', {
161
+ input_path: fileUrl,
162
+ output_path_template: audioOutputPath,
163
+ output_format: 'wav'
164
+ });
165
+ // 2. ASR on the extracted audio
166
+ // Wait briefly for object to be available if needed (often synchronous but tos takes a ms)
167
+ const asrResult = await lasClient.submitAndPoll('las_asr_pro', {
168
+ audio: { url: audioOutputPath.replace('{index}.{output_file_ext}', '0.wav'), format: 'wav' },
169
+ request: { model_name: 'bigmodel' }
170
+ });
171
+ const audioText = asrResult.data?.result?.text || '';
172
+ // Combine video text and audio text
173
+ const combinedText = `Video Description: ${text}\n\nAudio Transcription: ${audioText}`;
174
+ docs = await processText(combinedText, file);
175
+ }
176
+ else if (file.category === 'structured' || file.category === 'non-structured') {
177
+ // If we had a direct text content, we could process it here.
178
+ // Assuming basic local download or similar is available, but for now we skip raw file reading from TOS in this demo script unless implemented.
179
+ // Fallback just logs
180
+ logger?.warn(`[ContextLake-Action] Skipping raw text/structured download for ${file.key} - implement TOS download if needed`);
181
+ }
182
+ if (docs.length > 0) {
183
+ await metadataProvider.addAssets(docs);
184
+ results.push({ file: file.key, status: 'success', chunks: docs.length });
185
+ }
186
+ }
187
+ catch (error) {
188
+ logger?.error(`[ContextLake-Action] Error processing ${file.key}: ${error.message}`);
189
+ results.push({ file: file.key, status: 'error', message: error.message });
190
+ }
191
+ }
192
+ return results;
193
+ }
@@ -1,9 +1,15 @@
1
1
  import { ContextLakeConfig } from '../../utils/config';
2
- interface IngestParams {
3
- files: string[];
4
- metadata?: Record<string, any>;
5
- chunkSize?: number;
6
- overlap?: number;
2
+ export interface IngestSourceParams {
3
+ datasource_name: string;
7
4
  }
8
- export declare function ingestAssets(params: IngestParams, config: ContextLakeConfig, logger?: any): Promise<any>;
9
- export {};
5
+ export declare function ingestSource(params: IngestSourceParams, config: ContextLakeConfig, logger?: any): Promise<({
6
+ file: any;
7
+ status: string;
8
+ chunks: number;
9
+ message?: undefined;
10
+ } | {
11
+ file: any;
12
+ status: string;
13
+ message: any;
14
+ chunks?: undefined;
15
+ })[]>;