@byted-las/contextlake-openclaw 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/README.md +45 -23
  2. package/dist/src/client/lancedb.js +1 -1
  3. package/dist/src/commands/cli.d.ts +2 -1
  4. package/dist/src/commands/cli.js +33 -8
  5. package/dist/src/commands/index.js +31 -6
  6. package/dist/src/commands/slashcmd.d.ts +6 -0
  7. package/dist/src/commands/slashcmd.js +90 -6
  8. package/dist/src/commands/tools.d.ts +5 -0
  9. package/dist/src/commands/tools.js +134 -39
  10. package/dist/src/lib/actions/ingest-source.d.ts +15 -0
  11. package/dist/src/lib/actions/ingest-source.js +193 -0
  12. package/dist/src/lib/actions/ingest.d.ts +13 -7
  13. package/dist/src/lib/actions/ingest.js +133 -58
  14. package/dist/src/lib/actions/lance-tools.d.ts +6 -0
  15. package/dist/src/lib/actions/lance-tools.js +51 -0
  16. package/dist/src/lib/actions/las-api.d.ts +13 -0
  17. package/dist/src/lib/actions/las-api.js +105 -0
  18. package/dist/src/lib/actions/las-tools.d.ts +3 -0
  19. package/dist/src/lib/actions/las-tools.js +194 -0
  20. package/dist/src/lib/actions/las.d.ts +64 -0
  21. package/dist/src/lib/actions/las.js +72 -0
  22. package/dist/src/lib/actions/profiler.d.ts +3 -0
  23. package/dist/src/lib/actions/profiler.js +135 -140
  24. package/dist/src/lib/actions/retrieve.js +2 -8
  25. package/dist/src/lib/actions/s3-tools.d.ts +18 -0
  26. package/dist/src/lib/actions/s3-tools.js +167 -0
  27. package/dist/src/lib/scripts/s3_catalog.py +10 -1
  28. package/dist/src/service/embedding/factory.js +1 -10
  29. package/dist/src/service/embedding/interface.d.ts +5 -0
  30. package/dist/src/service/embedding/remote.d.ts +1 -0
  31. package/dist/src/service/embedding/remote.js +31 -0
  32. package/dist/src/service/metadata/interface.d.ts +1 -0
  33. package/dist/src/service/metadata/local.d.ts +1 -0
  34. package/dist/src/service/metadata/local.js +6 -0
  35. package/dist/src/skills/SKILL.md +14 -151
  36. package/dist/src/skills/las-data-profiler/SKILL.md +14 -151
  37. package/dist/src/utils/config.js +13 -3
  38. package/dist/src/utils/credentials.d.ts +12 -0
  39. package/dist/src/utils/credentials.js +77 -0
  40. package/openclaw.plugin.json +1 -1
  41. package/package.json +2 -5
  42. package/src/client/lancedb.ts +1 -1
  43. package/src/commands/cli.ts +37 -9
  44. package/src/commands/index.ts +39 -6
  45. package/src/commands/slashcmd.ts +67 -7
  46. package/src/commands/tools.ts +140 -45
  47. package/src/lib/actions/ingest.ts +151 -71
  48. package/src/lib/actions/lance-tools.ts +23 -0
  49. package/src/lib/actions/las-api.ts +119 -0
  50. package/src/lib/actions/las-tools.ts +196 -0
  51. package/src/lib/actions/profiler.ts +134 -161
  52. package/src/lib/actions/retrieve.ts +2 -10
  53. package/src/lib/actions/s3-tools.ts +148 -0
  54. package/src/service/embedding/factory.ts +1 -8
  55. package/src/service/embedding/interface.ts +6 -0
  56. package/src/service/embedding/remote.ts +36 -0
  57. package/src/service/metadata/interface.ts +1 -0
  58. package/src/service/metadata/local.ts +7 -0
  59. package/src/skills/las-data-profiler/SKILL.md +14 -151
  60. package/src/utils/config.ts +15 -3
  61. package/src/utils/credentials.ts +56 -0
  62. package/bin/contextlake-openclaw.js +0 -5
  63. package/src/lib/scripts/s3_catalog.py +0 -608
  64. package/src/service/embedding/local.ts +0 -121
package/README.md CHANGED
@@ -1,27 +1,32 @@
1
1
  # ContextLake OpenClaw Plugin
2
2
 
3
- A powerful, serverless-first Knowledge Base / Knowledge Lake (知识库/知识湖) plugin for the OpenClaw Agent framework.
3
+ A powerful, multi-modal Knowledge Base / Knowledge Lake (知识库/知识湖) plugin for the OpenClaw Agent framework.
4
4
 
5
- This plugin allows OpenClaw agents to natively understand, index, and retrieve local and remote documents (PDF, Word, Markdown, Text) using vector similarity search. It features a pluggable architecture supporting both completely offline operations (via LanceDB and local GGUF models) and cloud-based operations.
5
+ This plugin allows OpenClaw agents to natively understand, profile, index, and retrieve data from diverse data sources (Local, TOS, S3, etc.). It supports advanced multi-modal document ingestion (Text, PDF, Images, Audio, Video) powered by Large AI Services (LAS) and vector similarity search.
6
6
 
7
7
  ## Features
8
8
 
9
- - **Multi-modal Document Ingestion**: Automatically extracts text from PDF, DOCX, TXT, and MD files.
10
- - **Agentic Tools**: Exposes `contextlake-ingest`, `contextlake-retrieve`, `contextlake-list`, and `contextlake-delete` skills directly to the LLM.
11
- - **Slash Commands**: Quick chat actions (`/contextlake-search`, `/contextlake-ingest`) for fast human-in-the-loop operations.
12
- - **CLI Management**: Full command-line interface for bulk operations and CI/CD integration.
13
- - **Local-First & Pluggable**:
14
- - Uses `LanceDB` for local embedded vector storage.
15
- - Uses `node-llama-cpp` for offline, private embedding generation (`bge-small-zh` by default).
16
- - Can be easily configured to use remote embedding APIs (like OpenAI or Doubao ARK) or remote storage (TOS).
9
+ - **Data Profiling**: Connect to heterogeneous data sources (TOS/OSS/COS/S3/Local) and auto-profile schemas and media metadata.
10
+ - **Multi-modal Ingestion**: Automatically extracts text, converts PDFs, parses audio (ASR), understands video, and embeds images directly using LAS multi-modal models.
11
+ - **Agentic Tools**: Exposes `contextlake-ingest`, `contextlake-retrieve`, `contextlake-list`, `contextlake-delete`, `las-data-profiler`, and `contextlake-list-datasource` skills directly to the LLM.
12
+ - **Slash Commands**: Quick chat actions (`/contextlake-search`, `/contextlake-ingest`, `/contextlake-profiler`) for fast human-in-the-loop operations.
13
+ - **CLI Management**: Full command-line interface for bulk operations and pipeline integration.
14
+ - **Pluggable Architecture**:
15
+ - Uses `LanceDB` for local embedded multi-modal vector storage.
16
+ - Pluggable storage providers (Local and TOS).
17
+
18
+ ## Documentation
19
+
20
+ For a comprehensive guide on installation, initialization (`onboard`), connecting data sources (`connect`/`profiler`), ingestion, and searching, please refer to the [**USAGE.md**](./USAGE.md).
21
+
22
+ For detailed architecture and internal workflows, please refer to [AGENTS.md](./AGENTS.md).
17
23
 
18
24
  ## Installation
19
25
 
20
26
  Assuming you are in an OpenClaw environment:
21
27
 
22
28
  ```bash
23
- npm install
24
- npm run build
29
+ npm install @byted-las/contextlake-openclaw
25
30
  ```
26
31
 
27
32
  Then register the plugin in your OpenClaw configuration:
@@ -30,7 +35,7 @@ Then register the plugin in your OpenClaw configuration:
30
35
  {
31
36
  "plugins": [
32
37
  {
33
- "package": "contextlake-openclaw",
38
+ "package": "@byted-las/contextlake-openclaw",
34
39
  "config": {
35
40
  "metadata_storage": {
36
41
  "type": "local",
@@ -46,19 +51,36 @@ Then register the plugin in your OpenClaw configuration:
46
51
  }
47
52
  ```
48
53
 
54
+ ## Quick Start (Slash Commands)
55
+
56
+ 1. **Initialize Credentials** (Run in terminal):
57
+ ```bash
58
+ openclaw contextlake onboard
59
+ ```
60
+ 2. **Profile a Data Source**:
61
+ ```text
62
+ /contextlake-profiler my_data local /path/to/my/files .
63
+ ```
64
+ 3. **Ingest the Data Source** (Processes PDF, Audio, Images, Text):
65
+ ```text
66
+ /contextlake-ingest my_data
67
+ ```
68
+ 4. **Search your Knowledge Base**:
69
+ ```text
70
+ /contextlake-search How do I configure the API?
71
+ ```
72
+
49
73
  ## Agent Tools (Skills)
50
74
 
51
75
  The LLM agent automatically has access to the following tools:
52
76
 
53
- 1. **`contextlake-ingest`**: Uploads and indexes files.
54
- 2. **`contextlake-retrieve`**: Performs vector search on the indexed documents based on a query.
55
- 3. **`contextlake-list`**: Lists all currently indexed documents.
56
- 4. **`contextlake-delete`**: Removes documents by ID or filter.
77
+ 1. **`las-data-profiler`**: Connect to a data source and profile its structure.
78
+ 2. **`contextlake-list-datasource`**: List all connected and profiled data sources.
79
+ 3. **`contextlake-ingest`**: Process and ingest all profiled files into the knowledge base via multi-modal embeddings.
80
+ 4. **`contextlake-retrieve`**: Performs vector search on the indexed documents based on a query.
81
+ 5. **`contextlake-list`**: Lists all currently indexed documents.
82
+ 6. **`contextlake-delete`**: Removes documents by ID or filter.
57
83
 
58
84
  You can simply talk to the agent:
59
- > "帮我把这个 `manual.pdf` 注入到知识库中。"
60
- > "在知识湖中检索关于架构设计的文档。"
61
-
62
- ## Development
63
-
64
- For detailed architecture and internal workflows, please refer to [AGENTS.md](./AGENTS.md).
85
+ > "帮我把 `my_data` 数据源注入到知识库中。"
86
+ > "在知识湖中检索关于架构设计的文档。"
package/dist/src/client/lancedb.js CHANGED
@@ -97,7 +97,7 @@ class ContextLakeLanceDBClient {
97
97
  }
98
98
  return await fallbackQuery.toArray();
99
99
  }
100
- const vector = await this.embeddingProvider.generateEmbedding(query);
100
+ const vector = await this.embeddingProvider.generateMultimodalEmbedding([{ type: 'text', text: query }]);
101
101
  // @ts-ignore
102
102
  let search = table.vectorSearch(vector).limit(normalizedLimit);
103
103
  if (filter) {
package/dist/src/commands/cli.d.ts CHANGED
@@ -1,8 +1,9 @@
1
1
  import { ContextLakeConfig } from '../utils/config';
2
2
  export declare function getCliCommands(pluginConfig: ContextLakeConfig, logger: any): {
3
3
  connectAction: (datasource_name: string, options: any) => Promise<void>;
4
- ingestAction: (files: any, options: any) => Promise<void>;
4
+ ingestAction: (datasource_name: string) => Promise<void>;
5
5
  searchAction: (query: any, options: any) => Promise<void>;
6
6
  listAction: (options: any) => Promise<void>;
7
7
  deleteAction: (options: any) => Promise<void>;
8
+ onboardAction: () => Promise<void>;
8
9
  };
package/dist/src/commands/cli.js CHANGED
@@ -6,6 +6,7 @@ const ingest_1 = require("../lib/actions/ingest");
6
6
  const retrieve_1 = require("../lib/actions/retrieve");
7
7
  const manage_1 = require("../lib/actions/manage");
8
8
  const profiler_1 = require("../lib/actions/profiler");
9
+ const credentials_1 = require("../utils/credentials");
9
10
  function parseOptionalInt(value, fallback) {
10
11
  const parsed = Number.parseInt(String(value), 10);
11
12
  return Number.isFinite(parsed) ? parsed : fallback;
@@ -67,15 +68,11 @@ function getCliCommands(pluginConfig, logger) {
67
68
  process.exitCode = 1;
68
69
  }
69
70
  },
70
- ingestAction: async (files, options) => {
71
- logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest started`, { files, options });
71
+ ingestAction: async (datasource_name) => {
72
+ logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest started`, { datasource_name });
72
73
  try {
73
- const metadata = parseMetadata(options.metadata);
74
- const result = await (0, ingest_1.ingestAssets)({
75
- files,
76
- metadata,
77
- chunkSize: parseOptionalInt(options.chunkSize, 500),
78
- overlap: parseOptionalInt(options.overlap, 50)
74
+ const result = await (0, ingest_1.ingestSource)({
75
+ datasource_name
79
76
  }, pluginConfig, logger);
80
77
  // eslint-disable-next-line no-console
81
78
  console.log(JSON.stringify(result, null, 2));
@@ -134,6 +131,34 @@ function getCliCommands(pluginConfig, logger) {
134
131
  console.error('Error:', e.message);
135
132
  logger.error(`[${new Date().toISOString()}] [ContextLake] CLI delete failed`, { error: e.message, stack: e.stack });
136
133
  }
134
+ },
135
+ onboardAction: async () => {
136
+ logger.info(`[${new Date().toISOString()}] [ContextLake] CLI onboard started`);
137
+ try {
138
+ const currentCreds = (0, credentials_1.loadCredentials)();
139
+ // eslint-disable-next-line no-console
140
+ console.log('Welcome to ContextLake Onboarding!');
141
+ // eslint-disable-next-line no-console
142
+ console.log('Please provide your credentials below. Press enter to keep the current value.');
143
+ const lasApiKey = await (0, credentials_1.promptForInput)('LAS_API_KEY', currentCreds.LAS_API_KEY);
144
+ const accessKey = await (0, credentials_1.promptForInput)('ACCESS_KEY', currentCreds.ACCESS_KEY || currentCreds.VOLCENGINE_ACCESS_KEY);
145
+ const secretKey = await (0, credentials_1.promptForInput)('SECRET_KEY', currentCreds.SECRET_KEY || currentCreds.VOLCENGINE_SECRET_KEY);
146
+ const region = await (0, credentials_1.promptForInput)('REGION', currentCreds.REGION || currentCreds.VOLCENGINE_REGION || 'cn-beijing');
147
+ const newCreds = {
148
+ LAS_API_KEY: lasApiKey,
149
+ ACCESS_KEY: accessKey,
150
+ SECRET_KEY: secretKey,
151
+ REGION: region
152
+ };
153
+ (0, credentials_1.saveCredentials)(newCreds);
154
+ // eslint-disable-next-line no-console
155
+ console.log('Credentials saved successfully!');
156
+ logger.info(`[${new Date().toISOString()}] [ContextLake] CLI onboard success`);
157
+ }
158
+ catch (e) {
159
+ console.error('Error during onboarding:', e.message);
160
+ logger.error(`[${new Date().toISOString()}] [ContextLake] CLI onboard failed`, { error: e.message, stack: e.stack });
161
+ }
137
162
  }
138
163
  };
139
164
  }
package/dist/src/commands/index.js CHANGED
@@ -20,6 +20,18 @@ function registerAll(ctx, logger) {
20
20
  logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.deleteTool.name}`);
21
21
  ctx.registerTool(tools.lasDataProfilerTool);
22
22
  logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.lasDataProfilerTool.name}`);
23
+ ctx.registerTool(tools.listS3ObjectsTool);
24
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listS3ObjectsTool.name}`);
25
+ ctx.registerTool(tools.readS3ObjectTool);
26
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.readS3ObjectTool.name}`);
27
+ ctx.registerTool(tools.writeLanceCatalogTool);
28
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.writeLanceCatalogTool.name}`);
29
+ ctx.registerTool(tools.listDatasourceTool);
30
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listDatasourceTool.name}`);
31
+ for (const lasTool of tools.lasTools) {
32
+ ctx.registerTool(lasTool);
33
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${lasTool.name}`);
34
+ }
23
35
  }
24
36
  catch (error) {
25
37
  logger.error(`[${new Date().toISOString()}] [ContextLake] Error registering agent tools: ${error.message}${error.stack ? '\\n' + error.stack : ''}`);
@@ -46,11 +58,8 @@ function registerAll(ctx, logger) {
46
58
  .option('--sample-rows <number>', 'Number of rows to sample per structured file', '100')
47
59
  .action(commands.connectAction);
48
60
  // Ingest
49
- contextlake.command('ingest <files...>')
50
- .description('Ingest one or more files into the knowledge base')
51
- .option('-c, --chunk-size <number>', 'Chunk size for text splitting', '500')
52
- .option('-o, --overlap <number>', 'Chunk overlap size', '50')
53
- .option('-m, --metadata <json>', 'JSON metadata to attach to the documents')
61
+ contextlake.command('ingest <datasource_name>')
62
+ .description('Process and ingest all files from a connected data source into the knowledge base')
54
63
  .action(commands.ingestAction);
55
64
  // Search
56
65
  contextlake.command('search <query>')
@@ -70,6 +79,10 @@ function registerAll(ctx, logger) {
70
79
  .option('--ids <ids...>', 'List of specific file IDs to delete')
71
80
  .option('-f, --filter <string>', 'Filter string to match documents for deletion')
72
81
  .action(commands.deleteAction);
82
+ // Onboard
83
+ contextlake.command('onboard')
84
+ .description('Configure credentials for ContextLake')
85
+ .action(commands.onboardAction);
73
86
  }, { commands: ['contextlake'] });
74
87
  logger.info(`[${new Date().toISOString()}] [ContextLake] CLI commands registered`);
75
88
  }
@@ -86,7 +99,7 @@ function registerAll(ctx, logger) {
86
99
  const slashCommands = (0, slashcmd_1.getSlashCommands)(pluginConfig, logger);
87
100
  ctx.registerCommand({
88
101
  name: 'contextlake-ingest',
89
- description: 'Ingest files into the knowledge base (usage: /contextlake-ingest file1 file2)',
102
+ description: 'Process and ingest all files from a connected data source (usage: /contextlake-ingest <datasource_name>)',
90
103
  acceptsArgs: true,
91
104
  handler: slashCommands.ingestHandler
92
105
  });
@@ -108,6 +121,18 @@ function registerAll(ctx, logger) {
108
121
  acceptsArgs: true,
109
122
  handler: slashCommands.deleteHandler
110
123
  });
124
+ ctx.registerCommand({
125
+ name: 'contextlake-profiler',
126
+ description: 'Connect to a data source and profile its structure (usage: /contextlake-profiler <datasource_name> <vendor> <bucket> <prefix>)',
127
+ acceptsArgs: true,
128
+ handler: slashCommands.profilerHandler
129
+ });
130
+ ctx.registerCommand({
131
+ name: 'contextlake-list-datasource',
132
+ description: 'List all connected and profiled data sources (usage: /contextlake-list-datasource)',
133
+ acceptsArgs: false,
134
+ handler: slashCommands.listDatasourceHandler
135
+ });
111
136
  logger.info(`[${new Date().toISOString()}] [ContextLake] Slash commands registered`);
112
137
  }
113
138
  catch (error) {
package/dist/src/commands/slashcmd.d.ts CHANGED
@@ -12,4 +12,10 @@ export declare function getSlashCommands(pluginConfig: ContextLakeConfig, logger
12
12
  deleteHandler: (commandCtx: any) => Promise<{
13
13
  text: string;
14
14
  }>;
15
+ profilerHandler: (commandCtx: any) => Promise<{
16
+ text: string;
17
+ }>;
18
+ listDatasourceHandler: (commandCtx: any) => Promise<{
19
+ text: string;
20
+ }>;
15
21
  };
package/dist/src/commands/slashcmd.js CHANGED
@@ -1,9 +1,46 @@
1
1
  "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
2
35
  Object.defineProperty(exports, "__esModule", { value: true });
3
36
  exports.getSlashCommands = getSlashCommands;
4
37
  const ingest_1 = require("../lib/actions/ingest");
5
38
  const retrieve_1 = require("../lib/actions/retrieve");
6
39
  const manage_1 = require("../lib/actions/manage");
40
+ const profiler_1 = require("../lib/actions/profiler");
41
+ const fs = __importStar(require("fs"));
42
+ const path = __importStar(require("path"));
43
+ const os = __importStar(require("os"));
7
44
  function getSlashCommands(pluginConfig, logger) {
8
45
  return {
9
46
  ingestHandler: async (commandCtx) => {
@@ -12,12 +49,16 @@ function getSlashCommands(pluginConfig, logger) {
12
49
  logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest started`, { args });
13
50
  try {
14
51
  if (args.length === 0) {
15
- return { text: `**Error:** Missing files. Usage: /contextlake-ingest /path/to/file1 /path/to/file2` };
52
+ return { text: `**Error:** Missing datasource_name. Usage: /contextlake-ingest <datasource_name>` };
16
53
  }
17
- const result = await (0, ingest_1.ingestAssets)({
18
- files: args,
19
- metadata: {}
20
- }, pluginConfig, logger);
54
+ const datasource_name = args[0];
55
+ const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
56
+ const dsDir = path.join(BASE_DIR, datasource_name);
57
+ const dbPath = path.join(dsDir, 'catalog_db');
58
+ if (!fs.existsSync(dbPath)) {
59
+ return { text: `**Error:** Data source "${datasource_name}" has not been profiled yet.\n\nPlease run the profiler first using:\n\`/contextlake-profiler <datasource_name> <vendor> <bucket> <prefix> [endpoint] [ak] [sk] [region]\`` };
60
+ }
61
+ const result = await (0, ingest_1.ingestSource)({ datasource_name }, pluginConfig, logger);
21
62
  logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest completed`, { resultCount: result.length });
22
63
  return { text: `**Ingest Results (${result.length} files processed):**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
23
64
  }
@@ -86,6 +127,49 @@ function getSlashCommands(pluginConfig, logger) {
86
127
  logger.error(`[ContextLake] Slash delete failed`, { error: e.message });
87
128
  return { text: `**Error executing delete:** ${e.message}` };
88
129
  }
89
- }
130
+ },
131
+ profilerHandler: async (commandCtx) => {
132
+ const rawArgs = commandCtx.args || "";
133
+ const args = rawArgs.split(' ').filter((arg) => arg.trim() !== '');
134
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler started`, { args });
135
+ try {
136
+ if (args.length < 4) {
137
+ return { text: `**Error:** Missing arguments. Usage: /contextlake-profiler <datasource_name> <vendor> <bucket> <prefix> [endpoint] [ak] [sk] [region]` };
138
+ }
139
+ const [datasource_name, vendor, bucket, prefix, endpoint, access_key, secret_key, region] = args;
140
+ if (!['volcengine', 'alibaba', 'tencent', 'aws', 'local'].includes(vendor)) {
141
+ return { text: `**Error:** Invalid vendor. Must be one of: volcengine, alibaba, tencent, aws, local` };
142
+ }
143
+ const params = {
144
+ datasource_name,
145
+ vendor: vendor,
146
+ bucket,
147
+ prefix,
148
+ endpoint,
149
+ access_key,
150
+ secret_key,
151
+ region,
152
+ };
153
+ const result = await (0, profiler_1.connectDataSource)(params);
154
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler completed`, { result });
155
+ return { text: `**Profiler Results:**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
156
+ }
157
+ catch (e) {
158
+ logger.error(`[ContextLake] Slash profiler failed`, { error: e.message });
159
+ return { text: `**Error executing profiler:** ${e.message}` };
160
+ }
161
+ },
162
+ listDatasourceHandler: async (commandCtx) => {
163
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command list-datasource started`);
164
+ try {
165
+ const result = await (0, profiler_1.listDataSources)();
166
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command list-datasource completed`, { result });
167
+ return { text: `**Data Sources:**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
168
+ }
169
+ catch (e) {
170
+ logger.error(`[ContextLake] Slash list-datasource failed`, { error: e.message });
171
+ return { text: `**Error executing list-datasource:** ${e.message}` };
172
+ }
173
+ },
90
174
  };
91
175
  }
package/dist/src/commands/tools.d.ts CHANGED
@@ -6,4 +6,9 @@ export declare function getAgentTools(pluginConfig: ContextLakeConfig, logger: a
6
6
  listTool: AnyAgentTool;
7
7
  deleteTool: AnyAgentTool;
8
8
  lasDataProfilerTool: AnyAgentTool;
9
+ listDatasourceTool: AnyAgentTool;
10
+ listS3ObjectsTool: AnyAgentTool;
11
+ readS3ObjectTool: AnyAgentTool;
12
+ writeLanceCatalogTool: AnyAgentTool;
13
+ lasTools: AnyAgentTool[];
9
14
  };
package/dist/src/commands/tools.js CHANGED
@@ -5,44 +5,54 @@ const ingest_1 = require("../lib/actions/ingest");
5
5
  const retrieve_1 = require("../lib/actions/retrieve");
6
6
  const manage_1 = require("../lib/actions/manage");
7
7
  const profiler_1 = require("../lib/actions/profiler");
8
+ const las_tools_1 = require("../lib/actions/las-tools");
9
+ const s3_tools_1 = require("../lib/actions/s3-tools");
10
+ const lance_tools_1 = require("../lib/actions/lance-tools");
8
11
  function getAgentTools(pluginConfig, logger) {
12
+ const lasTools = (0, las_tools_1.getLasTools)(pluginConfig, logger);
9
13
  return {
14
+ lasTools,
15
+ listDatasourceTool: {
16
+ name: 'contextlake-list-datasource',
17
+ label: 'ContextLake List Datasources',
18
+ description: `List all connected and profiled data sources.`,
19
+ parameters: {
20
+ type: 'object',
21
+ properties: {},
22
+ required: [],
23
+ additionalProperties: false
24
+ },
25
+ async execute(toolCallId, params) {
26
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Executing list-datasource skill, toolCallId: ${toolCallId}`);
27
+ try {
28
+ const result = await (0, profiler_1.listDataSources)();
29
+ return {
30
+ content: [{ type: "text", text: JSON.stringify(result) }],
31
+ details: result
32
+ };
33
+ }
34
+ catch (error) {
35
+ logger.error(`[${new Date().toISOString()}] [ContextLake] list-datasource skill failed`, { error: error.message });
36
+ return {
37
+ content: [{ type: "text", text: String(error.message) }],
38
+ details: { error: error.message }
39
+ };
40
+ }
41
+ }
42
+ },
10
43
  ingestTool: {
11
44
  name: 'contextlake-ingest',
12
45
  label: 'ContextLake Ingest',
13
- description: `Upload, ingest, and index documents into the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
46
+ description: `Process and ingest all files from a connected data source into the knowledge base.
14
47
  Use this tool when the user wants to "将知识注入", "上传文件", "入库", "添加文档", "ingest files", or "add knowledge".
15
- Supports processing of various file types including PDF, Word, Markdown, and Text.
16
- Automatically handles text extraction, cleaning, chunking, embedding generation, and storage.
17
-
18
- Example User Queries:
19
- - "帮我把这个文档注入到知识湖中"
20
- - "上传这份 PDF 到知识库"
21
- - "Please ingest these documents into ContextLake"
22
- - "将 /path/to/doc.txt 添加到知识库"`,
48
+ Supports multimodal files (text, images, audio, video, pdf) by using LAS models to understand and embed them.
49
+ Must be called after a data source has been successfully profiled via \`las-data-profiler\`.`,
23
50
  parameters: {
24
51
  type: 'object',
25
52
  properties: {
26
- files: {
27
- type: 'array',
28
- items: { type: 'string' },
29
- description: 'List of file paths to ingest'
30
- },
31
- metadata: {
32
- type: 'object',
33
- description: 'Optional JSON metadata to attach to documents',
34
- additionalProperties: true
35
- },
36
- chunkSize: {
37
- type: 'integer',
38
- description: 'Chunk size for text splitting'
39
- },
40
- overlap: {
41
- type: 'integer',
42
- description: 'Overlap size for text splitting'
43
- }
53
+ datasource_name: { type: 'string', description: 'Name of the data source previously profiled' }
44
54
  },
45
- required: ['files'],
55
+ required: ['datasource_name'],
46
56
  additionalProperties: false
47
57
  },
48
58
  async execute(toolCallId, params) {
@@ -56,21 +66,21 @@ Example User Queries:
56
66
  catch (e) {
57
67
  logger.warn(`[ContextLake] Received string params, possibly toolCallId?`, { params });
58
68
  return {
59
- content: [{ type: "text", text: `Invalid params format: received string "${params}", expected object with 'files' array.` }],
60
- details: { error: `Invalid params format: received string "${params}", expected object with 'files' array.` }
69
+ content: [{ type: "text", text: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }],
70
+ details: { error: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }
61
71
  };
62
72
  }
63
73
  }
64
- if (!actualParams.files && actualParams.params && actualParams.params.files) {
74
+ if (!actualParams.datasource_name && actualParams.params && actualParams.params.datasource_name) {
65
75
  actualParams = actualParams.params;
66
76
  }
67
- if (!actualParams.files || !Array.isArray(actualParams.files)) {
77
+ if (!actualParams.datasource_name) {
68
78
  return {
69
- content: [{ type: "text", text: `Invalid params: 'files' must be an array. Received keys: ${Object.keys(actualParams)}` }],
70
- details: { error: `Invalid params: 'files' must be an array. Received keys: ${Object.keys(actualParams)}` }
79
+ content: [{ type: "text", text: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }],
80
+ details: { error: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }
71
81
  };
72
82
  }
73
- const result = await (0, ingest_1.ingestAssets)(actualParams, pluginConfig, logger);
83
+ const result = await (0, ingest_1.ingestSource)(actualParams, pluginConfig, logger);
74
84
  logger.info(`[${new Date().toISOString()}] [ContextLake] Ingest skill completed successfully`, { resultSummary: Array.isArray(result) ? `Processed ${result.length} items` : 'Success' });
75
85
  return {
76
86
  content: [{ type: "text", text: JSON.stringify(result) }],
@@ -81,8 +91,7 @@ Example User Queries:
81
91
  logger.error(`[${new Date().toISOString()}] [ContextLake] Ingest skill failed`, { error: error.message, stack: error.stack });
82
92
  return {
83
93
  content: [{ type: "text", text: String(error.message) }],
84
- details: { error: error.message
85
- }
94
+ details: { error: error.message }
86
95
  };
87
96
  }
88
97
  }
@@ -274,11 +283,97 @@ Example User Queries:
274
283
  logger.error(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill failed`, { error: error.message, stack: error.stack });
275
284
  return {
276
285
  content: [{ type: "text", text: String(error.message) }],
277
- details: { error: error.message
278
- }
286
+ details: { error: error.message }
279
287
  };
280
288
  }
281
289
  }
290
+ },
291
+ listS3ObjectsTool: {
292
+ name: 'list-s3-objects',
293
+ label: 'List S3 Objects',
294
+ description: 'List objects in an S3-compatible bucket or local directory',
295
+ parameters: {
296
+ type: 'object',
297
+ properties: {
298
+ vendor: { type: 'string', enum: ['volcengine', 'alibaba', 'tencent', 'aws', 'local'] },
299
+ bucket: { type: 'string' },
300
+ prefix: { type: 'string' },
301
+ endpoint: { type: 'string' },
302
+ access_key: { type: 'string' },
303
+ secret_key: { type: 'string' },
304
+ region: { type: 'string' },
305
+ maxKeys: { type: 'integer' },
306
+ continuationToken: { type: 'string' }
307
+ },
308
+ required: ['vendor', 'bucket'],
309
+ additionalProperties: false
310
+ },
311
+ async execute(toolCallId, params) {
312
+ let actualParams = params.params || params;
313
+ try {
314
+ const result = await (0, s3_tools_1.listS3Objects)(actualParams, actualParams.prefix || '', actualParams.maxKeys, actualParams.continuationToken);
315
+ return { content: [{ type: "text", text: JSON.stringify(result) }], details: result };
316
+ }
317
+ catch (e) {
318
+ return { content: [{ type: "text", text: String(e.message) }], details: { error: e.message } };
319
+ }
320
+ }
321
+ },
322
+ readS3ObjectTool: {
323
+ name: 'read-s3-object',
324
+ label: 'Read S3 Object',
325
+ description: 'Read the contents or headers of an S3 object',
326
+ parameters: {
327
+ type: 'object',
328
+ properties: {
329
+ vendor: { type: 'string', enum: ['volcengine', 'alibaba', 'tencent', 'aws', 'local'] },
330
+ bucket: { type: 'string' },
331
+ key: { type: 'string' },
332
+ endpoint: { type: 'string' },
333
+ access_key: { type: 'string' },
334
+ secret_key: { type: 'string' },
335
+ region: { type: 'string' },
336
+ maxBytes: { type: 'integer' }
337
+ },
338
+ required: ['vendor', 'bucket', 'key'],
339
+ additionalProperties: false
340
+ },
341
+ async execute(toolCallId, params) {
342
+ let actualParams = params.params || params;
343
+ try {
344
+ const buf = await (0, s3_tools_1.readS3Object)(actualParams, actualParams.key, actualParams.maxBytes);
345
+ // Return as base64 string
346
+ return { content: [{ type: "text", text: buf.toString('base64') }], details: { length: buf.length } };
347
+ }
348
+ catch (e) {
349
+ return { content: [{ type: "text", text: String(e.message) }], details: { error: e.message } };
350
+ }
351
+ }
352
+ },
353
+ writeLanceCatalogTool: {
354
+ name: 'write-lance-catalog',
355
+ label: 'Write LanceDB Catalog',
356
+ description: 'Write an array of file records into a local LanceDB table',
357
+ parameters: {
358
+ type: 'object',
359
+ properties: {
360
+ db_path: { type: 'string' },
361
+ table_name: { type: 'string' },
362
+ records: { type: 'array', items: { type: 'object' } }
363
+ },
364
+ required: ['db_path', 'table_name', 'records'],
365
+ additionalProperties: false
366
+ },
367
+ async execute(toolCallId, params) {
368
+ let actualParams = params.params || params;
369
+ try {
370
+ await (0, lance_tools_1.writeLanceCatalog)(actualParams);
371
+ return { content: [{ type: "text", text: "Successfully wrote records to LanceDB" }], details: { count: actualParams.records.length } };
372
+ }
373
+ catch (e) {
374
+ return { content: [{ type: "text", text: String(e.message) }], details: { error: e.message } };
375
+ }
376
+ }
282
377
  }
283
378
  };
284
379
  }
@@ -0,0 +1,15 @@
1
+ import { ContextLakeConfig } from '../../utils/config';
2
+ export interface IngestSourceParams {
3
+ datasource_name: string;
4
+ }
5
+ export declare function ingestSource(params: IngestSourceParams, config: ContextLakeConfig, logger?: any): Promise<({
6
+ file: any;
7
+ status: string;
8
+ chunks: number;
9
+ message?: undefined;
10
+ } | {
11
+ file: any;
12
+ status: string;
13
+ message: any;
14
+ chunks?: undefined;
15
+ })[]>;