@byted-las/contextlake-openclaw 1.0.6 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,5 @@
1
1
  import { ContextLakeConfig } from '../utils/config';
2
2
  export declare function getCliCommands(pluginConfig: ContextLakeConfig, logger: any): {
3
- connectAction: (datasource_name: string, url: string, options: any) => Promise<void>;
4
- ingestAction: (datasource_name: string) => Promise<void>;
5
3
  searchAction: (query: any, options: any) => Promise<void>;
6
4
  listAction: (options: any) => Promise<void>;
7
5
  deleteAction: (options: any) => Promise<void>;
@@ -1,11 +1,8 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.getCliCommands = getCliCommands;
4
- // @ts-ignore
5
- const ingest_1 = require("../lib/actions/ingest");
6
4
  const retrieve_1 = require("../lib/actions/retrieve");
7
5
  const manage_1 = require("../lib/actions/manage");
8
- const profiler_1 = require("../lib/actions/profiler");
9
6
  const credentials_1 = require("../utils/credentials");
10
7
  function parseOptionalInt(value, fallback) {
11
8
  const parsed = Number.parseInt(String(value), 10);
@@ -34,49 +31,6 @@ function parseMetadata(metadata) {
34
31
  }
35
32
  function getCliCommands(pluginConfig, logger) {
36
33
  return {
37
- connectAction: async (datasource_name, url, options) => {
38
- logger.info(`[${new Date().toISOString()}] [ContextLake] CLI connect started`, { datasource_name, url, options });
39
- try {
40
- const params = {
41
- datasource_name,
42
- url,
43
- endpoint: options.endpoint,
44
- access_key: options.ak,
45
- secret_key: options.sk,
46
- region: options.region,
47
- sample_rows: parseInt(options.sampleRows),
48
- };
49
- // eslint-disable-next-line no-console
50
- console.log(`[contextlake connect] Connecting to datasource "${datasource_name}"...`);
51
- // eslint-disable-next-line no-console
52
- console.log(` url: ${params.url}`);
53
- const result = await (0, profiler_1.connectDataSource)(params);
54
- // eslint-disable-next-line no-console
55
- console.log(JSON.stringify(result, null, 2));
56
- logger.info(`[${new Date().toISOString()}] [ContextLake] CLI connect success`);
57
- }
58
- catch (e) {
59
- // eslint-disable-next-line no-console
60
- console.error('Error:', e.message);
61
- logger.error(`[${new Date().toISOString()}] [ContextLake] CLI connect failed`, { error: e.message, stack: e.stack });
62
- process.exitCode = 1;
63
- }
64
- },
65
- ingestAction: async (datasource_name) => {
66
- logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest started`, { datasource_name });
67
- try {
68
- const result = await (0, ingest_1.ingestSource)({
69
- datasource_name
70
- }, pluginConfig, logger);
71
- // eslint-disable-next-line no-console
72
- console.log(JSON.stringify(result, null, 2));
73
- logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest success`);
74
- }
75
- catch (e) {
76
- console.error('Error:', e.message);
77
- logger.error(`[${new Date().toISOString()}] [ContextLake] CLI ingest failed`, { error: e.message, stack: e.stack });
78
- }
79
- },
80
34
  searchAction: async (query, options) => {
81
35
  logger.info(`[${new Date().toISOString()}] [ContextLake] CLI search started`, { query, options });
82
36
  try {
@@ -10,16 +10,12 @@ function registerAll(ctx, logger) {
10
10
  // Register Agent Tools
11
11
  try {
12
12
  const tools = (0, tools_1.getAgentTools)(pluginConfig, logger);
13
- ctx.registerTool(tools.ingestTool);
14
- logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.ingestTool.name}`);
15
13
  ctx.registerTool(tools.retrieveTool);
16
14
  logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.retrieveTool.name}`);
17
15
  ctx.registerTool(tools.listTool);
18
16
  logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listTool.name}`);
19
17
  ctx.registerTool(tools.deleteTool);
20
18
  logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.deleteTool.name}`);
21
- ctx.registerTool(tools.lasDataProfilerTool);
22
- logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.lasDataProfilerTool.name}`);
23
19
  ctx.registerTool(tools.listS3ObjectsTool);
24
20
  logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listS3ObjectsTool.name}`);
25
21
  ctx.registerTool(tools.readS3ObjectTool);
@@ -49,19 +45,6 @@ function registerAll(ctx, logger) {
49
45
  const contextlake = program.command('contextlake')
50
46
  .description('Manage ContextLake knowledge base');
51
47
  const commands = (0, cli_1.getCliCommands)(pluginConfig, logger);
52
- // connect -- data source profiling (las-data-profiler)
53
- contextlake.command('connect <datasource_name> <url>')
54
- .description('Connect to a data source and profile its structure, schemas, and media metadata into LanceDB')
55
- .option('--endpoint <url>', 'S3 Endpoint URL (not needed for local)')
56
- .option('--ak <credential_id>', 'Credential ID for the data source')
57
- .option('--sk <credential_value>', 'Credential value for the data source')
58
- .option('--region <region>', 'Region identifier (e.g. cn-beijing)')
59
- .option('--sample-rows <number>', 'Number of rows to sample per structured file', '100')
60
- .action(commands.connectAction);
61
- // Ingest
62
- contextlake.command('ingest <datasource_name>')
63
- .description('Process and ingest all files from a connected data source into the knowledge base')
64
- .action(commands.ingestAction);
65
48
  // Search
66
49
  contextlake.command('search <query>')
67
50
  .description('Search the knowledge base for relevant documents')
@@ -98,12 +81,6 @@ function registerAll(ctx, logger) {
98
81
  return;
99
82
  }
100
83
  const slashCommands = (0, slashcmd_1.getSlashCommands)(pluginConfig, logger);
101
- ctx.registerCommand({
102
- name: 'contextlake-ingest',
103
- description: 'Process and ingest all files from a connected data source (usage: /contextlake-ingest <datasource_name>)',
104
- acceptsArgs: true,
105
- handler: slashCommands.ingestHandler
106
- });
107
84
  ctx.registerCommand({
108
85
  name: 'contextlake-list',
109
86
  description: 'List documents currently in the knowledge base',
@@ -122,12 +99,6 @@ function registerAll(ctx, logger) {
122
99
  acceptsArgs: true,
123
100
  handler: slashCommands.deleteHandler
124
101
  });
125
- ctx.registerCommand({
126
- name: 'contextlake-profiler',
127
- description: 'Connect to a data source and profile its structure (usage: /contextlake-profiler <datasource_name> <vendor> <bucket> <prefix>)',
128
- acceptsArgs: true,
129
- handler: slashCommands.profilerHandler
130
- });
131
102
  ctx.registerCommand({
132
103
  name: 'contextlake-list-datasource',
133
104
  description: 'List all connected and profiled data sources (usage: /contextlake-list-datasource)',
@@ -1,8 +1,5 @@
1
1
  import { ContextLakeConfig } from '../utils/config';
2
2
  export declare function getSlashCommands(pluginConfig: ContextLakeConfig, logger: any): {
3
- ingestHandler: (commandCtx: any) => Promise<{
4
- text: string;
5
- }>;
6
3
  listHandler: (commandCtx: any) => Promise<{
7
4
  text: string;
8
5
  }>;
@@ -12,9 +9,6 @@ export declare function getSlashCommands(pluginConfig: ContextLakeConfig, logger
12
9
  deleteHandler: (commandCtx: any) => Promise<{
13
10
  text: string;
14
11
  }>;
15
- profilerHandler: (commandCtx: any) => Promise<{
16
- text: string;
17
- }>;
18
12
  listDatasourceHandler: (commandCtx: any) => Promise<{
19
13
  text: string;
20
14
  }>;
@@ -1,72 +1,11 @@
1
1
  "use strict";
2
- var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
- if (k2 === undefined) k2 = k;
4
- var desc = Object.getOwnPropertyDescriptor(m, k);
5
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
- desc = { enumerable: true, get: function() { return m[k]; } };
7
- }
8
- Object.defineProperty(o, k2, desc);
9
- }) : (function(o, m, k, k2) {
10
- if (k2 === undefined) k2 = k;
11
- o[k2] = m[k];
12
- }));
13
- var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
- Object.defineProperty(o, "default", { enumerable: true, value: v });
15
- }) : function(o, v) {
16
- o["default"] = v;
17
- });
18
- var __importStar = (this && this.__importStar) || (function () {
19
- var ownKeys = function(o) {
20
- ownKeys = Object.getOwnPropertyNames || function (o) {
21
- var ar = [];
22
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
- return ar;
24
- };
25
- return ownKeys(o);
26
- };
27
- return function (mod) {
28
- if (mod && mod.__esModule) return mod;
29
- var result = {};
30
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
- __setModuleDefault(result, mod);
32
- return result;
33
- };
34
- })();
35
2
  Object.defineProperty(exports, "__esModule", { value: true });
36
3
  exports.getSlashCommands = getSlashCommands;
37
- const ingest_1 = require("../lib/actions/ingest");
38
4
  const retrieve_1 = require("../lib/actions/retrieve");
39
5
  const manage_1 = require("../lib/actions/manage");
40
6
  const profiler_1 = require("../lib/actions/profiler");
41
- const fs = __importStar(require("fs"));
42
- const path = __importStar(require("path"));
43
- const os = __importStar(require("os"));
44
7
  function getSlashCommands(pluginConfig, logger) {
45
8
  return {
46
- ingestHandler: async (commandCtx) => {
47
- const rawArgs = commandCtx.args || "";
48
- const args = rawArgs.split(' ').filter((arg) => arg.trim() !== '');
49
- logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest started`, { args });
50
- try {
51
- if (args.length === 0) {
52
- return { text: `**Error:** Missing datasource_name. Usage: /contextlake-ingest <datasource_name>` };
53
- }
54
- const datasource_name = args[0];
55
- const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
56
- const dsDir = path.join(BASE_DIR, datasource_name);
57
- const dbPath = path.join(dsDir, 'catalog_db');
58
- if (!fs.existsSync(dbPath)) {
59
- return { text: `**Error:** Data source "${datasource_name}" has not been profiled yet.\n\nPlease run the profiler first using:\n\`/contextlake-profiler <datasource_name> <vendor> <bucket> <prefix> [endpoint] [ak] [sk] [region]\`` };
60
- }
61
- const result = await (0, ingest_1.ingestSource)({ datasource_name }, pluginConfig, logger);
62
- logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest completed`, { resultCount: result.length });
63
- return { text: `**Ingest Results (${result.length} files processed):**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
64
- }
65
- catch (e) {
66
- logger.error(`[ContextLake] Slash ingest failed`, { error: e.message });
67
- return { text: `**Error executing ingest:** ${e.message}` };
68
- }
69
- },
70
9
  listHandler: async (commandCtx) => {
71
10
  const rawArgs = commandCtx.args || "";
72
11
  const args = rawArgs.split(' ').filter((arg) => arg.trim() !== '');
@@ -128,32 +67,6 @@ function getSlashCommands(pluginConfig, logger) {
128
67
  return { text: `**Error executing delete:** ${e.message}` };
129
68
  }
130
69
  },
131
- profilerHandler: async (commandCtx) => {
132
- const rawArgs = commandCtx.args || "";
133
- const args = rawArgs.split(' ').filter((arg) => arg.trim() !== '');
134
- logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler started`, { args });
135
- try {
136
- if (args.length < 2) {
137
- return { text: `**Error:** Missing arguments. Usage: /contextlake-profiler <datasource_name> <url> [endpoint] [ak] [sk] [region]` };
138
- }
139
- const [datasource_name, url, endpoint, access_key, secret_key, region] = args;
140
- const params = {
141
- datasource_name,
142
- url,
143
- endpoint,
144
- access_key,
145
- secret_key,
146
- region,
147
- };
148
- const result = await (0, profiler_1.connectDataSource)(params);
149
- logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler completed`, { result });
150
- return { text: `**Profiler Results:**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
151
- }
152
- catch (e) {
153
- logger.error(`[ContextLake] Slash profiler failed`, { error: e.message });
154
- return { text: `**Error executing profiler:** ${e.message}` };
155
- }
156
- },
157
70
  listDatasourceHandler: async (commandCtx) => {
158
71
  logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command list-datasource started`);
159
72
  try {
@@ -1,11 +1,9 @@
1
1
  import { ContextLakeConfig } from '../utils/config';
2
2
  import type { AnyAgentTool } from 'openclaw/plugin-sdk';
3
3
  export declare function getAgentTools(pluginConfig: ContextLakeConfig, logger: any): {
4
- ingestTool: AnyAgentTool;
5
4
  retrieveTool: AnyAgentTool;
6
5
  listTool: AnyAgentTool;
7
6
  deleteTool: AnyAgentTool;
8
- lasDataProfilerTool: AnyAgentTool;
9
7
  listDatasourceTool: AnyAgentTool;
10
8
  listS3ObjectsTool: AnyAgentTool;
11
9
  readS3ObjectTool: AnyAgentTool;
@@ -1,7 +1,6 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.getAgentTools = getAgentTools;
4
- const ingest_1 = require("../lib/actions/ingest");
5
4
  const retrieve_1 = require("../lib/actions/retrieve");
6
5
  const manage_1 = require("../lib/actions/manage");
7
6
  const profiler_1 = require("../lib/actions/profiler");
@@ -40,62 +39,6 @@ function getAgentTools(pluginConfig, logger) {
40
39
  }
41
40
  }
42
41
  },
43
- ingestTool: {
44
- name: 'contextlake-ingest',
45
- label: 'ContextLake Ingest',
46
- description: `Process and ingest all files from a connected data source into the knowledge base.
47
- Use this tool when the user wants to "将知识注入", "上传文件", "入库", "添加文档", "ingest files", or "add knowledge".
48
- Supports multimodal files (text, images, audio, video, pdf) by using LAS models to understand and embed them.
49
- Must be called after a data source has been successfully profiled via \`las-data-profiler\`.`,
50
- parameters: {
51
- type: 'object',
52
- properties: {
53
- datasource_name: { type: 'string', description: 'Name of the data source previously profiled' }
54
- },
55
- required: ['datasource_name'],
56
- additionalProperties: false
57
- },
58
- async execute(toolCallId, params) {
59
- logger.info(`[${new Date().toISOString()}] [ContextLake] Executing ingest skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
60
- try {
61
- let actualParams = params;
62
- if (typeof params === 'string') {
63
- try {
64
- actualParams = JSON.parse(params);
65
- }
66
- catch (e) {
67
- logger.warn(`[ContextLake] Received string params, possibly toolCallId?`, { params });
68
- return {
69
- content: [{ type: "text", text: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }],
70
- details: { error: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }
71
- };
72
- }
73
- }
74
- if (!actualParams.datasource_name && actualParams.params && actualParams.params.datasource_name) {
75
- actualParams = actualParams.params;
76
- }
77
- if (!actualParams.datasource_name) {
78
- return {
79
- content: [{ type: "text", text: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }],
80
- details: { error: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }
81
- };
82
- }
83
- const result = await (0, ingest_1.ingestSource)(actualParams, pluginConfig, logger);
84
- logger.info(`[${new Date().toISOString()}] [ContextLake] Ingest skill completed successfully`, { resultSummary: Array.isArray(result) ? `Processed ${result.length} items` : 'Success' });
85
- return {
86
- content: [{ type: "text", text: JSON.stringify(result) }],
87
- details: result
88
- };
89
- }
90
- catch (error) {
91
- logger.error(`[${new Date().toISOString()}] [ContextLake] Ingest skill failed`, { error: error.message, stack: error.stack });
92
- return {
93
- content: [{ type: "text", text: String(error.message) }],
94
- details: { error: error.message }
95
- };
96
- }
97
- }
98
- },
99
42
  retrieveTool: {
100
43
  name: 'contextlake-retrieve',
101
44
  label: 'ContextLake Retrieve',
@@ -245,43 +188,6 @@ Example User Queries:
245
188
  }
246
189
  }
247
190
  },
248
- lasDataProfilerTool: {
249
- name: 'las-data-profiler',
250
- label: 'LAS Data Profiler',
251
- description: 'Connect to a data source (TOS/OSS/COS/S3/Local) and profile its structure, schemas, and media metadata into LanceDB',
252
- parameters: {
253
- type: 'object',
254
- properties: {
255
- datasource_name: { type: 'string', description: 'Name of the data source' },
256
- url: { type: 'string', description: 'Data source URL (e.g. tos://bucket/prefix, oss://..., s3://..., file:///path)' },
257
- sample_rows: { type: 'integer', description: 'Number of rows to sample per structured file' }
258
- },
259
- required: ['datasource_name', 'url'],
260
- additionalProperties: false
261
- },
262
- async execute(toolCallId, params) {
263
- logger.info(`[${new Date().toISOString()}] [ContextLake] Executing las-data-profiler skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
264
- try {
265
- let actualParams = params;
266
- if (params && params.params) {
267
- actualParams = params.params;
268
- }
269
- const result = await (0, profiler_1.connectDataSource)(actualParams);
270
- logger.info(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill completed`, { result });
271
- return {
272
- content: [{ type: "text", text: JSON.stringify(result) }],
273
- details: result
274
- };
275
- }
276
- catch (error) {
277
- logger.error(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill failed`, { error: error.message, stack: error.stack });
278
- return {
279
- content: [{ type: "text", text: String(error.message) }],
280
- details: { error: error.message }
281
- };
282
- }
283
- }
284
- },
285
191
  listS3ObjectsTool: {
286
192
  name: 'list-s3-objects',
287
193
  label: 'List S3 Objects',
@@ -1,11 +1,13 @@
1
1
  ---
2
2
  name: byted-las-data-profiler
3
3
  description: |
4
- Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them.
5
- It writes the catalog index to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
4
+ Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them,
5
+ performs schema inference and column semantic analysis on structured data (JSONL/CSV/Parquet/JSON),
6
+ extracts key meta-information for media files (Image/Audio/Video/PDF) by reading only header bytes,
7
+ and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
6
8
 
7
9
  IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
8
- You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`, `las-data-profiler`) to accomplish the profiling tasks.
10
+ You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) to accomplish the profiling tasks.
9
11
  ---
10
12
 
11
13
  ## Trigger Scenarios
@@ -16,24 +18,28 @@ Be sure to use this Skill when the user mentions the following scenarios:
16
18
  - Need to understand what a batch of data files contains and what their schema looks like
17
19
  - Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
18
20
  - Need to write the meta-information of object storage or local files into LanceDB
21
+ - Mentions TOS, boto3, or object storage data profiling
19
22
  - Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
23
+ - Need to batch identify the type and size of remote/local files and build an index
24
+ - Need to quickly understand the structure of an unfamiliar dataset (which files exist, what the schema looks like, and what the fields mean)
25
+ - Need to connect/dock a data source for profiling
26
+ - Mentions keywords like "connect a data source" or "dock a data source"
20
27
 
21
- ## Overall Workflow
22
- When instructed to profile a dataset, you should prefer using the `las-data-profiler` tool directly, which automatically handles the S3 listing and LanceDB writing using internal TypeScript logic.
23
- If you need to perform custom exploration, you can use `list-s3-objects` to traverse the bucket and `read-s3-object` to read file headers, and `write-lance-catalog` to save results.
28
+ ## Overview
29
+ This Skill acts as a Dataset Profiling Guide. You should use the `list-s3-objects` tool to traverse the S3 bucket or local directory, use `read-s3-object` to read file contents or headers, parse the schema or media metadata, and finally use `write-lance-catalog` to save the catalog into a local LanceDB.
24
30
 
25
- ## Parameter Description (for `las-data-profiler` tool)
26
- | Parameter | Description | Example |
27
- |-----------|-------------|---------|
28
- | datasource_name | The name of the data source | my_tos_data |
29
- | vendor | volcengine / alibaba / tencent / aws / local | volcengine |
30
- | endpoint | S3 Endpoint URL (not required for local) | https://tos-s3-cn-beijing.volces.com |
31
- | access_key | AK | - |
32
- | secret_key | SK | - |
33
- | region | Region identifier | cn-beijing |
34
- | bucket | Bucket name (root directory path when local) | my-data-bucket |
35
- | prefix | Path prefix to limit the scan scope | datasets/2024/ |
31
+ 1. **Cataloging**: Use `list-s3-objects` to record the meta-information (path, size, etc.) of files.
32
+ 2. **Understanding Structured Data**: Use `read-s3-object` to sample JSONL / CSV / TSV / Parquet / JSON.
33
+ 3. **Extracting Media Meta-information**: Use `read-s3-object` with `maxBytes` to read only the file header (without downloading the full file) for images, audio, video, and PDFs to extract key attributes.
34
+ 4. **Writing to LanceDB**: Use `write-lance-catalog` to save the results.
36
35
 
37
36
  ## Output Location
38
37
  - LanceDB table storage path: `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`
39
- - Configuration file: `~/.openclaw/contextlake/profiler/{datasource_name}/env.sh`
38
+ - Table names: `files`, `structured_schemas`, `media_metadata`
39
+
40
+ ## Available Tools for this Skill
41
+ - `list-s3-objects`: To traverse and list files in the bucket/directory.
42
+ - `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction.
43
+ - `write-lance-catalog`: To write the profiling results to the LanceDB catalog.
44
+
45
+ Always report the final profiling summary back to the user once the `write-lance-catalog` completes successfully.
@@ -1,42 +1,59 @@
1
1
  ---
2
2
  name: contextlake-ingest
3
3
  description: |
4
- Upload, ingest, and index documents into the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
5
- Use this tool when the user wants to "将知识注入", "上传文件", "入库", "添加文档", "ingest files", or "add knowledge".
6
- Supports processing of various file types including PDF, Word, Markdown, and Text.
7
- Automatically handles text extraction, cleaning, chunking, embedding generation, and storage.
8
-
9
- Example User Queries:
10
- - "帮我把这个文档注入到知识湖中"
11
- - "上传这份 PDF 到知识库"
12
- - "Please ingest these documents into ContextLake"
13
- - "将 /path/to/doc.txt 添加到知识库"
14
-
15
- Example Tool Call:
16
- ```json
17
- {
18
- "files": ["/absolute/path/to/document.pdf"],
19
- "metadata": { "category": "tech" }
20
- }
21
- ```
22
-
23
- parameters:
24
- files:
25
- type: array
26
- items:
27
- type: string
28
- description: List of file paths to ingest
29
- required: true
30
- metadata:
31
- type: object
32
- description: Optional JSON metadata to attach to documents
33
- required: false
34
- chunkSize:
35
- type: integer
36
- description: Chunk size for text splitting
37
- required: false
38
- overlap:
39
- type: integer
40
- description: Overlap size for text splitting
41
- required: false
4
+ A guide for OpenClaw agents on how to perform data ingestion into ContextLake.
5
+ Use this skill to understand the correct workflow for profiling a dataset and then ingesting it.
6
+
7
+ When a user asks to "ingest data", "导入数据", "接入数据源", or "处理某个桶里的数据",
8
+ you MUST follow the two-step workflow outlined below using the specific tools provided.
9
+
10
+ IMPORTANT RULE: You are STRICTLY FORBIDDEN from using any tools outside of this project for ingestion.
11
+ Do not use custom Python scripts, external CLI tools, or unrelated plugins.
12
+ ---
13
+
14
+ # ContextLake Data Ingestion Workflow Guide
15
+
16
+ To successfully ingest data into the ContextLake Knowledge Base, you **MUST** follow a strict two-step process.
17
+
18
+ ## Step 1: Data Profiling
19
+ Before data can be ingested, the source MUST first be profiled to extract its schema, structure, and media metadata into a local LanceDB catalog.
20
+
21
+ **Tool to use:** OpenClaw Agent using `byted-las-data-profiler` skill to guide the profiling process.
22
+
23
+ ### How it works:
24
+ - **Purpose:** Connects to an S3-compatible source (TOS, OSS, AWS S3, etc.) or local file system and catalogs the files.
25
+ - **Underlying tools used:** The agent will use `list-s3-objects`, `read-s3-object`, and `write-lance-catalog` to perform the profiling step by step.
26
+
27
+ *Wait for Step 1 to complete successfully before proceeding to Step 2.*
28
+
42
29
  ---
30
+
31
+ ## Step 2: Data Ingestion
32
+ Once the data source is successfully profiled and the catalog is created, you can proceed to ingest the data into ContextLake.
33
+
34
+ **Tool to use:** OpenClaw Agent using basic tools guided by this skill.
35
+
36
+ ### How it works:
37
+ - **Purpose:** Reads the LanceDB catalog created in Step 1, processes the multimodal files (text, images, audio, video, PDF) using LAS models, chunks the data, generates embeddings, and indexes them into the ContextLake Knowledge Base.
38
+ - **Underlying tools used:**
39
+ 1. Use `read-lance-catalog` to read the catalog of files from `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`.
40
+ 2. For each file, use appropriate LAS tools (like `las_pdf_parse_doubao`, `las_image_resample`, `las_long_video_understand`) to extract text and features.
41
+ 3. Chunk and process the text.
42
+ 4. Use the embedding tool or model to generate vectors.
43
+ 5. Save the final chunks and vectors into the main ContextLake knowledge base.
44
+
45
+ *Note: You are acting as the ingestion pipeline. You must coordinate the reading of the catalog and the processing of each file type using the available LAS tools.*
46
+
47
+ ---
48
+
49
+ ## Auxiliary Tools (Use only when necessary)
50
+ If you need to verify the catalog contents between Step 1 and Step 2, or if ingestion fails and you need to debug:
51
+
52
+ - **`read-lance-catalog`**: Use this tool to read the records from the catalog database created in Step 1.
53
+ - Requires `db_path` (e.g., `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`) and `table_name` (usually "files").
54
+ - **`list-s3-objects` / `read-s3-object`**: Use these tools to manually inspect the raw source files if profiling fails.
55
+ - **`contextlake-list-datasource`**: Use this tool to see all data sources that have been connected/profiled.
56
+
57
+ ## Error Handling
58
+ - If the ingestion step fails because the data source cannot be found, ensure that the data profiling step (Step 1) completed successfully for that exact `datasource_name`.
59
+ - Always report the results of both steps to the user clearly.
@@ -1,11 +1,13 @@
1
1
  ---
2
2
  name: byted-las-data-profiler
3
3
  description: |
4
- Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them.
5
- It writes the catalog index to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
4
+ Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them,
5
+ performs schema inference and column semantic analysis on structured data (JSONL/CSV/Parquet/JSON),
6
+ extracts key meta-information for media files (Image/Audio/Video/PDF) by reading only header bytes,
7
+ and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
6
8
 
7
9
  IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
8
- You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`, `las-data-profiler`) to accomplish the profiling tasks.
10
+ You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) to accomplish the profiling tasks.
9
11
  ---
10
12
 
11
13
  ## Trigger Scenarios
@@ -16,24 +18,28 @@ Be sure to use this Skill when the user mentions the following scenarios:
16
18
  - Need to understand what a batch of data files contains and what their schema looks like
17
19
  - Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
18
20
  - Need to write the meta-information of object storage or local files into LanceDB
21
+ - Mentions TOS, boto3, or object storage data profiling
19
22
  - Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
23
+ - Need to batch identify the type and size of remote/local files and build an index
24
+ - Need to quickly understand the structure of an unfamiliar dataset (which files exist, what the schema looks like, and what the fields mean)
25
+ - Need to connect/dock a data source for profiling
26
+ - Mentions keywords like "connect a data source" or "dock a data source"
20
27
 
21
- ## Overall Workflow
22
- When instructed to profile a dataset, you should prefer using the `las-data-profiler` tool directly, which automatically handles the S3 listing and LanceDB writing using internal TypeScript logic.
23
- If you need to perform custom exploration, you can use `list-s3-objects` to traverse the bucket and `read-s3-object` to read file headers, and `write-lance-catalog` to save results.
28
+ ## Overview
29
+ This Skill acts as a Dataset Profiling Guide. You should use the `list-s3-objects` tool to traverse the S3 bucket or local directory, use `read-s3-object` to read file contents or headers, parse the schema or media metadata, and finally use `write-lance-catalog` to save the catalog into a local LanceDB.
24
30
 
25
- ## Parameter Description (for `las-data-profiler` tool)
26
- | Parameter | Description | Example |
27
- |-----------|-------------|---------|
28
- | datasource_name | The name of the data source | my_tos_data |
29
- | vendor | volcengine / alibaba / tencent / aws / local | volcengine |
30
- | endpoint | S3 Endpoint URL (not required for local) | https://tos-s3-cn-beijing.volces.com |
31
- | access_key | AK | - |
32
- | secret_key | SK | - |
33
- | region | Region identifier | cn-beijing |
34
- | bucket | Bucket name (root directory path when local) | my-data-bucket |
35
- | prefix | Path prefix to limit the scan scope | datasets/2024/ |
31
+ 1. **Cataloging**: Use `list-s3-objects` to record the meta-information (path, size, etc.) of files.
32
+ 2. **Understanding Structured Data**: Use `read-s3-object` to sample JSONL / CSV / TSV / Parquet / JSON.
33
+ 3. **Extracting Media Meta-information**: Use `read-s3-object` with `maxBytes` to read only the file header (without downloading the full file) for images, audio, video, and PDFs to extract key attributes.
34
+ 4. **Writing to LanceDB**: Use `write-lance-catalog` to save the results.
36
35
 
37
36
  ## Output Location
38
37
  - LanceDB table storage path: `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`
39
- - Configuration file: `~/.openclaw/contextlake/profiler/{datasource_name}/env.sh`
38
+ - Table names: `files`, `structured_schemas`, `media_metadata`
39
+
40
+ ## Available Tools for this Skill
41
+ - `list-s3-objects`: To traverse and list files in the bucket/directory.
42
+ - `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction.
43
+ - `write-lance-catalog`: To write the profiling results to the LanceDB catalog.
44
+
45
+ Always report the final profiling summary back to the user once the `write-lance-catalog` completes successfully.
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "id": "contextlake-openclaw",
3
3
  "name": "ContextLake",
4
- "version": "1.0.6",
4
+ "version": "1.0.7",
5
5
  "description": "A lightweight knowledge base plugin for OpenClaw using LanceDB and TOS, with data profiling support",
6
6
  "skills": ["./src/skills"],
7
7
  "configSchema": {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@byted-las/contextlake-openclaw",
3
- "version": "1.0.6",
3
+ "version": "1.0.7",
4
4
  "description": "ContextLake OpenClaw Plugin for managing knowledge base",
5
5
  "main": "index.ts",
6
6
  "files": [
@@ -37,51 +37,6 @@ function parseMetadata(metadata: any): Record<string, any> {
37
37
 
38
38
  export function getCliCommands(pluginConfig: ContextLakeConfig, logger: any) {
39
39
  return {
40
- connectAction: async (datasource_name: string, url: string, options: any) => {
41
- logger.info(`[${new Date().toISOString()}] [ContextLake] CLI connect started`, { datasource_name, url, options });
42
- try {
43
- const params: ConnectParams = {
44
- datasource_name,
45
- url,
46
- endpoint: options.endpoint,
47
- access_key: options.ak,
48
- secret_key: options.sk,
49
- region: options.region,
50
- sample_rows: parseInt(options.sampleRows),
51
- };
52
-
53
- // eslint-disable-next-line no-console
54
- console.log(`[contextlake connect] Connecting to datasource "${datasource_name}"...`);
55
- // eslint-disable-next-line no-console
56
- console.log(` url: ${params.url}`);
57
-
58
- const result = await connectDataSource(params);
59
- // eslint-disable-next-line no-console
60
- console.log(JSON.stringify(result, null, 2));
61
- logger.info(`[${new Date().toISOString()}] [ContextLake] CLI connect success`);
62
- } catch (e: any) {
63
- // eslint-disable-next-line no-console
64
- console.error('Error:', e.message);
65
- logger.error(`[${new Date().toISOString()}] [ContextLake] CLI connect failed`, { error: e.message, stack: e.stack });
66
- process.exitCode = 1;
67
- }
68
- },
69
-
70
- ingestAction: async (datasource_name: string) => {
71
- logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest started`, { datasource_name });
72
- try {
73
- const result = await ingestSource({
74
- datasource_name
75
- }, pluginConfig, logger);
76
- // eslint-disable-next-line no-console
77
- console.log(JSON.stringify(result, null, 2));
78
- logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest success`);
79
- } catch (e: any) {
80
- console.error('Error:', e.message);
81
- logger.error(`[${new Date().toISOString()}] [ContextLake] CLI ingest failed`, { error: e.message, stack: e.stack });
82
- }
83
- },
84
-
85
40
  searchAction: async (query: any, options: any) => {
86
41
  logger.info(`[${new Date().toISOString()}] [ContextLake] CLI search started`, { query, options });
87
42
  try {
@@ -12,9 +12,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
12
12
  try {
13
13
  const tools = getAgentTools(pluginConfig, logger);
14
14
 
15
- ctx.registerTool(tools.ingestTool );
16
- logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.ingestTool.name}`);
17
-
18
15
  ctx.registerTool(tools.retrieveTool );
19
16
  logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.retrieveTool.name}`);
20
17
 
@@ -23,9 +20,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
23
20
 
24
21
  ctx.registerTool(tools.deleteTool );
25
22
  logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.deleteTool.name}`);
26
-
27
- ctx.registerTool(tools.lasDataProfilerTool );
28
- logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.lasDataProfilerTool.name}`);
29
23
 
30
24
  ctx.registerTool(tools.listS3ObjectsTool );
31
25
  logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listS3ObjectsTool.name}`);
@@ -65,21 +59,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
65
59
 
66
60
  const commands = getCliCommands(pluginConfig, logger);
67
61
 
68
- // connect -- data source profiling (las-data-profiler)
69
- contextlake.command('connect <datasource_name> <url>')
70
- .description('Connect to a data source and profile its structure, schemas, and media metadata into LanceDB')
71
- .option('--endpoint <url>', 'S3 Endpoint URL (not needed for local)')
72
- .option('--ak <credential_id>', 'Credential ID for the data source')
73
- .option('--sk <credential_value>', 'Credential value for the data source')
74
- .option('--region <region>', 'Region identifier (e.g. cn-beijing)')
75
- .option('--sample-rows <number>', 'Number of rows to sample per structured file', '100')
76
- .action(commands.connectAction);
77
-
78
- // Ingest
79
- contextlake.command('ingest <datasource_name>')
80
- .description('Process and ingest all files from a connected data source into the knowledge base')
81
- .action(commands.ingestAction);
82
-
83
62
  // Search
84
63
  contextlake.command('search <query>')
85
64
  .description('Search the knowledge base for relevant documents')
@@ -122,13 +101,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
122
101
 
123
102
  const slashCommands = getSlashCommands(pluginConfig, logger);
124
103
 
125
- ctx.registerCommand({
126
- name: 'contextlake-ingest',
127
- description: 'Process and ingest all files from a connected data source (usage: /contextlake-ingest <datasource_name>)',
128
- acceptsArgs: true,
129
- handler: slashCommands.ingestHandler
130
- });
131
-
132
104
  ctx.registerCommand({
133
105
  name: 'contextlake-list',
134
106
  description: 'List documents currently in the knowledge base',
@@ -150,13 +122,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
150
122
  handler: slashCommands.deleteHandler
151
123
  });
152
124
 
153
- ctx.registerCommand({
154
- name: 'contextlake-profiler',
155
- description: 'Connect to a data source and profile its structure (usage: /contextlake-profiler <datasource_name> <vendor> <bucket> <prefix>)',
156
- acceptsArgs: true,
157
- handler: slashCommands.profilerHandler
158
- });
159
-
160
125
  ctx.registerCommand({
161
126
  name: 'contextlake-list-datasource',
162
127
  description: 'List all connected and profiled data sources (usage: /contextlake-list-datasource)',
@@ -9,36 +9,6 @@ import * as os from 'os';
9
9
 
10
10
  export function getSlashCommands(pluginConfig: ContextLakeConfig, logger: any) {
11
11
  return {
12
- ingestHandler: async (commandCtx: any) => {
13
- const rawArgs = commandCtx.args || "";
14
- const args = rawArgs.split(' ').filter((arg: string) => arg.trim() !== '');
15
-
16
- logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest started`, { args });
17
- try {
18
- if (args.length === 0) {
19
- return { text: `**Error:** Missing datasource_name. Usage: /contextlake-ingest <datasource_name>` };
20
- }
21
-
22
- const datasource_name = args[0];
23
-
24
- const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
25
- const dsDir = path.join(BASE_DIR, datasource_name);
26
- const dbPath = path.join(dsDir, 'catalog_db');
27
-
28
- if (!fs.existsSync(dbPath)) {
29
- return { text: `**Error:** Data source "${datasource_name}" has not been profiled yet.\n\nPlease run the profiler first using:\n\`/contextlake-profiler <datasource_name> <vendor> <bucket> <prefix> [endpoint] [ak] [sk] [region]\`` };
30
- }
31
-
32
- const result = await ingestSource({ datasource_name }, pluginConfig, logger);
33
-
34
- logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest completed`, { resultCount: result.length });
35
- return { text: `**Ingest Results (${result.length} files processed):**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
36
- } catch (e: any) {
37
- logger.error(`[ContextLake] Slash ingest failed`, { error: e.message });
38
- return { text: `**Error executing ingest:** ${e.message}` };
39
- }
40
- },
41
-
42
12
  listHandler: async (commandCtx: any) => {
43
13
  const rawArgs = commandCtx.args || "";
44
14
  const args = rawArgs.split(' ').filter((arg: string) => arg.trim() !== '');
@@ -102,35 +72,6 @@ export function getSlashCommands(pluginConfig: ContextLakeConfig, logger: any) {
102
72
  return { text: `**Error executing delete:** ${e.message}` };
103
73
  }
104
74
  },
105
- profilerHandler: async (commandCtx: any) => {
106
- const rawArgs = commandCtx.args || "";
107
- const args = rawArgs.split(' ').filter((arg: string) => arg.trim() !== '');
108
-
109
- logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler started`, { args });
110
- try {
111
- if (args.length < 2) {
112
- return { text: `**Error:** Missing arguments. Usage: /contextlake-profiler <datasource_name> <url> [endpoint] [ak] [sk] [region]` };
113
- }
114
-
115
- const [datasource_name, url, endpoint, access_key, secret_key, region] = args;
116
-
117
- const params: ConnectParams = {
118
- datasource_name,
119
- url,
120
- endpoint,
121
- access_key,
122
- secret_key,
123
- region,
124
- };
125
-
126
- const result = await connectDataSource(params);
127
- logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler completed`, { result });
128
- return { text: `**Profiler Results:**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
129
- } catch (e: any) {
130
- logger.error(`[ContextLake] Slash profiler failed`, { error: e.message });
131
- return { text: `**Error executing profiler:** ${e.message}` };
132
- }
133
- },
134
75
 
135
76
  listDatasourceHandler: async (commandCtx: any) => {
136
77
  logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command list-datasource started`);
@@ -10,11 +10,9 @@ import { ContextLakeConfig } from '../utils/config';
10
10
  import type { AnyAgentTool } from 'openclaw/plugin-sdk';
11
11
 
12
12
  export function getAgentTools(pluginConfig: ContextLakeConfig, logger: any): {
13
- ingestTool: AnyAgentTool;
14
13
  retrieveTool: AnyAgentTool;
15
14
  listTool: AnyAgentTool;
16
15
  deleteTool: AnyAgentTool;
17
- lasDataProfilerTool: AnyAgentTool;
18
16
  listDatasourceTool: AnyAgentTool;
19
17
  listS3ObjectsTool: AnyAgentTool;
20
18
  readS3ObjectTool: AnyAgentTool;
@@ -54,65 +52,6 @@ export function getAgentTools(pluginConfig: ContextLakeConfig, logger: any): {
54
52
  }
55
53
  }
56
54
  },
57
- ingestTool: {
58
- name: 'contextlake-ingest',
59
- label: 'ContextLake Ingest',
60
- description: `Process and ingest all files from a connected data source into the knowledge base.
61
- Use this tool when the user wants to "将知识注入", "上传文件", "入库", "添加文档", "ingest files", or "add knowledge".
62
- Supports multimodal files (text, images, audio, video, pdf) by using LAS models to understand and embed them.
63
- Must be called after a data source has been successfully profiled via \`las-data-profiler\`.`,
64
- parameters: {
65
- type: 'object',
66
- properties: {
67
- datasource_name: { type: 'string', description: 'Name of the data source previously profiled' }
68
- },
69
- required: ['datasource_name'],
70
- additionalProperties: false
71
- },
72
-
73
- async execute(toolCallId: string, params: any) {
74
- logger.info(`[${new Date().toISOString()}] [ContextLake] Executing ingest skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
75
-
76
- try {
77
- let actualParams = params;
78
- if (typeof params === 'string') {
79
- try {
80
- actualParams = JSON.parse(params);
81
- } catch (e) {
82
- logger.warn(`[ContextLake] Received string params, possibly toolCallId?`, { params });
83
- return {
84
- content: [{ type: "text", text: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }],
85
- details: { error: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }
86
- } as any;
87
- }
88
- }
89
-
90
- if (!actualParams.datasource_name && actualParams.params && actualParams.params.datasource_name) {
91
- actualParams = actualParams.params;
92
- }
93
-
94
- if (!actualParams.datasource_name) {
95
- return {
96
- content: [{ type: "text", text: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }],
97
- details: { error: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }
98
- } as any;
99
- }
100
-
101
- const result = await ingestSource(actualParams, pluginConfig, logger);
102
- logger.info(`[${new Date().toISOString()}] [ContextLake] Ingest skill completed successfully`, { resultSummary: Array.isArray(result) ? `Processed ${result.length} items` : 'Success' });
103
- return {
104
- content: [{ type: "text", text: JSON.stringify(result) }],
105
- details: result
106
- } as any;
107
- } catch (error: any) {
108
- logger.error(`[${new Date().toISOString()}] [ContextLake] Ingest skill failed`, { error: error.message, stack: error.stack });
109
- return {
110
- content: [{ type: "text", text: String(error.message) }],
111
- details: { error: error.message }
112
- } as any;
113
- }
114
- }
115
- },
116
55
  retrieveTool: {
117
56
  name: 'contextlake-retrieve',
118
57
  label: 'ContextLake Retrieve',
@@ -268,44 +207,6 @@ Example User Queries:
268
207
  }
269
208
  }
270
209
  },
271
- lasDataProfilerTool: {
272
- name: 'las-data-profiler',
273
- label: 'LAS Data Profiler',
274
- description: 'Connect to a data source (TOS/OSS/COS/S3/Local) and profile its structure, schemas, and media metadata into LanceDB',
275
- parameters: {
276
- type: 'object',
277
- properties: {
278
- datasource_name: { type: 'string', description: 'Name of the data source' },
279
- url: { type: 'string', description: 'Data source URL (e.g. tos://bucket/prefix, oss://..., s3://..., file:///path)' },
280
- sample_rows: { type: 'integer', description: 'Number of rows to sample per structured file' }
281
- },
282
- required: ['datasource_name', 'url'],
283
- additionalProperties: false
284
- },
285
-
286
- async execute(toolCallId: string, params: any) {
287
- logger.info(`[${new Date().toISOString()}] [ContextLake] Executing las-data-profiler skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
288
-
289
- try {
290
- let actualParams = params;
291
- if (params && params.params) {
292
- actualParams = params.params;
293
- }
294
- const result = await connectDataSource(actualParams);
295
- logger.info(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill completed`, { result });
296
- return {
297
- content: [{ type: "text", text: JSON.stringify(result) }],
298
- details: result
299
- } as any;
300
- } catch (error: any) {
301
- logger.error(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill failed`, { error: error.message, stack: error.stack });
302
- return {
303
- content: [{ type: "text", text: String(error.message) }],
304
- details: { error: error.message }
305
- } as any;
306
- }
307
- }
308
- },
309
210
  listS3ObjectsTool: {
310
211
  name: 'list-s3-objects',
311
212
  label: 'List S3 Objects',
@@ -1,42 +1,59 @@
1
1
  ---
2
2
  name: contextlake-ingest
3
3
  description: |
4
- Upload, ingest, and index documents into the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
5
- Use this tool when the user wants to "将知识注入", "上传文件", "入库", "添加文档", "ingest files", or "add knowledge".
6
- Supports processing of various file types including PDF, Word, Markdown, and Text.
7
- Automatically handles text extraction, cleaning, chunking, embedding generation, and storage.
8
-
9
- Example User Queries:
10
- - "帮我把这个文档注入到知识湖中"
11
- - "上传这份 PDF 到知识库"
12
- - "Please ingest these documents into ContextLake"
13
- - "将 /path/to/doc.txt 添加到知识库"
14
-
15
- Example Tool Call:
16
- ```json
17
- {
18
- "files": ["/absolute/path/to/document.pdf"],
19
- "metadata": { "category": "tech" }
20
- }
21
- ```
22
-
23
- parameters:
24
- files:
25
- type: array
26
- items:
27
- type: string
28
- description: List of file paths to ingest
29
- required: true
30
- metadata:
31
- type: object
32
- description: Optional JSON metadata to attach to documents
33
- required: false
34
- chunkSize:
35
- type: integer
36
- description: Chunk size for text splitting
37
- required: false
38
- overlap:
39
- type: integer
40
- description: Overlap size for text splitting
41
- required: false
4
+ A guide for OpenClaw agents on how to perform data ingestion into ContextLake.
5
+ Use this skill to understand the correct workflow for profiling a dataset and then ingesting it.
6
+
7
+ When a user asks to "ingest data", "导入数据", "接入数据源", or "处理某个桶里的数据",
8
+ you MUST follow the two-step workflow outlined below using the specific tools provided.
9
+
10
+ IMPORTANT RULE: You are STRICTLY FORBIDDEN from using any tools outside of this project for ingestion.
11
+ Do not use custom Python scripts, external CLI tools, or unrelated plugins.
12
+ ---
13
+
14
+ # ContextLake Data Ingestion Workflow Guide
15
+
16
+ To successfully ingest data into the ContextLake Knowledge Base, you **MUST** follow a strict two-step process.
17
+
18
+ ## Step 1: Data Profiling
19
+ Before data can be ingested, the source MUST first be profiled to extract its schema, structure, and media metadata into a local LanceDB catalog.
20
+
21
+ **Tool to use:** The OpenClaw agent follows the `byted-las-data-profiler` skill, which guides the profiling process.
22
+
23
+ ### How it works:
24
+ - **Purpose:** Connects to an S3-compatible source (TOS, OSS, AWS S3, etc.) or local file system and catalogs the files.
25
+ - **Underlying tools used:** The agent will use `list-s3-objects`, `read-s3-object`, and `write-lance-catalog` to perform the profiling step by step.
26
+
27
+ *Wait for Step 1 to complete successfully before proceeding to Step 2.*
28
+
42
29
  ---
30
+
31
+ ## Step 2: Data Ingestion
32
+ Once the data source is successfully profiled and the catalog is created, you can proceed to ingest the data into ContextLake.
33
+
34
+ **Tool to use:** The OpenClaw agent uses the basic tools listed below, guided by this skill.
35
+
36
+ ### How it works:
37
+ - **Purpose:** Reads the LanceDB catalog created in Step 1, processes the multimodal files (text, images, audio, video, PDF) using LAS models, chunks the data, generates embeddings, and indexes them into the ContextLake Knowledge Base.
38
+ - **Underlying tools used:**
39
+ 1. Use `read-lance-catalog` to read the catalog of files from `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`.
40
+ 2. For each file, use appropriate LAS tools (like `las_pdf_parse_doubao`, `las_image_resample`, `las_long_video_understand`) to extract text and features.
41
+ 3. Chunk and process the text.
42
+ 4. Use the embedding tool or model to generate vectors.
43
+ 5. Save the final chunks and vectors into the main ContextLake knowledge base.
44
+
45
+ *Note: You are acting as the ingestion pipeline. You must coordinate the reading of the catalog and the processing of each file type using the available LAS tools.*
46
+
47
+ ---
48
+
49
+ ## Auxiliary Tools (Use only when necessary)
50
+ If you need to verify the catalog contents between Step 1 and Step 2, or if ingestion fails and you need to debug:
51
+
52
+ - **`read-lance-catalog`**: Use this tool to read the records from the catalog database created in Step 1.
53
+ - Requires `db_path` (e.g., `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`) and `table_name` (usually "files").
54
+ - **`list-s3-objects` / `read-s3-object`**: Use these tools to manually inspect the raw source files if profiling fails.
55
+ - **`contextlake-list-datasource`**: Use this tool to see all data sources that have been connected/profiled.
56
+
57
+ ## Error Handling
58
+ - If the ingestion step fails because the datasource is not found, ensure that the data profiling step completed successfully for that exact `datasource_name`.
59
+ - Always report the results of both steps to the user clearly.
@@ -1,11 +1,13 @@
1
1
  ---
2
2
  name: byted-las-data-profiler
3
3
  description: |
4
- Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them.
5
- It writes the catalog index to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
4
+ Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them,
5
+ performs schema inference and column semantic analysis on structured data (JSONL/CSV/Parquet/JSON),
6
+ extracts key meta-information for media files (Image/Audio/Video/PDF) by reading only header bytes,
7
+ and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
6
8
 
7
9
  IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
8
- You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`, `las-data-profiler`) to accomplish the profiling tasks.
10
+ You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) to accomplish the profiling tasks.
9
11
  ---
10
12
 
11
13
  ## Trigger Scenarios
@@ -16,24 +18,28 @@ Be sure to use this Skill when the user mentions the following scenarios:
16
18
  - Need to understand what a batch of data files contains and what their schema looks like
17
19
  - Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
18
20
  - Need to write the meta-information of object storage or local files into LanceDB
21
+ - Mentions TOS, boto3, or object storage data profiling
19
22
  - Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
23
+ - Need to batch identify the type and size of remote/local files and build an index
24
+ - Need to quickly understand the structure of an unfamiliar dataset (what files there are, what the schema looks like, what the fields mean)
25
+ - Need to connect/dock a data source for profiling
26
+ - Mentions keywords like "connect data source" or "dock data source"
20
27
 
21
- ## Overall Workflow
22
- When instructed to profile a dataset, you should prefer using the `las-data-profiler` tool directly, which automatically handles the S3 listing and LanceDB writing using internal TypeScript logic.
23
- If you need to perform custom exploration, you can use `list-s3-objects` to traverse the bucket and `read-s3-object` to read file headers, and `write-lance-catalog` to save results.
28
+ ## Overview
29
+ This Skill acts as a Dataset Profiling Guide. You should use the `list-s3-objects` tool to traverse the S3 bucket or local directory, use `read-s3-object` to read file contents or headers, parse the schema or media metadata, and finally use `write-lance-catalog` to save the catalog into a local LanceDB.
24
30
 
25
- ## Parameter Description (for `las-data-profiler` tool)
26
- | Parameter | Description | Example |
27
- |-----------|-------------|---------|
28
- | datasource_name | The name of the data source | my_tos_data |
29
- | vendor | volcengine / alibaba / tencent / aws / local | volcengine |
30
- | endpoint | S3 Endpoint URL (not required for local) | https://tos-s3-cn-beijing.volces.com |
31
- | access_key | AK | - |
32
- | secret_key | SK | - |
33
- | region | Region identifier | cn-beijing |
34
- | bucket | Bucket name (root directory path when local) | my-data-bucket |
35
- | prefix | Path prefix to limit the scan scope | datasets/2024/ |
31
+ 1. **Cataloging**: Use `list-s3-objects` to record the meta-information (path, size, etc.) of files.
32
+ 2. **Understanding Structured Data**: Use `read-s3-object` to sample JSONL / CSV / TSV / Parquet / JSON.
33
+ 3. **Extracting Media Meta-information**: Use `read-s3-object` with `maxBytes` to read only the file header (without downloading the full file) for images, audio, video, and PDFs to extract key attributes.
34
+ 4. **Writing to LanceDB**: Use `write-lance-catalog` to save the results.
36
35
 
37
36
  ## Output Location
38
37
  - LanceDB table storage path: `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`
39
- - Configuration file: `~/.openclaw/contextlake/profiler/{datasource_name}/env.sh`
38
+ - Table names: `files`, `structured_schemas`, `media_metadata`
39
+
40
+ ## Available Tools for this Skill
41
+ - `list-s3-objects`: To traverse and list files in the bucket/directory.
42
+ - `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction.
43
+ - `write-lance-catalog`: To write the profiling results to the LanceDB catalog.
44
+
45
+ Always report the final profiling summary back to the user once `write-lance-catalog` completes successfully.