@byted-las/contextlake-openclaw 1.0.6 → 1.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/commands/cli.d.ts +0 -2
- package/dist/src/commands/cli.js +0 -46
- package/dist/src/commands/index.js +0 -29
- package/dist/src/commands/slashcmd.d.ts +0 -6
- package/dist/src/commands/slashcmd.js +0 -87
- package/dist/src/commands/tools.d.ts +0 -2
- package/dist/src/commands/tools.js +0 -94
- package/dist/src/skills/contextlake-ingest/SKILL.md +55 -38
- package/dist/src/skills/las-data-profiler/SKILL.md +24 -18
- package/openclaw.plugin.json +1 -1
- package/package.json +2 -2
- package/src/commands/cli.ts +0 -45
- package/src/commands/index.ts +0 -35
- package/src/commands/slashcmd.ts +0 -59
- package/src/commands/tools.ts +0 -99
- package/src/skills/contextlake-ingest/SKILL.md +55 -38
- package/src/skills/las-data-profiler/SKILL.md +24 -18
- package/dist/src/lib/actions/ingest-source.d.ts +0 -15
- package/dist/src/lib/actions/ingest-source.js +0 -193
- package/dist/src/lib/actions/las.d.ts +0 -64
- package/dist/src/lib/actions/las.js +0 -72
- package/dist/src/lib/scripts/s3_catalog.py +0 -617
- package/dist/src/service/embedding/local.d.ts +0 -14
- package/dist/src/service/embedding/local.js +0 -107
- package/dist/src/skills/SKILL.md +0 -39
package/src/commands/index.ts
CHANGED
|
@@ -12,9 +12,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
|
|
|
12
12
|
try {
|
|
13
13
|
const tools = getAgentTools(pluginConfig, logger);
|
|
14
14
|
|
|
15
|
-
ctx.registerTool(tools.ingestTool );
|
|
16
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.ingestTool.name}`);
|
|
17
|
-
|
|
18
15
|
ctx.registerTool(tools.retrieveTool );
|
|
19
16
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.retrieveTool.name}`);
|
|
20
17
|
|
|
@@ -23,9 +20,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
|
|
|
23
20
|
|
|
24
21
|
ctx.registerTool(tools.deleteTool );
|
|
25
22
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.deleteTool.name}`);
|
|
26
|
-
|
|
27
|
-
ctx.registerTool(tools.lasDataProfilerTool );
|
|
28
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.lasDataProfilerTool.name}`);
|
|
29
23
|
|
|
30
24
|
ctx.registerTool(tools.listS3ObjectsTool );
|
|
31
25
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listS3ObjectsTool.name}`);
|
|
@@ -65,21 +59,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
|
|
|
65
59
|
|
|
66
60
|
const commands = getCliCommands(pluginConfig, logger);
|
|
67
61
|
|
|
68
|
-
// connect -- data source profiling (las-data-profiler)
|
|
69
|
-
contextlake.command('connect <datasource_name> <url>')
|
|
70
|
-
.description('Connect to a data source and profile its structure, schemas, and media metadata into LanceDB')
|
|
71
|
-
.option('--endpoint <url>', 'S3 Endpoint URL (not needed for local)')
|
|
72
|
-
.option('--ak <credential_id>', 'Credential ID for the data source')
|
|
73
|
-
.option('--sk <credential_value>', 'Credential value for the data source')
|
|
74
|
-
.option('--region <region>', 'Region identifier (e.g. cn-beijing)')
|
|
75
|
-
.option('--sample-rows <number>', 'Number of rows to sample per structured file', '100')
|
|
76
|
-
.action(commands.connectAction);
|
|
77
|
-
|
|
78
|
-
// Ingest
|
|
79
|
-
contextlake.command('ingest <datasource_name>')
|
|
80
|
-
.description('Process and ingest all files from a connected data source into the knowledge base')
|
|
81
|
-
.action(commands.ingestAction);
|
|
82
|
-
|
|
83
62
|
// Search
|
|
84
63
|
contextlake.command('search <query>')
|
|
85
64
|
.description('Search the knowledge base for relevant documents')
|
|
@@ -122,13 +101,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
|
|
|
122
101
|
|
|
123
102
|
const slashCommands = getSlashCommands(pluginConfig, logger);
|
|
124
103
|
|
|
125
|
-
ctx.registerCommand({
|
|
126
|
-
name: 'contextlake-ingest',
|
|
127
|
-
description: 'Process and ingest all files from a connected data source (usage: /contextlake-ingest <datasource_name>)',
|
|
128
|
-
acceptsArgs: true,
|
|
129
|
-
handler: slashCommands.ingestHandler
|
|
130
|
-
});
|
|
131
|
-
|
|
132
104
|
ctx.registerCommand({
|
|
133
105
|
name: 'contextlake-list',
|
|
134
106
|
description: 'List documents currently in the knowledge base',
|
|
@@ -150,13 +122,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
|
|
|
150
122
|
handler: slashCommands.deleteHandler
|
|
151
123
|
});
|
|
152
124
|
|
|
153
|
-
ctx.registerCommand({
|
|
154
|
-
name: 'contextlake-profiler',
|
|
155
|
-
description: 'Connect to a data source and profile its structure (usage: /contextlake-profiler <datasource_name> <vendor> <bucket> <prefix>)',
|
|
156
|
-
acceptsArgs: true,
|
|
157
|
-
handler: slashCommands.profilerHandler
|
|
158
|
-
});
|
|
159
|
-
|
|
160
125
|
ctx.registerCommand({
|
|
161
126
|
name: 'contextlake-list-datasource',
|
|
162
127
|
description: 'List all connected and profiled data sources (usage: /contextlake-list-datasource)',
|
package/src/commands/slashcmd.ts
CHANGED
|
@@ -9,36 +9,6 @@ import * as os from 'os';
|
|
|
9
9
|
|
|
10
10
|
export function getSlashCommands(pluginConfig: ContextLakeConfig, logger: any) {
|
|
11
11
|
return {
|
|
12
|
-
ingestHandler: async (commandCtx: any) => {
|
|
13
|
-
const rawArgs = commandCtx.args || "";
|
|
14
|
-
const args = rawArgs.split(' ').filter((arg: string) => arg.trim() !== '');
|
|
15
|
-
|
|
16
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest started`, { args });
|
|
17
|
-
try {
|
|
18
|
-
if (args.length === 0) {
|
|
19
|
-
return { text: `**Error:** Missing datasource_name. Usage: /contextlake-ingest <datasource_name>` };
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
const datasource_name = args[0];
|
|
23
|
-
|
|
24
|
-
const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
|
|
25
|
-
const dsDir = path.join(BASE_DIR, datasource_name);
|
|
26
|
-
const dbPath = path.join(dsDir, 'catalog_db');
|
|
27
|
-
|
|
28
|
-
if (!fs.existsSync(dbPath)) {
|
|
29
|
-
return { text: `**Error:** Data source "${datasource_name}" has not been profiled yet.\n\nPlease run the profiler first using:\n\`/contextlake-profiler <datasource_name> <vendor> <bucket> <prefix> [endpoint] [ak] [sk] [region]\`` };
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
const result = await ingestSource({ datasource_name }, pluginConfig, logger);
|
|
33
|
-
|
|
34
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest completed`, { resultCount: result.length });
|
|
35
|
-
return { text: `**Ingest Results (${result.length} files processed):**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
|
|
36
|
-
} catch (e: any) {
|
|
37
|
-
logger.error(`[ContextLake] Slash ingest failed`, { error: e.message });
|
|
38
|
-
return { text: `**Error executing ingest:** ${e.message}` };
|
|
39
|
-
}
|
|
40
|
-
},
|
|
41
|
-
|
|
42
12
|
listHandler: async (commandCtx: any) => {
|
|
43
13
|
const rawArgs = commandCtx.args || "";
|
|
44
14
|
const args = rawArgs.split(' ').filter((arg: string) => arg.trim() !== '');
|
|
@@ -102,35 +72,6 @@ export function getSlashCommands(pluginConfig: ContextLakeConfig, logger: any) {
|
|
|
102
72
|
return { text: `**Error executing delete:** ${e.message}` };
|
|
103
73
|
}
|
|
104
74
|
},
|
|
105
|
-
profilerHandler: async (commandCtx: any) => {
|
|
106
|
-
const rawArgs = commandCtx.args || "";
|
|
107
|
-
const args = rawArgs.split(' ').filter((arg: string) => arg.trim() !== '');
|
|
108
|
-
|
|
109
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler started`, { args });
|
|
110
|
-
try {
|
|
111
|
-
if (args.length < 2) {
|
|
112
|
-
return { text: `**Error:** Missing arguments. Usage: /contextlake-profiler <datasource_name> <url> [endpoint] [ak] [sk] [region]` };
|
|
113
|
-
}
|
|
114
|
-
|
|
115
|
-
const [datasource_name, url, endpoint, access_key, secret_key, region] = args;
|
|
116
|
-
|
|
117
|
-
const params: ConnectParams = {
|
|
118
|
-
datasource_name,
|
|
119
|
-
url,
|
|
120
|
-
endpoint,
|
|
121
|
-
access_key,
|
|
122
|
-
secret_key,
|
|
123
|
-
region,
|
|
124
|
-
};
|
|
125
|
-
|
|
126
|
-
const result = await connectDataSource(params);
|
|
127
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler completed`, { result });
|
|
128
|
-
return { text: `**Profiler Results:**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
|
|
129
|
-
} catch (e: any) {
|
|
130
|
-
logger.error(`[ContextLake] Slash profiler failed`, { error: e.message });
|
|
131
|
-
return { text: `**Error executing profiler:** ${e.message}` };
|
|
132
|
-
}
|
|
133
|
-
},
|
|
134
75
|
|
|
135
76
|
listDatasourceHandler: async (commandCtx: any) => {
|
|
136
77
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command list-datasource started`);
|
package/src/commands/tools.ts
CHANGED
|
@@ -10,11 +10,9 @@ import { ContextLakeConfig } from '../utils/config';
|
|
|
10
10
|
import type { AnyAgentTool } from 'openclaw/plugin-sdk';
|
|
11
11
|
|
|
12
12
|
export function getAgentTools(pluginConfig: ContextLakeConfig, logger: any): {
|
|
13
|
-
ingestTool: AnyAgentTool;
|
|
14
13
|
retrieveTool: AnyAgentTool;
|
|
15
14
|
listTool: AnyAgentTool;
|
|
16
15
|
deleteTool: AnyAgentTool;
|
|
17
|
-
lasDataProfilerTool: AnyAgentTool;
|
|
18
16
|
listDatasourceTool: AnyAgentTool;
|
|
19
17
|
listS3ObjectsTool: AnyAgentTool;
|
|
20
18
|
readS3ObjectTool: AnyAgentTool;
|
|
@@ -54,65 +52,6 @@ export function getAgentTools(pluginConfig: ContextLakeConfig, logger: any): {
|
|
|
54
52
|
}
|
|
55
53
|
}
|
|
56
54
|
},
|
|
57
|
-
ingestTool: {
|
|
58
|
-
name: 'contextlake-ingest',
|
|
59
|
-
label: 'ContextLake Ingest',
|
|
60
|
-
description: `Process and ingest all files from a connected data source into the knowledge base.
|
|
61
|
-
Use this tool when the user wants to "将知识注入", "上传文件", "入库", "添加文档", "ingest files", or "add knowledge".
|
|
62
|
-
Supports multimodal files (text, images, audio, video, pdf) by using LAS models to understand and embed them.
|
|
63
|
-
Must be called after a data source has been successfully profiled via \`las-data-profiler\`.`,
|
|
64
|
-
parameters: {
|
|
65
|
-
type: 'object',
|
|
66
|
-
properties: {
|
|
67
|
-
datasource_name: { type: 'string', description: 'Name of the data source previously profiled' }
|
|
68
|
-
},
|
|
69
|
-
required: ['datasource_name'],
|
|
70
|
-
additionalProperties: false
|
|
71
|
-
},
|
|
72
|
-
|
|
73
|
-
async execute(toolCallId: string, params: any) {
|
|
74
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Executing ingest skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
|
|
75
|
-
|
|
76
|
-
try {
|
|
77
|
-
let actualParams = params;
|
|
78
|
-
if (typeof params === 'string') {
|
|
79
|
-
try {
|
|
80
|
-
actualParams = JSON.parse(params);
|
|
81
|
-
} catch (e) {
|
|
82
|
-
logger.warn(`[ContextLake] Received string params, possibly toolCallId?`, { params });
|
|
83
|
-
return {
|
|
84
|
-
content: [{ type: "text", text: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }],
|
|
85
|
-
details: { error: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }
|
|
86
|
-
} as any;
|
|
87
|
-
}
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
if (!actualParams.datasource_name && actualParams.params && actualParams.params.datasource_name) {
|
|
91
|
-
actualParams = actualParams.params;
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
if (!actualParams.datasource_name) {
|
|
95
|
-
return {
|
|
96
|
-
content: [{ type: "text", text: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }],
|
|
97
|
-
details: { error: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }
|
|
98
|
-
} as any;
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
const result = await ingestSource(actualParams, pluginConfig, logger);
|
|
102
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Ingest skill completed successfully`, { resultSummary: Array.isArray(result) ? `Processed ${result.length} items` : 'Success' });
|
|
103
|
-
return {
|
|
104
|
-
content: [{ type: "text", text: JSON.stringify(result) }],
|
|
105
|
-
details: result
|
|
106
|
-
} as any;
|
|
107
|
-
} catch (error: any) {
|
|
108
|
-
logger.error(`[${new Date().toISOString()}] [ContextLake] Ingest skill failed`, { error: error.message, stack: error.stack });
|
|
109
|
-
return {
|
|
110
|
-
content: [{ type: "text", text: String(error.message) }],
|
|
111
|
-
details: { error: error.message }
|
|
112
|
-
} as any;
|
|
113
|
-
}
|
|
114
|
-
}
|
|
115
|
-
},
|
|
116
55
|
retrieveTool: {
|
|
117
56
|
name: 'contextlake-retrieve',
|
|
118
57
|
label: 'ContextLake Retrieve',
|
|
@@ -268,44 +207,6 @@ Example User Queries:
|
|
|
268
207
|
}
|
|
269
208
|
}
|
|
270
209
|
},
|
|
271
|
-
lasDataProfilerTool: {
|
|
272
|
-
name: 'las-data-profiler',
|
|
273
|
-
label: 'LAS Data Profiler',
|
|
274
|
-
description: 'Connect to a data source (TOS/OSS/COS/S3/Local) and profile its structure, schemas, and media metadata into LanceDB',
|
|
275
|
-
parameters: {
|
|
276
|
-
type: 'object',
|
|
277
|
-
properties: {
|
|
278
|
-
datasource_name: { type: 'string', description: 'Name of the data source' },
|
|
279
|
-
url: { type: 'string', description: 'Data source URL (e.g. tos://bucket/prefix, oss://..., s3://..., file:///path)' },
|
|
280
|
-
sample_rows: { type: 'integer', description: 'Number of rows to sample per structured file' }
|
|
281
|
-
},
|
|
282
|
-
required: ['datasource_name', 'url'],
|
|
283
|
-
additionalProperties: false
|
|
284
|
-
},
|
|
285
|
-
|
|
286
|
-
async execute(toolCallId: string, params: any) {
|
|
287
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Executing las-data-profiler skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
|
|
288
|
-
|
|
289
|
-
try {
|
|
290
|
-
let actualParams = params;
|
|
291
|
-
if (params && params.params) {
|
|
292
|
-
actualParams = params.params;
|
|
293
|
-
}
|
|
294
|
-
const result = await connectDataSource(actualParams);
|
|
295
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill completed`, { result });
|
|
296
|
-
return {
|
|
297
|
-
content: [{ type: "text", text: JSON.stringify(result) }],
|
|
298
|
-
details: result
|
|
299
|
-
} as any;
|
|
300
|
-
} catch (error: any) {
|
|
301
|
-
logger.error(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill failed`, { error: error.message, stack: error.stack });
|
|
302
|
-
return {
|
|
303
|
-
content: [{ type: "text", text: String(error.message) }],
|
|
304
|
-
details: { error: error.message }
|
|
305
|
-
} as any;
|
|
306
|
-
}
|
|
307
|
-
}
|
|
308
|
-
},
|
|
309
210
|
listS3ObjectsTool: {
|
|
310
211
|
name: 'list-s3-objects',
|
|
311
212
|
label: 'List S3 Objects',
|
|
@@ -1,42 +1,59 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: contextlake-ingest
|
|
3
3
|
description: |
|
|
4
|
-
|
|
5
|
-
Use this
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
required: true
|
|
30
|
-
metadata:
|
|
31
|
-
type: object
|
|
32
|
-
description: Optional JSON metadata to attach to documents
|
|
33
|
-
required: false
|
|
34
|
-
chunkSize:
|
|
35
|
-
type: integer
|
|
36
|
-
description: Chunk size for text splitting
|
|
37
|
-
required: false
|
|
38
|
-
overlap:
|
|
39
|
-
type: integer
|
|
40
|
-
description: Overlap size for text splitting
|
|
41
|
-
required: false
|
|
4
|
+
A guide for OpenClaw agents on how to perform data ingestion into ContextLake.
|
|
5
|
+
Use this skill to understand the correct workflow for profiling a dataset and then ingesting it.
|
|
6
|
+
|
|
7
|
+
When a user asks to "ingest data", "导入数据", "接入数据源", or "处理某个桶里的数据",
|
|
8
|
+
you MUST follow the two-step workflow outlined below using the specific tools provided.
|
|
9
|
+
|
|
10
|
+
IMPORTANT RULE: You are STRICTLY FORBIDDEN from using any tools outside of this project for ingestion.
|
|
11
|
+
Do not use custom Python scripts, external CLI tools, or unrelated plugins.
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
# ContextLake Data Ingestion Workflow Guide
|
|
15
|
+
|
|
16
|
+
To successfully ingest data into the ContextLake Knowledge Base, you **MUST** follow a strict two-step process.
|
|
17
|
+
|
|
18
|
+
## Step 1: Data Profiling
|
|
19
|
+
Before data can be ingested, the source MUST first be profiled to extract its schema, structure, and media metadata into a local LanceDB catalog.
|
|
20
|
+
|
|
21
|
+
**Tool to use:** The OpenClaw agent, guided by the `byted-las-data-profiler` skill, drives the profiling process.
|
|
22
|
+
|
|
23
|
+
### How it works:
|
|
24
|
+
- **Purpose:** Connects to an S3-compatible source (TOS, OSS, AWS S3, etc.) or local file system and catalogs the files.
|
|
25
|
+
- **Underlying tools used:** The agent will use `list-s3-objects`, `read-s3-object`, and `write-lance-catalog` to perform the profiling step by step.
|
|
26
|
+
|
|
27
|
+
*Wait for Step 1 to complete successfully before proceeding to Step 2.*
|
|
28
|
+
|
|
42
29
|
---
|
|
30
|
+
|
|
31
|
+
## Step 2: Data Ingestion
|
|
32
|
+
Once the data source is successfully profiled and the catalog is created, you can proceed to ingest the data into ContextLake.
|
|
33
|
+
|
|
34
|
+
**Tool to use:** The OpenClaw agent, using the basic tools listed below as guided by this skill.
|
|
35
|
+
|
|
36
|
+
### How it works:
|
|
37
|
+
- **Purpose:** Reads the LanceDB catalog created in Step 1, processes the multimodal files (text, images, audio, video, PDF) using LAS models, chunks the data, generates embeddings, and indexes them into the ContextLake Knowledge Base.
|
|
38
|
+
- **Underlying tools used:**
|
|
39
|
+
1. Use `read-lance-catalog` to read the catalog of files from `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`.
|
|
40
|
+
2. For each file, use appropriate LAS tools (like `las_pdf_parse_doubao`, `las_image_resample`, `las_long_video_understand`) to extract text and features.
|
|
41
|
+
3. Chunk and process the text.
|
|
42
|
+
4. Use the embedding tool or model to generate vectors.
|
|
43
|
+
5. Save the final chunks and vectors into the main ContextLake knowledge base.
|
|
44
|
+
|
|
45
|
+
*Note: You are acting as the ingestion pipeline. You must coordinate the reading of the catalog and the processing of each file type using the available LAS tools.*
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Auxiliary Tools (Use only when necessary)
|
|
50
|
+
If you need to verify the catalog contents between Step 1 and Step 2, or if ingestion fails and you need to debug:
|
|
51
|
+
|
|
52
|
+
- **`read-lance-catalog`**: Use this tool to read the records from the catalog database created in Step 1.
|
|
53
|
+
- Requires `db_path` (e.g., `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`) and `table_name` (usually "files").
|
|
54
|
+
- **`list-s3-objects` / `read-s3-object`**: Use these tools to manually inspect the raw source files if profiling fails.
|
|
55
|
+
- **`contextlake-list-datasource`**: Use this tool to see all data sources that have been connected/profiled.
|
|
56
|
+
|
|
57
|
+
## Error Handling
|
|
58
|
+
- If the ingestion step fails because the data source is not found, ensure that the data profiling step completed successfully for that exact `datasource_name`.
|
|
59
|
+
- Always report the results of both steps to the user clearly.
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: byted-las-data-profiler
|
|
3
3
|
description: |
|
|
4
|
-
Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them
|
|
5
|
-
|
|
4
|
+
Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them,
|
|
5
|
+
performs schema inference and column semantic analysis on structured data (JSONL/CSV/Parquet/JSON),
|
|
6
|
+
extracts key meta-information for media files (Image/Audio/Video/PDF) by reading only header bytes,
|
|
7
|
+
and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
|
|
6
8
|
|
|
7
9
|
IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
|
|
8
|
-
You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog
|
|
10
|
+
You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) via formal tool calls to accomplish the profiling tasks. DO NOT execute them as bash/shell commands.
|
|
9
11
|
---
|
|
10
12
|
|
|
11
13
|
## Trigger Scenarios
|
|
@@ -16,24 +18,28 @@ Be sure to use this Skill when the user mentions the following scenarios:
|
|
|
16
18
|
- Need to understand what a batch of data files contains and what their schema looks like
|
|
17
19
|
- Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
|
|
18
20
|
- Need to write the meta-information of object storage or local files into LanceDB
|
|
21
|
+
- Mentions TOS, boto3, or object storage data profiling
|
|
19
22
|
- Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
|
|
23
|
+
- Need to batch identify the type and size of remote/local files and build an index
|
|
24
|
+
- Need to quickly understand the structure of an unfamiliar dataset (which files exist, what the schema looks like, and what the fields mean)
|
|
25
|
+
- Need to connect/dock a data source for profiling
|
|
26
|
+
- Mentions keywords like "connect a data source" or "dock a data source"
|
|
20
27
|
|
|
21
|
-
##
|
|
22
|
-
|
|
23
|
-
If you need to perform custom exploration, you can use `list-s3-objects` to traverse the bucket and `read-s3-object` to read file headers, and `write-lance-catalog` to save results.
|
|
28
|
+
## Overview
|
|
29
|
+
This Skill acts as a Dataset Profiling Guide. You should use the `list-s3-objects` tool to traverse the S3 bucket or local directory, use `read-s3-object` to read file contents or headers, parse the schema or media metadata, and finally use `write-lance-catalog` to save the catalog into a local LanceDB.
|
|
24
30
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
| vendor | volcengine / alibaba / tencent / aws / local | volcengine |
|
|
30
|
-
| endpoint | S3 Endpoint URL (not required for local) | https://tos-s3-cn-beijing.volces.com |
|
|
31
|
-
| access_key | AK | - |
|
|
32
|
-
| secret_key | SK | - |
|
|
33
|
-
| region | Region identifier | cn-beijing |
|
|
34
|
-
| bucket | Bucket name (root directory path when local) | my-data-bucket |
|
|
35
|
-
| prefix | Path prefix to limit the scan scope | datasets/2024/ |
|
|
31
|
+
1. **Cataloging**: Use `list-s3-objects` to record the meta-information (path, size, etc.) of files.
|
|
32
|
+
2. **Understanding Structured Data**: Use `read-s3-object` to sample JSONL / CSV / TSV / Parquet / JSON.
|
|
33
|
+
3. **Extracting Media Meta-information**: Use `read-s3-object` with `maxBytes` to read only the file header (without downloading the full file) for images, audio, video, and PDFs to extract key attributes.
|
|
34
|
+
4. **Writing to LanceDB**: Use `write-lance-catalog` to save the results.
|
|
36
35
|
|
|
37
36
|
## Output Location
|
|
38
37
|
- LanceDB table storage path: `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`
|
|
39
|
-
-
|
|
38
|
+
- Table names: `files`, `structured_schemas`, `media_metadata`
|
|
39
|
+
|
|
40
|
+
## Available Tools for this Skill
|
|
41
|
+
- `list-s3-objects`: To traverse and list files in the bucket/directory. (Call this as an Agent Tool, NOT a bash command).
|
|
42
|
+
- `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction. (Call this as an Agent Tool, NOT a bash command).
|
|
43
|
+
- `write-lance-catalog`: To write the profiling results to the LanceDB catalog. (Call this as an Agent Tool, NOT a bash command).
|
|
44
|
+
|
|
45
|
+
Always report the final profiling summary back to the user once the `write-lance-catalog` completes successfully.
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
import { ContextLakeConfig } from '../../utils/config';
|
|
2
|
-
export interface IngestSourceParams {
|
|
3
|
-
datasource_name: string;
|
|
4
|
-
}
|
|
5
|
-
export declare function ingestSource(params: IngestSourceParams, config: ContextLakeConfig, logger?: any): Promise<({
|
|
6
|
-
file: any;
|
|
7
|
-
status: string;
|
|
8
|
-
chunks: number;
|
|
9
|
-
message?: undefined;
|
|
10
|
-
} | {
|
|
11
|
-
file: any;
|
|
12
|
-
status: string;
|
|
13
|
-
message: any;
|
|
14
|
-
chunks?: undefined;
|
|
15
|
-
})[]>;
|
|
@@ -1,193 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
-
if (k2 === undefined) k2 = k;
|
|
4
|
-
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
-
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
-
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
-
}
|
|
8
|
-
Object.defineProperty(o, k2, desc);
|
|
9
|
-
}) : (function(o, m, k, k2) {
|
|
10
|
-
if (k2 === undefined) k2 = k;
|
|
11
|
-
o[k2] = m[k];
|
|
12
|
-
}));
|
|
13
|
-
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
-
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
-
}) : function(o, v) {
|
|
16
|
-
o["default"] = v;
|
|
17
|
-
});
|
|
18
|
-
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
-
var ownKeys = function(o) {
|
|
20
|
-
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
-
var ar = [];
|
|
22
|
-
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
-
return ar;
|
|
24
|
-
};
|
|
25
|
-
return ownKeys(o);
|
|
26
|
-
};
|
|
27
|
-
return function (mod) {
|
|
28
|
-
if (mod && mod.__esModule) return mod;
|
|
29
|
-
var result = {};
|
|
30
|
-
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
-
__setModuleDefault(result, mod);
|
|
32
|
-
return result;
|
|
33
|
-
};
|
|
34
|
-
})();
|
|
35
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
-
exports.ingestSource = ingestSource;
|
|
37
|
-
const factory_1 = require("../../service/metadata/factory");
|
|
38
|
-
const las_api_1 = require("./las-api");
|
|
39
|
-
const lancedb = __importStar(require("@lancedb/lancedb"));
|
|
40
|
-
const path = __importStar(require("path"));
|
|
41
|
-
const fs = __importStar(require("fs"));
|
|
42
|
-
const os = __importStar(require("os"));
|
|
43
|
-
// @ts-ignore
|
|
44
|
-
const uuid_1 = require("uuid");
|
|
45
|
-
const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
|
|
46
|
-
async function ingestSource(params, config, logger) {
|
|
47
|
-
if (logger) {
|
|
48
|
-
logger.info(`[ContextLake-Action] Calling ingestSource with params: ${JSON.stringify(params)}`);
|
|
49
|
-
}
|
|
50
|
-
else {
|
|
51
|
-
// eslint-disable-next-line no-console
|
|
52
|
-
console.log(`[ContextLake-Action] Calling ingestSource with params: ${JSON.stringify(params)}`);
|
|
53
|
-
}
|
|
54
|
-
const dsDir = path.join(BASE_DIR, params.datasource_name);
|
|
55
|
-
const dbPath = path.join(dsDir, 'catalog_db');
|
|
56
|
-
if (!fs.existsSync(dbPath)) {
|
|
57
|
-
throw new Error(`Data source database not found at ${dbPath}. Please run profiler connect first.`);
|
|
58
|
-
}
|
|
59
|
-
const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
|
|
60
|
-
const metadataProvider = (0, factory_1.createMetadataProvider)(metaConfig);
|
|
61
|
-
await metadataProvider.connect();
|
|
62
|
-
const lasClient = new las_api_1.LasApiClient(config, logger);
|
|
63
|
-
const results = [];
|
|
64
|
-
// Connect to the profiler LanceDB to read the file catalog
|
|
65
|
-
const profilerDb = await lancedb.connect(dbPath);
|
|
66
|
-
const tableNames = await profilerDb.tableNames();
|
|
67
|
-
if (!tableNames.includes('file_catalog')) {
|
|
68
|
-
throw new Error(`table 'file_catalog' not found in ${dbPath}`);
|
|
69
|
-
}
|
|
70
|
-
const catalogTable = await profilerDb.openTable('file_catalog');
|
|
71
|
-
const files = await catalogTable.query().toArray();
|
|
72
|
-
logger?.info(`[ContextLake-Action] Found ${files.length} files in catalog`);
|
|
73
|
-
// Simple chunking for text
|
|
74
|
-
const splitText = (text, chunkSize = 500, overlap = 50) => {
|
|
75
|
-
const chunks = [];
|
|
76
|
-
if (!text)
|
|
77
|
-
return chunks;
|
|
78
|
-
let i = 0;
|
|
79
|
-
while (i < text.length) {
|
|
80
|
-
chunks.push(text.slice(i, i + chunkSize));
|
|
81
|
-
i += chunkSize - overlap;
|
|
82
|
-
}
|
|
83
|
-
return chunks;
|
|
84
|
-
};
|
|
85
|
-
const processText = async (text, fileInfo) => {
|
|
86
|
-
const chunks = splitText(text);
|
|
87
|
-
const docs = [];
|
|
88
|
-
for (const chunk of chunks) {
|
|
89
|
-
const vector = await metadataProvider.generateMultimodalEmbedding([{ type: 'text', text: chunk }]);
|
|
90
|
-
docs.push({
|
|
91
|
-
id: (0, uuid_1.v4)(),
|
|
92
|
-
vector,
|
|
93
|
-
text: chunk,
|
|
94
|
-
source: fileInfo.key,
|
|
95
|
-
file_type: fileInfo.category,
|
|
96
|
-
storage_type: 'source',
|
|
97
|
-
url: fileInfo.url || `tos://${fileInfo.bucket}/${fileInfo.key}`,
|
|
98
|
-
metadata: JSON.stringify({ datasource: params.datasource_name }),
|
|
99
|
-
created_at: Date.now(),
|
|
100
|
-
binary_data: Buffer.from('')
|
|
101
|
-
});
|
|
102
|
-
}
|
|
103
|
-
return docs;
|
|
104
|
-
};
|
|
105
|
-
for (const file of files) {
|
|
106
|
-
try {
|
|
107
|
-
logger?.info(`[ContextLake-Action] Processing file: ${file.key}, type: ${file.media_type}`);
|
|
108
|
-
let docs = [];
|
|
109
|
-
const fileUrl = file.url || `tos://${file.bucket}/${file.key}`;
|
|
110
|
-
if (file.media_type === 'pdf') {
|
|
111
|
-
// PDF Parse
|
|
112
|
-
const result = await lasClient.submitAndPoll('las_pdf_parse_doubao', {
|
|
113
|
-
url: fileUrl
|
|
114
|
-
});
|
|
115
|
-
const markdown = result.data?.markdown || '';
|
|
116
|
-
docs = await processText(markdown, file);
|
|
117
|
-
}
|
|
118
|
-
else if (file.media_type === 'image') {
|
|
119
|
-
// Multimodal Embedding directly
|
|
120
|
-
const vector = await metadataProvider.generateMultimodalEmbedding([
|
|
121
|
-
{ type: 'image_url', image_url: { url: fileUrl } },
|
|
122
|
-
{ type: 'text', text: 'This is an image from the dataset.' }
|
|
123
|
-
]);
|
|
124
|
-
docs.push({
|
|
125
|
-
id: (0, uuid_1.v4)(),
|
|
126
|
-
vector,
|
|
127
|
-
text: 'Image from dataset',
|
|
128
|
-
source: file.key,
|
|
129
|
-
file_type: 'image',
|
|
130
|
-
storage_type: 'source',
|
|
131
|
-
url: fileUrl,
|
|
132
|
-
metadata: JSON.stringify({ datasource: params.datasource_name }),
|
|
133
|
-
created_at: Date.now(),
|
|
134
|
-
binary_data: Buffer.from('')
|
|
135
|
-
});
|
|
136
|
-
}
|
|
137
|
-
else if (file.media_type === 'audio') {
|
|
138
|
-
// ASR
|
|
139
|
-
const result = await lasClient.submitAndPoll('las_asr_pro', {
|
|
140
|
-
audio: { url: fileUrl, format: file.key.split('.').pop() || 'wav' },
|
|
141
|
-
request: { model_name: 'bigmodel' }
|
|
142
|
-
});
|
|
143
|
-
const text = result.data?.result?.text || '';
|
|
144
|
-
docs = await processText(text, file);
|
|
145
|
-
}
|
|
146
|
-
else if (file.media_type === 'video') {
|
|
147
|
-
// Video understanding -> text -> embedding
|
|
148
|
-
const result = await lasClient.submitAndPoll('las_long_video_understand', {
|
|
149
|
-
video_url: fileUrl,
|
|
150
|
-
query: "详细描述这个视频的内容",
|
|
151
|
-
model_name: "doubao-seed-2-0-lite-260215"
|
|
152
|
-
});
|
|
153
|
-
// Assuming video output is a text description somewhere in the response.
|
|
154
|
-
// Note: the exact structure depends on the API return, adjusting to generic text.
|
|
155
|
-
const text = JSON.stringify(result.data || '');
|
|
156
|
-
// Also need audio extract and ASR for video
|
|
157
|
-
// 1. Extract audio
|
|
158
|
-
// The output_path_template needs a unique path per video
|
|
159
|
-
const audioOutputPath = `tos://${file.bucket}/.tmp/audio/${(0, uuid_1.v4)()}.wav`;
|
|
160
|
-
await lasClient.process('las_audio_extract_and_split', {
|
|
161
|
-
input_path: fileUrl,
|
|
162
|
-
output_path_template: audioOutputPath,
|
|
163
|
-
output_format: 'wav'
|
|
164
|
-
});
|
|
165
|
-
// 2. ASR on the extracted audio
|
|
166
|
-
// Wait briefly for object to be available if needed (often synchronous but tos takes a ms)
|
|
167
|
-
const asrResult = await lasClient.submitAndPoll('las_asr_pro', {
|
|
168
|
-
audio: { url: audioOutputPath.replace('{index}.{output_file_ext}', '0.wav'), format: 'wav' },
|
|
169
|
-
request: { model_name: 'bigmodel' }
|
|
170
|
-
});
|
|
171
|
-
const audioText = asrResult.data?.result?.text || '';
|
|
172
|
-
// Combine video text and audio text
|
|
173
|
-
const combinedText = `Video Description: ${text}\n\nAudio Transcription: ${audioText}`;
|
|
174
|
-
docs = await processText(combinedText, file);
|
|
175
|
-
}
|
|
176
|
-
else if (file.category === 'structured' || file.category === 'non-structured') {
|
|
177
|
-
// If we had a direct text content, we could process it here.
|
|
178
|
-
// Assuming basic local download or similar is available, but for now we skip raw file reading from TOS in this demo script unless implemented.
|
|
179
|
-
// Fallback just logs
|
|
180
|
-
logger?.warn(`[ContextLake-Action] Skipping raw text/structured download for ${file.key} - implement TOS download if needed`);
|
|
181
|
-
}
|
|
182
|
-
if (docs.length > 0) {
|
|
183
|
-
await metadataProvider.addAssets(docs);
|
|
184
|
-
results.push({ file: file.key, status: 'success', chunks: docs.length });
|
|
185
|
-
}
|
|
186
|
-
}
|
|
187
|
-
catch (error) {
|
|
188
|
-
logger?.error(`[ContextLake-Action] Error processing ${file.key}: ${error.message}`);
|
|
189
|
-
results.push({ file: file.key, status: 'error', message: error.message });
|
|
190
|
-
}
|
|
191
|
-
}
|
|
192
|
-
return results;
|
|
193
|
-
}
|