@byted-las/contextlake-openclaw 1.0.5 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/commands/cli.d.ts +0 -2
- package/dist/src/commands/cli.js +0 -46
- package/dist/src/commands/index.js +0 -29
- package/dist/src/commands/slashcmd.d.ts +0 -6
- package/dist/src/commands/slashcmd.js +0 -87
- package/dist/src/commands/tools.d.ts +0 -2
- package/dist/src/commands/tools.js +0 -94
- package/dist/src/skills/SKILL.md +26 -18
- package/dist/src/skills/contextlake-delete/SKILL.md +2 -0
- package/dist/src/skills/contextlake-ingest/SKILL.md +57 -38
- package/dist/src/skills/contextlake-list/SKILL.md +2 -0
- package/dist/src/skills/contextlake-retrieve/SKILL.md +2 -0
- package/dist/src/skills/las-data-profiler/SKILL.md +26 -18
- package/openclaw.plugin.json +1 -1
- package/package.json +1 -1
- package/src/commands/cli.ts +0 -45
- package/src/commands/index.ts +0 -35
- package/src/commands/slashcmd.ts +0 -59
- package/src/commands/tools.ts +0 -99
- package/src/skills/contextlake-delete/SKILL.md +2 -0
- package/src/skills/contextlake-ingest/SKILL.md +57 -38
- package/src/skills/contextlake-list/SKILL.md +2 -0
- package/src/skills/contextlake-retrieve/SKILL.md +2 -0
- package/src/skills/las-data-profiler/SKILL.md +26 -18
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
import { ContextLakeConfig } from '../utils/config';
|
|
2
2
|
export declare function getCliCommands(pluginConfig: ContextLakeConfig, logger: any): {
|
|
3
|
-
connectAction: (datasource_name: string, url: string, options: any) => Promise<void>;
|
|
4
|
-
ingestAction: (datasource_name: string) => Promise<void>;
|
|
5
3
|
searchAction: (query: any, options: any) => Promise<void>;
|
|
6
4
|
listAction: (options: any) => Promise<void>;
|
|
7
5
|
deleteAction: (options: any) => Promise<void>;
|
package/dist/src/commands/cli.js
CHANGED
|
@@ -1,11 +1,8 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.getCliCommands = getCliCommands;
|
|
4
|
-
// @ts-ignore
|
|
5
|
-
const ingest_1 = require("../lib/actions/ingest");
|
|
6
4
|
const retrieve_1 = require("../lib/actions/retrieve");
|
|
7
5
|
const manage_1 = require("../lib/actions/manage");
|
|
8
|
-
const profiler_1 = require("../lib/actions/profiler");
|
|
9
6
|
const credentials_1 = require("../utils/credentials");
|
|
10
7
|
function parseOptionalInt(value, fallback) {
|
|
11
8
|
const parsed = Number.parseInt(String(value), 10);
|
|
@@ -34,49 +31,6 @@ function parseMetadata(metadata) {
|
|
|
34
31
|
}
|
|
35
32
|
function getCliCommands(pluginConfig, logger) {
|
|
36
33
|
return {
|
|
37
|
-
connectAction: async (datasource_name, url, options) => {
|
|
38
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] CLI connect started`, { datasource_name, url, options });
|
|
39
|
-
try {
|
|
40
|
-
const params = {
|
|
41
|
-
datasource_name,
|
|
42
|
-
url,
|
|
43
|
-
endpoint: options.endpoint,
|
|
44
|
-
access_key: options.ak,
|
|
45
|
-
secret_key: options.sk,
|
|
46
|
-
region: options.region,
|
|
47
|
-
sample_rows: parseInt(options.sampleRows),
|
|
48
|
-
};
|
|
49
|
-
// eslint-disable-next-line no-console
|
|
50
|
-
console.log(`[contextlake connect] Connecting to datasource "${datasource_name}"...`);
|
|
51
|
-
// eslint-disable-next-line no-console
|
|
52
|
-
console.log(` url: ${params.url}`);
|
|
53
|
-
const result = await (0, profiler_1.connectDataSource)(params);
|
|
54
|
-
// eslint-disable-next-line no-console
|
|
55
|
-
console.log(JSON.stringify(result, null, 2));
|
|
56
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] CLI connect success`);
|
|
57
|
-
}
|
|
58
|
-
catch (e) {
|
|
59
|
-
// eslint-disable-next-line no-console
|
|
60
|
-
console.error('Error:', e.message);
|
|
61
|
-
logger.error(`[${new Date().toISOString()}] [ContextLake] CLI connect failed`, { error: e.message, stack: e.stack });
|
|
62
|
-
process.exitCode = 1;
|
|
63
|
-
}
|
|
64
|
-
},
|
|
65
|
-
ingestAction: async (datasource_name) => {
|
|
66
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest started`, { datasource_name });
|
|
67
|
-
try {
|
|
68
|
-
const result = await (0, ingest_1.ingestSource)({
|
|
69
|
-
datasource_name
|
|
70
|
-
}, pluginConfig, logger);
|
|
71
|
-
// eslint-disable-next-line no-console
|
|
72
|
-
console.log(JSON.stringify(result, null, 2));
|
|
73
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest success`);
|
|
74
|
-
}
|
|
75
|
-
catch (e) {
|
|
76
|
-
console.error('Error:', e.message);
|
|
77
|
-
logger.error(`[${new Date().toISOString()}] [ContextLake] CLI ingest failed`, { error: e.message, stack: e.stack });
|
|
78
|
-
}
|
|
79
|
-
},
|
|
80
34
|
searchAction: async (query, options) => {
|
|
81
35
|
logger.info(`[${new Date().toISOString()}] [ContextLake] CLI search started`, { query, options });
|
|
82
36
|
try {
|
|
@@ -10,16 +10,12 @@ function registerAll(ctx, logger) {
|
|
|
10
10
|
// Register Agent Tools
|
|
11
11
|
try {
|
|
12
12
|
const tools = (0, tools_1.getAgentTools)(pluginConfig, logger);
|
|
13
|
-
ctx.registerTool(tools.ingestTool);
|
|
14
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.ingestTool.name}`);
|
|
15
13
|
ctx.registerTool(tools.retrieveTool);
|
|
16
14
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.retrieveTool.name}`);
|
|
17
15
|
ctx.registerTool(tools.listTool);
|
|
18
16
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listTool.name}`);
|
|
19
17
|
ctx.registerTool(tools.deleteTool);
|
|
20
18
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.deleteTool.name}`);
|
|
21
|
-
ctx.registerTool(tools.lasDataProfilerTool);
|
|
22
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.lasDataProfilerTool.name}`);
|
|
23
19
|
ctx.registerTool(tools.listS3ObjectsTool);
|
|
24
20
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listS3ObjectsTool.name}`);
|
|
25
21
|
ctx.registerTool(tools.readS3ObjectTool);
|
|
@@ -49,19 +45,6 @@ function registerAll(ctx, logger) {
|
|
|
49
45
|
const contextlake = program.command('contextlake')
|
|
50
46
|
.description('Manage ContextLake knowledge base');
|
|
51
47
|
const commands = (0, cli_1.getCliCommands)(pluginConfig, logger);
|
|
52
|
-
// connect -- data source profiling (las-data-profiler)
|
|
53
|
-
contextlake.command('connect <datasource_name> <url>')
|
|
54
|
-
.description('Connect to a data source and profile its structure, schemas, and media metadata into LanceDB')
|
|
55
|
-
.option('--endpoint <url>', 'S3 Endpoint URL (not needed for local)')
|
|
56
|
-
.option('--ak <credential_id>', 'Credential ID for the data source')
|
|
57
|
-
.option('--sk <credential_value>', 'Credential value for the data source')
|
|
58
|
-
.option('--region <region>', 'Region identifier (e.g. cn-beijing)')
|
|
59
|
-
.option('--sample-rows <number>', 'Number of rows to sample per structured file', '100')
|
|
60
|
-
.action(commands.connectAction);
|
|
61
|
-
// Ingest
|
|
62
|
-
contextlake.command('ingest <datasource_name>')
|
|
63
|
-
.description('Process and ingest all files from a connected data source into the knowledge base')
|
|
64
|
-
.action(commands.ingestAction);
|
|
65
48
|
// Search
|
|
66
49
|
contextlake.command('search <query>')
|
|
67
50
|
.description('Search the knowledge base for relevant documents')
|
|
@@ -98,12 +81,6 @@ function registerAll(ctx, logger) {
|
|
|
98
81
|
return;
|
|
99
82
|
}
|
|
100
83
|
const slashCommands = (0, slashcmd_1.getSlashCommands)(pluginConfig, logger);
|
|
101
|
-
ctx.registerCommand({
|
|
102
|
-
name: 'contextlake-ingest',
|
|
103
|
-
description: 'Process and ingest all files from a connected data source (usage: /contextlake-ingest <datasource_name>)',
|
|
104
|
-
acceptsArgs: true,
|
|
105
|
-
handler: slashCommands.ingestHandler
|
|
106
|
-
});
|
|
107
84
|
ctx.registerCommand({
|
|
108
85
|
name: 'contextlake-list',
|
|
109
86
|
description: 'List documents currently in the knowledge base',
|
|
@@ -122,12 +99,6 @@ function registerAll(ctx, logger) {
|
|
|
122
99
|
acceptsArgs: true,
|
|
123
100
|
handler: slashCommands.deleteHandler
|
|
124
101
|
});
|
|
125
|
-
ctx.registerCommand({
|
|
126
|
-
name: 'contextlake-profiler',
|
|
127
|
-
description: 'Connect to a data source and profile its structure (usage: /contextlake-profiler <datasource_name> <vendor> <bucket> <prefix>)',
|
|
128
|
-
acceptsArgs: true,
|
|
129
|
-
handler: slashCommands.profilerHandler
|
|
130
|
-
});
|
|
131
102
|
ctx.registerCommand({
|
|
132
103
|
name: 'contextlake-list-datasource',
|
|
133
104
|
description: 'List all connected and profiled data sources (usage: /contextlake-list-datasource)',
|
|
@@ -1,8 +1,5 @@
|
|
|
1
1
|
import { ContextLakeConfig } from '../utils/config';
|
|
2
2
|
export declare function getSlashCommands(pluginConfig: ContextLakeConfig, logger: any): {
|
|
3
|
-
ingestHandler: (commandCtx: any) => Promise<{
|
|
4
|
-
text: string;
|
|
5
|
-
}>;
|
|
6
3
|
listHandler: (commandCtx: any) => Promise<{
|
|
7
4
|
text: string;
|
|
8
5
|
}>;
|
|
@@ -12,9 +9,6 @@ export declare function getSlashCommands(pluginConfig: ContextLakeConfig, logger
|
|
|
12
9
|
deleteHandler: (commandCtx: any) => Promise<{
|
|
13
10
|
text: string;
|
|
14
11
|
}>;
|
|
15
|
-
profilerHandler: (commandCtx: any) => Promise<{
|
|
16
|
-
text: string;
|
|
17
|
-
}>;
|
|
18
12
|
listDatasourceHandler: (commandCtx: any) => Promise<{
|
|
19
13
|
text: string;
|
|
20
14
|
}>;
|
|
@@ -1,72 +1,11 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
-
if (k2 === undefined) k2 = k;
|
|
4
|
-
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
-
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
-
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
-
}
|
|
8
|
-
Object.defineProperty(o, k2, desc);
|
|
9
|
-
}) : (function(o, m, k, k2) {
|
|
10
|
-
if (k2 === undefined) k2 = k;
|
|
11
|
-
o[k2] = m[k];
|
|
12
|
-
}));
|
|
13
|
-
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
-
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
-
}) : function(o, v) {
|
|
16
|
-
o["default"] = v;
|
|
17
|
-
});
|
|
18
|
-
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
-
var ownKeys = function(o) {
|
|
20
|
-
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
-
var ar = [];
|
|
22
|
-
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
-
return ar;
|
|
24
|
-
};
|
|
25
|
-
return ownKeys(o);
|
|
26
|
-
};
|
|
27
|
-
return function (mod) {
|
|
28
|
-
if (mod && mod.__esModule) return mod;
|
|
29
|
-
var result = {};
|
|
30
|
-
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
-
__setModuleDefault(result, mod);
|
|
32
|
-
return result;
|
|
33
|
-
};
|
|
34
|
-
})();
|
|
35
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
3
|
exports.getSlashCommands = getSlashCommands;
|
|
37
|
-
const ingest_1 = require("../lib/actions/ingest");
|
|
38
4
|
const retrieve_1 = require("../lib/actions/retrieve");
|
|
39
5
|
const manage_1 = require("../lib/actions/manage");
|
|
40
6
|
const profiler_1 = require("../lib/actions/profiler");
|
|
41
|
-
const fs = __importStar(require("fs"));
|
|
42
|
-
const path = __importStar(require("path"));
|
|
43
|
-
const os = __importStar(require("os"));
|
|
44
7
|
function getSlashCommands(pluginConfig, logger) {
|
|
45
8
|
return {
|
|
46
|
-
ingestHandler: async (commandCtx) => {
|
|
47
|
-
const rawArgs = commandCtx.args || "";
|
|
48
|
-
const args = rawArgs.split(' ').filter((arg) => arg.trim() !== '');
|
|
49
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest started`, { args });
|
|
50
|
-
try {
|
|
51
|
-
if (args.length === 0) {
|
|
52
|
-
return { text: `**Error:** Missing datasource_name. Usage: /contextlake-ingest <datasource_name>` };
|
|
53
|
-
}
|
|
54
|
-
const datasource_name = args[0];
|
|
55
|
-
const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
|
|
56
|
-
const dsDir = path.join(BASE_DIR, datasource_name);
|
|
57
|
-
const dbPath = path.join(dsDir, 'catalog_db');
|
|
58
|
-
if (!fs.existsSync(dbPath)) {
|
|
59
|
-
return { text: `**Error:** Data source "${datasource_name}" has not been profiled yet.\n\nPlease run the profiler first using:\n\`/contextlake-profiler <datasource_name> <vendor> <bucket> <prefix> [endpoint] [ak] [sk] [region]\`` };
|
|
60
|
-
}
|
|
61
|
-
const result = await (0, ingest_1.ingestSource)({ datasource_name }, pluginConfig, logger);
|
|
62
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest completed`, { resultCount: result.length });
|
|
63
|
-
return { text: `**Ingest Results (${result.length} files processed):**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
|
|
64
|
-
}
|
|
65
|
-
catch (e) {
|
|
66
|
-
logger.error(`[ContextLake] Slash ingest failed`, { error: e.message });
|
|
67
|
-
return { text: `**Error executing ingest:** ${e.message}` };
|
|
68
|
-
}
|
|
69
|
-
},
|
|
70
9
|
listHandler: async (commandCtx) => {
|
|
71
10
|
const rawArgs = commandCtx.args || "";
|
|
72
11
|
const args = rawArgs.split(' ').filter((arg) => arg.trim() !== '');
|
|
@@ -128,32 +67,6 @@ function getSlashCommands(pluginConfig, logger) {
|
|
|
128
67
|
return { text: `**Error executing delete:** ${e.message}` };
|
|
129
68
|
}
|
|
130
69
|
},
|
|
131
|
-
profilerHandler: async (commandCtx) => {
|
|
132
|
-
const rawArgs = commandCtx.args || "";
|
|
133
|
-
const args = rawArgs.split(' ').filter((arg) => arg.trim() !== '');
|
|
134
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler started`, { args });
|
|
135
|
-
try {
|
|
136
|
-
if (args.length < 2) {
|
|
137
|
-
return { text: `**Error:** Missing arguments. Usage: /contextlake-profiler <datasource_name> <url> [endpoint] [ak] [sk] [region]` };
|
|
138
|
-
}
|
|
139
|
-
const [datasource_name, url, endpoint, access_key, secret_key, region] = args;
|
|
140
|
-
const params = {
|
|
141
|
-
datasource_name,
|
|
142
|
-
url,
|
|
143
|
-
endpoint,
|
|
144
|
-
access_key,
|
|
145
|
-
secret_key,
|
|
146
|
-
region,
|
|
147
|
-
};
|
|
148
|
-
const result = await (0, profiler_1.connectDataSource)(params);
|
|
149
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler completed`, { result });
|
|
150
|
-
return { text: `**Profiler Results:**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
|
|
151
|
-
}
|
|
152
|
-
catch (e) {
|
|
153
|
-
logger.error(`[ContextLake] Slash profiler failed`, { error: e.message });
|
|
154
|
-
return { text: `**Error executing profiler:** ${e.message}` };
|
|
155
|
-
}
|
|
156
|
-
},
|
|
157
70
|
listDatasourceHandler: async (commandCtx) => {
|
|
158
71
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command list-datasource started`);
|
|
159
72
|
try {
|
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
import { ContextLakeConfig } from '../utils/config';
|
|
2
2
|
import type { AnyAgentTool } from 'openclaw/plugin-sdk';
|
|
3
3
|
export declare function getAgentTools(pluginConfig: ContextLakeConfig, logger: any): {
|
|
4
|
-
ingestTool: AnyAgentTool;
|
|
5
4
|
retrieveTool: AnyAgentTool;
|
|
6
5
|
listTool: AnyAgentTool;
|
|
7
6
|
deleteTool: AnyAgentTool;
|
|
8
|
-
lasDataProfilerTool: AnyAgentTool;
|
|
9
7
|
listDatasourceTool: AnyAgentTool;
|
|
10
8
|
listS3ObjectsTool: AnyAgentTool;
|
|
11
9
|
readS3ObjectTool: AnyAgentTool;
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.getAgentTools = getAgentTools;
|
|
4
|
-
const ingest_1 = require("../lib/actions/ingest");
|
|
5
4
|
const retrieve_1 = require("../lib/actions/retrieve");
|
|
6
5
|
const manage_1 = require("../lib/actions/manage");
|
|
7
6
|
const profiler_1 = require("../lib/actions/profiler");
|
|
@@ -40,62 +39,6 @@ function getAgentTools(pluginConfig, logger) {
|
|
|
40
39
|
}
|
|
41
40
|
}
|
|
42
41
|
},
|
|
43
|
-
ingestTool: {
|
|
44
|
-
name: 'contextlake-ingest',
|
|
45
|
-
label: 'ContextLake Ingest',
|
|
46
|
-
description: `Process and ingest all files from a connected data source into the knowledge base.
|
|
47
|
-
Use this tool when the user wants to "将知识注入", "上传文件", "入库", "添加文档", "ingest files", or "add knowledge".
|
|
48
|
-
Supports multimodal files (text, images, audio, video, pdf) by using LAS models to understand and embed them.
|
|
49
|
-
Must be called after a data source has been successfully profiled via \`las-data-profiler\`.`,
|
|
50
|
-
parameters: {
|
|
51
|
-
type: 'object',
|
|
52
|
-
properties: {
|
|
53
|
-
datasource_name: { type: 'string', description: 'Name of the data source previously profiled' }
|
|
54
|
-
},
|
|
55
|
-
required: ['datasource_name'],
|
|
56
|
-
additionalProperties: false
|
|
57
|
-
},
|
|
58
|
-
async execute(toolCallId, params) {
|
|
59
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Executing ingest skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
|
|
60
|
-
try {
|
|
61
|
-
let actualParams = params;
|
|
62
|
-
if (typeof params === 'string') {
|
|
63
|
-
try {
|
|
64
|
-
actualParams = JSON.parse(params);
|
|
65
|
-
}
|
|
66
|
-
catch (e) {
|
|
67
|
-
logger.warn(`[ContextLake] Received string params, possibly toolCallId?`, { params });
|
|
68
|
-
return {
|
|
69
|
-
content: [{ type: "text", text: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }],
|
|
70
|
-
details: { error: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }
|
|
71
|
-
};
|
|
72
|
-
}
|
|
73
|
-
}
|
|
74
|
-
if (!actualParams.datasource_name && actualParams.params && actualParams.params.datasource_name) {
|
|
75
|
-
actualParams = actualParams.params;
|
|
76
|
-
}
|
|
77
|
-
if (!actualParams.datasource_name) {
|
|
78
|
-
return {
|
|
79
|
-
content: [{ type: "text", text: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }],
|
|
80
|
-
details: { error: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }
|
|
81
|
-
};
|
|
82
|
-
}
|
|
83
|
-
const result = await (0, ingest_1.ingestSource)(actualParams, pluginConfig, logger);
|
|
84
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Ingest skill completed successfully`, { resultSummary: Array.isArray(result) ? `Processed ${result.length} items` : 'Success' });
|
|
85
|
-
return {
|
|
86
|
-
content: [{ type: "text", text: JSON.stringify(result) }],
|
|
87
|
-
details: result
|
|
88
|
-
};
|
|
89
|
-
}
|
|
90
|
-
catch (error) {
|
|
91
|
-
logger.error(`[${new Date().toISOString()}] [ContextLake] Ingest skill failed`, { error: error.message, stack: error.stack });
|
|
92
|
-
return {
|
|
93
|
-
content: [{ type: "text", text: String(error.message) }],
|
|
94
|
-
details: { error: error.message }
|
|
95
|
-
};
|
|
96
|
-
}
|
|
97
|
-
}
|
|
98
|
-
},
|
|
99
42
|
retrieveTool: {
|
|
100
43
|
name: 'contextlake-retrieve',
|
|
101
44
|
label: 'ContextLake Retrieve',
|
|
@@ -245,43 +188,6 @@ Example User Queries:
|
|
|
245
188
|
}
|
|
246
189
|
}
|
|
247
190
|
},
|
|
248
|
-
lasDataProfilerTool: {
|
|
249
|
-
name: 'las-data-profiler',
|
|
250
|
-
label: 'LAS Data Profiler',
|
|
251
|
-
description: 'Connect to a data source (TOS/OSS/COS/S3/Local) and profile its structure, schemas, and media metadata into LanceDB',
|
|
252
|
-
parameters: {
|
|
253
|
-
type: 'object',
|
|
254
|
-
properties: {
|
|
255
|
-
datasource_name: { type: 'string', description: 'Name of the data source' },
|
|
256
|
-
url: { type: 'string', description: 'Data source URL (e.g. tos://bucket/prefix, oss://..., s3://..., file:///path)' },
|
|
257
|
-
sample_rows: { type: 'integer', description: 'Number of rows to sample per structured file' }
|
|
258
|
-
},
|
|
259
|
-
required: ['datasource_name', 'url'],
|
|
260
|
-
additionalProperties: false
|
|
261
|
-
},
|
|
262
|
-
async execute(toolCallId, params) {
|
|
263
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Executing las-data-profiler skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
|
|
264
|
-
try {
|
|
265
|
-
let actualParams = params;
|
|
266
|
-
if (params && params.params) {
|
|
267
|
-
actualParams = params.params;
|
|
268
|
-
}
|
|
269
|
-
const result = await (0, profiler_1.connectDataSource)(actualParams);
|
|
270
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill completed`, { result });
|
|
271
|
-
return {
|
|
272
|
-
content: [{ type: "text", text: JSON.stringify(result) }],
|
|
273
|
-
details: result
|
|
274
|
-
};
|
|
275
|
-
}
|
|
276
|
-
catch (error) {
|
|
277
|
-
logger.error(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill failed`, { error: error.message, stack: error.stack });
|
|
278
|
-
return {
|
|
279
|
-
content: [{ type: "text", text: String(error.message) }],
|
|
280
|
-
details: { error: error.message }
|
|
281
|
-
};
|
|
282
|
-
}
|
|
283
|
-
}
|
|
284
|
-
},
|
|
285
191
|
listS3ObjectsTool: {
|
|
286
192
|
name: 'list-s3-objects',
|
|
287
193
|
label: 'List S3 Objects',
|
package/dist/src/skills/SKILL.md
CHANGED
|
@@ -1,10 +1,14 @@
|
|
|
1
|
+
---
|
|
1
2
|
name: byted-las-data-profiler
|
|
2
3
|
description: |
|
|
3
|
-
Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them
|
|
4
|
-
|
|
4
|
+
Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them,
|
|
5
|
+
performs schema inference and column semantic analysis on structured data (JSONL/CSV/Parquet/JSON),
|
|
6
|
+
extracts key meta-information for media files (Image/Audio/Video/PDF) by reading only header bytes,
|
|
7
|
+
and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
|
|
5
8
|
|
|
6
9
|
IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
|
|
7
|
-
You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog
|
|
10
|
+
You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) to accomplish the profiling tasks.
|
|
11
|
+
---
|
|
8
12
|
|
|
9
13
|
## Trigger Scenarios
|
|
10
14
|
Be sure to use this Skill when the user mentions the following scenarios:
|
|
@@ -14,24 +18,28 @@ Be sure to use this Skill when the user mentions the following scenarios:
|
|
|
14
18
|
- Need to understand what a batch of data files contains and what their schema looks like
|
|
15
19
|
- Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
|
|
16
20
|
- Need to write the meta-information of object storage or local files into LanceDB
|
|
21
|
+
- Mentions TOS, boto3, or object storage data profiling
|
|
17
22
|
- Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
|
|
23
|
+
- Need to batch identify the type and size of remote/local files and build an index
|
|
24
|
+
- Need to quickly understand the structure of an unfamiliar dataset (what files it contains, what the schema looks like, and what the fields mean)
|
|
25
|
+
- Need to connect/dock a data source for profiling
|
|
26
|
+
- Mentions phrases such as "connect a data source" or "dock a data source"
|
|
18
27
|
|
|
19
|
-
##
|
|
20
|
-
|
|
21
|
-
If you need to perform custom exploration, you can use `list-s3-objects` to traverse the bucket and `read-s3-object` to read file headers, and `write-lance-catalog` to save results.
|
|
28
|
+
## Overview
|
|
29
|
+
This Skill acts as a Dataset Profiling Guide. You should use the `list-s3-objects` tool to traverse the S3 bucket or local directory, use `read-s3-object` to read file contents or headers, parse the schema or media metadata, and finally use `write-lance-catalog` to save the catalog into a local LanceDB.
|
|
22
30
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
| vendor | volcengine / alibaba / tencent / aws / local | volcengine |
|
|
28
|
-
| endpoint | S3 Endpoint URL (not required for local) | https://tos-s3-cn-beijing.volces.com |
|
|
29
|
-
| access_key | AK | - |
|
|
30
|
-
| secret_key | SK | - |
|
|
31
|
-
| region | Region identifier | cn-beijing |
|
|
32
|
-
| bucket | Bucket name (root directory path when local) | my-data-bucket |
|
|
33
|
-
| prefix | Path prefix to limit the scan scope | datasets/2024/ |
|
|
31
|
+
1. **Cataloging**: Use `list-s3-objects` to record the meta-information (path, size, etc.) of files.
|
|
32
|
+
2. **Understanding Structured Data**: Use `read-s3-object` to sample JSONL / CSV / TSV / Parquet / JSON.
|
|
33
|
+
3. **Extracting Media Meta-information**: Use `read-s3-object` with `maxBytes` to read only the file header (without downloading the full file) for images, audio, video, and PDFs to extract key attributes.
|
|
34
|
+
4. **Writing to LanceDB**: Use `write-lance-catalog` to save the results.
|
|
34
35
|
|
|
35
36
|
## Output Location
|
|
36
37
|
- LanceDB table storage path: `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`
|
|
37
|
-
-
|
|
38
|
+
- Table names: `files`, `structured_schemas`, `media_metadata`
|
|
39
|
+
|
|
40
|
+
## Available Tools for this Skill
|
|
41
|
+
- `list-s3-objects`: To traverse and list files in the bucket/directory.
|
|
42
|
+
- `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction.
|
|
43
|
+
- `write-lance-catalog`: To write the profiling results to the LanceDB catalog.
|
|
44
|
+
|
|
45
|
+
Always report the final profiling summary back to the user once the `write-lance-catalog` completes successfully.
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
---
|
|
1
2
|
name: contextlake-delete
|
|
2
3
|
description: |
|
|
3
4
|
Delete documents and assets from the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
|
|
@@ -34,3 +35,4 @@ parameters:
|
|
|
34
35
|
type: string
|
|
35
36
|
description: SQL-like filter string to identify documents to delete (e.g. "metadata.category = 'obsolete'").
|
|
36
37
|
required: false
|
|
38
|
+
---
|
|
@@ -1,40 +1,59 @@
|
|
|
1
|
+
---
|
|
1
2
|
name: contextlake-ingest
|
|
2
3
|
description: |
|
|
3
|
-
|
|
4
|
-
Use this
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
4
|
+
A guide for OpenClaw agents on how to perform data ingestion into ContextLake.
|
|
5
|
+
Use this skill to understand the correct workflow for profiling a dataset and then ingesting it.
|
|
6
|
+
|
|
7
|
+
When a user asks to "ingest data", "导入数据", "接入数据源", or "处理某个桶里的数据",
|
|
8
|
+
you MUST follow the two-step workflow outlined below using the specific tools provided.
|
|
9
|
+
|
|
10
|
+
IMPORTANT RULE: You are STRICTLY FORBIDDEN from using any tools outside of this project for ingestion.
|
|
11
|
+
Do not use custom Python scripts, external CLI tools, or unrelated plugins.
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
# ContextLake Data Ingestion Workflow Guide
|
|
15
|
+
|
|
16
|
+
To successfully ingest data into the ContextLake Knowledge Base, you **MUST** follow a strict two-step process.
|
|
17
|
+
|
|
18
|
+
## Step 1: Data Profiling
|
|
19
|
+
Before data can be ingested, the source MUST first be profiled to extract its schema, structure, and media metadata into a local LanceDB catalog.
|
|
20
|
+
|
|
21
|
+
**Tool to use:** The OpenClaw agent, following the `byted-las-data-profiler` skill, which guides the profiling process.
|
|
22
|
+
|
|
23
|
+
### How it works:
|
|
24
|
+
- **Purpose:** Connects to an S3-compatible source (TOS, OSS, AWS S3, etc.) or local file system and catalogs the files.
|
|
25
|
+
- **Underlying tools used:** The agent will use `list-s3-objects`, `read-s3-object`, and `write-lance-catalog` to perform the profiling step by step.
|
|
26
|
+
|
|
27
|
+
*Wait for Step 1 to complete successfully before proceeding to Step 2.*
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Step 2: Data Ingestion
|
|
32
|
+
Once the data source is successfully profiled and the catalog is created, you can proceed to ingest the data into ContextLake.
|
|
33
|
+
|
|
34
|
+
**Tool to use:** OpenClaw Agent using basic tools guided by this skill.
|
|
35
|
+
|
|
36
|
+
### How it works:
|
|
37
|
+
- **Purpose:** Reads the LanceDB catalog created in Step 1, processes the multimodal files (text, images, audio, video, PDF) using LAS models, chunks the data, generates embeddings, and indexes them into the ContextLake Knowledge Base.
|
|
38
|
+
- **Underlying tools used:**
|
|
39
|
+
1. Use `read-lance-catalog` to read the catalog of files from `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`.
|
|
40
|
+
2. For each file, use appropriate LAS tools (like `las_pdf_parse_doubao`, `las_image_resample`, `las_long_video_understand`) to extract text and features.
|
|
41
|
+
3. Chunk and process the text.
|
|
42
|
+
4. Use the embedding tool or model to generate vectors.
|
|
43
|
+
5. Save the final chunks and vectors into the main ContextLake knowledge base.
|
|
44
|
+
|
|
45
|
+
*Note: You are acting as the ingestion pipeline. You must coordinate the reading of the catalog and the processing of each file type using the available LAS tools.*
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Auxiliary Tools (Use only when necessary)
|
|
50
|
+
If you need to verify the catalog contents between Step 1 and Step 2, or if ingestion fails and you need to debug:
|
|
51
|
+
|
|
52
|
+
- **`read-lance-catalog`**: Use this tool to read the records from the catalog database created in Step 1.
|
|
53
|
+
- Requires `db_path` (e.g., `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`) and `table_name` (usually "files").
|
|
54
|
+
- **`list-s3-objects` / `read-s3-object`**: Use these tools to manually inspect the raw source files if profiling fails.
|
|
55
|
+
- **`contextlake-list-datasource`**: Use this tool to see all data sources that have been connected/profiled.
|
|
56
|
+
|
|
57
|
+
## Error Handling
|
|
58
|
+
- If `contextlake-ingest` fails with an error saying the data source was not found, ensure that the data profiling step completed successfully for that exact `datasource_name`.
|
|
59
|
+
- Always report the results of both steps to the user clearly.
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
---
|
|
1
2
|
name: contextlake-list
|
|
2
3
|
description: |
|
|
3
4
|
List documents and assets currently in the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
|
|
@@ -20,3 +21,4 @@ parameters:
|
|
|
20
21
|
type: integer
|
|
21
22
|
description: Maximum number of documents to return (default 100).
|
|
22
23
|
required: false
|
|
24
|
+
---
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
---
|
|
1
2
|
name: contextlake-retrieve
|
|
2
3
|
description: |
|
|
3
4
|
Search, query, and retrieve relevant information from the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
|
|
@@ -35,3 +36,4 @@ parameters:
|
|
|
35
36
|
type: boolean
|
|
36
37
|
description: Whether to include binary content
|
|
37
38
|
required: false
|
|
39
|
+
---
|
|
@@ -1,10 +1,14 @@
|
|
|
1
|
+
---
|
|
1
2
|
name: byted-las-data-profiler
|
|
2
3
|
description: |
|
|
3
|
-
Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them
|
|
4
|
-
|
|
4
|
+
Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them,
|
|
5
|
+
performs schema inference and column semantic analysis on structured data (JSONL/CSV/Parquet/JSON),
|
|
6
|
+
extracts key meta-information for media files (Image/Audio/Video/PDF) by reading only header bytes,
|
|
7
|
+
and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
|
|
5
8
|
|
|
6
9
|
IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
|
|
7
|
-
You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog
|
|
10
|
+
You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) to accomplish the profiling tasks.
|
|
11
|
+
---
|
|
8
12
|
|
|
9
13
|
## Trigger Scenarios
|
|
10
14
|
Be sure to use this Skill when the user mentions the following scenarios:
|
|
@@ -14,24 +18,28 @@ Be sure to use this Skill when the user mentions the following scenarios:
|
|
|
14
18
|
- Need to understand what a batch of data files contains and what their schema looks like
|
|
15
19
|
- Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
|
|
16
20
|
- Need to write the meta-information of object storage or local files into LanceDB
|
|
21
|
+
- Mentions TOS, boto3, or object storage data profiling
|
|
17
22
|
- Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
|
|
23
|
+
- Need to batch identify the type and size of remote/local files and build an index
|
|
24
|
+
- Need to quickly understand the structure of an unfamiliar dataset (what files it contains, what the schema looks like, and what the fields mean)
|
|
25
|
+
- Need to connect/dock a data source for profiling
|
|
26
|
+
- Mentions keywords like "connect a data source" or "dock a data source"
|
|
18
27
|
|
|
19
|
-
##
|
|
20
|
-
|
|
21
|
-
If you need to perform custom exploration, you can use `list-s3-objects` to traverse the bucket and `read-s3-object` to read file headers, and `write-lance-catalog` to save results.
|
|
28
|
+
## Overview
|
|
29
|
+
This Skill acts as a Dataset Profiling Guide. You should use the `list-s3-objects` tool to traverse the S3 bucket or local directory, use `read-s3-object` to read file contents or headers, parse the schema or media metadata, and finally use `write-lance-catalog` to save the catalog into a local LanceDB.
|
|
22
30
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
| vendor | volcengine / alibaba / tencent / aws / local | volcengine |
|
|
28
|
-
| endpoint | S3 Endpoint URL (not required for local) | https://tos-s3-cn-beijing.volces.com |
|
|
29
|
-
| access_key | AK | - |
|
|
30
|
-
| secret_key | SK | - |
|
|
31
|
-
| region | Region identifier | cn-beijing |
|
|
32
|
-
| bucket | Bucket name (root directory path when local) | my-data-bucket |
|
|
33
|
-
| prefix | Path prefix to limit the scan scope | datasets/2024/ |
|
|
31
|
+
1. **Cataloging**: Use `list-s3-objects` to record the meta-information (path, size, etc.) of files.
|
|
32
|
+
2. **Understanding Structured Data**: Use `read-s3-object` to sample JSONL / CSV / TSV / Parquet / JSON.
|
|
33
|
+
3. **Extracting Media Meta-information**: Use `read-s3-object` with `maxBytes` to read only the file header (without downloading the full file) for images, audio, video, and PDFs to extract key attributes.
|
|
34
|
+
4. **Writing to LanceDB**: Use `write-lance-catalog` to save the results.
|
|
34
35
|
|
|
35
36
|
## Output Location
|
|
36
37
|
- LanceDB table storage path: `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`
|
|
37
|
-
-
|
|
38
|
+
- Table names: `files`, `structured_schemas`, `media_metadata`
|
|
39
|
+
|
|
40
|
+
## Available Tools for this Skill
|
|
41
|
+
- `list-s3-objects`: To traverse and list files in the bucket/directory.
|
|
42
|
+
- `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction.
|
|
43
|
+
- `write-lance-catalog`: To write the profiling results to the LanceDB catalog.
|
|
44
|
+
|
|
45
|
+
Always report the final profiling summary back to the user once the `write-lance-catalog` completes successfully.
|
package/openclaw.plugin.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"id": "contextlake-openclaw",
|
|
3
3
|
"name": "ContextLake",
|
|
4
|
-
"version": "1.0.5",
|
|
4
|
+
"version": "1.0.7",
|
|
5
5
|
"description": "A lightweight knowledge base plugin for OpenClaw using LanceDB and TOS, with data profiling support",
|
|
6
6
|
"skills": ["./src/skills"],
|
|
7
7
|
"configSchema": {
|
package/package.json
CHANGED
package/src/commands/cli.ts
CHANGED
|
@@ -37,51 +37,6 @@ function parseMetadata(metadata: any): Record<string, any> {
|
|
|
37
37
|
|
|
38
38
|
export function getCliCommands(pluginConfig: ContextLakeConfig, logger: any) {
|
|
39
39
|
return {
|
|
40
|
-
connectAction: async (datasource_name: string, url: string, options: any) => {
|
|
41
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] CLI connect started`, { datasource_name, url, options });
|
|
42
|
-
try {
|
|
43
|
-
const params: ConnectParams = {
|
|
44
|
-
datasource_name,
|
|
45
|
-
url,
|
|
46
|
-
endpoint: options.endpoint,
|
|
47
|
-
access_key: options.ak,
|
|
48
|
-
secret_key: options.sk,
|
|
49
|
-
region: options.region,
|
|
50
|
-
sample_rows: parseInt(options.sampleRows),
|
|
51
|
-
};
|
|
52
|
-
|
|
53
|
-
// eslint-disable-next-line no-console
|
|
54
|
-
console.log(`[contextlake connect] Connecting to datasource "${datasource_name}"...`);
|
|
55
|
-
// eslint-disable-next-line no-console
|
|
56
|
-
console.log(` url: ${params.url}`);
|
|
57
|
-
|
|
58
|
-
const result = await connectDataSource(params);
|
|
59
|
-
// eslint-disable-next-line no-console
|
|
60
|
-
console.log(JSON.stringify(result, null, 2));
|
|
61
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] CLI connect success`);
|
|
62
|
-
} catch (e: any) {
|
|
63
|
-
// eslint-disable-next-line no-console
|
|
64
|
-
console.error('Error:', e.message);
|
|
65
|
-
logger.error(`[${new Date().toISOString()}] [ContextLake] CLI connect failed`, { error: e.message, stack: e.stack });
|
|
66
|
-
process.exitCode = 1;
|
|
67
|
-
}
|
|
68
|
-
},
|
|
69
|
-
|
|
70
|
-
ingestAction: async (datasource_name: string) => {
|
|
71
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest started`, { datasource_name });
|
|
72
|
-
try {
|
|
73
|
-
const result = await ingestSource({
|
|
74
|
-
datasource_name
|
|
75
|
-
}, pluginConfig, logger);
|
|
76
|
-
// eslint-disable-next-line no-console
|
|
77
|
-
console.log(JSON.stringify(result, null, 2));
|
|
78
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest success`);
|
|
79
|
-
} catch (e: any) {
|
|
80
|
-
console.error('Error:', e.message);
|
|
81
|
-
logger.error(`[${new Date().toISOString()}] [ContextLake] CLI ingest failed`, { error: e.message, stack: e.stack });
|
|
82
|
-
}
|
|
83
|
-
},
|
|
84
|
-
|
|
85
40
|
searchAction: async (query: any, options: any) => {
|
|
86
41
|
logger.info(`[${new Date().toISOString()}] [ContextLake] CLI search started`, { query, options });
|
|
87
42
|
try {
|
package/src/commands/index.ts
CHANGED
|
@@ -12,9 +12,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
|
|
|
12
12
|
try {
|
|
13
13
|
const tools = getAgentTools(pluginConfig, logger);
|
|
14
14
|
|
|
15
|
-
ctx.registerTool(tools.ingestTool );
|
|
16
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.ingestTool.name}`);
|
|
17
|
-
|
|
18
15
|
ctx.registerTool(tools.retrieveTool );
|
|
19
16
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.retrieveTool.name}`);
|
|
20
17
|
|
|
@@ -23,9 +20,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
|
|
|
23
20
|
|
|
24
21
|
ctx.registerTool(tools.deleteTool );
|
|
25
22
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.deleteTool.name}`);
|
|
26
|
-
|
|
27
|
-
ctx.registerTool(tools.lasDataProfilerTool );
|
|
28
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.lasDataProfilerTool.name}`);
|
|
29
23
|
|
|
30
24
|
ctx.registerTool(tools.listS3ObjectsTool );
|
|
31
25
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listS3ObjectsTool.name}`);
|
|
@@ -65,21 +59,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
|
|
|
65
59
|
|
|
66
60
|
const commands = getCliCommands(pluginConfig, logger);
|
|
67
61
|
|
|
68
|
-
// connect -- data source profiling (las-data-profiler)
|
|
69
|
-
contextlake.command('connect <datasource_name> <url>')
|
|
70
|
-
.description('Connect to a data source and profile its structure, schemas, and media metadata into LanceDB')
|
|
71
|
-
.option('--endpoint <url>', 'S3 Endpoint URL (not needed for local)')
|
|
72
|
-
.option('--ak <credential_id>', 'Credential ID for the data source')
|
|
73
|
-
.option('--sk <credential_value>', 'Credential value for the data source')
|
|
74
|
-
.option('--region <region>', 'Region identifier (e.g. cn-beijing)')
|
|
75
|
-
.option('--sample-rows <number>', 'Number of rows to sample per structured file', '100')
|
|
76
|
-
.action(commands.connectAction);
|
|
77
|
-
|
|
78
|
-
// Ingest
|
|
79
|
-
contextlake.command('ingest <datasource_name>')
|
|
80
|
-
.description('Process and ingest all files from a connected data source into the knowledge base')
|
|
81
|
-
.action(commands.ingestAction);
|
|
82
|
-
|
|
83
62
|
// Search
|
|
84
63
|
contextlake.command('search <query>')
|
|
85
64
|
.description('Search the knowledge base for relevant documents')
|
|
@@ -122,13 +101,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
|
|
|
122
101
|
|
|
123
102
|
const slashCommands = getSlashCommands(pluginConfig, logger);
|
|
124
103
|
|
|
125
|
-
ctx.registerCommand({
|
|
126
|
-
name: 'contextlake-ingest',
|
|
127
|
-
description: 'Process and ingest all files from a connected data source (usage: /contextlake-ingest <datasource_name>)',
|
|
128
|
-
acceptsArgs: true,
|
|
129
|
-
handler: slashCommands.ingestHandler
|
|
130
|
-
});
|
|
131
|
-
|
|
132
104
|
ctx.registerCommand({
|
|
133
105
|
name: 'contextlake-list',
|
|
134
106
|
description: 'List documents currently in the knowledge base',
|
|
@@ -150,13 +122,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
|
|
|
150
122
|
handler: slashCommands.deleteHandler
|
|
151
123
|
});
|
|
152
124
|
|
|
153
|
-
ctx.registerCommand({
|
|
154
|
-
name: 'contextlake-profiler',
|
|
155
|
-
description: 'Connect to a data source and profile its structure (usage: /contextlake-profiler <datasource_name> <vendor> <bucket> <prefix>)',
|
|
156
|
-
acceptsArgs: true,
|
|
157
|
-
handler: slashCommands.profilerHandler
|
|
158
|
-
});
|
|
159
|
-
|
|
160
125
|
ctx.registerCommand({
|
|
161
126
|
name: 'contextlake-list-datasource',
|
|
162
127
|
description: 'List all connected and profiled data sources (usage: /contextlake-list-datasource)',
|
package/src/commands/slashcmd.ts
CHANGED
|
@@ -9,36 +9,6 @@ import * as os from 'os';
|
|
|
9
9
|
|
|
10
10
|
export function getSlashCommands(pluginConfig: ContextLakeConfig, logger: any) {
|
|
11
11
|
return {
|
|
12
|
-
ingestHandler: async (commandCtx: any) => {
|
|
13
|
-
const rawArgs = commandCtx.args || "";
|
|
14
|
-
const args = rawArgs.split(' ').filter((arg: string) => arg.trim() !== '');
|
|
15
|
-
|
|
16
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest started`, { args });
|
|
17
|
-
try {
|
|
18
|
-
if (args.length === 0) {
|
|
19
|
-
return { text: `**Error:** Missing datasource_name. Usage: /contextlake-ingest <datasource_name>` };
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
const datasource_name = args[0];
|
|
23
|
-
|
|
24
|
-
const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
|
|
25
|
-
const dsDir = path.join(BASE_DIR, datasource_name);
|
|
26
|
-
const dbPath = path.join(dsDir, 'catalog_db');
|
|
27
|
-
|
|
28
|
-
if (!fs.existsSync(dbPath)) {
|
|
29
|
-
return { text: `**Error:** Data source "${datasource_name}" has not been profiled yet.\n\nPlease run the profiler first using:\n\`/contextlake-profiler <datasource_name> <vendor> <bucket> <prefix> [endpoint] [ak] [sk] [region]\`` };
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
const result = await ingestSource({ datasource_name }, pluginConfig, logger);
|
|
33
|
-
|
|
34
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest completed`, { resultCount: result.length });
|
|
35
|
-
return { text: `**Ingest Results (${result.length} files processed):**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
|
|
36
|
-
} catch (e: any) {
|
|
37
|
-
logger.error(`[ContextLake] Slash ingest failed`, { error: e.message });
|
|
38
|
-
return { text: `**Error executing ingest:** ${e.message}` };
|
|
39
|
-
}
|
|
40
|
-
},
|
|
41
|
-
|
|
42
12
|
listHandler: async (commandCtx: any) => {
|
|
43
13
|
const rawArgs = commandCtx.args || "";
|
|
44
14
|
const args = rawArgs.split(' ').filter((arg: string) => arg.trim() !== '');
|
|
@@ -102,35 +72,6 @@ export function getSlashCommands(pluginConfig: ContextLakeConfig, logger: any) {
|
|
|
102
72
|
return { text: `**Error executing delete:** ${e.message}` };
|
|
103
73
|
}
|
|
104
74
|
},
|
|
105
|
-
profilerHandler: async (commandCtx: any) => {
|
|
106
|
-
const rawArgs = commandCtx.args || "";
|
|
107
|
-
const args = rawArgs.split(' ').filter((arg: string) => arg.trim() !== '');
|
|
108
|
-
|
|
109
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler started`, { args });
|
|
110
|
-
try {
|
|
111
|
-
if (args.length < 2) {
|
|
112
|
-
return { text: `**Error:** Missing arguments. Usage: /contextlake-profiler <datasource_name> <url> [endpoint] [ak] [sk] [region]` };
|
|
113
|
-
}
|
|
114
|
-
|
|
115
|
-
const [datasource_name, url, endpoint, access_key, secret_key, region] = args;
|
|
116
|
-
|
|
117
|
-
const params: ConnectParams = {
|
|
118
|
-
datasource_name,
|
|
119
|
-
url,
|
|
120
|
-
endpoint,
|
|
121
|
-
access_key,
|
|
122
|
-
secret_key,
|
|
123
|
-
region,
|
|
124
|
-
};
|
|
125
|
-
|
|
126
|
-
const result = await connectDataSource(params);
|
|
127
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler completed`, { result });
|
|
128
|
-
return { text: `**Profiler Results:**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
|
|
129
|
-
} catch (e: any) {
|
|
130
|
-
logger.error(`[ContextLake] Slash profiler failed`, { error: e.message });
|
|
131
|
-
return { text: `**Error executing profiler:** ${e.message}` };
|
|
132
|
-
}
|
|
133
|
-
},
|
|
134
75
|
|
|
135
76
|
listDatasourceHandler: async (commandCtx: any) => {
|
|
136
77
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command list-datasource started`);
|
package/src/commands/tools.ts
CHANGED
|
@@ -10,11 +10,9 @@ import { ContextLakeConfig } from '../utils/config';
|
|
|
10
10
|
import type { AnyAgentTool } from 'openclaw/plugin-sdk';
|
|
11
11
|
|
|
12
12
|
export function getAgentTools(pluginConfig: ContextLakeConfig, logger: any): {
|
|
13
|
-
ingestTool: AnyAgentTool;
|
|
14
13
|
retrieveTool: AnyAgentTool;
|
|
15
14
|
listTool: AnyAgentTool;
|
|
16
15
|
deleteTool: AnyAgentTool;
|
|
17
|
-
lasDataProfilerTool: AnyAgentTool;
|
|
18
16
|
listDatasourceTool: AnyAgentTool;
|
|
19
17
|
listS3ObjectsTool: AnyAgentTool;
|
|
20
18
|
readS3ObjectTool: AnyAgentTool;
|
|
@@ -54,65 +52,6 @@ export function getAgentTools(pluginConfig: ContextLakeConfig, logger: any): {
|
|
|
54
52
|
}
|
|
55
53
|
}
|
|
56
54
|
},
|
|
57
|
-
ingestTool: {
|
|
58
|
-
name: 'contextlake-ingest',
|
|
59
|
-
label: 'ContextLake Ingest',
|
|
60
|
-
description: `Process and ingest all files from a connected data source into the knowledge base.
|
|
61
|
-
Use this tool when the user wants to "将知识注入", "上传文件", "入库", "添加文档", "ingest files", or "add knowledge".
|
|
62
|
-
Supports multimodal files (text, images, audio, video, pdf) by using LAS models to understand and embed them.
|
|
63
|
-
Must be called after a data source has been successfully profiled via \`las-data-profiler\`.`,
|
|
64
|
-
parameters: {
|
|
65
|
-
type: 'object',
|
|
66
|
-
properties: {
|
|
67
|
-
datasource_name: { type: 'string', description: 'Name of the data source previously profiled' }
|
|
68
|
-
},
|
|
69
|
-
required: ['datasource_name'],
|
|
70
|
-
additionalProperties: false
|
|
71
|
-
},
|
|
72
|
-
|
|
73
|
-
async execute(toolCallId: string, params: any) {
|
|
74
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Executing ingest skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
|
|
75
|
-
|
|
76
|
-
try {
|
|
77
|
-
let actualParams = params;
|
|
78
|
-
if (typeof params === 'string') {
|
|
79
|
-
try {
|
|
80
|
-
actualParams = JSON.parse(params);
|
|
81
|
-
} catch (e) {
|
|
82
|
-
logger.warn(`[ContextLake] Received string params, possibly toolCallId?`, { params });
|
|
83
|
-
return {
|
|
84
|
-
content: [{ type: "text", text: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }],
|
|
85
|
-
details: { error: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }
|
|
86
|
-
} as any;
|
|
87
|
-
}
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
if (!actualParams.datasource_name && actualParams.params && actualParams.params.datasource_name) {
|
|
91
|
-
actualParams = actualParams.params;
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
if (!actualParams.datasource_name) {
|
|
95
|
-
return {
|
|
96
|
-
content: [{ type: "text", text: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }],
|
|
97
|
-
details: { error: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }
|
|
98
|
-
} as any;
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
const result = await ingestSource(actualParams, pluginConfig, logger);
|
|
102
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Ingest skill completed successfully`, { resultSummary: Array.isArray(result) ? `Processed ${result.length} items` : 'Success' });
|
|
103
|
-
return {
|
|
104
|
-
content: [{ type: "text", text: JSON.stringify(result) }],
|
|
105
|
-
details: result
|
|
106
|
-
} as any;
|
|
107
|
-
} catch (error: any) {
|
|
108
|
-
logger.error(`[${new Date().toISOString()}] [ContextLake] Ingest skill failed`, { error: error.message, stack: error.stack });
|
|
109
|
-
return {
|
|
110
|
-
content: [{ type: "text", text: String(error.message) }],
|
|
111
|
-
details: { error: error.message }
|
|
112
|
-
} as any;
|
|
113
|
-
}
|
|
114
|
-
}
|
|
115
|
-
},
|
|
116
55
|
retrieveTool: {
|
|
117
56
|
name: 'contextlake-retrieve',
|
|
118
57
|
label: 'ContextLake Retrieve',
|
|
@@ -268,44 +207,6 @@ Example User Queries:
|
|
|
268
207
|
}
|
|
269
208
|
}
|
|
270
209
|
},
|
|
271
|
-
lasDataProfilerTool: {
|
|
272
|
-
name: 'las-data-profiler',
|
|
273
|
-
label: 'LAS Data Profiler',
|
|
274
|
-
description: 'Connect to a data source (TOS/OSS/COS/S3/Local) and profile its structure, schemas, and media metadata into LanceDB',
|
|
275
|
-
parameters: {
|
|
276
|
-
type: 'object',
|
|
277
|
-
properties: {
|
|
278
|
-
datasource_name: { type: 'string', description: 'Name of the data source' },
|
|
279
|
-
url: { type: 'string', description: 'Data source URL (e.g. tos://bucket/prefix, oss://..., s3://..., file:///path)' },
|
|
280
|
-
sample_rows: { type: 'integer', description: 'Number of rows to sample per structured file' }
|
|
281
|
-
},
|
|
282
|
-
required: ['datasource_name', 'url'],
|
|
283
|
-
additionalProperties: false
|
|
284
|
-
},
|
|
285
|
-
|
|
286
|
-
async execute(toolCallId: string, params: any) {
|
|
287
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] Executing las-data-profiler skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
|
|
288
|
-
|
|
289
|
-
try {
|
|
290
|
-
let actualParams = params;
|
|
291
|
-
if (params && params.params) {
|
|
292
|
-
actualParams = params.params;
|
|
293
|
-
}
|
|
294
|
-
const result = await connectDataSource(actualParams);
|
|
295
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill completed`, { result });
|
|
296
|
-
return {
|
|
297
|
-
content: [{ type: "text", text: JSON.stringify(result) }],
|
|
298
|
-
details: result
|
|
299
|
-
} as any;
|
|
300
|
-
} catch (error: any) {
|
|
301
|
-
logger.error(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill failed`, { error: error.message, stack: error.stack });
|
|
302
|
-
return {
|
|
303
|
-
content: [{ type: "text", text: String(error.message) }],
|
|
304
|
-
details: { error: error.message }
|
|
305
|
-
} as any;
|
|
306
|
-
}
|
|
307
|
-
}
|
|
308
|
-
},
|
|
309
210
|
listS3ObjectsTool: {
|
|
310
211
|
name: 'list-s3-objects',
|
|
311
212
|
label: 'List S3 Objects',
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
---
|
|
1
2
|
name: contextlake-delete
|
|
2
3
|
description: |
|
|
3
4
|
Delete documents and assets from the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
|
|
@@ -34,3 +35,4 @@ parameters:
|
|
|
34
35
|
type: string
|
|
35
36
|
description: SQL-like filter string to identify documents to delete (e.g. "metadata.category = 'obsolete'").
|
|
36
37
|
required: false
|
|
38
|
+
---
|
|
@@ -1,40 +1,59 @@
|
|
|
1
|
+
---
|
|
1
2
|
name: contextlake-ingest
|
|
2
3
|
description: |
|
|
3
|
-
|
|
4
|
-
Use this
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
4
|
+
A guide for OpenClaw agents on how to perform data ingestion into ContextLake.
|
|
5
|
+
Use this skill to understand the correct workflow for profiling a dataset and then ingesting it.
|
|
6
|
+
|
|
7
|
+
When a user asks to "ingest data", "导入数据", "接入数据源", or "处理某个桶里的数据",
|
|
8
|
+
you MUST follow the two-step workflow outlined below using the specific tools provided.
|
|
9
|
+
|
|
10
|
+
IMPORTANT RULE: You are STRICTLY FORBIDDEN from using any tools outside of this project for ingestion.
|
|
11
|
+
Do not use custom Python scripts, external CLI tools, or unrelated plugins.
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
# ContextLake Data Ingestion Workflow Guide
|
|
15
|
+
|
|
16
|
+
To successfully ingest data into the ContextLake Knowledge Base, you **MUST** follow a strict two-step process.
|
|
17
|
+
|
|
18
|
+
## Step 1: Data Profiling
|
|
19
|
+
Before data can be ingested, the source MUST first be profiled to extract its schema, structure, and media metadata into a local LanceDB catalog.
|
|
20
|
+
|
|
21
|
+
**Tool to use:** OpenClaw Agent using `byted-las-data-profiler` skill to guide the profiling process.
|
|
22
|
+
|
|
23
|
+
### How it works:
|
|
24
|
+
- **Purpose:** Connects to an S3-compatible source (TOS, OSS, AWS S3, etc.) or local file system and catalogs the files.
|
|
25
|
+
- **Underlying tools used:** The agent will use `list-s3-objects`, `read-s3-object`, and `write-lance-catalog` to perform the profiling step by step.
|
|
26
|
+
|
|
27
|
+
*Wait for Step 1 to complete successfully before proceeding to Step 2.*
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Step 2: Data Ingestion
|
|
32
|
+
Once the data source is successfully profiled and the catalog is created, you can proceed to ingest the data into ContextLake.
|
|
33
|
+
|
|
34
|
+
**Tool to use:** OpenClaw Agent using basic tools guided by this skill.
|
|
35
|
+
|
|
36
|
+
### How it works:
|
|
37
|
+
- **Purpose:** Reads the LanceDB catalog created in Step 1, processes the multimodal files (text, images, audio, video, PDF) using LAS models, chunks the data, generates embeddings, and indexes them into the ContextLake Knowledge Base.
|
|
38
|
+
- **Underlying tools used:**
|
|
39
|
+
1. Use `read-lance-catalog` to read the catalog of files from `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`.
|
|
40
|
+
2. For each file, use appropriate LAS tools (like `las_pdf_parse_doubao`, `las_image_resample`, `las_long_video_understand`) to extract text and features.
|
|
41
|
+
3. Chunk and process the text.
|
|
42
|
+
4. Use the embedding tool or model to generate vectors.
|
|
43
|
+
5. Save the final chunks and vectors into the main ContextLake knowledge base.
|
|
44
|
+
|
|
45
|
+
*Note: You are acting as the ingestion pipeline. You must coordinate the reading of the catalog and the processing of each file type using the available LAS tools.*
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Auxiliary Tools (Use only when necessary)
|
|
50
|
+
If you need to verify the catalog contents between Step 1 and Step 2, or if ingestion fails and you need to debug:
|
|
51
|
+
|
|
52
|
+
- **`read-lance-catalog`**: Use this tool to read the records from the catalog database created in Step 1.
|
|
53
|
+
- Requires `db_path` (e.g., `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`) and `table_name` (usually "files").
|
|
54
|
+
- **`list-s3-objects` / `read-s3-object`**: Use these tools to manually inspect the raw source files if profiling fails.
|
|
55
|
+
- **`contextlake-list-datasource`**: Use this tool to see all data sources that have been connected/profiled.
|
|
56
|
+
|
|
57
|
+
## Error Handling
|
|
58
|
+
- If `contextlake-ingest` fails with an error saying the data source was not found, make sure the data profiling step completed successfully for that exact `datasource_name`.
|
|
59
|
+
- Always report the results of both steps to the user clearly.
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
---
|
|
1
2
|
name: contextlake-list
|
|
2
3
|
description: |
|
|
3
4
|
List documents and assets currently in the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
|
|
@@ -20,3 +21,4 @@ parameters:
|
|
|
20
21
|
type: integer
|
|
21
22
|
description: Maximum number of documents to return (default 100).
|
|
22
23
|
required: false
|
|
24
|
+
---
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
---
|
|
1
2
|
name: contextlake-retrieve
|
|
2
3
|
description: |
|
|
3
4
|
Search, query, and retrieve relevant information from the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
|
|
@@ -35,3 +36,4 @@ parameters:
|
|
|
35
36
|
type: boolean
|
|
36
37
|
description: Whether to include binary content
|
|
37
38
|
required: false
|
|
39
|
+
---
|
|
@@ -1,10 +1,14 @@
|
|
|
1
|
+
---
|
|
1
2
|
name: byted-las-data-profiler
|
|
2
3
|
description: |
|
|
3
|
-
Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them
|
|
4
|
-
|
|
4
|
+
Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them,
|
|
5
|
+
performs schema inference and column semantic analysis on structured data (JSONL/CSV/Parquet/JSON),
|
|
6
|
+
extracts key meta-information for media files (Image/Audio/Video/PDF) by reading only header bytes,
|
|
7
|
+
and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
|
|
5
8
|
|
|
6
9
|
IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
|
|
7
|
-
You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog
|
|
10
|
+
You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) to accomplish the profiling tasks.
|
|
11
|
+
---
|
|
8
12
|
|
|
9
13
|
## Trigger Scenarios
|
|
10
14
|
Be sure to use this Skill when the user mentions the following scenarios:
|
|
@@ -14,24 +18,28 @@ Be sure to use this Skill when the user mentions the following scenarios:
|
|
|
14
18
|
- Need to understand what a batch of data files contains and what their schema looks like
|
|
15
19
|
- Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
|
|
16
20
|
- Need to write the meta-information of object storage or local files into LanceDB
|
|
21
|
+
- Mentions TOS, boto3, or object storage data profiling
|
|
17
22
|
- Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
|
|
23
|
+
- Need to batch identify the type and size of remote/local files and build an index
|
|
24
|
+
- Need to quickly understand the structure of an unfamiliar dataset (what files it contains, what the schema looks like, and what the fields mean)
|
|
25
|
+
- Need to connect/dock a data source for profiling
|
|
26
|
+
- Mentions keywords like "connect a data source" or "dock a data source"
|
|
18
27
|
|
|
19
|
-
##
|
|
20
|
-
|
|
21
|
-
If you need to perform custom exploration, you can use `list-s3-objects` to traverse the bucket and `read-s3-object` to read file headers, and `write-lance-catalog` to save results.
|
|
28
|
+
## Overview
|
|
29
|
+
This Skill acts as a Dataset Profiling Guide. You should use the `list-s3-objects` tool to traverse the S3 bucket or local directory, use `read-s3-object` to read file contents or headers, parse the schema or media metadata, and finally use `write-lance-catalog` to save the catalog into a local LanceDB.
|
|
22
30
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
| vendor | volcengine / alibaba / tencent / aws / local | volcengine |
|
|
28
|
-
| endpoint | S3 Endpoint URL (not required for local) | https://tos-s3-cn-beijing.volces.com |
|
|
29
|
-
| access_key | AK | - |
|
|
30
|
-
| secret_key | SK | - |
|
|
31
|
-
| region | Region identifier | cn-beijing |
|
|
32
|
-
| bucket | Bucket name (root directory path when local) | my-data-bucket |
|
|
33
|
-
| prefix | Path prefix to limit the scan scope | datasets/2024/ |
|
|
31
|
+
1. **Cataloging**: Use `list-s3-objects` to record the meta-information (path, size, etc.) of files.
|
|
32
|
+
2. **Understanding Structured Data**: Use `read-s3-object` to sample JSONL / CSV / TSV / Parquet / JSON.
|
|
33
|
+
3. **Extracting Media Meta-information**: Use `read-s3-object` with `maxBytes` to read only the file header (without downloading the full file) for images, audio, video, and PDFs to extract key attributes.
|
|
34
|
+
4. **Writing to LanceDB**: Use `write-lance-catalog` to save the results.
|
|
34
35
|
|
|
35
36
|
## Output Location
|
|
36
37
|
- LanceDB table storage path: `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`
|
|
37
|
-
-
|
|
38
|
+
- Table names: `files`, `structured_schemas`, `media_metadata`
|
|
39
|
+
|
|
40
|
+
## Available Tools for this Skill
|
|
41
|
+
- `list-s3-objects`: To traverse and list files in the bucket/directory.
|
|
42
|
+
- `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction.
|
|
43
|
+
- `write-lance-catalog`: To write the profiling results to the LanceDB catalog.
|
|
44
|
+
|
|
45
|
+
Always report the final profiling summary back to the user once the `write-lance-catalog` completes successfully.
|