@byted-las/contextlake-openclaw 1.0.3 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +45 -23
- package/dist/src/commands/cli.d.ts +1 -1
- package/dist/src/commands/cli.js +10 -14
- package/dist/src/commands/index.js +11 -4
- package/dist/src/commands/slashcmd.js +4 -9
- package/dist/src/commands/tools.d.ts +5 -0
- package/dist/src/commands/tools.js +180 -10
- package/dist/src/lib/actions/lance-tools.d.ts +13 -0
- package/dist/src/lib/actions/lance-tools.js +73 -0
- package/dist/src/lib/actions/las-tools.js +58 -0
- package/dist/src/lib/actions/profiler.d.ts +4 -3
- package/dist/src/lib/actions/profiler.js +156 -141
- package/dist/src/lib/actions/s3-tools.d.ts +21 -0
- package/dist/src/lib/actions/s3-tools.js +221 -0
- package/dist/src/skills/SKILL.md +14 -151
- package/dist/src/skills/las-data-profiler/SKILL.md +14 -151
- package/dist/src/utils/config.js +5 -4
- package/dist/src/utils/credentials.d.ts +4 -0
- package/openclaw.plugin.json +1 -1
- package/package.json +3 -1
- package/src/commands/cli.ts +10 -14
- package/src/commands/index.ts +16 -4
- package/src/commands/slashcmd.ts +4 -10
- package/src/commands/tools.ts +177 -12
- package/src/lib/actions/lance-tools.ts +58 -0
- package/src/lib/actions/las-tools.ts +56 -0
- package/src/lib/actions/profiler.ts +148 -157
- package/src/lib/actions/s3-tools.ts +203 -0
- package/src/skills/las-data-profiler/SKILL.md +14 -151
- package/src/utils/config.ts +5 -4
- package/src/utils/credentials.ts +6 -0
- package/src/lib/scripts/s3_catalog.py +0 -617
package/README.md
CHANGED
|
@@ -1,27 +1,32 @@
|
|
|
1
1
|
# ContextLake OpenClaw Plugin
|
|
2
2
|
|
|
3
|
-
A powerful,
|
|
3
|
+
A powerful, multi-modal Knowledge Base / Knowledge Lake (知识库/知识湖) plugin for the OpenClaw Agent framework.
|
|
4
4
|
|
|
5
|
-
This plugin allows OpenClaw agents to natively understand, index, and retrieve
|
|
5
|
+
This plugin allows OpenClaw agents to natively understand, profile, index, and retrieve data from diverse data sources (Local, TOS, S3, etc.). It supports advanced multi-modal document ingestion (Text, PDF, Images, Audio, Video) powered by Large AI Services (LAS) and vector similarity search.
|
|
6
6
|
|
|
7
7
|
## Features
|
|
8
8
|
|
|
9
|
-
- **
|
|
10
|
-
- **
|
|
11
|
-
- **
|
|
12
|
-
- **
|
|
13
|
-
- **
|
|
14
|
-
|
|
15
|
-
- Uses `
|
|
16
|
-
-
|
|
9
|
+
- **Data Profiling**: Connect to heterogeneous data sources (TOS/OSS/COS/S3/Local) and auto-profile schemas and media metadata.
|
|
10
|
+
- **Multi-modal Ingestion**: Automatically extracts text, converts PDFs, parses audio (ASR), understands video, and embeds images directly using LAS multi-modal models.
|
|
11
|
+
- **Agentic Tools**: Exposes `contextlake-ingest`, `contextlake-retrieve`, `contextlake-list`, `contextlake-delete`, `las-data-profiler`, and `contextlake-list-datasource` skills directly to the LLM.
|
|
12
|
+
- **Slash Commands**: Quick chat actions (`/contextlake-search`, `/contextlake-ingest`, `/contextlake-profiler`) for fast human-in-the-loop operations.
|
|
13
|
+
- **CLI Management**: Full command-line interface for bulk operations and pipeline integration.
|
|
14
|
+
- **Pluggable Architecture**:
|
|
15
|
+
- Uses `LanceDB` for local embedded multi-modal vector storage.
|
|
16
|
+
- Pluggable storage providers (Local and TOS).
|
|
17
|
+
|
|
18
|
+
## Documentation
|
|
19
|
+
|
|
20
|
+
For a comprehensive guide on installation, initialization (`onboard`), connecting data sources (`connect`/`profiler`), ingestion, and searching, please refer to the [**USAGE.md**](./USAGE.md).
|
|
21
|
+
|
|
22
|
+
For detailed architecture and internal workflows, please refer to [AGENTS.md](./AGENTS.md).
|
|
17
23
|
|
|
18
24
|
## Installation
|
|
19
25
|
|
|
20
26
|
Assuming you are in an OpenClaw environment:
|
|
21
27
|
|
|
22
28
|
```bash
|
|
23
|
-
npm install
|
|
24
|
-
npm run build
|
|
29
|
+
npm install @byted-las/contextlake-openclaw
|
|
25
30
|
```
|
|
26
31
|
|
|
27
32
|
Then register the plugin in your OpenClaw configuration:
|
|
@@ -30,7 +35,7 @@ Then register the plugin in your OpenClaw configuration:
|
|
|
30
35
|
{
|
|
31
36
|
"plugins": [
|
|
32
37
|
{
|
|
33
|
-
"package": "contextlake-openclaw",
|
|
38
|
+
"package": "@byted-las/contextlake-openclaw",
|
|
34
39
|
"config": {
|
|
35
40
|
"metadata_storage": {
|
|
36
41
|
"type": "local",
|
|
@@ -46,19 +51,36 @@ Then register the plugin in your OpenClaw configuration:
|
|
|
46
51
|
}
|
|
47
52
|
```
|
|
48
53
|
|
|
54
|
+
## Quick Start (Slash Commands)
|
|
55
|
+
|
|
56
|
+
1. **Initialize Credentials** (Run in terminal):
|
|
57
|
+
```bash
|
|
58
|
+
openclaw contextlake onboard
|
|
59
|
+
```
|
|
60
|
+
2. **Profile a Data Source**:
|
|
61
|
+
```text
|
|
62
|
+
/contextlake-profiler my_data local /path/to/my/files .
|
|
63
|
+
```
|
|
64
|
+
3. **Ingest the Data Source** (Processes PDF, Audio, Images, Text):
|
|
65
|
+
```text
|
|
66
|
+
/contextlake-ingest my_data
|
|
67
|
+
```
|
|
68
|
+
4. **Search your Knowledge Base**:
|
|
69
|
+
```text
|
|
70
|
+
/contextlake-search How do I configure the API?
|
|
71
|
+
```
|
|
72
|
+
|
|
49
73
|
## Agent Tools (Skills)
|
|
50
74
|
|
|
51
75
|
The LLM agent automatically has access to the following tools:
|
|
52
76
|
|
|
53
|
-
1. **`
|
|
54
|
-
2. **`contextlake-
|
|
55
|
-
3. **`contextlake-
|
|
56
|
-
4. **`contextlake-
|
|
77
|
+
1. **`las-data-profiler`**: Connect to a data source and profile its structure.
|
|
78
|
+
2. **`contextlake-list-datasource`**: List all connected and profiled data sources.
|
|
79
|
+
3. **`contextlake-ingest`**: Process and ingest all profiled files into the knowledge base via multi-modal embeddings.
|
|
80
|
+
4. **`contextlake-retrieve`**: Performs vector search on the indexed documents based on a query.
|
|
81
|
+
5. **`contextlake-list`**: Lists all currently indexed documents.
|
|
82
|
+
6. **`contextlake-delete`**: Removes documents by ID or filter.
|
|
57
83
|
|
|
58
84
|
You can simply talk to the agent:
|
|
59
|
-
> "
|
|
60
|
-
> "在知识湖中检索关于架构设计的文档。"
|
|
61
|
-
|
|
62
|
-
## Development
|
|
63
|
-
|
|
64
|
-
For detailed architecture and internal workflows, please refer to [AGENTS.md](./AGENTS.md).
|
|
85
|
+
> "帮我把 `my_data` 数据源注入到知识库中。"
|
|
86
|
+
> "在知识湖中检索关于架构设计的文档。"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { ContextLakeConfig } from '../utils/config';
|
|
2
2
|
export declare function getCliCommands(pluginConfig: ContextLakeConfig, logger: any): {
|
|
3
|
-
connectAction: (datasource_name: string, options: any) => Promise<void>;
|
|
3
|
+
connectAction: (datasource_name: string, url: string, options: any) => Promise<void>;
|
|
4
4
|
ingestAction: (datasource_name: string) => Promise<void>;
|
|
5
5
|
searchAction: (query: any, options: any) => Promise<void>;
|
|
6
6
|
listAction: (options: any) => Promise<void>;
|
package/dist/src/commands/cli.js
CHANGED
|
@@ -34,28 +34,22 @@ function parseMetadata(metadata) {
|
|
|
34
34
|
}
|
|
35
35
|
function getCliCommands(pluginConfig, logger) {
|
|
36
36
|
return {
|
|
37
|
-
connectAction: async (datasource_name, options) => {
|
|
38
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] CLI connect started`, { datasource_name, options });
|
|
37
|
+
connectAction: async (datasource_name, url, options) => {
|
|
38
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] CLI connect started`, { datasource_name, url, options });
|
|
39
39
|
try {
|
|
40
40
|
const params = {
|
|
41
41
|
datasource_name,
|
|
42
|
-
|
|
42
|
+
url,
|
|
43
43
|
endpoint: options.endpoint,
|
|
44
44
|
access_key: options.ak,
|
|
45
45
|
secret_key: options.sk,
|
|
46
46
|
region: options.region,
|
|
47
|
-
bucket: options.bucket,
|
|
48
|
-
prefix: options.prefix,
|
|
49
47
|
sample_rows: parseInt(options.sampleRows),
|
|
50
48
|
};
|
|
51
49
|
// eslint-disable-next-line no-console
|
|
52
50
|
console.log(`[contextlake connect] Connecting to datasource "${datasource_name}"...`);
|
|
53
51
|
// eslint-disable-next-line no-console
|
|
54
|
-
console.log(`
|
|
55
|
-
// eslint-disable-next-line no-console
|
|
56
|
-
console.log(` bucket: ${params.bucket}`);
|
|
57
|
-
// eslint-disable-next-line no-console
|
|
58
|
-
console.log(` prefix: ${params.prefix}`);
|
|
52
|
+
console.log(` url: ${params.url}`);
|
|
59
53
|
const result = await (0, profiler_1.connectDataSource)(params);
|
|
60
54
|
// eslint-disable-next-line no-console
|
|
61
55
|
console.log(JSON.stringify(result, null, 2));
|
|
@@ -141,12 +135,14 @@ function getCliCommands(pluginConfig, logger) {
|
|
|
141
135
|
// eslint-disable-next-line no-console
|
|
142
136
|
console.log('Please provide your credentials below. Press enter to keep the current value.');
|
|
143
137
|
const lasApiKey = await (0, credentials_1.promptForInput)('LAS_API_KEY', currentCreds.LAS_API_KEY);
|
|
144
|
-
const
|
|
145
|
-
const
|
|
138
|
+
const accessKey = await (0, credentials_1.promptForInput)('ACCESS_KEY', currentCreds.ACCESS_KEY || currentCreds.VOLCENGINE_ACCESS_KEY);
|
|
139
|
+
const secretKey = await (0, credentials_1.promptForInput)('SECRET_KEY', currentCreds.SECRET_KEY || currentCreds.VOLCENGINE_SECRET_KEY);
|
|
140
|
+
const region = await (0, credentials_1.promptForInput)('REGION', currentCreds.REGION || currentCreds.VOLCENGINE_REGION || 'cn-beijing');
|
|
146
141
|
const newCreds = {
|
|
147
142
|
LAS_API_KEY: lasApiKey,
|
|
148
|
-
|
|
149
|
-
|
|
143
|
+
ACCESS_KEY: accessKey,
|
|
144
|
+
SECRET_KEY: secretKey,
|
|
145
|
+
REGION: region
|
|
150
146
|
};
|
|
151
147
|
(0, credentials_1.saveCredentials)(newCreds);
|
|
152
148
|
// eslint-disable-next-line no-console
|
|
@@ -20,6 +20,16 @@ function registerAll(ctx, logger) {
|
|
|
20
20
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.deleteTool.name}`);
|
|
21
21
|
ctx.registerTool(tools.lasDataProfilerTool);
|
|
22
22
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.lasDataProfilerTool.name}`);
|
|
23
|
+
ctx.registerTool(tools.listS3ObjectsTool);
|
|
24
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listS3ObjectsTool.name}`);
|
|
25
|
+
ctx.registerTool(tools.readS3ObjectTool);
|
|
26
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.readS3ObjectTool.name}`);
|
|
27
|
+
ctx.registerTool(tools.writeLanceCatalogTool);
|
|
28
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.writeLanceCatalogTool.name}`);
|
|
29
|
+
ctx.registerTool(tools.readLanceCatalogTool);
|
|
30
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.readLanceCatalogTool.name}`);
|
|
31
|
+
ctx.registerTool(tools.generatePresignedUrlTool);
|
|
32
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.generatePresignedUrlTool.name}`);
|
|
23
33
|
ctx.registerTool(tools.listDatasourceTool);
|
|
24
34
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listDatasourceTool.name}`);
|
|
25
35
|
for (const lasTool of tools.lasTools) {
|
|
@@ -40,15 +50,12 @@ function registerAll(ctx, logger) {
|
|
|
40
50
|
.description('Manage ContextLake knowledge base');
|
|
41
51
|
const commands = (0, cli_1.getCliCommands)(pluginConfig, logger);
|
|
42
52
|
// connect -- data source profiling (las-data-profiler)
|
|
43
|
-
contextlake.command('connect <datasource_name>')
|
|
53
|
+
contextlake.command('connect <datasource_name> <url>')
|
|
44
54
|
.description('Connect to a data source and profile its structure, schemas, and media metadata into LanceDB')
|
|
45
|
-
.requiredOption('--vendor <vendor>', 'Data source type: volcengine | alibaba | tencent | aws | local')
|
|
46
55
|
.option('--endpoint <url>', 'S3 Endpoint URL (not needed for local)')
|
|
47
56
|
.option('--ak <credential_id>', 'Credential ID for the data source')
|
|
48
57
|
.option('--sk <credential_value>', 'Credential value for the data source')
|
|
49
58
|
.option('--region <region>', 'Region identifier (e.g. cn-beijing)')
|
|
50
|
-
.requiredOption('--bucket <bucket>', 'Bucket name (or local root directory for local vendor)')
|
|
51
|
-
.requiredOption('--prefix <prefix>', 'Path prefix to limit scan scope')
|
|
52
59
|
.option('--sample-rows <number>', 'Number of rows to sample per structured file', '100')
|
|
53
60
|
.action(commands.connectAction);
|
|
54
61
|
// Ingest
|
|
@@ -133,18 +133,13 @@ function getSlashCommands(pluginConfig, logger) {
|
|
|
133
133
|
const args = rawArgs.split(' ').filter((arg) => arg.trim() !== '');
|
|
134
134
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler started`, { args });
|
|
135
135
|
try {
|
|
136
|
-
if (args.length <
|
|
137
|
-
return { text: `**Error:** Missing arguments. Usage: /contextlake-profiler <datasource_name> <
|
|
138
|
-
}
|
|
139
|
-
const [datasource_name, vendor, bucket, prefix, endpoint, access_key, secret_key, region] = args;
|
|
140
|
-
if (!['volcengine', 'alibaba', 'tencent', 'aws', 'local'].includes(vendor)) {
|
|
141
|
-
return { text: `**Error:** Invalid vendor. Must be one of: volcengine, alibaba, tencent, aws, local` };
|
|
136
|
+
if (args.length < 2) {
|
|
137
|
+
return { text: `**Error:** Missing arguments. Usage: /contextlake-profiler <datasource_name> <url> [endpoint] [ak] [sk] [region]` };
|
|
142
138
|
}
|
|
139
|
+
const [datasource_name, url, endpoint, access_key, secret_key, region] = args;
|
|
143
140
|
const params = {
|
|
144
141
|
datasource_name,
|
|
145
|
-
|
|
146
|
-
bucket,
|
|
147
|
-
prefix,
|
|
142
|
+
url,
|
|
148
143
|
endpoint,
|
|
149
144
|
access_key,
|
|
150
145
|
secret_key,
|
|
@@ -7,5 +7,10 @@ export declare function getAgentTools(pluginConfig: ContextLakeConfig, logger: a
|
|
|
7
7
|
deleteTool: AnyAgentTool;
|
|
8
8
|
lasDataProfilerTool: AnyAgentTool;
|
|
9
9
|
listDatasourceTool: AnyAgentTool;
|
|
10
|
+
listS3ObjectsTool: AnyAgentTool;
|
|
11
|
+
readS3ObjectTool: AnyAgentTool;
|
|
12
|
+
generatePresignedUrlTool: AnyAgentTool;
|
|
13
|
+
writeLanceCatalogTool: AnyAgentTool;
|
|
14
|
+
readLanceCatalogTool: AnyAgentTool;
|
|
10
15
|
lasTools: AnyAgentTool[];
|
|
11
16
|
};
|
|
@@ -6,6 +6,8 @@ const retrieve_1 = require("../lib/actions/retrieve");
|
|
|
6
6
|
const manage_1 = require("../lib/actions/manage");
|
|
7
7
|
const profiler_1 = require("../lib/actions/profiler");
|
|
8
8
|
const las_tools_1 = require("../lib/actions/las-tools");
|
|
9
|
+
const s3_tools_1 = require("../lib/actions/s3-tools");
|
|
10
|
+
const lance_tools_1 = require("../lib/actions/lance-tools");
|
|
9
11
|
function getAgentTools(pluginConfig, logger) {
|
|
10
12
|
const lasTools = (0, las_tools_1.getLasTools)(pluginConfig, logger);
|
|
11
13
|
return {
|
|
@@ -251,16 +253,10 @@ Example User Queries:
|
|
|
251
253
|
type: 'object',
|
|
252
254
|
properties: {
|
|
253
255
|
datasource_name: { type: 'string', description: 'Name of the data source' },
|
|
254
|
-
|
|
255
|
-
endpoint: { type: 'string', description: 'S3 Endpoint URL (not needed for local)' },
|
|
256
|
-
access_key: { type: 'string', description: 'Credential ID for the data source' },
|
|
257
|
-
secret_key: { type: 'string', description: 'Credential value for the data source' },
|
|
258
|
-
region: { type: 'string', description: 'Region identifier (e.g. cn-beijing)' },
|
|
259
|
-
bucket: { type: 'string', description: 'Bucket name (or local root directory for local vendor)' },
|
|
260
|
-
prefix: { type: 'string', description: 'Path prefix to limit scan scope' },
|
|
256
|
+
url: { type: 'string', description: 'Data source URL (e.g. tos://bucket/prefix, oss://..., s3://..., file:///path)' },
|
|
261
257
|
sample_rows: { type: 'integer', description: 'Number of rows to sample per structured file' }
|
|
262
258
|
},
|
|
263
|
-
required: ['datasource_name', '
|
|
259
|
+
required: ['datasource_name', 'url'],
|
|
264
260
|
additionalProperties: false
|
|
265
261
|
},
|
|
266
262
|
async execute(toolCallId, params) {
|
|
@@ -281,11 +277,185 @@ Example User Queries:
|
|
|
281
277
|
logger.error(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill failed`, { error: error.message, stack: error.stack });
|
|
282
278
|
return {
|
|
283
279
|
content: [{ type: "text", text: String(error.message) }],
|
|
284
|
-
details: { error: error.message
|
|
285
|
-
}
|
|
280
|
+
details: { error: error.message }
|
|
286
281
|
};
|
|
287
282
|
}
|
|
288
283
|
}
|
|
284
|
+
},
|
|
285
|
+
listS3ObjectsTool: {
|
|
286
|
+
name: 'list-s3-objects',
|
|
287
|
+
label: 'List S3 Objects',
|
|
288
|
+
description: 'List objects in an S3-compatible bucket or local directory',
|
|
289
|
+
parameters: {
|
|
290
|
+
type: 'object',
|
|
291
|
+
properties: {
|
|
292
|
+
url: { type: 'string', description: 'Data source URL (e.g. tos://bucket/prefix, oss://..., file:///path)' },
|
|
293
|
+
vendor: { type: 'string', enum: ['volcengine', 'alibaba', 'tencent', 'aws', 'local'], description: 'Required if url is not provided' },
|
|
294
|
+
bucket: { type: 'string', description: 'Required if url is not provided' },
|
|
295
|
+
prefix: { type: 'string' },
|
|
296
|
+
endpoint: { type: 'string' },
|
|
297
|
+
maxKeys: { type: 'integer' },
|
|
298
|
+
continuationToken: { type: 'string' }
|
|
299
|
+
},
|
|
300
|
+
required: [],
|
|
301
|
+
additionalProperties: false
|
|
302
|
+
},
|
|
303
|
+
async execute(toolCallId, params) {
|
|
304
|
+
let actualParams = params.params || params;
|
|
305
|
+
try {
|
|
306
|
+
const result = await (0, s3_tools_1.listS3Objects)(actualParams, actualParams.prefix || '', actualParams.maxKeys, actualParams.continuationToken);
|
|
307
|
+
return { content: [{ type: "text", text: JSON.stringify(result) }], details: result };
|
|
308
|
+
}
|
|
309
|
+
catch (e) {
|
|
310
|
+
return { content: [{ type: "text", text: String(e.message) }], details: { error: e.message } };
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
},
|
|
314
|
+
readS3ObjectTool: {
|
|
315
|
+
name: 'read-s3-object',
|
|
316
|
+
label: 'Read S3 Object',
|
|
317
|
+
description: 'Read the contents or headers of an S3 object',
|
|
318
|
+
parameters: {
|
|
319
|
+
type: 'object',
|
|
320
|
+
properties: {
|
|
321
|
+
url: { type: 'string', description: 'Full URL to the object (e.g. tos://bucket/path/to/key.txt)' },
|
|
322
|
+
vendor: { type: 'string', enum: ['volcengine', 'alibaba', 'tencent', 'aws', 'local'], description: 'Required if url is not provided' },
|
|
323
|
+
bucket: { type: 'string', description: 'Required if url is not provided' },
|
|
324
|
+
key: { type: 'string', description: 'Required if url is not provided' },
|
|
325
|
+
endpoint: { type: 'string' },
|
|
326
|
+
maxBytes: { type: 'integer' }
|
|
327
|
+
},
|
|
328
|
+
required: [],
|
|
329
|
+
additionalProperties: false
|
|
330
|
+
},
|
|
331
|
+
async execute(toolCallId, params) {
|
|
332
|
+
let actualParams = params.params || params;
|
|
333
|
+
try {
|
|
334
|
+
// Extract key from url if provided
|
|
335
|
+
let key = actualParams.key;
|
|
336
|
+
if (actualParams.url && !key) {
|
|
337
|
+
try {
|
|
338
|
+
if (actualParams.url.startsWith('file://')) {
|
|
339
|
+
// Key is not strictly needed for file://, bucket contains the path in parseS3Url
|
|
340
|
+
key = '';
|
|
341
|
+
}
|
|
342
|
+
else {
|
|
343
|
+
const parsedUrl = new URL(actualParams.url);
|
|
344
|
+
key = parsedUrl.pathname.replace(/^\//, '');
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
catch (e) {
|
|
348
|
+
// let it fail in readS3Object
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
if (!key && !actualParams.url?.startsWith('file://')) {
|
|
352
|
+
throw new Error('key is required or must be part of the url');
|
|
353
|
+
}
|
|
354
|
+
const buf = await (0, s3_tools_1.readS3Object)(actualParams, key, actualParams.maxBytes);
|
|
355
|
+
// Return as base64 string
|
|
356
|
+
return { content: [{ type: "text", text: buf.toString('base64') }], details: { length: buf.length } };
|
|
357
|
+
}
|
|
358
|
+
catch (e) {
|
|
359
|
+
return { content: [{ type: "text", text: String(e.message) }], details: { error: e.message } };
|
|
360
|
+
}
|
|
361
|
+
}
|
|
362
|
+
},
|
|
363
|
+
generatePresignedUrlTool: {
|
|
364
|
+
name: 'generate-presigned-url',
|
|
365
|
+
label: 'Generate Presigned URL',
|
|
366
|
+
description: 'Generate a presigned HTTP URL for an S3/TOS object, allowing temporary public access',
|
|
367
|
+
parameters: {
|
|
368
|
+
type: 'object',
|
|
369
|
+
properties: {
|
|
370
|
+
url: { type: 'string', description: 'Full URL to the object (e.g. tos://bucket/path/to/key.txt)' },
|
|
371
|
+
vendor: { type: 'string', enum: ['volcengine', 'alibaba', 'tencent', 'aws', 'local'], description: 'Required if url is not provided' },
|
|
372
|
+
bucket: { type: 'string', description: 'Required if url is not provided' },
|
|
373
|
+
key: { type: 'string', description: 'Required if url is not provided' },
|
|
374
|
+
endpoint: { type: 'string' },
|
|
375
|
+
expiresIn: { type: 'integer', description: 'Expiration time in seconds (default 3600)' }
|
|
376
|
+
},
|
|
377
|
+
required: [],
|
|
378
|
+
additionalProperties: false
|
|
379
|
+
},
|
|
380
|
+
async execute(toolCallId, params) {
|
|
381
|
+
let actualParams = params.params || params;
|
|
382
|
+
try {
|
|
383
|
+
let key = actualParams.key;
|
|
384
|
+
if (actualParams.url && !key) {
|
|
385
|
+
try {
|
|
386
|
+
if (actualParams.url.startsWith('file://')) {
|
|
387
|
+
key = '';
|
|
388
|
+
}
|
|
389
|
+
else {
|
|
390
|
+
const parsedUrl = new URL(actualParams.url);
|
|
391
|
+
key = parsedUrl.pathname.replace(/^\//, '');
|
|
392
|
+
}
|
|
393
|
+
}
|
|
394
|
+
catch (e) {
|
|
395
|
+
// let it fail in getPresignedUrl
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
if (!key && !actualParams.url?.startsWith('file://')) {
|
|
399
|
+
throw new Error('key is required or must be part of the url');
|
|
400
|
+
}
|
|
401
|
+
const url = await (0, s3_tools_1.getPresignedUrl)(actualParams, key, actualParams.expiresIn);
|
|
402
|
+
return { content: [{ type: "text", text: url }], details: { url } };
|
|
403
|
+
}
|
|
404
|
+
catch (e) {
|
|
405
|
+
return { content: [{ type: "text", text: String(e.message) }], details: { error: e.message } };
|
|
406
|
+
}
|
|
407
|
+
}
|
|
408
|
+
},
|
|
409
|
+
writeLanceCatalogTool: {
|
|
410
|
+
name: 'write-lance-catalog',
|
|
411
|
+
label: 'Write LanceDB Catalog',
|
|
412
|
+
description: 'Write an array of file records into a local LanceDB table',
|
|
413
|
+
parameters: {
|
|
414
|
+
type: 'object',
|
|
415
|
+
properties: {
|
|
416
|
+
db_path: { type: 'string' },
|
|
417
|
+
table_name: { type: 'string' },
|
|
418
|
+
records: { type: 'array', items: { type: 'object' } }
|
|
419
|
+
},
|
|
420
|
+
required: ['db_path', 'table_name', 'records'],
|
|
421
|
+
additionalProperties: false
|
|
422
|
+
},
|
|
423
|
+
async execute(toolCallId, params) {
|
|
424
|
+
let actualParams = params.params || params;
|
|
425
|
+
try {
|
|
426
|
+
await (0, lance_tools_1.writeLanceCatalog)(actualParams);
|
|
427
|
+
return { content: [{ type: "text", text: "Successfully wrote records to LanceDB" }], details: { count: actualParams.records.length } };
|
|
428
|
+
}
|
|
429
|
+
catch (e) {
|
|
430
|
+
return { content: [{ type: "text", text: String(e.message) }], details: { error: e.message } };
|
|
431
|
+
}
|
|
432
|
+
}
|
|
433
|
+
},
|
|
434
|
+
readLanceCatalogTool: {
|
|
435
|
+
name: 'read-lance-catalog',
|
|
436
|
+
label: 'Read LanceDB Catalog',
|
|
437
|
+
description: 'Read records from a local LanceDB table for validation or ingestion processes',
|
|
438
|
+
parameters: {
|
|
439
|
+
type: 'object',
|
|
440
|
+
properties: {
|
|
441
|
+
db_path: { type: 'string', description: 'Path to the local LanceDB database' },
|
|
442
|
+
table_name: { type: 'string', description: 'Name of the table to read' },
|
|
443
|
+
limit: { type: 'integer', description: 'Maximum number of records to return' },
|
|
444
|
+
filter: { type: 'string', description: 'SQL-like filter string (e.g., "category = \'structured\'")' }
|
|
445
|
+
},
|
|
446
|
+
required: ['db_path', 'table_name'],
|
|
447
|
+
additionalProperties: false
|
|
448
|
+
},
|
|
449
|
+
async execute(toolCallId, params) {
|
|
450
|
+
let actualParams = params.params || params;
|
|
451
|
+
try {
|
|
452
|
+
const results = await (0, lance_tools_1.readLanceCatalog)(actualParams);
|
|
453
|
+
return { content: [{ type: "text", text: JSON.stringify(results) }], details: { count: results.length, data: results } };
|
|
454
|
+
}
|
|
455
|
+
catch (e) {
|
|
456
|
+
return { content: [{ type: "text", text: String(e.message) }], details: { error: e.message } };
|
|
457
|
+
}
|
|
458
|
+
}
|
|
289
459
|
}
|
|
290
460
|
};
|
|
291
461
|
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
export interface LanceWriteParams {
|
|
2
|
+
db_path: string;
|
|
3
|
+
table_name: string;
|
|
4
|
+
records: any[];
|
|
5
|
+
}
|
|
6
|
+
export interface LanceReadParams {
|
|
7
|
+
db_path: string;
|
|
8
|
+
table_name: string;
|
|
9
|
+
limit?: number;
|
|
10
|
+
filter?: string;
|
|
11
|
+
}
|
|
12
|
+
export declare function writeLanceCatalog(params: LanceWriteParams): Promise<void>;
|
|
13
|
+
export declare function readLanceCatalog(params: LanceReadParams): Promise<any[]>;
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.writeLanceCatalog = writeLanceCatalog;
|
|
37
|
+
exports.readLanceCatalog = readLanceCatalog;
|
|
38
|
+
const lancedb = __importStar(require("@lancedb/lancedb"));
|
|
39
|
+
const fs = __importStar(require("fs"));
|
|
40
|
+
/**
 * Write catalog records into a LanceDB table: appends when the table already
 * exists, otherwise creates it from the records.
 *
 * @param {{db_path: string, table_name: string, records: object[]}} params
 *   `db_path` is the database location passed to `lancedb.connect`,
 *   `table_name` the target table, `records` the rows to write.
 * @returns {Promise<void>} Resolves once the rows are written; resolves
 *   immediately without opening the database when `records` is missing or empty.
 * @throws {TypeError} If `records` is a primitive, or `db_path` / `table_name`
 *   is not a non-empty string.
 */
async function writeLanceCatalog(params) {
    const { db_path: dbPath, table_name: tableName, records } = params;
    // Nothing to write: return before opening the database.
    if (!records || records.length === 0) {
        return;
    }
    // Reject primitives (e.g. a JSON string that was never parsed) here, with a
    // clear message, rather than letting them reach the native binding.
    if (typeof records !== 'object') {
        throw new TypeError(`records must be an array of row objects, got ${typeof records}`);
    }
    if (typeof dbPath !== 'string' || dbPath === '') {
        throw new TypeError('db_path must be a non-empty string');
    }
    if (typeof tableName !== 'string' || tableName === '') {
        throw new TypeError('table_name must be a non-empty string');
    }
    const db = await lancedb.connect(dbPath);
    const existingTables = await db.tableNames();
    if (existingTables.includes(tableName)) {
        const table = await db.openTable(tableName);
        await table.add(records);
        return;
    }
    await db.createTable(tableName, records);
}
|
|
54
|
+
/**
 * Read rows from a LanceDB table, optionally filtered and limited.
 *
 * @param {{db_path: string, table_name: string, limit?: number, filter?: string}} params
 *   `db_path` must exist on disk; `filter` is forwarded verbatim to the
 *   query's `where` clause; `limit` caps the row count when it is positive.
 * @returns {Promise<any[]>} The matching rows.
 * @throws {Error} If the database path or the table does not exist.
 * @throws {TypeError} If `filter` is provided but is not a string.
 */
async function readLanceCatalog(params) {
    const { db_path: dbPath, table_name: tableName, filter, limit } = params;
    if (!fs.existsSync(dbPath)) {
        throw new Error(`Database not found at ${dbPath}`);
    }
    if (filter != null && typeof filter !== 'string') {
        throw new TypeError(`filter must be a string, got ${typeof filter}`);
    }
    // Tool callers sometimes send the limit as a numeric string ("10"). Coerce it so
    // the query receives a real integer; anything non-numeric or non-positive means
    // "no limit", as before.
    const numericLimit = Number(limit);
    const rowLimit = Number.isFinite(numericLimit) && numericLimit > 0
        ? Math.max(1, Math.floor(numericLimit))
        : 0;
    const db = await lancedb.connect(dbPath);
    const existingTables = await db.tableNames();
    if (!existingTables.includes(tableName)) {
        throw new Error(`Table '${tableName}' not found in database`);
    }
    const table = await db.openTable(tableName);
    let query = table.query();
    if (filter) {
        // NOTE(review): `filter` is a raw SQL-like predicate supplied by the caller and
        // is passed through unescaped; only expose this to trusted callers.
        query = query.where(filter);
    }
    if (rowLimit > 0) {
        query = query.limit(rowLimit);
    }
    return query.toArray();
}
|