@byted-las/contextlake-openclaw 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +45 -23
- package/dist/src/commands/cli.js +6 -4
- package/dist/src/commands/index.js +6 -0
- package/dist/src/commands/tools.d.ts +3 -0
- package/dist/src/commands/tools.js +90 -2
- package/dist/src/lib/actions/lance-tools.d.ts +6 -0
- package/dist/src/lib/actions/lance-tools.js +51 -0
- package/dist/src/lib/actions/profiler.js +119 -140
- package/dist/src/lib/actions/s3-tools.d.ts +18 -0
- package/dist/src/lib/actions/s3-tools.js +167 -0
- package/dist/src/skills/SKILL.md +14 -151
- package/dist/src/skills/las-data-profiler/SKILL.md +14 -151
- package/dist/src/utils/config.js +5 -4
- package/dist/src/utils/credentials.d.ts +4 -0
- package/openclaw.plugin.json +1 -1
- package/package.json +2 -1
- package/src/commands/cli.ts +6 -4
- package/src/commands/index.ts +9 -0
- package/src/commands/tools.ts +91 -4
- package/src/lib/actions/lance-tools.ts +23 -0
- package/src/lib/actions/profiler.ts +116 -160
- package/src/lib/actions/s3-tools.ts +148 -0
- package/src/skills/las-data-profiler/SKILL.md +14 -151
- package/src/utils/config.ts +5 -4
- package/src/utils/credentials.ts +6 -0
- package/src/lib/scripts/s3_catalog.py +0 -617
package/README.md
CHANGED
|
@@ -1,27 +1,32 @@
|
|
|
1
1
|
# ContextLake OpenClaw Plugin
|
|
2
2
|
|
|
3
|
-
A powerful,
|
|
3
|
+
A powerful, multi-modal Knowledge Base / Knowledge Lake (知识库/知识湖) plugin for the OpenClaw Agent framework.
|
|
4
4
|
|
|
5
|
-
This plugin allows OpenClaw agents to natively understand, index, and retrieve
|
|
5
|
+
This plugin allows OpenClaw agents to natively understand, profile, index, and retrieve data from diverse data sources (Local, TOS, S3, etc.). It supports advanced multi-modal document ingestion (Text, PDF, Images, Audio, Video) powered by Large AI Services (LAS) and vector similarity search.
|
|
6
6
|
|
|
7
7
|
## Features
|
|
8
8
|
|
|
9
|
-
- **
|
|
10
|
-
- **
|
|
11
|
-
- **
|
|
12
|
-
- **
|
|
13
|
-
- **
|
|
14
|
-
|
|
15
|
-
- Uses `
|
|
16
|
-
-
|
|
9
|
+
- **Data Profiling**: Connect to heterogeneous data sources (TOS/OSS/COS/S3/Local) and auto-profile schemas and media metadata.
|
|
10
|
+
- **Multi-modal Ingestion**: Automatically extracts text, converts PDFs, parses audio (ASR), understands video, and embeds images directly using LAS multi-modal models.
|
|
11
|
+
- **Agentic Tools**: Exposes `contextlake-ingest`, `contextlake-retrieve`, `contextlake-list`, `contextlake-delete`, `las-data-profiler`, and `contextlake-list-datasource` skills directly to the LLM.
|
|
12
|
+
- **Slash Commands**: Quick chat actions (`/contextlake-search`, `/contextlake-ingest`, `/contextlake-profiler`) for fast human-in-the-loop operations.
|
|
13
|
+
- **CLI Management**: Full command-line interface for bulk operations and pipeline integration.
|
|
14
|
+
- **Pluggable Architecture**:
|
|
15
|
+
- Uses `LanceDB` for local embedded multi-modal vector storage.
|
|
16
|
+
- Pluggable storage providers (Local and TOS).
|
|
17
|
+
|
|
18
|
+
## Documentation
|
|
19
|
+
|
|
20
|
+
For a comprehensive guide on installation, initialization (`onboard`), connecting data sources (`connect`/`profiler`), ingestion, and searching, please refer to the [**USAGE.md**](./USAGE.md).
|
|
21
|
+
|
|
22
|
+
For detailed architecture and internal workflows, please refer to [AGENTS.md](./AGENTS.md).
|
|
17
23
|
|
|
18
24
|
## Installation
|
|
19
25
|
|
|
20
26
|
Assuming you are in an OpenClaw environment:
|
|
21
27
|
|
|
22
28
|
```bash
|
|
23
|
-
npm install
|
|
24
|
-
npm run build
|
|
29
|
+
npm install @byted-las/contextlake-openclaw
|
|
25
30
|
```
|
|
26
31
|
|
|
27
32
|
Then register the plugin in your OpenClaw configuration:
|
|
@@ -30,7 +35,7 @@ Then register the plugin in your OpenClaw configuration:
|
|
|
30
35
|
{
|
|
31
36
|
"plugins": [
|
|
32
37
|
{
|
|
33
|
-
"package": "contextlake-openclaw",
|
|
38
|
+
"package": "@byted-las/contextlake-openclaw",
|
|
34
39
|
"config": {
|
|
35
40
|
"metadata_storage": {
|
|
36
41
|
"type": "local",
|
|
@@ -46,19 +51,36 @@ Then register the plugin in your OpenClaw configuration:
|
|
|
46
51
|
}
|
|
47
52
|
```
|
|
48
53
|
|
|
54
|
+
## Quick Start (Slash Commands)
|
|
55
|
+
|
|
56
|
+
1. **Initialize Credentials** (Run in terminal):
|
|
57
|
+
```bash
|
|
58
|
+
openclaw contextlake onboard
|
|
59
|
+
```
|
|
60
|
+
2. **Profile a Data Source**:
|
|
61
|
+
```text
|
|
62
|
+
/contextlake-profiler my_data local /path/to/my/files .
|
|
63
|
+
```
|
|
64
|
+
3. **Ingest the Data Source** (Processes PDF, Audio, Images, Text):
|
|
65
|
+
```text
|
|
66
|
+
/contextlake-ingest my_data
|
|
67
|
+
```
|
|
68
|
+
4. **Search your Knowledge Base**:
|
|
69
|
+
```text
|
|
70
|
+
/contextlake-search How do I configure the API?
|
|
71
|
+
```
|
|
72
|
+
|
|
49
73
|
## Agent Tools (Skills)
|
|
50
74
|
|
|
51
75
|
The LLM agent automatically has access to the following tools:
|
|
52
76
|
|
|
53
|
-
1. **`
|
|
54
|
-
2. **`contextlake-
|
|
55
|
-
3. **`contextlake-
|
|
56
|
-
4. **`contextlake-
|
|
77
|
+
1. **`las-data-profiler`**: Connect to a data source and profile its structure.
|
|
78
|
+
2. **`contextlake-list-datasource`**: List all connected and profiled data sources.
|
|
79
|
+
3. **`contextlake-ingest`**: Process and ingest all profiled files into the knowledge base via multi-modal embeddings.
|
|
80
|
+
4. **`contextlake-retrieve`**: Performs vector search on the indexed documents based on a query.
|
|
81
|
+
5. **`contextlake-list`**: Lists all currently indexed documents.
|
|
82
|
+
6. **`contextlake-delete`**: Removes documents by ID or filter.
|
|
57
83
|
|
|
58
84
|
You can simply talk to the agent:
|
|
59
|
-
> "
|
|
60
|
-
> "在知识湖中检索关于架构设计的文档。"
|
|
61
|
-
|
|
62
|
-
## Development
|
|
63
|
-
|
|
64
|
-
For detailed architecture and internal workflows, please refer to [AGENTS.md](./AGENTS.md).
|
|
85
|
+
> "帮我把 `my_data` 数据源注入到知识库中。"
|
|
86
|
+
> "在知识湖中检索关于架构设计的文档。"
|
package/dist/src/commands/cli.js
CHANGED
|
@@ -141,12 +141,14 @@ function getCliCommands(pluginConfig, logger) {
|
|
|
141
141
|
// eslint-disable-next-line no-console
|
|
142
142
|
console.log('Please provide your credentials below. Press enter to keep the current value.');
|
|
143
143
|
const lasApiKey = await (0, credentials_1.promptForInput)('LAS_API_KEY', currentCreds.LAS_API_KEY);
|
|
144
|
-
const
|
|
145
|
-
const
|
|
144
|
+
const accessKey = await (0, credentials_1.promptForInput)('ACCESS_KEY', currentCreds.ACCESS_KEY || currentCreds.VOLCENGINE_ACCESS_KEY);
|
|
145
|
+
const secretKey = await (0, credentials_1.promptForInput)('SECRET_KEY', currentCreds.SECRET_KEY || currentCreds.VOLCENGINE_SECRET_KEY);
|
|
146
|
+
const region = await (0, credentials_1.promptForInput)('REGION', currentCreds.REGION || currentCreds.VOLCENGINE_REGION || 'cn-beijing');
|
|
146
147
|
const newCreds = {
|
|
147
148
|
LAS_API_KEY: lasApiKey,
|
|
148
|
-
|
|
149
|
-
|
|
149
|
+
ACCESS_KEY: accessKey,
|
|
150
|
+
SECRET_KEY: secretKey,
|
|
151
|
+
REGION: region
|
|
150
152
|
};
|
|
151
153
|
(0, credentials_1.saveCredentials)(newCreds);
|
|
152
154
|
// eslint-disable-next-line no-console
|
|
@@ -20,6 +20,12 @@ function registerAll(ctx, logger) {
|
|
|
20
20
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.deleteTool.name}`);
|
|
21
21
|
ctx.registerTool(tools.lasDataProfilerTool);
|
|
22
22
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.lasDataProfilerTool.name}`);
|
|
23
|
+
ctx.registerTool(tools.listS3ObjectsTool);
|
|
24
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listS3ObjectsTool.name}`);
|
|
25
|
+
ctx.registerTool(tools.readS3ObjectTool);
|
|
26
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.readS3ObjectTool.name}`);
|
|
27
|
+
ctx.registerTool(tools.writeLanceCatalogTool);
|
|
28
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.writeLanceCatalogTool.name}`);
|
|
23
29
|
ctx.registerTool(tools.listDatasourceTool);
|
|
24
30
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listDatasourceTool.name}`);
|
|
25
31
|
for (const lasTool of tools.lasTools) {
|
|
@@ -7,5 +7,8 @@ export declare function getAgentTools(pluginConfig: ContextLakeConfig, logger: a
|
|
|
7
7
|
deleteTool: AnyAgentTool;
|
|
8
8
|
lasDataProfilerTool: AnyAgentTool;
|
|
9
9
|
listDatasourceTool: AnyAgentTool;
|
|
10
|
+
listS3ObjectsTool: AnyAgentTool;
|
|
11
|
+
readS3ObjectTool: AnyAgentTool;
|
|
12
|
+
writeLanceCatalogTool: AnyAgentTool;
|
|
10
13
|
lasTools: AnyAgentTool[];
|
|
11
14
|
};
|
|
@@ -6,6 +6,8 @@ const retrieve_1 = require("../lib/actions/retrieve");
|
|
|
6
6
|
const manage_1 = require("../lib/actions/manage");
|
|
7
7
|
const profiler_1 = require("../lib/actions/profiler");
|
|
8
8
|
const las_tools_1 = require("../lib/actions/las-tools");
|
|
9
|
+
const s3_tools_1 = require("../lib/actions/s3-tools");
|
|
10
|
+
const lance_tools_1 = require("../lib/actions/lance-tools");
|
|
9
11
|
function getAgentTools(pluginConfig, logger) {
|
|
10
12
|
const lasTools = (0, las_tools_1.getLasTools)(pluginConfig, logger);
|
|
11
13
|
return {
|
|
@@ -281,11 +283,97 @@ Example User Queries:
|
|
|
281
283
|
logger.error(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill failed`, { error: error.message, stack: error.stack });
|
|
282
284
|
return {
|
|
283
285
|
content: [{ type: "text", text: String(error.message) }],
|
|
284
|
-
details: { error: error.message
|
|
285
|
-
}
|
|
286
|
+
details: { error: error.message }
|
|
286
287
|
};
|
|
287
288
|
}
|
|
288
289
|
}
|
|
290
|
+
},
|
|
291
|
+
listS3ObjectsTool: {
|
|
292
|
+
name: 'list-s3-objects',
|
|
293
|
+
label: 'List S3 Objects',
|
|
294
|
+
description: 'List objects in an S3-compatible bucket or local directory',
|
|
295
|
+
parameters: {
|
|
296
|
+
type: 'object',
|
|
297
|
+
properties: {
|
|
298
|
+
vendor: { type: 'string', enum: ['volcengine', 'alibaba', 'tencent', 'aws', 'local'] },
|
|
299
|
+
bucket: { type: 'string' },
|
|
300
|
+
prefix: { type: 'string' },
|
|
301
|
+
endpoint: { type: 'string' },
|
|
302
|
+
access_key: { type: 'string' },
|
|
303
|
+
secret_key: { type: 'string' },
|
|
304
|
+
region: { type: 'string' },
|
|
305
|
+
maxKeys: { type: 'integer' },
|
|
306
|
+
continuationToken: { type: 'string' }
|
|
307
|
+
},
|
|
308
|
+
required: ['vendor', 'bucket'],
|
|
309
|
+
additionalProperties: false
|
|
310
|
+
},
|
|
311
|
+
async execute(toolCallId, params) {
|
|
312
|
+
let actualParams = params.params || params;
|
|
313
|
+
try {
|
|
314
|
+
const result = await (0, s3_tools_1.listS3Objects)(actualParams, actualParams.prefix || '', actualParams.maxKeys, actualParams.continuationToken);
|
|
315
|
+
return { content: [{ type: "text", text: JSON.stringify(result) }], details: result };
|
|
316
|
+
}
|
|
317
|
+
catch (e) {
|
|
318
|
+
return { content: [{ type: "text", text: String(e.message) }], details: { error: e.message } };
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
},
|
|
322
|
+
readS3ObjectTool: {
|
|
323
|
+
name: 'read-s3-object',
|
|
324
|
+
label: 'Read S3 Object',
|
|
325
|
+
description: 'Read the contents or headers of an S3 object',
|
|
326
|
+
parameters: {
|
|
327
|
+
type: 'object',
|
|
328
|
+
properties: {
|
|
329
|
+
vendor: { type: 'string', enum: ['volcengine', 'alibaba', 'tencent', 'aws', 'local'] },
|
|
330
|
+
bucket: { type: 'string' },
|
|
331
|
+
key: { type: 'string' },
|
|
332
|
+
endpoint: { type: 'string' },
|
|
333
|
+
access_key: { type: 'string' },
|
|
334
|
+
secret_key: { type: 'string' },
|
|
335
|
+
region: { type: 'string' },
|
|
336
|
+
maxBytes: { type: 'integer' }
|
|
337
|
+
},
|
|
338
|
+
required: ['vendor', 'bucket', 'key'],
|
|
339
|
+
additionalProperties: false
|
|
340
|
+
},
|
|
341
|
+
async execute(toolCallId, params) {
|
|
342
|
+
let actualParams = params.params || params;
|
|
343
|
+
try {
|
|
344
|
+
const buf = await (0, s3_tools_1.readS3Object)(actualParams, actualParams.key, actualParams.maxBytes);
|
|
345
|
+
// Return as base64 string
|
|
346
|
+
return { content: [{ type: "text", text: buf.toString('base64') }], details: { length: buf.length } };
|
|
347
|
+
}
|
|
348
|
+
catch (e) {
|
|
349
|
+
return { content: [{ type: "text", text: String(e.message) }], details: { error: e.message } };
|
|
350
|
+
}
|
|
351
|
+
}
|
|
352
|
+
},
|
|
353
|
+
writeLanceCatalogTool: {
|
|
354
|
+
name: 'write-lance-catalog',
|
|
355
|
+
label: 'Write LanceDB Catalog',
|
|
356
|
+
description: 'Write an array of file records into a local LanceDB table',
|
|
357
|
+
parameters: {
|
|
358
|
+
type: 'object',
|
|
359
|
+
properties: {
|
|
360
|
+
db_path: { type: 'string' },
|
|
361
|
+
table_name: { type: 'string' },
|
|
362
|
+
records: { type: 'array', items: { type: 'object' } }
|
|
363
|
+
},
|
|
364
|
+
required: ['db_path', 'table_name', 'records'],
|
|
365
|
+
additionalProperties: false
|
|
366
|
+
},
|
|
367
|
+
async execute(toolCallId, params) {
|
|
368
|
+
let actualParams = params.params || params;
|
|
369
|
+
try {
|
|
370
|
+
await (0, lance_tools_1.writeLanceCatalog)(actualParams);
|
|
371
|
+
return { content: [{ type: "text", text: "Successfully wrote records to LanceDB" }], details: { count: actualParams.records.length } };
|
|
372
|
+
}
|
|
373
|
+
catch (e) {
|
|
374
|
+
return { content: [{ type: "text", text: String(e.message) }], details: { error: e.message } };
|
|
375
|
+
}
|
|
376
|
+
}
|
|
289
377
|
}
|
|
290
378
|
};
|
|
291
379
|
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.writeLanceCatalog = writeLanceCatalog;
|
|
37
|
+
const lancedb = __importStar(require("@lancedb/lancedb"));
|
|
38
|
+
async function writeLanceCatalog(params) {
|
|
39
|
+
if (!params.records || params.records.length === 0) {
|
|
40
|
+
return;
|
|
41
|
+
}
|
|
42
|
+
const db = await lancedb.connect(params.db_path);
|
|
43
|
+
const tableNames = await db.tableNames();
|
|
44
|
+
if (tableNames.includes(params.table_name)) {
|
|
45
|
+
const table = await db.openTable(params.table_name);
|
|
46
|
+
await table.add(params.records);
|
|
47
|
+
}
|
|
48
|
+
else {
|
|
49
|
+
await db.createTable(params.table_name, params.records);
|
|
50
|
+
}
|
|
51
|
+
}
|
|
@@ -38,25 +38,16 @@ exports.listDataSources = listDataSources;
|
|
|
38
38
|
const path = __importStar(require("path"));
|
|
39
39
|
const fs = __importStar(require("fs"));
|
|
40
40
|
const os = __importStar(require("os"));
|
|
41
|
-
const
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
// ---------------------------------------------------------------------------
|
|
41
|
+
const s3_tools_1 = require("./s3-tools");
|
|
42
|
+
const lance_tools_1 = require("./lance-tools");
|
|
43
|
+
const mime = __importStar(require("mime-types"));
|
|
45
44
|
const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
|
|
46
|
-
const PYTHON_DEPS = ['boto3', 'lancedb', 'pyarrow', 'pandas', 'Pillow', 'mutagen', 'pymupdf'];
|
|
47
|
-
// ---------------------------------------------------------------------------
|
|
48
|
-
// Helpers
|
|
49
|
-
// ---------------------------------------------------------------------------
|
|
50
45
|
function getDataSourceDir(name) {
|
|
51
46
|
return path.join(BASE_DIR, name);
|
|
52
47
|
}
|
|
53
48
|
function ensureDir(dir) {
|
|
54
49
|
fs.mkdirSync(dir, { recursive: true });
|
|
55
50
|
}
|
|
56
|
-
/**
|
|
57
|
-
* Generate env.sh with all connection parameters for this datasource.
|
|
58
|
-
* This file can be sourced to re-run the profiler or for debugging.
|
|
59
|
-
*/
|
|
60
51
|
function writeEnvFile(dir, params) {
|
|
61
52
|
const envPath = path.join(dir, 'env.sh');
|
|
62
53
|
const lines = [
|
|
@@ -69,166 +60,154 @@ function writeEnvFile(dir, params) {
|
|
|
69
60
|
`export LAS_BUCKET="${params.bucket}"`,
|
|
70
61
|
`export LAS_PREFIX="${params.prefix}"`,
|
|
71
62
|
];
|
|
72
|
-
if (params.endpoint)
|
|
63
|
+
if (params.endpoint)
|
|
73
64
|
lines.push(`export LAS_ENDPOINT="${params.endpoint}"`);
|
|
74
|
-
|
|
75
|
-
if (params.access_key) {
|
|
65
|
+
if (params.access_key)
|
|
76
66
|
lines.push(`export LAS_ACCESS_KEY="${params.access_key}"`);
|
|
77
|
-
|
|
78
|
-
if (params.secret_key) {
|
|
67
|
+
if (params.secret_key)
|
|
79
68
|
lines.push(`export LAS_SECRET_KEY="${params.secret_key}"`);
|
|
80
|
-
|
|
81
|
-
if (params.region) {
|
|
69
|
+
if (params.region)
|
|
82
70
|
lines.push(`export LAS_REGION="${params.region}"`);
|
|
83
|
-
|
|
84
|
-
if (params.sample_rows) {
|
|
71
|
+
if (params.sample_rows)
|
|
85
72
|
lines.push(`export LAS_SAMPLE_ROWS="${params.sample_rows}"`);
|
|
86
|
-
}
|
|
87
73
|
lines.push(`export LAS_DB_PATH="${path.join(dir, 'catalog_db')}"`);
|
|
88
74
|
lines.push(`export LAS_DATASOURCE_NAME="${params.datasource_name}"`);
|
|
89
75
|
lines.push('');
|
|
90
76
|
fs.writeFileSync(envPath, lines.join('\n'), { mode: 0o600 });
|
|
91
77
|
return envPath;
|
|
92
78
|
}
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
*/
|
|
112
|
-
function getScriptPath() {
|
|
113
|
-
// The Python script is located in the scripts directory
|
|
114
|
-
return path.join(__dirname, '../scripts', 's3_catalog.py');
|
|
79
|
+
/**
 * Lookup table from lower-case file extension (with leading dot) to its
 * classification. Built once at module load so per-file classification is
 * a single map lookup.
 */
const FILE_CLASS_BY_EXTENSION = (() => {
    const table = new Map();
    const register = (category, mediaType, extensions) => {
        for (const extension of extensions) {
            table.set(extension, { category, mediaType });
        }
    };
    register('structured', '', ['.json', '.jsonl', '.ndjson', '.csv', '.tsv', '.parquet', '.pq']);
    register('non-structured', 'image', ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.tiff', '.tif', '.svg', '.ico', '.heic', '.heif']);
    register('non-structured', 'audio', ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a', '.wma', '.opus']);
    register('non-structured', 'video', ['.mp4', '.avi', '.mov', '.mkv', '.webm', '.wmv', '.flv', '.m4v', '.3gp']);
    register('non-structured', 'pdf', ['.pdf']);
    return table;
})();
/**
 * Classifies a file by its extension.
 *
 * @param ext File extension including the leading dot (any letter case), e.g. `.CSV`.
 * @returns `{ category, mediaType }` where `category` is `'structured'` or
 *   `'non-structured'` and `mediaType` is `'image'`, `'audio'`, `'video'`,
 *   `'pdf'`, or `''` when the extension is structured or unrecognised.
 */
function classifyFile(ext) {
    // A missing or non-string extension is treated as "unknown" rather than
    // raising a TypeError.
    const normalized = typeof ext === 'string' ? ext.toLowerCase() : '';
    const known = FILE_CLASS_BY_EXTENSION.get(normalized);
    // Return a fresh object so callers can never mutate the shared table.
    return known
        ? { category: known.category, mediaType: known.mediaType }
        : { category: 'non-structured', mediaType: '' };
}
|
|
116
|
-
// ---------------------------------------------------------------------------
|
|
117
|
-
// Main Entry
|
|
118
|
-
// ---------------------------------------------------------------------------
|
|
119
98
|
async function connectDataSource(params, _ctx) {
|
|
120
|
-
|
|
121
|
-
if (!params.datasource_name) {
|
|
99
|
+
if (!params.datasource_name)
|
|
122
100
|
throw new Error('datasource_name is required');
|
|
123
|
-
|
|
124
|
-
if (!params.vendor) {
|
|
101
|
+
if (!params.vendor)
|
|
125
102
|
throw new Error('vendor is required');
|
|
126
|
-
|
|
127
|
-
if (!params.bucket) {
|
|
103
|
+
if (!params.bucket)
|
|
128
104
|
throw new Error('bucket is required');
|
|
129
|
-
|
|
130
|
-
if (params.prefix === undefined || params.prefix === null) {
|
|
105
|
+
if (params.prefix === undefined || params.prefix === null)
|
|
131
106
|
throw new Error('prefix is required');
|
|
132
|
-
}
|
|
133
|
-
// For non-local vendors, validate credentials
|
|
134
107
|
if (params.vendor !== 'local') {
|
|
135
|
-
if (!params.endpoint && params.vendor !== 'aws')
|
|
108
|
+
if (!params.endpoint && params.vendor !== 'aws')
|
|
136
109
|
throw new Error(`endpoint is required for vendor "${params.vendor}"`);
|
|
110
|
+
let ak = params.access_key;
|
|
111
|
+
let sk = params.secret_key;
|
|
112
|
+
if (!ak || !sk) {
|
|
113
|
+
try {
|
|
114
|
+
const { loadCredentials } = require('../../utils/credentials');
|
|
115
|
+
const creds = loadCredentials();
|
|
116
|
+
ak = ak || creds.ACCESS_KEY || creds.VOLCENGINE_ACCESS_KEY;
|
|
117
|
+
sk = sk || creds.SECRET_KEY || creds.VOLCENGINE_SECRET_KEY;
|
|
118
|
+
}
|
|
119
|
+
catch (e) {
|
|
120
|
+
// ignore
|
|
121
|
+
}
|
|
137
122
|
}
|
|
138
|
-
const ak = params.access_key || process.env.TOS_ACCESS_KEY || process.env.S3_ACCESS_KEY || process.env.AWS_ACCESS_KEY_ID;
|
|
139
|
-
const sk = params.secret_key || process.env.TOS_SECRET_KEY || process.env.S3_SECRET_KEY || process.env.AWS_SECRET_ACCESS_KEY;
|
|
140
123
|
if (!ak || !sk) {
|
|
141
|
-
throw new Error('access_key and secret_key are required
|
|
124
|
+
throw new Error('access_key and secret_key are required');
|
|
142
125
|
}
|
|
143
|
-
// Normalise into params so env.sh picks them up
|
|
144
126
|
params.access_key = ak;
|
|
145
127
|
params.secret_key = sk;
|
|
146
128
|
}
|
|
147
129
|
const dsDir = getDataSourceDir(params.datasource_name);
|
|
148
130
|
const dbPath = path.join(dsDir, 'catalog_db');
|
|
149
131
|
ensureDir(dsDir);
|
|
150
|
-
// 1. Write env.sh
|
|
151
132
|
const envPath = writeEnvFile(dsDir, params);
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
'
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
db_path: dbPath,
|
|
198
|
-
env_path: envPath,
|
|
199
|
-
tables: [],
|
|
200
|
-
error: stderr || `Python script exited with code ${code}`,
|
|
201
|
-
});
|
|
202
|
-
return;
|
|
203
|
-
}
|
|
204
|
-
// Try to parse structured output from the script
|
|
205
|
-
try {
|
|
206
|
-
const jsonMatch = stdout.match(/\{[\s\S]*"summary"[\s\S]*\}/);
|
|
207
|
-
const result = jsonMatch ? JSON.parse(jsonMatch[0]) : {};
|
|
208
|
-
resolve({
|
|
209
|
-
status: 'success',
|
|
210
|
-
datasource_name: params.datasource_name,
|
|
211
|
-
db_path: dbPath,
|
|
212
|
-
env_path: envPath,
|
|
213
|
-
tables: ['file_catalog', 'structured_schemas', 'media_metadata'],
|
|
214
|
-
summary: result.summary || {
|
|
215
|
-
total_files: 0,
|
|
216
|
-
structured_files: 0,
|
|
217
|
-
media_files: 0,
|
|
218
|
-
},
|
|
219
|
-
});
|
|
220
|
-
}
|
|
221
|
-
catch {
|
|
222
|
-
resolve({
|
|
223
|
-
status: 'success',
|
|
224
|
-
datasource_name: params.datasource_name,
|
|
225
|
-
db_path: dbPath,
|
|
226
|
-
env_path: envPath,
|
|
227
|
-
tables: ['file_catalog', 'structured_schemas', 'media_metadata'],
|
|
133
|
+
try {
|
|
134
|
+
let isTruncated = true;
|
|
135
|
+
let continuationToken = undefined;
|
|
136
|
+
let total_files = 0;
|
|
137
|
+
let structured_files = 0;
|
|
138
|
+
let media_files = 0;
|
|
139
|
+
const allRecords = [];
|
|
140
|
+
const scan_ts = new Date().toISOString() + 'Z';
|
|
141
|
+
while (isTruncated) {
|
|
142
|
+
const response = await (0, s3_tools_1.listS3Objects)(params, params.prefix, 1000, continuationToken);
|
|
143
|
+
for (const obj of response.Contents) {
|
|
144
|
+
const key = obj.Key || '';
|
|
145
|
+
if (key.endsWith('/'))
|
|
146
|
+
continue;
|
|
147
|
+
const name = path.basename(key);
|
|
148
|
+
const ext = path.extname(name).toLowerCase();
|
|
149
|
+
const mimeType = mime.lookup(name) || '';
|
|
150
|
+
const { category, mediaType } = classifyFile(ext);
|
|
151
|
+
const depth = (key.match(/\//g) || []).length;
|
|
152
|
+
const parentDir = key.includes('/') ? path.basename(path.dirname(key)) : '';
|
|
153
|
+
total_files++;
|
|
154
|
+
if (category === 'structured')
|
|
155
|
+
structured_files++;
|
|
156
|
+
if (mediaType)
|
|
157
|
+
media_files++;
|
|
158
|
+
allRecords.push({
|
|
159
|
+
file_path: key,
|
|
160
|
+
file_name: name,
|
|
161
|
+
extension: ext,
|
|
162
|
+
mime_type: mimeType,
|
|
163
|
+
category: category,
|
|
164
|
+
media_type: mediaType,
|
|
165
|
+
size_bytes: obj.Size || 0,
|
|
166
|
+
last_modified: obj.LastModified ? String(obj.LastModified) : '',
|
|
167
|
+
created_time: obj._created_time ? String(obj._created_time) : '',
|
|
168
|
+
etag: (obj.ETag || '').replace(/"/g, ''),
|
|
169
|
+
storage_class: obj.StorageClass || '',
|
|
170
|
+
is_multipart: (obj.ETag || '').includes('-'),
|
|
171
|
+
depth: depth,
|
|
172
|
+
parent_dir: parentDir,
|
|
173
|
+
vendor: params.vendor,
|
|
174
|
+
bucket: params.bucket,
|
|
175
|
+
has_schema: false,
|
|
176
|
+
has_media_meta: false,
|
|
177
|
+
scan_timestamp: scan_ts
|
|
228
178
|
});
|
|
229
179
|
}
|
|
180
|
+
isTruncated = response.IsTruncated || false;
|
|
181
|
+
continuationToken = response.NextContinuationToken;
|
|
182
|
+
}
|
|
183
|
+
await (0, lance_tools_1.writeLanceCatalog)({
|
|
184
|
+
db_path: dbPath,
|
|
185
|
+
table_name: 'file_catalog',
|
|
186
|
+
records: allRecords
|
|
230
187
|
});
|
|
231
|
-
|
|
188
|
+
return {
|
|
189
|
+
status: 'success',
|
|
190
|
+
datasource_name: params.datasource_name,
|
|
191
|
+
db_path: dbPath,
|
|
192
|
+
env_path: envPath,
|
|
193
|
+
tables: ['file_catalog'],
|
|
194
|
+
summary: {
|
|
195
|
+
total_files,
|
|
196
|
+
structured_files,
|
|
197
|
+
media_files
|
|
198
|
+
}
|
|
199
|
+
};
|
|
200
|
+
}
|
|
201
|
+
catch (error) {
|
|
202
|
+
return {
|
|
203
|
+
status: 'error',
|
|
204
|
+
datasource_name: params.datasource_name,
|
|
205
|
+
db_path: dbPath,
|
|
206
|
+
env_path: envPath,
|
|
207
|
+
tables: [],
|
|
208
|
+
error: error.message
|
|
209
|
+
};
|
|
210
|
+
}
|
|
232
211
|
}
|
|
233
212
|
async function listDataSources(_ctx) {
|
|
234
213
|
try {
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
export interface S3Params {
|
|
2
|
+
vendor: 'volcengine' | 'alibaba' | 'tencent' | 'aws' | 'local';
|
|
3
|
+
endpoint?: string;
|
|
4
|
+
access_key?: string;
|
|
5
|
+
secret_key?: string;
|
|
6
|
+
region?: string;
|
|
7
|
+
bucket: string;
|
|
8
|
+
}
|
|
9
|
+
export declare function listS3Objects(params: S3Params, prefix: string, maxKeys?: number, continuationToken?: string): Promise<{
|
|
10
|
+
Contents: any[];
|
|
11
|
+
IsTruncated: boolean;
|
|
12
|
+
NextContinuationToken: undefined;
|
|
13
|
+
} | {
|
|
14
|
+
Contents: import("@aws-sdk/client-s3")._Object[];
|
|
15
|
+
IsTruncated: boolean | undefined;
|
|
16
|
+
NextContinuationToken: string | undefined;
|
|
17
|
+
}>;
|
|
18
|
+
export declare function readS3Object(params: S3Params, key: string, maxBytes?: number): Promise<Buffer>;
|