@byted-las/contextlake-openclaw 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +45 -23
- package/dist/src/commands/cli.js +6 -4
- package/dist/src/commands/index.js +6 -0
- package/dist/src/commands/tools.d.ts +3 -0
- package/dist/src/commands/tools.js +90 -2
- package/dist/src/lib/actions/lance-tools.d.ts +6 -0
- package/dist/src/lib/actions/lance-tools.js +51 -0
- package/dist/src/lib/actions/profiler.js +119 -140
- package/dist/src/lib/actions/s3-tools.d.ts +18 -0
- package/dist/src/lib/actions/s3-tools.js +167 -0
- package/dist/src/skills/SKILL.md +14 -151
- package/dist/src/skills/las-data-profiler/SKILL.md +14 -151
- package/dist/src/utils/config.js +5 -4
- package/dist/src/utils/credentials.d.ts +4 -0
- package/openclaw.plugin.json +1 -1
- package/package.json +2 -1
- package/src/commands/cli.ts +6 -4
- package/src/commands/index.ts +9 -0
- package/src/commands/tools.ts +91 -4
- package/src/lib/actions/lance-tools.ts +23 -0
- package/src/lib/actions/profiler.ts +116 -160
- package/src/lib/actions/s3-tools.ts +148 -0
- package/src/skills/las-data-profiler/SKILL.md +14 -151
- package/src/utils/config.ts +5 -4
- package/src/utils/credentials.ts +6 -0
- package/src/lib/scripts/s3_catalog.py +0 -617
package/src/commands/tools.ts
CHANGED
|
@@ -3,6 +3,8 @@ import { retrieveAssets } from '../lib/actions/retrieve';
|
|
|
3
3
|
import { listAssets, deleteAssets } from '../lib/actions/manage';
|
|
4
4
|
import { connectDataSource, listDataSources } from '../lib/actions/profiler';
|
|
5
5
|
import { getLasTools } from '../lib/actions/las-tools';
|
|
6
|
+
import { listS3Objects, readS3Object } from '../lib/actions/s3-tools';
|
|
7
|
+
import { writeLanceCatalog } from '../lib/actions/lance-tools';
|
|
6
8
|
import { ContextLakeConfig } from '../utils/config';
|
|
7
9
|
// @ts-ignore
|
|
8
10
|
import type { AnyAgentTool } from 'openclaw/plugin-sdk';
|
|
@@ -14,6 +16,9 @@ export function getAgentTools(pluginConfig: ContextLakeConfig, logger: any): {
|
|
|
14
16
|
deleteTool: AnyAgentTool;
|
|
15
17
|
lasDataProfilerTool: AnyAgentTool;
|
|
16
18
|
listDatasourceTool: AnyAgentTool;
|
|
19
|
+
listS3ObjectsTool: AnyAgentTool;
|
|
20
|
+
readS3ObjectTool: AnyAgentTool;
|
|
21
|
+
writeLanceCatalogTool: AnyAgentTool;
|
|
17
22
|
lasTools: AnyAgentTool[];
|
|
18
23
|
} {
|
|
19
24
|
const lasTools = getLasTools(pluginConfig, logger);
|
|
@@ -299,13 +304,95 @@ Example User Queries:
|
|
|
299
304
|
} catch (error: any) {
|
|
300
305
|
logger.error(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill failed`, { error: error.message, stack: error.stack });
|
|
301
306
|
return {
|
|
302
|
-
content: [{ type: "text", text: String(error.message
|
|
303
|
-
|
|
304
|
-
details: { error: error.message
|
|
305
|
-
}
|
|
307
|
+
content: [{ type: "text", text: String(error.message) }],
|
|
308
|
+
details: { error: error.message }
|
|
306
309
|
} as any;
|
|
307
310
|
}
|
|
308
311
|
}
|
|
312
|
+
},
|
|
313
|
+
listS3ObjectsTool: {
  name: 'list-s3-objects',
  label: 'List S3 Objects',
  description: 'List objects in an S3-compatible bucket or local directory',
  parameters: {
    type: 'object',
    properties: {
      vendor: {
        type: 'string',
        enum: ['volcengine', 'alibaba', 'tencent', 'aws', 'local'],
      },
      bucket: { type: 'string' },
      prefix: { type: 'string' },
      endpoint: { type: 'string' },
      access_key: { type: 'string' },
      secret_key: { type: 'string' },
      region: { type: 'string' },
      maxKeys: { type: 'integer' },
      continuationToken: { type: 'string' },
    },
    required: ['vendor', 'bucket'],
    additionalProperties: false,
  },
  /**
   * Fetch one listing page via `listS3Objects` and return it both as JSON text
   * and as structured `details`. A thrown error is reported through the same
   * result shape instead of propagating.
   */
  async execute(toolCallId: string, params: any) {
    // Arguments may arrive nested under `params`; otherwise use the object as given.
    const args = params.params || params;
    try {
      const { prefix, maxKeys, continuationToken } = args;
      const listing = await listS3Objects(args, prefix || '', maxKeys, continuationToken);
      const text = JSON.stringify(listing);
      return { content: [{ type: "text", text }], details: listing } as any;
    } catch (e: any) {
      const message = e.message;
      return { content: [{ type: "text", text: String(message) }], details: { error: message } } as any;
    }
  },
},
|
|
343
|
+
readS3ObjectTool: {
  name: 'read-s3-object',
  label: 'Read S3 Object',
  description: 'Read the contents or headers of an S3 object',
  parameters: {
    type: 'object',
    properties: {
      vendor: {
        type: 'string',
        enum: ['volcengine', 'alibaba', 'tencent', 'aws', 'local'],
      },
      bucket: { type: 'string' },
      key: { type: 'string' },
      endpoint: { type: 'string' },
      access_key: { type: 'string' },
      secret_key: { type: 'string' },
      region: { type: 'string' },
      maxBytes: { type: 'integer' },
    },
    required: ['vendor', 'bucket', 'key'],
    additionalProperties: false,
  },
  /**
   * Read the object via `readS3Object` and return its bytes base64-encoded as
   * the text content, with the byte count in `details.length`. A thrown error
   * is reported through the same result shape instead of propagating.
   */
  async execute(toolCallId: string, params: any) {
    // Arguments may arrive nested under `params`; otherwise use the object as given.
    const args = params.params || params;
    try {
      const { key, maxBytes } = args;
      const bytes = await readS3Object(args, key, maxBytes);
      const encoded = bytes.toString('base64');
      return { content: [{ type: "text", text: encoded }], details: { length: bytes.length } } as any;
    } catch (e: any) {
      const message = e.message;
      return { content: [{ type: "text", text: String(message) }], details: { error: message } } as any;
    }
  },
},
|
|
373
|
+
writeLanceCatalogTool: {
  name: 'write-lance-catalog',
  label: 'Write LanceDB Catalog',
  description: 'Write an array of file records into a local LanceDB table',
  parameters: {
    type: 'object',
    properties: {
      db_path: { type: 'string' },
      table_name: { type: 'string' },
      records: {
        type: 'array',
        items: { type: 'object' },
      },
    },
    required: ['db_path', 'table_name', 'records'],
    additionalProperties: false,
  },
  /**
   * Hand the arguments to `writeLanceCatalog` and report the number of records
   * supplied in `details.count`. A thrown error is reported through the same
   * result shape instead of propagating.
   */
  async execute(toolCallId: string, params: any) {
    // Arguments may arrive nested under `params`; otherwise use the object as given.
    const args = params.params || params;
    try {
      await writeLanceCatalog(args);
      const count = args.records.length;
      return { content: [{ type: "text", text: "Successfully wrote records to LanceDB" }], details: { count } } as any;
    } catch (e: any) {
      const message = e.message;
      return { content: [{ type: "text", text: String(message) }], details: { error: message } } as any;
    }
  },
}
|
|
310
397
|
};
|
|
311
398
|
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import * as lancedb from '@lancedb/lancedb';
|
|
2
|
+
|
|
3
|
+
/** Arguments accepted by `writeLanceCatalog`. */
export interface LanceWriteParams {
  /** Location passed to `lancedb.connect`. */
  db_path: string;
  /** Name of the table that receives the records. */
  table_name: string;
  /** Rows to store; the element shape is not constrained by this type. */
  records: Array<any>;
}
|
|
8
|
+
|
|
9
|
+
/**
 * Store `params.records` in the table `params.table_name` of the LanceDB
 * database at `params.db_path`.
 *
 * When the connection already lists the table, the rows are appended to it;
 * otherwise the table is created from the rows. A missing or empty `records`
 * value makes the call a no-op and no connection is opened.
 */
export async function writeLanceCatalog(params: LanceWriteParams) {
  const { db_path: dbPath, table_name: tableName, records: rows } = params;

  const nothingToWrite = !rows || rows.length === 0;
  if (nothingToWrite) {
    return;
  }

  const connection = await lancedb.connect(dbPath);
  const knownTables = await connection.tableNames();

  if (!knownTables.includes(tableName)) {
    await connection.createTable(tableName, rows);
    return;
  }

  const target = await connection.openTable(tableName);
  await target.add(rows);
}
|
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
import * as path from 'path';
|
|
2
2
|
import * as fs from 'fs';
|
|
3
3
|
import * as os from 'os';
|
|
4
|
-
import {
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
// Types
|
|
8
|
-
// ---------------------------------------------------------------------------
|
|
4
|
+
import { listS3Objects } from './s3-tools';
|
|
5
|
+
import { writeLanceCatalog } from './lance-tools';
|
|
6
|
+
import * as mime from 'mime-types';
|
|
9
7
|
|
|
10
8
|
export interface ConnectParams {
|
|
11
9
|
datasource_name: string;
|
|
@@ -33,16 +31,7 @@ export interface ConnectResult {
|
|
|
33
31
|
error?: string;
|
|
34
32
|
}
|
|
35
33
|
|
|
36
|
-
// ---------------------------------------------------------------------------
|
|
37
|
-
// Constants
|
|
38
|
-
// ---------------------------------------------------------------------------
|
|
39
|
-
|
|
40
34
|
const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
|
|
41
|
-
const PYTHON_DEPS = ['boto3', 'lancedb', 'pyarrow', 'pandas', 'Pillow', 'mutagen', 'pymupdf'];
|
|
42
|
-
|
|
43
|
-
// ---------------------------------------------------------------------------
|
|
44
|
-
// Helpers
|
|
45
|
-
// ---------------------------------------------------------------------------
|
|
46
35
|
|
|
47
36
|
function getDataSourceDir(name: string): string {
|
|
48
37
|
return path.join(BASE_DIR, name);
|
|
@@ -52,10 +41,6 @@ function ensureDir(dir: string): void {
|
|
|
52
41
|
fs.mkdirSync(dir, { recursive: true });
|
|
53
42
|
}
|
|
54
43
|
|
|
55
|
-
/**
|
|
56
|
-
* Generate env.sh with all connection parameters for this datasource.
|
|
57
|
-
* This file can be sourced to re-run the profiler or for debugging.
|
|
58
|
-
*/
|
|
59
44
|
function writeEnvFile(dir: string, params: ConnectParams): string {
|
|
60
45
|
const envPath = path.join(dir, 'env.sh');
|
|
61
46
|
const lines: string[] = [
|
|
@@ -69,21 +54,11 @@ function writeEnvFile(dir: string, params: ConnectParams): string {
|
|
|
69
54
|
`export LAS_PREFIX="${params.prefix}"`,
|
|
70
55
|
];
|
|
71
56
|
|
|
72
|
-
if (params.endpoint) {
|
|
73
|
-
|
|
74
|
-
}
|
|
75
|
-
if (params.
|
|
76
|
-
|
|
77
|
-
}
|
|
78
|
-
if (params.secret_key) {
|
|
79
|
-
lines.push(`export LAS_SECRET_KEY="${params.secret_key}"`);
|
|
80
|
-
}
|
|
81
|
-
if (params.region) {
|
|
82
|
-
lines.push(`export LAS_REGION="${params.region}"`);
|
|
83
|
-
}
|
|
84
|
-
if (params.sample_rows) {
|
|
85
|
-
lines.push(`export LAS_SAMPLE_ROWS="${params.sample_rows}"`);
|
|
86
|
-
}
|
|
57
|
+
if (params.endpoint) lines.push(`export LAS_ENDPOINT="${params.endpoint}"`);
|
|
58
|
+
if (params.access_key) lines.push(`export LAS_ACCESS_KEY="${params.access_key}"`);
|
|
59
|
+
if (params.secret_key) lines.push(`export LAS_SECRET_KEY="${params.secret_key}"`);
|
|
60
|
+
if (params.region) lines.push(`export LAS_REGION="${params.region}"`);
|
|
61
|
+
if (params.sample_rows) lines.push(`export LAS_SAMPLE_ROWS="${params.sample_rows}"`);
|
|
87
62
|
|
|
88
63
|
lines.push(`export LAS_DB_PATH="${path.join(dir, 'catalog_db')}"`);
|
|
89
64
|
lines.push(`export LAS_DATASOURCE_NAME="${params.datasource_name}"`);
|
|
@@ -93,65 +68,51 @@ function writeEnvFile(dir: string, params: ConnectParams): string {
|
|
|
93
68
|
return envPath;
|
|
94
69
|
}
|
|
95
70
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
/**
|
|
113
|
-
* Get the path to the bundled Python script.
|
|
114
|
-
*/
|
|
115
|
-
function getScriptPath(): string {
|
|
116
|
-
// The Python script is located in the scripts directory
|
|
117
|
-
return path.join(__dirname, '../scripts', 's3_catalog.py');
|
|
71
|
+
/** Lower-case extensions (with leading dot) that count as structured data. */
const STRUCTURED_EXTENSIONS: ReadonlySet<string> = new Set([
  '.json', '.jsonl', '.ndjson', '.csv', '.tsv', '.parquet', '.pq',
]);

/** Recognised non-structured extensions, grouped by the media type reported for them. */
const MEDIA_EXTENSIONS: Readonly<Record<string, readonly string[]>> = {
  image: ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.tiff', '.tif', '.svg', '.ico', '.heic', '.heif'],
  audio: ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a', '.wma', '.opus'],
  video: ['.mp4', '.avi', '.mov', '.mkv', '.webm', '.wmv', '.flv', '.m4v', '.3gp'],
  pdf: ['.pdf'],
};

/** Reverse index of MEDIA_EXTENSIONS (extension -> media type), built once at module load. */
const MEDIA_TYPE_BY_EXTENSION = new Map<string, string>();
for (const [mediaType, extensions] of Object.entries(MEDIA_EXTENSIONS)) {
  for (const extension of extensions) {
    MEDIA_TYPE_BY_EXTENSION.set(extension, mediaType);
  }
}

/**
 * Classify a file by its extension.
 *
 * @param ext File extension including the leading dot (e.g. `.csv`); matching
 *   is case-insensitive.
 * @returns `category` is `'structured'` for the structured-data extensions and
 *   `'non-structured'` for everything else. `mediaType` is `'image'`,
 *   `'audio'`, `'video'` or `'pdf'` for recognised media extensions and `''`
 *   otherwise (including for structured files). A new object is returned on
 *   every call.
 */
function classifyFile(ext: string): { category: string; mediaType: string } {
  // Look up in tables built once instead of re-creating and scanning arrays per call.
  const normalized = ext.toLowerCase();

  if (STRUCTURED_EXTENSIONS.has(normalized)) {
    return { category: 'structured', mediaType: '' };
  }

  return {
    category: 'non-structured',
    mediaType: MEDIA_TYPE_BY_EXTENSION.get(normalized) ?? '',
  };
}
|
|
119
87
|
|
|
120
|
-
// ---------------------------------------------------------------------------
|
|
121
|
-
// Main Entry
|
|
122
|
-
// ---------------------------------------------------------------------------
|
|
123
|
-
|
|
124
88
|
export async function connectDataSource(
|
|
125
89
|
params: ConnectParams,
|
|
126
90
|
_ctx?: any
|
|
127
91
|
): Promise<ConnectResult> {
|
|
128
|
-
|
|
129
|
-
if (!params.
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
if (!params.vendor) {
|
|
133
|
-
throw new Error('vendor is required');
|
|
134
|
-
}
|
|
135
|
-
if (!params.bucket) {
|
|
136
|
-
throw new Error('bucket is required');
|
|
137
|
-
}
|
|
138
|
-
if (params.prefix === undefined || params.prefix === null) {
|
|
139
|
-
throw new Error('prefix is required');
|
|
140
|
-
}
|
|
92
|
+
if (!params.datasource_name) throw new Error('datasource_name is required');
|
|
93
|
+
if (!params.vendor) throw new Error('vendor is required');
|
|
94
|
+
if (!params.bucket) throw new Error('bucket is required');
|
|
95
|
+
if (params.prefix === undefined || params.prefix === null) throw new Error('prefix is required');
|
|
141
96
|
|
|
142
|
-
// For non-local vendors, validate credentials
|
|
143
97
|
if (params.vendor !== 'local') {
|
|
144
|
-
if (!params.endpoint && params.vendor !== 'aws') {
|
|
145
|
-
|
|
98
|
+
if (!params.endpoint && params.vendor !== 'aws') throw new Error(`endpoint is required for vendor "${params.vendor}"`);
|
|
99
|
+
let ak = params.access_key;
|
|
100
|
+
let sk = params.secret_key;
|
|
101
|
+
|
|
102
|
+
if (!ak || !sk) {
|
|
103
|
+
try {
|
|
104
|
+
const { loadCredentials } = require('../../utils/credentials');
|
|
105
|
+
const creds = loadCredentials();
|
|
106
|
+
ak = ak || creds.ACCESS_KEY || creds.VOLCENGINE_ACCESS_KEY;
|
|
107
|
+
sk = sk || creds.SECRET_KEY || creds.VOLCENGINE_SECRET_KEY;
|
|
108
|
+
} catch(e) {
|
|
109
|
+
// ignore
|
|
110
|
+
}
|
|
146
111
|
}
|
|
147
|
-
|
|
148
|
-
const sk = params.secret_key || process.env.TOS_SECRET_KEY || process.env.S3_SECRET_KEY || process.env.AWS_SECRET_ACCESS_KEY;
|
|
112
|
+
|
|
149
113
|
if (!ak || !sk) {
|
|
150
|
-
throw new Error(
|
|
151
|
-
'access_key and secret_key are required (via params or env vars TOS_ACCESS_KEY/TOS_SECRET_KEY, S3_ACCESS_KEY/S3_SECRET_KEY, AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY)'
|
|
152
|
-
);
|
|
114
|
+
throw new Error('access_key and secret_key are required');
|
|
153
115
|
}
|
|
154
|
-
// Normalise into params so env.sh picks them up
|
|
155
116
|
params.access_key = ak;
|
|
156
117
|
params.secret_key = sk;
|
|
157
118
|
}
|
|
@@ -160,97 +121,92 @@ export async function connectDataSource(
|
|
|
160
121
|
const dbPath = path.join(dsDir, 'catalog_db');
|
|
161
122
|
|
|
162
123
|
ensureDir(dsDir);
|
|
163
|
-
|
|
164
|
-
// 1. Write env.sh
|
|
165
124
|
const envPath = writeEnvFile(dsDir, params);
|
|
166
125
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
126
|
+
try {
|
|
127
|
+
let isTruncated = true;
|
|
128
|
+
let continuationToken: string | undefined = undefined;
|
|
129
|
+
let total_files = 0;
|
|
130
|
+
let structured_files = 0;
|
|
131
|
+
let media_files = 0;
|
|
132
|
+
|
|
133
|
+
const allRecords: any[] = [];
|
|
134
|
+
const scan_ts = new Date().toISOString() + 'Z';
|
|
135
|
+
|
|
136
|
+
while (isTruncated) {
|
|
137
|
+
const response = await listS3Objects(params, params.prefix, 1000, continuationToken);
|
|
138
|
+
|
|
139
|
+
for (const obj of response.Contents) {
|
|
140
|
+
const key = obj.Key || '';
|
|
141
|
+
if (key.endsWith('/')) continue;
|
|
142
|
+
|
|
143
|
+
const name = path.basename(key);
|
|
144
|
+
const ext = path.extname(name).toLowerCase();
|
|
145
|
+
const mimeType = mime.lookup(name) || '';
|
|
146
|
+
const { category, mediaType } = classifyFile(ext);
|
|
147
|
+
const depth = (key.match(/\//g) || []).length;
|
|
148
|
+
const parentDir = key.includes('/') ? path.basename(path.dirname(key)) : '';
|
|
149
|
+
|
|
150
|
+
total_files++;
|
|
151
|
+
if (category === 'structured') structured_files++;
|
|
152
|
+
if (mediaType) media_files++;
|
|
153
|
+
|
|
154
|
+
allRecords.push({
|
|
155
|
+
file_path: key,
|
|
156
|
+
file_name: name,
|
|
157
|
+
extension: ext,
|
|
158
|
+
mime_type: mimeType,
|
|
159
|
+
category: category,
|
|
160
|
+
media_type: mediaType,
|
|
161
|
+
size_bytes: obj.Size || 0,
|
|
162
|
+
last_modified: obj.LastModified ? String(obj.LastModified) : '',
|
|
163
|
+
created_time: obj._created_time ? String(obj._created_time) : '',
|
|
164
|
+
etag: (obj.ETag || '').replace(/"/g, ''),
|
|
165
|
+
storage_class: obj.StorageClass || '',
|
|
166
|
+
is_multipart: (obj.ETag || '').includes('-'),
|
|
167
|
+
depth: depth,
|
|
168
|
+
parent_dir: parentDir,
|
|
169
|
+
vendor: params.vendor,
|
|
170
|
+
bucket: params.bucket,
|
|
171
|
+
has_schema: false,
|
|
172
|
+
has_media_meta: false,
|
|
173
|
+
scan_timestamp: scan_ts
|
|
174
|
+
});
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
isTruncated = response.IsTruncated || false;
|
|
178
|
+
continuationToken = response.NextContinuationToken;
|
|
179
|
+
}
|
|
213
180
|
|
|
214
|
-
|
|
215
|
-
if (code !== 0) {
|
|
216
|
-
resolve({
|
|
217
|
-
status: 'error',
|
|
218
|
-
datasource_name: params.datasource_name,
|
|
181
|
+
await writeLanceCatalog({
|
|
219
182
|
db_path: dbPath,
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
});
|
|
224
|
-
return;
|
|
225
|
-
}
|
|
183
|
+
table_name: 'file_catalog',
|
|
184
|
+
records: allRecords
|
|
185
|
+
});
|
|
226
186
|
|
|
227
|
-
|
|
228
|
-
try {
|
|
229
|
-
const jsonMatch = stdout.match(/\{[\s\S]*"summary"[\s\S]*\}/);
|
|
230
|
-
const result = jsonMatch ? JSON.parse(jsonMatch[0]) : {};
|
|
231
|
-
resolve({
|
|
187
|
+
return {
|
|
232
188
|
status: 'success',
|
|
233
189
|
datasource_name: params.datasource_name,
|
|
234
190
|
db_path: dbPath,
|
|
235
191
|
env_path: envPath,
|
|
236
|
-
tables: ['file_catalog'
|
|
237
|
-
summary:
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
}
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
192
|
+
tables: ['file_catalog'],
|
|
193
|
+
summary: {
|
|
194
|
+
total_files,
|
|
195
|
+
structured_files,
|
|
196
|
+
media_files
|
|
197
|
+
}
|
|
198
|
+
};
|
|
199
|
+
|
|
200
|
+
} catch (error: any) {
|
|
201
|
+
return {
|
|
202
|
+
status: 'error',
|
|
246
203
|
datasource_name: params.datasource_name,
|
|
247
204
|
db_path: dbPath,
|
|
248
205
|
env_path: envPath,
|
|
249
|
-
tables: [
|
|
250
|
-
|
|
251
|
-
}
|
|
252
|
-
|
|
253
|
-
});
|
|
206
|
+
tables: [],
|
|
207
|
+
error: error.message
|
|
208
|
+
};
|
|
209
|
+
}
|
|
254
210
|
}
|
|
255
211
|
|
|
256
212
|
export async function listDataSources(
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
import { S3Client, ListObjectsV2Command, GetObjectCommand } from '@aws-sdk/client-s3';
|
|
2
|
+
import * as fs from 'fs';
|
|
3
|
+
import * as path from 'path';
|
|
4
|
+
|
|
5
|
+
/** Connection settings shared by the storage helpers in this module. */
export interface S3Params {
  /** Storage backend; `'local'` selects the filesystem code paths instead of an S3 client. */
  vendor: 'volcengine' | 'alibaba' | 'tencent' | 'aws' | 'local';
  /** Bucket name, or the root directory when `vendor` is `'local'`. */
  bucket: string;
  /** Explicit service endpoint URL. */
  endpoint?: string;
  /** Region identifier. */
  region?: string;
  /** Access key id. */
  access_key?: string;
  /** Secret access key. */
  secret_key?: string;
}
|
|
13
|
+
|
|
14
|
+
/**
 * Build an S3 client for the given connection settings.
 *
 * Returns `null` for the `'local'` vendor. For other vendors, whichever of
 * access key, secret key and region is missing is looked up through
 * `loadCredentials()`; a failure of that lookup is ignored. Keys that are
 * still missing are passed as empty strings and the region falls back to
 * `us-east-1`. When no endpoint is supplied, volcengine, alibaba and tencent
 * get an endpoint derived from the region (defaulting to `cn-beijing`,
 * `cn-hangzhou` and `ap-beijing` respectively); for any other vendor the
 * endpoint is left undefined.
 */
function createS3Client(params: S3Params): S3Client | null {
  if (params.vendor === 'local') {
    return null;
  }

  let accessKeyId = params.access_key;
  let secretAccessKey = params.secret_key;
  let region = params.region;

  const fullySpecified = Boolean(accessKeyId && secretAccessKey && region);
  if (!fullySpecified) {
    try {
      const { loadCredentials: loadStored } = require('../../utils/credentials');
      const stored = loadStored();
      accessKeyId = accessKeyId || stored.ACCESS_KEY || stored.VOLCENGINE_ACCESS_KEY;
      secretAccessKey = secretAccessKey || stored.SECRET_KEY || stored.VOLCENGINE_SECRET_KEY;
      region = region || stored.REGION || stored.VOLCENGINE_REGION;
    } catch (e) {
      // Best effort: continue with whatever the caller supplied.
    }
  }

  let endpoint = params.endpoint;
  if (!endpoint) {
    switch (params.vendor) {
      case 'volcengine':
        endpoint = `https://tos-s3-${region || 'cn-beijing'}.volces.com`;
        break;
      case 'alibaba':
        endpoint = `https://s3.oss-${region || 'cn-hangzhou'}.aliyuncs.com`;
        break;
      case 'tencent':
        endpoint = `https://cos.${region || 'ap-beijing'}.myqcloud.com`;
        break;
      default:
        break;
    }
  }

  return new S3Client({
    region: region || 'us-east-1',
    endpoint,
    credentials: {
      accessKeyId: accessKeyId || '',
      secretAccessKey: secretAccessKey || '',
    },
    forcePathStyle: false, // virtual-hosted-style addressing
  });
}
|
|
57
|
+
|
|
58
|
+
/**
 * List objects under `prefix`, either from an S3-compatible bucket or, for the
 * `'local'` vendor, from a directory tree on disk.
 *
 * Local mode: `params.bucket` is the root directory and `prefix` (unless empty
 * or `'.'`) is joined onto it as the directory to walk. The whole tree is
 * returned in a single page; `maxKeys` and `continuationToken` are not used.
 * Each entry mimics an S3 object (`Key`, `Size`, `LastModified`, `ETag`,
 * `StorageClass`) plus `_created_time` taken from the file's `ctime`. `Key` is
 * the path relative to the root and always uses `/` as separator, so keys have
 * the same shape as S3 keys regardless of the host OS.
 *
 * Remote mode: sends one `ListObjectsV2Command` and returns that page.
 *
 * @param params Connection settings.
 * @param prefix Key prefix (remote) or sub-directory of the root (local).
 * @param maxKeys Page size for remote listings; defaults to 1000.
 * @param continuationToken Token from a previous page, for remote listings.
 * @returns `Contents` (empty array when the response has none), `IsTruncated`
 *   and `NextContinuationToken`.
 * @throws Error when no client could be created for a non-local vendor;
 *   filesystem and SDK errors propagate unchanged.
 */
export async function listS3Objects(params: S3Params, prefix: string, maxKeys: number = 1000, continuationToken?: string) {
  if (params.vendor === 'local') {
    const root = params.bucket;
    const prefixPath = prefix && prefix !== '.' ? path.join(root, prefix) : root;
    const files: any[] = [];

    function walkSync(currentDirPath: string) {
      // A missing directory simply contributes no entries.
      if (!fs.existsSync(currentDirPath)) return;
      const dirents = fs.readdirSync(currentDirPath, { withFileTypes: true });
      for (const dirent of dirents) {
        const res = path.resolve(currentDirPath, dirent.name);
        if (dirent.isDirectory()) {
          walkSync(res);
        } else {
          const stat = fs.statSync(res);
          files.push({
            // path.relative uses the OS separator; normalise to '/' so keys
            // look like S3 keys on Windows too (no-op where path.sep is '/').
            Key: path.relative(root, res).split(path.sep).join('/'),
            Size: stat.size,
            LastModified: stat.mtime,
            ETag: '',
            StorageClass: 'LOCAL',
            _created_time: stat.ctime
          });
        }
      }
    }
    walkSync(prefixPath);
    return {
      Contents: files,
      IsTruncated: false,
      NextContinuationToken: undefined
    };
  }

  const client = createS3Client(params);
  if (!client) throw new Error('Failed to create S3 client');

  const command = new ListObjectsV2Command({
    Bucket: params.bucket,
    Prefix: prefix,
    MaxKeys: maxKeys,
    ContinuationToken: continuationToken
  });

  const response = await client.send(command);
  return {
    Contents: response.Contents || [],
    IsTruncated: response.IsTruncated,
    NextContinuationToken: response.NextContinuationToken
  };
}
|
|
109
|
+
|
|
110
|
+
/**
 * Read an object's bytes, either from an S3-compatible bucket or, for the
 * `'local'` vendor, from the file at `params.bucket` joined with `key`.
 *
 * @param params Connection settings.
 * @param key Object key, or path relative to the root directory in local mode.
 * @param maxBytes When truthy, only the first `maxBytes` bytes are read (a
 *   positional read from offset 0 locally, a `Range` header remotely). When
 *   omitted or `0`, the whole object is read.
 * @returns The bytes read; an empty buffer when a remote response has no body.
 * @throws Error when no client could be created for a non-local vendor;
 *   filesystem and SDK errors propagate unchanged.
 */
export async function readS3Object(params: S3Params, key: string, maxBytes?: number): Promise<Buffer> {
  if (params.vendor === 'local') {
    const fullPath = path.join(params.bucket, key);
    if (!maxBytes) {
      return fs.readFileSync(fullPath);
    }

    const fd = fs.openSync(fullPath, 'r');
    try {
      const buffer = Buffer.alloc(maxBytes);
      const bytesRead = fs.readSync(fd, buffer, 0, maxBytes, 0);
      // The file may be shorter than maxBytes; return only what was read.
      return buffer.subarray(0, bytesRead);
    } finally {
      // Always release the descriptor, even when allocation or the read throws.
      fs.closeSync(fd);
    }
  }

  const client = createS3Client(params);
  if (!client) throw new Error('Failed to create S3 client');

  const commandInput: any = {
    Bucket: params.bucket,
    Key: key
  };

  if (maxBytes) {
    // HTTP byte ranges are inclusive, hence the -1.
    commandInput.Range = `bytes=0-${maxBytes - 1}`;
  }

  const command = new GetObjectCommand(commandInput);
  const response = await client.send(command);

  if (response.Body) {
    // The body is consumed as an async iterable of byte chunks.
    const chunks: Uint8Array[] = [];
    for await (const chunk of response.Body as any) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  }
  return Buffer.alloc(0);
}
|