@byted-las/contextlake-openclaw 1.0.4 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@byted-las/contextlake-openclaw",
3
- "version": "1.0.4",
3
+ "version": "1.0.5",
4
4
  "description": "ContextLake OpenClaw Plugin for managing knowledge base",
5
5
  "main": "index.ts",
6
6
  "files": [
@@ -34,6 +34,7 @@
34
34
  },
35
35
  "dependencies": {
36
36
  "@aws-sdk/client-s3": "^3.1014.0",
37
+ "@aws-sdk/s3-request-presigner": "^3.1014.0",
37
38
  "@lancedb/lancedb": "^0.26.2",
38
39
  "@volcengine/tos-sdk": "^2.9.0",
39
40
  "commander": "^14.0.3",
@@ -37,29 +37,23 @@ function parseMetadata(metadata: any): Record<string, any> {
37
37
 
38
38
  export function getCliCommands(pluginConfig: ContextLakeConfig, logger: any) {
39
39
  return {
40
- connectAction: async (datasource_name: string, options: any) => {
41
- logger.info(`[${new Date().toISOString()}] [ContextLake] CLI connect started`, { datasource_name, options });
40
+ connectAction: async (datasource_name: string, url: string, options: any) => {
41
+ logger.info(`[${new Date().toISOString()}] [ContextLake] CLI connect started`, { datasource_name, url, options });
42
42
  try {
43
43
  const params: ConnectParams = {
44
44
  datasource_name,
45
- vendor: options.vendor,
45
+ url,
46
46
  endpoint: options.endpoint,
47
47
  access_key: options.ak,
48
48
  secret_key: options.sk,
49
49
  region: options.region,
50
- bucket: options.bucket,
51
- prefix: options.prefix,
52
50
  sample_rows: parseInt(options.sampleRows),
53
51
  };
54
52
 
55
53
  // eslint-disable-next-line no-console
56
54
  console.log(`[contextlake connect] Connecting to datasource "${datasource_name}"...`);
57
55
  // eslint-disable-next-line no-console
58
- console.log(` vendor: ${params.vendor}`);
59
- // eslint-disable-next-line no-console
60
- console.log(` bucket: ${params.bucket}`);
61
- // eslint-disable-next-line no-console
62
- console.log(` prefix: ${params.prefix}`);
56
+ console.log(` url: ${params.url}`);
63
57
 
64
58
  const result = await connectDataSource(params);
65
59
  // eslint-disable-next-line no-console
@@ -35,6 +35,12 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
35
35
 
36
36
  ctx.registerTool(tools.writeLanceCatalogTool );
37
37
  logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.writeLanceCatalogTool.name}`);
38
+
39
+ ctx.registerTool(tools.readLanceCatalogTool );
40
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.readLanceCatalogTool.name}`);
41
+
42
+ ctx.registerTool(tools.generatePresignedUrlTool );
43
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.generatePresignedUrlTool.name}`);
38
44
 
39
45
  ctx.registerTool(tools.listDatasourceTool );
40
46
  logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listDatasourceTool.name}`);
@@ -60,15 +66,12 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
60
66
  const commands = getCliCommands(pluginConfig, logger);
61
67
 
62
68
  // connect -- data source profiling (las-data-profiler)
63
- contextlake.command('connect <datasource_name>')
69
+ contextlake.command('connect <datasource_name> <url>')
64
70
  .description('Connect to a data source and profile its structure, schemas, and media metadata into LanceDB')
65
- .requiredOption('--vendor <vendor>', 'Data source type: volcengine | alibaba | tencent | aws | local')
66
71
  .option('--endpoint <url>', 'S3 Endpoint URL (not needed for local)')
67
72
  .option('--ak <credential_id>', 'Credential ID for the data source')
68
73
  .option('--sk <credential_value>', 'Credential value for the data source')
69
74
  .option('--region <region>', 'Region identifier (e.g. cn-beijing)')
70
- .requiredOption('--bucket <bucket>', 'Bucket name (or local root directory for local vendor)')
71
- .requiredOption('--prefix <prefix>', 'Path prefix to limit scan scope')
72
75
  .option('--sample-rows <number>', 'Number of rows to sample per structured file', '100')
73
76
  .action(commands.connectAction);
74
77
 
@@ -108,21 +108,15 @@ export function getSlashCommands(pluginConfig: ContextLakeConfig, logger: any) {
108
108
 
109
109
  logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler started`, { args });
110
110
  try {
111
- if (args.length < 4) {
112
- return { text: `**Error:** Missing arguments. Usage: /contextlake-profiler <datasource_name> <vendor> <bucket> <prefix> [endpoint] [ak] [sk] [region]` };
111
+ if (args.length < 2) {
112
+ return { text: `**Error:** Missing arguments. Usage: /contextlake-profiler <datasource_name> <url> [endpoint] [ak] [sk] [region]` };
113
113
  }
114
114
 
115
- const [datasource_name, vendor, bucket, prefix, endpoint, access_key, secret_key, region] = args;
116
-
117
- if (!['volcengine', 'alibaba', 'tencent', 'aws', 'local'].includes(vendor)) {
118
- return { text: `**Error:** Invalid vendor. Must be one of: volcengine, alibaba, tencent, aws, local` };
119
- }
115
+ const [datasource_name, url, endpoint, access_key, secret_key, region] = args;
120
116
 
121
117
  const params: ConnectParams = {
122
118
  datasource_name,
123
- vendor: vendor as ConnectParams['vendor'],
124
- bucket,
125
- prefix,
119
+ url,
126
120
  endpoint,
127
121
  access_key,
128
122
  secret_key,
@@ -3,8 +3,8 @@ import { retrieveAssets } from '../lib/actions/retrieve';
3
3
  import { listAssets, deleteAssets } from '../lib/actions/manage';
4
4
  import { connectDataSource, listDataSources } from '../lib/actions/profiler';
5
5
  import { getLasTools } from '../lib/actions/las-tools';
6
- import { listS3Objects, readS3Object } from '../lib/actions/s3-tools';
7
- import { writeLanceCatalog } from '../lib/actions/lance-tools';
6
+ import { listS3Objects, readS3Object, getPresignedUrl } from '../lib/actions/s3-tools';
7
+ import { writeLanceCatalog, readLanceCatalog } from '../lib/actions/lance-tools';
8
8
  import { ContextLakeConfig } from '../utils/config';
9
9
  // @ts-ignore
10
10
  import type { AnyAgentTool } from 'openclaw/plugin-sdk';
@@ -18,7 +18,9 @@ export function getAgentTools(pluginConfig: ContextLakeConfig, logger: any): {
18
18
  listDatasourceTool: AnyAgentTool;
19
19
  listS3ObjectsTool: AnyAgentTool;
20
20
  readS3ObjectTool: AnyAgentTool;
21
+ generatePresignedUrlTool: AnyAgentTool;
21
22
  writeLanceCatalogTool: AnyAgentTool;
23
+ readLanceCatalogTool: AnyAgentTool;
22
24
  lasTools: AnyAgentTool[];
23
25
  } {
24
26
  const lasTools = getLasTools(pluginConfig, logger);
@@ -274,16 +276,10 @@ Example User Queries:
274
276
  type: 'object',
275
277
  properties: {
276
278
  datasource_name: { type: 'string', description: 'Name of the data source' },
277
- vendor: { type: 'string', enum: ['volcengine', 'alibaba', 'tencent', 'aws', 'local'], description: 'Data source type' },
278
- endpoint: { type: 'string', description: 'S3 Endpoint URL (not needed for local)' },
279
- access_key: { type: 'string', description: 'Credential ID for the data source' },
280
- secret_key: { type: 'string', description: 'Credential value for the data source' },
281
- region: { type: 'string', description: 'Region identifier (e.g. cn-beijing)' },
282
- bucket: { type: 'string', description: 'Bucket name (or local root directory for local vendor)' },
283
- prefix: { type: 'string', description: 'Path prefix to limit scan scope' },
279
+ url: { type: 'string', description: 'Data source URL (e.g. tos://bucket/prefix, oss://..., s3://..., file:///path)' },
284
280
  sample_rows: { type: 'integer', description: 'Number of rows to sample per structured file' }
285
281
  },
286
- required: ['datasource_name', 'vendor', 'bucket', 'prefix'],
282
+ required: ['datasource_name', 'url'],
287
283
  additionalProperties: false
288
284
  },
289
285
 
@@ -317,17 +313,15 @@ Example User Queries:
317
313
  parameters: {
318
314
  type: 'object',
319
315
  properties: {
320
- vendor: { type: 'string', enum: ['volcengine', 'alibaba', 'tencent', 'aws', 'local'] },
321
- bucket: { type: 'string' },
316
+ url: { type: 'string', description: 'Data source URL (e.g. tos://bucket/prefix, oss://..., file:///path)' },
317
+ vendor: { type: 'string', enum: ['volcengine', 'alibaba', 'tencent', 'aws', 'local'], description: 'Required if url is not provided' },
318
+ bucket: { type: 'string', description: 'Required if url is not provided' },
322
319
  prefix: { type: 'string' },
323
320
  endpoint: { type: 'string' },
324
- access_key: { type: 'string' },
325
- secret_key: { type: 'string' },
326
- region: { type: 'string' },
327
321
  maxKeys: { type: 'integer' },
328
322
  continuationToken: { type: 'string' }
329
323
  },
330
- required: ['vendor', 'bucket'],
324
+ required: [],
331
325
  additionalProperties: false
332
326
  },
333
327
  async execute(toolCallId: string, params: any) {
@@ -347,22 +341,38 @@ Example User Queries:
347
341
  parameters: {
348
342
  type: 'object',
349
343
  properties: {
350
- vendor: { type: 'string', enum: ['volcengine', 'alibaba', 'tencent', 'aws', 'local'] },
351
- bucket: { type: 'string' },
352
- key: { type: 'string' },
344
+ url: { type: 'string', description: 'Full URL to the object (e.g. tos://bucket/path/to/key.txt)' },
345
+ vendor: { type: 'string', enum: ['volcengine', 'alibaba', 'tencent', 'aws', 'local'], description: 'Required if url is not provided' },
346
+ bucket: { type: 'string', description: 'Required if url is not provided' },
347
+ key: { type: 'string', description: 'Required if url is not provided' },
353
348
  endpoint: { type: 'string' },
354
- access_key: { type: 'string' },
355
- secret_key: { type: 'string' },
356
- region: { type: 'string' },
357
349
  maxBytes: { type: 'integer' }
358
350
  },
359
- required: ['vendor', 'bucket', 'key'],
351
+ required: [],
360
352
  additionalProperties: false
361
353
  },
362
354
  async execute(toolCallId: string, params: any) {
363
355
  let actualParams = params.params || params;
364
356
  try {
365
- const buf = await readS3Object(actualParams, actualParams.key, actualParams.maxBytes);
357
+ // Extract key from url if provided
358
+ let key = actualParams.key;
359
+ if (actualParams.url && !key) {
360
+ try {
361
+ if (actualParams.url.startsWith('file://')) {
362
+ // Key is not strictly needed for file://, bucket contains the path in parseS3Url
363
+ key = '';
364
+ } else {
365
+ const parsedUrl = new URL(actualParams.url);
366
+ key = parsedUrl.pathname.replace(/^\//, '');
367
+ }
368
+ } catch (e) {
369
+ // let it fail in readS3Object
370
+ }
371
+ }
372
+ if (!key && !actualParams.url?.startsWith('file://')) {
373
+ throw new Error('key is required or must be part of the url');
374
+ }
375
+ const buf = await readS3Object(actualParams, key, actualParams.maxBytes);
366
376
  // Return as base64 string
367
377
  return { content: [{ type: "text", text: buf.toString('base64') }], details: { length: buf.length } } as any;
368
378
  } catch (e: any) {
@@ -370,6 +380,49 @@ Example User Queries:
370
380
  }
371
381
  }
372
382
  },
383
// Agent tool: resolves an object reference (a full `url`, or explicit
// vendor/bucket/key) and returns a presigned URL produced by getPresignedUrl.
generatePresignedUrlTool: {
  name: 'generate-presigned-url',
  label: 'Generate Presigned URL',
  description: 'Generate a presigned HTTP URL for an S3/TOS object, allowing temporary public access',
  parameters: {
    type: 'object',
    properties: {
      url: { type: 'string', description: 'Full URL to the object (e.g. tos://bucket/path/to/key.txt)' },
      vendor: { type: 'string', enum: ['volcengine', 'alibaba', 'tencent', 'aws', 'local'], description: 'Required if url is not provided' },
      bucket: { type: 'string', description: 'Required if url is not provided' },
      key: { type: 'string', description: 'Required if url is not provided' },
      endpoint: { type: 'string' },
      expiresIn: { type: 'integer', description: 'Expiration time in seconds (default 3600)' }
    },
    required: [],
    additionalProperties: false
  },
  /**
   * Works out the object key and asks getPresignedUrl for a signed URL.
   *
   * Never rejects: any failure is returned as a text result whose
   * `details.error` carries the message.
   */
  async execute(toolCallId: string, params: any) {
    try {
      // Arguments may arrive wrapped in a `params` envelope; accept both shapes.
      // Done inside the try so a missing argument object is reported, not thrown.
      const actualParams = params?.params || params;
      const isFileUrl = typeof actualParams.url === 'string' && actualParams.url.startsWith('file://');

      let key = actualParams.key;
      if (actualParams.url && !key) {
        if (isFileUrl) {
          // For file:// the whole path is carried in `bucket` by parseS3Url, so no key is needed.
          key = '';
        } else {
          let pathname = '';
          try {
            pathname = new URL(actualParams.url).pathname;
          } catch {
            // Unparseable url: keep the key empty so the check below reports it.
          }
          const rawKey = pathname.replace(/^\//, '');
          try {
            // URL.pathname is percent-encoded; object keys are not.
            key = decodeURIComponent(rawKey);
          } catch {
            // A stray '%' is not valid percent-encoding; use the path as written.
            key = rawKey;
          }
        }
      }
      if (!key && !isFileUrl) {
        throw new Error('key is required or must be part of the url');
      }

      const url = await getPresignedUrl(actualParams, key, actualParams.expiresIn);
      return { content: [{ type: "text", text: url }], details: { url } } as any;
    } catch (e: any) {
      // Thrown values are not guaranteed to be Error instances.
      const message = typeof e?.message === 'string' ? e.message : String(e);
      return { content: [{ type: "text", text: message }], details: { error: message } } as any;
    }
  }
},
373
426
  writeLanceCatalogTool: {
374
427
  name: 'write-lance-catalog',
375
428
  label: 'Write LanceDB Catalog',
@@ -393,6 +446,31 @@ Example User Queries:
393
446
  return { content: [{ type: "text", text: String(e.message) }], details: { error: e.message } } as any;
394
447
  }
395
448
  }
449
+ },
450
// Agent tool: returns rows of a local LanceDB table (via readLanceCatalog)
// as a JSON string plus the raw rows in `details.data`.
readLanceCatalogTool: {
  name: 'read-lance-catalog',
  label: 'Read LanceDB Catalog',
  description: 'Read records from a local LanceDB table for validation or ingestion processes',
  parameters: {
    type: 'object',
    properties: {
      db_path: { type: 'string', description: 'Path to the local LanceDB database' },
      table_name: { type: 'string', description: 'Name of the table to read' },
      limit: { type: 'integer', description: 'Maximum number of records to return' },
      filter: { type: 'string', description: 'SQL-like filter string (e.g., "category = \'structured\'")' }
    },
    required: ['db_path', 'table_name'],
    additionalProperties: false
  },
  /**
   * Reads the requested rows and serialises them for the agent.
   *
   * Never rejects: any failure is returned as a text result whose
   * `details.error` carries the message.
   */
  async execute(toolCallId: string, params: any) {
    try {
      // Arguments may arrive wrapped in a `params` envelope; accept both shapes.
      const actualParams = params?.params || params;
      const results = await readLanceCatalog(actualParams);

      // JSON.stringify throws on bigint values, so render them as decimal strings.
      // NOTE(review): presumably 64-bit integer columns surface as bigint — confirm against the catalog schema.
      const text = JSON.stringify(results, (_key, value) =>
        typeof value === 'bigint' ? value.toString() : value
      );
      return { content: [{ type: "text", text }], details: { count: results.length, data: results } } as any;
    } catch (e: any) {
      // Thrown values are not guaranteed to be Error instances.
      const message = typeof e?.message === 'string' ? e.message : String(e);
      return { content: [{ type: "text", text: message }], details: { error: message } } as any;
    }
  }
}
397
475
  };
398
476
  }
@@ -1,4 +1,5 @@
1
1
  import * as lancedb from '@lancedb/lancedb';
2
+ import * as fs from 'fs';
2
3
 
3
4
  export interface LanceWriteParams {
4
5
  db_path: string;
@@ -6,6 +7,13 @@ export interface LanceWriteParams {
6
7
  records: any[];
7
8
  }
8
9
 
10
/** Arguments accepted by readLanceCatalog. */
export interface LanceReadParams {
  /** Filesystem path of the LanceDB database to open. */
  db_path: string;
  /** Table to read from. */
  table_name: string;
  /** Upper bound on returned rows; only applied when greater than zero. */
  limit?: number;
  /** Filter expression handed to the query's `where` clause when non-empty. */
  filter?: string;
}
16
+
9
17
  export async function writeLanceCatalog(params: LanceWriteParams) {
10
18
  if (!params.records || params.records.length === 0) {
11
19
  return;
@@ -21,3 +29,30 @@ export async function writeLanceCatalog(params: LanceWriteParams) {
21
29
  await db.createTable(params.table_name, params.records);
22
30
  }
23
31
  }
32
+
33
/**
 * Reads rows from an existing table of a local LanceDB database.
 *
 * @param params.db_path    Path of the database; must already exist on disk.
 * @param params.table_name Table to read; must exist in the database.
 * @param params.limit      Optional row cap. Values below 1 or non-numeric values
 *                          are ignored; fractional values are rounded down.
 * @param params.filter     Optional filter expression; blank strings are ignored.
 * @returns The matching rows as returned by the query's `toArray()`.
 * @throws Error when a required argument is missing, the database path does
 *         not exist, or the table is not present.
 */
export async function readLanceCatalog(params: LanceReadParams) {
  if (!params || !params.db_path) {
    throw new Error('db_path is required');
  }
  if (!params.table_name) {
    throw new Error('table_name is required');
  }
  if (!fs.existsSync(params.db_path)) {
    throw new Error(`Database not found at ${params.db_path}`);
  }

  const db = await lancedb.connect(params.db_path);
  try {
    const tableNames = await db.tableNames();
    if (!tableNames.includes(params.table_name)) {
      throw new Error(`Table '${params.table_name}' not found in database`);
    }

    const table = await db.openTable(params.table_name);
    try {
      let query = table.query();

      const filter = typeof params.filter === 'string' ? params.filter.trim() : '';
      if (filter) {
        query = query.where(filter);
      }

      // Tool callers may send the limit as a string or a fractional number.
      const limit = Number(params.limit);
      if (Number.isFinite(limit) && limit >= 1) {
        query = query.limit(Math.floor(limit));
      }

      return await query.toArray();
    } finally {
      // Release the table handle even when the query fails.
      table.close();
    }
  } finally {
    // Release the connection on every exit path.
    db.close();
  }
}
@@ -1,11 +1,36 @@
1
1
  import { LasApiClient } from './las-api';
2
2
  import { ContextLakeConfig } from '../../utils/config';
3
+ import { getPresignedUrl } from './s3-tools';
3
4
  // @ts-ignore
4
5
  import type { AnyAgentTool } from 'openclaw/plugin-sdk';
5
6
 
6
7
  export function getLasTools(pluginConfig: ContextLakeConfig, logger: any): AnyAgentTool[] {
7
8
  const apiClient = new LasApiClient(pluginConfig, logger);
8
9
 
10
/**
 * Prepares a media URL before it is sent to a LAS operator.
 *
 * - Empty values, `tos://`, `file://` and any other scheme are returned unchanged.
 * - `oss://`, `s3://` and `cos://` URLs are exchanged for a presigned URL valid
 *   for one hour; if presigning fails the original URL is returned.
 */
const processUrl = async (url: string): Promise<string> => {
  if (!url) return url;

  // Only these schemes are presigned. tos:// is passed natively, and file://
  // cannot be presigned into anything a remote service could fetch.
  const needsPresign = ['oss://', 's3://', 'cos://'].some((scheme) => url.startsWith(scheme));
  if (!needsPresign) return url;

  logger.info(`[LasTools] Presigning URL for vendor: ${url}`);
  try {
    const rawKey = new URL(url).pathname.replace(/^\//, '');
    let key = rawKey;
    try {
      // URL.pathname is percent-encoded; object keys are not.
      key = decodeURIComponent(rawKey);
    } catch {
      // A stray '%' is not valid percent-encoding; use the path as written.
    }
    if (!key) {
      throw new Error('url does not contain an object key');
    }
    // NOTE(review): only `url` is passed here, so endpoint and credentials must
    // come from whatever the S3 client falls back to — confirm this is intended.
    return await getPresignedUrl({ url }, key, 3600);
  } catch (e: any) {
    const message = typeof e?.message === 'string' ? e.message : String(e);
    logger.warn(`[LasTools] Failed to presign URL: ${url}`, { error: message });
    return url; // fallback to original
  }
};
33
+
9
34
  const callApi = async (method: string, args: any[]) => {
10
35
  try {
11
36
  // @ts-ignore
@@ -35,6 +60,9 @@ Parameters in data:
35
60
  required: ['data']
36
61
  },
37
62
  async execute(toolCallId: string, params: any) {
63
+ if (params.data?.image) {
64
+ params.data.image = await processUrl(params.data.image);
65
+ }
38
66
  return await callApi('process', ['las_image_resample', params.data]);
39
67
  }
40
68
  },
@@ -55,6 +83,9 @@ Parameters in data:
55
83
  required: ['data']
56
84
  },
57
85
  async execute(toolCallId: string, params: any) {
86
+ if (params.data?.input_path) {
87
+ params.data.input_path = await processUrl(params.data.input_path);
88
+ }
58
89
  return await callApi('process', ['las_audio_extract_and_split', params.data]);
59
90
  }
60
91
  },
@@ -73,6 +104,9 @@ Parameters in data:
73
104
  required: ['data']
74
105
  },
75
106
  async execute(toolCallId: string, params: any) {
107
+ if (params.data?.input_path) {
108
+ params.data.input_path = await processUrl(params.data.input_path);
109
+ }
76
110
  return await callApi('process', ['las_audio_convert', params.data]);
77
111
  }
78
112
  },
@@ -91,6 +125,9 @@ Parameters in data:
91
125
  required: ['data']
92
126
  },
93
127
  async execute(toolCallId: string, params: any) {
128
+ if (params.data?.audio?.url) {
129
+ params.data.audio.url = await processUrl(params.data.audio.url);
130
+ }
94
131
  return await callApi('submitAndPoll', ['las_asr_pro', params.data]);
95
132
  }
96
133
  },
@@ -107,6 +144,9 @@ Parameters in data:
107
144
  required: ['data']
108
145
  },
109
146
  async execute(toolCallId: string, params: any) {
147
+ if (params.data?.audio?.url) {
148
+ params.data.audio.url = await processUrl(params.data.audio.url);
149
+ }
110
150
  return await callApi('submitAndPoll', ['las_seed_2_0', params.data]);
111
151
  }
112
152
  },
@@ -130,6 +170,13 @@ Parameters:
130
170
  required: ['model', 'input']
131
171
  },
132
172
  async execute(toolCallId: string, params: any) {
173
+ if (params.input && Array.isArray(params.input)) {
174
+ for (const item of params.input) {
175
+ if (item.type === 'image_url' && item.image_url?.url) {
176
+ item.image_url.url = await processUrl(item.image_url.url);
177
+ }
178
+ }
179
+ }
133
180
  return await callApi('multimodalEmbedding', [
134
181
  params.model,
135
182
  params.input,
@@ -153,6 +200,9 @@ Parameters in data:
153
200
  required: ['data']
154
201
  },
155
202
  async execute(toolCallId: string, params: any) {
203
+ if (params.data?.video_url) {
204
+ params.data.video_url = await processUrl(params.data.video_url);
205
+ }
156
206
  return await callApi('submitAndPoll', ['las_long_video_understand', params.data]);
157
207
  }
158
208
  },
@@ -171,6 +221,9 @@ Parameters in data:
171
221
  required: ['data']
172
222
  },
173
223
  async execute(toolCallId: string, params: any) {
224
+ if (params.data?.url) {
225
+ params.data.url = await processUrl(params.data.url);
226
+ }
174
227
  return await callApi('submitAndPoll', ['las_pdf_parse_doubao', params.data]);
175
228
  }
176
229
  },
@@ -189,6 +242,9 @@ Parameters in data:
189
242
  required: ['data']
190
243
  },
191
244
  async execute(toolCallId: string, params: any) {
245
+ if (params.data?.video_url) {
246
+ params.data.video_url = await processUrl(params.data.video_url);
247
+ }
192
248
  return await callApi('submitAndPoll', ['las_video_resize', params.data]);
193
249
  }
194
250
  }
@@ -7,13 +7,14 @@ import * as mime from 'mime-types';
7
7
 
8
8
  export interface ConnectParams {
9
9
  datasource_name: string;
10
- vendor: 'volcengine' | 'alibaba' | 'tencent' | 'aws' | 'local';
10
+ url: string;
11
+ vendor?: 'volcengine' | 'alibaba' | 'tencent' | 'aws' | 'local';
11
12
  endpoint?: string;
12
13
  access_key?: string;
13
14
  secret_key?: string;
14
15
  region?: string;
15
- bucket: string;
16
- prefix: string;
16
+ bucket?: string;
17
+ prefix?: string;
17
18
  sample_rows?: number;
18
19
  }
19
20
 
@@ -90,9 +91,36 @@ export async function connectDataSource(
90
91
  _ctx?: any
91
92
  ): Promise<ConnectResult> {
92
93
  if (!params.datasource_name) throw new Error('datasource_name is required');
93
- if (!params.vendor) throw new Error('vendor is required');
94
- if (!params.bucket) throw new Error('bucket is required');
95
- if (params.prefix === undefined || params.prefix === null) throw new Error('prefix is required');
94
+ if (!params.url) throw new Error('url is required (e.g. tos://bucket/prefix)');
95
+
96
+ // Parse URL: tos://bucket/prefix
97
+ try {
98
+ if (params.url.startsWith('file://') || params.url.startsWith('/')) {
99
+ params.vendor = 'local';
100
+ const localPath = params.url.startsWith('file://') ? params.url.slice(7) : params.url;
101
+ params.bucket = localPath;
102
+ params.prefix = '.';
103
+ } else {
104
+ const parsedUrl = new URL(params.url);
105
+ const protocol = parsedUrl.protocol.replace(':', '');
106
+
107
+ if (['tos', 'oss', 'cos', 's3'].includes(protocol)) {
108
+ if (protocol === 'tos') params.vendor = 'volcengine';
109
+ else if (protocol === 'oss') params.vendor = 'alibaba';
110
+ else if (protocol === 'cos') params.vendor = 'tencent';
111
+ else if (protocol === 's3') params.vendor = 'aws';
112
+
113
+ params.bucket = parsedUrl.hostname;
114
+ params.prefix = parsedUrl.pathname.replace(/^\//, ''); // Remove leading slash
115
+ } else {
116
+ throw new Error(`Unsupported protocol: ${protocol}`);
117
+ }
118
+ }
119
+ } catch (e: any) {
120
+ if (!params.vendor || !params.bucket || params.prefix === undefined) {
121
+ throw new Error(`Invalid url format: ${e.message}`);
122
+ }
123
+ }
96
124
 
97
125
  if (params.vendor !== 'local') {
98
126
  if (!params.endpoint && params.vendor !== 'aws') throw new Error(`endpoint is required for vendor "${params.vendor}"`);
@@ -134,7 +162,14 @@ export async function connectDataSource(
134
162
  const scan_ts = new Date().toISOString() + 'Z';
135
163
 
136
164
  while (isTruncated) {
137
- const response = await listS3Objects(params, params.prefix, 1000, continuationToken);
165
+ const response = await listS3Objects({
166
+ vendor: params.vendor as any,
167
+ bucket: params.bucket as string,
168
+ endpoint: params.endpoint,
169
+ access_key: params.access_key,
170
+ secret_key: params.secret_key,
171
+ region: params.region
172
+ }, params.prefix || '', 1000, continuationToken);
138
173
 
139
174
  for (const obj of response.Contents) {
140
175
  const key = obj.Key || '';
@@ -1,17 +1,51 @@
1
1
  import { S3Client, ListObjectsV2Command, GetObjectCommand } from '@aws-sdk/client-s3';
2
+ import { getSignedUrl } from '@aws-sdk/s3-request-presigner';
2
3
  import * as fs from 'fs';
3
4
  import * as path from 'path';
4
5
 
5
6
  export interface S3Params {
6
- vendor: 'volcengine' | 'alibaba' | 'tencent' | 'aws' | 'local';
7
+ url?: string;
8
+ vendor?: 'volcengine' | 'alibaba' | 'tencent' | 'aws' | 'local';
7
9
  endpoint?: string;
8
10
  access_key?: string;
9
11
  secret_key?: string;
10
12
  region?: string;
11
- bucket: string;
13
+ bucket?: string;
14
+ }
15
+
16
+ export function parseS3Url(params: S3Params): S3Params {
17
+ if (params.url) {
18
+ if (params.url.startsWith('file://') || params.url.startsWith('/')) {
19
+ params.vendor = 'local';
20
+ const localPath = params.url.startsWith('file://') ? params.url.slice(7) : params.url;
21
+ params.bucket = localPath;
22
+ } else {
23
+ const parsedUrl = new URL(params.url);
24
+ const protocol = parsedUrl.protocol.replace(':', '');
25
+
26
+ if (['tos', 'oss', 'cos', 's3'].includes(protocol)) {
27
+ if (protocol === 'tos') params.vendor = 'volcengine';
28
+ else if (protocol === 'oss') params.vendor = 'alibaba';
29
+ else if (protocol === 'cos') params.vendor = 'tencent';
30
+ else if (protocol === 's3') params.vendor = 'aws';
31
+
32
+ params.bucket = parsedUrl.hostname;
33
+ // Prefix is usually parsed separately or passed explicitly for listing
34
+ } else {
35
+ throw new Error(`Unsupported protocol: ${protocol}`);
36
+ }
37
+ }
38
+ }
39
+
40
+ if (!params.vendor || !params.bucket) {
41
+ throw new Error('Could not determine vendor or bucket. Please provide a valid url or vendor/bucket directly.');
42
+ }
43
+
44
+ return params;
12
45
  }
13
46
 
14
47
  function createS3Client(params: S3Params): S3Client | null {
48
+ params = parseS3Url(params);
15
49
  if (params.vendor === 'local') return null;
16
50
 
17
51
  let endpoint = params.endpoint;
@@ -56,8 +90,9 @@ function createS3Client(params: S3Params): S3Client | null {
56
90
  }
57
91
 
58
92
  export async function listS3Objects(params: S3Params, prefix: string, maxKeys: number = 1000, continuationToken?: string) {
93
+ params = parseS3Url(params);
59
94
  if (params.vendor === 'local') {
60
- const root = params.bucket;
95
+ const root = params.bucket as string;
61
96
  const prefixPath = prefix && prefix !== '.' ? path.join(root, prefix) : root;
62
97
  const files: any[] = [];
63
98
 
@@ -108,8 +143,9 @@ export async function listS3Objects(params: S3Params, prefix: string, maxKeys: n
108
143
  }
109
144
 
110
145
  export async function readS3Object(params: S3Params, key: string, maxBytes?: number): Promise<Buffer> {
146
+ params = parseS3Url(params);
111
147
  if (params.vendor === 'local') {
112
- const fullPath = path.join(params.bucket, key);
148
+ const fullPath = path.join(params.bucket as string, key);
113
149
  if (maxBytes) {
114
150
  const fd = fs.openSync(fullPath, 'r');
115
151
  const buffer = Buffer.alloc(maxBytes);
@@ -146,3 +182,22 @@ export async function readS3Object(params: S3Params, key: string, maxBytes?: num
146
182
  }
147
183
  return Buffer.alloc(0);
148
184
  }
185
+
186
/**
 * Produces a URL through which `key` can be fetched.
 *
 * For the 'local' vendor this is simply `file://` plus the joined path. For
 * every other vendor a GetObject request is presigned with the S3 client.
 *
 * @param params    Location/credentials; resolved through parseS3Url.
 * @param key       Object key (path relative to the root directory for 'local').
 * @param expiresIn Lifetime in seconds; defaults to 3600 when omitted or null.
 *                  Fractional values are rounded down.
 * @throws Error when a remote key is empty, when `expiresIn` is not between
 *         1 second and 7 days, or when no S3 client could be created.
 */
export async function getPresignedUrl(params: S3Params, key: string, expiresIn: number = 3600): Promise<string> {
  const resolved = parseS3Url(params);
  if (resolved.vendor === 'local') {
    const fullPath = path.join(resolved.bucket as string, key ?? '');
    return `file://${fullPath}`;
  }

  if (!key) {
    throw new Error('key is required to presign a remote object');
  }

  // Signature V4 presigned URLs may live for at most one week.
  const maxTtlSeconds = 7 * 24 * 60 * 60;
  const ttl = expiresIn == null ? 3600 : Math.floor(Number(expiresIn));
  if (!Number.isFinite(ttl) || ttl < 1 || ttl > maxTtlSeconds) {
    throw new Error(`expiresIn must be between 1 and ${maxTtlSeconds} seconds`);
  }

  const client = createS3Client(resolved);
  if (!client) throw new Error('Failed to create S3 client');

  const command = new GetObjectCommand({
    Bucket: resolved.bucket,
    Key: key
  });

  return await getSignedUrl(client, command, { expiresIn: ttl });
}