@byted-las/contextlake-openclaw 1.0.3 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,8 +2,35 @@
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.getLasTools = getLasTools;
4
4
  const las_api_1 = require("./las-api");
5
+ const s3_tools_1 = require("./s3-tools");
5
6
  function getLasTools(pluginConfig, logger) {
6
7
  const apiClient = new las_api_1.LasApiClient(pluginConfig, logger);
8
+ const processUrl = async (url) => {
9
+ if (!url)
10
+ return url;
11
+ if (url.startsWith('tos://')) {
12
+ // LAS operators prefer native tos:// paths when supported, leave as is
13
+ return url;
14
+ }
15
+ else if (url.startsWith('oss://') || url.startsWith('s3://') || url.startsWith('cos://') || url.startsWith('file://')) {
16
+ // Need presigned HTTP url for other vendors
17
+ logger.info(`[LasTools] Presigning URL for vendor: ${url}`);
18
+ try {
19
+ // If it's a file:// we also presign it to file:// which might not be supported by remote LAS,
20
+ // but local files typically need to be uploaded to TOS first. We'll leave file:// to fail or be handled elsewhere.
21
+ if (url.startsWith('file://'))
22
+ return url;
23
+ const urlParts = new URL(url);
24
+ const key = urlParts.pathname.replace(/^\//, '');
25
+ return await (0, s3_tools_1.getPresignedUrl)({ url }, key, 3600);
26
+ }
27
+ catch (e) {
28
+ logger.warn(`[LasTools] Failed to presign URL: ${url}`, { error: e.message });
29
+ return url; // fallback to original
30
+ }
31
+ }
32
+ return url;
33
+ };
7
34
  const callApi = async (method, args) => {
8
35
  try {
9
36
  // @ts-ignore
@@ -33,6 +60,9 @@ Parameters in data:
33
60
  required: ['data']
34
61
  },
35
62
  async execute(toolCallId, params) {
63
+ if (params.data?.image) {
64
+ params.data.image = await processUrl(params.data.image);
65
+ }
36
66
  return await callApi('process', ['las_image_resample', params.data]);
37
67
  }
38
68
  },
@@ -53,6 +83,9 @@ Parameters in data:
53
83
  required: ['data']
54
84
  },
55
85
  async execute(toolCallId, params) {
86
+ if (params.data?.input_path) {
87
+ params.data.input_path = await processUrl(params.data.input_path);
88
+ }
56
89
  return await callApi('process', ['las_audio_extract_and_split', params.data]);
57
90
  }
58
91
  },
@@ -71,6 +104,9 @@ Parameters in data:
71
104
  required: ['data']
72
105
  },
73
106
  async execute(toolCallId, params) {
107
+ if (params.data?.input_path) {
108
+ params.data.input_path = await processUrl(params.data.input_path);
109
+ }
74
110
  return await callApi('process', ['las_audio_convert', params.data]);
75
111
  }
76
112
  },
@@ -89,6 +125,9 @@ Parameters in data:
89
125
  required: ['data']
90
126
  },
91
127
  async execute(toolCallId, params) {
128
+ if (params.data?.audio?.url) {
129
+ params.data.audio.url = await processUrl(params.data.audio.url);
130
+ }
92
131
  return await callApi('submitAndPoll', ['las_asr_pro', params.data]);
93
132
  }
94
133
  },
@@ -105,6 +144,9 @@ Parameters in data:
105
144
  required: ['data']
106
145
  },
107
146
  async execute(toolCallId, params) {
147
+ if (params.data?.audio?.url) {
148
+ params.data.audio.url = await processUrl(params.data.audio.url);
149
+ }
108
150
  return await callApi('submitAndPoll', ['las_seed_2_0', params.data]);
109
151
  }
110
152
  },
@@ -128,6 +170,13 @@ Parameters:
128
170
  required: ['model', 'input']
129
171
  },
130
172
  async execute(toolCallId, params) {
173
+ if (params.input && Array.isArray(params.input)) {
174
+ for (const item of params.input) {
175
+ if (item.type === 'image_url' && item.image_url?.url) {
176
+ item.image_url.url = await processUrl(item.image_url.url);
177
+ }
178
+ }
179
+ }
131
180
  return await callApi('multimodalEmbedding', [
132
181
  params.model,
133
182
  params.input,
@@ -151,6 +200,9 @@ Parameters in data:
151
200
  required: ['data']
152
201
  },
153
202
  async execute(toolCallId, params) {
203
+ if (params.data?.video_url) {
204
+ params.data.video_url = await processUrl(params.data.video_url);
205
+ }
154
206
  return await callApi('submitAndPoll', ['las_long_video_understand', params.data]);
155
207
  }
156
208
  },
@@ -169,6 +221,9 @@ Parameters in data:
169
221
  required: ['data']
170
222
  },
171
223
  async execute(toolCallId, params) {
224
+ if (params.data?.url) {
225
+ params.data.url = await processUrl(params.data.url);
226
+ }
172
227
  return await callApi('submitAndPoll', ['las_pdf_parse_doubao', params.data]);
173
228
  }
174
229
  },
@@ -187,6 +242,9 @@ Parameters in data:
187
242
  required: ['data']
188
243
  },
189
244
  async execute(toolCallId, params) {
245
+ if (params.data?.video_url) {
246
+ params.data.video_url = await processUrl(params.data.video_url);
247
+ }
190
248
  return await callApi('submitAndPoll', ['las_video_resize', params.data]);
191
249
  }
192
250
  }
@@ -1,12 +1,13 @@
1
1
  export interface ConnectParams {
2
2
  datasource_name: string;
3
- vendor: 'volcengine' | 'alibaba' | 'tencent' | 'aws' | 'local';
3
+ url: string;
4
+ vendor?: 'volcengine' | 'alibaba' | 'tencent' | 'aws' | 'local';
4
5
  endpoint?: string;
5
6
  access_key?: string;
6
7
  secret_key?: string;
7
8
  region?: string;
8
- bucket: string;
9
- prefix: string;
9
+ bucket?: string;
10
+ prefix?: string;
10
11
  sample_rows?: number;
11
12
  }
12
13
  export interface ConnectResult {
@@ -38,25 +38,16 @@ exports.listDataSources = listDataSources;
38
38
  const path = __importStar(require("path"));
39
39
  const fs = __importStar(require("fs"));
40
40
  const os = __importStar(require("os"));
41
- const child_process_1 = require("child_process");
42
- // ---------------------------------------------------------------------------
43
- // Constants
44
- // ---------------------------------------------------------------------------
41
+ const s3_tools_1 = require("./s3-tools");
42
+ const lance_tools_1 = require("./lance-tools");
43
+ const mime = __importStar(require("mime-types"));
45
44
  const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
46
- const PYTHON_DEPS = ['boto3', 'lancedb', 'pyarrow', 'pandas', 'Pillow', 'mutagen', 'pymupdf'];
47
- // ---------------------------------------------------------------------------
48
- // Helpers
49
- // ---------------------------------------------------------------------------
50
45
  function getDataSourceDir(name) {
51
46
  return path.join(BASE_DIR, name);
52
47
  }
53
48
  function ensureDir(dir) {
54
49
  fs.mkdirSync(dir, { recursive: true });
55
50
  }
56
- /**
57
- * Generate env.sh with all connection parameters for this datasource.
58
- * This file can be sourced to re-run the profiler or for debugging.
59
- */
60
51
  function writeEnvFile(dir, params) {
61
52
  const envPath = path.join(dir, 'env.sh');
62
53
  const lines = [
@@ -69,166 +60,190 @@ function writeEnvFile(dir, params) {
69
60
  `export LAS_BUCKET="${params.bucket}"`,
70
61
  `export LAS_PREFIX="${params.prefix}"`,
71
62
  ];
72
- if (params.endpoint) {
63
+ if (params.endpoint)
73
64
  lines.push(`export LAS_ENDPOINT="${params.endpoint}"`);
74
- }
75
- if (params.access_key) {
65
+ if (params.access_key)
76
66
  lines.push(`export LAS_ACCESS_KEY="${params.access_key}"`);
77
- }
78
- if (params.secret_key) {
67
+ if (params.secret_key)
79
68
  lines.push(`export LAS_SECRET_KEY="${params.secret_key}"`);
80
- }
81
- if (params.region) {
69
+ if (params.region)
82
70
  lines.push(`export LAS_REGION="${params.region}"`);
83
- }
84
- if (params.sample_rows) {
71
+ if (params.sample_rows)
85
72
  lines.push(`export LAS_SAMPLE_ROWS="${params.sample_rows}"`);
86
- }
87
73
  lines.push(`export LAS_DB_PATH="${path.join(dir, 'catalog_db')}"`);
88
74
  lines.push(`export LAS_DATASOURCE_NAME="${params.datasource_name}"`);
89
75
  lines.push('');
90
76
  fs.writeFileSync(envPath, lines.join('\n'), { mode: 0o600 });
91
77
  return envPath;
92
78
  }
93
- /**
94
- * Install Python dependencies if not already available.
95
- */
96
- function ensurePythonDeps() {
97
- try {
98
- (0, child_process_1.execSync)(`python3 -c "import boto3, lancedb, pyarrow, pandas, PIL, mutagen, fitz"`, {
99
- stdio: 'pipe',
100
- });
101
- }
102
- catch {
103
- console.log('[las-data-profiler] Installing Python dependencies...');
104
- (0, child_process_1.execSync)(`pip3 install --user ${PYTHON_DEPS.join(' ')}`, {
105
- stdio: 'inherit',
106
- });
107
- }
108
- }
109
- /**
110
- * Get the path to the bundled Python script.
111
- */
112
- function getScriptPath() {
113
- // The Python script is located in the scripts directory
114
- return path.join(__dirname, '../scripts', 's3_catalog.py');
79
/**
 * Classifies a file by its extension.
 *
 * @param ext File extension including the leading dot; matched case-insensitively.
 * @returns `{ category, mediaType }`: category is 'structured' for tabular/JSON
 *   formats and 'non-structured' otherwise; mediaType is 'image', 'audio',
 *   'video', 'pdf', or '' when the extension belongs to none of those groups.
 */
function classifyFile(ext) {
    const normalized = ext.toLowerCase();
    // Checked in order; the first group listing the extension decides the result.
    const groups = [
        {
            exts: ['.json', '.jsonl', '.ndjson', '.csv', '.tsv', '.parquet', '.pq'],
            category: 'structured',
            mediaType: '',
        },
        {
            exts: ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.tiff', '.tif', '.svg', '.ico', '.heic', '.heif'],
            category: 'non-structured',
            mediaType: 'image',
        },
        {
            exts: ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a', '.wma', '.opus'],
            category: 'non-structured',
            mediaType: 'audio',
        },
        {
            exts: ['.mp4', '.avi', '.mov', '.mkv', '.webm', '.wmv', '.flv', '.m4v', '.3gp'],
            category: 'non-structured',
            mediaType: 'video',
        },
        {
            exts: ['.pdf'],
            category: 'non-structured',
            mediaType: 'pdf',
        },
    ];
    for (const group of groups) {
        if (group.exts.includes(normalized)) {
            return { category: group.category, mediaType: group.mediaType };
        }
    }
    // Unknown extensions count as non-structured with no media type.
    return { category: 'non-structured', mediaType: '' };
}
116
- // ---------------------------------------------------------------------------
117
- // Main Entry
118
- // ---------------------------------------------------------------------------
119
98
  async function connectDataSource(params, _ctx) {
120
- // Validate required params
121
- if (!params.datasource_name) {
99
+ if (!params.datasource_name)
122
100
  throw new Error('datasource_name is required');
101
+ if (!params.url)
102
+ throw new Error('url is required (e.g. tos://bucket/prefix)');
103
+ // Parse URL: tos://bucket/prefix
104
+ try {
105
+ if (params.url.startsWith('file://') || params.url.startsWith('/')) {
106
+ params.vendor = 'local';
107
+ const localPath = params.url.startsWith('file://') ? params.url.slice(7) : params.url;
108
+ params.bucket = localPath;
109
+ params.prefix = '.';
110
+ }
111
+ else {
112
+ const parsedUrl = new URL(params.url);
113
+ const protocol = parsedUrl.protocol.replace(':', '');
114
+ if (['tos', 'oss', 'cos', 's3'].includes(protocol)) {
115
+ if (protocol === 'tos')
116
+ params.vendor = 'volcengine';
117
+ else if (protocol === 'oss')
118
+ params.vendor = 'alibaba';
119
+ else if (protocol === 'cos')
120
+ params.vendor = 'tencent';
121
+ else if (protocol === 's3')
122
+ params.vendor = 'aws';
123
+ params.bucket = parsedUrl.hostname;
124
+ params.prefix = parsedUrl.pathname.replace(/^\//, ''); // Remove leading slash
125
+ }
126
+ else {
127
+ throw new Error(`Unsupported protocol: ${protocol}`);
128
+ }
129
+ }
123
130
  }
124
- if (!params.vendor) {
125
- throw new Error('vendor is required');
126
- }
127
- if (!params.bucket) {
128
- throw new Error('bucket is required');
129
- }
130
- if (params.prefix === undefined || params.prefix === null) {
131
- throw new Error('prefix is required');
131
+ catch (e) {
132
+ if (!params.vendor || !params.bucket || params.prefix === undefined) {
133
+ throw new Error(`Invalid url format: ${e.message}`);
134
+ }
132
135
  }
133
- // For non-local vendors, validate credentials
134
136
  if (params.vendor !== 'local') {
135
- if (!params.endpoint && params.vendor !== 'aws') {
137
+ if (!params.endpoint && params.vendor !== 'aws')
136
138
  throw new Error(`endpoint is required for vendor "${params.vendor}"`);
139
+ let ak = params.access_key;
140
+ let sk = params.secret_key;
141
+ if (!ak || !sk) {
142
+ try {
143
+ const { loadCredentials } = require('../../utils/credentials');
144
+ const creds = loadCredentials();
145
+ ak = ak || creds.ACCESS_KEY || creds.VOLCENGINE_ACCESS_KEY;
146
+ sk = sk || creds.SECRET_KEY || creds.VOLCENGINE_SECRET_KEY;
147
+ }
148
+ catch (e) {
149
+ // ignore
150
+ }
137
151
  }
138
- const ak = params.access_key || process.env.TOS_ACCESS_KEY || process.env.S3_ACCESS_KEY || process.env.AWS_ACCESS_KEY_ID;
139
- const sk = params.secret_key || process.env.TOS_SECRET_KEY || process.env.S3_SECRET_KEY || process.env.AWS_SECRET_ACCESS_KEY;
140
152
  if (!ak || !sk) {
141
- throw new Error('access_key and secret_key are required (via params or env vars TOS_ACCESS_KEY/TOS_SECRET_KEY, S3_ACCESS_KEY/S3_SECRET_KEY, AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY)');
153
+ throw new Error('access_key and secret_key are required');
142
154
  }
143
- // Normalise into params so env.sh picks them up
144
155
  params.access_key = ak;
145
156
  params.secret_key = sk;
146
157
  }
147
158
  const dsDir = getDataSourceDir(params.datasource_name);
148
159
  const dbPath = path.join(dsDir, 'catalog_db');
149
160
  ensureDir(dsDir);
150
- // 1. Write env.sh
151
161
  const envPath = writeEnvFile(dsDir, params);
152
- // 2. Ensure Python dependencies
153
- ensurePythonDeps();
154
- // 3. Build CLI args for the Python script
155
- const scriptPath = getScriptPath();
156
- const args = [
157
- scriptPath,
158
- '--vendor', params.vendor,
159
- '--bucket', params.bucket,
160
- '--prefix', params.prefix,
161
- '--db-path', dbPath,
162
- ];
163
- if (params.endpoint) {
164
- args.push('--endpoint', params.endpoint);
165
- }
166
- if (params.access_key) {
167
- args.push('--ak', params.access_key);
168
- }
169
- if (params.secret_key) {
170
- args.push('--sk', params.secret_key);
171
- }
172
- if (params.region) {
173
- args.push('--region', params.region);
174
- }
175
- if (params.sample_rows) {
176
- args.push('--sample-rows', String(params.sample_rows));
177
- }
178
- // 4. Execute the profiling script
179
- return new Promise((resolve) => {
180
- let stdout = '';
181
- let stderr = '';
182
- const proc = (0, child_process_1.spawn)('python3', args, {
183
- cwd: dsDir,
184
- env: { ...process.env },
185
- });
186
- proc.stdout.on('data', (data) => {
187
- stdout += data.toString();
188
- });
189
- proc.stderr.on('data', (data) => {
190
- stderr += data.toString();
191
- });
192
- proc.on('close', (code) => {
193
- if (code !== 0) {
194
- resolve({
195
- status: 'error',
196
- datasource_name: params.datasource_name,
197
- db_path: dbPath,
198
- env_path: envPath,
199
- tables: [],
200
- error: stderr || `Python script exited with code ${code}`,
201
- });
202
- return;
203
- }
204
- // Try to parse structured output from the script
205
- try {
206
- const jsonMatch = stdout.match(/\{[\s\S]*"summary"[\s\S]*\}/);
207
- const result = jsonMatch ? JSON.parse(jsonMatch[0]) : {};
208
- resolve({
209
- status: 'success',
210
- datasource_name: params.datasource_name,
211
- db_path: dbPath,
212
- env_path: envPath,
213
- tables: ['file_catalog', 'structured_schemas', 'media_metadata'],
214
- summary: result.summary || {
215
- total_files: 0,
216
- structured_files: 0,
217
- media_files: 0,
218
- },
219
- });
220
- }
221
- catch {
222
- resolve({
223
- status: 'success',
224
- datasource_name: params.datasource_name,
225
- db_path: dbPath,
226
- env_path: envPath,
227
- tables: ['file_catalog', 'structured_schemas', 'media_metadata'],
162
+ try {
163
+ let isTruncated = true;
164
+ let continuationToken = undefined;
165
+ let total_files = 0;
166
+ let structured_files = 0;
167
+ let media_files = 0;
168
+ const allRecords = [];
169
+ const scan_ts = new Date().toISOString() + 'Z';
170
+ while (isTruncated) {
171
+ const response = await (0, s3_tools_1.listS3Objects)({
172
+ vendor: params.vendor,
173
+ bucket: params.bucket,
174
+ endpoint: params.endpoint,
175
+ access_key: params.access_key,
176
+ secret_key: params.secret_key,
177
+ region: params.region
178
+ }, params.prefix || '', 1000, continuationToken);
179
+ for (const obj of response.Contents) {
180
+ const key = obj.Key || '';
181
+ if (key.endsWith('/'))
182
+ continue;
183
+ const name = path.basename(key);
184
+ const ext = path.extname(name).toLowerCase();
185
+ const mimeType = mime.lookup(name) || '';
186
+ const { category, mediaType } = classifyFile(ext);
187
+ const depth = (key.match(/\//g) || []).length;
188
+ const parentDir = key.includes('/') ? path.basename(path.dirname(key)) : '';
189
+ total_files++;
190
+ if (category === 'structured')
191
+ structured_files++;
192
+ if (mediaType)
193
+ media_files++;
194
+ allRecords.push({
195
+ file_path: key,
196
+ file_name: name,
197
+ extension: ext,
198
+ mime_type: mimeType,
199
+ category: category,
200
+ media_type: mediaType,
201
+ size_bytes: obj.Size || 0,
202
+ last_modified: obj.LastModified ? String(obj.LastModified) : '',
203
+ created_time: obj._created_time ? String(obj._created_time) : '',
204
+ etag: (obj.ETag || '').replace(/"/g, ''),
205
+ storage_class: obj.StorageClass || '',
206
+ is_multipart: (obj.ETag || '').includes('-'),
207
+ depth: depth,
208
+ parent_dir: parentDir,
209
+ vendor: params.vendor,
210
+ bucket: params.bucket,
211
+ has_schema: false,
212
+ has_media_meta: false,
213
+ scan_timestamp: scan_ts
228
214
  });
229
215
  }
216
+ isTruncated = response.IsTruncated || false;
217
+ continuationToken = response.NextContinuationToken;
218
+ }
219
+ await (0, lance_tools_1.writeLanceCatalog)({
220
+ db_path: dbPath,
221
+ table_name: 'file_catalog',
222
+ records: allRecords
230
223
  });
231
- });
224
+ return {
225
+ status: 'success',
226
+ datasource_name: params.datasource_name,
227
+ db_path: dbPath,
228
+ env_path: envPath,
229
+ tables: ['file_catalog'],
230
+ summary: {
231
+ total_files,
232
+ structured_files,
233
+ media_files
234
+ }
235
+ };
236
+ }
237
+ catch (error) {
238
+ return {
239
+ status: 'error',
240
+ datasource_name: params.datasource_name,
241
+ db_path: dbPath,
242
+ env_path: envPath,
243
+ tables: [],
244
+ error: error.message
245
+ };
246
+ }
232
247
  }
233
248
  async function listDataSources(_ctx) {
234
249
  try {
@@ -0,0 +1,21 @@
1
/**
 * Connection parameters accepted by the s3-tools helpers.
 *
 * Every field is optional.
 */
export interface S3Params {
    /** Object-storage URL. */
    url?: string;
    /** Storage vendor. */
    vendor?: 'volcengine' | 'alibaba' | 'tencent' | 'aws' | 'local';
    /** Storage service endpoint. */
    endpoint?: string;
    /** Access key credential. */
    access_key?: string;
    /** Secret key credential. */
    secret_key?: string;
    /** Storage region. */
    region?: string;
    /** Bucket name. */
    bucket?: string;
}
10
/**
 * Takes a set of S3 parameters and returns a set of S3 parameters.
 * NOTE(review): presumably fills vendor/bucket from `params.url` — confirm against the implementation.
 */
export declare function parseS3Url(params: S3Params): S3Params;

/**
 * Lists one page of objects under `prefix`.
 *
 * Resolves to the page's `Contents`, an `IsTruncated` flag and, when present,
 * a `NextContinuationToken`.
 * NOTE(review): `continuationToken` is presumably the token returned by the previous page — confirm.
 */
export declare function listS3Objects(
    params: S3Params,
    prefix: string,
    maxKeys?: number,
    continuationToken?: string
): Promise<
    | {
          Contents: any[];
          IsTruncated: boolean;
          NextContinuationToken: undefined;
      }
    | {
          Contents: import("@aws-sdk/client-s3")._Object[];
          IsTruncated: boolean | undefined;
          NextContinuationToken: string | undefined;
      }
>;

/**
 * Reads the object stored at `key` and resolves to its bytes.
 * NOTE(review): `maxBytes` presumably caps how much is read — confirm.
 */
export declare function readS3Object(params: S3Params, key: string, maxBytes?: number): Promise<Buffer>;

/**
 * Resolves to a presigned URL string for `key`.
 * NOTE(review): `expiresIn` is presumably a lifetime in seconds — confirm.
 */
export declare function getPresignedUrl(params: S3Params, key: string, expiresIn?: number): Promise<string>;