@byted-las/contextlake-openclaw 1.0.3 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,21 +1,20 @@
1
1
  import * as path from 'path';
2
2
  import * as fs from 'fs';
3
3
  import * as os from 'os';
4
- import { execSync, spawn } from 'child_process';
5
-
6
- // ---------------------------------------------------------------------------
7
- // Types
8
- // ---------------------------------------------------------------------------
4
+ import { listS3Objects } from './s3-tools';
5
+ import { writeLanceCatalog } from './lance-tools';
6
+ import * as mime from 'mime-types';
9
7
 
10
8
/**
 * Input accepted by `connectDataSource`.
 *
 * Only `datasource_name` and `url` must be supplied by the caller; `vendor`,
 * `bucket` and `prefix` are filled in from `url` while connecting.
 */
export interface ConnectParams {
  /** Name of the data source; also used as its directory name under the profiler base directory. */
  datasource_name: string;
  /**
   * Location to catalog, e.g. `tos://bucket/prefix`. The schemes `tos`, `oss`,
   * `cos` and `s3` are recognised, as are `file://` URLs and absolute paths.
   */
  url: string;
  /** Storage vendor; derived from the scheme of `url`. */
  vendor?: 'volcengine' | 'alibaba' | 'tencent' | 'aws' | 'local';
  /** Service endpoint; required for every vendor except `aws` and `local`. */
  endpoint?: string;
  /** Access key; when absent the stored credentials are consulted. */
  access_key?: string;
  /** Secret key; when absent the stored credentials are consulted. */
  secret_key?: string;
  /** Region forwarded to the object-store client. */
  region?: string;
  /** Bucket name (or the local root path for `local`); derived from `url`. */
  bucket?: string;
  /** Key prefix to list under; derived from `url`. */
  prefix?: string;
  /** Exported to `env.sh` as `LAS_SAMPLE_ROWS` when set. */
  sample_rows?: number;
}
21
20
 
@@ -33,16 +32,7 @@ export interface ConnectResult {
33
32
  error?: string;
34
33
  }
35
34
 
36
- // ---------------------------------------------------------------------------
37
- // Constants
38
- // ---------------------------------------------------------------------------
39
-
40
35
  const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
41
- const PYTHON_DEPS = ['boto3', 'lancedb', 'pyarrow', 'pandas', 'Pillow', 'mutagen', 'pymupdf'];
42
-
43
- // ---------------------------------------------------------------------------
44
- // Helpers
45
- // ---------------------------------------------------------------------------
46
36
 
47
37
  function getDataSourceDir(name: string): string {
48
38
  return path.join(BASE_DIR, name);
@@ -52,10 +42,6 @@ function ensureDir(dir: string): void {
52
42
  fs.mkdirSync(dir, { recursive: true });
53
43
  }
54
44
 
55
- /**
56
- * Generate env.sh with all connection parameters for this datasource.
57
- * This file can be sourced to re-run the profiler or for debugging.
58
- */
59
45
  function writeEnvFile(dir: string, params: ConnectParams): string {
60
46
  const envPath = path.join(dir, 'env.sh');
61
47
  const lines: string[] = [
@@ -69,21 +55,11 @@ function writeEnvFile(dir: string, params: ConnectParams): string {
69
55
  `export LAS_PREFIX="${params.prefix}"`,
70
56
  ];
71
57
 
72
- if (params.endpoint) {
73
- lines.push(`export LAS_ENDPOINT="${params.endpoint}"`);
74
- }
75
- if (params.access_key) {
76
- lines.push(`export LAS_ACCESS_KEY="${params.access_key}"`);
77
- }
78
- if (params.secret_key) {
79
- lines.push(`export LAS_SECRET_KEY="${params.secret_key}"`);
80
- }
81
- if (params.region) {
82
- lines.push(`export LAS_REGION="${params.region}"`);
83
- }
84
- if (params.sample_rows) {
85
- lines.push(`export LAS_SAMPLE_ROWS="${params.sample_rows}"`);
86
- }
58
+ if (params.endpoint) lines.push(`export LAS_ENDPOINT="${params.endpoint}"`);
59
+ if (params.access_key) lines.push(`export LAS_ACCESS_KEY="${params.access_key}"`);
60
+ if (params.secret_key) lines.push(`export LAS_SECRET_KEY="${params.secret_key}"`);
61
+ if (params.region) lines.push(`export LAS_REGION="${params.region}"`);
62
+ if (params.sample_rows) lines.push(`export LAS_SAMPLE_ROWS="${params.sample_rows}"`);
87
63
 
88
64
  lines.push(`export LAS_DB_PATH="${path.join(dir, 'catalog_db')}"`);
89
65
  lines.push(`export LAS_DATASOURCE_NAME="${params.datasource_name}"`);
@@ -93,65 +69,78 @@ function writeEnvFile(dir: string, params: ConnectParams): string {
93
69
  return envPath;
94
70
  }
95
71
 
96
- /**
97
- * Install Python dependencies if not already available.
98
- */
99
- function ensurePythonDeps(): void {
100
- try {
101
- execSync(`python3 -c "import boto3, lancedb, pyarrow, pandas, PIL, mutagen, fitz"`, {
102
- stdio: 'pipe',
103
- });
104
- } catch {
105
- console.log('[las-data-profiler] Installing Python dependencies...');
106
- execSync(`pip3 install --user ${PYTHON_DEPS.join(' ')}`, {
107
- stdio: 'inherit',
108
- });
109
- }
110
- }
72
/** Extension groups and the classification every member of the group maps to. */
const EXTENSION_GROUPS: ReadonlyArray<{
  category: string;
  mediaType: string;
  exts: readonly string[];
}> = [
  {
    category: 'structured',
    mediaType: '',
    exts: ['.json', '.jsonl', '.ndjson', '.csv', '.tsv', '.parquet', '.pq'],
  },
  {
    category: 'non-structured',
    mediaType: 'image',
    exts: ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.tiff', '.tif', '.svg', '.ico', '.heic', '.heif'],
  },
  {
    category: 'non-structured',
    mediaType: 'audio',
    exts: ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a', '.wma', '.opus'],
  },
  {
    category: 'non-structured',
    mediaType: 'video',
    exts: ['.mp4', '.avi', '.mov', '.mkv', '.webm', '.wmv', '.flv', '.m4v', '.3gp'],
  },
  { category: 'non-structured', mediaType: 'pdf', exts: ['.pdf'] },
];

/**
 * Extension → classification lookup, built once at module load so that
 * classifying a file is a single map lookup instead of re-creating and
 * scanning five arrays per call (the function runs once per listed object).
 */
const FILE_CLASS_BY_EXT = new Map<string, { category: string; mediaType: string }>();
for (const { category, mediaType, exts } of EXTENSION_GROUPS) {
  for (const ext of exts) {
    FILE_CLASS_BY_EXT.set(ext, { category, mediaType });
  }
}

/**
 * Classify a file by its extension.
 *
 * @param ext File extension including the leading dot (e.g. `.csv`); matched case-insensitively.
 * @returns `category` is `'structured'` for tabular/JSON formats and
 *   `'non-structured'` for everything else; `mediaType` is `'image'`,
 *   `'audio'`, `'video'`, `'pdf'`, or `''` when no media kind applies.
 */
function classifyFile(ext: string): { category: string; mediaType: string } {
  const known = FILE_CLASS_BY_EXT.get(ext.toLowerCase());
  // Hand back a fresh object each time so callers can never alter the shared table.
  return known ? { ...known } : { category: 'non-structured', mediaType: '' };
}
119
88
 
120
- // ---------------------------------------------------------------------------
121
- // Main Entry
122
- // ---------------------------------------------------------------------------
123
-
124
89
  export async function connectDataSource(
125
90
  params: ConnectParams,
126
91
  _ctx?: any
127
92
  ): Promise<ConnectResult> {
128
- // Validate required params
129
- if (!params.datasource_name) {
130
- throw new Error('datasource_name is required');
131
- }
132
- if (!params.vendor) {
133
- throw new Error('vendor is required');
134
- }
135
- if (!params.bucket) {
136
- throw new Error('bucket is required');
137
- }
138
- if (params.prefix === undefined || params.prefix === null) {
139
- throw new Error('prefix is required');
93
+ if (!params.datasource_name) throw new Error('datasource_name is required');
94
+ if (!params.url) throw new Error('url is required (e.g. tos://bucket/prefix)');
95
+
96
+ // Parse URL: tos://bucket/prefix
97
+ try {
98
+ if (params.url.startsWith('file://') || params.url.startsWith('/')) {
99
+ params.vendor = 'local';
100
+ const localPath = params.url.startsWith('file://') ? params.url.slice(7) : params.url;
101
+ params.bucket = localPath;
102
+ params.prefix = '.';
103
+ } else {
104
+ const parsedUrl = new URL(params.url);
105
+ const protocol = parsedUrl.protocol.replace(':', '');
106
+
107
+ if (['tos', 'oss', 'cos', 's3'].includes(protocol)) {
108
+ if (protocol === 'tos') params.vendor = 'volcengine';
109
+ else if (protocol === 'oss') params.vendor = 'alibaba';
110
+ else if (protocol === 'cos') params.vendor = 'tencent';
111
+ else if (protocol === 's3') params.vendor = 'aws';
112
+
113
+ params.bucket = parsedUrl.hostname;
114
+ params.prefix = parsedUrl.pathname.replace(/^\//, ''); // Remove leading slash
115
+ } else {
116
+ throw new Error(`Unsupported protocol: ${protocol}`);
117
+ }
118
+ }
119
+ } catch (e: any) {
120
+ if (!params.vendor || !params.bucket || params.prefix === undefined) {
121
+ throw new Error(`Invalid url format: ${e.message}`);
122
+ }
140
123
  }
141
124
 
142
- // For non-local vendors, validate credentials
143
125
  if (params.vendor !== 'local') {
144
- if (!params.endpoint && params.vendor !== 'aws') {
145
- throw new Error(`endpoint is required for vendor "${params.vendor}"`);
126
+ if (!params.endpoint && params.vendor !== 'aws') throw new Error(`endpoint is required for vendor "${params.vendor}"`);
127
+ let ak = params.access_key;
128
+ let sk = params.secret_key;
129
+
130
+ if (!ak || !sk) {
131
+ try {
132
+ const { loadCredentials } = require('../../utils/credentials');
133
+ const creds = loadCredentials();
134
+ ak = ak || creds.ACCESS_KEY || creds.VOLCENGINE_ACCESS_KEY;
135
+ sk = sk || creds.SECRET_KEY || creds.VOLCENGINE_SECRET_KEY;
136
+ } catch(e) {
137
+ // ignore
138
+ }
146
139
  }
147
- const ak = params.access_key || process.env.TOS_ACCESS_KEY || process.env.S3_ACCESS_KEY || process.env.AWS_ACCESS_KEY_ID;
148
- const sk = params.secret_key || process.env.TOS_SECRET_KEY || process.env.S3_SECRET_KEY || process.env.AWS_SECRET_ACCESS_KEY;
140
+
149
141
  if (!ak || !sk) {
150
- throw new Error(
151
- 'access_key and secret_key are required (via params or env vars TOS_ACCESS_KEY/TOS_SECRET_KEY, S3_ACCESS_KEY/S3_SECRET_KEY, AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY)'
152
- );
142
+ throw new Error('access_key and secret_key are required');
153
143
  }
154
- // Normalise into params so env.sh picks them up
155
144
  params.access_key = ak;
156
145
  params.secret_key = sk;
157
146
  }
@@ -160,97 +149,99 @@ export async function connectDataSource(
160
149
  const dbPath = path.join(dsDir, 'catalog_db');
161
150
 
162
151
  ensureDir(dsDir);
163
-
164
- // 1. Write env.sh
165
152
  const envPath = writeEnvFile(dsDir, params);
166
153
 
167
- // 2. Ensure Python dependencies
168
- ensurePythonDeps();
169
-
170
- // 3. Build CLI args for the Python script
171
- const scriptPath = getScriptPath();
172
- const args: string[] = [
173
- scriptPath,
174
- '--vendor', params.vendor,
175
- '--bucket', params.bucket,
176
- '--prefix', params.prefix,
177
- '--db-path', dbPath,
178
- ];
179
-
180
- if (params.endpoint) {
181
- args.push('--endpoint', params.endpoint);
182
- }
183
- if (params.access_key) {
184
- args.push('--ak', params.access_key);
185
- }
186
- if (params.secret_key) {
187
- args.push('--sk', params.secret_key);
188
- }
189
- if (params.region) {
190
- args.push('--region', params.region);
191
- }
192
- if (params.sample_rows) {
193
- args.push('--sample-rows', String(params.sample_rows));
194
- }
154
+ try {
155
+ let isTruncated = true;
156
+ let continuationToken: string | undefined = undefined;
157
+ let total_files = 0;
158
+ let structured_files = 0;
159
+ let media_files = 0;
160
+
161
+ const allRecords: any[] = [];
162
+ const scan_ts = new Date().toISOString() + 'Z';
195
163
 
196
- // 4. Execute the profiling script
197
- return new Promise<ConnectResult>((resolve) => {
198
- let stdout = '';
199
- let stderr = '';
164
+ while (isTruncated) {
165
+ const response = await listS3Objects({
166
+ vendor: params.vendor as any,
167
+ bucket: params.bucket as string,
168
+ endpoint: params.endpoint,
169
+ access_key: params.access_key,
170
+ secret_key: params.secret_key,
171
+ region: params.region
172
+ }, params.prefix || '', 1000, continuationToken);
173
+
174
+ for (const obj of response.Contents) {
175
+ const key = obj.Key || '';
176
+ if (key.endsWith('/')) continue;
200
177
 
201
- const proc = spawn('python3', args, {
202
- cwd: dsDir,
203
- env: { ...process.env },
204
- });
178
+ const name = path.basename(key);
179
+ const ext = path.extname(name).toLowerCase();
180
+ const mimeType = mime.lookup(name) || '';
181
+ const { category, mediaType } = classifyFile(ext);
182
+ const depth = (key.match(/\//g) || []).length;
183
+ const parentDir = key.includes('/') ? path.basename(path.dirname(key)) : '';
184
+
185
+ total_files++;
186
+ if (category === 'structured') structured_files++;
187
+ if (mediaType) media_files++;
205
188
 
206
- proc.stdout.on('data', (data: Buffer) => {
207
- stdout += data.toString();
208
- });
189
+ allRecords.push({
190
+ file_path: key,
191
+ file_name: name,
192
+ extension: ext,
193
+ mime_type: mimeType,
194
+ category: category,
195
+ media_type: mediaType,
196
+ size_bytes: obj.Size || 0,
197
+ last_modified: obj.LastModified ? String(obj.LastModified) : '',
198
+ created_time: obj._created_time ? String(obj._created_time) : '',
199
+ etag: (obj.ETag || '').replace(/"/g, ''),
200
+ storage_class: obj.StorageClass || '',
201
+ is_multipart: (obj.ETag || '').includes('-'),
202
+ depth: depth,
203
+ parent_dir: parentDir,
204
+ vendor: params.vendor,
205
+ bucket: params.bucket,
206
+ has_schema: false,
207
+ has_media_meta: false,
208
+ scan_timestamp: scan_ts
209
+ });
210
+ }
209
211
 
210
- proc.stderr.on('data', (data: Buffer) => {
211
- stderr += data.toString();
212
- });
212
+ isTruncated = response.IsTruncated || false;
213
+ continuationToken = response.NextContinuationToken;
214
+ }
213
215
 
214
- proc.on('close', (code: number | null) => {
215
- if (code !== 0) {
216
- resolve({
217
- status: 'error',
218
- datasource_name: params.datasource_name,
216
+ await writeLanceCatalog({
219
217
  db_path: dbPath,
220
- env_path: envPath,
221
- tables: [],
222
- error: stderr || `Python script exited with code ${code}`,
223
- });
224
- return;
225
- }
218
+ table_name: 'file_catalog',
219
+ records: allRecords
220
+ });
226
221
 
227
- // Try to parse structured output from the script
228
- try {
229
- const jsonMatch = stdout.match(/\{[\s\S]*"summary"[\s\S]*\}/);
230
- const result = jsonMatch ? JSON.parse(jsonMatch[0]) : {};
231
- resolve({
222
+ return {
232
223
  status: 'success',
233
224
  datasource_name: params.datasource_name,
234
225
  db_path: dbPath,
235
226
  env_path: envPath,
236
- tables: ['file_catalog', 'structured_schemas', 'media_metadata'],
237
- summary: result.summary || {
238
- total_files: 0,
239
- structured_files: 0,
240
- media_files: 0,
241
- },
242
- });
243
- } catch {
244
- resolve({
245
- status: 'success',
227
+ tables: ['file_catalog'],
228
+ summary: {
229
+ total_files,
230
+ structured_files,
231
+ media_files
232
+ }
233
+ };
234
+
235
+ } catch (error: any) {
236
+ return {
237
+ status: 'error',
246
238
  datasource_name: params.datasource_name,
247
239
  db_path: dbPath,
248
240
  env_path: envPath,
249
- tables: ['file_catalog', 'structured_schemas', 'media_metadata'],
250
- });
251
- }
252
- });
253
- });
241
+ tables: [],
242
+ error: error.message
243
+ };
244
+ }
254
245
  }
255
246
 
256
247
  export async function listDataSources(
@@ -0,0 +1,203 @@
1
+ import { S3Client, ListObjectsV2Command, GetObjectCommand } from '@aws-sdk/client-s3';
2
+ import { getSignedUrl } from '@aws-sdk/s3-request-presigner';
3
+ import * as fs from 'fs';
4
+ import * as path from 'path';
5
+
6
/**
 * Connection settings shared by the object-store helpers in this module.
 *
 * Either `url` or the pair `vendor` + `bucket` must be present; `parseS3Url`
 * fills `vendor` and `bucket` in from `url` when it is given.
 */
export interface S3Params {
  /** `tos://`, `oss://`, `cos://` or `s3://` URL, a `file://` URL, or an absolute path. */
  url?: string;
  /** Storage vendor; `local` means the local filesystem. */
  vendor?: 'volcengine' | 'alibaba' | 'tencent' | 'aws' | 'local';
  /** Explicit service endpoint; a vendor default is used when omitted. */
  endpoint?: string;
  /** Access key; stored credentials are consulted when omitted. */
  access_key?: string;
  /** Secret key; stored credentials are consulted when omitted. */
  secret_key?: string;
  /** Region used for the client and for building the default endpoint. */
  region?: string;
  /** Bucket name, or the root directory path when `vendor` is `local`. */
  bucket?: string;
}
15
+
16
/**
 * Derive `vendor` and `bucket` from `params.url`, when a URL is present.
 *
 * `file://` URLs and absolute paths select the `local` vendor with the path as
 * the bucket. `tos`, `oss`, `cos` and `s3` URLs select the matching cloud
 * vendor with the URL host as the bucket; the URL path (prefix) is not
 * extracted here. The given object is updated in place and returned.
 *
 * @param params Settings to complete.
 * @returns The same `params` object, with `vendor` and `bucket` set.
 * @throws Error if the URL scheme is unsupported, or if no vendor/bucket could
 *   be determined. A malformed URL raises whatever `new URL` throws.
 */
export function parseS3Url(params: S3Params): S3Params {
  const { url } = params;

  if (url) {
    const isFileUrl = url.startsWith('file://');

    if (isFileUrl || url.startsWith('/')) {
      params.vendor = 'local';
      // Drop the 7-character "file://" scheme; plain absolute paths are used verbatim.
      params.bucket = isFileUrl ? url.slice(7) : url;
    } else {
      const target = new URL(url);
      const scheme = target.protocol.replace(':', '');

      switch (scheme) {
        case 'tos':
          params.vendor = 'volcengine';
          break;
        case 'oss':
          params.vendor = 'alibaba';
          break;
        case 'cos':
          params.vendor = 'tencent';
          break;
        case 's3':
          params.vendor = 'aws';
          break;
        default:
          throw new Error(`Unsupported protocol: ${scheme}`);
      }

      params.bucket = target.hostname;
    }
  }

  if (!(params.vendor && params.bucket)) {
    throw new Error('Could not determine vendor or bucket. Please provide a valid url or vendor/bucket directly.');
  }

  return params;
}
46
+
47
/**
 * Build an S3-compatible client for the given settings.
 *
 * Access key, secret key and region that are not supplied are looked up, on a
 * best-effort basis, through `loadCredentials` from `../../utils/credentials`.
 * When no endpoint is supplied, a vendor default endpoint is built from the
 * region (falling back to a vendor default region), and that same region is
 * used for the client so the request is signed for the region it is sent to.
 *
 * @param params Connection settings; completed through `parseS3Url`.
 * @returns A configured client, or `null` when the vendor is `local`.
 * @throws Error when no access key / secret key could be resolved, rather
 *   than building a client with empty credentials that can only fail later
 *   with an opaque signature error.
 */
function createS3Client(params: S3Params): S3Client | null {
  params = parseS3Url(params);
  if (params.vendor === 'local') return null;

  let accessKeyId = params.access_key;
  let secretAccessKey = params.secret_key;
  let region = params.region;

  if (!accessKeyId || !secretAccessKey || !region) {
    try {
      const { loadCredentials } = require('../../utils/credentials');
      const creds = loadCredentials();
      accessKeyId = accessKeyId || creds.ACCESS_KEY || creds.VOLCENGINE_ACCESS_KEY;
      secretAccessKey = secretAccessKey || creds.SECRET_KEY || creds.VOLCENGINE_SECRET_KEY;
      region = region || creds.REGION || creds.VOLCENGINE_REGION;
    } catch {
      // Best-effort lookup: the credentials helper may be missing or unreadable.
      // Missing keys are reported explicitly just below.
    }
  }

  if (!accessKeyId || !secretAccessKey) {
    throw new Error('access_key and secret_key are required');
  }

  let endpoint = params.endpoint;
  if (!endpoint) {
    // When the endpoint is defaulted, pin the region to the one baked into it.
    if (params.vendor === 'volcengine') {
      region = region || 'cn-beijing';
      endpoint = `https://tos-s3-${region}.volces.com`;
    } else if (params.vendor === 'alibaba') {
      region = region || 'cn-hangzhou';
      endpoint = `https://s3.oss-${region}.aliyuncs.com`;
    } else if (params.vendor === 'tencent') {
      region = region || 'ap-beijing';
      endpoint = `https://cos.${region}.myqcloud.com`;
    }
  }

  return new S3Client({
    region: region || 'us-east-1',
    endpoint,
    credentials: { accessKeyId, secretAccessKey },
    forcePathStyle: false, // virtual-hosted style addressing
  });
}
91
+
92
/**
 * List objects under a prefix, either in an S3-compatible bucket or, for the
 * `local` vendor, by walking a directory tree.
 *
 * Local listing is not paginated: `maxKeys` and `continuationToken` are
 * ignored and every file under the prefix is returned in one response with
 * `IsTruncated: false`. Local entries mimic the S3 object shape (`Key`,
 * `Size`, `LastModified`, `ETag`, `StorageClass`) plus `_created_time`.
 *
 * @param params Connection settings; completed through `parseS3Url`.
 * @param prefix Key prefix (or sub-directory relative to the local root; `''` and `'.'` mean the root).
 * @param maxKeys Page size for remote listings.
 * @param continuationToken Token from a previous truncated remote response.
 * @returns `Contents`, `IsTruncated` and `NextContinuationToken` for the page.
 * @throws Error if a remote client cannot be created; remote and filesystem errors propagate.
 */
export async function listS3Objects(params: S3Params, prefix: string, maxKeys: number = 1000, continuationToken?: string) {
  params = parseS3Url(params);
  if (params.vendor === 'local') {
    const root = params.bucket as string;
    const prefixPath = prefix && prefix !== '.' ? path.join(root, prefix) : root;
    // Kept as any[] so the result stays interchangeable with the remote listing shape.
    const files: any[] = [];

    const walk = (currentDirPath: string): void => {
      if (!fs.existsSync(currentDirPath)) return;
      for (const dirent of fs.readdirSync(currentDirPath, { withFileTypes: true })) {
        const fullPath = path.resolve(currentDirPath, dirent.name);
        if (dirent.isDirectory()) {
          walk(fullPath);
          continue;
        }

        let stat: fs.Stats;
        try {
          stat = fs.statSync(fullPath);
        } catch {
          // Broken symlink or entry removed mid-walk: skip it instead of aborting the listing.
          continue;
        }
        // A symlink that resolves to a directory is not a file object; it is
        // not followed either, to avoid walking symlink cycles.
        if (stat.isDirectory()) continue;

        files.push({
          // Keys are '/'-delimited like object-store keys, whatever the platform separator is.
          Key: path.relative(root, fullPath).split(path.sep).join('/'),
          Size: stat.size,
          LastModified: stat.mtime,
          ETag: '',
          StorageClass: 'LOCAL',
          // birthtime is the creation time; ctime is the last inode change.
          // Fall back to ctime where the filesystem reports no birthtime.
          _created_time: stat.birthtimeMs > 0 ? stat.birthtime : stat.ctime,
        });
      }
    };

    walk(prefixPath);
    return {
      Contents: files,
      IsTruncated: false,
      NextContinuationToken: undefined,
    };
  }

  const client = createS3Client(params);
  if (!client) throw new Error('Failed to create S3 client');

  const command = new ListObjectsV2Command({
    Bucket: params.bucket,
    Prefix: prefix,
    MaxKeys: maxKeys,
    ContinuationToken: continuationToken,
  });

  const response = await client.send(command);
  return {
    Contents: response.Contents || [],
    IsTruncated: response.IsTruncated,
    NextContinuationToken: response.NextContinuationToken,
  };
}
144
+
145
/**
 * Read an object's content, optionally limited to its first `maxBytes` bytes.
 *
 * For the `local` vendor the object is the file at `bucket/key`; otherwise it
 * is fetched from the bucket, using an HTTP `Range` request when `maxBytes`
 * is set. A falsy `maxBytes` (including `0`) means "read everything".
 *
 * @param params Connection settings; completed through `parseS3Url`.
 * @param key Object key, or path relative to the local root.
 * @param maxBytes Upper bound on the number of leading bytes to return.
 * @returns The bytes read; an empty buffer when the remote response has no body.
 * @throws Error if a remote client cannot be created; remote and filesystem errors propagate.
 */
export async function readS3Object(params: S3Params, key: string, maxBytes?: number): Promise<Buffer> {
  params = parseS3Url(params);
  if (params.vendor === 'local') {
    const fullPath = path.join(params.bucket as string, key);
    if (!maxBytes) {
      return fs.readFileSync(fullPath);
    }

    const fd = fs.openSync(fullPath, 'r');
    try {
      const buffer = Buffer.alloc(maxBytes);
      const bytesRead = fs.readSync(fd, buffer, 0, maxBytes, 0);
      // The file may be shorter than maxBytes; return only what was read.
      return buffer.subarray(0, bytesRead);
    } finally {
      // Always release the descriptor, even when the read throws.
      fs.closeSync(fd);
    }
  }

  const client = createS3Client(params);
  if (!client) throw new Error('Failed to create S3 client');

  const commandInput: any = {
    Bucket: params.bucket,
    Key: key,
  };

  if (maxBytes) {
    // HTTP byte ranges are inclusive, hence the -1.
    commandInput.Range = `bytes=0-${maxBytes - 1}`;
  }

  const response = await client.send(new GetObjectCommand(commandInput));
  if (!response.Body) {
    return Buffer.alloc(0);
  }

  // The body is consumed as an async iterable of binary chunks.
  const chunks: Uint8Array[] = [];
  for await (const chunk of response.Body as any) {
    chunks.push(chunk);
  }
  return Buffer.concat(chunks);
}
185
+
186
/**
 * Produce a URL through which the object can be fetched.
 *
 * For the `local` vendor this is simply `file://` followed by the joined
 * path; otherwise a signed GET URL is requested from the presigner.
 *
 * @param params Connection settings; completed through `parseS3Url`.
 * @param key Object key, or path relative to the local root.
 * @param expiresIn Value passed to the presigner as `expiresIn` (default 3600).
 * @returns The `file://` or signed URL.
 * @throws Error if a remote client cannot be created.
 */
export async function getPresignedUrl(params: S3Params, key: string, expiresIn: number = 3600): Promise<string> {
  params = parseS3Url(params);

  const isLocal = params.vendor === 'local';
  if (isLocal) {
    return `file://${path.join(params.bucket as string, key)}`;
  }

  const s3 = createS3Client(params);
  if (!s3) {
    throw new Error('Failed to create S3 client');
  }

  const getObject = new GetObjectCommand({ Bucket: params.bucket, Key: key });
  return await getSignedUrl(s3, getObject, { expiresIn });
}