@byted-las/contextlake-openclaw 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/README.md +64 -0
  2. package/bin/contextlake-openclaw.js +5 -0
  3. package/dist/index.d.ts +113 -0
  4. package/dist/index.js +73 -0
  5. package/dist/src/client/lancedb.d.ts +30 -0
  6. package/dist/src/client/lancedb.js +113 -0
  7. package/dist/src/client/tos.d.ts +19 -0
  8. package/dist/src/client/tos.js +81 -0
  9. package/dist/src/commands/cli.d.ts +6 -0
  10. package/dist/src/commands/cli.js +78 -0
  11. package/dist/src/commands/index.d.ts +1 -0
  12. package/dist/src/commands/index.js +139 -0
  13. package/dist/src/commands/slashcmd.d.ts +14 -0
  14. package/dist/src/commands/slashcmd.js +91 -0
  15. package/dist/src/commands/tools.d.ts +219 -0
  16. package/dist/src/commands/tools.js +286 -0
  17. package/dist/src/lib/actions/ingest.d.ts +8 -0
  18. package/dist/src/lib/actions/ingest.js +123 -0
  19. package/dist/src/lib/actions/manage.d.ts +15 -0
  20. package/dist/src/lib/actions/manage.js +91 -0
  21. package/dist/src/lib/actions/retrieve.d.ts +8 -0
  22. package/dist/src/lib/actions/retrieve.js +73 -0
  23. package/dist/src/processor/loader.d.ts +7 -0
  24. package/dist/src/processor/loader.js +83 -0
  25. package/dist/src/service/embedding/factory.d.ts +2 -0
  26. package/dist/src/service/embedding/factory.js +16 -0
  27. package/dist/src/service/embedding/interface.d.ts +18 -0
  28. package/dist/src/service/embedding/interface.js +2 -0
  29. package/dist/src/service/embedding/local.d.ts +14 -0
  30. package/dist/src/service/embedding/local.js +104 -0
  31. package/dist/src/service/embedding/remote.d.ts +9 -0
  32. package/dist/src/service/embedding/remote.js +42 -0
  33. package/dist/src/service/metadata/factory.d.ts +13 -0
  34. package/dist/src/service/metadata/factory.js +48 -0
  35. package/dist/src/service/metadata/interface.d.ts +17 -0
  36. package/dist/src/service/metadata/interface.js +2 -0
  37. package/dist/src/service/metadata/local.d.ts +13 -0
  38. package/dist/src/service/metadata/local.js +49 -0
  39. package/dist/src/service/storage/factory.d.ts +2 -0
  40. package/dist/src/service/storage/factory.js +19 -0
  41. package/dist/src/service/storage/interface.d.ts +32 -0
  42. package/dist/src/service/storage/interface.js +2 -0
  43. package/dist/src/service/storage/local.d.ts +9 -0
  44. package/dist/src/service/storage/local.js +72 -0
  45. package/dist/src/skills/las-data-profiler/index.d.ts +26 -0
  46. package/dist/src/skills/las-data-profiler/index.js +231 -0
  47. package/dist/src/skills/las-data-profiler/register.d.ts +1 -0
  48. package/dist/src/skills/las-data-profiler/register.js +19 -0
  49. package/dist/src/utils/config.d.ts +1 -0
  50. package/dist/src/utils/config.js +16 -0
  51. package/index.ts +78 -0
  52. package/openclaw.plugin.json +57 -0
  53. package/package.json +52 -0
  54. package/src/client/lancedb.ts +102 -0
  55. package/src/client/tos.ts +100 -0
  56. package/src/commands/cli.ts +77 -0
  57. package/src/commands/index.ts +156 -0
  58. package/src/commands/slashcmd.ts +95 -0
  59. package/src/commands/tools.ts +286 -0
  60. package/src/lib/actions/ingest.ts +103 -0
  61. package/src/lib/actions/manage.ts +107 -0
  62. package/src/lib/actions/retrieve.ts +90 -0
  63. package/src/processor/loader.ts +58 -0
  64. package/src/service/embedding/factory.ts +13 -0
  65. package/src/service/embedding/interface.ts +21 -0
  66. package/src/service/embedding/local.ts +118 -0
  67. package/src/service/embedding/remote.ts +45 -0
  68. package/src/service/metadata/factory.ts +52 -0
  69. package/src/service/metadata/interface.ts +19 -0
  70. package/src/service/metadata/local.ts +60 -0
  71. package/src/service/storage/factory.ts +16 -0
  72. package/src/service/storage/interface.ts +36 -0
  73. package/src/service/storage/local.ts +42 -0
  74. package/src/skills/contextlake-delete/SKILL.md +36 -0
  75. package/src/skills/contextlake-ingest/SKILL.md +40 -0
  76. package/src/skills/contextlake-list/SKILL.md +22 -0
  77. package/src/skills/contextlake-retrieve/SKILL.md +37 -0
  78. package/src/skills/las-data-profiler/SKILL.md +174 -0
  79. package/src/skills/las-data-profiler/index.ts +254 -0
  80. package/src/skills/las-data-profiler/register.ts +19 -0
  81. package/src/skills/las-data-profiler/s3_catalog.py +608 -0
  82. package/src/utils/config.ts +13 -0
@@ -0,0 +1,608 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ s3_catalog.py -- Data source profiling tool for LanceDB
4
+
5
+ Three-pass scanning:
6
+ Pass 1: Walk all files -> file_catalog
7
+ Pass 2: Sample structured data -> structured_schemas
8
+ Pass 3: Read media file headers -> media_metadata
9
+
10
+ Supported sources: volcengine TOS / alibaba OSS / tencent COS / aws S3 / local
11
+ Output: LanceDB tables (file_catalog, structured_schemas, media_metadata)
12
+ """
13
+
14
+ import argparse
15
+ import datetime
16
+ import io
17
+ import json
18
+ import mimetypes
19
+ import os
20
+ import re
21
+ import sys
22
+ import time
23
+ from pathlib import Path
24
+ from typing import Any, Dict, List, Optional, Tuple
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Lazy imports
28
+ # ---------------------------------------------------------------------------
29
+
30
+ def _import_boto3():
31
+ try:
32
+ import boto3
33
+ from botocore.config import Config as BotoConfig
34
+ return boto3, BotoConfig
35
+ except ImportError:
36
+ print("ERROR: boto3 not installed. Run: pip install boto3", file=sys.stderr)
37
+ sys.exit(1)
38
+
39
+ def _import_lancedb():
40
+ try:
41
+ import lancedb
42
+ return lancedb
43
+ except ImportError:
44
+ print("ERROR: lancedb not installed. Run: pip install lancedb", file=sys.stderr)
45
+ sys.exit(1)
46
+
47
+ def _import_pandas():
48
+ try:
49
+ import pandas as pd
50
+ return pd
51
+ except ImportError:
52
+ print("ERROR: pandas not installed. Run: pip install pandas", file=sys.stderr)
53
+ sys.exit(1)
54
+
55
+ def _import_pyarrow():
56
+ try:
57
+ import pyarrow as pa
58
+ import pyarrow.parquet as pq
59
+ return pa, pq
60
+ except ImportError:
61
+ print("ERROR: pyarrow not installed. Run: pip install pyarrow", file=sys.stderr)
62
+ sys.exit(1)
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Constants
66
+ # ---------------------------------------------------------------------------
67
+
68
# Extension groupings; all comparisons use lowercased extensions with the dot.
STRUCTURED_EXTS = {'.json', '.jsonl', '.ndjson', '.csv', '.tsv', '.parquet', '.pq'}
IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.tiff', '.tif', '.svg', '.ico', '.heic', '.heif'}
AUDIO_EXTS = {'.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a', '.wma', '.opus'}
VIDEO_EXTS = {'.mp4', '.avi', '.mov', '.mkv', '.webm', '.wmv', '.flv', '.m4v', '.3gp'}
PDF_EXTS = {'.pdf'}

# Pass-3 download budgets: only this many leading bytes are fetched per file,
# enough for header/metadata parsing but never the full content.
IMAGE_HEAD_BYTES = 64 * 1024
AUDIO_HEAD_BYTES = 512 * 1024
VIDEO_HEAD_BYTES = 2 * 1024 * 1024
PDF_HEAD_BYTES = 256 * 1024


def classify_file(ext: str) -> Tuple[str, Optional[str]]:
    """Map a file extension to a (category, media_type) pair.

    Category is 'structured' or 'non-structured'; media_type is one of
    'image' / 'audio' / 'video' / 'pdf', or None for structured and
    unrecognized extensions.
    """
    ext = ext.lower()
    if ext in STRUCTURED_EXTS:
        return ('structured', None)
    for ext_set, media in ((IMAGE_EXTS, 'image'),
                           (AUDIO_EXTS, 'audio'),
                           (VIDEO_EXTS, 'video'),
                           (PDF_EXTS, 'pdf')):
        if ext in ext_set:
            return ('non-structured', media)
    return ('non-structured', None)
93
+
94
+
95
+ # ===================================================================
96
+ # S3 Client Abstraction
97
+ # ===================================================================
98
+
99
class S3Client:
    """Thin wrapper around boto3 S3 with vendor-specific configuration."""

    # botocore Config keyword overrides per vendor; vendors absent from this
    # table (e.g. 'aws') get no Config object at all.
    _VENDOR_CONFIG: Dict[str, Dict[str, Any]] = {
        'volcengine': {'s3': {'addressing_style': 'virtual'}},
        'alibaba': {'signature_version': 's3', 's3': {'addressing_style': 'virtual'}},
        'tencent': {'s3': {'addressing_style': 'virtual'}},
    }

    def __init__(self, vendor: str, endpoint: str, credential_id: str, credential_secret: str, region: str):
        boto3, BotoConfig = _import_boto3()
        self.vendor = vendor
        overrides = self._VENDOR_CONFIG.get(vendor, {})
        self.client = boto3.client(
            's3',
            endpoint_url=endpoint or None,
            aws_access_key_id=credential_id,
            aws_secret_access_key=credential_secret,
            region_name=region,
            config=BotoConfig(**overrides) if overrides else None,
        )

    def list_objects(self, bucket: str, prefix: str):
        """Yield every object record under *prefix*, following pagination."""
        pages = self.client.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=prefix)
        for page in pages:
            yield from page.get('Contents', [])

    def get_range(self, bucket: str, key: str, start: int, end: int) -> bytes:
        """Download the inclusive byte range [start, end] of an object."""
        resp = self.client.get_object(Bucket=bucket, Key=key, Range=f'bytes={start}-{end}')
        return resp['Body'].read()

    def get_object(self, bucket: str, key: str, max_bytes: Optional[int] = None) -> bytes:
        """Download an object, optionally truncated to its first *max_bytes*."""
        kwargs: Dict[str, Any] = {'Bucket': bucket, 'Key': key}
        if max_bytes:
            kwargs['Range'] = f'bytes=0-{max_bytes - 1}'
        return self.client.get_object(**kwargs)['Body'].read()
140
+
141
+
142
+ # ===================================================================
143
+ # Local FS Abstraction
144
+ # ===================================================================
145
+
146
class LocalClient:
    """Filesystem-backed stand-in for S3Client.

    The *bucket* argument of every method is interpreted as a directory
    path on local disk; keys are paths relative to that directory.
    """

    def __init__(self, root: str):
        self.root = root

    def list_objects(self, bucket: str, prefix: str):
        """Yield S3-style object dicts for every file under bucket/prefix."""
        base = Path(bucket)
        start_dir = base / prefix if prefix and prefix != '.' else base
        for dirpath, _dirs, files in os.walk(start_dir):
            directory = Path(dirpath)
            for fname in files:
                full = directory / fname
                st = full.stat()
                yield {
                    'Key': str(full.relative_to(base)),
                    'Size': st.st_size,
                    'LastModified': datetime.datetime.fromtimestamp(st.st_mtime),
                    'ETag': '',
                    'StorageClass': 'LOCAL',
                    # Extra key (absent from real S3 listings) consumed by Pass 1.
                    '_created_time': datetime.datetime.fromtimestamp(st.st_ctime),
                }

    def get_range(self, bucket: str, key: str, start: int, end: int) -> bytes:
        """Read the inclusive byte range [start, end] of a file."""
        with open(Path(bucket) / key, 'rb') as fh:
            fh.seek(start)
            return fh.read(end - start + 1)

    def get_object(self, bucket: str, key: str, max_bytes: Optional[int] = None) -> bytes:
        """Read a whole file, or only its first *max_bytes* when given."""
        with open(Path(bucket) / key, 'rb') as fh:
            return fh.read(max_bytes) if max_bytes else fh.read()
179
+
180
+
181
+ # ===================================================================
182
+ # Pass 1: File Catalog
183
+ # ===================================================================
184
+
185
def build_file_catalog(client, bucket: str, prefix: str, vendor: str) -> List[Dict]:
    """Pass 1: list every object under *prefix* and build one catalog row each.

    Works with both S3Client and LocalClient (same list_objects contract).
    Directory placeholder keys (ending in '/') are skipped. The has_schema
    and has_media_meta flags start False and are flipped by later passes.
    """
    catalog: List[Dict] = []
    # Fix: datetime.utcnow() is deprecated (naive timestamp; removal slated
    # since Python 3.12). Build the same trailing-'Z' ISO-8601 string from an
    # aware UTC timestamp instead.
    scan_ts = datetime.datetime.now(datetime.timezone.utc).isoformat().replace('+00:00', 'Z')

    for obj in client.list_objects(bucket, prefix):
        key = obj['Key']
        if key.endswith('/'):
            continue  # directory placeholder object, not a real file

        name = os.path.basename(key)
        ext = os.path.splitext(name)[1].lower()
        mime, _ = mimetypes.guess_type(name)
        category, media_type = classify_file(ext)
        etag = obj.get('ETag', '').strip('"')
        depth = key.count('/')
        parent_dir = os.path.basename(os.path.dirname(key)) if '/' in key else ''

        catalog.append({
            'file_path': key,
            'file_name': name,
            'extension': ext,
            'mime_type': mime or '',
            'category': category,
            'media_type': media_type or '',
            'size_bytes': obj.get('Size', 0),
            'last_modified': str(obj.get('LastModified', '')),
            'created_time': str(obj.get('_created_time', '')),
            'etag': etag,
            'storage_class': obj.get('StorageClass', ''),
            # S3 multipart uploads produce ETags of the form '<md5>-<parts>'.
            'is_multipart': '-' in etag,
            'depth': depth,
            'parent_dir': parent_dir,
            'vendor': vendor,
            'bucket': bucket,
            'has_schema': False,
            'has_media_meta': False,
            'scan_timestamp': scan_ts,
        })

    return catalog
225
+
226
+
227
+ # ===================================================================
228
+ # Pass 2: Structured Schema Analysis
229
+ # ===================================================================
230
+
231
def infer_semantic_hint(series) -> Tuple[str, str]:
    """Heuristically classify a pandas Series into a semantic role.

    Returns (hint, detail) where hint is one of: 'constant', 'boolean',
    'numeric', 'structured', 'file_path', 'timestamp', 'id', 'categorical',
    'text'. Checks run in priority order; the first match wins, and any
    failure in the string heuristics falls through to ('text', '').
    """
    # Fix: the original called `pd = _import_pandas()` here but never used
    # `pd` — the argument is already a pandas Series, so the import guard was
    # dead code (its only effect was sys.exit when pandas is absent, which
    # cannot happen once a Series exists).
    non_null = series.dropna()
    n = len(non_null)
    if n == 0:
        return ('constant', 'all null')

    unique_count = non_null.nunique()
    unique_ratio = unique_count / n if n > 0 else 0

    # Note: True == 1 and False == 0 in Python, so 0/1 integer columns also
    # land here — intentional, they behave like flags.
    if set(non_null.unique()).issubset({True, False, 0, 1, 'true', 'false', 'True', 'False'}):
        return ('boolean', f'{unique_count} distinct values')

    if unique_count == 1:
        return ('constant', f'value: {non_null.iloc[0]}')

    dtype_str = str(series.dtype)

    if 'int' in dtype_str or 'float' in dtype_str:
        return ('numeric', dtype_str)

    sample_val = non_null.iloc[0]
    if isinstance(sample_val, (list, dict)):
        return ('structured', type(sample_val).__name__)

    # String-based heuristics; anything raising (odd mixed types, etc.)
    # degrades to the generic 'text' fallback below.
    try:
        str_vals = non_null.astype(str)
        avg_len = str_vals.str.len().mean()

        # Path separators or common object-store / URL scheme prefixes.
        path_pattern = re.compile(r'[\\\\/]|^s3://|^tos://|^gs://|^https?://')
        path_ratio = str_vals.apply(lambda x: bool(path_pattern.search(x))).mean()
        if path_ratio > 0.5:
            return ('file_path', f'{path_ratio:.0%} match path/URI pattern')

        # YYYY-MM-DD or YYYY/MM/DD anywhere in the value.
        ts_pattern = re.compile(r'\d{4}[-/]\d{2}[-/]\d{2}')
        ts_ratio = str_vals.apply(lambda x: bool(ts_pattern.search(x))).mean()
        if ts_ratio > 0.5:
            return ('timestamp', f'{ts_ratio:.0%} match timestamp pattern')

        # Short, almost-all-unique values look like identifiers.
        if unique_ratio > 0.9 and avg_len < 50:
            return ('id', f'unique_ratio={unique_ratio:.2f}, avg_len={avg_len:.1f}')

        if unique_count < 50 or unique_ratio < 0.2:
            return ('categorical', f'{unique_count} categories')

        if avg_len > 50 and unique_ratio > 0.5:
            return ('text', f'avg_len={avg_len:.1f}')

    except Exception:
        pass

    return ('text', '')
283
+
284
+
285
def analyze_structured_file(client, bucket: str, key: str, ext: str, sample_rows: int) -> List[Dict]:
    """Pass 2: sample a structured file and describe each column.

    Downloads at most the first 2 MB of the object, parses it into a pandas
    DataFrame according to *ext*, and returns one dict per column with dtype,
    null/unique stats, up to 3 sample values, and a semantic hint.

    Error contract: download or parse failures return a single-element list
    ``[{'file_path': key, 'error': ...}]``; an empty/unparseable-but-silent
    result returns ``[]``.

    NOTE(review): for parquet/json files larger than 2 MB the truncated
    download is not a valid document (parquet footer / JSON tail is cut off),
    so those files surface as 'parse error' rows — confirm this best-effort
    behavior is intended for large files.
    """
    pd = _import_pandas()
    pa, pq = _import_pyarrow()

    schemas: List[Dict] = []
    # Cap the sample download at 2 MB regardless of file size.
    max_download = 2 * 1024 * 1024

    try:
        raw = client.get_object(bucket, key, max_bytes=max_download)
    except Exception as e:
        return [{'file_path': key, 'error': str(e)}]

    df = None
    # Default format label is the bare extension; '.pq' is normalized below.
    fmt = ext.lstrip('.')

    try:
        if ext in ('.parquet', '.pq'):
            buf = io.BytesIO(raw)
            table = pq.read_table(buf)
            df = table.to_pandas().head(sample_rows)
            fmt = 'parquet'
        elif ext == '.csv':
            df = pd.read_csv(io.BytesIO(raw), nrows=sample_rows, on_bad_lines='skip')
        elif ext == '.tsv':
            df = pd.read_csv(io.BytesIO(raw), sep='\t', nrows=sample_rows, on_bad_lines='skip')
        elif ext in ('.jsonl', '.ndjson'):
            # One JSON object per line; blank lines are skipped.
            lines = raw.decode('utf-8', errors='replace').strip().split('\n')[:sample_rows]
            records = [json.loads(line) for line in lines if line.strip()]
            df = pd.json_normalize(records)
        elif ext == '.json':
            data = json.loads(raw.decode('utf-8', errors='replace'))
            if isinstance(data, list):
                df = pd.json_normalize(data[:sample_rows])
            elif isinstance(data, dict):
                # A single top-level object becomes a one-row frame.
                df = pd.json_normalize([data])
    except Exception as e:
        return [{'file_path': key, 'error': f'parse error: {e}'}]

    # df stays None for JSON whose top level is neither list nor dict.
    if df is None or df.empty:
        return []

    for col in df.columns:
        series = df[col]
        non_null = series.dropna()
        unique_count = int(non_null.nunique()) if len(non_null) > 0 else 0
        non_null_ratio = len(non_null) / len(series) if len(series) > 0 else 0.0

        # Up to 3 distinct example values, stringified; unhashable cell types
        # can make unique() raise, in which case samples stay empty.
        sample_values = []
        try:
            sample_values = [str(v) for v in non_null.unique()[:3]]
        except Exception:
            pass

        hint, detail = infer_semantic_hint(series)

        schemas.append({
            'file_path': key,
            'vendor': '',  # filled in by the caller (main) after analysis
            'bucket': bucket,
            'format': fmt,
            'column_name': str(col),
            'column_type': str(series.dtype),
            'non_null_ratio': round(non_null_ratio, 4),
            'unique_count': unique_count,
            'sample_values': json.dumps(sample_values, ensure_ascii=False),
            'semantic_hint': hint,
            'semantic_detail': detail,
        })

    return schemas
355
+
356
+
357
+ # ===================================================================
358
+ # Pass 3: Media Metadata Extraction
359
+ # ===================================================================
360
+
361
def extract_image_meta(data: bytes) -> Dict:
    """Best-effort image metadata via Pillow.

    Any failure (Pillow missing, unrecognized bytes) leaves the defaults in
    place and records the message under 'extract_error'.
    """
    meta: Dict[str, Any] = {
        'width': 0, 'height': 0, 'image_format': '', 'color_mode': '', 'exif_summary': '{}',
    }
    try:
        from PIL import Image
        img = Image.open(io.BytesIO(data))
        meta['width'] = img.width
        meta['height'] = img.height
        meta['image_format'] = img.format or ''
        meta['color_mode'] = img.mode or ''

        # _getexif is only present on some image plugins; fall back to None.
        exif_data = getattr(img, '_getexif', lambda: None)()
        exif: Dict[str, str] = {}
        if exif_data:
            # Keep at most 10 tags, each value capped at 100 chars.
            for tag_id, value in list(exif_data.items())[:10]:
                try:
                    from PIL.ExifTags import TAGS
                    exif[TAGS.get(tag_id, str(tag_id))] = str(value)[:100]
                except Exception:
                    pass
        meta['exif_summary'] = json.dumps(exif, ensure_ascii=False)
    except Exception as e:
        meta['extract_error'] = str(e)
    return meta
387
+
388
+
389
def extract_audio_meta(data: bytes) -> Dict:
    """Best-effort audio metadata via mutagen.

    Any failure (mutagen missing, undecodable bytes) leaves the defaults in
    place and records the message under 'extract_error'.
    """
    meta: Dict[str, Any] = {
        'duration_sec': 0.0, 'codec': '', 'sample_rate': 0, 'channels': 0,
        'bitrate': 0, 'tags_summary': '{}',
    }
    try:
        import mutagen
        parsed = mutagen.File(io.BytesIO(data))
        if parsed:
            stream_info = getattr(parsed, 'info', None)
            if stream_info:
                meta['duration_sec'] = round(getattr(stream_info, 'length', 0.0), 2)
                meta['sample_rate'] = getattr(stream_info, 'sample_rate', 0)
                meta['channels'] = getattr(stream_info, 'channels', 0)
                meta['bitrate'] = getattr(stream_info, 'bitrate', 0)
                # The info class name doubles as a codec label (e.g. MP3Info).
                meta['codec'] = type(stream_info).__name__

            # Keep at most 10 tags, each value capped at 100 chars.
            tags: Dict[str, str] = {}
            if parsed.tags:
                for tag_key in list(parsed.tags.keys())[:10]:
                    try:
                        tags[str(tag_key)] = str(parsed.tags[tag_key])[:100]
                    except Exception:
                        pass
            meta['tags_summary'] = json.dumps(tags, ensure_ascii=False)
    except Exception as e:
        meta['extract_error'] = str(e)
    return meta
417
+
418
+
419
def extract_video_meta(data: bytes) -> Dict:
    """Identify a video container from magic bytes in the downloaded head.

    Width/height/duration stay at their defaults: deriving them would need a
    real demuxer, and only the leading bytes of the file are available here.
    """
    meta: Dict[str, Any] = {
        'width': 0, 'height': 0, 'duration_sec': 0.0, 'container': '',
    }
    try:
        if data.startswith(b'\x1a\x45\xdf\xa3'):
            # EBML header shared by Matroska and WebM.
            meta['container'] = 'mkv/webm'
        elif len(data) > 8 and data[4:8] == b'ftyp':
            # ISO BMFF (mp4/mov/...): the major brand follows the ftyp tag.
            meta['container'] = data[8:12].decode('ascii', errors='replace').strip()
        elif data.startswith(b'RIFF'):
            meta['container'] = 'avi'
        elif data.startswith(b'FLV'):
            meta['container'] = 'flv'
    except Exception as e:
        meta['extract_error'] = str(e)
    return meta
436
+
437
+
438
def extract_pdf_meta(data: bytes) -> Dict:
    """Best-effort PDF metadata via PyMuPDF (fitz).

    Any failure (fitz missing, corrupt/truncated bytes) leaves the defaults
    in place and records the message under 'extract_error'.
    """
    meta: Dict[str, Any] = {
        'page_count': 0, 'pdf_title': '', 'pdf_author': '',
        'creation_date': '', 'encrypted': False,
        'page_width_pt': 0.0, 'page_height_pt': 0.0,
    }
    try:
        import fitz
        doc = fitz.open(stream=data, filetype='pdf')
        info = doc.metadata or {}
        meta['page_count'] = doc.page_count
        meta['pdf_title'] = info.get('title', '')
        meta['pdf_author'] = info.get('author', '')
        meta['creation_date'] = info.get('creationDate', '')
        meta['encrypted'] = doc.is_encrypted

        # Page geometry (PDF points) taken from the first page only.
        if doc.page_count > 0:
            first_rect = doc[0].rect
            meta['page_width_pt'] = round(first_rect.width, 2)
            meta['page_height_pt'] = round(first_rect.height, 2)
        doc.close()
    except Exception as e:
        meta['extract_error'] = str(e)
    return meta
463
+
464
+
465
def extract_media_metadata(client, bucket: str, key: str, media_type: str) -> Dict:
    """Pass 3 dispatcher: download a file head and extract by media type.

    Only the per-type byte budget is downloaded (image budget is the
    fallback for unknown types). A failed download short-circuits with an
    'extract_error' dict; an unknown media_type yields {}.
    """
    budgets = {
        'image': IMAGE_HEAD_BYTES,
        'audio': AUDIO_HEAD_BYTES,
        'video': VIDEO_HEAD_BYTES,
        'pdf': PDF_HEAD_BYTES,
    }
    extractors = {
        'image': extract_image_meta,
        'audio': extract_audio_meta,
        'video': extract_video_meta,
        'pdf': extract_pdf_meta,
    }

    try:
        data = client.get_object(bucket, key, max_bytes=budgets.get(media_type, IMAGE_HEAD_BYTES))
    except Exception as e:
        return {'extract_error': f'download failed: {e}'}

    extractor = extractors.get(media_type)
    return extractor(data) if extractor else {}
487
+
488
+
489
+ # ===================================================================
490
+ # LanceDB Writer
491
+ # ===================================================================
492
+
493
def write_to_lancedb(db_path: str, table_name: str, records: List[Dict]):
    """Replace *table_name* in the LanceDB at *db_path* with *records*.

    No-op for an empty record list. Any existing table with the same name is
    dropped first, so every run produces a fresh snapshot.
    """
    if not records:
        return
    lancedb = _import_lancedb()
    pd = _import_pandas()

    db = lancedb.connect(db_path)
    if table_name in db.table_names():
        db.drop_table(table_name)
    db.create_table(table_name, data=pd.DataFrame(records))
    print(f" [LanceDB] Wrote {len(records)} records to '{table_name}'")
508
+
509
+
510
+ # ===================================================================
511
+ # Main
512
+ # ===================================================================
513
+
514
def main():
    """CLI entry point: run the three profiling passes and persist results.

    Pass 1 catalogs every object, Pass 2 samples structured files for column
    schemas, Pass 3 reads media file headers; all three result sets are
    written to LanceDB tables, and a one-line JSON summary is printed to
    stdout for the Node.js caller to parse.
    """
    parser = argparse.ArgumentParser(description='S3-compatible data profiler -> LanceDB')
    parser.add_argument('--vendor', required=True,
                        choices=['volcengine', 'alibaba', 'tencent', 'aws', 'local'])
    parser.add_argument('--endpoint', default='')
    parser.add_argument('--ak', default='', dest='cred_id',
                        help='Access credential ID')
    parser.add_argument('--sk', default='', dest='cred_secret',
                        help='Access credential value')
    parser.add_argument('--region', default='')
    # For --vendor local, --bucket is a directory path and --prefix a
    # subdirectory within it.
    parser.add_argument('--bucket', required=True)
    parser.add_argument('--prefix', required=True)
    parser.add_argument('--db-path', default='./catalog_db')
    parser.add_argument('--sample-rows', type=int, default=100)
    args = parser.parse_args()

    print(f"[las-data-profiler] vendor={args.vendor}, bucket={args.bucket}, prefix={args.prefix}")
    print(f"[las-data-profiler] db_path={args.db_path}")

    # Both client classes expose the same list_objects/get_object interface.
    if args.vendor == 'local':
        client = LocalClient(args.bucket)
    else:
        client = S3Client(
            vendor=args.vendor,
            endpoint=args.endpoint,
            credential_id=args.cred_id,
            credential_secret=args.cred_secret,
            region=args.region,
        )

    # ---- Pass 1: File Catalog ----
    print("\n[Pass 1] Scanning files...")
    catalog = build_file_catalog(client, args.bucket, args.prefix, args.vendor)
    print(f" Found {len(catalog)} files")

    # ---- Pass 2: Structured Schemas ----
    print("\n[Pass 2] Analyzing structured data...")
    structured_files = [f for f in catalog if f['category'] == 'structured']
    all_schemas: List[Dict] = []
    for i, entry in enumerate(structured_files):
        key = entry['file_path']
        ext = entry['extension']
        print(f" [{i+1}/{len(structured_files)}] {key}")
        schemas = analyze_structured_file(client, args.bucket, key, ext, args.sample_rows)
        # analyze_structured_file leaves 'vendor' blank; stamp it here.
        for s in schemas:
            s['vendor'] = args.vendor
        all_schemas.extend(schemas)
        entry['has_schema'] = True

    print(f" Analyzed {len(structured_files)} files, {len(all_schemas)} column records")

    # ---- Pass 3: Media Metadata ----
    print("\n[Pass 3] Extracting media metadata...")
    media_files = [f for f in catalog if f['media_type'] in ('image', 'audio', 'video', 'pdf')]
    all_media_meta: List[Dict] = []
    for i, entry in enumerate(media_files):
        key = entry['file_path']
        media_type = entry['media_type']
        print(f" [{i+1}/{len(media_files)}] {key} ({media_type})")
        meta = extract_media_metadata(client, args.bucket, key, media_type)
        meta['file_path'] = key
        meta['vendor'] = args.vendor
        meta['bucket'] = args.bucket
        meta['media_type'] = media_type
        # Normalize every row to the union of all media columns so LanceDB
        # gets a uniform schema across image/audio/video/pdf rows.
        # NOTE(review): for a missing key, meta.get(col) is None, so the ''
        # branch below never fires and every absent field defaults to 0 —
        # including text-like columns (codec, pdf_title, ...). Confirm the
        # LanceDB schema tolerates these mixed types.
        for col in ['width', 'height', 'image_format', 'color_mode', 'exif_summary',
                    'duration_sec', 'codec', 'sample_rate', 'channels', 'bitrate',
                    'tags_summary', 'container',
                    'page_count', 'pdf_title', 'pdf_author', 'creation_date',
                    'encrypted', 'page_width_pt', 'page_height_pt', 'extract_error']:
            meta.setdefault(col, '' if isinstance(meta.get(col), str) else 0)
        all_media_meta.append(meta)
        entry['has_media_meta'] = True

    print(f" Extracted metadata for {len(media_files)} media files")

    # ---- Write to LanceDB ----
    print(f"\n[LanceDB] Writing to {args.db_path}")
    write_to_lancedb(args.db_path, 'file_catalog', catalog)
    write_to_lancedb(args.db_path, 'structured_schemas', all_schemas)
    write_to_lancedb(args.db_path, 'media_metadata', all_media_meta)

    # ---- Summary JSON (stdout, for Node.js to parse) ----
    summary = {
        'summary': {
            'total_files': len(catalog),
            'structured_files': len(structured_files),
            'media_files': len(media_files),
        }
    }
    print(f"\n{json.dumps(summary)}")
    print("\n[las-data-profiler] Done!")
605
+
606
+
607
# Script entry point: only run the profiler when executed directly.
if __name__ == '__main__':
    main()
@@ -0,0 +1,13 @@
1
+ export function getPluginConfig(ctx: any) {
2
+ return ctx.config?.plugins?.entries?.['contextlake-openclaw']?.config || {
3
+ metadata_storage: {
4
+ type: 'local',
5
+ lancedb_uri: require('path').join(require('os').homedir(), '.openclaw', 'contextlake', 'data'),
6
+ embedding: {
7
+ provider: 'local',
8
+ model_name: 'hf:CompendiumLabs/bge-small-zh-v1.5-gguf/bge-small-zh-v1.5-f16.gguf'
9
+ }
10
+ },
11
+ file_storage: { type: 'local', local_base_dir: require('path').join(require('os').homedir(), '.openclaw', 'contextlake', 'files') }
12
+ };
13
+ }