@byted-las/contextlake-openclaw 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +64 -0
- package/bin/contextlake-openclaw.js +5 -0
- package/dist/index.d.ts +113 -0
- package/dist/index.js +73 -0
- package/dist/src/client/lancedb.d.ts +30 -0
- package/dist/src/client/lancedb.js +113 -0
- package/dist/src/client/tos.d.ts +19 -0
- package/dist/src/client/tos.js +81 -0
- package/dist/src/commands/cli.d.ts +6 -0
- package/dist/src/commands/cli.js +78 -0
- package/dist/src/commands/index.d.ts +1 -0
- package/dist/src/commands/index.js +139 -0
- package/dist/src/commands/slashcmd.d.ts +14 -0
- package/dist/src/commands/slashcmd.js +91 -0
- package/dist/src/commands/tools.d.ts +219 -0
- package/dist/src/commands/tools.js +286 -0
- package/dist/src/lib/actions/ingest.d.ts +8 -0
- package/dist/src/lib/actions/ingest.js +123 -0
- package/dist/src/lib/actions/manage.d.ts +15 -0
- package/dist/src/lib/actions/manage.js +91 -0
- package/dist/src/lib/actions/retrieve.d.ts +8 -0
- package/dist/src/lib/actions/retrieve.js +73 -0
- package/dist/src/processor/loader.d.ts +7 -0
- package/dist/src/processor/loader.js +83 -0
- package/dist/src/service/embedding/factory.d.ts +2 -0
- package/dist/src/service/embedding/factory.js +16 -0
- package/dist/src/service/embedding/interface.d.ts +18 -0
- package/dist/src/service/embedding/interface.js +2 -0
- package/dist/src/service/embedding/local.d.ts +14 -0
- package/dist/src/service/embedding/local.js +104 -0
- package/dist/src/service/embedding/remote.d.ts +9 -0
- package/dist/src/service/embedding/remote.js +42 -0
- package/dist/src/service/metadata/factory.d.ts +13 -0
- package/dist/src/service/metadata/factory.js +48 -0
- package/dist/src/service/metadata/interface.d.ts +17 -0
- package/dist/src/service/metadata/interface.js +2 -0
- package/dist/src/service/metadata/local.d.ts +13 -0
- package/dist/src/service/metadata/local.js +49 -0
- package/dist/src/service/storage/factory.d.ts +2 -0
- package/dist/src/service/storage/factory.js +19 -0
- package/dist/src/service/storage/interface.d.ts +32 -0
- package/dist/src/service/storage/interface.js +2 -0
- package/dist/src/service/storage/local.d.ts +9 -0
- package/dist/src/service/storage/local.js +72 -0
- package/dist/src/skills/las-data-profiler/index.d.ts +26 -0
- package/dist/src/skills/las-data-profiler/index.js +231 -0
- package/dist/src/skills/las-data-profiler/register.d.ts +1 -0
- package/dist/src/skills/las-data-profiler/register.js +19 -0
- package/dist/src/utils/config.d.ts +1 -0
- package/dist/src/utils/config.js +16 -0
- package/index.ts +78 -0
- package/openclaw.plugin.json +57 -0
- package/package.json +52 -0
- package/src/client/lancedb.ts +102 -0
- package/src/client/tos.ts +100 -0
- package/src/commands/cli.ts +77 -0
- package/src/commands/index.ts +156 -0
- package/src/commands/slashcmd.ts +95 -0
- package/src/commands/tools.ts +286 -0
- package/src/lib/actions/ingest.ts +103 -0
- package/src/lib/actions/manage.ts +107 -0
- package/src/lib/actions/retrieve.ts +90 -0
- package/src/processor/loader.ts +58 -0
- package/src/service/embedding/factory.ts +13 -0
- package/src/service/embedding/interface.ts +21 -0
- package/src/service/embedding/local.ts +118 -0
- package/src/service/embedding/remote.ts +45 -0
- package/src/service/metadata/factory.ts +52 -0
- package/src/service/metadata/interface.ts +19 -0
- package/src/service/metadata/local.ts +60 -0
- package/src/service/storage/factory.ts +16 -0
- package/src/service/storage/interface.ts +36 -0
- package/src/service/storage/local.ts +42 -0
- package/src/skills/contextlake-delete/SKILL.md +36 -0
- package/src/skills/contextlake-ingest/SKILL.md +40 -0
- package/src/skills/contextlake-list/SKILL.md +22 -0
- package/src/skills/contextlake-retrieve/SKILL.md +37 -0
- package/src/skills/las-data-profiler/SKILL.md +174 -0
- package/src/skills/las-data-profiler/index.ts +254 -0
- package/src/skills/las-data-profiler/register.ts +19 -0
- package/src/skills/las-data-profiler/s3_catalog.py +608 -0
- package/src/utils/config.ts +13 -0
|
@@ -0,0 +1,608 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
s3_catalog.py -- Data source profiling tool for LanceDB
|
|
4
|
+
|
|
5
|
+
Three-pass scanning:
|
|
6
|
+
Pass 1: Walk all files -> file_catalog
|
|
7
|
+
Pass 2: Sample structured data -> structured_schemas
|
|
8
|
+
Pass 3: Read media file headers -> media_metadata
|
|
9
|
+
|
|
10
|
+
Supported sources: volcengine TOS / alibaba OSS / tencent COS / aws S3 / local
|
|
11
|
+
Output: LanceDB tables (file_catalog, structured_schemas, media_metadata)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import datetime
|
|
16
|
+
import io
|
|
17
|
+
import json
|
|
18
|
+
import mimetypes
|
|
19
|
+
import os
|
|
20
|
+
import re
|
|
21
|
+
import sys
|
|
22
|
+
import time
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
25
|
+
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
# Lazy imports
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
def _import_boto3():
|
|
31
|
+
try:
|
|
32
|
+
import boto3
|
|
33
|
+
from botocore.config import Config as BotoConfig
|
|
34
|
+
return boto3, BotoConfig
|
|
35
|
+
except ImportError:
|
|
36
|
+
print("ERROR: boto3 not installed. Run: pip install boto3", file=sys.stderr)
|
|
37
|
+
sys.exit(1)
|
|
38
|
+
|
|
39
|
+
def _import_lancedb():
|
|
40
|
+
try:
|
|
41
|
+
import lancedb
|
|
42
|
+
return lancedb
|
|
43
|
+
except ImportError:
|
|
44
|
+
print("ERROR: lancedb not installed. Run: pip install lancedb", file=sys.stderr)
|
|
45
|
+
sys.exit(1)
|
|
46
|
+
|
|
47
|
+
def _import_pandas():
|
|
48
|
+
try:
|
|
49
|
+
import pandas as pd
|
|
50
|
+
return pd
|
|
51
|
+
except ImportError:
|
|
52
|
+
print("ERROR: pandas not installed. Run: pip install pandas", file=sys.stderr)
|
|
53
|
+
sys.exit(1)
|
|
54
|
+
|
|
55
|
+
def _import_pyarrow():
|
|
56
|
+
try:
|
|
57
|
+
import pyarrow as pa
|
|
58
|
+
import pyarrow.parquet as pq
|
|
59
|
+
return pa, pq
|
|
60
|
+
except ImportError:
|
|
61
|
+
print("ERROR: pyarrow not installed. Run: pip install pyarrow", file=sys.stderr)
|
|
62
|
+
sys.exit(1)
|
|
63
|
+
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
# Constants
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
|
|
68
|
+
# Extension sets used to classify files into profiling passes.
STRUCTURED_EXTS = {'.json', '.jsonl', '.ndjson', '.csv', '.tsv', '.parquet', '.pq'}
IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.tiff', '.tif', '.svg', '.ico', '.heic', '.heif'}
AUDIO_EXTS = {'.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a', '.wma', '.opus'}
VIDEO_EXTS = {'.mp4', '.avi', '.mov', '.mkv', '.webm', '.wmv', '.flv', '.m4v', '.3gp'}
PDF_EXTS = {'.pdf'}

# Per-media-type caps on how many header bytes Pass 3 downloads.
IMAGE_HEAD_BYTES = 64 * 1024
AUDIO_HEAD_BYTES = 512 * 1024
VIDEO_HEAD_BYTES = 2 * 1024 * 1024
PDF_HEAD_BYTES = 256 * 1024


def classify_file(ext: str) -> Tuple[str, Optional[str]]:
    """Map a file extension to a (category, media_type) pair.

    category is 'structured' or 'non-structured'; media_type is one of
    'image'/'audio'/'video'/'pdf' or None when unknown/structured.
    """
    ext = ext.lower()
    if ext in STRUCTURED_EXTS:
        return ('structured', None)
    media_tables = (
        (IMAGE_EXTS, 'image'),
        (AUDIO_EXTS, 'audio'),
        (VIDEO_EXTS, 'video'),
        (PDF_EXTS, 'pdf'),
    )
    for exts, media in media_tables:
        if ext in exts:
            return ('non-structured', media)
    return ('non-structured', None)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# ===================================================================
|
|
96
|
+
# S3 Client Abstraction
|
|
97
|
+
# ===================================================================
|
|
98
|
+
|
|
99
|
+
class S3Client:
    """Thin wrapper around boto3 S3 with vendor-specific configuration."""

    def __init__(self, vendor: str, endpoint: str, credential_id: str, credential_secret: str, region: str):
        """Build a boto3 S3 client tuned for the given vendor.

        Args:
            vendor: 'volcengine', 'alibaba', 'tencent', or 'aws'.
            endpoint: Custom endpoint URL; empty string falls through to the
                boto3 default endpoint.
            credential_id: Access key ID.
            credential_secret: Access key secret.
            region: Region name, passed straight to boto3.
        """
        boto3, BotoConfig = _import_boto3()
        self.vendor = vendor

        config_kwargs: Dict[str, Any] = {}
        # Non-AWS S3-compatible stores are configured for virtual-hosted-style
        # addressing (bucket encoded in the hostname) rather than path-style.
        if vendor == 'volcengine':
            config_kwargs['s3'] = {'addressing_style': 'virtual'}
        elif vendor == 'alibaba':
            # NOTE(review): OSS is additionally pinned to the legacy 's3'
            # (SigV2) signature here — presumably for compatibility; confirm
            # against current OSS S3-compatibility docs.
            config_kwargs['signature_version'] = 's3'
            config_kwargs['s3'] = {'addressing_style': 'virtual'}
        elif vendor == 'tencent':
            config_kwargs['s3'] = {'addressing_style': 'virtual'}

        self.client = boto3.client(
            's3',
            endpoint_url=endpoint or None,
            aws_access_key_id=credential_id,
            aws_secret_access_key=credential_secret,
            region_name=region,
            config=BotoConfig(**config_kwargs) if config_kwargs else None,
        )

    def list_objects(self, bucket: str, prefix: str):
        """Yield every object dict under *prefix*, following pagination."""
        paginator = self.client.get_paginator('list_objects_v2')
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            # 'Contents' is absent on empty pages; default keeps iteration safe.
            for obj in page.get('Contents', []):
                yield obj

    def get_range(self, bucket: str, key: str, start: int, end: int) -> bytes:
        """Fetch the inclusive byte range [start, end] of an object."""
        resp = self.client.get_object(Bucket=bucket, Key=key, Range=f'bytes={start}-{end}')
        return resp['Body'].read()

    def get_object(self, bucket: str, key: str, max_bytes: Optional[int] = None) -> bytes:
        """Fetch an object's bytes, optionally truncated to the first max_bytes."""
        kwargs: Dict[str, Any] = {'Bucket': bucket, 'Key': key}
        if max_bytes:
            kwargs['Range'] = f'bytes=0-{max_bytes - 1}'
        resp = self.client.get_object(**kwargs)
        return resp['Body'].read()
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
# ===================================================================
|
|
143
|
+
# Local FS Abstraction
|
|
144
|
+
# ===================================================================
|
|
145
|
+
|
|
146
|
+
class LocalClient:
    """Filesystem-backed client mirroring the S3Client read interface.

    The *bucket* argument of each method is treated as a local root directory
    and *key* as a path relative to it.
    """

    def __init__(self, root: str):
        # Kept for interface symmetry; reads resolve against the bucket argument.
        self.root = root

    def list_objects(self, bucket: str, prefix: str):
        """Yield S3-shaped object dicts for every file under bucket/prefix."""
        base = Path(bucket)
        start_dir = base / prefix if prefix and prefix != '.' else base
        for dirpath, _subdirs, filenames in os.walk(start_dir):
            directory = Path(dirpath)
            for filename in filenames:
                path = directory / filename
                info = path.stat()
                yield {
                    'Key': str(path.relative_to(base)),
                    'Size': info.st_size,
                    'LastModified': datetime.datetime.fromtimestamp(info.st_mtime),
                    'ETag': '',
                    'StorageClass': 'LOCAL',
                    '_created_time': datetime.datetime.fromtimestamp(info.st_ctime),
                }

    def get_range(self, bucket: str, key: str, start: int, end: int) -> bytes:
        """Read the inclusive byte range [start, end] of a file."""
        with (Path(bucket) / key).open('rb') as handle:
            handle.seek(start)
            return handle.read(end - start + 1)

    def get_object(self, bucket: str, key: str, max_bytes: Optional[int] = None) -> bytes:
        """Read a file's bytes, optionally truncated to the first max_bytes."""
        with (Path(bucket) / key).open('rb') as handle:
            return handle.read(max_bytes) if max_bytes else handle.read()
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
# ===================================================================
|
|
182
|
+
# Pass 1: File Catalog
|
|
183
|
+
# ===================================================================
|
|
184
|
+
|
|
185
|
+
def build_file_catalog(client, bucket: str, prefix: str, vendor: str) -> List[Dict]:
    """Pass 1: walk every object under bucket/prefix and record basic file facts.

    Args:
        client: S3Client or LocalClient exposing ``list_objects(bucket, prefix)``.
        bucket: Bucket name (or local root directory for LocalClient).
        prefix: Key prefix to scan under.
        vendor: Source vendor label stored on every row.

    Returns:
        One dict per file: path/name/extension, classification, size, timestamps,
        and bookkeeping flags (``has_schema``/``has_media_meta``) that later
        passes flip in place.
    """
    catalog: List[Dict] = []
    # datetime.utcnow() is deprecated (Python 3.12+); use an aware UTC time and
    # strip tzinfo so the resulting '...<microseconds>Z' string is unchanged.
    scan_ts = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() + 'Z'

    for obj in client.list_objects(bucket, prefix):
        key = obj['Key']
        # Zero-byte "directory" placeholder objects end with '/'; skip them.
        if key.endswith('/'):
            continue

        name = os.path.basename(key)
        ext = os.path.splitext(name)[1].lower()
        mime, _ = mimetypes.guess_type(name)
        category, media_type = classify_file(ext)
        etag = obj.get('ETag', '').strip('"')
        depth = key.count('/')
        parent_dir = os.path.basename(os.path.dirname(key)) if '/' in key else ''

        catalog.append({
            'file_path': key,
            'file_name': name,
            'extension': ext,
            'mime_type': mime or '',
            'category': category,
            'media_type': media_type or '',
            'size_bytes': obj.get('Size', 0),
            'last_modified': str(obj.get('LastModified', '')),
            'created_time': str(obj.get('_created_time', '')),
            'etag': etag,
            'storage_class': obj.get('StorageClass', ''),
            # Multipart uploads get a '<md5>-<parts>' ETag on S3-compatible stores.
            'is_multipart': '-' in etag,
            'depth': depth,
            'parent_dir': parent_dir,
            'vendor': vendor,
            'bucket': bucket,
            # Flipped by Pass 2 / Pass 3 once deeper analysis runs.
            'has_schema': False,
            'has_media_meta': False,
            'scan_timestamp': scan_ts,
        })

    return catalog
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# ===================================================================
|
|
228
|
+
# Pass 2: Structured Schema Analysis
|
|
229
|
+
# ===================================================================
|
|
230
|
+
|
|
231
|
+
def infer_semantic_hint(series) -> Tuple[str, str]:
    """Heuristically classify a pandas Series' semantic role.

    Returns (hint, detail) where hint is one of: 'constant', 'boolean',
    'numeric', 'structured', 'file_path', 'timestamp', 'id', 'categorical',
    'text'. Checks run from most to least specific; the first match wins.

    Fix: the original bound ``pd = _import_pandas()`` and never used it — the
    argument is already a pandas Series, so pandas is necessarily importable;
    the unused binding is removed.
    """
    non_null = series.dropna()
    n = len(non_null)
    if n == 0:
        return ('constant', 'all null')

    unique_count = non_null.nunique()
    unique_ratio = unique_count / n if n > 0 else 0

    # Boolean-ish: every value drawn from common truthy/falsy literals.
    if set(non_null.unique()).issubset({True, False, 0, 1, 'true', 'false', 'True', 'False'}):
        return ('boolean', f'{unique_count} distinct values')

    if unique_count == 1:
        return ('constant', f'value: {non_null.iloc[0]}')

    dtype_str = str(series.dtype)

    if 'int' in dtype_str or 'float' in dtype_str:
        return ('numeric', dtype_str)

    sample_val = non_null.iloc[0]
    if isinstance(sample_val, (list, dict)):
        return ('structured', type(sample_val).__name__)

    try:
        str_vals = non_null.astype(str)
        avg_len = str_vals.str.len().mean()

        # Path separators or common object-store / URL schemes.
        path_pattern = re.compile(r'[\\\\/]|^s3://|^tos://|^gs://|^https?://')
        path_ratio = str_vals.apply(lambda x: bool(path_pattern.search(x))).mean()
        if path_ratio > 0.5:
            return ('file_path', f'{path_ratio:.0%} match path/URI pattern')

        # ISO-ish dates: YYYY-MM-DD or YYYY/MM/DD anywhere in the string.
        ts_pattern = re.compile(r'\d{4}[-/]\d{2}[-/]\d{2}')
        ts_ratio = str_vals.apply(lambda x: bool(ts_pattern.search(x))).mean()
        if ts_ratio > 0.5:
            return ('timestamp', f'{ts_ratio:.0%} match timestamp pattern')

        # Mostly-unique short strings look like identifiers.
        if unique_ratio > 0.9 and avg_len < 50:
            return ('id', f'unique_ratio={unique_ratio:.2f}, avg_len={avg_len:.1f}')

        # Low cardinality relative to row count suggests an enum/category.
        if unique_count < 50 or unique_ratio < 0.2:
            return ('categorical', f'{unique_count} categories')

        if avg_len > 50 and unique_ratio > 0.5:
            return ('text', f'avg_len={avg_len:.1f}')

    except Exception:
        # Heuristics are best-effort; fall through to the generic bucket.
        pass

    return ('text', '')
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def analyze_structured_file(client, bucket: str, key: str, ext: str, sample_rows: int) -> List[Dict]:
    """Pass 2: sample one structured file and profile every column.

    Downloads at most the first 2 MiB of the object, parses it according to
    *ext*, and returns one record per column. On download or parse failure it
    returns a single ``{'file_path', 'error'}`` record; an empty/unparseable
    sample yields ``[]``.
    """
    pd = _import_pandas()
    pa, pq = _import_pyarrow()  # pa is unused here; pq does the parquet work

    schemas: List[Dict] = []
    # Cap the download: enough for headers plus a row sample.
    max_download = 2 * 1024 * 1024

    try:
        raw = client.get_object(bucket, key, max_bytes=max_download)
    except Exception as e:
        return [{'file_path': key, 'error': str(e)}]

    df = None
    fmt = ext.lstrip('.')

    try:
        if ext in ('.parquet', '.pq'):
            # NOTE(review): parquet keeps its footer at EOF, so a truncated
            # 2 MiB download of a larger file will fail here and surface as a
            # parse error — confirm this is the intended trade-off.
            buf = io.BytesIO(raw)
            table = pq.read_table(buf)
            df = table.to_pandas().head(sample_rows)
            fmt = 'parquet'
        elif ext == '.csv':
            # A truncated final line is tolerated via on_bad_lines='skip'.
            df = pd.read_csv(io.BytesIO(raw), nrows=sample_rows, on_bad_lines='skip')
        elif ext == '.tsv':
            df = pd.read_csv(io.BytesIO(raw), sep='\t', nrows=sample_rows, on_bad_lines='skip')
        elif ext in ('.jsonl', '.ndjson'):
            # One JSON document per line; a truncated last line raises and is
            # reported as a parse error below.
            lines = raw.decode('utf-8', errors='replace').strip().split('\n')[:sample_rows]
            records = [json.loads(line) for line in lines if line.strip()]
            df = pd.json_normalize(records)
        elif ext == '.json':
            data = json.loads(raw.decode('utf-8', errors='replace'))
            if isinstance(data, list):
                df = pd.json_normalize(data[:sample_rows])
            elif isinstance(data, dict):
                df = pd.json_normalize([data])
    except Exception as e:
        return [{'file_path': key, 'error': f'parse error: {e}'}]

    if df is None or df.empty:
        return []

    # Per-column profile: fill ratio, cardinality, value samples, semantic guess.
    for col in df.columns:
        series = df[col]
        non_null = series.dropna()
        unique_count = int(non_null.nunique()) if len(non_null) > 0 else 0
        non_null_ratio = len(non_null) / len(series) if len(series) > 0 else 0.0

        sample_values = []
        try:
            sample_values = [str(v) for v in non_null.unique()[:3]]
        except Exception:
            # unhashable cells (lists/dicts) make unique() fail; samples stay empty
            pass

        hint, detail = infer_semantic_hint(series)

        schemas.append({
            'file_path': key,
            'vendor': '',  # left blank here; the caller (main) fills it in
            'bucket': bucket,
            'format': fmt,
            'column_name': str(col),
            'column_type': str(series.dtype),
            'non_null_ratio': round(non_null_ratio, 4),
            'unique_count': unique_count,
            'sample_values': json.dumps(sample_values, ensure_ascii=False),
            'semantic_hint': hint,
            'semantic_detail': detail,
        })

    return schemas
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
# ===================================================================
|
|
358
|
+
# Pass 3: Media Metadata Extraction
|
|
359
|
+
# ===================================================================
|
|
360
|
+
|
|
361
|
+
def extract_image_meta(data: bytes) -> Dict:
    """Extract dimensions, format, color mode and a small EXIF sample via Pillow.

    Any failure (Pillow missing, unreadable image) is recorded under
    'extract_error' and the zero/empty defaults are returned.
    """
    meta: Dict[str, Any] = {
        'width': 0, 'height': 0, 'image_format': '', 'color_mode': '', 'exif_summary': '{}',
    }
    try:
        from PIL import Image
        img = Image.open(io.BytesIO(data))
        meta['width'] = img.width
        meta['height'] = img.height
        meta['image_format'] = img.format or ''
        meta['color_mode'] = img.mode or ''

        # _getexif is format-dependent; fall back to a no-op when absent.
        exif_data = getattr(img, '_getexif', lambda: None)()
        exif_sample = {}
        if exif_data:
            # Keep at most 10 tags, values clipped to 100 chars.
            for tag_id, value in list(exif_data.items())[:10]:
                try:
                    from PIL.ExifTags import TAGS
                    exif_sample[TAGS.get(tag_id, str(tag_id))] = str(value)[:100]
                except Exception:
                    pass
        meta['exif_summary'] = json.dumps(exif_sample, ensure_ascii=False)
    except Exception as e:
        meta['extract_error'] = str(e)
    return meta
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def extract_audio_meta(data: bytes) -> Dict:
    """Probe audio stream info and a small tag sample via mutagen.

    Any failure (mutagen missing, unreadable stream) is recorded under
    'extract_error' and the zero/empty defaults are returned.
    """
    meta: Dict[str, Any] = {
        'duration_sec': 0.0, 'codec': '', 'sample_rate': 0, 'channels': 0,
        'bitrate': 0, 'tags_summary': '{}',
    }
    try:
        import mutagen
        parsed = mutagen.File(io.BytesIO(data))
        if parsed:
            stream = getattr(parsed, 'info', None)
            if stream:
                meta['duration_sec'] = round(getattr(stream, 'length', 0.0), 2)
                meta['sample_rate'] = getattr(stream, 'sample_rate', 0)
                meta['channels'] = getattr(stream, 'channels', 0)
                meta['bitrate'] = getattr(stream, 'bitrate', 0)
                # The info class name doubles as a codec label (e.g. MP3Info).
                meta['codec'] = type(stream).__name__
            tag_sample = {}
            if parsed.tags:
                # Keep at most 10 tags, values clipped to 100 chars.
                for tag_key in list(parsed.tags.keys())[:10]:
                    try:
                        tag_sample[str(tag_key)] = str(parsed.tags[tag_key])[:100]
                    except Exception:
                        pass
            meta['tags_summary'] = json.dumps(tag_sample, ensure_ascii=False)
    except Exception as e:
        meta['extract_error'] = str(e)
    return meta
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def extract_video_meta(data: bytes) -> Dict:
    """Identify the video container from magic bytes in the file header.

    width/height/duration stay at their zero defaults — only the container is
    sniffed here; unexpected failures land in 'extract_error'.
    """
    meta: Dict[str, Any] = {
        'width': 0, 'height': 0, 'duration_sec': 0.0, 'container': '',
    }
    try:
        magic = data[:4]
        if magic == b'\x1a\x45\xdf\xa3':
            # EBML magic shared by Matroska and WebM.
            meta['container'] = 'mkv/webm'
        elif len(data) > 8 and data[4:8] == b'ftyp':
            # ISO BMFF: the brand string follows the 'ftyp' box tag.
            meta['container'] = data[8:12].decode('ascii', errors='replace').strip()
        elif magic == b'RIFF':
            meta['container'] = 'avi'
        elif data[:3] == b'FLV':
            meta['container'] = 'flv'
    except Exception as e:
        meta['extract_error'] = str(e)
    return meta
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def extract_pdf_meta(data: bytes) -> Dict:
    """Read PDF page count, basic document info and first-page size via PyMuPDF.

    Any failure (fitz missing, broken document) is recorded under
    'extract_error' and the zero/empty defaults are returned.
    """
    meta: Dict[str, Any] = {
        'page_count': 0, 'pdf_title': '', 'pdf_author': '',
        'creation_date': '', 'encrypted': False,
        'page_width_pt': 0.0, 'page_height_pt': 0.0,
    }
    try:
        import fitz
        doc = fitz.open(stream=data, filetype='pdf')
        info = doc.metadata or {}
        meta['page_count'] = doc.page_count
        meta['pdf_title'] = info.get('title', '')
        meta['pdf_author'] = info.get('author', '')
        meta['creation_date'] = info.get('creationDate', '')
        meta['encrypted'] = doc.is_encrypted

        if doc.page_count > 0:
            # First page gives a representative size in points.
            first_rect = doc[0].rect
            meta['page_width_pt'] = round(first_rect.width, 2)
            meta['page_height_pt'] = round(first_rect.height, 2)
        doc.close()
    except Exception as e:
        meta['extract_error'] = str(e)
    return meta
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def extract_media_metadata(client, bucket: str, key: str, media_type: str) -> Dict:
    """Download a media file's head bytes and dispatch to the type extractor.

    Only the first N bytes are fetched, sized per media type; a failed
    download short-circuits with an 'extract_error' record.
    """
    head_limits = {
        'image': IMAGE_HEAD_BYTES,
        'audio': AUDIO_HEAD_BYTES,
        'video': VIDEO_HEAD_BYTES,
        'pdf': PDF_HEAD_BYTES,
    }
    head_size = head_limits.get(media_type, IMAGE_HEAD_BYTES)

    try:
        data = client.get_object(bucket, key, max_bytes=head_size)
    except Exception as e:
        return {'extract_error': f'download failed: {e}'}

    extractors = {
        'image': extract_image_meta,
        'audio': extract_audio_meta,
        'video': extract_video_meta,
        'pdf': extract_pdf_meta,
    }
    extractor = extractors.get(media_type)
    return extractor(data) if extractor else {}
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
# ===================================================================
|
|
490
|
+
# LanceDB Writer
|
|
491
|
+
# ===================================================================
|
|
492
|
+
|
|
493
|
+
def write_to_lancedb(db_path: str, table_name: str, records: List[Dict]):
    """Replace *table_name* in the LanceDB at *db_path* with *records*.

    No-op when records is empty; otherwise the existing table (if any) is
    dropped so each scan's output is a full snapshot.
    """
    if not records:
        return
    lancedb = _import_lancedb()
    pd = _import_pandas()

    db = lancedb.connect(db_path)
    frame = pd.DataFrame(records)

    if table_name in db.table_names():
        db.drop_table(table_name)

    db.create_table(table_name, data=frame)
    print(f" [LanceDB] Wrote {len(records)} records to '{table_name}'")
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
# ===================================================================
|
|
511
|
+
# Main
|
|
512
|
+
# ===================================================================
|
|
513
|
+
|
|
514
|
+
def main():
    """CLI entry point: scan a bucket/prefix in three passes, write LanceDB tables,
    and print a JSON summary line to stdout for the Node.js caller to parse."""
    parser = argparse.ArgumentParser(description='S3-compatible data profiler -> LanceDB')
    parser.add_argument('--vendor', required=True,
                        choices=['volcengine', 'alibaba', 'tencent', 'aws', 'local'])
    parser.add_argument('--endpoint', default='')
    parser.add_argument('--ak', default='', dest='cred_id',
                        help='Access credential ID')
    parser.add_argument('--sk', default='', dest='cred_secret',
                        help='Access credential value')
    parser.add_argument('--region', default='')
    parser.add_argument('--bucket', required=True)
    parser.add_argument('--prefix', required=True)
    parser.add_argument('--db-path', default='./catalog_db')
    parser.add_argument('--sample-rows', type=int, default=100)
    args = parser.parse_args()

    print(f"[las-data-profiler] vendor={args.vendor}, bucket={args.bucket}, prefix={args.prefix}")
    print(f"[las-data-profiler] db_path={args.db_path}")

    # For 'local', --bucket is interpreted as a filesystem root directory.
    if args.vendor == 'local':
        client = LocalClient(args.bucket)
    else:
        client = S3Client(
            vendor=args.vendor,
            endpoint=args.endpoint,
            credential_id=args.cred_id,
            credential_secret=args.cred_secret,
            region=args.region,
        )

    # ---- Pass 1: File Catalog ----
    print("\n[Pass 1] Scanning files...")
    catalog = build_file_catalog(client, args.bucket, args.prefix, args.vendor)
    print(f" Found {len(catalog)} files")

    # ---- Pass 2: Structured Schemas ----
    print("\n[Pass 2] Analyzing structured data...")
    structured_files = [f for f in catalog if f['category'] == 'structured']
    all_schemas: List[Dict] = []
    for i, entry in enumerate(structured_files):
        key = entry['file_path']
        ext = entry['extension']
        print(f" [{i+1}/{len(structured_files)}] {key}")
        schemas = analyze_structured_file(client, args.bucket, key, ext, args.sample_rows)
        # analyze_structured_file leaves 'vendor' blank; stamp it here.
        for s in schemas:
            s['vendor'] = args.vendor
        all_schemas.extend(schemas)
        entry['has_schema'] = True

    print(f" Analyzed {len(structured_files)} files, {len(all_schemas)} column records")

    # ---- Pass 3: Media Metadata ----
    print("\n[Pass 3] Extracting media metadata...")
    media_files = [f for f in catalog if f['media_type'] in ('image', 'audio', 'video', 'pdf')]
    all_media_meta: List[Dict] = []
    for i, entry in enumerate(media_files):
        key = entry['file_path']
        media_type = entry['media_type']
        print(f" [{i+1}/{len(media_files)}] {key} ({media_type})")
        meta = extract_media_metadata(client, args.bucket, key, media_type)
        meta['file_path'] = key
        meta['vendor'] = args.vendor
        meta['bucket'] = args.bucket
        meta['media_type'] = media_type
        # Backfill a uniform column set so all rows share one LanceDB schema.
        # NOTE(review): for a *missing* key meta.get(col) is None, so the
        # setdefault default is always 0 (the '' branch never fires for
        # missing keys) — string columns absent from a row end up as 0;
        # verify downstream consumers tolerate that.
        for col in ['width', 'height', 'image_format', 'color_mode', 'exif_summary',
                    'duration_sec', 'codec', 'sample_rate', 'channels', 'bitrate',
                    'tags_summary', 'container',
                    'page_count', 'pdf_title', 'pdf_author', 'creation_date',
                    'encrypted', 'page_width_pt', 'page_height_pt', 'extract_error']:
            meta.setdefault(col, '' if isinstance(meta.get(col), str) else 0)
        all_media_meta.append(meta)
        entry['has_media_meta'] = True

    print(f" Extracted metadata for {len(media_files)} media files")

    # ---- Write to LanceDB ----
    print(f"\n[LanceDB] Writing to {args.db_path}")
    write_to_lancedb(args.db_path, 'file_catalog', catalog)
    write_to_lancedb(args.db_path, 'structured_schemas', all_schemas)
    write_to_lancedb(args.db_path, 'media_metadata', all_media_meta)

    # ---- Summary JSON (stdout, for Node.js to parse) ----
    summary = {
        'summary': {
            'total_files': len(catalog),
            'structured_files': len(structured_files),
            'media_files': len(media_files),
        }
    }
    print(f"\n{json.dumps(summary)}")
    print("\n[las-data-profiler] Done!")


if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
export function getPluginConfig(ctx: any) {
|
|
2
|
+
return ctx.config?.plugins?.entries?.['contextlake-openclaw']?.config || {
|
|
3
|
+
metadata_storage: {
|
|
4
|
+
type: 'local',
|
|
5
|
+
lancedb_uri: require('path').join(require('os').homedir(), '.openclaw', 'contextlake', 'data'),
|
|
6
|
+
embedding: {
|
|
7
|
+
provider: 'local',
|
|
8
|
+
model_name: 'hf:CompendiumLabs/bge-small-zh-v1.5-gguf/bge-small-zh-v1.5-f16.gguf'
|
|
9
|
+
}
|
|
10
|
+
},
|
|
11
|
+
file_storage: { type: 'local', local_base_dir: require('path').join(require('os').homedir(), '.openclaw', 'contextlake', 'files') }
|
|
12
|
+
};
|
|
13
|
+
}
|