@byted-las/contextlake-openclaw 1.0.3 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,10 @@
1
1
  name: byted-las-data-profiler
2
2
  description: |
3
- Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them,
4
- performs schema inference and column semantic analysis on structured data (JSONL/CSV/Parquet/JSON),
5
- extracts key meta-information for media files (Image/Audio/Video/PDF) by reading only header bytes,
6
- and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
3
+ Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them.
4
+ It writes the catalog index to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
5
+
6
+ IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
7
+ You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`, `las-data-profiler`) to accomplish the profiling tasks.
7
8
 
8
9
  ## Trigger Scenarios
9
10
  Be sure to use this Skill when the user mentions the following scenarios:
@@ -13,162 +14,24 @@ Be sure to use this Skill when the user mentions the following scenarios:
13
14
  - Need to understand what a batch of data files contains and what their schema looks like
14
15
  - Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
15
16
  - Need to write the meta-information of object storage or local files into LanceDB
16
- - Mentions TOS, boto3, or object storage data profiling
17
17
  - Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
18
- - Need to batch identify the type and size of remote/local files and build an index
19
- - Need to quickly understand the structure of an unfamiliar dataset (which files it contains, what the schema looks like, and what the fields mean)
20
- - Need to connect/dock a data source for profiling
21
- - Mentions "connecting" or "docking" a data source
22
-
23
- ## Overview
24
- This Skill is a Dataset Profiling Tool provided by Volcengine TOS. It connects to the TOS bucket via an S3-compatible protocol (boto3), recursively traverses the target path, and accomplishes three things:
25
- 1. **Cataloging**: Records the meta-information (path, size, type, origin, etc.) of all files.
26
- 2. **Understanding Structured Data**: Samples and parses the schema for JSONL / CSV / TSV / Parquet / JSON to infer the semantic role of each column.
27
- 3. **Extracting Media Meta-information**: Reads only the file header (without downloading the full file) for images, audio, video, and PDFs to extract key attributes.
28
-
29
- In addition to TOS, this tool is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
30
-
31
- ## Supported Data Sources
32
- | Vendor | Service Name | S3 Endpoint Format | Signature | Addressing Style |
33
- |--------|--------------|--------------------|-----------|------------------|
34
- | Volcengine | Volcengine TOS | https://tos-s3-{region}.volces.com / ivolces.com | V4 | Virtual-hosted (Required) |
35
- | Alibaba | Alibaba Cloud OSS | https://s3.oss-{region}.aliyuncs.com / -internal.aliyuncs.com | V2 (boto3 must use signature_version='s3') | Virtual-hosted |
36
- | Tencent | Tencent Cloud COS | https://cos.{region}.myqcloud.com | V4 | Virtual-hosted |
37
- | AWS | Amazon S3 | https://s3.{region}.amazonaws.com | V4 | Virtual-hosted |
38
- | Local | Local File System | Not required | Not required | os.walk |
39
18
 
40
19
  ## Overall Workflow
41
- 1. Confirm data source parameters.
42
- 2. Install dependencies (boto3 lancedb pyarrow pandas Pillow mutagen pymupdf).
43
- 3. Execute the Python script, making three scanning passes:
44
- - Pass 1: Traverse all files → file_catalog
45
- - Pass 2: Sample structured data → structured_schemas
46
- - Pass 3: Read-only header for media files → media_metadata
47
- 4. Report results.
20
+ When instructed to profile a dataset, you should prefer using the `las-data-profiler` tool directly, which automatically handles the S3 listing and LanceDB writing using internal TypeScript logic.
21
+ If you need to perform custom exploration, you can use `list-s3-objects` to traverse the bucket, `read-s3-object` to read file headers, and `write-lance-catalog` to save the results.
48
22
 
49
- ## Parameter Description
23
+ ## Parameter Description (for `las-data-profiler` tool)
50
24
  | Parameter | Description | Example |
51
25
  |-----------|-------------|---------|
52
- | datasource_name | The name of the data source, used to identify this connection. LanceDB output directory is ~/.openclaw/las-data-profiler/{datasource_name}/, and configuration file is env.sh in the same directory | my_tos_data |
26
+ | datasource_name | The name of the data source | my_tos_data |
53
27
  | vendor | volcengine / alibaba / tencent / aws / local | volcengine |
54
28
  | endpoint | S3 Endpoint URL (not required for local) | https://tos-s3-cn-beijing.volces.com |
55
- | access_key | AK, can be from parameters or environment variables | - |
56
- | secret_key | SK, can be from parameters or environment variables | - |
29
+ | access_key | AK | - |
30
+ | secret_key | SK | - |
57
31
  | region | Region identifier | cn-beijing |
58
32
  | bucket | Bucket name (root directory path when local) | my-data-bucket |
59
- | prefix | Path prefix to limit the scan scope (required to avoid scanning the entire bucket) | datasets/2024/ |
60
- | sample_rows | [Optional] Number of sampled rows per structured file, default is 100 | 200 |
61
-
62
- ## Authentication Priority
63
- Parameters > Environment Variables (TOS_ACCESS_KEY / S3_ACCESS_KEY / AWS_ACCESS_KEY_ID)
64
-
65
- ## Connection Configuration Differences Across Vendors
66
- - Volcengine TOS: Does not support PathStyle, must be Virtual-hosted (handled by default by the tool).
67
- - Alibaba Cloud OSS: boto3 must use V2 signature (signature_version='s3') because V4 will force chunked encoding (handled by default by the tool).
68
- - Tencent Cloud COS / AWS: Default V4 is sufficient.
69
-
70
- ## File Classification and Processing Strategy
71
- | Classification | Extension | Processing |
72
- |----------------|-----------|------------|
73
- | structured | .json .jsonl .ndjson .csv .tsv .parquet .pq | Sample parsing schema + column semantic inference |
74
- | image | .jpg .png .gif .bmp .webp .tiff .svg ... | Read first 64 KB → width, height, format, color mode, EXIF summary |
75
- | audio | .mp3 .wav .flac .aac .ogg .m4a ... | Read first 512 KB → duration, sample rate, channels, bitrate, codec, tags |
76
- | video | .mp4 .avi .mov .mkv .webm ... | Read first 2 MB → duration, resolution, container format |
77
- | pdf | .pdf | Read first 256 KB → page count, title/author, creation time, encryption status, page size |
78
- | Other | All remaining | Only record meta-information to file_catalog |
79
-
80
- Header reading uses S3's Range requests (Range: bytes=0-N) to download only the required bytes, without transferring the full file.
81
-
82
- ## Understanding Structured Data
83
- Sample the first N rows of each structured file to infer the semantic_hint for each column:
84
-
85
- | semantic_hint | Judgment Logic | Example |
86
- |---------------|----------------|---------|
87
- | id | High uniqueness rate + short string | user_id, sample_001 |
88
- | categorical | Low cardinality (unique count < 50 or uniqueness rate < 20%) | label: cat/dog/bird |
89
- | numeric | int/float type | score: 0.95 |
90
- | boolean | bool or only two distinct values | is_valid: true/false |
91
- | text | Long string (avg > 50 characters) + high uniqueness rate | caption: "A cat sitting..." |
92
- | file_path | > 50% values match path/URI patterns | images/001.jpg, s3://bucket/file |
93
- | timestamp | Matches time format | 2024-01-15T10:30:00Z |
94
- | structured | list / dict type | bbox: [10, 20, 100, 200] |
95
- | constant | Only 1 distinct value | version: "1.0" |
96
-
97
- ## LanceDB Three Tables
98
- ### file_catalog — Full file index
99
- | Column | Description |
100
- |--------|-------------|
101
- | file_path, file_name, extension, mime_type | Path and type information |
102
- | category | structured / non-structured |
103
- | media_type | image / audio / video / pdf / empty |
104
- | size_bytes, last_modified, created_time, etag, storage_class | Storage attributes (created_time is only available for local files, empty for S3 objects) |
105
- | is_multipart | ETag contains multipart flag (-), indicating a large file uploaded in parts |
106
- | depth | Path level (number of /), helps to understand the directory structure |
107
- | parent_dir | Nearest parent directory name, convenient for aggregation statistics by directory |
108
- | vendor, bucket | Data source identifier |
109
- | has_schema | Whether there are in-depth analysis records in structured_schemas |
110
- | has_media_meta | Whether there are header meta-information records in media_metadata |
111
- | scan_timestamp | Scan time |
112
-
113
- ### structured_schemas — Column-level schema of structured data
114
- | Column | Description |
115
- |--------|-------------|
116
- | file_path, vendor, bucket, format | File identifier |
117
- | column_name, column_type, non_null_ratio, unique_count | Basic statistics |
118
- | sample_values | First 3 distinct values (JSON) |
119
- | semantic_hint, semantic_detail | Semantic tags and supplementary explanations |
120
-
121
- ### media_metadata — Header meta-information of media files
122
- | Column | Applicable Type | Description |
123
- |--------|-----------------|-------------|
124
- | width, height | Image / Video | Resolution (pixels) |
125
- | image_format, color_mode | Image | e.g., JPEG, RGB |
126
- | exif_summary | Image | Camera model, shooting time, GPS, etc. (JSON) |
127
- | duration_sec | Audio / Video | Duration (seconds) |
128
- | codec, sample_rate, channels, bitrate | Audio | Encoding parameters |
129
- | container | Video | Container format (mp4, mkv...) |
130
- | tags_summary | Audio | Title/artist/album tags, etc. (JSON) |
131
- | page_count, pdf_title, pdf_author | PDF | Document attributes |
132
- | creation_date, encrypted | PDF | Creation time, encryption status |
133
- | page_width_pt, page_height_pt | PDF | First page size (points) |
134
- | extract_error | All | Error message when extraction fails |
33
+ | prefix | Path prefix to limit the scan scope | datasets/2024/ |
135
34
 
136
35
  ## Output Location
137
- - LanceDB table storage path: `~/.openclaw/las-data-profiler/{datasource_name}/`
138
- - Configuration file: `~/.openclaw/las-data-profiler/{datasource_name}/env.sh`
139
-
140
- ## Usage Examples
141
- ```bash
142
- # Volcengine TOS (Recommended)
143
- python s3_catalog.py \
144
- --vendor volcengine \
145
- --endpoint https://tos-s3-cn-beijing.volces.com \
146
- --ak "$TOS_ACCESS_KEY" --sk "$TOS_SECRET_KEY" \
147
- --region cn-beijing --bucket my-bucket --prefix data/
148
-
149
- # Local Directory
150
- python s3_catalog.py --vendor local --bucket /path/to/data --prefix .
151
-
152
- # Alibaba Cloud OSS
153
- python s3_catalog.py \
154
- --vendor alibaba \
155
- --endpoint https://s3.oss-cn-hangzhou.aliyuncs.com \
156
- --ak "$ALI_AK" --sk "$ALI_SK" \
157
- --region cn-hangzhou --bucket my-bucket --prefix datasets/
158
-
159
- # Tencent Cloud COS
160
- python s3_catalog.py \
161
- --vendor tencent \
162
- --endpoint https://cos.ap-beijing.myqcloud.com \
163
- --ak "$COS_AK" --sk "$COS_SK" \
164
- --region ap-beijing --bucket my-bucket --prefix train/
165
-
166
- # AWS S3
167
- python s3_catalog.py --vendor aws --region us-east-1 --bucket my-bucket --prefix train/
168
- ```
169
-
170
- ## Frequently Asked Questions (FAQ)
171
- - Volcengine TOS: InvalidPathAccess → TOS does not support PathStyle, ensure addressing_style='virtual' (handled by default by the tool).
172
- - Alibaba Cloud OSS: InvalidArgument: aws-chunked encoding not supported → V2 signature Config(signature_version='s3') must be used (handled by default by the tool).
173
- - Media meta-information extraction failed → The file might be corrupted or in a non-standard format. The media_metadata.extract_error column will record the specific error, and other fields will fall back to default values (0 / empty string).
174
- - Video meta-information is incomplete → The position of the moov box for MP4/MOV is not fixed—if moov is at the end of the file (stream-written video), it might not be found in the first 2MB. In this case, duration/resolution will be 0.
36
+ - LanceDB table storage path: `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`
37
+ - Configuration file: `~/.openclaw/contextlake/profiler/{datasource_name}/env.sh`
@@ -48,8 +48,8 @@ export function getPluginConfig(ctx: any): ContextLakeConfig {
48
48
  embedding: {
49
49
  provider: 'las',
50
50
  model_name: 'doubao-embedding-vision-250615',
51
- api_key: process.env.LAS_API_KEY || creds.LAS_API_KEY,
52
- api_base: process.env.LAS_BASE_URL || 'https://operator.las.cn-beijing.volces.com',
51
+ api_key: creds.LAS_API_KEY,
52
+ api_base: 'https://operator.las.cn-beijing.volces.com',
53
53
  dimensions: 2048,
54
54
  encoding_format: 'float'
55
55
  }
@@ -58,8 +58,9 @@ export function getPluginConfig(ctx: any): ContextLakeConfig {
58
58
  type: 'local',
59
59
  local_base_dir: require('path').join(require('os').homedir(), '.openclaw', 'contextlake', 'files'),
60
60
  tos: {
61
- access_key: process.env.VOLCENGINE_ACCESS_KEY || creds.VOLCENGINE_ACCESS_KEY,
62
- secret_key: process.env.VOLCENGINE_SECRET_KEY || creds.VOLCENGINE_SECRET_KEY
61
+ access_key: creds.ACCESS_KEY || creds.VOLCENGINE_ACCESS_KEY,
62
+ secret_key: creds.SECRET_KEY || creds.VOLCENGINE_SECRET_KEY,
63
+ region: creds.REGION || creds.VOLCENGINE_REGION || 'cn-beijing'
63
64
  }
64
65
  }
65
66
  };
@@ -8,8 +8,14 @@ const CONFIG_FILE = path.join(CONFIG_DIR, 'credentials.json');
8
8
 
9
9
  export interface Credentials {
10
10
  LAS_API_KEY?: string;
11
+ ACCESS_KEY?: string;
12
+ SECRET_KEY?: string;
13
+ REGION?: string;
14
+
15
+ // Legacy support
11
16
  VOLCENGINE_ACCESS_KEY?: string;
12
17
  VOLCENGINE_SECRET_KEY?: string;
18
+ VOLCENGINE_REGION?: string;
13
19
  }
14
20
 
15
21
  export function loadCredentials(): Credentials {