@byted-las/contextlake-openclaw 1.0.3 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,167 @@
1
+ "use strict";
2
// Compiler-emitted module-interop helper: exposes property `k` of module `m` on object `o`
// under the name `k2` (which defaults to `k`). An already-defined `this.__createBinding`
// is reused when present.
// When `Object.create` exists, the property is installed with `Object.defineProperty`:
// the existing descriptor of `m[k]` is used unless it is missing, is an accessor on a
// non-`__esModule` object, or is a writable/configurable data property - in those cases
// an enumerable getter that reads `m[k]` on each access is installed instead.
// Without `Object.create`, the value is copied with a plain assignment.
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
// Compiler-emitted module-interop helper: stores `v` as the "default" property of `o`.
// An already-defined `this.__setModuleDefault` is reused when present. With `Object.create`
// available the property is defined as enumerable via `Object.defineProperty`; otherwise
// it is set with a plain assignment.
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
// Compiler-emitted module-interop helper used for the `require(...)` calls below.
// An already-defined `this.__importStar` is reused when present.
// Given `mod`:
//   - if `mod` is truthy and has `__esModule` set, it is returned unchanged;
//   - otherwise a new object is built: every own property of `mod` except "default" is
//     bound onto it with `__createBinding`, and `mod` itself is stored as its "default"
//     with `__setModuleDefault`.
// `ownKeys` replaces itself on first call with `Object.getOwnPropertyNames`, or with a
// `for...in` + `hasOwnProperty` fallback when that function is unavailable.
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.listS3Objects = listS3Objects;
37
+ exports.readS3Object = readS3Object;
38
+ const client_s3_1 = require("@aws-sdk/client-s3");
39
+ const fs = __importStar(require("fs"));
40
+ const path = __importStar(require("path"));
41
+ function createS3Client(params) {
42
+ if (params.vendor === 'local')
43
+ return null;
44
+ let endpoint = params.endpoint;
45
+ let region = params.region;
46
+ // Attempt to load credentials if not provided
47
+ let ak = params.access_key;
48
+ let sk = params.secret_key;
49
+ if (!ak || !sk || !region) {
50
+ try {
51
+ const { loadCredentials } = require('../../utils/credentials');
52
+ const creds = loadCredentials();
53
+ ak = ak || creds.ACCESS_KEY || creds.VOLCENGINE_ACCESS_KEY;
54
+ sk = sk || creds.SECRET_KEY || creds.VOLCENGINE_SECRET_KEY;
55
+ region = region || creds.REGION || creds.VOLCENGINE_REGION;
56
+ }
57
+ catch (e) {
58
+ // ignore
59
+ }
60
+ }
61
+ ak = ak || '';
62
+ sk = sk || '';
63
+ if (params.vendor === 'volcengine' && !endpoint) {
64
+ endpoint = `https://tos-s3-${region || 'cn-beijing'}.volces.com`;
65
+ }
66
+ else if (params.vendor === 'alibaba' && !endpoint) {
67
+ endpoint = `https://s3.oss-${region || 'cn-hangzhou'}.aliyuncs.com`;
68
+ }
69
+ else if (params.vendor === 'tencent' && !endpoint) {
70
+ endpoint = `https://cos.${region || 'ap-beijing'}.myqcloud.com`;
71
+ }
72
+ return new client_s3_1.S3Client({
73
+ region: region || 'us-east-1',
74
+ endpoint: endpoint,
75
+ credentials: {
76
+ accessKeyId: ak,
77
+ secretAccessKey: sk
78
+ },
79
+ forcePathStyle: false // usually false for virtual hosted style
80
+ });
81
+ }
82
+ async function listS3Objects(params, prefix, maxKeys = 1000, continuationToken) {
83
+ if (params.vendor === 'local') {
84
+ const root = params.bucket;
85
+ const prefixPath = prefix && prefix !== '.' ? path.join(root, prefix) : root;
86
+ const files = [];
87
+ function walkSync(currentDirPath) {
88
+ if (!fs.existsSync(currentDirPath))
89
+ return;
90
+ const dirents = fs.readdirSync(currentDirPath, { withFileTypes: true });
91
+ for (const dirent of dirents) {
92
+ const res = path.resolve(currentDirPath, dirent.name);
93
+ if (dirent.isDirectory()) {
94
+ walkSync(res);
95
+ }
96
+ else {
97
+ const stat = fs.statSync(res);
98
+ files.push({
99
+ Key: path.relative(root, res),
100
+ Size: stat.size,
101
+ LastModified: stat.mtime,
102
+ ETag: '',
103
+ StorageClass: 'LOCAL',
104
+ _created_time: stat.ctime
105
+ });
106
+ }
107
+ }
108
+ }
109
+ walkSync(prefixPath);
110
+ return {
111
+ Contents: files,
112
+ IsTruncated: false,
113
+ NextContinuationToken: undefined
114
+ };
115
+ }
116
+ const client = createS3Client(params);
117
+ if (!client)
118
+ throw new Error('Failed to create S3 client');
119
+ const command = new client_s3_1.ListObjectsV2Command({
120
+ Bucket: params.bucket,
121
+ Prefix: prefix,
122
+ MaxKeys: maxKeys,
123
+ ContinuationToken: continuationToken
124
+ });
125
+ const response = await client.send(command);
126
+ return {
127
+ Contents: response.Contents || [],
128
+ IsTruncated: response.IsTruncated,
129
+ NextContinuationToken: response.NextContinuationToken
130
+ };
131
+ }
132
+ async function readS3Object(params, key, maxBytes) {
133
+ if (params.vendor === 'local') {
134
+ const fullPath = path.join(params.bucket, key);
135
+ if (maxBytes) {
136
+ const fd = fs.openSync(fullPath, 'r');
137
+ const buffer = Buffer.alloc(maxBytes);
138
+ const bytesRead = fs.readSync(fd, buffer, 0, maxBytes, 0);
139
+ fs.closeSync(fd);
140
+ return buffer.subarray(0, bytesRead);
141
+ }
142
+ else {
143
+ return fs.readFileSync(fullPath);
144
+ }
145
+ }
146
+ const client = createS3Client(params);
147
+ if (!client)
148
+ throw new Error('Failed to create S3 client');
149
+ const commandInput = {
150
+ Bucket: params.bucket,
151
+ Key: key
152
+ };
153
+ if (maxBytes) {
154
+ commandInput.Range = `bytes=0-${maxBytes - 1}`;
155
+ }
156
+ const command = new client_s3_1.GetObjectCommand(commandInput);
157
+ const response = await client.send(command);
158
+ if (response.Body) {
159
+ // @ts-ignore
160
+ const chunks = [];
161
+ for await (const chunk of response.Body) {
162
+ chunks.push(chunk);
163
+ }
164
+ return Buffer.concat(chunks);
165
+ }
166
+ return Buffer.alloc(0);
167
+ }
@@ -1,9 +1,10 @@
1
1
  name: byted-las-data-profiler
2
2
  description: |
3
- Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them,
4
- performs schema inference and column semantic analysis on structured data (JSONL/CSV/Parquet/JSON),
5
- extracts key meta-information for media files (Image/Audio/Video/PDF) by reading only header bytes,
6
- and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
3
+ Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them.
4
+ It writes the catalog index to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
5
+
6
+ IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
7
+ You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`, `las-data-profiler`) to accomplish the profiling tasks.
7
8
 
8
9
  ## Trigger Scenarios
9
10
  Be sure to use this Skill when the user mentions the following scenarios:
@@ -13,162 +14,24 @@ Be sure to use this Skill when the user mentions the following scenarios:
13
14
  - Need to understand what a batch of data files contains and what their schema looks like
14
15
  - Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
15
16
  - Need to write the meta-information of object storage or local files into LanceDB
16
- - Mentions TOS, boto3, or object storage data profiling
17
17
  - Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
18
- - Need to batch identify the type and size of remote/local files and build an index
19
- - Need to quickly understand the structure of an unfamiliar dataset (what files are there, how the schema is, field meanings)
20
- - Need to connect/dock a data source for profiling
21
- - Mentions "connect" data source, docking data source
22
-
23
- ## Overview
24
- This Skill is a Dataset Profiling Tool provided by Volcengine TOS. It connects to the TOS bucket via an S3-compatible protocol (boto3), recursively traverses the target path, and accomplishes three things:
25
- 1. **Cataloging**: Records the meta-information (path, size, type, origin, etc.) of all files.
26
- 2. **Understanding Structured Data**: Samples and parses the schema for JSONL / CSV / TSV / Parquet / JSON to infer the semantic role of each column.
27
- 3. **Extracting Media Meta-information**: Reads only the file header (without downloading the full file) for images, audio, video, and PDFs to extract key attributes.
28
-
29
- In addition to TOS, this tool is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
30
-
31
- ## Supported Data Sources
32
- | Vendor | Service Name | S3 Endpoint Format | Signature | Addressing Style |
33
- |--------|--------------|--------------------|-----------|------------------|
34
- | Volcengine | Volcengine TOS | https://tos-s3-{region}.volces.com / ivolces.com | V4 | Virtual-hosted (Required) |
35
- | Alibaba | Alibaba Cloud OSS | https://s3.oss-{region}.aliyuncs.com / -internal.aliyuncs.com | V2 (boto3 must use signature_version='s3') | Virtual-hosted |
36
- | Tencent | Tencent Cloud COS | https://cos.{region}.myqcloud.com | V4 | Virtual-hosted |
37
- | AWS | Amazon S3 | https://s3.{region}.amazonaws.com | V4 | Virtual-hosted |
38
- | Local | Local File System | Not required | Not required | os.walk |
39
18
 
40
19
  ## Overall Workflow
41
- 1. Confirm data source parameters.
42
- 2. Install dependencies (boto3 lancedb pyarrow pandas Pillow mutagen pymupdf).
43
- 3. Execute the Python script, making three scanning passes:
44
- - Pass 1: Traverse all files → file_catalog
45
- - Pass 2: Sample structured data → structured_schemas
46
- - Pass 3: Read-only header for media files → media_metadata
47
- 4. Report results.
20
+ When instructed to profile a dataset, you should prefer using the `las-data-profiler` tool directly, which automatically handles the S3 listing and LanceDB writing using internal TypeScript logic.
21
+ If you need to perform custom exploration, you can use `list-s3-objects` to traverse the bucket, `read-s3-object` to read file headers, and `write-lance-catalog` to save results.
48
22
 
49
- ## Parameter Description
23
+ ## Parameter Description (for `las-data-profiler` tool)
50
24
  | Parameter | Description | Example |
51
25
  |-----------|-------------|---------|
52
- | datasource_name | The name of the data source, used to identify this connection. LanceDB output directory is ~/.openclaw/las-data-profiler/{datasource_name}/, and configuration file is env.sh in the same directory | my_tos_data |
26
+ | datasource_name | The name of the data source | my_tos_data |
53
27
  | vendor | volcengine / alibaba / tencent / aws / local | volcengine |
54
28
  | endpoint | S3 Endpoint URL (not required for local) | https://tos-s3-cn-beijing.volces.com |
55
- | access_key | AK, can be from parameters or environment variables | - |
56
- | secret_key | SK, can be from parameters or environment variables | - |
29
+ | access_key | AK | - |
30
+ | secret_key | SK | - |
57
31
  | region | Region identifier | cn-beijing |
58
32
  | bucket | Bucket name (root directory path when local) | my-data-bucket |
59
- | prefix | Path prefix to limit the scan scope (required to avoid scanning the entire bucket) | datasets/2024/ |
60
- | sample_rows | [Optional] Number of sampled rows per structured file, default is 100 | 200 |
61
-
62
- ## Authentication Priority
63
- Parameters > Environment Variables (TOS_ACCESS_KEY / S3_ACCESS_KEY / AWS_ACCESS_KEY_ID)
64
-
65
- ## Connection Configuration Differences Across Vendors
66
- - Volcengine TOS: Does not support PathStyle, must be Virtual-hosted (handled by default by the tool).
67
- - Alibaba Cloud OSS: boto3 must use V2 signature (signature_version='s3') because V4 will force chunked encoding (handled by default by the tool).
68
- - Tencent Cloud COS / AWS: Default V4 is sufficient.
69
-
70
- ## File Classification and Processing Strategy
71
- | Classification | Extension | Processing |
72
- |----------------|-----------|------------|
73
- | structured | .json .jsonl .ndjson .csv .tsv .parquet .pq | Sample parsing schema + column semantic inference |
74
- | image | .jpg .png .gif .bmp .webp .tiff .svg ... | Read first 64 KB → width, height, format, color mode, EXIF summary |
75
- | audio | .mp3 .wav .flac .aac .ogg .m4a ... | Read first 512 KB → duration, sample rate, channels, bitrate, codec, tags |
76
- | video | .mp4 .avi .mov .mkv .webm ... | Read first 2 MB → duration, resolution, container format |
77
- | pdf | .pdf | Read first 256 KB → page count, title/author, creation time, encryption status, page size |
78
- | Other | All remaining | Only record meta-information to file_catalog |
79
-
80
- Header reading uses S3's Range requests (Range: bytes=0-N) to download only the required bytes, without transferring the full file.
81
-
82
- ## Understanding Structured Data
83
- Sample the first N rows of each structured file to infer the semantic_hint for each column:
84
-
85
- | semantic_hint | Judgment Logic | Example |
86
- |---------------|----------------|---------|
87
- | id | High uniqueness rate + short string | user_id, sample_001 |
88
- | categorical | Low cardinality (unique count < 50 or uniqueness rate < 20%) | label: cat/dog/bird |
89
- | numeric | int/float type | score: 0.95 |
90
- | boolean | bool or only two distinct values | is_valid: true/false |
91
- | text | Long string (avg > 50 characters) + high uniqueness rate | caption: "A cat sitting..." |
92
- | file_path | > 50% values match path/URI patterns | images/001.jpg, s3://bucket/file |
93
- | timestamp | Matches time format | 2024-01-15T10:30:00Z |
94
- | structured | list / dict type | bbox: [10, 20, 100, 200] |
95
- | constant | Only 1 distinct value | version: "1.0" |
96
-
97
- ## LanceDB Three Tables
98
- ### file_catalog — Full file index
99
- | Column | Description |
100
- |--------|-------------|
101
- | file_path, file_name, extension, mime_type | Path and type information |
102
- | category | structured / non-structured |
103
- | media_type | image / audio / video / pdf / empty |
104
- | size_bytes, last_modified, created_time, etag, storage_class | Storage attributes (created_time is only available for local files, empty for S3 objects) |
105
- | is_multipart | ETag contains multipart flag (-), indicating a large file uploaded in parts |
106
- | depth | Path level (number of /), helps to understand the directory structure |
107
- | parent_dir | Nearest parent directory name, convenient for aggregation statistics by directory |
108
- | vendor, bucket | Data source identifier |
109
- | has_schema | Whether there are in-depth analysis records in structured_schemas |
110
- | has_media_meta | Whether there are header meta-information records in media_metadata |
111
- | scan_timestamp | Scan time |
112
-
113
- ### structured_schemas — Column-level schema of structured data
114
- | Column | Description |
115
- |--------|-------------|
116
- | file_path, vendor, bucket, format | File identifier |
117
- | column_name, column_type, non_null_ratio, unique_count | Basic statistics |
118
- | sample_values | First 3 distinct values (JSON) |
119
- | semantic_hint, semantic_detail | Semantic tags and supplementary explanations |
120
-
121
- ### media_metadata — Header meta-information of media files
122
- | Column | Applicable Type | Description |
123
- |--------|-----------------|-------------|
124
- | width, height | Image / Video | Resolution (pixels) |
125
- | image_format, color_mode | Image | e.g., JPEG, RGB |
126
- | exif_summary | Image | Camera model, shooting time, GPS, etc. (JSON) |
127
- | duration_sec | Audio / Video | Duration (seconds) |
128
- | codec, sample_rate, channels, bitrate | Audio | Encoding parameters |
129
- | container | Video | Container format (mp4, mkv...) |
130
- | tags_summary | Audio | Title/artist/album tags, etc. (JSON) |
131
- | page_count, pdf_title, pdf_author | PDF | Document attributes |
132
- | creation_date, encrypted | PDF | Creation time, encryption status |
133
- | page_width_pt, page_height_pt | PDF | First page size (points) |
134
- | extract_error | All | Error message when extraction fails |
33
+ | prefix | Path prefix to limit the scan scope | datasets/2024/ |
135
34
 
136
35
  ## Output Location
137
- - LanceDB table storage path: `~/.openclaw/las-data-profiler/{datasource_name}/`
138
- - Configuration file: `~/.openclaw/las-data-profiler/{datasource_name}/env.sh`
139
-
140
- ## Usage Examples
141
- ```bash
142
- # Volcengine TOS (Recommended)
143
- python s3_catalog.py \
144
- --vendor volcengine \
145
- --endpoint https://tos-s3-cn-beijing.volces.com \
146
- --ak "$TOS_ACCESS_KEY" --sk "$TOS_SECRET_KEY" \
147
- --region cn-beijing --bucket my-bucket --prefix data/
148
-
149
- # Local Directory
150
- python s3_catalog.py --vendor local --bucket /path/to/data --prefix .
151
-
152
- # Alibaba Cloud OSS
153
- python s3_catalog.py \
154
- --vendor alibaba \
155
- --endpoint https://s3.oss-cn-hangzhou.aliyuncs.com \
156
- --ak "$ALI_AK" --sk "$ALI_SK" \
157
- --region cn-hangzhou --bucket my-bucket --prefix datasets/
158
-
159
- # Tencent Cloud COS
160
- python s3_catalog.py \
161
- --vendor tencent \
162
- --endpoint https://cos.ap-beijing.myqcloud.com \
163
- --ak "$COS_AK" --sk "$COS_SK" \
164
- --region ap-beijing --bucket my-bucket --prefix train/
165
-
166
- # AWS S3
167
- python s3_catalog.py --vendor aws --region us-east-1 --bucket my-bucket --prefix train/
168
- ```
169
-
170
- ## Frequently Asked Questions (FAQ)
171
- - Volcengine TOS: InvalidPathAccess → TOS does not support PathStyle, ensure addressing_style='virtual' (handled by default by the tool).
172
- - Alibaba Cloud OSS: InvalidArgument: aws-chunked encoding not supported → V2 signature Config(signature_version='s3') must be used (handled by default by the tool).
173
- - Media meta-information extraction failed → The file might be corrupted or in a non-standard format. The media_metadata.extract_error column will record the specific error, and other fields will fall back to default values (0 / empty string).
174
- - Video meta-information is incomplete → The position of the moov box for MP4/MOV is not fixed—if moov is at the end of the file (stream-written video), it might not be found in the first 2MB. In this case, duration/resolution will be 0.
36
+ - LanceDB table storage path: `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`
37
+ - Configuration file: `~/.openclaw/contextlake/profiler/{datasource_name}/env.sh`
@@ -1,9 +1,10 @@
1
1
  name: byted-las-data-profiler
2
2
  description: |
3
- Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them,
4
- performs schema inference and column semantic analysis on structured data (JSONL/CSV/Parquet/JSON),
5
- extracts key meta-information for media files (Image/Audio/Video/PDF) by reading only header bytes,
6
- and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
3
+ Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them.
4
+ It writes the catalog index to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
5
+
6
+ IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
7
+ You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`, `las-data-profiler`) to accomplish the profiling tasks.
7
8
 
8
9
  ## Trigger Scenarios
9
10
  Be sure to use this Skill when the user mentions the following scenarios:
@@ -13,162 +14,24 @@ Be sure to use this Skill when the user mentions the following scenarios:
13
14
  - Need to understand what a batch of data files contains and what their schema looks like
14
15
  - Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
15
16
  - Need to write the meta-information of object storage or local files into LanceDB
16
- - Mentions TOS, boto3, or object storage data profiling
17
17
  - Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
18
- - Need to batch identify the type and size of remote/local files and build an index
19
- - Need to quickly understand the structure of an unfamiliar dataset (what files are there, how the schema is, field meanings)
20
- - Need to connect/dock a data source for profiling
21
- - Mentions "connect" data source, docking data source
22
-
23
- ## Overview
24
- This Skill is a Dataset Profiling Tool provided by Volcengine TOS. It connects to the TOS bucket via an S3-compatible protocol (boto3), recursively traverses the target path, and accomplishes three things:
25
- 1. **Cataloging**: Records the meta-information (path, size, type, origin, etc.) of all files.
26
- 2. **Understanding Structured Data**: Samples and parses the schema for JSONL / CSV / TSV / Parquet / JSON to infer the semantic role of each column.
27
- 3. **Extracting Media Meta-information**: Reads only the file header (without downloading the full file) for images, audio, video, and PDFs to extract key attributes.
28
-
29
- In addition to TOS, this tool is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
30
-
31
- ## Supported Data Sources
32
- | Vendor | Service Name | S3 Endpoint Format | Signature | Addressing Style |
33
- |--------|--------------|--------------------|-----------|------------------|
34
- | Volcengine | Volcengine TOS | https://tos-s3-{region}.volces.com / ivolces.com | V4 | Virtual-hosted (Required) |
35
- | Alibaba | Alibaba Cloud OSS | https://s3.oss-{region}.aliyuncs.com / -internal.aliyuncs.com | V2 (boto3 must use signature_version='s3') | Virtual-hosted |
36
- | Tencent | Tencent Cloud COS | https://cos.{region}.myqcloud.com | V4 | Virtual-hosted |
37
- | AWS | Amazon S3 | https://s3.{region}.amazonaws.com | V4 | Virtual-hosted |
38
- | Local | Local File System | Not required | Not required | os.walk |
39
18
 
40
19
  ## Overall Workflow
41
- 1. Confirm data source parameters.
42
- 2. Install dependencies (boto3 lancedb pyarrow pandas Pillow mutagen pymupdf).
43
- 3. Execute the Python script, making three scanning passes:
44
- - Pass 1: Traverse all files → file_catalog
45
- - Pass 2: Sample structured data → structured_schemas
46
- - Pass 3: Read-only header for media files → media_metadata
47
- 4. Report results.
20
+ When instructed to profile a dataset, you should prefer using the `las-data-profiler` tool directly, which automatically handles the S3 listing and LanceDB writing using internal TypeScript logic.
21
+ If you need to perform custom exploration, you can use `list-s3-objects` to traverse the bucket, `read-s3-object` to read file headers, and `write-lance-catalog` to save results.
48
22
 
49
- ## Parameter Description
23
+ ## Parameter Description (for `las-data-profiler` tool)
50
24
  | Parameter | Description | Example |
51
25
  |-----------|-------------|---------|
52
- | datasource_name | The name of the data source, used to identify this connection. LanceDB output directory is ~/.openclaw/las-data-profiler/{datasource_name}/, and configuration file is env.sh in the same directory | my_tos_data |
26
+ | datasource_name | The name of the data source | my_tos_data |
53
27
  | vendor | volcengine / alibaba / tencent / aws / local | volcengine |
54
28
  | endpoint | S3 Endpoint URL (not required for local) | https://tos-s3-cn-beijing.volces.com |
55
- | access_key | AK, can be from parameters or environment variables | - |
56
- | secret_key | SK, can be from parameters or environment variables | - |
29
+ | access_key | AK | - |
30
+ | secret_key | SK | - |
57
31
  | region | Region identifier | cn-beijing |
58
32
  | bucket | Bucket name (root directory path when local) | my-data-bucket |
59
- | prefix | Path prefix to limit the scan scope (required to avoid scanning the entire bucket) | datasets/2024/ |
60
- | sample_rows | [Optional] Number of sampled rows per structured file, default is 100 | 200 |
61
-
62
- ## Authentication Priority
63
- Parameters > Environment Variables (TOS_ACCESS_KEY / S3_ACCESS_KEY / AWS_ACCESS_KEY_ID)
64
-
65
- ## Connection Configuration Differences Across Vendors
66
- - Volcengine TOS: Does not support PathStyle, must be Virtual-hosted (handled by default by the tool).
67
- - Alibaba Cloud OSS: boto3 must use V2 signature (signature_version='s3') because V4 will force chunked encoding (handled by default by the tool).
68
- - Tencent Cloud COS / AWS: Default V4 is sufficient.
69
-
70
- ## File Classification and Processing Strategy
71
- | Classification | Extension | Processing |
72
- |----------------|-----------|------------|
73
- | structured | .json .jsonl .ndjson .csv .tsv .parquet .pq | Sample parsing schema + column semantic inference |
74
- | image | .jpg .png .gif .bmp .webp .tiff .svg ... | Read first 64 KB → width, height, format, color mode, EXIF summary |
75
- | audio | .mp3 .wav .flac .aac .ogg .m4a ... | Read first 512 KB → duration, sample rate, channels, bitrate, codec, tags |
76
- | video | .mp4 .avi .mov .mkv .webm ... | Read first 2 MB → duration, resolution, container format |
77
- | pdf | .pdf | Read first 256 KB → page count, title/author, creation time, encryption status, page size |
78
- | Other | All remaining | Only record meta-information to file_catalog |
79
-
80
- Header reading uses S3's Range requests (Range: bytes=0-N) to download only the required bytes, without transferring the full file.
81
-
82
- ## Understanding Structured Data
83
- Sample the first N rows of each structured file to infer the semantic_hint for each column:
84
-
85
- | semantic_hint | Judgment Logic | Example |
86
- |---------------|----------------|---------|
87
- | id | High uniqueness rate + short string | user_id, sample_001 |
88
- | categorical | Low cardinality (unique count < 50 or uniqueness rate < 20%) | label: cat/dog/bird |
89
- | numeric | int/float type | score: 0.95 |
90
- | boolean | bool or only two distinct values | is_valid: true/false |
91
- | text | Long string (avg > 50 characters) + high uniqueness rate | caption: "A cat sitting..." |
92
- | file_path | > 50% values match path/URI patterns | images/001.jpg, s3://bucket/file |
93
- | timestamp | Matches time format | 2024-01-15T10:30:00Z |
94
- | structured | list / dict type | bbox: [10, 20, 100, 200] |
95
- | constant | Only 1 distinct value | version: "1.0" |
96
-
97
- ## LanceDB Three Tables
98
- ### file_catalog — Full file index
99
- | Column | Description |
100
- |--------|-------------|
101
- | file_path, file_name, extension, mime_type | Path and type information |
102
- | category | structured / non-structured |
103
- | media_type | image / audio / video / pdf / empty |
104
- | size_bytes, last_modified, created_time, etag, storage_class | Storage attributes (created_time is only available for local files, empty for S3 objects) |
105
- | is_multipart | ETag contains multipart flag (-), indicating a large file uploaded in parts |
106
- | depth | Path level (number of /), helps to understand the directory structure |
107
- | parent_dir | Nearest parent directory name, convenient for aggregation statistics by directory |
108
- | vendor, bucket | Data source identifier |
109
- | has_schema | Whether there are in-depth analysis records in structured_schemas |
110
- | has_media_meta | Whether there are header meta-information records in media_metadata |
111
- | scan_timestamp | Scan time |
112
-
113
- ### structured_schemas — Column-level schema of structured data
114
- | Column | Description |
115
- |--------|-------------|
116
- | file_path, vendor, bucket, format | File identifier |
117
- | column_name, column_type, non_null_ratio, unique_count | Basic statistics |
118
- | sample_values | First 3 distinct values (JSON) |
119
- | semantic_hint, semantic_detail | Semantic tags and supplementary explanations |
120
-
121
- ### media_metadata — Header meta-information of media files
122
- | Column | Applicable Type | Description |
123
- |--------|-----------------|-------------|
124
- | width, height | Image / Video | Resolution (pixels) |
125
- | image_format, color_mode | Image | e.g., JPEG, RGB |
126
- | exif_summary | Image | Camera model, shooting time, GPS, etc. (JSON) |
127
- | duration_sec | Audio / Video | Duration (seconds) |
128
- | codec, sample_rate, channels, bitrate | Audio | Encoding parameters |
129
- | container | Video | Container format (mp4, mkv...) |
130
- | tags_summary | Audio | Title/artist/album tags, etc. (JSON) |
131
- | page_count, pdf_title, pdf_author | PDF | Document attributes |
132
- | creation_date, encrypted | PDF | Creation time, encryption status |
133
- | page_width_pt, page_height_pt | PDF | First page size (points) |
134
- | extract_error | All | Error message when extraction fails |
33
+ | prefix | Path prefix to limit the scan scope | datasets/2024/ |
135
34
 
136
35
  ## Output Location
137
- - LanceDB table storage path: `~/.openclaw/las-data-profiler/{datasource_name}/`
138
- - Configuration file: `~/.openclaw/las-data-profiler/{datasource_name}/env.sh`
139
-
140
- ## Usage Examples
141
- ```bash
142
- # Volcengine TOS (Recommended)
143
- python s3_catalog.py \
144
- --vendor volcengine \
145
- --endpoint https://tos-s3-cn-beijing.volces.com \
146
- --ak "$TOS_ACCESS_KEY" --sk "$TOS_SECRET_KEY" \
147
- --region cn-beijing --bucket my-bucket --prefix data/
148
-
149
- # Local Directory
150
- python s3_catalog.py --vendor local --bucket /path/to/data --prefix .
151
-
152
- # Alibaba Cloud OSS
153
- python s3_catalog.py \
154
- --vendor alibaba \
155
- --endpoint https://s3.oss-cn-hangzhou.aliyuncs.com \
156
- --ak "$ALI_AK" --sk "$ALI_SK" \
157
- --region cn-hangzhou --bucket my-bucket --prefix datasets/
158
-
159
- # Tencent Cloud COS
160
- python s3_catalog.py \
161
- --vendor tencent \
162
- --endpoint https://cos.ap-beijing.myqcloud.com \
163
- --ak "$COS_AK" --sk "$COS_SK" \
164
- --region ap-beijing --bucket my-bucket --prefix train/
165
-
166
- # AWS S3
167
- python s3_catalog.py --vendor aws --region us-east-1 --bucket my-bucket --prefix train/
168
- ```
169
-
170
- ## Frequently Asked Questions (FAQ)
171
- - Volcengine TOS: InvalidPathAccess → TOS does not support PathStyle, ensure addressing_style='virtual' (handled by default by the tool).
172
- - Alibaba Cloud OSS: InvalidArgument: aws-chunked encoding not supported → V2 signature Config(signature_version='s3') must be used (handled by default by the tool).
173
- - Media meta-information extraction failed → The file might be corrupted or in a non-standard format. The media_metadata.extract_error column will record the specific error, and other fields will fall back to default values (0 / empty string).
174
- - Video meta-information is incomplete → The position of the moov box for MP4/MOV is not fixed—if moov is at the end of the file (stream-written video), it might not be found in the first 2MB. In this case, duration/resolution will be 0.
36
+ - LanceDB table storage path: `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`
37
+ - Configuration file: `~/.openclaw/contextlake/profiler/{datasource_name}/env.sh`
@@ -11,8 +11,8 @@ function getPluginConfig(ctx) {
11
11
  embedding: {
12
12
  provider: 'las',
13
13
  model_name: 'doubao-embedding-vision-250615',
14
- api_key: process.env.LAS_API_KEY || creds.LAS_API_KEY,
15
- api_base: process.env.LAS_BASE_URL || 'https://operator.las.cn-beijing.volces.com',
14
+ api_key: creds.LAS_API_KEY,
15
+ api_base: 'https://operator.las.cn-beijing.volces.com',
16
16
  dimensions: 2048,
17
17
  encoding_format: 'float'
18
18
  }
@@ -21,8 +21,9 @@ function getPluginConfig(ctx) {
21
21
  type: 'local',
22
22
  local_base_dir: require('path').join(require('os').homedir(), '.openclaw', 'contextlake', 'files'),
23
23
  tos: {
24
- access_key: process.env.VOLCENGINE_ACCESS_KEY || creds.VOLCENGINE_ACCESS_KEY,
25
- secret_key: process.env.VOLCENGINE_SECRET_KEY || creds.VOLCENGINE_SECRET_KEY
24
+ access_key: creds.ACCESS_KEY || creds.VOLCENGINE_ACCESS_KEY,
25
+ secret_key: creds.SECRET_KEY || creds.VOLCENGINE_SECRET_KEY,
26
+ region: creds.REGION || creds.VOLCENGINE_REGION || 'cn-beijing'
26
27
  }
27
28
  }
28
29
  };
@@ -1,7 +1,11 @@
1
1
  export interface Credentials {
2
2
  LAS_API_KEY?: string;
3
+ ACCESS_KEY?: string;
4
+ SECRET_KEY?: string;
5
+ REGION?: string;
3
6
  VOLCENGINE_ACCESS_KEY?: string;
4
7
  VOLCENGINE_SECRET_KEY?: string;
8
+ VOLCENGINE_REGION?: string;
5
9
  }
6
10
  export declare function loadCredentials(): Credentials;
7
11
  export declare function saveCredentials(creds: Credentials): void;
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "id": "contextlake-openclaw",
3
3
  "name": "ContextLake",
4
- "version": "1.0.3",
4
+ "version": "1.0.4",
5
5
  "description": "A lightweight knowledge base plugin for OpenClaw using LanceDB and TOS, with data profiling support",
6
6
  "skills": ["./src/skills"],
7
7
  "configSchema": {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@byted-las/contextlake-openclaw",
3
- "version": "1.0.3",
3
+ "version": "1.0.4",
4
4
  "description": "ContextLake OpenClaw Plugin for managing knowledge base",
5
5
  "main": "index.ts",
6
6
  "files": [
@@ -33,6 +33,7 @@
33
33
  "node": ">=20.17.0"
34
34
  },
35
35
  "dependencies": {
36
+ "@aws-sdk/client-s3": "^3.1014.0",
36
37
  "@lancedb/lancedb": "^0.26.2",
37
38
  "@volcengine/tos-sdk": "^2.9.0",
38
39
  "commander": "^14.0.3",
@@ -147,13 +147,15 @@ export function getCliCommands(pluginConfig: ContextLakeConfig, logger: any) {
147
147
  console.log('Please provide your credentials below. Press enter to keep the current value.');
148
148
 
149
149
  const lasApiKey = await promptForInput('LAS_API_KEY', currentCreds.LAS_API_KEY);
150
- const volcengineAccessKey = await promptForInput('VOLCENGINE_ACCESS_KEY', currentCreds.VOLCENGINE_ACCESS_KEY);
151
- const volcengineSecretKey = await promptForInput('VOLCENGINE_SECRET_KEY', currentCreds.VOLCENGINE_SECRET_KEY);
150
+ const accessKey = await promptForInput('ACCESS_KEY', currentCreds.ACCESS_KEY || currentCreds.VOLCENGINE_ACCESS_KEY);
151
+ const secretKey = await promptForInput('SECRET_KEY', currentCreds.SECRET_KEY || currentCreds.VOLCENGINE_SECRET_KEY);
152
+ const region = await promptForInput('REGION', currentCreds.REGION || currentCreds.VOLCENGINE_REGION || 'cn-beijing');
152
153
 
153
154
  const newCreds = {
154
155
  LAS_API_KEY: lasApiKey,
155
- VOLCENGINE_ACCESS_KEY: volcengineAccessKey,
156
- VOLCENGINE_SECRET_KEY: volcengineSecretKey
156
+ ACCESS_KEY: accessKey,
157
+ SECRET_KEY: secretKey,
158
+ REGION: region
157
159
  };
158
160
 
159
161
  saveCredentials(newCreds);
@@ -26,6 +26,15 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
26
26
 
27
27
  ctx.registerTool(tools.lasDataProfilerTool );
28
28
  logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.lasDataProfilerTool.name}`);
29
+
30
+ ctx.registerTool(tools.listS3ObjectsTool );
31
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listS3ObjectsTool.name}`);
32
+
33
+ ctx.registerTool(tools.readS3ObjectTool );
34
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.readS3ObjectTool.name}`);
35
+
36
+ ctx.registerTool(tools.writeLanceCatalogTool );
37
+ logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.writeLanceCatalogTool.name}`);
29
38
 
30
39
  ctx.registerTool(tools.listDatasourceTool );
31
40
  logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listDatasourceTool.name}`);