@byted-las/contextlake-openclaw 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/README.md +64 -0
  2. package/bin/contextlake-openclaw.js +5 -0
  3. package/dist/index.d.ts +113 -0
  4. package/dist/index.js +73 -0
  5. package/dist/src/client/lancedb.d.ts +30 -0
  6. package/dist/src/client/lancedb.js +113 -0
  7. package/dist/src/client/tos.d.ts +19 -0
  8. package/dist/src/client/tos.js +81 -0
  9. package/dist/src/commands/cli.d.ts +6 -0
  10. package/dist/src/commands/cli.js +78 -0
  11. package/dist/src/commands/index.d.ts +1 -0
  12. package/dist/src/commands/index.js +139 -0
  13. package/dist/src/commands/slashcmd.d.ts +14 -0
  14. package/dist/src/commands/slashcmd.js +91 -0
  15. package/dist/src/commands/tools.d.ts +219 -0
  16. package/dist/src/commands/tools.js +286 -0
  17. package/dist/src/lib/actions/ingest.d.ts +8 -0
  18. package/dist/src/lib/actions/ingest.js +123 -0
  19. package/dist/src/lib/actions/manage.d.ts +15 -0
  20. package/dist/src/lib/actions/manage.js +91 -0
  21. package/dist/src/lib/actions/retrieve.d.ts +8 -0
  22. package/dist/src/lib/actions/retrieve.js +73 -0
  23. package/dist/src/processor/loader.d.ts +7 -0
  24. package/dist/src/processor/loader.js +83 -0
  25. package/dist/src/service/embedding/factory.d.ts +2 -0
  26. package/dist/src/service/embedding/factory.js +16 -0
  27. package/dist/src/service/embedding/interface.d.ts +18 -0
  28. package/dist/src/service/embedding/interface.js +2 -0
  29. package/dist/src/service/embedding/local.d.ts +14 -0
  30. package/dist/src/service/embedding/local.js +104 -0
  31. package/dist/src/service/embedding/remote.d.ts +9 -0
  32. package/dist/src/service/embedding/remote.js +42 -0
  33. package/dist/src/service/metadata/factory.d.ts +13 -0
  34. package/dist/src/service/metadata/factory.js +48 -0
  35. package/dist/src/service/metadata/interface.d.ts +17 -0
  36. package/dist/src/service/metadata/interface.js +2 -0
  37. package/dist/src/service/metadata/local.d.ts +13 -0
  38. package/dist/src/service/metadata/local.js +49 -0
  39. package/dist/src/service/storage/factory.d.ts +2 -0
  40. package/dist/src/service/storage/factory.js +19 -0
  41. package/dist/src/service/storage/interface.d.ts +32 -0
  42. package/dist/src/service/storage/interface.js +2 -0
  43. package/dist/src/service/storage/local.d.ts +9 -0
  44. package/dist/src/service/storage/local.js +72 -0
  45. package/dist/src/skills/las-data-profiler/index.d.ts +26 -0
  46. package/dist/src/skills/las-data-profiler/index.js +231 -0
  47. package/dist/src/skills/las-data-profiler/register.d.ts +1 -0
  48. package/dist/src/skills/las-data-profiler/register.js +19 -0
  49. package/dist/src/utils/config.d.ts +1 -0
  50. package/dist/src/utils/config.js +16 -0
  51. package/index.ts +78 -0
  52. package/openclaw.plugin.json +57 -0
  53. package/package.json +52 -0
  54. package/src/client/lancedb.ts +102 -0
  55. package/src/client/tos.ts +100 -0
  56. package/src/commands/cli.ts +77 -0
  57. package/src/commands/index.ts +156 -0
  58. package/src/commands/slashcmd.ts +95 -0
  59. package/src/commands/tools.ts +286 -0
  60. package/src/lib/actions/ingest.ts +103 -0
  61. package/src/lib/actions/manage.ts +107 -0
  62. package/src/lib/actions/retrieve.ts +90 -0
  63. package/src/processor/loader.ts +58 -0
  64. package/src/service/embedding/factory.ts +13 -0
  65. package/src/service/embedding/interface.ts +21 -0
  66. package/src/service/embedding/local.ts +118 -0
  67. package/src/service/embedding/remote.ts +45 -0
  68. package/src/service/metadata/factory.ts +52 -0
  69. package/src/service/metadata/interface.ts +19 -0
  70. package/src/service/metadata/local.ts +60 -0
  71. package/src/service/storage/factory.ts +16 -0
  72. package/src/service/storage/interface.ts +36 -0
  73. package/src/service/storage/local.ts +42 -0
  74. package/src/skills/contextlake-delete/SKILL.md +36 -0
  75. package/src/skills/contextlake-ingest/SKILL.md +40 -0
  76. package/src/skills/contextlake-list/SKILL.md +22 -0
  77. package/src/skills/contextlake-retrieve/SKILL.md +37 -0
  78. package/src/skills/las-data-profiler/SKILL.md +174 -0
  79. package/src/skills/las-data-profiler/index.ts +254 -0
  80. package/src/skills/las-data-profiler/register.ts +19 -0
  81. package/src/skills/las-data-profiler/s3_catalog.py +608 -0
  82. package/src/utils/config.ts +13 -0
@@ -0,0 +1,60 @@
import * as os from 'os';
import * as path from 'path';

import { ContextLakeLanceDBClient, DocumentSchema } from '../../client/lancedb';
import { createEmbeddingProvider } from '../embedding/factory';
import { EmbeddingProvider } from '../embedding/interface';
import { MetadataProvider, MetadataConfig } from './interface';

6
+ export class LocalMetadataProvider implements MetadataProvider {
7
+ private client: ContextLakeLanceDBClient;
8
+ private embeddingProvider: EmbeddingProvider;
9
+
10
+ constructor(config: MetadataConfig) {
11
+ if (!config.lancedb_uri) {
12
+ // Fallback to default if somehow not passed
13
+ // Use an absolute path or path relative to home to avoid issues when running in different cwds via OpenClaw daemon
14
+ const os = require('os');
15
+ const path = require('path');
16
+ config.lancedb_uri = path.join(os.homedir(), '.openclaw', 'contextlake', 'data');
17
+ }
18
+
19
+ // Ensure embedding config exists
20
+ if (!config.embedding) {
21
+ config.embedding = {
22
+ provider: 'local',
23
+ model_name: 'hf:CompendiumLabs/bge-small-zh-v1.5-gguf/bge-small-zh-v1.5-f16.gguf'
24
+ };
25
+ }
26
+
27
+ if (!config.lancedb_uri || !config.embedding) {
28
+ throw new Error(`Missing LanceDB configuration: uri=${config.lancedb_uri}, embedding=${!!config.embedding}`);
29
+ }
30
+ this.embeddingProvider = createEmbeddingProvider(config.embedding);
31
+ this.client = new ContextLakeLanceDBClient(
32
+ { uri: config.lancedb_uri },
33
+ this.embeddingProvider
34
+ );
35
+ }
36
+
37
+ async connect(): Promise<void> {
38
+ await this.client.connect();
39
+ }
40
+
41
+ async addAssets(docs: DocumentSchema[]): Promise<void> {
42
+ await this.client.addAssets(docs);
43
+ }
44
+
45
+ async search(query: string, limit?: number, filter?: string): Promise<DocumentSchema[]> {
46
+ return await this.client.search(query, limit, filter) as unknown as DocumentSchema[];
47
+ }
48
+
49
+ async list(limit?: number, filter?: string): Promise<DocumentSchema[]> {
50
+ return await this.client.list(limit, filter) as unknown as DocumentSchema[];
51
+ }
52
+
53
+ async delete(filter: string): Promise<void> {
54
+ await this.client.delete(filter);
55
+ }
56
+
57
+ async generateEmbedding(text: string): Promise<number[]> {
58
+ return await this.embeddingProvider.generateEmbedding(text);
59
+ }
60
+ }
@@ -0,0 +1,16 @@
1
+ import { StorageProvider, StorageConfig } from './interface';
2
+ import { LocalStorageProvider } from './local';
3
+ import { ContextLakeTosClient } from '../../client/tos';
4
+
5
+ export function createStorageProvider(config: StorageConfig): StorageProvider {
6
+ if (config.type === 'local') {
7
+ return new LocalStorageProvider(config.local_base_dir);
8
+ } else if (config.type === 'tos') {
9
+ if (!config.tos || !config.tos.region || !config.tos.path) {
10
+ throw new Error('Missing TOS configuration: region and path required');
11
+ }
12
+ return new ContextLakeTosClient(config.tos as any);
13
+ } else {
14
+ throw new Error(`Unsupported storage type: ${config.type}`);
15
+ }
16
+ }
@@ -0,0 +1,36 @@
1
+
2
/**
 * Abstraction over a blob store (local filesystem or Volcengine TOS).
 * URLs returned by uploadFile are opaque to callers and must be passed
 * back unchanged to downloadFile / deleteFile.
 */
export interface StorageProvider {
  /**
   * Upload file content
   * @param fileName - Name of the file
   * @param buffer - File content
   * @returns The storage URL (e.g. file:///... or tos://...)
   */
  uploadFile(fileName: string, buffer: Buffer): Promise<string>;

  /**
   * Download file content
   * @param url - Storage URL
   * @returns File content buffer
   */
  downloadFile(url: string): Promise<Buffer>;

  /**
   * Delete file
   * @param url - Storage URL
   */
  deleteFile(url: string): Promise<void>;
}
24
+
25
/** Configuration selecting and parameterizing a StorageProvider. */
export interface StorageConfig {
  // Backend selector; interpreted by createStorageProvider.
  type: 'local' | 'tos';
  // Root directory when type === 'local'; the provider defaults it
  // to './data/files' when omitted.
  local_base_dir?: string;
  // Settings when type === 'tos'; region and path are validated as
  // required at runtime by the factory.
  tos?: {
    access_key?: string;
    secret_key?: string;
    region?: string;
    path?: string; // e.g. tos://bucket/base/path/
    endpoint?: string;
    sts_token?: string;
  };
}
@@ -0,0 +1,42 @@
1
+ import * as fs from 'fs';
2
+ import * as path from 'path';
3
+ import { StorageProvider } from './interface';
4
+
5
+ export class LocalStorageProvider implements StorageProvider {
6
+ private baseDir: string;
7
+
8
+ constructor(baseDir: string = './data/files') {
9
+ this.baseDir = path.resolve(baseDir);
10
+ if (!fs.existsSync(this.baseDir)) {
11
+ fs.mkdirSync(this.baseDir, { recursive: true });
12
+ }
13
+ }
14
+
15
+ async uploadFile(fileName: string, buffer: Buffer): Promise<string> {
16
+ if (!fs.existsSync(this.baseDir)) {
17
+ fs.mkdirSync(this.baseDir, { recursive: true });
18
+ }
19
+ const filePath = path.join(this.baseDir, fileName);
20
+ await fs.promises.writeFile(filePath, buffer);
21
+ return `file://${filePath}`;
22
+ }
23
+
24
+ async downloadFile(url: string): Promise<Buffer> {
25
+ const filePath = this.parseUrl(url);
26
+ return await fs.promises.readFile(filePath);
27
+ }
28
+
29
+ async deleteFile(url: string): Promise<void> {
30
+ const filePath = this.parseUrl(url);
31
+ if (fs.existsSync(filePath)) {
32
+ await fs.promises.unlink(filePath);
33
+ }
34
+ }
35
+
36
+ private parseUrl(url: string): string {
37
+ if (!url.startsWith('file://')) {
38
+ throw new Error(`Invalid local file URL: ${url}`);
39
+ }
40
+ return url.replace('file://', '');
41
+ }
42
+ }
@@ -0,0 +1,36 @@
1
+ name: contextlake-delete
2
+ description: |
3
+ Delete documents and assets from the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
4
+ Use this tool when the user wants to "删除某个文档", "清理知识库", "移除文件", "delete knowledge", or "remove documents".
5
+ Supports deleting documents by their specific IDs or by applying a SQL-like filter.
6
+
7
+ Example User Queries:
8
+ - "删除 ID 为 12345 的文档"
9
+ - "Please delete the old architecture document from the knowledge base"
10
+ - "从知识湖中移除 category 是 finance 的所有记录"
11
+
12
+ Example Tool Call (by ID):
13
+ ```json
14
+ {
15
+ "file_ids": ["doc-123", "doc-456"]
16
+ }
17
+ ```
18
+
19
+ Example Tool Call (by Filter):
20
+ ```json
21
+ {
22
+ "filter": "metadata.category = 'finance'"
23
+ }
24
+ ```
25
+
26
+ parameters:
27
+ file_ids:
28
+ type: array
29
+ items:
30
+ type: string
31
+ description: List of specific document IDs to delete.
32
+ required: false
33
+ filter:
34
+ type: string
35
+ description: SQL-like filter string to identify documents to delete (e.g. "metadata.category = 'obsolete'").
36
+ required: false
@@ -0,0 +1,40 @@
1
+ name: contextlake-ingest
2
+ description: |
3
+ Upload, ingest, and index documents into the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
4
+ Use this tool when the user wants to "将知识注入", "上传文件", "入库", "添加文档", "ingest files", or "add knowledge".
5
+ Supports processing of various file types including PDF, Word, Markdown, and Text.
6
+ Automatically handles text extraction, cleaning, chunking, embedding generation, and storage.
7
+
8
+ Example User Queries:
9
+ - "帮我把这个文档注入到知识湖中"
10
+ - "上传这份 PDF 到知识库"
11
+ - "Please ingest these documents into ContextLake"
12
+ - "将 /path/to/doc.txt 添加到知识库"
13
+
14
+ Example Tool Call:
15
+ ```json
16
+ {
17
+ "files": ["/absolute/path/to/document.pdf"],
18
+ "metadata": { "category": "tech" }
19
+ }
20
+ ```
21
+
22
+ parameters:
23
+ files:
24
+ type: array
25
+ items:
26
+ type: string
27
+ description: List of file paths to ingest
28
+ required: true
29
+ metadata:
30
+ type: object
31
+ description: Optional JSON metadata to attach to documents
32
+ required: false
33
+ chunkSize:
34
+ type: integer
35
+ description: Chunk size for text splitting
36
+ required: false
37
+ overlap:
38
+ type: integer
39
+ description: Overlap size for text splitting
40
+ required: false
@@ -0,0 +1,22 @@
1
+ name: contextlake-list
2
+ description: |
3
+ List documents and assets currently in the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
4
+ Use this tool when the user wants to "列出所有知识", "查看知识库文件", "显示文档列表", "list knowledge", or "show documents".
5
+
6
+ Example User Queries:
7
+ - "知识湖里目前有哪些文件?"
8
+ - "列出前10个知识库文档"
9
+ - "Show me all documents in the knowledge base"
10
+
11
+ Example Tool Call:
12
+ ```json
13
+ {
14
+ "limit": 50
15
+ }
16
+ ```
17
+
18
+ parameters:
19
+ limit:
20
+ type: integer
21
+ description: Maximum number of documents to return (default 100).
22
+ required: false
@@ -0,0 +1,37 @@
1
+ name: contextlake-retrieve
2
+ description: |
3
+ Search, query, and retrieve relevant information from the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
4
+ Use this tool when the user wants to "搜索知识", "获取信息", "召回文档", "查询知识库", "search knowledge base", or "retrieve documents".
5
+ Uses vector similarity search to find semantically related document chunks.
6
+
7
+ Example User Queries:
8
+ - "知识库里有关于产品安装的文档吗?"
9
+ - "帮我从知识湖中召回关于财务报表的资料"
10
+ - "Search the knowledge base for deployment guides"
11
+ - "根据知识库内容,回答如何配置网关"
12
+
13
+ Example Tool Call:
14
+ ```json
15
+ {
16
+ "query": "产品安装指南",
17
+ "top_k": 5
18
+ }
19
+ ```
20
+
21
+ parameters:
22
+ query:
23
+ type: string
24
+ description: Search query
25
+ required: true
26
+ top_k:
27
+ type: integer
28
+ description: Number of results to return
29
+ required: false
30
+ filter:
31
+ type: string
32
+ description: Filter string
33
+ required: false
34
+ include_binary:
35
+ type: boolean
36
+ description: Whether to include binary content
37
+ required: false
@@ -0,0 +1,174 @@
1
+ name: byted-las-data-profiler
2
+ description: |
3
+ Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them,
4
+ performs schema inference and column semantic analysis on structured data (JSONL/CSV/Parquet/JSON),
5
+ extracts key meta-information for media files (Image/Audio/Video/PDF) by reading only header bytes,
6
+ and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
7
+
8
+ ## Trigger Scenarios
9
+ Be sure to use this Skill when the user mentions the following scenarios:
10
+ - Need to scan the file structure in a TOS bucket or understand the dataset composition
11
+ - Need to connect to object storage (TOS/OSS/COS/S3) using the S3 protocol
12
+ - Need to scan, traverse, or catalog the file structure of a specific bucket or local directory
13
+ - Need to understand what a batch of data files contains and what their schema looks like
14
+ - Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
15
+ - Need to write the meta-information of object storage or local files into LanceDB
16
+ - Mentions TOS, boto3, or object storage data profiling
17
+ - Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
18
+ - Need to batch identify the type and size of remote/local files and build an index
19
+ - Need to quickly understand the structure of an unfamiliar dataset (what files are there, how the schema is, field meanings)
20
+ - Need to connect/dock a data source for profiling
21
+ - Mentions "connecting" or "integrating" (对接) a data source
22
+
23
+ ## Overview
24
+ This Skill is a Dataset Profiling Tool provided by Volcengine TOS. It connects to the TOS bucket via an S3-compatible protocol (boto3), recursively traverses the target path, and accomplishes three things:
25
+ 1. **Cataloging**: Records the meta-information (path, size, type, origin, etc.) of all files.
26
+ 2. **Understanding Structured Data**: Samples and parses the schema for JSONL / CSV / TSV / Parquet / JSON to infer the semantic role of each column.
27
+ 3. **Extracting Media Meta-information**: Reads only the file header (without downloading the full file) for images, audio, video, and PDFs to extract key attributes.
28
+
29
+ In addition to TOS, this tool is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
30
+
31
+ ## Supported Data Sources
32
+ | Vendor | Service Name | S3 Endpoint Format | Signature | Addressing Style |
33
+ |--------|--------------|--------------------|-----------|------------------|
34
+ | Volcengine | Volcengine TOS | https://tos-s3-{region}.volces.com / ivolces.com | V4 | Virtual-hosted (Required) |
35
+ | Alibaba | Alibaba Cloud OSS | https://s3.oss-{region}.aliyuncs.com / -internal.aliyuncs.com | V2 (boto3 must use signature_version='s3') | Virtual-hosted |
36
+ | Tencent | Tencent Cloud COS | https://cos.{region}.myqcloud.com | V4 | Virtual-hosted |
37
+ | AWS | Amazon S3 | https://s3.{region}.amazonaws.com | V4 | Virtual-hosted |
38
+ | Local | Local File System | Not required | Not required | os.walk |
39
+
40
+ ## Overall Workflow
41
+ 1. Confirm data source parameters.
42
+ 2. Install dependencies (boto3 lancedb pyarrow pandas Pillow mutagen pymupdf).
43
+ 3. Execute the Python script, making three scanning passes:
44
+ - Pass 1: Traverse all files → file_catalog
45
+ - Pass 2: Sample structured data → structured_schemas
46
+ - Pass 3: Read-only header for media files → media_metadata
47
+ 4. Report results.
48
+
49
+ ## Parameter Description
50
+ | Parameter | Description | Example |
51
+ |-----------|-------------|---------|
52
+ | datasource_name | The name of the data source, used to identify this connection. LanceDB output directory is ~/.openclaw/las-data-profiler/{datasource_name}/, and configuration file is env.sh in the same directory | my_tos_data |
53
+ | vendor | volcengine / alibaba / tencent / aws / local | volcengine |
54
+ | endpoint | S3 Endpoint URL (not required for local) | https://tos-s3-cn-beijing.volces.com |
55
+ | access_key | AK, can be from parameters or environment variables | - |
56
+ | secret_key | SK, can be from parameters or environment variables | - |
57
+ | region | Region identifier | cn-beijing |
58
+ | bucket | Bucket name (root directory path when local) | my-data-bucket |
59
+ | prefix | Path prefix to limit the scan scope (required to avoid scanning the entire bucket) | datasets/2024/ |
60
+ | sample_rows | [Optional] Number of sampled rows per structured file, default is 100 | 200 |
61
+
62
+ ## Authentication Priority
63
+ Parameters > Environment Variables (TOS_ACCESS_KEY / S3_ACCESS_KEY / AWS_ACCESS_KEY_ID)
64
+
65
+ ## Connection Configuration Differences Across Vendors
66
+ - Volcengine TOS: Does not support PathStyle, must be Virtual-hosted (handled by default by the tool).
67
+ - Alibaba Cloud OSS: boto3 must use V2 signature (signature_version='s3') because V4 will force chunked encoding (handled by default by the tool).
68
+ - Tencent Cloud COS / AWS: Default V4 is sufficient.
69
+
70
+ ## File Classification and Processing Strategy
71
+ | Classification | Extension | Processing |
72
+ |----------------|-----------|------------|
73
+ | structured | .json .jsonl .ndjson .csv .tsv .parquet .pq | Sample parsing schema + column semantic inference |
74
+ | image | .jpg .png .gif .bmp .webp .tiff .svg ... | Read first 64 KB → width, height, format, color mode, EXIF summary |
75
+ | audio | .mp3 .wav .flac .aac .ogg .m4a ... | Read first 512 KB → duration, sample rate, channels, bitrate, codec, tags |
76
+ | video | .mp4 .avi .mov .mkv .webm ... | Read first 2 MB → duration, resolution, container format |
77
+ | pdf | .pdf | Read first 256 KB → page count, title/author, creation time, encryption status, page size |
78
+ | Other | All remaining | Only record meta-information to file_catalog |
79
+
80
+ Header reading uses S3's Range requests (Range: bytes=0-N) to download only the required bytes, without transferring the full file.
81
+
82
+ ## Understanding Structured Data
83
+ Sample the first N rows of each structured file to infer the semantic_hint for each column:
84
+
85
+ | semantic_hint | Judgment Logic | Example |
86
+ |---------------|----------------|---------|
87
+ | id | High uniqueness rate + short string | user_id, sample_001 |
88
+ | categorical | Low cardinality (unique count < 50 or uniqueness rate < 20%) | label: cat/dog/bird |
89
+ | numeric | int/float type | score: 0.95 |
90
+ | boolean | bool or only two distinct values | is_valid: true/false |
91
+ | text | Long string (avg > 50 characters) + high uniqueness rate | caption: "A cat sitting..." |
92
+ | file_path | > 50% values match path/URI patterns | images/001.jpg, s3://bucket/file |
93
+ | timestamp | Matches time format | 2024-01-15T10:30:00Z |
94
+ | structured | list / dict type | bbox: [10, 20, 100, 200] |
95
+ | constant | Only 1 distinct value | version: "1.0" |
96
+
97
+ ## LanceDB Three Tables
98
+ ### file_catalog — Full file index
99
+ | Column | Description |
100
+ |--------|-------------|
101
+ | file_path, file_name, extension, mime_type | Path and type information |
102
+ | category | structured / non-structured |
103
+ | media_type | image / audio / video / pdf / empty |
104
+ | size_bytes, last_modified, created_time, etag, storage_class | Storage attributes (created_time is only available for local files, empty for S3 objects) |
105
+ | is_multipart | ETag contains multipart flag (-), indicating a large file uploaded in parts |
106
+ | depth | Path level (number of /), helps to understand the directory structure |
107
+ | parent_dir | Nearest parent directory name, convenient for aggregation statistics by directory |
108
+ | vendor, bucket | Data source identifier |
109
+ | has_schema | Whether there are in-depth analysis records in structured_schemas |
110
+ | has_media_meta | Whether there are header meta-information records in media_metadata |
111
+ | scan_timestamp | Scan time |
112
+
113
+ ### structured_schemas — Column-level schema of structured data
114
+ | Column | Description |
115
+ |--------|-------------|
116
+ | file_path, vendor, bucket, format | File identifier |
117
+ | column_name, column_type, non_null_ratio, unique_count | Basic statistics |
118
+ | sample_values | First 3 distinct values (JSON) |
119
+ | semantic_hint, semantic_detail | Semantic tags and supplementary explanations |
120
+
121
+ ### media_metadata — Header meta-information of media files
122
+ | Column | Applicable Type | Description |
123
+ |--------|-----------------|-------------|
124
+ | width, height | Image / Video | Resolution (pixels) |
125
+ | image_format, color_mode | Image | e.g., JPEG, RGB |
126
+ | exif_summary | Image | Camera model, shooting time, GPS, etc. (JSON) |
127
+ | duration_sec | Audio / Video | Duration (seconds) |
128
+ | codec, sample_rate, channels, bitrate | Audio | Encoding parameters |
129
+ | container | Video | Container format (mp4, mkv...) |
130
+ | tags_summary | Audio | Title/artist/album tags, etc. (JSON) |
131
+ | page_count, pdf_title, pdf_author | PDF | Document attributes |
132
+ | creation_date, encrypted | PDF | Creation time, encryption status |
133
+ | page_width_pt, page_height_pt | PDF | First page size (points) |
134
+ | extract_error | All | Error message when extraction fails |
135
+
136
+ ## Output Location
137
+ - LanceDB table storage path: `~/.openclaw/las-data-profiler/{datasource_name}/`
138
+ - Configuration file: `~/.openclaw/las-data-profiler/{datasource_name}/env.sh`
139
+
140
+ ## Usage Examples
141
+ ```bash
142
+ # Volcengine TOS (Recommended)
143
+ python s3_catalog.py \
144
+ --vendor volcengine \
145
+ --endpoint https://tos-s3-cn-beijing.volces.com \
146
+ --ak "$TOS_ACCESS_KEY" --sk "$TOS_SECRET_KEY" \
147
+ --region cn-beijing --bucket my-bucket --prefix data/
148
+
149
+ # Local Directory
150
+ python s3_catalog.py --vendor local --bucket /path/to/data --prefix .
151
+
152
+ # Alibaba Cloud OSS
153
+ python s3_catalog.py \
154
+ --vendor alibaba \
155
+ --endpoint https://s3.oss-cn-hangzhou.aliyuncs.com \
156
+ --ak "$ALI_AK" --sk "$ALI_SK" \
157
+ --region cn-hangzhou --bucket my-bucket --prefix datasets/
158
+
159
+ # Tencent Cloud COS
160
+ python s3_catalog.py \
161
+ --vendor tencent \
162
+ --endpoint https://cos.ap-beijing.myqcloud.com \
163
+ --ak "$COS_AK" --sk "$COS_SK" \
164
+ --region ap-beijing --bucket my-bucket --prefix train/
165
+
166
+ # AWS S3
167
+ python s3_catalog.py --vendor aws --region us-east-1 --bucket my-bucket --prefix train/
168
+ ```
169
+
170
+ ## Frequently Asked Questions (FAQ)
171
+ - Volcengine TOS: InvalidPathAccess → TOS does not support PathStyle, ensure addressing_style='virtual' (handled by default by the tool).
172
+ - Alibaba Cloud OSS: InvalidArgument: aws-chunked encoding not supported → V2 signature Config(signature_version='s3') must be used (handled by default by the tool).
173
+ - Media meta-information extraction failed → The file might be corrupted or in a non-standard format. The media_metadata.extract_error column will record the specific error, and other fields will fall back to default values (0 / empty string).
174
+ - Video meta-information is incomplete → The position of the moov box for MP4/MOV is not fixed—if moov is at the end of the file (stream-written video), it might not be found in the first 2MB. In this case, duration/resolution will be 0.