@byted-las/contextlake-openclaw 1.0.7 → 1.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,7 @@ description: |
7
7
  and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
8
8
 
9
9
  IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
10
- You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) to accomplish the profiling tasks.
10
+ You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) via formal tool calls to accomplish the profiling tasks. DO NOT execute them as bash/shell commands.
11
11
  ---
12
12
 
13
13
  ## Trigger Scenarios
@@ -38,8 +38,8 @@ This Skill acts as a Dataset Profiling Guide. You should use the `list-s3-object
38
38
  - Table names: `files`, `structured_schemas`, `media_metadata`
39
39
 
40
40
  ## Available Tools for this Skill
41
- - `list-s3-objects`: To traverse and list files in the bucket/directory.
42
- - `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction.
43
- - `write-lance-catalog`: To write the profiling results to the LanceDB catalog.
41
+ - `list-s3-objects`: To traverse and list files in the bucket/directory. (Call this as an Agent Tool, NOT a bash command).
42
+ - `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction. (Call this as an Agent Tool, NOT a bash command).
43
+ - `write-lance-catalog`: To write the profiling results to the LanceDB catalog. (Call this as an Agent Tool, NOT a bash command).
44
44
 
45
45
  Always report the final profiling summary back to the user once the `write-lance-catalog` completes successfully.
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "id": "contextlake-openclaw",
3
3
  "name": "ContextLake",
4
- "version": "1.0.7",
4
+ "version": "1.0.8",
5
5
  "description": "A lightweight knowledge base plugin for OpenClaw using LanceDB and TOS, with data profiling support",
6
6
  "skills": ["./src/skills"],
7
7
  "configSchema": {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@byted-las/contextlake-openclaw",
3
- "version": "1.0.7",
3
+ "version": "1.0.8",
4
4
  "description": "ContextLake OpenClaw Plugin for managing knowledge base",
5
5
  "main": "index.ts",
6
6
  "files": [
@@ -16,7 +16,7 @@
16
16
  },
17
17
  "scripts": {
18
18
  "build": "tsc && npm run copy-assets",
19
- "copy-assets": "cp -r src/skills/*/*.md dist/src/skills/ 2>/dev/null || true && for dir in src/skills/*; do if [ -d \"$dir\" ]; then target=\"dist/$dir\"; mkdir -p \"$target\"; cp \"$dir\"/*.md \"$target\"/ 2>/dev/null || true; fi; done && mkdir -p dist/src/lib/scripts && cp src/lib/scripts/*.py dist/src/lib/scripts/ 2>/dev/null || true",
19
+ "copy-assets": "mkdir -p dist/src/skills && cp -r src/skills/* dist/src/skills/ 2>/dev/null || true && mkdir -p dist/src/lib/scripts && cp src/lib/scripts/*.py dist/src/lib/scripts/ 2>/dev/null || true",
20
20
  "test": "vitest --reporter verbose",
21
21
  "test:local": "npx ts-node scripts/local-test.ts",
22
22
  "test:profiler": "npx ts-node scripts/local-profiler-test.ts",
@@ -7,7 +7,7 @@ description: |
7
7
  and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
8
8
 
9
9
  IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
10
- You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) to accomplish the profiling tasks.
10
+ You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) via formal tool calls to accomplish the profiling tasks. DO NOT execute them as bash/shell commands.
11
11
  ---
12
12
 
13
13
  ## Trigger Scenarios
@@ -38,8 +38,8 @@ This Skill acts as a Dataset Profiling Guide. You should use the `list-s3-object
38
38
  - Table names: `files`, `structured_schemas`, `media_metadata`
39
39
 
40
40
  ## Available Tools for this Skill
41
- - `list-s3-objects`: To traverse and list files in the bucket/directory.
42
- - `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction.
43
- - `write-lance-catalog`: To write the profiling results to the LanceDB catalog.
41
+ - `list-s3-objects`: To traverse and list files in the bucket/directory. (Call this as an Agent Tool, NOT a bash command).
42
+ - `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction. (Call this as an Agent Tool, NOT a bash command).
43
+ - `write-lance-catalog`: To write the profiling results to the LanceDB catalog. (Call this as an Agent Tool, NOT a bash command).
44
44
 
45
45
  Always report the final profiling summary back to the user once the `write-lance-catalog` completes successfully.
@@ -1,15 +0,0 @@
1
- import { ContextLakeConfig } from '../../utils/config';
2
- export interface IngestSourceParams {
3
- datasource_name: string;
4
- }
5
- export declare function ingestSource(params: IngestSourceParams, config: ContextLakeConfig, logger?: any): Promise<({
6
- file: any;
7
- status: string;
8
- chunks: number;
9
- message?: undefined;
10
- } | {
11
- file: any;
12
- status: string;
13
- message: any;
14
- chunks?: undefined;
15
- })[]>;
@@ -1,193 +0,0 @@
1
- "use strict";
2
- var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
- if (k2 === undefined) k2 = k;
4
- var desc = Object.getOwnPropertyDescriptor(m, k);
5
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
- desc = { enumerable: true, get: function() { return m[k]; } };
7
- }
8
- Object.defineProperty(o, k2, desc);
9
- }) : (function(o, m, k, k2) {
10
- if (k2 === undefined) k2 = k;
11
- o[k2] = m[k];
12
- }));
13
- var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
- Object.defineProperty(o, "default", { enumerable: true, value: v });
15
- }) : function(o, v) {
16
- o["default"] = v;
17
- });
18
- var __importStar = (this && this.__importStar) || (function () {
19
- var ownKeys = function(o) {
20
- ownKeys = Object.getOwnPropertyNames || function (o) {
21
- var ar = [];
22
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
- return ar;
24
- };
25
- return ownKeys(o);
26
- };
27
- return function (mod) {
28
- if (mod && mod.__esModule) return mod;
29
- var result = {};
30
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
- __setModuleDefault(result, mod);
32
- return result;
33
- };
34
- })();
35
- Object.defineProperty(exports, "__esModule", { value: true });
36
- exports.ingestSource = ingestSource;
37
- const factory_1 = require("../../service/metadata/factory");
38
- const las_api_1 = require("./las-api");
39
- const lancedb = __importStar(require("@lancedb/lancedb"));
40
- const path = __importStar(require("path"));
41
- const fs = __importStar(require("fs"));
42
- const os = __importStar(require("os"));
43
- // @ts-ignore
44
- const uuid_1 = require("uuid");
45
- const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
46
- async function ingestSource(params, config, logger) {
47
- if (logger) {
48
- logger.info(`[ContextLake-Action] Calling ingestSource with params: ${JSON.stringify(params)}`);
49
- }
50
- else {
51
- // eslint-disable-next-line no-console
52
- console.log(`[ContextLake-Action] Calling ingestSource with params: ${JSON.stringify(params)}`);
53
- }
54
- const dsDir = path.join(BASE_DIR, params.datasource_name);
55
- const dbPath = path.join(dsDir, 'catalog_db');
56
- if (!fs.existsSync(dbPath)) {
57
- throw new Error(`Data source database not found at ${dbPath}. Please run profiler connect first.`);
58
- }
59
- const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
60
- const metadataProvider = (0, factory_1.createMetadataProvider)(metaConfig);
61
- await metadataProvider.connect();
62
- const lasClient = new las_api_1.LasApiClient(config, logger);
63
- const results = [];
64
- // Connect to the profiler LanceDB to read the file catalog
65
- const profilerDb = await lancedb.connect(dbPath);
66
- const tableNames = await profilerDb.tableNames();
67
- if (!tableNames.includes('file_catalog')) {
68
- throw new Error(`table 'file_catalog' not found in ${dbPath}`);
69
- }
70
- const catalogTable = await profilerDb.openTable('file_catalog');
71
- const files = await catalogTable.query().toArray();
72
- logger?.info(`[ContextLake-Action] Found ${files.length} files in catalog`);
73
- // Simple chunking for text
74
- const splitText = (text, chunkSize = 500, overlap = 50) => {
75
- const chunks = [];
76
- if (!text)
77
- return chunks;
78
- let i = 0;
79
- while (i < text.length) {
80
- chunks.push(text.slice(i, i + chunkSize));
81
- i += chunkSize - overlap;
82
- }
83
- return chunks;
84
- };
85
- const processText = async (text, fileInfo) => {
86
- const chunks = splitText(text);
87
- const docs = [];
88
- for (const chunk of chunks) {
89
- const vector = await metadataProvider.generateMultimodalEmbedding([{ type: 'text', text: chunk }]);
90
- docs.push({
91
- id: (0, uuid_1.v4)(),
92
- vector,
93
- text: chunk,
94
- source: fileInfo.key,
95
- file_type: fileInfo.category,
96
- storage_type: 'source',
97
- url: fileInfo.url || `tos://${fileInfo.bucket}/${fileInfo.key}`,
98
- metadata: JSON.stringify({ datasource: params.datasource_name }),
99
- created_at: Date.now(),
100
- binary_data: Buffer.from('')
101
- });
102
- }
103
- return docs;
104
- };
105
- for (const file of files) {
106
- try {
107
- logger?.info(`[ContextLake-Action] Processing file: ${file.key}, type: ${file.media_type}`);
108
- let docs = [];
109
- const fileUrl = file.url || `tos://${file.bucket}/${file.key}`;
110
- if (file.media_type === 'pdf') {
111
- // PDF Parse
112
- const result = await lasClient.submitAndPoll('las_pdf_parse_doubao', {
113
- url: fileUrl
114
- });
115
- const markdown = result.data?.markdown || '';
116
- docs = await processText(markdown, file);
117
- }
118
- else if (file.media_type === 'image') {
119
- // Multimodal Embedding directly
120
- const vector = await metadataProvider.generateMultimodalEmbedding([
121
- { type: 'image_url', image_url: { url: fileUrl } },
122
- { type: 'text', text: 'This is an image from the dataset.' }
123
- ]);
124
- docs.push({
125
- id: (0, uuid_1.v4)(),
126
- vector,
127
- text: 'Image from dataset',
128
- source: file.key,
129
- file_type: 'image',
130
- storage_type: 'source',
131
- url: fileUrl,
132
- metadata: JSON.stringify({ datasource: params.datasource_name }),
133
- created_at: Date.now(),
134
- binary_data: Buffer.from('')
135
- });
136
- }
137
- else if (file.media_type === 'audio') {
138
- // ASR
139
- const result = await lasClient.submitAndPoll('las_asr_pro', {
140
- audio: { url: fileUrl, format: file.key.split('.').pop() || 'wav' },
141
- request: { model_name: 'bigmodel' }
142
- });
143
- const text = result.data?.result?.text || '';
144
- docs = await processText(text, file);
145
- }
146
- else if (file.media_type === 'video') {
147
- // Video understanding -> text -> embedding
148
- const result = await lasClient.submitAndPoll('las_long_video_understand', {
149
- video_url: fileUrl,
150
- query: "详细描述这个视频的内容",
151
- model_name: "doubao-seed-2-0-lite-260215"
152
- });
153
- // Assuming video output is a text description somewhere in the response.
154
- // Note: the exact structure depends on the API return, adjusting to generic text.
155
- const text = JSON.stringify(result.data || '');
156
- // Also need audio extract and ASR for video
157
- // 1. Extract audio
158
- // The output_path_template needs a unique path per video
159
- const audioOutputPath = `tos://${file.bucket}/.tmp/audio/${(0, uuid_1.v4)()}.wav`;
160
- await lasClient.process('las_audio_extract_and_split', {
161
- input_path: fileUrl,
162
- output_path_template: audioOutputPath,
163
- output_format: 'wav'
164
- });
165
- // 2. ASR on the extracted audio
166
- // Wait briefly for object to be available if needed (often synchronous but tos takes a ms)
167
- const asrResult = await lasClient.submitAndPoll('las_asr_pro', {
168
- audio: { url: audioOutputPath.replace('{index}.{output_file_ext}', '0.wav'), format: 'wav' },
169
- request: { model_name: 'bigmodel' }
170
- });
171
- const audioText = asrResult.data?.result?.text || '';
172
- // Combine video text and audio text
173
- const combinedText = `Video Description: ${text}\n\nAudio Transcription: ${audioText}`;
174
- docs = await processText(combinedText, file);
175
- }
176
- else if (file.category === 'structured' || file.category === 'non-structured') {
177
- // If we had a direct text content, we could process it here.
178
- // Assuming basic local download or similar is available, but for now we skip raw file reading from TOS in this demo script unless implemented.
179
- // Fallback just logs
180
- logger?.warn(`[ContextLake-Action] Skipping raw text/structured download for ${file.key} - implement TOS download if needed`);
181
- }
182
- if (docs.length > 0) {
183
- await metadataProvider.addAssets(docs);
184
- results.push({ file: file.key, status: 'success', chunks: docs.length });
185
- }
186
- }
187
- catch (error) {
188
- logger?.error(`[ContextLake-Action] Error processing ${file.key}: ${error.message}`);
189
- results.push({ file: file.key, status: 'error', message: error.message });
190
- }
191
- }
192
- return results;
193
- }
@@ -1,64 +0,0 @@
1
- import { ContextLakeConfig } from '../../utils/config';
2
- export declare function lasPdfParseDoubao(params: {
3
- url: string;
4
- start_page?: number;
5
- num_pages?: number;
6
- parse_mode?: string;
7
- }, config?: ContextLakeConfig): Promise<any>;
8
- export declare function lasLongVideoUnderstand(params: {
9
- video_url: string;
10
- prompt: string;
11
- system_prompt?: string;
12
- return_chunk_text?: boolean;
13
- max_tokens?: number;
14
- temperature?: number;
15
- top_p?: number;
16
- }, config?: ContextLakeConfig): Promise<any>;
17
- export declare function lasBareImageTextEmbedding(params: {
18
- input: Array<{
19
- type: string;
20
- text?: string;
21
- image_url?: string;
22
- }>;
23
- encoding_format?: string;
24
- }, config?: ContextLakeConfig): Promise<any>;
25
- export declare function lasSeed20(params: {
26
- model: string;
27
- messages: Array<any>;
28
- stream?: boolean;
29
- max_tokens?: number;
30
- temperature?: number;
31
- top_p?: number;
32
- frequency_penalty?: number;
33
- presence_penalty?: number;
34
- tools?: Array<any>;
35
- tool_choice?: any;
36
- user?: string;
37
- logprobs?: boolean;
38
- top_logprobs?: number;
39
- }, config?: ContextLakeConfig): Promise<any>;
40
- export declare function lasAsrPro(params: {
41
- url?: string;
42
- format?: string;
43
- language?: string;
44
- resource?: string;
45
- use_itn?: boolean;
46
- use_sn?: boolean;
47
- enable_alignment?: boolean;
48
- channel_id?: number;
49
- use_word_info?: boolean;
50
- text_format?: number;
51
- enable_semantic_sentence_detection?: boolean;
52
- boost_words?: Array<{
53
- word: string;
54
- weight: number;
55
- }>;
56
- }, config?: ContextLakeConfig): Promise<any>;
57
- export declare function lasAudioExtractAndSplit(params: {
58
- input_path: string;
59
- output_path_template: string;
60
- split_duration?: number;
61
- output_format?: string;
62
- timeout?: number;
63
- extra_params?: string[];
64
- }, config?: ContextLakeConfig): Promise<any>;
@@ -1,72 +0,0 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.lasPdfParseDoubao = lasPdfParseDoubao;
4
- exports.lasLongVideoUnderstand = lasLongVideoUnderstand;
5
- exports.lasBareImageTextEmbedding = lasBareImageTextEmbedding;
6
- exports.lasSeed20 = lasSeed20;
7
- exports.lasAsrPro = lasAsrPro;
8
- exports.lasAudioExtractAndSplit = lasAudioExtractAndSplit;
9
- function getLASConfig(config) {
10
- // Attempt to get from env vars or config
11
- const endpoint = process.env.LAS_ENDPOINT || (config?.las)?.endpoint;
12
- const apiKey = process.env.LAS_API_KEY || (config?.las)?.api_key;
13
- if (!endpoint || !apiKey) {
14
- throw new Error("LAS_ENDPOINT and LAS_API_KEY must be set in environment variables or config");
15
- }
16
- return { endpoint, apiKey };
17
- }
18
- async function lasFetch(path, payload, config) {
19
- const { endpoint, apiKey } = getLASConfig(config);
20
- const url = `${endpoint.replace(/\/$/, '')}${path}`;
21
- const response = await fetch(url, {
22
- method: 'POST',
23
- headers: {
24
- 'Content-Type': 'application/json',
25
- 'Authorization': `Bearer ${apiKey}`
26
- },
27
- body: JSON.stringify(payload)
28
- });
29
- if (!response.ok) {
30
- let errorText = await response.text().catch(() => '');
31
- throw new Error(`LAS API Error: ${response.status} ${response.statusText} - ${errorText}`);
32
- }
33
- return await response.json();
34
- }
35
- async function lasPdfParseDoubao(params, config) {
36
- return lasFetch('/api/v1/submit', {
37
- operator_id: 'las_pdf_parse_doubao',
38
- operator_version: 'v1',
39
- data: params
40
- }, config);
41
- }
42
- async function lasLongVideoUnderstand(params, config) {
43
- return lasFetch('/api/v1/submit', {
44
- operator_id: 'las_long_video_understand',
45
- operator_version: 'v1',
46
- data: params
47
- }, config);
48
- }
49
- async function lasBareImageTextEmbedding(params, config) {
50
- return lasFetch('/api/v1/embeddings/multimodal', {
51
- model: 'doubao-embedding-vision',
52
- input: params.input,
53
- encoding_format: params.encoding_format
54
- }, config);
55
- }
56
- async function lasSeed20(params, config) {
57
- return lasFetch('/api/v1/chat/completions', params, config);
58
- }
59
- async function lasAsrPro(params, config) {
60
- return lasFetch('/api/v1/submit', {
61
- operator_id: 'las_asr_pro',
62
- operator_version: 'v1',
63
- data: params
64
- }, config);
65
- }
66
- async function lasAudioExtractAndSplit(params, config) {
67
- return lasFetch('/api/v1/process', {
68
- operator_id: 'las_audio_extract_and_split',
69
- operator_version: 'v1',
70
- data: params
71
- }, config);
72
- }
@@ -1,617 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- s3_catalog.py -- Data source profiling tool for LanceDB
4
-
5
- Three-pass scanning:
6
- Pass 1: Walk all files -> file_catalog
7
- Pass 2: Sample structured data -> structured_schemas
8
- Pass 3: Read media file headers -> media_metadata
9
-
10
- Supported sources: volcengine TOS / alibaba OSS / tencent COS / aws S3 / local
11
- Output: LanceDB tables (file_catalog, structured_schemas, media_metadata)
12
- """
13
-
14
- import argparse
15
- import datetime
16
- import io
17
- import json
18
- import mimetypes
19
- import os
20
- import re
21
- import sys
22
- import time
23
- from pathlib import Path
24
- from typing import Any, Dict, List, Optional, Tuple
25
-
26
- # ---------------------------------------------------------------------------
27
- # Lazy imports
28
- # ---------------------------------------------------------------------------
29
-
30
- def _import_boto3():
31
- try:
32
- import boto3
33
- from botocore.config import Config as BotoConfig
34
- return boto3, BotoConfig
35
- except ImportError:
36
- print("ERROR: boto3 not installed. Run: pip install boto3", file=sys.stderr)
37
- sys.exit(1)
38
-
39
- def _import_lancedb():
40
- try:
41
- import lancedb
42
- return lancedb
43
- except ImportError:
44
- print("ERROR: lancedb not installed. Run: pip install lancedb", file=sys.stderr)
45
- sys.exit(1)
46
-
47
- def _import_pandas():
48
- try:
49
- import pandas as pd
50
- return pd
51
- except ImportError:
52
- print("ERROR: pandas not installed. Run: pip install pandas", file=sys.stderr)
53
- sys.exit(1)
54
-
55
- def _import_pyarrow():
56
- try:
57
- import pyarrow as pa
58
- import pyarrow.parquet as pq
59
- return pa, pq
60
- except ImportError:
61
- print("ERROR: pyarrow not installed. Run: pip install pyarrow", file=sys.stderr)
62
- sys.exit(1)
63
-
64
- # ---------------------------------------------------------------------------
65
- # Constants
66
- # ---------------------------------------------------------------------------
67
-
68
- STRUCTURED_EXTS = {'.json', '.jsonl', '.ndjson', '.csv', '.tsv', '.parquet', '.pq'}
69
- IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.tiff', '.tif', '.svg', '.ico', '.heic', '.heif'}
70
- AUDIO_EXTS = {'.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a', '.wma', '.opus'}
71
- VIDEO_EXTS = {'.mp4', '.avi', '.mov', '.mkv', '.webm', '.wmv', '.flv', '.m4v', '.3gp'}
72
- PDF_EXTS = {'.pdf'}
73
-
74
- IMAGE_HEAD_BYTES = 64 * 1024
75
- AUDIO_HEAD_BYTES = 512 * 1024
76
- VIDEO_HEAD_BYTES = 2 * 1024 * 1024
77
- PDF_HEAD_BYTES = 256 * 1024
78
-
79
-
80
- def classify_file(ext: str) -> Tuple[str, Optional[str]]:
81
- ext = ext.lower()
82
- if ext in STRUCTURED_EXTS:
83
- return ('structured', None)
84
- if ext in IMAGE_EXTS:
85
- return ('non-structured', 'image')
86
- if ext in AUDIO_EXTS:
87
- return ('non-structured', 'audio')
88
- if ext in VIDEO_EXTS:
89
- return ('non-structured', 'video')
90
- if ext in PDF_EXTS:
91
- return ('non-structured', 'pdf')
92
- return ('non-structured', None)
93
-
94
-
95
- # ===================================================================
96
- # S3 Client Abstraction
97
- # ===================================================================
98
-
99
- class S3Client:
100
- """Thin wrapper around boto3 S3 with vendor-specific configuration."""
101
-
102
- def __init__(self, vendor: str, endpoint: str, credential_id: str, credential_secret: str, region: str):
103
- boto3, BotoConfig = _import_boto3()
104
- self.vendor = vendor
105
-
106
- config_kwargs: Dict[str, Any] = {}
107
- if vendor == 'volcengine':
108
- config_kwargs['s3'] = {'addressing_style': 'virtual'}
109
- elif vendor == 'alibaba':
110
- config_kwargs['signature_version'] = 's3'
111
- config_kwargs['s3'] = {'addressing_style': 'virtual'}
112
- elif vendor == 'tencent':
113
- config_kwargs['s3'] = {'addressing_style': 'virtual'}
114
-
115
- self.client = boto3.client(
116
- 's3',
117
- endpoint_url=endpoint or None,
118
- aws_access_key_id=credential_id,
119
- aws_secret_access_key=credential_secret,
120
- region_name=region,
121
- config=BotoConfig(**config_kwargs) if config_kwargs else None,
122
- )
123
-
124
- def list_objects(self, bucket: str, prefix: str):
125
- paginator = self.client.get_paginator('list_objects_v2')
126
- for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
127
- for obj in page.get('Contents', []):
128
- yield obj
129
-
130
- def get_range(self, bucket: str, key: str, start: int, end: int) -> bytes:
131
- resp = self.client.get_object(Bucket=bucket, Key=key, Range=f'bytes={start}-{end}')
132
- return resp['Body'].read()
133
-
134
- def get_object(self, bucket: str, key: str, max_bytes: Optional[int] = None) -> bytes:
135
- kwargs: Dict[str, Any] = {'Bucket': bucket, 'Key': key}
136
- if max_bytes:
137
- kwargs['Range'] = f'bytes=0-{max_bytes - 1}'
138
- resp = self.client.get_object(**kwargs)
139
- return resp['Body'].read()
140
-
141
-
142
- # ===================================================================
143
- # Local FS Abstraction
144
- # ===================================================================
145
-
146
- class LocalClient:
147
- def __init__(self, root: str):
148
- self.root = root
149
-
150
- def list_objects(self, bucket: str, prefix: str):
151
- base = Path(bucket)
152
- prefix_path = base / prefix if prefix and prefix != '.' else base
153
- for dirpath, _dirs, files in os.walk(prefix_path):
154
- for fname in files:
155
- full = Path(dirpath) / fname
156
- stat = full.stat()
157
- key = str(full.relative_to(base))
158
- yield {
159
- 'Key': key,
160
- 'Size': stat.st_size,
161
- 'LastModified': datetime.datetime.fromtimestamp(stat.st_mtime),
162
- 'ETag': '',
163
- 'StorageClass': 'LOCAL',
164
- '_created_time': datetime.datetime.fromtimestamp(stat.st_ctime),
165
- }
166
-
167
- def get_range(self, bucket: str, key: str, start: int, end: int) -> bytes:
168
- fpath = Path(bucket) / key
169
- with open(fpath, 'rb') as f:
170
- f.seek(start)
171
- return f.read(end - start + 1)
172
-
173
- def get_object(self, bucket: str, key: str, max_bytes: Optional[int] = None) -> bytes:
174
- fpath = Path(bucket) / key
175
- with open(fpath, 'rb') as f:
176
- if max_bytes:
177
- return f.read(max_bytes)
178
- return f.read()
179
-
180
-
181
- # ===================================================================
182
- # Pass 1: File Catalog
183
- # ===================================================================
184
-
185
- def build_file_catalog(client, bucket: str, prefix: str, vendor: str) -> List[Dict]:
186
- catalog: List[Dict] = []
187
- scan_ts = datetime.datetime.utcnow().isoformat() + 'Z'
188
-
189
- for obj in client.list_objects(bucket, prefix):
190
- key = obj['Key']
191
- if key.endswith('/'):
192
- continue
193
-
194
- name = os.path.basename(key)
195
- ext = os.path.splitext(name)[1].lower()
196
- mime, _ = mimetypes.guess_type(name)
197
- category, media_type = classify_file(ext)
198
- etag = obj.get('ETag', '').strip('"')
199
- depth = key.count('/')
200
- parent_dir = os.path.basename(os.path.dirname(key)) if '/' in key else ''
201
-
202
- catalog.append({
203
- 'file_path': key,
204
- 'file_name': name,
205
- 'extension': ext,
206
- 'mime_type': mime or '',
207
- 'category': category,
208
- 'media_type': media_type or '',
209
- 'size_bytes': obj.get('Size', 0),
210
- 'last_modified': str(obj.get('LastModified', '')),
211
- 'created_time': str(obj.get('_created_time', '')),
212
- 'etag': etag,
213
- 'storage_class': obj.get('StorageClass', ''),
214
- 'is_multipart': '-' in etag,
215
- 'depth': depth,
216
- 'parent_dir': parent_dir,
217
- 'vendor': vendor,
218
- 'bucket': bucket,
219
- 'has_schema': False,
220
- 'has_media_meta': False,
221
- 'scan_timestamp': scan_ts,
222
- })
223
-
224
- return catalog
225
-
226
-
227
- # ===================================================================
228
- # Pass 2: Structured Schema Analysis
229
- # ===================================================================
230
-
231
- def infer_semantic_hint(series) -> Tuple[str, str]:
232
- pd = _import_pandas()
233
- non_null = series.dropna()
234
- n = len(non_null)
235
- if n == 0:
236
- return ('constant', 'all null')
237
-
238
- unique_count = non_null.nunique()
239
- unique_ratio = unique_count / n if n > 0 else 0
240
-
241
- if set(non_null.unique()).issubset({True, False, 0, 1, 'true', 'false', 'True', 'False'}):
242
- return ('boolean', f'{unique_count} distinct values')
243
-
244
- if unique_count == 1:
245
- return ('constant', f'value: {non_null.iloc[0]}')
246
-
247
- dtype_str = str(series.dtype)
248
-
249
- if 'int' in dtype_str or 'float' in dtype_str:
250
- return ('numeric', dtype_str)
251
-
252
- sample_val = non_null.iloc[0]
253
- if isinstance(sample_val, (list, dict)):
254
- return ('structured', type(sample_val).__name__)
255
-
256
- try:
257
- str_vals = non_null.astype(str)
258
- avg_len = str_vals.str.len().mean()
259
-
260
- path_pattern = re.compile(r'[\\\\/]|^s3://|^tos://|^gs://|^https?://')
261
- path_ratio = str_vals.apply(lambda x: bool(path_pattern.search(x))).mean()
262
- if path_ratio > 0.5:
263
- return ('file_path', f'{path_ratio:.0%} match path/URI pattern')
264
-
265
- ts_pattern = re.compile(r'\d{4}[-/]\d{2}[-/]\d{2}')
266
- ts_ratio = str_vals.apply(lambda x: bool(ts_pattern.search(x))).mean()
267
- if ts_ratio > 0.5:
268
- return ('timestamp', f'{ts_ratio:.0%} match timestamp pattern')
269
-
270
- if unique_ratio > 0.9 and avg_len < 50:
271
- return ('id', f'unique_ratio={unique_ratio:.2f}, avg_len={avg_len:.1f}')
272
-
273
- if unique_count < 50 or unique_ratio < 0.2:
274
- return ('categorical', f'{unique_count} categories')
275
-
276
- if avg_len > 50 and unique_ratio > 0.5:
277
- return ('text', f'avg_len={avg_len:.1f}')
278
-
279
- except Exception:
280
- pass
281
-
282
- return ('text', '')
283
-
284
-
285
- def analyze_structured_file(client, bucket: str, key: str, ext: str, sample_rows: int) -> List[Dict]:
286
- pd = _import_pandas()
287
- pa, pq = _import_pyarrow()
288
-
289
- schemas: List[Dict] = []
290
- max_download = 2 * 1024 * 1024
291
-
292
- try:
293
- raw = client.get_object(bucket, key, max_bytes=max_download)
294
- except Exception as e:
295
- return [{'file_path': key, 'error': str(e)}]
296
-
297
- df = None
298
- fmt = ext.lstrip('.')
299
-
300
- try:
301
- if ext in ('.parquet', '.pq'):
302
- buf = io.BytesIO(raw)
303
- table = pq.read_table(buf)
304
- df = table.to_pandas().head(sample_rows)
305
- fmt = 'parquet'
306
- elif ext == '.csv':
307
- df = pd.read_csv(io.BytesIO(raw), nrows=sample_rows, on_bad_lines='skip')
308
- elif ext == '.tsv':
309
- df = pd.read_csv(io.BytesIO(raw), sep='\t', nrows=sample_rows, on_bad_lines='skip')
310
- elif ext in ('.jsonl', '.ndjson'):
311
- lines = raw.decode('utf-8', errors='replace').strip().split('\n')[:sample_rows]
312
- records = [json.loads(line) for line in lines if line.strip()]
313
- df = pd.json_normalize(records)
314
- elif ext == '.json':
315
- data = json.loads(raw.decode('utf-8', errors='replace'))
316
- if isinstance(data, list):
317
- df = pd.json_normalize(data[:sample_rows])
318
- elif isinstance(data, dict):
319
- df = pd.json_normalize([data])
320
- except Exception as e:
321
- return [{'file_path': key, 'error': f'parse error: {e}'}]
322
-
323
- if df is None or df.empty:
324
- return []
325
-
326
- for col in df.columns:
327
- series = df[col]
328
- non_null = series.dropna()
329
- unique_count = int(non_null.nunique()) if len(non_null) > 0 else 0
330
- non_null_ratio = len(non_null) / len(series) if len(series) > 0 else 0.0
331
-
332
- sample_values = []
333
- try:
334
- sample_values = [str(v) for v in non_null.unique()[:3]]
335
- except Exception:
336
- pass
337
-
338
- hint, detail = infer_semantic_hint(series)
339
-
340
- schemas.append({
341
- 'file_path': key,
342
- 'vendor': '',
343
- 'bucket': bucket,
344
- 'format': fmt,
345
- 'column_name': str(col),
346
- 'column_type': str(series.dtype),
347
- 'non_null_ratio': round(non_null_ratio, 4),
348
- 'unique_count': unique_count,
349
- 'sample_values': json.dumps(sample_values, ensure_ascii=False),
350
- 'semantic_hint': hint,
351
- 'semantic_detail': detail,
352
- })
353
-
354
- return schemas
355
-
356
-
357
- # ===================================================================
358
- # Pass 3: Media Metadata Extraction
359
- # ===================================================================
360
-
361
def extract_image_meta(data: bytes) -> Dict:
    """Extract basic image attributes (dimensions, format, color mode)
    and a small EXIF summary from raw image bytes.

    Any failure (Pillow missing, truncated/corrupt data) is recorded in
    the 'extract_error' key instead of raising, so the caller can always
    write a catalog record.
    """
    meta: Dict[str, Any] = {
        'width': 0, 'height': 0, 'image_format': '', 'color_mode': '', 'exif_summary': '{}',
    }
    try:
        from PIL import Image
        img = Image.open(io.BytesIO(data))
        meta['width'] = img.width
        meta['height'] = img.height
        meta['image_format'] = img.format or ''
        meta['color_mode'] = img.mode or ''

        exif = {}
        # _getexif only exists on some formats (e.g. JPEG); default to "no EXIF".
        exif_data = getattr(img, '_getexif', lambda: None)()
        if exif_data:
            # Fix: hoisted out of the loop — previously this module was
            # re-imported once per EXIF tag.
            from PIL.ExifTags import TAGS
            for tag_id, value in list(exif_data.items())[:10]:  # cap at 10 tags
                try:
                    tag_name = TAGS.get(tag_id, str(tag_id))
                    exif[tag_name] = str(value)[:100]  # truncate long values
                except Exception:
                    pass  # skip unrepresentable tags, keep the rest
        meta['exif_summary'] = json.dumps(exif, ensure_ascii=False)
    except Exception as e:
        meta['extract_error'] = str(e)
    return meta
387
-
388
-
389
def extract_audio_meta(data: bytes) -> Dict:
    """Probe raw audio bytes with mutagen for duration, codec, stream
    parameters and a small tag summary.

    Failures (mutagen missing, undecodable data) are reported through the
    'extract_error' key rather than raised.
    """
    meta: Dict[str, Any] = {
        'duration_sec': 0.0, 'codec': '', 'sample_rate': 0, 'channels': 0,
        'bitrate': 0, 'tags_summary': '{}',
    }
    try:
        import mutagen
        audio = mutagen.File(io.BytesIO(data))
        if audio:
            stream_info = getattr(audio, 'info', None)
            if stream_info:
                meta['duration_sec'] = round(getattr(stream_info, 'length', 0.0), 2)
                meta['sample_rate'] = getattr(stream_info, 'sample_rate', 0)
                meta['channels'] = getattr(stream_info, 'channels', 0)
                meta['bitrate'] = getattr(stream_info, 'bitrate', 0)
                # Use the info class name (e.g. MP3Info) as a codec label.
                meta['codec'] = type(stream_info).__name__

            collected = {}
            if audio.tags:
                for tag_key in list(audio.tags.keys())[:10]:  # cap at 10 tags
                    try:
                        collected[str(tag_key)] = str(audio.tags[tag_key])[:100]
                    except Exception:
                        pass  # skip tags that cannot be stringified
                meta['tags_summary'] = json.dumps(collected, ensure_ascii=False)
    except Exception as e:
        meta['extract_error'] = str(e)
    return meta
417
-
418
-
419
def extract_video_meta(data: bytes) -> Dict:
    """Identify a video container from header magic bytes only.

    No demuxing is attempted, so width/height/duration remain at their
    zero defaults; only 'container' is populated when a signature matches.
    """
    meta: Dict[str, Any] = {
        'width': 0, 'height': 0, 'duration_sec': 0.0, 'container': '',
    }
    try:
        head = data[:4]
        if head == b'\x1a\x45\xdf\xa3':
            # EBML magic number -> Matroska family (MKV / WebM).
            meta['container'] = 'mkv/webm'
        elif len(data) > 8 and data[4:8] == b'ftyp':
            # ISO BMFF (MP4/MOV/...): major brand lives at bytes 8-12.
            meta['container'] = data[8:12].decode('ascii', errors='replace').strip()
        elif head == b'RIFF':
            meta['container'] = 'avi'
        elif data[:3] == b'FLV':
            meta['container'] = 'flv'
    except Exception as e:
        meta['extract_error'] = str(e)
    return meta
436
-
437
-
438
def extract_pdf_meta(data: bytes) -> Dict:
    """Extract PDF properties (page count, title, author, creation date,
    encryption flag, first-page size in points) via PyMuPDF (fitz).

    Failures are reported through the 'extract_error' key. The document
    handle is always closed, even when a metadata read raises.
    """
    meta: Dict[str, Any] = {
        'page_count': 0, 'pdf_title': '', 'pdf_author': '',
        'creation_date': '', 'encrypted': False,
        'page_width_pt': 0.0, 'page_height_pt': 0.0,
    }
    try:
        import fitz  # PyMuPDF
        doc = fitz.open(stream=data, filetype='pdf')
        try:
            meta['page_count'] = doc.page_count
            md = doc.metadata or {}
            meta['pdf_title'] = md.get('title', '')
            meta['pdf_author'] = md.get('author', '')
            meta['creation_date'] = md.get('creationDate', '')
            meta['encrypted'] = doc.is_encrypted

            if doc.page_count > 0:
                page = doc[0]
                rect = page.rect
                meta['page_width_pt'] = round(rect.width, 2)
                meta['page_height_pt'] = round(rect.height, 2)
        finally:
            # Fix: previously the handle leaked if any metadata read raised.
            doc.close()
    except Exception as e:
        meta['extract_error'] = str(e)
    return meta
463
-
464
-
465
def extract_media_metadata(client, bucket: str, key: str, media_type: str) -> Dict:
    """Download only the head bytes of a media object and dispatch to the
    per-type extractor.

    Returns {'extract_error': ...} when the partial download fails, and
    an empty dict for an unrecognized media_type.
    """
    # Per-type byte budget; unknown types fall back to the image budget.
    budgets = {
        'image': IMAGE_HEAD_BYTES,
        'audio': AUDIO_HEAD_BYTES,
        'video': VIDEO_HEAD_BYTES,
        'pdf': PDF_HEAD_BYTES,
    }
    try:
        data = client.get_object(bucket, key, max_bytes=budgets.get(media_type, IMAGE_HEAD_BYTES))
    except Exception as e:
        return {'extract_error': f'download failed: {e}'}

    extractors = {
        'image': extract_image_meta,
        'audio': extract_audio_meta,
        'video': extract_video_meta,
        'pdf': extract_pdf_meta,
    }
    extractor = extractors.get(media_type)
    return extractor(data) if extractor else {}
487
-
488
-
489
- # ===================================================================
490
- # LanceDB Writer
491
- # ===================================================================
492
-
493
def write_to_lancedb(db_path: str, table_name: str, records: List[Dict]):
    """Persist records into a LanceDB table with full-refresh semantics:
    any existing table of the same name is dropped and recreated.

    No-ops on an empty record list so no empty table is created.
    """
    if not records:
        return

    lancedb = _import_lancedb()
    pd = _import_pandas()

    db = lancedb.connect(db_path)
    frame = pd.DataFrame(records)

    # Drop-and-recreate rather than append: each run replaces the table.
    if table_name in db.table_names():
        db.drop_table(table_name)
    db.create_table(table_name, data=frame)

    print(f" [LanceDB] Wrote {len(records)} records to '{table_name}'")
508
-
509
-
510
- # ===================================================================
511
- # Main
512
- # ===================================================================
513
-
514
def main():
    """CLI entry point: scan an S3-compatible bucket (or a local directory),
    profile structured and media files in three passes, and write the results
    to a local LanceDB. Emits a final JSON summary line on stdout for the
    Node.js caller to parse.
    """
    parser = argparse.ArgumentParser(description='S3-compatible data profiler -> LanceDB')
    parser.add_argument('--vendor', required=True,
                        choices=['volcengine', 'alibaba', 'tencent', 'aws', 'local'])
    parser.add_argument('--endpoint', default='')
    # --ak / --sk map onto generic credential fields shared by all vendors.
    parser.add_argument('--ak', default='', dest='cred_id',
                        help='Access credential ID')
    parser.add_argument('--sk', default='', dest='cred_secret',
                        help='Access credential value')
    parser.add_argument('--region', default='')
    parser.add_argument('--bucket', required=True)
    parser.add_argument('--prefix', required=True)
    parser.add_argument('--db-path', default=None, help='Path to LanceDB database. Defaults to ~/.openclaw/las-data-profiler/{datasource_name}/catalog_db if datasource_name is provided.')
    parser.add_argument('--datasource-name', default='', help='Name of the datasource. Used to determine default db-path if not explicitly provided.')
    parser.add_argument('--sample-rows', type=int, default=100)
    args = parser.parse_args()

    # Resolve the default DB location when --db-path is not supplied.
    if not args.db_path:
        if args.datasource_name:
            import os
            home_dir = os.path.expanduser('~')
            args.db_path = os.path.join(home_dir, '.openclaw', 'las-data-profiler', args.datasource_name, 'catalog_db')
        else:
            args.db_path = './catalog_db'

    print(f"[las-data-profiler] vendor={args.vendor}, bucket={args.bucket}, prefix={args.prefix}")
    print(f"[las-data-profiler] db_path={args.db_path}")

    # 'local' treats --bucket as a filesystem root; all other vendors go
    # through the S3-compatible API.
    if args.vendor == 'local':
        client = LocalClient(args.bucket)
    else:
        client = S3Client(
            vendor=args.vendor,
            endpoint=args.endpoint,
            credential_id=args.cred_id,
            credential_secret=args.cred_secret,
            region=args.region,
        )

    # ---- Pass 1: File Catalog ----
    print("\n[Pass 1] Scanning files...")
    catalog = build_file_catalog(client, args.bucket, args.prefix, args.vendor)
    print(f" Found {len(catalog)} files")

    # ---- Pass 2: Structured Schemas ----
    print("\n[Pass 2] Analyzing structured data...")
    structured_files = [f for f in catalog if f['category'] == 'structured']
    all_schemas: List[Dict] = []
    for i, entry in enumerate(structured_files):
        key = entry['file_path']
        ext = entry['extension']
        print(f" [{i+1}/{len(structured_files)}] {key}")
        schemas = analyze_structured_file(client, args.bucket, key, ext, args.sample_rows)
        for s in schemas:
            s['vendor'] = args.vendor
        all_schemas.extend(schemas)
        # NOTE(review): flagged True even when no schema rows came back
        # (empty file or parse error) — confirm this is intended.
        entry['has_schema'] = True

    print(f" Analyzed {len(structured_files)} files, {len(all_schemas)} column records")

    # ---- Pass 3: Media Metadata ----
    print("\n[Pass 3] Extracting media metadata...")
    media_files = [f for f in catalog if f['media_type'] in ('image', 'audio', 'video', 'pdf')]
    all_media_meta: List[Dict] = []
    for i, entry in enumerate(media_files):
        key = entry['file_path']
        media_type = entry['media_type']
        print(f" [{i+1}/{len(media_files)}] {key} ({media_type})")
        meta = extract_media_metadata(client, args.bucket, key, media_type)
        meta['file_path'] = key
        meta['vendor'] = args.vendor
        meta['bucket'] = args.bucket
        meta['media_type'] = media_type
        # Normalize records so every row carries the full column set.
        # NOTE(review): for a missing key, meta.get(col) is None, so the
        # default is always 0 — absent string columns become 0, not ''.
        # Confirm the LanceDB schema tolerates this mixed typing.
        for col in ['width', 'height', 'image_format', 'color_mode', 'exif_summary',
                    'duration_sec', 'codec', 'sample_rate', 'channels', 'bitrate',
                    'tags_summary', 'container',
                    'page_count', 'pdf_title', 'pdf_author', 'creation_date',
                    'encrypted', 'page_width_pt', 'page_height_pt', 'extract_error']:
            meta.setdefault(col, '' if isinstance(meta.get(col), str) else 0)
        all_media_meta.append(meta)
        entry['has_media_meta'] = True

    print(f" Extracted metadata for {len(media_files)} media files")

    # ---- Write to LanceDB ----
    # NOTE(review): the skill doc lists the first table as 'files'; here it
    # is written as 'file_catalog' — confirm which name is canonical.
    print(f"\n[LanceDB] Writing to {args.db_path}")
    write_to_lancedb(args.db_path, 'file_catalog', catalog)
    write_to_lancedb(args.db_path, 'structured_schemas', all_schemas)
    write_to_lancedb(args.db_path, 'media_metadata', all_media_meta)

    # ---- Summary JSON (stdout, for Node.js to parse) ----
    summary = {
        'summary': {
            'total_files': len(catalog),
            'structured_files': len(structured_files),
            'media_files': len(media_files),
        }
    }
    print(f"\n{json.dumps(summary)}")
    print("\n[las-data-profiler] Done!")


if __name__ == '__main__':
    main()
@@ -1,14 +0,0 @@
1
import { EmbeddingProvider, EmbeddingConfig } from './interface';
/**
 * Injects a custom dynamic importer for `node-llama-cpp`, letting the host
 * control how the native module is loaded (e.g. in bundler environments).
 * Note: the importer is invoked immediately, not deferred to first use.
 */
export declare const setNodeLlamaCppImporter: (importer: () => Promise<any>) => void;
/**
 * Embedding provider backed by a local GGUF model via `node-llama-cpp`.
 * The llama runtime, model and embedding context are initialized lazily
 * on the first embedding request.
 */
export declare class LocalEmbeddingProvider implements EmbeddingProvider {
    private llama;
    private model;
    private context;
    private initPromise;
    private modelPath;
    constructor(config: EmbeddingConfig);
    private ensureInitialized;
    private doInitialize;
    /**
     * Embeds a single non-empty string.
     * @throws if `text` is empty/blank or local embeddings are unavailable.
     */
    generateEmbedding(text: string): Promise<number[]>;
    /** Embeds each string in `texts`; rejects if `texts` is not an array. */
    generateEmbeddings(texts: string[]): Promise<number[][]>;
}
@@ -1,107 +0,0 @@
1
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.LocalEmbeddingProvider = exports.setNodeLlamaCppImporter = void 0;
// import type { Llama, LlamaEmbeddingContext, LlamaModel } from 'node-llama-cpp';
// Default GGUF embedding model (Hugging Face URI) used when none is configured.
const DEFAULT_LOCAL_MODEL = 'hf:CompendiumLabs/bge-small-zh-v1.5-gguf/bge-small-zh-v1.5-f16.gguf';
// Memoized promise for the dynamically imported node-llama-cpp module.
let nodeLlamaImportPromise = null;
// Lets the host supply its own importer (e.g. for bundlers that cannot
// handle a bare dynamic import of a native module). The importer runs
// eagerly here, not on first use.
const setNodeLlamaCppImporter = (importer) => {
    nodeLlamaImportPromise = importer();
};
exports.setNodeLlamaCppImporter = setNodeLlamaCppImporter;
// Imports node-llama-cpp exactly once, honoring any injected importer.
const importNodeLlamaCpp = async () => {
    if (!nodeLlamaImportPromise) {
        nodeLlamaImportPromise = import('node-llama-cpp');
    }
    return nodeLlamaImportPromise;
};
17
// Embedding provider backed by a local GGUF model through node-llama-cpp.
// Runtime, model and embedding context are created lazily on first use.
class LocalEmbeddingProvider {
    llama = null;
    model = null;
    context = null;
    // In-flight initialization promise, shared by concurrent callers.
    initPromise = null;
    modelPath;
    constructor(config) {
        // Override transformers.js default with node-llama-cpp default
        this.modelPath = config.model_name === 'Xenova/all-MiniLM-L6-v2'
            ? DEFAULT_LOCAL_MODEL
            : (config.model_name || DEFAULT_LOCAL_MODEL);
    }
    // Single-flight lazy init. On failure every cached handle is cleared so
    // a later call can retry from scratch instead of reusing broken state.
    async ensureInitialized() {
        if (this.context) {
            return;
        }
        if (this.initPromise) {
            return this.initPromise;
        }
        this.initPromise = this.doInitialize().catch((err) => {
            this.initPromise = null;
            this.context = null;
            this.model = null;
            this.llama = null;
            throw err;
        });
        return this.initPromise;
    }
    // Loads the llama runtime, resolves the model file (the default path is
    // an 'hf:' URI handed to resolveModelFile), and creates the embedding
    // context. Failures are wrapped in a readable error with `cause`.
    async doInitialize() {
        try {
            const { getLlama, resolveModelFile, LlamaLogLevel } = await importNodeLlamaCpp();
            if (!this.llama) {
                this.llama = await getLlama({ logLevel: LlamaLogLevel.error });
            }
            if (!this.model) {
                const resolved = await resolveModelFile(this.modelPath);
                this.model = await this.llama.loadModel({ modelPath: resolved });
            }
            if (!this.context) {
                this.context = await this.model.createEmbeddingContext();
            }
        }
        catch (err) {
            const detail = err instanceof Error ? err.message : String(err);
            throw new Error(`Local embeddings unavailable. Reason: ${detail}`, {
                cause: err,
            });
        }
    }
    // Embeds one non-empty string and returns an L2-normalized plain array.
    // Non-finite components are zeroed before summing; when the magnitude is
    // zero the raw (zeroed) values are returned unscaled.
    async generateEmbedding(text) {
        if (!text || !text.trim()) {
            throw new Error('Embedding input text must be a non-empty string');
        }
        await this.ensureInitialized();
        const embedding = await this.context.getEmbeddingFor(text);
        const vector = embedding.vector; // TypedArray
        // Optimized normalization loop
        let sumSq = 0;
        const len = vector.length;
        for (let i = 0; i < len; i++) {
            const val = vector[i];
            if (Number.isFinite(val)) {
                sumSq += val * val;
            }
            else {
                // Zero out NaN/Infinity in place so the copy below is clean.
                vector[i] = 0;
            }
        }
        const magnitude = Math.sqrt(sumSq);
        const result = new Array(len);
        if (magnitude > 0) {
            const scale = 1.0 / magnitude;
            for (let i = 0; i < len; i++) {
                result[i] = vector[i] * scale;
            }
        }
        else {
            for (let i = 0; i < len; i++) {
                result[i] = vector[i];
            }
        }
        return result;
    }
    // Embeds every text; requests are issued concurrently via Promise.all.
    // NOTE(review): assumes the shared embedding context tolerates concurrent
    // getEmbeddingFor calls — confirm against node-llama-cpp docs.
    async generateEmbeddings(texts) {
        if (!Array.isArray(texts)) {
            throw new Error('Embedding input must be an array of strings');
        }
        return Promise.all(texts.map(text => this.generateEmbedding(text)));
    }
}
exports.LocalEmbeddingProvider = LocalEmbeddingProvider;
@@ -1,45 +0,0 @@
1
- ---
2
- name: byted-las-data-profiler
3
- description: |
4
- Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them,
5
- performs schema inference and column semantic analysis on structured data (JSONL/CSV/Parquet/JSON),
6
- extracts key meta-information for media files (Image/Audio/Video/PDF) by reading only header bytes,
7
- and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
8
-
9
- IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
10
- You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) to accomplish the profiling tasks.
11
- ---
12
-
13
- ## Trigger Scenarios
14
- Be sure to use this Skill when the user mentions the following scenarios:
15
- - Need to scan the file structure in a TOS bucket or understand the dataset composition
16
- - Need to connect to object storage (TOS/OSS/COS/S3) using the S3 protocol
17
- - Need to scan, traverse, or catalog the file structure of a specific bucket or local directory
18
- - Need to understand what a batch of data files contains and what their schema looks like
19
- - Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
20
- - Need to write the meta-information of object storage or local files into LanceDB
21
- - Mentions TOS, boto3, or object storage data profiling
22
- - Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
23
- - Need to batch identify the type and size of remote/local files and build an index
24
- - Need to quickly understand the structure of an unfamiliar dataset (what files are there, what the schema looks like, what the fields mean)
25
- - Need to connect/dock a data source for profiling
26
- - Mentions "connecting" a data source or "docking" a data source
27
-
28
- ## Overview
29
- This Skill acts as a Dataset Profiling Guide. You should use the `list-s3-objects` tool to traverse the S3 bucket or local directory, use `read-s3-object` to read file contents or headers, parse the schema or media metadata, and finally use `write-lance-catalog` to save the catalog into a local LanceDB.
30
-
31
- 1. **Cataloging**: Use `list-s3-objects` to record the meta-information (path, size, etc.) of files.
32
- 2. **Understanding Structured Data**: Use `read-s3-object` to sample JSONL / CSV / TSV / Parquet / JSON.
33
- 3. **Extracting Media Meta-information**: Use `read-s3-object` with `maxBytes` to read only the file header (without downloading the full file) for images, audio, video, and PDFs to extract key attributes.
34
- 4. **Writing to LanceDB**: Use `write-lance-catalog` to save the results.
35
-
36
- ## Output Location
37
- - LanceDB table storage path: `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`
38
- - Table names: `files`, `structured_schemas`, `media_metadata`
39
-
40
- ## Available Tools for this Skill
41
- - `list-s3-objects`: To traverse and list files in the bucket/directory.
42
- - `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction.
43
- - `write-lance-catalog`: To write the profiling results to the LanceDB catalog.
44
-
45
- Always report the final profiling summary back to the user once the `write-lance-catalog` completes successfully.