@byted-las/contextlake-openclaw 1.0.7 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +1 -1
- package/dist/src/skills/las-data-profiler/SKILL.md +4 -4
- package/index.ts +1 -1
- package/openclaw.plugin.json +1 -1
- package/package.json +2 -2
- package/src/skills/las-data-profiler/SKILL.md +4 -4
- package/dist/src/lib/actions/ingest-source.d.ts +0 -15
- package/dist/src/lib/actions/ingest-source.js +0 -193
- package/dist/src/lib/actions/las.d.ts +0 -64
- package/dist/src/lib/actions/las.js +0 -72
- package/dist/src/lib/scripts/s3_catalog.py +0 -617
- package/dist/src/service/embedding/local.d.ts +0 -14
- package/dist/src/service/embedding/local.js +0 -107
- package/dist/src/skills/SKILL.md +0 -45
package/dist/index.js
CHANGED
|
@@ -4,7 +4,7 @@ const commands_1 = require("./src/commands");
|
|
|
4
4
|
const plugin = {
|
|
5
5
|
id: 'contextlake-openclaw',
|
|
6
6
|
name: 'ContextLake',
|
|
7
|
-
version: '1.0.7',
|
|
7
|
+
version: '1.0.9',
|
|
8
8
|
description: 'A lightweight knowledge base plugin for OpenClaw using LanceDB and TOS, with data profiling support',
|
|
9
9
|
configSchema: {
|
|
10
10
|
type: 'object',
|
|
@@ -7,7 +7,7 @@ description: |
|
|
|
7
7
|
and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
|
|
8
8
|
|
|
9
9
|
IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
|
|
10
|
-
You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) to accomplish the profiling tasks.
|
|
10
|
+
You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) via formal tool calls to accomplish the profiling tasks. DO NOT execute them as bash/shell commands.
|
|
11
11
|
---
|
|
12
12
|
|
|
13
13
|
## Trigger Scenarios
|
|
@@ -38,8 +38,8 @@ This Skill acts as a Dataset Profiling Guide. You should use the `list-s3-object
|
|
|
38
38
|
- Table names: `files`, `structured_schemas`, `media_metadata`
|
|
39
39
|
|
|
40
40
|
## Available Tools for this Skill
|
|
41
|
-
- `list-s3-objects`: To traverse and list files in the bucket/directory.
|
|
42
|
-
- `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction.
|
|
43
|
-
- `write-lance-catalog`: To write the profiling results to the LanceDB catalog.
|
|
41
|
+
- `list-s3-objects`: To traverse and list files in the bucket/directory. (Call this as an Agent Tool, NOT a bash command).
|
|
42
|
+
- `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction. (Call this as an Agent Tool, NOT a bash command).
|
|
43
|
+
- `write-lance-catalog`: To write the profiling results to the LanceDB catalog. (Call this as an Agent Tool, NOT a bash command).
|
|
44
44
|
|
|
45
45
|
Always report the final profiling summary back to the user once the `write-lance-catalog` completes successfully.
|
package/index.ts
CHANGED
|
@@ -5,7 +5,7 @@ import { registerAll } from './src/commands';
|
|
|
5
5
|
const plugin = {
|
|
6
6
|
id: 'contextlake-openclaw',
|
|
7
7
|
name: 'ContextLake',
|
|
8
|
-
version: '1.0.7',
|
|
8
|
+
version: '1.0.9',
|
|
9
9
|
description: 'A lightweight knowledge base plugin for OpenClaw using LanceDB and TOS, with data profiling support',
|
|
10
10
|
configSchema: {
|
|
11
11
|
type: 'object',
|
package/openclaw.plugin.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"id": "contextlake-openclaw",
|
|
3
3
|
"name": "ContextLake",
|
|
4
|
-
"version": "1.0.7",
|
|
4
|
+
"version": "1.0.9",
|
|
5
5
|
"description": "A lightweight knowledge base plugin for OpenClaw using LanceDB and TOS, with data profiling support",
|
|
6
6
|
"skills": ["./src/skills"],
|
|
7
7
|
"configSchema": {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@byted-las/contextlake-openclaw",
|
|
3
|
-
"version": "1.0.7",
|
|
3
|
+
"version": "1.0.9",
|
|
4
4
|
"description": "ContextLake OpenClaw Plugin for managing knowledge base",
|
|
5
5
|
"main": "index.ts",
|
|
6
6
|
"files": [
|
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
},
|
|
17
17
|
"scripts": {
|
|
18
18
|
"build": "tsc && npm run copy-assets",
|
|
19
|
-
"copy-assets": "
|
|
19
|
+
"copy-assets": "mkdir -p dist/src/skills && cp -r src/skills/* dist/src/skills/ 2>/dev/null || true && mkdir -p dist/src/lib/scripts && cp src/lib/scripts/*.py dist/src/lib/scripts/ 2>/dev/null || true",
|
|
20
20
|
"test": "vitest --reporter verbose",
|
|
21
21
|
"test:local": "npx ts-node scripts/local-test.ts",
|
|
22
22
|
"test:profiler": "npx ts-node scripts/local-profiler-test.ts",
|
|
@@ -7,7 +7,7 @@ description: |
|
|
|
7
7
|
and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
|
|
8
8
|
|
|
9
9
|
IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
|
|
10
|
-
You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) to accomplish the profiling tasks.
|
|
10
|
+
You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) via formal tool calls to accomplish the profiling tasks. DO NOT execute them as bash/shell commands.
|
|
11
11
|
---
|
|
12
12
|
|
|
13
13
|
## Trigger Scenarios
|
|
@@ -38,8 +38,8 @@ This Skill acts as a Dataset Profiling Guide. You should use the `list-s3-object
|
|
|
38
38
|
- Table names: `files`, `structured_schemas`, `media_metadata`
|
|
39
39
|
|
|
40
40
|
## Available Tools for this Skill
|
|
41
|
-
- `list-s3-objects`: To traverse and list files in the bucket/directory.
|
|
42
|
-
- `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction.
|
|
43
|
-
- `write-lance-catalog`: To write the profiling results to the LanceDB catalog.
|
|
41
|
+
- `list-s3-objects`: To traverse and list files in the bucket/directory. (Call this as an Agent Tool, NOT a bash command).
|
|
42
|
+
- `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction. (Call this as an Agent Tool, NOT a bash command).
|
|
43
|
+
- `write-lance-catalog`: To write the profiling results to the LanceDB catalog. (Call this as an Agent Tool, NOT a bash command).
|
|
44
44
|
|
|
45
45
|
Always report the final profiling summary back to the user once the `write-lance-catalog` completes successfully.
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
import { ContextLakeConfig } from '../../utils/config';
|
|
2
|
-
export interface IngestSourceParams {
|
|
3
|
-
datasource_name: string;
|
|
4
|
-
}
|
|
5
|
-
export declare function ingestSource(params: IngestSourceParams, config: ContextLakeConfig, logger?: any): Promise<({
|
|
6
|
-
file: any;
|
|
7
|
-
status: string;
|
|
8
|
-
chunks: number;
|
|
9
|
-
message?: undefined;
|
|
10
|
-
} | {
|
|
11
|
-
file: any;
|
|
12
|
-
status: string;
|
|
13
|
-
message: any;
|
|
14
|
-
chunks?: undefined;
|
|
15
|
-
})[]>;
|
|
@@ -1,193 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
-
if (k2 === undefined) k2 = k;
|
|
4
|
-
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
-
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
-
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
-
}
|
|
8
|
-
Object.defineProperty(o, k2, desc);
|
|
9
|
-
}) : (function(o, m, k, k2) {
|
|
10
|
-
if (k2 === undefined) k2 = k;
|
|
11
|
-
o[k2] = m[k];
|
|
12
|
-
}));
|
|
13
|
-
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
-
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
-
}) : function(o, v) {
|
|
16
|
-
o["default"] = v;
|
|
17
|
-
});
|
|
18
|
-
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
-
var ownKeys = function(o) {
|
|
20
|
-
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
-
var ar = [];
|
|
22
|
-
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
-
return ar;
|
|
24
|
-
};
|
|
25
|
-
return ownKeys(o);
|
|
26
|
-
};
|
|
27
|
-
return function (mod) {
|
|
28
|
-
if (mod && mod.__esModule) return mod;
|
|
29
|
-
var result = {};
|
|
30
|
-
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
-
__setModuleDefault(result, mod);
|
|
32
|
-
return result;
|
|
33
|
-
};
|
|
34
|
-
})();
|
|
35
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
-
exports.ingestSource = ingestSource;
|
|
37
|
-
const factory_1 = require("../../service/metadata/factory");
|
|
38
|
-
const las_api_1 = require("./las-api");
|
|
39
|
-
const lancedb = __importStar(require("@lancedb/lancedb"));
|
|
40
|
-
const path = __importStar(require("path"));
|
|
41
|
-
const fs = __importStar(require("fs"));
|
|
42
|
-
const os = __importStar(require("os"));
|
|
43
|
-
// @ts-ignore
|
|
44
|
-
const uuid_1 = require("uuid");
|
|
45
|
-
const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
|
|
46
|
-
async function ingestSource(params, config, logger) {
|
|
47
|
-
if (logger) {
|
|
48
|
-
logger.info(`[ContextLake-Action] Calling ingestSource with params: ${JSON.stringify(params)}`);
|
|
49
|
-
}
|
|
50
|
-
else {
|
|
51
|
-
// eslint-disable-next-line no-console
|
|
52
|
-
console.log(`[ContextLake-Action] Calling ingestSource with params: ${JSON.stringify(params)}`);
|
|
53
|
-
}
|
|
54
|
-
const dsDir = path.join(BASE_DIR, params.datasource_name);
|
|
55
|
-
const dbPath = path.join(dsDir, 'catalog_db');
|
|
56
|
-
if (!fs.existsSync(dbPath)) {
|
|
57
|
-
throw new Error(`Data source database not found at ${dbPath}. Please run profiler connect first.`);
|
|
58
|
-
}
|
|
59
|
-
const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
|
|
60
|
-
const metadataProvider = (0, factory_1.createMetadataProvider)(metaConfig);
|
|
61
|
-
await metadataProvider.connect();
|
|
62
|
-
const lasClient = new las_api_1.LasApiClient(config, logger);
|
|
63
|
-
const results = [];
|
|
64
|
-
// Connect to the profiler LanceDB to read the file catalog
|
|
65
|
-
const profilerDb = await lancedb.connect(dbPath);
|
|
66
|
-
const tableNames = await profilerDb.tableNames();
|
|
67
|
-
if (!tableNames.includes('file_catalog')) {
|
|
68
|
-
throw new Error(`table 'file_catalog' not found in ${dbPath}`);
|
|
69
|
-
}
|
|
70
|
-
const catalogTable = await profilerDb.openTable('file_catalog');
|
|
71
|
-
const files = await catalogTable.query().toArray();
|
|
72
|
-
logger?.info(`[ContextLake-Action] Found ${files.length} files in catalog`);
|
|
73
|
-
// Simple chunking for text
|
|
74
|
-
const splitText = (text, chunkSize = 500, overlap = 50) => {
|
|
75
|
-
const chunks = [];
|
|
76
|
-
if (!text)
|
|
77
|
-
return chunks;
|
|
78
|
-
let i = 0;
|
|
79
|
-
while (i < text.length) {
|
|
80
|
-
chunks.push(text.slice(i, i + chunkSize));
|
|
81
|
-
i += chunkSize - overlap;
|
|
82
|
-
}
|
|
83
|
-
return chunks;
|
|
84
|
-
};
|
|
85
|
-
const processText = async (text, fileInfo) => {
|
|
86
|
-
const chunks = splitText(text);
|
|
87
|
-
const docs = [];
|
|
88
|
-
for (const chunk of chunks) {
|
|
89
|
-
const vector = await metadataProvider.generateMultimodalEmbedding([{ type: 'text', text: chunk }]);
|
|
90
|
-
docs.push({
|
|
91
|
-
id: (0, uuid_1.v4)(),
|
|
92
|
-
vector,
|
|
93
|
-
text: chunk,
|
|
94
|
-
source: fileInfo.key,
|
|
95
|
-
file_type: fileInfo.category,
|
|
96
|
-
storage_type: 'source',
|
|
97
|
-
url: fileInfo.url || `tos://${fileInfo.bucket}/${fileInfo.key}`,
|
|
98
|
-
metadata: JSON.stringify({ datasource: params.datasource_name }),
|
|
99
|
-
created_at: Date.now(),
|
|
100
|
-
binary_data: Buffer.from('')
|
|
101
|
-
});
|
|
102
|
-
}
|
|
103
|
-
return docs;
|
|
104
|
-
};
|
|
105
|
-
for (const file of files) {
|
|
106
|
-
try {
|
|
107
|
-
logger?.info(`[ContextLake-Action] Processing file: ${file.key}, type: ${file.media_type}`);
|
|
108
|
-
let docs = [];
|
|
109
|
-
const fileUrl = file.url || `tos://${file.bucket}/${file.key}`;
|
|
110
|
-
if (file.media_type === 'pdf') {
|
|
111
|
-
// PDF Parse
|
|
112
|
-
const result = await lasClient.submitAndPoll('las_pdf_parse_doubao', {
|
|
113
|
-
url: fileUrl
|
|
114
|
-
});
|
|
115
|
-
const markdown = result.data?.markdown || '';
|
|
116
|
-
docs = await processText(markdown, file);
|
|
117
|
-
}
|
|
118
|
-
else if (file.media_type === 'image') {
|
|
119
|
-
// Multimodal Embedding directly
|
|
120
|
-
const vector = await metadataProvider.generateMultimodalEmbedding([
|
|
121
|
-
{ type: 'image_url', image_url: { url: fileUrl } },
|
|
122
|
-
{ type: 'text', text: 'This is an image from the dataset.' }
|
|
123
|
-
]);
|
|
124
|
-
docs.push({
|
|
125
|
-
id: (0, uuid_1.v4)(),
|
|
126
|
-
vector,
|
|
127
|
-
text: 'Image from dataset',
|
|
128
|
-
source: file.key,
|
|
129
|
-
file_type: 'image',
|
|
130
|
-
storage_type: 'source',
|
|
131
|
-
url: fileUrl,
|
|
132
|
-
metadata: JSON.stringify({ datasource: params.datasource_name }),
|
|
133
|
-
created_at: Date.now(),
|
|
134
|
-
binary_data: Buffer.from('')
|
|
135
|
-
});
|
|
136
|
-
}
|
|
137
|
-
else if (file.media_type === 'audio') {
|
|
138
|
-
// ASR
|
|
139
|
-
const result = await lasClient.submitAndPoll('las_asr_pro', {
|
|
140
|
-
audio: { url: fileUrl, format: file.key.split('.').pop() || 'wav' },
|
|
141
|
-
request: { model_name: 'bigmodel' }
|
|
142
|
-
});
|
|
143
|
-
const text = result.data?.result?.text || '';
|
|
144
|
-
docs = await processText(text, file);
|
|
145
|
-
}
|
|
146
|
-
else if (file.media_type === 'video') {
|
|
147
|
-
// Video understanding -> text -> embedding
|
|
148
|
-
const result = await lasClient.submitAndPoll('las_long_video_understand', {
|
|
149
|
-
video_url: fileUrl,
|
|
150
|
-
query: "详细描述这个视频的内容",
|
|
151
|
-
model_name: "doubao-seed-2-0-lite-260215"
|
|
152
|
-
});
|
|
153
|
-
// Assuming video output is a text description somewhere in the response.
|
|
154
|
-
// Note: the exact structure depends on the API return, adjusting to generic text.
|
|
155
|
-
const text = JSON.stringify(result.data || '');
|
|
156
|
-
// Also need audio extract and ASR for video
|
|
157
|
-
// 1. Extract audio
|
|
158
|
-
// The output_path_template needs a unique path per video
|
|
159
|
-
const audioOutputPath = `tos://${file.bucket}/.tmp/audio/${(0, uuid_1.v4)()}.wav`;
|
|
160
|
-
await lasClient.process('las_audio_extract_and_split', {
|
|
161
|
-
input_path: fileUrl,
|
|
162
|
-
output_path_template: audioOutputPath,
|
|
163
|
-
output_format: 'wav'
|
|
164
|
-
});
|
|
165
|
-
// 2. ASR on the extracted audio
|
|
166
|
-
// Wait briefly for object to be available if needed (often synchronous but tos takes a ms)
|
|
167
|
-
const asrResult = await lasClient.submitAndPoll('las_asr_pro', {
|
|
168
|
-
audio: { url: audioOutputPath.replace('{index}.{output_file_ext}', '0.wav'), format: 'wav' },
|
|
169
|
-
request: { model_name: 'bigmodel' }
|
|
170
|
-
});
|
|
171
|
-
const audioText = asrResult.data?.result?.text || '';
|
|
172
|
-
// Combine video text and audio text
|
|
173
|
-
const combinedText = `Video Description: ${text}\n\nAudio Transcription: ${audioText}`;
|
|
174
|
-
docs = await processText(combinedText, file);
|
|
175
|
-
}
|
|
176
|
-
else if (file.category === 'structured' || file.category === 'non-structured') {
|
|
177
|
-
// If we had a direct text content, we could process it here.
|
|
178
|
-
// Assuming basic local download or similar is available, but for now we skip raw file reading from TOS in this demo script unless implemented.
|
|
179
|
-
// Fallback just logs
|
|
180
|
-
logger?.warn(`[ContextLake-Action] Skipping raw text/structured download for ${file.key} - implement TOS download if needed`);
|
|
181
|
-
}
|
|
182
|
-
if (docs.length > 0) {
|
|
183
|
-
await metadataProvider.addAssets(docs);
|
|
184
|
-
results.push({ file: file.key, status: 'success', chunks: docs.length });
|
|
185
|
-
}
|
|
186
|
-
}
|
|
187
|
-
catch (error) {
|
|
188
|
-
logger?.error(`[ContextLake-Action] Error processing ${file.key}: ${error.message}`);
|
|
189
|
-
results.push({ file: file.key, status: 'error', message: error.message });
|
|
190
|
-
}
|
|
191
|
-
}
|
|
192
|
-
return results;
|
|
193
|
-
}
|
|
@@ -1,64 +0,0 @@
|
|
|
1
|
-
import { ContextLakeConfig } from '../../utils/config';
|
|
2
|
-
export declare function lasPdfParseDoubao(params: {
|
|
3
|
-
url: string;
|
|
4
|
-
start_page?: number;
|
|
5
|
-
num_pages?: number;
|
|
6
|
-
parse_mode?: string;
|
|
7
|
-
}, config?: ContextLakeConfig): Promise<any>;
|
|
8
|
-
export declare function lasLongVideoUnderstand(params: {
|
|
9
|
-
video_url: string;
|
|
10
|
-
prompt: string;
|
|
11
|
-
system_prompt?: string;
|
|
12
|
-
return_chunk_text?: boolean;
|
|
13
|
-
max_tokens?: number;
|
|
14
|
-
temperature?: number;
|
|
15
|
-
top_p?: number;
|
|
16
|
-
}, config?: ContextLakeConfig): Promise<any>;
|
|
17
|
-
export declare function lasBareImageTextEmbedding(params: {
|
|
18
|
-
input: Array<{
|
|
19
|
-
type: string;
|
|
20
|
-
text?: string;
|
|
21
|
-
image_url?: string;
|
|
22
|
-
}>;
|
|
23
|
-
encoding_format?: string;
|
|
24
|
-
}, config?: ContextLakeConfig): Promise<any>;
|
|
25
|
-
export declare function lasSeed20(params: {
|
|
26
|
-
model: string;
|
|
27
|
-
messages: Array<any>;
|
|
28
|
-
stream?: boolean;
|
|
29
|
-
max_tokens?: number;
|
|
30
|
-
temperature?: number;
|
|
31
|
-
top_p?: number;
|
|
32
|
-
frequency_penalty?: number;
|
|
33
|
-
presence_penalty?: number;
|
|
34
|
-
tools?: Array<any>;
|
|
35
|
-
tool_choice?: any;
|
|
36
|
-
user?: string;
|
|
37
|
-
logprobs?: boolean;
|
|
38
|
-
top_logprobs?: number;
|
|
39
|
-
}, config?: ContextLakeConfig): Promise<any>;
|
|
40
|
-
export declare function lasAsrPro(params: {
|
|
41
|
-
url?: string;
|
|
42
|
-
format?: string;
|
|
43
|
-
language?: string;
|
|
44
|
-
resource?: string;
|
|
45
|
-
use_itn?: boolean;
|
|
46
|
-
use_sn?: boolean;
|
|
47
|
-
enable_alignment?: boolean;
|
|
48
|
-
channel_id?: number;
|
|
49
|
-
use_word_info?: boolean;
|
|
50
|
-
text_format?: number;
|
|
51
|
-
enable_semantic_sentence_detection?: boolean;
|
|
52
|
-
boost_words?: Array<{
|
|
53
|
-
word: string;
|
|
54
|
-
weight: number;
|
|
55
|
-
}>;
|
|
56
|
-
}, config?: ContextLakeConfig): Promise<any>;
|
|
57
|
-
export declare function lasAudioExtractAndSplit(params: {
|
|
58
|
-
input_path: string;
|
|
59
|
-
output_path_template: string;
|
|
60
|
-
split_duration?: number;
|
|
61
|
-
output_format?: string;
|
|
62
|
-
timeout?: number;
|
|
63
|
-
extra_params?: string[];
|
|
64
|
-
}, config?: ContextLakeConfig): Promise<any>;
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.lasPdfParseDoubao = lasPdfParseDoubao;
|
|
4
|
-
exports.lasLongVideoUnderstand = lasLongVideoUnderstand;
|
|
5
|
-
exports.lasBareImageTextEmbedding = lasBareImageTextEmbedding;
|
|
6
|
-
exports.lasSeed20 = lasSeed20;
|
|
7
|
-
exports.lasAsrPro = lasAsrPro;
|
|
8
|
-
exports.lasAudioExtractAndSplit = lasAudioExtractAndSplit;
|
|
9
|
-
function getLASConfig(config) {
|
|
10
|
-
// Attempt to get from env vars or config
|
|
11
|
-
const endpoint = process.env.LAS_ENDPOINT || (config?.las)?.endpoint;
|
|
12
|
-
const apiKey = process.env.LAS_API_KEY || (config?.las)?.api_key;
|
|
13
|
-
if (!endpoint || !apiKey) {
|
|
14
|
-
throw new Error("LAS_ENDPOINT and LAS_API_KEY must be set in environment variables or config");
|
|
15
|
-
}
|
|
16
|
-
return { endpoint, apiKey };
|
|
17
|
-
}
|
|
18
|
-
async function lasFetch(path, payload, config) {
|
|
19
|
-
const { endpoint, apiKey } = getLASConfig(config);
|
|
20
|
-
const url = `${endpoint.replace(/\/$/, '')}${path}`;
|
|
21
|
-
const response = await fetch(url, {
|
|
22
|
-
method: 'POST',
|
|
23
|
-
headers: {
|
|
24
|
-
'Content-Type': 'application/json',
|
|
25
|
-
'Authorization': `Bearer ${apiKey}`
|
|
26
|
-
},
|
|
27
|
-
body: JSON.stringify(payload)
|
|
28
|
-
});
|
|
29
|
-
if (!response.ok) {
|
|
30
|
-
let errorText = await response.text().catch(() => '');
|
|
31
|
-
throw new Error(`LAS API Error: ${response.status} ${response.statusText} - ${errorText}`);
|
|
32
|
-
}
|
|
33
|
-
return await response.json();
|
|
34
|
-
}
|
|
35
|
-
async function lasPdfParseDoubao(params, config) {
|
|
36
|
-
return lasFetch('/api/v1/submit', {
|
|
37
|
-
operator_id: 'las_pdf_parse_doubao',
|
|
38
|
-
operator_version: 'v1',
|
|
39
|
-
data: params
|
|
40
|
-
}, config);
|
|
41
|
-
}
|
|
42
|
-
async function lasLongVideoUnderstand(params, config) {
|
|
43
|
-
return lasFetch('/api/v1/submit', {
|
|
44
|
-
operator_id: 'las_long_video_understand',
|
|
45
|
-
operator_version: 'v1',
|
|
46
|
-
data: params
|
|
47
|
-
}, config);
|
|
48
|
-
}
|
|
49
|
-
async function lasBareImageTextEmbedding(params, config) {
|
|
50
|
-
return lasFetch('/api/v1/embeddings/multimodal', {
|
|
51
|
-
model: 'doubao-embedding-vision',
|
|
52
|
-
input: params.input,
|
|
53
|
-
encoding_format: params.encoding_format
|
|
54
|
-
}, config);
|
|
55
|
-
}
|
|
56
|
-
async function lasSeed20(params, config) {
|
|
57
|
-
return lasFetch('/api/v1/chat/completions', params, config);
|
|
58
|
-
}
|
|
59
|
-
async function lasAsrPro(params, config) {
|
|
60
|
-
return lasFetch('/api/v1/submit', {
|
|
61
|
-
operator_id: 'las_asr_pro',
|
|
62
|
-
operator_version: 'v1',
|
|
63
|
-
data: params
|
|
64
|
-
}, config);
|
|
65
|
-
}
|
|
66
|
-
async function lasAudioExtractAndSplit(params, config) {
|
|
67
|
-
return lasFetch('/api/v1/process', {
|
|
68
|
-
operator_id: 'las_audio_extract_and_split',
|
|
69
|
-
operator_version: 'v1',
|
|
70
|
-
data: params
|
|
71
|
-
}, config);
|
|
72
|
-
}
|
|
@@ -1,617 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
s3_catalog.py -- Data source profiling tool for LanceDB
|
|
4
|
-
|
|
5
|
-
Three-pass scanning:
|
|
6
|
-
Pass 1: Walk all files -> file_catalog
|
|
7
|
-
Pass 2: Sample structured data -> structured_schemas
|
|
8
|
-
Pass 3: Read media file headers -> media_metadata
|
|
9
|
-
|
|
10
|
-
Supported sources: volcengine TOS / alibaba OSS / tencent COS / aws S3 / local
|
|
11
|
-
Output: LanceDB tables (file_catalog, structured_schemas, media_metadata)
|
|
12
|
-
"""
|
|
13
|
-
|
|
14
|
-
import argparse
|
|
15
|
-
import datetime
|
|
16
|
-
import io
|
|
17
|
-
import json
|
|
18
|
-
import mimetypes
|
|
19
|
-
import os
|
|
20
|
-
import re
|
|
21
|
-
import sys
|
|
22
|
-
import time
|
|
23
|
-
from pathlib import Path
|
|
24
|
-
from typing import Any, Dict, List, Optional, Tuple
|
|
25
|
-
|
|
26
|
-
# ---------------------------------------------------------------------------
|
|
27
|
-
# Lazy imports
|
|
28
|
-
# ---------------------------------------------------------------------------
|
|
29
|
-
|
|
30
|
-
def _import_boto3():
|
|
31
|
-
try:
|
|
32
|
-
import boto3
|
|
33
|
-
from botocore.config import Config as BotoConfig
|
|
34
|
-
return boto3, BotoConfig
|
|
35
|
-
except ImportError:
|
|
36
|
-
print("ERROR: boto3 not installed. Run: pip install boto3", file=sys.stderr)
|
|
37
|
-
sys.exit(1)
|
|
38
|
-
|
|
39
|
-
def _import_lancedb():
|
|
40
|
-
try:
|
|
41
|
-
import lancedb
|
|
42
|
-
return lancedb
|
|
43
|
-
except ImportError:
|
|
44
|
-
print("ERROR: lancedb not installed. Run: pip install lancedb", file=sys.stderr)
|
|
45
|
-
sys.exit(1)
|
|
46
|
-
|
|
47
|
-
def _import_pandas():
|
|
48
|
-
try:
|
|
49
|
-
import pandas as pd
|
|
50
|
-
return pd
|
|
51
|
-
except ImportError:
|
|
52
|
-
print("ERROR: pandas not installed. Run: pip install pandas", file=sys.stderr)
|
|
53
|
-
sys.exit(1)
|
|
54
|
-
|
|
55
|
-
def _import_pyarrow():
|
|
56
|
-
try:
|
|
57
|
-
import pyarrow as pa
|
|
58
|
-
import pyarrow.parquet as pq
|
|
59
|
-
return pa, pq
|
|
60
|
-
except ImportError:
|
|
61
|
-
print("ERROR: pyarrow not installed. Run: pip install pyarrow", file=sys.stderr)
|
|
62
|
-
sys.exit(1)
|
|
63
|
-
|
|
64
|
-
# ---------------------------------------------------------------------------
|
|
65
|
-
# Constants
|
|
66
|
-
# ---------------------------------------------------------------------------
|
|
67
|
-
|
|
68
|
-
STRUCTURED_EXTS = {'.json', '.jsonl', '.ndjson', '.csv', '.tsv', '.parquet', '.pq'}
|
|
69
|
-
IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.tiff', '.tif', '.svg', '.ico', '.heic', '.heif'}
|
|
70
|
-
AUDIO_EXTS = {'.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a', '.wma', '.opus'}
|
|
71
|
-
VIDEO_EXTS = {'.mp4', '.avi', '.mov', '.mkv', '.webm', '.wmv', '.flv', '.m4v', '.3gp'}
|
|
72
|
-
PDF_EXTS = {'.pdf'}
|
|
73
|
-
|
|
74
|
-
IMAGE_HEAD_BYTES = 64 * 1024
|
|
75
|
-
AUDIO_HEAD_BYTES = 512 * 1024
|
|
76
|
-
VIDEO_HEAD_BYTES = 2 * 1024 * 1024
|
|
77
|
-
PDF_HEAD_BYTES = 256 * 1024
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def classify_file(ext: str) -> Tuple[str, Optional[str]]:
|
|
81
|
-
ext = ext.lower()
|
|
82
|
-
if ext in STRUCTURED_EXTS:
|
|
83
|
-
return ('structured', None)
|
|
84
|
-
if ext in IMAGE_EXTS:
|
|
85
|
-
return ('non-structured', 'image')
|
|
86
|
-
if ext in AUDIO_EXTS:
|
|
87
|
-
return ('non-structured', 'audio')
|
|
88
|
-
if ext in VIDEO_EXTS:
|
|
89
|
-
return ('non-structured', 'video')
|
|
90
|
-
if ext in PDF_EXTS:
|
|
91
|
-
return ('non-structured', 'pdf')
|
|
92
|
-
return ('non-structured', None)
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
# ===================================================================
|
|
96
|
-
# S3 Client Abstraction
|
|
97
|
-
# ===================================================================
|
|
98
|
-
|
|
99
|
-
class S3Client:
|
|
100
|
-
"""Thin wrapper around boto3 S3 with vendor-specific configuration."""
|
|
101
|
-
|
|
102
|
-
def __init__(self, vendor: str, endpoint: str, credential_id: str, credential_secret: str, region: str):
|
|
103
|
-
boto3, BotoConfig = _import_boto3()
|
|
104
|
-
self.vendor = vendor
|
|
105
|
-
|
|
106
|
-
config_kwargs: Dict[str, Any] = {}
|
|
107
|
-
if vendor == 'volcengine':
|
|
108
|
-
config_kwargs['s3'] = {'addressing_style': 'virtual'}
|
|
109
|
-
elif vendor == 'alibaba':
|
|
110
|
-
config_kwargs['signature_version'] = 's3'
|
|
111
|
-
config_kwargs['s3'] = {'addressing_style': 'virtual'}
|
|
112
|
-
elif vendor == 'tencent':
|
|
113
|
-
config_kwargs['s3'] = {'addressing_style': 'virtual'}
|
|
114
|
-
|
|
115
|
-
self.client = boto3.client(
|
|
116
|
-
's3',
|
|
117
|
-
endpoint_url=endpoint or None,
|
|
118
|
-
aws_access_key_id=credential_id,
|
|
119
|
-
aws_secret_access_key=credential_secret,
|
|
120
|
-
region_name=region,
|
|
121
|
-
config=BotoConfig(**config_kwargs) if config_kwargs else None,
|
|
122
|
-
)
|
|
123
|
-
|
|
124
|
-
def list_objects(self, bucket: str, prefix: str):
|
|
125
|
-
paginator = self.client.get_paginator('list_objects_v2')
|
|
126
|
-
for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
|
|
127
|
-
for obj in page.get('Contents', []):
|
|
128
|
-
yield obj
|
|
129
|
-
|
|
130
|
-
def get_range(self, bucket: str, key: str, start: int, end: int) -> bytes:
|
|
131
|
-
resp = self.client.get_object(Bucket=bucket, Key=key, Range=f'bytes={start}-{end}')
|
|
132
|
-
return resp['Body'].read()
|
|
133
|
-
|
|
134
|
-
def get_object(self, bucket: str, key: str, max_bytes: Optional[int] = None) -> bytes:
|
|
135
|
-
kwargs: Dict[str, Any] = {'Bucket': bucket, 'Key': key}
|
|
136
|
-
if max_bytes:
|
|
137
|
-
kwargs['Range'] = f'bytes=0-{max_bytes - 1}'
|
|
138
|
-
resp = self.client.get_object(**kwargs)
|
|
139
|
-
return resp['Body'].read()
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
# ===================================================================
|
|
143
|
-
# Local FS Abstraction
|
|
144
|
-
# ===================================================================
|
|
145
|
-
|
|
146
|
-
class LocalClient:
|
|
147
|
-
def __init__(self, root: str):
|
|
148
|
-
self.root = root
|
|
149
|
-
|
|
150
|
-
def list_objects(self, bucket: str, prefix: str):
|
|
151
|
-
base = Path(bucket)
|
|
152
|
-
prefix_path = base / prefix if prefix and prefix != '.' else base
|
|
153
|
-
for dirpath, _dirs, files in os.walk(prefix_path):
|
|
154
|
-
for fname in files:
|
|
155
|
-
full = Path(dirpath) / fname
|
|
156
|
-
stat = full.stat()
|
|
157
|
-
key = str(full.relative_to(base))
|
|
158
|
-
yield {
|
|
159
|
-
'Key': key,
|
|
160
|
-
'Size': stat.st_size,
|
|
161
|
-
'LastModified': datetime.datetime.fromtimestamp(stat.st_mtime),
|
|
162
|
-
'ETag': '',
|
|
163
|
-
'StorageClass': 'LOCAL',
|
|
164
|
-
'_created_time': datetime.datetime.fromtimestamp(stat.st_ctime),
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
def get_range(self, bucket: str, key: str, start: int, end: int) -> bytes:
|
|
168
|
-
fpath = Path(bucket) / key
|
|
169
|
-
with open(fpath, 'rb') as f:
|
|
170
|
-
f.seek(start)
|
|
171
|
-
return f.read(end - start + 1)
|
|
172
|
-
|
|
173
|
-
def get_object(self, bucket: str, key: str, max_bytes: Optional[int] = None) -> bytes:
|
|
174
|
-
fpath = Path(bucket) / key
|
|
175
|
-
with open(fpath, 'rb') as f:
|
|
176
|
-
if max_bytes:
|
|
177
|
-
return f.read(max_bytes)
|
|
178
|
-
return f.read()
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
# ===================================================================
|
|
182
|
-
# Pass 1: File Catalog
|
|
183
|
-
# ===================================================================
|
|
184
|
-
|
|
185
|
-
def build_file_catalog(client, bucket: str, prefix: str, vendor: str) -> List[Dict]:
|
|
186
|
-
catalog: List[Dict] = []
|
|
187
|
-
scan_ts = datetime.datetime.utcnow().isoformat() + 'Z'
|
|
188
|
-
|
|
189
|
-
for obj in client.list_objects(bucket, prefix):
|
|
190
|
-
key = obj['Key']
|
|
191
|
-
if key.endswith('/'):
|
|
192
|
-
continue
|
|
193
|
-
|
|
194
|
-
name = os.path.basename(key)
|
|
195
|
-
ext = os.path.splitext(name)[1].lower()
|
|
196
|
-
mime, _ = mimetypes.guess_type(name)
|
|
197
|
-
category, media_type = classify_file(ext)
|
|
198
|
-
etag = obj.get('ETag', '').strip('"')
|
|
199
|
-
depth = key.count('/')
|
|
200
|
-
parent_dir = os.path.basename(os.path.dirname(key)) if '/' in key else ''
|
|
201
|
-
|
|
202
|
-
catalog.append({
|
|
203
|
-
'file_path': key,
|
|
204
|
-
'file_name': name,
|
|
205
|
-
'extension': ext,
|
|
206
|
-
'mime_type': mime or '',
|
|
207
|
-
'category': category,
|
|
208
|
-
'media_type': media_type or '',
|
|
209
|
-
'size_bytes': obj.get('Size', 0),
|
|
210
|
-
'last_modified': str(obj.get('LastModified', '')),
|
|
211
|
-
'created_time': str(obj.get('_created_time', '')),
|
|
212
|
-
'etag': etag,
|
|
213
|
-
'storage_class': obj.get('StorageClass', ''),
|
|
214
|
-
'is_multipart': '-' in etag,
|
|
215
|
-
'depth': depth,
|
|
216
|
-
'parent_dir': parent_dir,
|
|
217
|
-
'vendor': vendor,
|
|
218
|
-
'bucket': bucket,
|
|
219
|
-
'has_schema': False,
|
|
220
|
-
'has_media_meta': False,
|
|
221
|
-
'scan_timestamp': scan_ts,
|
|
222
|
-
})
|
|
223
|
-
|
|
224
|
-
return catalog
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
# ===================================================================
|
|
228
|
-
# Pass 2: Structured Schema Analysis
|
|
229
|
-
# ===================================================================
|
|
230
|
-
|
|
231
|
-
def infer_semantic_hint(series) -> Tuple[str, str]:
    """Guess a coarse semantic role for a pandas column.

    Args:
        series: A pandas ``Series`` holding the sampled values of one column.

    Returns:
        ``(hint, detail)`` where ``hint`` is one of ``'constant'``,
        ``'boolean'``, ``'numeric'``, ``'structured'``, ``'file_path'``,
        ``'timestamp'``, ``'id'``, ``'categorical'`` or ``'text'`` and
        ``detail`` is a short human-readable justification (possibly empty).
    """
    non_null = series.dropna()
    n = len(non_null)
    if n == 0:
        return ('constant', 'all null')

    # list/dict cells are unhashable: nunique()/unique()/set() below would raise
    # TypeError on them, so container columns must be recognised first.
    sample_val = non_null.iloc[0]
    if isinstance(sample_val, (list, dict)):
        return ('structured', type(sample_val).__name__)

    try:
        unique_count = non_null.nunique()
        distinct_values = set(non_null.unique())
    except TypeError:
        # A later cell is unhashable (mixed column, array-valued cells, ...).
        return ('structured', 'unhashable values')
    unique_ratio = unique_count / n

    if distinct_values.issubset({True, False, 0, 1, 'true', 'false', 'True', 'False'}):
        return ('boolean', f'{unique_count} distinct values')

    if unique_count == 1:
        return ('constant', f'value: {non_null.iloc[0]}')

    dtype_str = str(series.dtype)

    if 'int' in dtype_str or 'float' in dtype_str:
        return ('numeric', dtype_str)

    try:
        str_vals = non_null.astype(str)
        avg_len = str_vals.str.len().mean()

        # Any slash or backslash, or a known URI scheme, counts as path-like.
        path_pattern = re.compile(r'[\\\\/]|^s3://|^tos://|^gs://|^https?://')
        path_ratio = str_vals.apply(lambda x: bool(path_pattern.search(x))).mean()
        if path_ratio > 0.5:
            return ('file_path', f'{path_ratio:.0%} match path/URI pattern')

        ts_pattern = re.compile(r'\d{4}[-/]\d{2}[-/]\d{2}')
        ts_ratio = str_vals.apply(lambda x: bool(ts_pattern.search(x))).mean()
        if ts_ratio > 0.5:
            return ('timestamp', f'{ts_ratio:.0%} match timestamp pattern')

        if unique_ratio > 0.9 and avg_len < 50:
            return ('id', f'unique_ratio={unique_ratio:.2f}, avg_len={avg_len:.1f}')

        if unique_count < 50 or unique_ratio < 0.2:
            return ('categorical', f'{unique_count} categories')

        if avg_len > 50 and unique_ratio > 0.5:
            return ('text', f'avg_len={avg_len:.1f}')

    except Exception:
        # Best effort: any failure in the string heuristics falls back to 'text'.
        pass

    return ('text', '')
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
def analyze_structured_file(client, bucket: str, key: str, ext: str, sample_rows: int) -> List[Dict]:
    """Pass 2: sample one structured file and describe each of its columns.

    Args:
        client: Storage client exposing ``get_object(bucket, key, max_bytes=...)``.
        bucket: Bucket name, copied into every returned row.
        key: Object key of the file to analyse.
        ext: Lower-cased extension including the dot (e.g. ``'.csv'``).
        sample_rows: Maximum number of rows to load.

    Returns:
        One dict per column. When the download or the parse fails, a single
        ``{'file_path', 'error'}`` dict is returned instead; when nothing could
        be loaded the list is empty.
    """
    pd = _import_pandas()
    _, pq = _import_pyarrow()

    schemas: List[Dict] = []
    max_download = 2 * 1024 * 1024

    try:
        raw = client.get_object(bucket, key, max_bytes=max_download)
    except Exception as e:
        return [{'file_path': key, 'error': str(e)}]

    df = None
    fmt = ext.lstrip('.')

    try:
        if ext in ('.parquet', '.pq'):
            table = pq.read_table(io.BytesIO(raw))
            df = table.to_pandas().head(sample_rows)
            fmt = 'parquet'
        elif ext == '.csv':
            df = pd.read_csv(io.BytesIO(raw), nrows=sample_rows, on_bad_lines='skip')
        elif ext == '.tsv':
            df = pd.read_csv(io.BytesIO(raw), sep='\t', nrows=sample_rows, on_bad_lines='skip')
        elif ext in ('.jsonl', '.ndjson'):
            all_lines = raw.decode('utf-8', errors='replace').strip().split('\n')
            # The download is capped, so the final line may be cut mid-record.
            # Tolerate a decode failure there only; anything else is a real error.
            truncated = len(raw) >= max_download
            last_index = len(all_lines) - 1
            records = []
            for idx, line in enumerate(all_lines[:sample_rows]):
                if not line.strip():
                    continue
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError:
                    if truncated and idx == last_index:
                        continue
                    raise
            df = pd.json_normalize(records)
        elif ext == '.json':
            data = json.loads(raw.decode('utf-8', errors='replace'))
            if isinstance(data, list):
                df = pd.json_normalize(data[:sample_rows])
            elif isinstance(data, dict):
                df = pd.json_normalize([data])
    except Exception as e:
        return [{'file_path': key, 'error': f'parse error: {e}'}]

    if df is None or df.empty:
        return []

    for col in df.columns:
        series = df[col]
        non_null = series.dropna()
        non_null_ratio = len(non_null) / len(series) if len(series) > 0 else 0.0

        try:
            unique_count = int(non_null.nunique()) if len(non_null) > 0 else 0
            sample_values = [str(v) for v in non_null.unique()[:3]]
        except TypeError:
            # Unhashable cells (e.g. JSON arrays): count distinct text forms instead.
            unique_count = int(non_null.astype(str).nunique())
            sample_values = [str(v) for v in non_null.head(3)]
        except Exception:
            unique_count = 0
            sample_values = []

        try:
            hint, detail = infer_semantic_hint(series)
        except TypeError:
            hint, detail = ('structured', 'unhashable values')

        schemas.append({
            'file_path': key,
            'vendor': '',
            'bucket': bucket,
            'format': fmt,
            'column_name': str(col),
            'column_type': str(series.dtype),
            'non_null_ratio': round(non_null_ratio, 4),
            'unique_count': unique_count,
            'sample_values': json.dumps(sample_values, ensure_ascii=False),
            'semantic_hint': hint,
            'semantic_detail': detail,
        })

    return schemas
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
# ===================================================================
|
|
358
|
-
# Pass 3: Media Metadata Extraction
|
|
359
|
-
# ===================================================================
|
|
360
|
-
|
|
361
|
-
def extract_image_meta(data: bytes) -> Dict:
    """Read basic image attributes from the leading bytes of a file.

    Args:
        data: Raw bytes from the start of the image.

    Returns:
        Dict with ``width``, ``height``, ``image_format``, ``color_mode`` and
        ``exif_summary`` (a JSON string of at most ten EXIF tags). On any
        failure the defaults are kept and ``extract_error`` holds the message.
    """
    def _no_exif():
        return None

    result: Dict[str, Any] = {
        'width': 0,
        'height': 0,
        'image_format': '',
        'color_mode': '',
        'exif_summary': '{}',
    }
    try:
        from PIL import Image
        image = Image.open(io.BytesIO(data))
        result['width'] = image.width
        result['height'] = image.height
        result['image_format'] = image.format or ''
        result['color_mode'] = image.mode or ''

        summary = {}
        raw_exif = getattr(image, '_getexif', _no_exif)()
        if raw_exif:
            leading_tags = list(raw_exif.items())[:10]
            for tag_id, tag_value in leading_tags:
                try:
                    from PIL.ExifTags import TAGS
                    label = TAGS.get(tag_id, str(tag_id))
                    # Values are clipped to 100 characters to keep the summary small.
                    summary[label] = str(tag_value)[:100]
                except Exception:
                    pass
        result['exif_summary'] = json.dumps(summary, ensure_ascii=False)
    except Exception as exc:
        result['extract_error'] = str(exc)
    return result
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
def extract_audio_meta(data: bytes) -> Dict:
    """Read stream information and tags from the leading bytes of an audio file.

    Args:
        data: Raw bytes from the start of the audio file.

    Returns:
        Dict with ``duration_sec``, ``codec``, ``sample_rate``, ``channels``,
        ``bitrate`` and ``tags_summary`` (a JSON string of at most ten tags).
        On any failure the defaults are kept and ``extract_error`` holds the
        message.
    """
    meta: Dict[str, Any] = {
        'duration_sec': 0.0, 'codec': '', 'sample_rate': 0, 'channels': 0,
        'bitrate': 0, 'tags_summary': '{}',
    }
    try:
        import mutagen
        f = mutagen.File(io.BytesIO(data))
        # Compare against None explicitly: the parsed file object is dict-like,
        # so it is falsy when the file carries no tags, and a plain truth test
        # would skip stream-info extraction for every untagged file.
        if f is not None:
            info = getattr(f, 'info', None)
            if info is not None:
                meta['duration_sec'] = round(getattr(info, 'length', 0.0) or 0.0, 2)
                meta['sample_rate'] = getattr(info, 'sample_rate', 0)
                meta['channels'] = getattr(info, 'channels', 0)
                meta['bitrate'] = getattr(info, 'bitrate', 0)
                # The stream-info class name stands in for the codec.
                meta['codec'] = type(info).__name__

            tags = {}
            if f.tags:
                for k in list(f.tags.keys())[:10]:
                    try:
                        tags[str(k)] = str(f.tags[k])[:100]
                    except Exception:
                        pass
            meta['tags_summary'] = json.dumps(tags, ensure_ascii=False)
    except Exception as e:
        meta['extract_error'] = str(e)
    return meta
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
def extract_video_meta(data: bytes) -> Dict:
    """Identify a video container from the magic bytes at the start of a file.

    Args:
        data: Raw bytes from the start of the video file.

    Returns:
        Dict with ``width``, ``height``, ``duration_sec`` and ``container``.
        Only ``container`` is ever filled in here; the other fields keep their
        zero defaults. ``extract_error`` is added if inspection raises.
    """
    info: Dict[str, Any] = {
        'width': 0,
        'height': 0,
        'duration_sec': 0.0,
        'container': '',
    }
    try:
        magic = data[:4]
        if magic == b'\x1a\x45\xdf\xa3':
            # EBML header, shared by Matroska and WebM.
            detected = 'mkv/webm'
        elif len(data) > 8 and data[4:8] == b'ftyp':
            # ISO base media: the major brand follows the 'ftyp' box tag.
            detected = data[8:12].decode('ascii', errors='replace').strip()
        elif magic == b'RIFF':
            detected = 'avi'
        elif data[:3] == b'FLV':
            detected = 'flv'
        else:
            detected = None

        if detected is not None:
            info['container'] = detected
    except Exception as exc:
        info['extract_error'] = str(exc)
    return info
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
def extract_pdf_meta(data: bytes) -> Dict:
    """Read document-level attributes from the bytes of a PDF.

    Args:
        data: Raw bytes of the PDF (possibly only a leading portion).

    Returns:
        Dict with ``page_count``, ``pdf_title``, ``pdf_author``,
        ``creation_date``, ``encrypted``, ``page_width_pt`` and
        ``page_height_pt`` (the size of the first page). On any failure the
        fields collected so far are kept and ``extract_error`` holds the
        message.
    """
    meta: Dict[str, Any] = {
        'page_count': 0, 'pdf_title': '', 'pdf_author': '',
        'creation_date': '', 'encrypted': False,
        'page_width_pt': 0.0, 'page_height_pt': 0.0,
    }
    doc = None
    try:
        import fitz
        doc = fitz.open(stream=data, filetype='pdf')
        meta['page_count'] = doc.page_count
        md = doc.metadata or {}
        # Metadata entries may be present but None; keep these columns as strings.
        meta['pdf_title'] = md.get('title') or ''
        meta['pdf_author'] = md.get('author') or ''
        meta['creation_date'] = md.get('creationDate') or ''
        meta['encrypted'] = doc.is_encrypted

        if doc.page_count > 0:
            page = doc[0]
            rect = page.rect
            meta['page_width_pt'] = round(rect.width, 2)
            meta['page_height_pt'] = round(rect.height, 2)
    except Exception as e:
        meta['extract_error'] = str(e)
    finally:
        # Release the document even when extraction failed part-way through.
        if doc is not None:
            try:
                doc.close()
            except Exception:
                pass
    return meta
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
def extract_media_metadata(client, bucket: str, key: str, media_type: str) -> Dict:
    """Pass 3: fetch the head of a media object and run the matching extractor.

    Args:
        client: Storage client exposing ``get_object(bucket, key, max_bytes=...)``.
        bucket: Bucket that holds the object.
        key: Object key of the media file.
        media_type: One of ``'image'``, ``'audio'``, ``'video'`` or ``'pdf'``.

    Returns:
        The extractor's metadata dict, ``{'extract_error': ...}`` when the
        download fails, or an empty dict for an unrecognised ``media_type``.
    """
    byte_budgets = {
        'image': IMAGE_HEAD_BYTES,
        'audio': AUDIO_HEAD_BYTES,
        'video': VIDEO_HEAD_BYTES,
        'pdf': PDF_HEAD_BYTES,
    }
    # Unknown media types fall back to the image byte budget.
    limit = byte_budgets.get(media_type, IMAGE_HEAD_BYTES)

    try:
        payload = client.get_object(bucket, key, max_bytes=limit)
    except Exception as exc:
        return {'extract_error': f'download failed: {exc}'}

    extractors = {
        'image': extract_image_meta,
        'audio': extract_audio_meta,
        'video': extract_video_meta,
        'pdf': extract_pdf_meta,
    }
    handler = extractors.get(media_type)
    if handler is None:
        return {}
    return handler(payload)
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
# ===================================================================
|
|
490
|
-
# LanceDB Writer
|
|
491
|
-
# ===================================================================
|
|
492
|
-
|
|
493
|
-
def write_to_lancedb(db_path: str, table_name: str, records: List[Dict]):
    """Replace a LanceDB table with the given records.

    Args:
        db_path: Location passed to ``lancedb.connect``.
        table_name: Table to (re)create.
        records: Row dicts to store. An empty list is a no-op, so an existing
            table is left untouched.
    """
    if not records:
        return
    lancedb = _import_lancedb()
    pd = _import_pandas()

    db = lancedb.connect(db_path)
    df = pd.DataFrame(records)

    # Overwrite in one call instead of drop-then-create, so a failed write
    # cannot leave the database without the table.
    db.create_table(table_name, data=df, mode='overwrite')
    print(f"  [LanceDB] Wrote {len(records)} records to '{table_name}'")
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
# ===================================================================
|
|
511
|
-
# Main
|
|
512
|
-
# ===================================================================
|
|
513
|
-
|
|
514
|
-
def main():
    """Command-line entry point: catalog, analyse and persist a storage prefix.

    Runs three passes over the objects under ``--prefix`` (file catalog,
    structured-schema analysis, media metadata), writes the three result
    tables to LanceDB, and prints a one-line JSON summary on stdout.
    """
    parser = argparse.ArgumentParser(description='S3-compatible data profiler -> LanceDB')
    parser.add_argument('--vendor', required=True,
                        choices=['volcengine', 'alibaba', 'tencent', 'aws', 'local'])
    parser.add_argument('--endpoint', default='')
    parser.add_argument('--ak', default='', dest='cred_id',
                        help='Access credential ID')
    parser.add_argument('--sk', default='', dest='cred_secret',
                        help='Access credential value')
    parser.add_argument('--region', default='')
    parser.add_argument('--bucket', required=True)
    parser.add_argument('--prefix', required=True)
    parser.add_argument('--db-path', default=None, help='Path to LanceDB database. Defaults to ~/.openclaw/las-data-profiler/{datasource_name}/catalog_db if datasource_name is provided.')
    parser.add_argument('--datasource-name', default='', help='Name of the datasource. Used to determine default db-path if not explicitly provided.')
    parser.add_argument('--sample-rows', type=int, default=100)
    args = parser.parse_args()

    if not args.db_path:
        if args.datasource_name:
            home_dir = os.path.expanduser('~')
            args.db_path = os.path.join(home_dir, '.openclaw', 'las-data-profiler', args.datasource_name, 'catalog_db')
        else:
            args.db_path = './catalog_db'

    print(f"[las-data-profiler] vendor={args.vendor}, bucket={args.bucket}, prefix={args.prefix}")
    print(f"[las-data-profiler] db_path={args.db_path}")

    if args.vendor == 'local':
        client = LocalClient(args.bucket)
    else:
        client = S3Client(
            vendor=args.vendor,
            endpoint=args.endpoint,
            credential_id=args.cred_id,
            credential_secret=args.cred_secret,
            region=args.region,
        )

    # ---- Pass 1: File Catalog ----
    print("\n[Pass 1] Scanning files...")
    catalog = build_file_catalog(client, args.bucket, args.prefix, args.vendor)
    print(f"  Found {len(catalog)} files")

    # ---- Pass 2: Structured Schemas ----
    print("\n[Pass 2] Analyzing structured data...")
    structured_files = [f for f in catalog if f['category'] == 'structured']
    all_schemas: List[Dict] = []
    for i, entry in enumerate(structured_files):
        key = entry['file_path']
        ext = entry['extension']
        print(f"  [{i+1}/{len(structured_files)}] {key}")
        schemas = analyze_structured_file(client, args.bucket, key, ext, args.sample_rows)
        for s in schemas:
            s['vendor'] = args.vendor
        all_schemas.extend(schemas)
        # Only claim a schema when at least one real column row came back;
        # error rows and empty results must not set the flag.
        entry['has_schema'] = any('column_name' in s for s in schemas)

    print(f"  Analyzed {len(structured_files)} files, {len(all_schemas)} column records")

    # ---- Pass 3: Media Metadata ----
    print("\n[Pass 3] Extracting media metadata...")
    # Every media row gets the same columns. Defaults are typed per column so
    # that, for example, 'image_format' never mixes str and int values within
    # one table column.
    media_defaults = {
        'width': 0, 'height': 0, 'image_format': '', 'color_mode': '', 'exif_summary': '{}',
        'duration_sec': 0.0, 'codec': '', 'sample_rate': 0, 'channels': 0, 'bitrate': 0,
        'tags_summary': '{}', 'container': '',
        'page_count': 0, 'pdf_title': '', 'pdf_author': '', 'creation_date': '',
        'encrypted': False, 'page_width_pt': 0.0, 'page_height_pt': 0.0, 'extract_error': '',
    }
    media_files = [f for f in catalog if f['media_type'] in ('image', 'audio', 'video', 'pdf')]
    all_media_meta: List[Dict] = []
    for i, entry in enumerate(media_files):
        key = entry['file_path']
        media_type = entry['media_type']
        print(f"  [{i+1}/{len(media_files)}] {key} ({media_type})")
        meta = extract_media_metadata(client, args.bucket, key, media_type)
        extraction_failed = bool(meta.get('extract_error'))
        meta['file_path'] = key
        meta['vendor'] = args.vendor
        meta['bucket'] = args.bucket
        meta['media_type'] = media_type
        for col, default in media_defaults.items():
            if meta.get(col) is None:
                meta[col] = default
        all_media_meta.append(meta)
        entry['has_media_meta'] = not extraction_failed

    print(f"  Extracted metadata for {len(media_files)} media files")

    # ---- Write to LanceDB ----
    print(f"\n[LanceDB] Writing to {args.db_path}")
    write_to_lancedb(args.db_path, 'file_catalog', catalog)
    write_to_lancedb(args.db_path, 'structured_schemas', all_schemas)
    write_to_lancedb(args.db_path, 'media_metadata', all_media_meta)

    # ---- Summary JSON (stdout, for Node.js to parse) ----
    summary = {
        'summary': {
            'total_files': len(catalog),
            'structured_files': len(structured_files),
            'media_files': len(media_files),
        }
    }
    print(f"\n{json.dumps(summary)}")
    print("\n[las-data-profiler] Done!")


if __name__ == '__main__':
    main()
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
import { EmbeddingProvider, EmbeddingConfig } from './interface';
|
|
2
|
-
/**
 * Replaces the loader used to obtain the `node-llama-cpp` module. The supplied
 * importer is invoked immediately and its promise is reused by later loads.
 */
export declare const setNodeLlamaCppImporter: (importer: () => Promise<any>) => void;
|
|
3
|
-
/**
 * Embedding provider that runs a local model through `node-llama-cpp`.
 * The model is loaded lazily on the first embedding request.
 */
export declare class LocalEmbeddingProvider implements EmbeddingProvider {
    /** Llama runtime handle; unset until initialization completes. */
    private llama;
    /** Loaded model; unset until initialization completes. */
    private model;
    /** Embedding context created from the model. */
    private context;
    /** In-flight initialization, shared by concurrent callers. */
    private initPromise;
    /** Model identifier or path resolved from the configuration. */
    private modelPath;
    /** @param config Embedding configuration; `model_name` selects the model. */
    constructor(config: EmbeddingConfig);
    /** Starts initialization if needed and waits for it to finish. */
    private ensureInitialized;
    /** Loads the runtime, the model and the embedding context. */
    private doInitialize;
    /**
     * Embeds one text and returns the vector scaled to unit length.
     * Rejects when `text` is empty or whitespace-only.
     */
    generateEmbedding(text: string): Promise<number[]>;
    /**
     * Embeds every text in `texts`, returning vectors in input order.
     * Rejects when `texts` is not an array.
     */
    generateEmbeddings(texts: string[]): Promise<number[][]>;
}
|
|
@@ -1,107 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.LocalEmbeddingProvider = exports.setNodeLlamaCppImporter = void 0;
|
|
4
|
-
// import type { Llama, LlamaEmbeddingContext, LlamaModel } from 'node-llama-cpp';
|
|
5
|
-
const DEFAULT_LOCAL_MODEL = 'hf:CompendiumLabs/bge-small-zh-v1.5-gguf/bge-small-zh-v1.5-f16.gguf';
|
|
6
|
-
let nodeLlamaImportPromise = null;
|
|
7
|
-
/**
 * Replaces the loader used to obtain the `node-llama-cpp` module.
 * The loader runs right away; its promise becomes the cached import.
 *
 * @param {() => Promise<any>} loadModule - Function that returns the module promise.
 */
const setNodeLlamaCppImporter = (loadModule) => {
    const pendingModule = loadModule();
    nodeLlamaImportPromise = pendingModule;
};
|
|
10
|
-
exports.setNodeLlamaCppImporter = setNodeLlamaCppImporter;
|
|
11
|
-
/**
 * Returns the cached `node-llama-cpp` module promise, starting the dynamic
 * import on first use.
 *
 * @returns {Promise<any>} The module namespace promise.
 */
const importNodeLlamaCpp = async () => {
    // Keep an existing promise; otherwise kick off the import exactly once.
    nodeLlamaImportPromise = nodeLlamaImportPromise || import('node-llama-cpp');
    return nodeLlamaImportPromise;
};
|
|
17
|
-
/**
 * Embedding provider backed by a local model loaded through `node-llama-cpp`.
 * Initialization is lazy and shared: concurrent callers await one promise, and
 * a failed attempt clears its state so a later call can retry.
 */
class LocalEmbeddingProvider {
    llama = null;
    model = null;
    context = null;
    initPromise = null;
    modelPath;
    /**
     * @param {{ model_name?: string }} config - Embedding configuration.
     */
    constructor(config) {
        // Override transformers.js default with node-llama-cpp default
        this.modelPath = config.model_name === 'Xenova/all-MiniLM-L6-v2'
            ? DEFAULT_LOCAL_MODEL
            : (config.model_name || DEFAULT_LOCAL_MODEL);
    }
    /**
     * Ensures the embedding context exists, starting initialization if needed.
     * @returns {Promise<void>}
     */
    async ensureInitialized() {
        if (this.context) {
            return;
        }
        if (this.initPromise) {
            return this.initPromise;
        }
        this.initPromise = this.doInitialize().catch((err) => {
            // Reset everything so the next call can attempt a fresh start.
            this.initPromise = null;
            this.context = null;
            this.model = null;
            this.llama = null;
            throw err;
        });
        return this.initPromise;
    }
    /**
     * Loads the runtime, resolves and loads the model, then creates the
     * embedding context.
     * @throws {Error} Wrapping the underlying failure as `cause`.
     */
    async doInitialize() {
        try {
            const { getLlama, resolveModelFile, LlamaLogLevel } = await importNodeLlamaCpp();
            if (!this.llama) {
                this.llama = await getLlama({ logLevel: LlamaLogLevel.error });
            }
            if (!this.model) {
                const resolved = await resolveModelFile(this.modelPath);
                this.model = await this.llama.loadModel({ modelPath: resolved });
            }
            if (!this.context) {
                this.context = await this.model.createEmbeddingContext();
            }
        }
        catch (err) {
            const detail = err instanceof Error ? err.message : String(err);
            throw new Error(`Local embeddings unavailable. Reason: ${detail}`, {
                cause: err,
            });
        }
    }
    /**
     * Embeds one text and returns the vector scaled to unit length.
     * Non-finite components are treated as zero; an all-zero vector is
     * returned unscaled.
     *
     * @param {string} text - Non-empty input text.
     * @returns {Promise<number[]>} The normalized embedding.
     * @throws {Error} When `text` is empty or whitespace-only.
     */
    async generateEmbedding(text) {
        if (!text || !text.trim()) {
            throw new Error('Embedding input text must be a non-empty string');
        }
        await this.ensureInitialized();
        const embedding = await this.context.getEmbeddingFor(text);
        const source = embedding.vector;
        const len = source.length;
        // Sanitize into our own array: the embedding's vector belongs to the
        // library and may be frozen, so writing zeros into it can throw.
        const result = new Array(len);
        let sumSq = 0;
        for (let i = 0; i < len; i++) {
            const val = source[i];
            if (Number.isFinite(val)) {
                result[i] = val;
                sumSq += val * val;
            }
            else {
                result[i] = 0;
            }
        }
        const magnitude = Math.sqrt(sumSq);
        if (magnitude > 0) {
            const scale = 1.0 / magnitude;
            for (let i = 0; i < len; i++) {
                result[i] = result[i] * scale;
            }
        }
        return result;
    }
    /**
     * Embeds every text, preserving input order.
     *
     * @param {string[]} texts - Texts to embed.
     * @returns {Promise<number[][]>} One normalized vector per text.
     * @throws {Error} When `texts` is not an array.
     */
    async generateEmbeddings(texts) {
        if (!Array.isArray(texts)) {
            throw new Error('Embedding input must be an array of strings');
        }
        return Promise.all(texts.map((text) => this.generateEmbedding(text)));
    }
}
|
|
107
|
-
exports.LocalEmbeddingProvider = LocalEmbeddingProvider;
|
package/dist/src/skills/SKILL.md
DELETED
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: byted-las-data-profiler
|
|
3
|
-
description: |
|
|
4
|
-
Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them,
|
|
5
|
-
performs schema inference and column semantic analysis on structured data (JSONL/CSV/Parquet/JSON),
|
|
6
|
-
extracts key meta-information for media files (Image/Audio/Video/PDF) by reading only header bytes,
|
|
7
|
-
and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
|
|
8
|
-
|
|
9
|
-
IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
|
|
10
|
-
You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) to accomplish the profiling tasks.
|
|
11
|
-
---
|
|
12
|
-
|
|
13
|
-
## Trigger Scenarios
|
|
14
|
-
Be sure to use this Skill when the user mentions the following scenarios:
|
|
15
|
-
- Need to scan the file structure in a TOS bucket or understand the dataset composition
|
|
16
|
-
- Need to connect to object storage (TOS/OSS/COS/S3) using the S3 protocol
|
|
17
|
-
- Need to scan, traverse, or catalog the file structure of a specific bucket or local directory
|
|
18
|
-
- Need to understand what a batch of data files contains and what their schema looks like
|
|
19
|
-
- Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
|
|
20
|
-
- Need to write the meta-information of object storage or local files into LanceDB
|
|
21
|
-
- Mentions TOS, boto3, or object storage data profiling
|
|
22
|
-
- Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
|
|
23
|
-
- Need to batch identify the type and size of remote/local files and build an index
|
|
24
|
-
- Need to quickly understand the structure of an unfamiliar dataset (which files exist, what the schema looks like, what the fields mean)
|
|
25
|
-
- Need to connect/dock a data source for profiling
|
|
26
|
-
- Mentions "connecting" or "docking" a data source
|
|
27
|
-
|
|
28
|
-
## Overview
|
|
29
|
-
This Skill acts as a Dataset Profiling Guide. You should use the `list-s3-objects` tool to traverse the S3 bucket or local directory, use `read-s3-object` to read file contents or headers, parse the schema or media metadata, and finally use `write-lance-catalog` to save the catalog into a local LanceDB.
|
|
30
|
-
|
|
31
|
-
1. **Cataloging**: Use `list-s3-objects` to record the meta-information (path, size, etc.) of files.
|
|
32
|
-
2. **Understanding Structured Data**: Use `read-s3-object` to sample JSONL / CSV / TSV / Parquet / JSON.
|
|
33
|
-
3. **Extracting Media Meta-information**: Use `read-s3-object` with `maxBytes` to read only the file header (without downloading the full file) for images, audio, video, and PDFs to extract key attributes.
|
|
34
|
-
4. **Writing to LanceDB**: Use `write-lance-catalog` to save the results.
|
|
35
|
-
|
|
36
|
-
## Output Location
|
|
37
|
-
- LanceDB table storage path: `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`
|
|
38
|
-
- Table names: `files`, `structured_schemas`, `media_metadata`
|
|
39
|
-
|
|
40
|
-
## Available Tools for this Skill
|
|
41
|
-
- `list-s3-objects`: To traverse and list files in the bucket/directory.
|
|
42
|
-
- `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction.
|
|
43
|
-
- `write-lance-catalog`: To write the profiling results to the LanceDB catalog.
|
|
44
|
-
|
|
45
|
-
Always report the final profiling summary back to the user once the `write-lance-catalog` completes successfully.
|