@byted-las/contextlake-openclaw 1.0.0 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +2 -1
- package/dist/index.js +5 -5
- package/dist/src/client/lancedb.js +13 -4
- package/dist/src/commands/cli.d.ts +5 -2
- package/dist/src/commands/cli.js +94 -10
- package/dist/src/commands/index.d.ts +2 -1
- package/dist/src/commands/index.js +31 -35
- package/dist/src/commands/slashcmd.d.ts +8 -1
- package/dist/src/commands/slashcmd.js +90 -6
- package/dist/src/commands/tools.d.ts +10 -218
- package/dist/src/commands/tools.js +109 -104
- package/dist/src/lib/actions/ingest-source.d.ts +15 -0
- package/dist/src/lib/actions/ingest-source.js +193 -0
- package/dist/src/lib/actions/ingest.d.ts +14 -7
- package/dist/src/lib/actions/ingest.js +133 -63
- package/dist/src/lib/actions/las-api.d.ts +13 -0
- package/dist/src/lib/actions/las-api.js +105 -0
- package/dist/src/lib/actions/las-tools.d.ts +3 -0
- package/dist/src/lib/actions/las-tools.js +194 -0
- package/dist/src/lib/actions/las.d.ts +64 -0
- package/dist/src/lib/actions/las.js +72 -0
- package/dist/src/lib/actions/manage.d.ts +3 -2
- package/dist/src/{skills/las-data-profiler/index.d.ts → lib/actions/profiler.d.ts} +4 -2
- package/dist/src/{skills/las-data-profiler/index.js → lib/actions/profiler.js} +19 -3
- package/dist/src/lib/actions/retrieve.d.ts +2 -1
- package/dist/src/lib/actions/retrieve.js +2 -18
- package/{src/skills/las-data-profiler → dist/src/lib/scripts}/s3_catalog.py +10 -1
- package/dist/src/processor/loader.js +9 -2
- package/dist/src/service/embedding/factory.js +1 -10
- package/dist/src/service/embedding/interface.d.ts +8 -1
- package/dist/src/service/embedding/local.js +16 -13
- package/dist/src/service/embedding/remote.d.ts +7 -0
- package/dist/src/service/embedding/remote.js +108 -7
- package/dist/src/service/metadata/interface.d.ts +1 -0
- package/dist/src/service/metadata/local.d.ts +1 -0
- package/dist/src/service/metadata/local.js +6 -0
- package/dist/src/skills/SKILL.md +174 -0
- package/dist/src/skills/contextlake-delete/SKILL.md +36 -0
- package/dist/src/skills/contextlake-ingest/SKILL.md +40 -0
- package/dist/src/skills/contextlake-list/SKILL.md +22 -0
- package/dist/src/skills/contextlake-retrieve/SKILL.md +37 -0
- package/dist/src/skills/las-data-profiler/SKILL.md +174 -0
- package/dist/src/utils/config.d.ts +34 -1
- package/dist/src/utils/config.js +16 -3
- package/dist/src/utils/credentials.d.ts +8 -0
- package/dist/src/utils/credentials.js +77 -0
- package/index.ts +8 -8
- package/openclaw.plugin.json +1 -1
- package/package.json +8 -7
- package/src/client/lancedb.ts +32 -21
- package/src/commands/cli.ts +105 -13
- package/src/commands/index.ts +45 -42
- package/src/commands/slashcmd.ts +69 -10
- package/src/commands/tools.ts +142 -117
- package/src/lib/actions/ingest.ts +151 -75
- package/src/lib/actions/las-api.ts +119 -0
- package/src/lib/actions/las-tools.ts +196 -0
- package/src/lib/actions/manage.ts +6 -5
- package/src/{skills/las-data-profiler/index.ts → lib/actions/profiler.ts} +21 -4
- package/src/lib/actions/retrieve.ts +16 -34
- package/src/lib/scripts/s3_catalog.py +617 -0
- package/src/processor/loader.ts +12 -4
- package/src/service/embedding/factory.ts +1 -8
- package/src/service/embedding/interface.ts +9 -1
- package/src/service/embedding/remote.ts +133 -13
- package/src/service/metadata/interface.ts +1 -0
- package/src/service/metadata/local.ts +7 -0
- package/src/service/storage/factory.ts +2 -2
- package/src/utils/config.ts +61 -8
- package/src/utils/credentials.ts +50 -0
- package/bin/contextlake-openclaw.js +0 -5
- package/dist/src/skills/las-data-profiler/register.d.ts +0 -1
- package/dist/src/skills/las-data-profiler/register.js +0 -19
- package/src/service/embedding/local.ts +0 -118
- package/src/skills/las-data-profiler/register.ts +0 -19
|
@@ -1,103 +1,179 @@
|
|
|
1
1
|
import { createStorageProvider } from '../../service/storage/factory';
|
|
2
2
|
import { createMetadataProvider } from '../../service/metadata/factory';
|
|
3
|
-
import {
|
|
3
|
+
import { LasApiClient } from './las-api';
|
|
4
|
+
import { ContextLakeConfig } from '../../utils/config';
|
|
5
|
+
import { getLasTools } from './las-tools';
|
|
6
|
+
import * as lancedb from '@lancedb/lancedb';
|
|
4
7
|
import * as path from 'path';
|
|
8
|
+
import * as fs from 'fs';
|
|
9
|
+
import * as os from 'os';
|
|
5
10
|
// @ts-ignore
|
|
6
11
|
import { v4 as uuidv4 } from 'uuid';
|
|
7
12
|
|
|
8
|
-
interface
|
|
9
|
-
|
|
10
|
-
metadata?: Record<string, any>;
|
|
11
|
-
chunkSize?: number;
|
|
12
|
-
overlap?: number;
|
|
13
|
+
export interface IngestSourceParams {
|
|
14
|
+
datasource_name: string;
|
|
13
15
|
}
|
|
14
16
|
|
|
15
|
-
|
|
17
|
+
const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
|
|
18
|
+
|
|
19
|
+
export async function ingestSource(params: IngestSourceParams, config: ContextLakeConfig, logger?: any) {
|
|
16
20
|
if (logger) {
|
|
17
|
-
|
|
21
|
+
logger.info(`[ContextLake-Action] Calling ingestSource with params: ${JSON.stringify(params)}`);
|
|
18
22
|
} else {
|
|
19
|
-
|
|
20
|
-
|
|
23
|
+
// eslint-disable-next-line no-console
|
|
24
|
+
console.log(`[ContextLake-Action] Calling ingestSource with params: ${JSON.stringify(params)}`);
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
const dsDir = path.join(BASE_DIR, params.datasource_name);
|
|
28
|
+
const dbPath = path.join(dsDir, 'catalog_db');
|
|
29
|
+
|
|
30
|
+
if (!fs.existsSync(dbPath)) {
|
|
31
|
+
throw new Error(`Data source database not found at ${dbPath}. Please run profiler connect first.`);
|
|
21
32
|
}
|
|
22
33
|
|
|
23
|
-
const storageConfig = config.file_storage || { type: 'local', local_base_dir: './data/files' };
|
|
24
34
|
const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
|
|
25
|
-
|
|
26
|
-
const storageProvider = createStorageProvider(storageConfig);
|
|
27
|
-
const metadataProvider = createMetadataProvider(metaConfig);
|
|
28
|
-
|
|
35
|
+
const metadataProvider = createMetadataProvider(metaConfig as any);
|
|
29
36
|
await metadataProvider.connect();
|
|
30
|
-
const maxInlineSize = (config.storage_policy?.max_inline_size_kb || 1024) * 1024;
|
|
31
37
|
|
|
38
|
+
const lasClient = new LasApiClient(config, logger);
|
|
32
39
|
const results = [];
|
|
33
40
|
|
|
34
|
-
|
|
41
|
+
// Connect to the profiler LanceDB to read the file catalog
|
|
42
|
+
const profilerDb = await lancedb.connect(dbPath);
|
|
43
|
+
const tableNames = await profilerDb.tableNames();
|
|
44
|
+
|
|
45
|
+
if (!tableNames.includes('file_catalog')) {
|
|
46
|
+
throw new Error(`table 'file_catalog' not found in ${dbPath}`);
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
const catalogTable = await profilerDb.openTable('file_catalog');
|
|
50
|
+
const files = await catalogTable.query().toArray();
|
|
51
|
+
|
|
52
|
+
logger?.info(`[ContextLake-Action] Found ${files.length} files in catalog`);
|
|
53
|
+
|
|
54
|
+
// Simple chunking for text
|
|
55
|
+
const splitText = (text: string, chunkSize: number = 500, overlap: number = 50) => {
|
|
56
|
+
const chunks: string[] = [];
|
|
57
|
+
if (!text) return chunks;
|
|
58
|
+
let i = 0;
|
|
59
|
+
while (i < text.length) {
|
|
60
|
+
chunks.push(text.slice(i, i + chunkSize));
|
|
61
|
+
i += chunkSize - overlap;
|
|
62
|
+
}
|
|
63
|
+
return chunks;
|
|
64
|
+
};
|
|
65
|
+
|
|
66
|
+
const processText = async (text: string, fileInfo: any) => {
|
|
67
|
+
const chunks = splitText(text);
|
|
68
|
+
const docs = [];
|
|
69
|
+
for (const chunk of chunks) {
|
|
70
|
+
const vector = await metadataProvider.generateMultimodalEmbedding!([{ type: 'text', text: chunk }]);
|
|
71
|
+
docs.push({
|
|
72
|
+
id: uuidv4(),
|
|
73
|
+
vector,
|
|
74
|
+
text: chunk,
|
|
75
|
+
source: fileInfo.key,
|
|
76
|
+
file_type: fileInfo.category,
|
|
77
|
+
storage_type: 'source',
|
|
78
|
+
url: fileInfo.url || `tos://${fileInfo.bucket}/${fileInfo.key}`,
|
|
79
|
+
metadata: JSON.stringify({ datasource: params.datasource_name }),
|
|
80
|
+
created_at: Date.now(),
|
|
81
|
+
binary_data: Buffer.from('')
|
|
82
|
+
});
|
|
83
|
+
}
|
|
84
|
+
return docs;
|
|
85
|
+
};
|
|
86
|
+
|
|
87
|
+
for (const file of files) {
|
|
35
88
|
try {
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
const
|
|
39
|
-
|
|
40
|
-
let fileUrl = '';
|
|
41
|
-
let storageType = '';
|
|
42
|
-
let binaryData: Buffer = Buffer.alloc(0);
|
|
89
|
+
logger?.info(`[ContextLake-Action] Processing file: ${file.key}, type: ${file.media_type}`);
|
|
90
|
+
let docs: any[] = [];
|
|
91
|
+
const fileUrl = file.url || `tos://${file.bucket}/${file.key}`;
|
|
43
92
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
93
|
+
if (file.media_type === 'pdf') {
|
|
94
|
+
// PDF Parse
|
|
95
|
+
const result = await lasClient.submitAndPoll('las_pdf_parse_doubao', {
|
|
96
|
+
url: fileUrl
|
|
97
|
+
});
|
|
98
|
+
const markdown = result.data?.markdown || '';
|
|
99
|
+
docs = await processText(markdown, file);
|
|
100
|
+
} else if (file.media_type === 'image') {
|
|
101
|
+
// Multimodal Embedding directly
|
|
102
|
+
const vector = await metadataProvider.generateMultimodalEmbedding!([
|
|
103
|
+
{ type: 'image_url', image_url: { url: fileUrl } },
|
|
104
|
+
{ type: 'text', text: 'This is an image from the dataset.' }
|
|
105
|
+
]);
|
|
106
|
+
docs.push({
|
|
107
|
+
id: uuidv4(),
|
|
108
|
+
vector,
|
|
109
|
+
text: 'Image from dataset',
|
|
110
|
+
source: file.key,
|
|
111
|
+
file_type: 'image',
|
|
112
|
+
storage_type: 'source',
|
|
113
|
+
url: fileUrl,
|
|
114
|
+
metadata: JSON.stringify({ datasource: params.datasource_name }),
|
|
115
|
+
created_at: Date.now(),
|
|
116
|
+
binary_data: Buffer.from('')
|
|
117
|
+
});
|
|
118
|
+
} else if (file.media_type === 'audio') {
|
|
119
|
+
// ASR
|
|
120
|
+
const result = await lasClient.submitAndPoll('las_asr_pro', {
|
|
121
|
+
audio: { url: fileUrl, format: (file.key as string).split('.').pop() || 'wav' },
|
|
122
|
+
request: { model_name: 'bigmodel' }
|
|
123
|
+
});
|
|
124
|
+
const text = result.data?.result?.text || '';
|
|
125
|
+
docs = await processText(text, file);
|
|
126
|
+
} else if (file.media_type === 'video') {
|
|
127
|
+
// Video understanding -> text -> embedding
|
|
128
|
+
const result = await lasClient.submitAndPoll('las_long_video_understand', {
|
|
129
|
+
video_url: fileUrl,
|
|
130
|
+
query: "详细描述这个视频的内容",
|
|
131
|
+
model_name: "doubao-seed-2-0-lite-260215"
|
|
132
|
+
});
|
|
133
|
+
// Assuming video output is a text description somewhere in the response.
|
|
134
|
+
// Note: the exact structure depends on the API return, adjusting to generic text.
|
|
135
|
+
const text = JSON.stringify(result.data || '');
|
|
136
|
+
|
|
137
|
+
// Also need audio extract and ASR for video
|
|
138
|
+
// 1. Extract audio
|
|
139
|
+
// The output_path_template needs a unique path per video
|
|
140
|
+
const audioOutputPath = `tos://${file.bucket}/.tmp/audio/${uuidv4()}.wav`;
|
|
141
|
+
await lasClient.process('las_audio_extract_and_split', {
|
|
142
|
+
input_path: fileUrl,
|
|
143
|
+
output_path_template: audioOutputPath,
|
|
144
|
+
output_format: 'wav'
|
|
145
|
+
});
|
|
146
|
+
|
|
147
|
+
// 2. ASR on the extracted audio
|
|
148
|
+
// Wait briefly for object to be available if needed (often synchronous but tos takes a ms)
|
|
149
|
+
const asrResult = await lasClient.submitAndPoll('las_asr_pro', {
|
|
150
|
+
audio: { url: audioOutputPath.replace('{index}.{output_file_ext}', '0.wav'), format: 'wav' },
|
|
151
|
+
request: { model_name: 'bigmodel' }
|
|
152
|
+
});
|
|
153
|
+
|
|
154
|
+
const audioText = asrResult.data?.result?.text || '';
|
|
155
|
+
|
|
156
|
+
// Combine video text and audio text
|
|
157
|
+
const combinedText = `Video Description: ${text}\n\nAudio Transcription: ${audioText}`;
|
|
158
|
+
docs = await processText(combinedText, file);
|
|
159
|
+
|
|
160
|
+
} else if (file.category === 'structured' || file.category === 'non-structured') {
|
|
161
|
+
// If we had a direct text content, we could process it here.
|
|
162
|
+
// Assuming basic local download or similar is available, but for now we skip raw file reading from TOS in this demo script unless implemented.
|
|
163
|
+
// Fallback just logs
|
|
164
|
+
logger?.warn(`[ContextLake-Action] Skipping raw text/structured download for ${file.key} - implement TOS download if needed`);
|
|
52
165
|
}
|
|
53
166
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
// If no text extracted (e.g. image), store one entry with empty text but with metadata/binary
|
|
58
|
-
if (chunks.length === 0) {
|
|
59
|
-
const vector = await metadataProvider.generateEmbedding(fileName); // Embed filename as fallback
|
|
60
|
-
docs.push({
|
|
61
|
-
id: uuidv4(),
|
|
62
|
-
vector,
|
|
63
|
-
text: '',
|
|
64
|
-
source: fileName,
|
|
65
|
-
file_type: type,
|
|
66
|
-
storage_type: storageType,
|
|
67
|
-
url: fileUrl,
|
|
68
|
-
metadata: JSON.stringify(params.metadata || {}),
|
|
69
|
-
created_at: createdAt,
|
|
70
|
-
binary_data: binaryData
|
|
71
|
-
});
|
|
72
|
-
} else {
|
|
73
|
-
for (const chunk of chunks) {
|
|
74
|
-
const vector = await metadataProvider.generateEmbedding(chunk);
|
|
75
|
-
docs.push({
|
|
76
|
-
id: uuidv4(),
|
|
77
|
-
vector,
|
|
78
|
-
text: chunk,
|
|
79
|
-
source: fileName,
|
|
80
|
-
file_type: type,
|
|
81
|
-
storage_type: storageType,
|
|
82
|
-
url: fileUrl,
|
|
83
|
-
metadata: JSON.stringify(params.metadata || {}),
|
|
84
|
-
created_at: createdAt,
|
|
85
|
-
binary_data: binaryData // Only attach to first chunk
|
|
86
|
-
});
|
|
87
|
-
// Clear binary data for subsequent chunks of the same file to avoid duplication
|
|
88
|
-
binaryData = Buffer.alloc(0);
|
|
89
|
-
}
|
|
167
|
+
if (docs.length > 0) {
|
|
168
|
+
await metadataProvider.addAssets(docs);
|
|
169
|
+
results.push({ file: file.key, status: 'success', chunks: docs.length });
|
|
90
170
|
}
|
|
91
171
|
|
|
92
|
-
await metadataProvider.addAssets(docs);
|
|
93
|
-
results.push({ file: fileName, status: 'success', chunks: docs.length });
|
|
94
|
-
|
|
95
172
|
} catch (error: any) {
|
|
96
|
-
|
|
97
|
-
results.push({ file:
|
|
173
|
+
logger?.error(`[ContextLake-Action] Error processing ${file.key}: ${error.message}`);
|
|
174
|
+
results.push({ file: file.key, status: 'error', message: error.message });
|
|
98
175
|
}
|
|
99
176
|
}
|
|
100
177
|
|
|
101
|
-
|
|
102
|
-
return JSON.parse(JSON.stringify(results));
|
|
178
|
+
return results;
|
|
103
179
|
}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import { ContextLakeConfig } from '../../utils/config';
|
|
2
|
+
|
|
3
|
+
export class LasApiClient {
|
|
4
|
+
private endpoint: string;
|
|
5
|
+
private apiKey: string;
|
|
6
|
+
private logger: any;
|
|
7
|
+
|
|
8
|
+
constructor(config: ContextLakeConfig, logger: any) {
|
|
9
|
+
this.logger = logger;
|
|
10
|
+
this.apiKey = config.metadata_storage?.embedding?.api_key || process.env.LAS_API_KEY || '';
|
|
11
|
+
this.endpoint = config.metadata_storage?.embedding?.api_base || process.env.LAS_BASE_URL || 'https://operator.las.cn-beijing.volces.com';
|
|
12
|
+
|
|
13
|
+
// Remove trailing slash
|
|
14
|
+
if (this.endpoint.endsWith('/')) {
|
|
15
|
+
this.endpoint = this.endpoint.slice(0, -1);
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
private async request(path: string, body: any) {
|
|
20
|
+
if (!this.apiKey) {
|
|
21
|
+
throw new Error('LAS_API_KEY is not configured. Please set it in config or environment variables.');
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
const url = `${this.endpoint}${path}`;
|
|
25
|
+
this.logger.debug(`[LasApiClient] Requesting ${url}`, { body: JSON.stringify(body) });
|
|
26
|
+
|
|
27
|
+
const response = await fetch(url, {
|
|
28
|
+
method: 'POST',
|
|
29
|
+
headers: {
|
|
30
|
+
'Content-Type': 'application/json',
|
|
31
|
+
'Authorization': `Bearer ${this.apiKey}`
|
|
32
|
+
},
|
|
33
|
+
body: JSON.stringify(body)
|
|
34
|
+
});
|
|
35
|
+
|
|
36
|
+
if (!response.ok) {
|
|
37
|
+
let errorText = '';
|
|
38
|
+
try {
|
|
39
|
+
errorText = await response.text();
|
|
40
|
+
} catch (e) { }
|
|
41
|
+
throw new Error(`LAS API Error: ${response.status} ${response.statusText} - ${errorText}`);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
const result = await response.json();
|
|
45
|
+
return result;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
async process(operatorId: string, data: any, version: string = 'v1') {
|
|
49
|
+
const result = await this.request('/api/v1/process', {
|
|
50
|
+
operator_id: operatorId,
|
|
51
|
+
operator_version: version,
|
|
52
|
+
data
|
|
53
|
+
});
|
|
54
|
+
return result;
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
async submit(operatorId: string, data: any, version: string = 'v1') {
|
|
58
|
+
const result = await this.request('/api/v1/submit', {
|
|
59
|
+
operator_id: operatorId,
|
|
60
|
+
operator_version: version,
|
|
61
|
+
data
|
|
62
|
+
});
|
|
63
|
+
return result;
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
async poll(operatorId: string, taskId: string, version: string = 'v1') {
|
|
67
|
+
const result = await this.request('/api/v1/poll', {
|
|
68
|
+
operator_id: operatorId,
|
|
69
|
+
operator_version: version,
|
|
70
|
+
task_id: taskId
|
|
71
|
+
});
|
|
72
|
+
return result;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
async submitAndPoll(operatorId: string, data: any, version: string = 'v1', pollIntervalMs: number = 3000, maxRetries: number = 200) {
|
|
76
|
+
const submitResult = await this.submit(operatorId, data, version);
|
|
77
|
+
|
|
78
|
+
if (!submitResult?.metadata?.task_id) {
|
|
79
|
+
throw new Error(`Failed to submit task for ${operatorId}. Response: ${JSON.stringify(submitResult)}`);
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
const taskId = submitResult.metadata.task_id;
|
|
83
|
+
this.logger.info(`[LasApiClient] Task submitted: ${taskId} for ${operatorId}`);
|
|
84
|
+
|
|
85
|
+
let retries = 0;
|
|
86
|
+
while (retries < maxRetries) {
|
|
87
|
+
await new Promise(resolve => setTimeout(resolve, pollIntervalMs));
|
|
88
|
+
|
|
89
|
+
const pollResult = await this.poll(operatorId, taskId, version);
|
|
90
|
+
const status = pollResult?.metadata?.task_status;
|
|
91
|
+
|
|
92
|
+
this.logger.debug(`[LasApiClient] Task ${taskId} status: ${status}`);
|
|
93
|
+
|
|
94
|
+
if (status === 'COMPLETED') {
|
|
95
|
+
return pollResult;
|
|
96
|
+
} else if (status === 'FAILED' || status === 'TIMEOUT') {
|
|
97
|
+
const errorMsg = pollResult?.metadata?.error_msg || 'Unknown error';
|
|
98
|
+
throw new Error(`Task ${taskId} failed with status: ${status}. Message: ${errorMsg}`);
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
retries++;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
throw new Error(`Task ${taskId} timed out after ${maxRetries} polling attempts.`);
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
async multimodalEmbedding(model: string, input: any[], encodingFormat: string = 'float', dimensions?: number, instructions?: string, sparseEmbedding?: any) {
|
|
108
|
+
const body: any = {
|
|
109
|
+
model,
|
|
110
|
+
input,
|
|
111
|
+
encoding_format: encodingFormat
|
|
112
|
+
};
|
|
113
|
+
if (dimensions) body.dimensions = dimensions;
|
|
114
|
+
if (instructions) body.instructions = instructions;
|
|
115
|
+
if (sparseEmbedding) body.sparse_embedding = sparseEmbedding;
|
|
116
|
+
|
|
117
|
+
return await this.request('/api/v1/embeddings/multimodal', body);
|
|
118
|
+
}
|
|
119
|
+
}
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
import { LasApiClient } from './las-api';
|
|
2
|
+
import { ContextLakeConfig } from '../../utils/config';
|
|
3
|
+
// @ts-ignore
|
|
4
|
+
import type { AnyAgentTool } from 'openclaw/plugin-sdk';
|
|
5
|
+
|
|
6
|
+
export function getLasTools(pluginConfig: ContextLakeConfig, logger: any): AnyAgentTool[] {
|
|
7
|
+
const apiClient = new LasApiClient(pluginConfig, logger);
|
|
8
|
+
|
|
9
|
+
const callApi = async (method: string, args: any[]) => {
|
|
10
|
+
try {
|
|
11
|
+
// @ts-ignore
|
|
12
|
+
return await apiClient[method](...args);
|
|
13
|
+
} catch (error: any) {
|
|
14
|
+
logger.error(`[LasTools] API ${method} failed`, { error: error.message });
|
|
15
|
+
return { error: error.message };
|
|
16
|
+
}
|
|
17
|
+
};
|
|
18
|
+
|
|
19
|
+
return [
|
|
20
|
+
{
|
|
21
|
+
name: 'las_image_resample',
|
|
22
|
+
label: 'LAS Image Resample',
|
|
23
|
+
description: `Resample/Resize an image and save it to TOS.
|
|
24
|
+
Parameters in data:
|
|
25
|
+
- image_src_type (string, default: "image_url"): "image_url" or "image_tos"
|
|
26
|
+
- image (string, required): URL or tos:// path
|
|
27
|
+
- tos_dir (string, required): tos:// output directory
|
|
28
|
+
- image_suffix (string): ".jpg" or ".png"
|
|
29
|
+
- target_size (array of integers): e.g. [1024, 1024]
|
|
30
|
+
- target_dpi (array of integers): e.g. [72, 72]
|
|
31
|
+
- method (string): "nearest", "bilinear", "bicubic", "lanczos"`,
|
|
32
|
+
parameters: {
|
|
33
|
+
type: 'object',
|
|
34
|
+
properties: { data: { type: 'object', additionalProperties: true } },
|
|
35
|
+
required: ['data']
|
|
36
|
+
},
|
|
37
|
+
async execute(toolCallId: string, params: any) {
|
|
38
|
+
return await callApi('process', ['las_image_resample', params.data]);
|
|
39
|
+
}
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
name: 'las_audio_extract_and_split',
|
|
43
|
+
label: 'LAS Audio Extract and Split',
|
|
44
|
+
description: `Extract audio from video and split it into chunks.
|
|
45
|
+
Parameters in data:
|
|
46
|
+
- input_path (string, required): tos:// video path
|
|
47
|
+
- output_path_template (string, required): e.g. tos://bucket/{index}.{output_file_ext}
|
|
48
|
+
- split_duration (number): duration in seconds, default 30.0
|
|
49
|
+
- output_format (string): "wav", "mp3", "flac"
|
|
50
|
+
- timeout (integer)
|
|
51
|
+
- extra_params (array of string): ffmpeg params`,
|
|
52
|
+
parameters: {
|
|
53
|
+
type: 'object',
|
|
54
|
+
properties: { data: { type: 'object', additionalProperties: true } },
|
|
55
|
+
required: ['data']
|
|
56
|
+
},
|
|
57
|
+
async execute(toolCallId: string, params: any) {
|
|
58
|
+
return await callApi('process', ['las_audio_extract_and_split', params.data]);
|
|
59
|
+
}
|
|
60
|
+
},
|
|
61
|
+
{
|
|
62
|
+
name: 'las_audio_convert',
|
|
63
|
+
label: 'LAS Audio Convert',
|
|
64
|
+
description: `Convert audio format.
|
|
65
|
+
Parameters in data:
|
|
66
|
+
- input_path (string, required): tos:// audio path
|
|
67
|
+
- output_path (string, required): tos:// output path
|
|
68
|
+
- output_format (string): "wav", "mp3", "flac"
|
|
69
|
+
- extra_params (array of string): ffmpeg params`,
|
|
70
|
+
parameters: {
|
|
71
|
+
type: 'object',
|
|
72
|
+
properties: { data: { type: 'object', additionalProperties: true } },
|
|
73
|
+
required: ['data']
|
|
74
|
+
},
|
|
75
|
+
async execute(toolCallId: string, params: any) {
|
|
76
|
+
return await callApi('process', ['las_audio_convert', params.data]);
|
|
77
|
+
}
|
|
78
|
+
},
|
|
79
|
+
{
|
|
80
|
+
name: 'las_asr_pro',
|
|
81
|
+
label: 'LAS ASR Pro (Speech Recognition)',
|
|
82
|
+
description: `Perform automatic speech recognition (ASR).
|
|
83
|
+
Parameters in data:
|
|
84
|
+
- resource: "bigasr" or "seedasr"
|
|
85
|
+
- audio (object, required): { url: string, language: string, format: string }
|
|
86
|
+
- request (object, required): { model_name: "bigmodel", ... }
|
|
87
|
+
- user (object): { uid: string }`,
|
|
88
|
+
parameters: {
|
|
89
|
+
type: 'object',
|
|
90
|
+
properties: { data: { type: 'object', additionalProperties: true } },
|
|
91
|
+
required: ['data']
|
|
92
|
+
},
|
|
93
|
+
async execute(toolCallId: string, params: any) {
|
|
94
|
+
return await callApi('submitAndPoll', ['las_asr_pro', params.data]);
|
|
95
|
+
}
|
|
96
|
+
},
|
|
97
|
+
{
|
|
98
|
+
name: 'las_seed_2_0',
|
|
99
|
+
label: 'LAS Seed 2.0 (Audio)',
|
|
100
|
+
description: `ASR with Seed 2.0.
|
|
101
|
+
Parameters in data:
|
|
102
|
+
- audio (object, required): { url, format, language }
|
|
103
|
+
- request (object, required): { model_name: "seedasr", ... }`,
|
|
104
|
+
parameters: {
|
|
105
|
+
type: 'object',
|
|
106
|
+
properties: { data: { type: 'object', additionalProperties: true } },
|
|
107
|
+
required: ['data']
|
|
108
|
+
},
|
|
109
|
+
async execute(toolCallId: string, params: any) {
|
|
110
|
+
return await callApi('submitAndPoll', ['las_seed_2_0', params.data]);
|
|
111
|
+
}
|
|
112
|
+
},
|
|
113
|
+
{
|
|
114
|
+
name: 'las_bare_image_text_embedding',
|
|
115
|
+
label: 'LAS Multimodal Embedding',
|
|
116
|
+
description: `Multimodal Embedding (image and text).
|
|
117
|
+
Parameters:
|
|
118
|
+
- model (string, required): "doubao-embedding-vision-250615"
|
|
119
|
+
- input (array of objects, required): [ { type: "image_url", image_url: { url: "..." } }, { type: "text", text: "..." } ]
|
|
120
|
+
- encoding_format (string): "float", "base64"
|
|
121
|
+
- dimensions (integer): 1024 or 2048`,
|
|
122
|
+
parameters: {
|
|
123
|
+
type: 'object',
|
|
124
|
+
properties: {
|
|
125
|
+
model: { type: 'string', default: 'doubao-embedding-vision-250615' },
|
|
126
|
+
input: { type: 'array', items: { type: 'object' } },
|
|
127
|
+
encoding_format: { type: 'string', default: 'float' },
|
|
128
|
+
dimensions: { type: 'integer' }
|
|
129
|
+
},
|
|
130
|
+
required: ['model', 'input']
|
|
131
|
+
},
|
|
132
|
+
async execute(toolCallId: string, params: any) {
|
|
133
|
+
return await callApi('multimodalEmbedding', [
|
|
134
|
+
params.model,
|
|
135
|
+
params.input,
|
|
136
|
+
params.encoding_format,
|
|
137
|
+
params.dimensions
|
|
138
|
+
]);
|
|
139
|
+
}
|
|
140
|
+
},
|
|
141
|
+
{
|
|
142
|
+
name: 'las_long_video_understand',
|
|
143
|
+
label: 'LAS Long Video Understand',
|
|
144
|
+
description: `Long Video Understanding.
|
|
145
|
+
Parameters in data:
|
|
146
|
+
- video_url (string, required)
|
|
147
|
+
- query (string, required)
|
|
148
|
+
- model_name (string): default "doubao-seed-2-0-lite-260215"
|
|
149
|
+
- ...other params like fps, target_tokens_per_clip`,
|
|
150
|
+
parameters: {
|
|
151
|
+
type: 'object',
|
|
152
|
+
properties: { data: { type: 'object', additionalProperties: true } },
|
|
153
|
+
required: ['data']
|
|
154
|
+
},
|
|
155
|
+
async execute(toolCallId: string, params: any) {
|
|
156
|
+
return await callApi('submitAndPoll', ['las_long_video_understand', params.data]);
|
|
157
|
+
}
|
|
158
|
+
},
|
|
159
|
+
{
|
|
160
|
+
name: 'las_pdf_parse_doubao',
|
|
161
|
+
label: 'LAS PDF Parse Doubao',
|
|
162
|
+
description: `Parse PDF documents to Markdown.
|
|
163
|
+
Parameters in data:
|
|
164
|
+
- url (string, required): PDF URL or tos://
|
|
165
|
+
- start_page (integer): default 1
|
|
166
|
+
- num_pages (integer): default to end
|
|
167
|
+
- parse_mode (string): "normal" or "detail"`,
|
|
168
|
+
parameters: {
|
|
169
|
+
type: 'object',
|
|
170
|
+
properties: { data: { type: 'object', additionalProperties: true } },
|
|
171
|
+
required: ['data']
|
|
172
|
+
},
|
|
173
|
+
async execute(toolCallId: string, params: any) {
|
|
174
|
+
return await callApi('submitAndPoll', ['las_pdf_parse_doubao', params.data]);
|
|
175
|
+
}
|
|
176
|
+
},
|
|
177
|
+
{
|
|
178
|
+
name: 'las_video_resize',
|
|
179
|
+
label: 'LAS Video Resize',
|
|
180
|
+
description: `Resize video.
|
|
181
|
+
Parameters in data:
|
|
182
|
+
- video_url (string, required): URL or tos://
|
|
183
|
+
- target_width (integer)
|
|
184
|
+
- target_height (integer)
|
|
185
|
+
- output_dir (string, required): tos://`,
|
|
186
|
+
parameters: {
|
|
187
|
+
type: 'object',
|
|
188
|
+
properties: { data: { type: 'object', additionalProperties: true } },
|
|
189
|
+
required: ['data']
|
|
190
|
+
},
|
|
191
|
+
async execute(toolCallId: string, params: any) {
|
|
192
|
+
return await callApi('submitAndPoll', ['las_video_resize', params.data]);
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
];
|
|
196
|
+
}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { createMetadataProvider } from '../../service/metadata/factory';
|
|
2
2
|
import { createStorageProvider } from '../../service/storage/factory';
|
|
3
|
+
import { ContextLakeConfig } from '../../utils/config';
|
|
3
4
|
|
|
4
5
|
interface ListParams {
|
|
5
6
|
limit?: number;
|
|
@@ -10,7 +11,7 @@ interface DeleteParams {
|
|
|
10
11
|
filter?: string;
|
|
11
12
|
}
|
|
12
13
|
|
|
13
|
-
export async function listAssets(params: ListParams, config:
|
|
14
|
+
export async function listAssets(params: ListParams, config: ContextLakeConfig, logger?: any) {
|
|
14
15
|
if (logger) {
|
|
15
16
|
logger.info(`[ContextLake-Action] Calling listAssets with params: ${JSON.stringify(params)}`);
|
|
16
17
|
} else {
|
|
@@ -20,7 +21,7 @@ export async function listAssets(params: ListParams, config: any, logger?: any)
|
|
|
20
21
|
|
|
21
22
|
// Ensure config has default if not provided
|
|
22
23
|
const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
|
|
23
|
-
const metadataProvider = createMetadataProvider(metaConfig);
|
|
24
|
+
const metadataProvider = createMetadataProvider(metaConfig as any);
|
|
24
25
|
|
|
25
26
|
await metadataProvider.connect();
|
|
26
27
|
const docs = await metadataProvider.list(params.limit || 100);
|
|
@@ -47,7 +48,7 @@ export async function listAssets(params: ListParams, config: any, logger?: any)
|
|
|
47
48
|
return JSON.parse(JSON.stringify(Array.from(fileMap.values())));
|
|
48
49
|
}
|
|
49
50
|
|
|
50
|
-
export async function deleteAssets(params: DeleteParams, config:
|
|
51
|
+
export async function deleteAssets(params: DeleteParams, config: ContextLakeConfig, logger?: any) {
|
|
51
52
|
if (logger) {
|
|
52
53
|
logger.info(`[ContextLake-Action] Calling deleteAssets with params: ${JSON.stringify(params)}`);
|
|
53
54
|
} else {
|
|
@@ -56,13 +57,13 @@ export async function deleteAssets(params: DeleteParams, config: any, logger?: a
|
|
|
56
57
|
}
|
|
57
58
|
|
|
58
59
|
const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
|
|
59
|
-
const metadataProvider = createMetadataProvider(metaConfig);
|
|
60
|
+
const metadataProvider = createMetadataProvider(metaConfig as any);
|
|
60
61
|
|
|
61
62
|
// file_storage config is optional for deletion (we might not need to delete from storage if inline)
|
|
62
63
|
// Check if file_storage config exists before creating provider
|
|
63
64
|
let storageProvider;
|
|
64
65
|
if (config.file_storage && config.file_storage.type) {
|
|
65
|
-
storageProvider = createStorageProvider(config.file_storage);
|
|
66
|
+
storageProvider = createStorageProvider(config.file_storage as any);
|
|
66
67
|
}
|
|
67
68
|
|
|
68
69
|
await metadataProvider.connect();
|
|
@@ -19,7 +19,7 @@ export interface ConnectParams {
|
|
|
19
19
|
sample_rows?: number;
|
|
20
20
|
}
|
|
21
21
|
|
|
22
|
-
interface ConnectResult {
|
|
22
|
+
export interface ConnectResult {
|
|
23
23
|
status: 'success' | 'error';
|
|
24
24
|
datasource_name: string;
|
|
25
25
|
db_path: string;
|
|
@@ -37,7 +37,7 @@ interface ConnectResult {
|
|
|
37
37
|
// Constants
|
|
38
38
|
// ---------------------------------------------------------------------------
|
|
39
39
|
|
|
40
|
-
const BASE_DIR = path.join(os.homedir(), '.openclaw', '
|
|
40
|
+
const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
|
|
41
41
|
const PYTHON_DEPS = ['boto3', 'lancedb', 'pyarrow', 'pandas', 'Pillow', 'mutagen', 'pymupdf'];
|
|
42
42
|
|
|
43
43
|
// ---------------------------------------------------------------------------
|
|
@@ -113,8 +113,8 @@ function ensurePythonDeps(): void {
|
|
|
113
113
|
* Get the path to the bundled Python script.
|
|
114
114
|
*/
|
|
115
115
|
function getScriptPath(): string {
|
|
116
|
-
// The Python script is
|
|
117
|
-
return path.join(__dirname, 's3_catalog.py');
|
|
116
|
+
// The Python script is located in the scripts directory
|
|
117
|
+
return path.join(__dirname, '../scripts', 's3_catalog.py');
|
|
118
118
|
}
|
|
119
119
|
|
|
120
120
|
// ---------------------------------------------------------------------------
|
|
@@ -252,3 +252,20 @@ export async function connectDataSource(
|
|
|
252
252
|
});
|
|
253
253
|
});
|
|
254
254
|
}
|
|
255
|
+
|
|
256
|
+
export async function listDataSources(
|
|
257
|
+
_ctx?: any
|
|
258
|
+
): Promise<{ datasources: string[] }> {
|
|
259
|
+
try {
|
|
260
|
+
if (!fs.existsSync(BASE_DIR)) {
|
|
261
|
+
return { datasources: [] };
|
|
262
|
+
}
|
|
263
|
+
const entries = fs.readdirSync(BASE_DIR, { withFileTypes: true });
|
|
264
|
+
const datasources = entries
|
|
265
|
+
.filter(entry => entry.isDirectory())
|
|
266
|
+
.map(entry => entry.name);
|
|
267
|
+
return { datasources };
|
|
268
|
+
} catch (error: any) {
|
|
269
|
+
throw new Error(`Failed to list data sources: ${error.message}`);
|
|
270
|
+
}
|
|
271
|
+
}
|