@byted-las/contextlake-openclaw 1.0.0 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +2 -1
- package/dist/index.js +5 -5
- package/dist/src/client/lancedb.js +13 -4
- package/dist/src/commands/cli.d.ts +5 -2
- package/dist/src/commands/cli.js +94 -10
- package/dist/src/commands/index.d.ts +2 -1
- package/dist/src/commands/index.js +31 -35
- package/dist/src/commands/slashcmd.d.ts +8 -1
- package/dist/src/commands/slashcmd.js +90 -6
- package/dist/src/commands/tools.d.ts +10 -218
- package/dist/src/commands/tools.js +109 -104
- package/dist/src/lib/actions/ingest-source.d.ts +15 -0
- package/dist/src/lib/actions/ingest-source.js +193 -0
- package/dist/src/lib/actions/ingest.d.ts +14 -7
- package/dist/src/lib/actions/ingest.js +133 -63
- package/dist/src/lib/actions/las-api.d.ts +13 -0
- package/dist/src/lib/actions/las-api.js +105 -0
- package/dist/src/lib/actions/las-tools.d.ts +3 -0
- package/dist/src/lib/actions/las-tools.js +194 -0
- package/dist/src/lib/actions/las.d.ts +64 -0
- package/dist/src/lib/actions/las.js +72 -0
- package/dist/src/lib/actions/manage.d.ts +3 -2
- package/dist/src/{skills/las-data-profiler/index.d.ts → lib/actions/profiler.d.ts} +4 -2
- package/dist/src/{skills/las-data-profiler/index.js → lib/actions/profiler.js} +19 -3
- package/dist/src/lib/actions/retrieve.d.ts +2 -1
- package/dist/src/lib/actions/retrieve.js +2 -18
- package/{src/skills/las-data-profiler → dist/src/lib/scripts}/s3_catalog.py +10 -1
- package/dist/src/processor/loader.js +9 -2
- package/dist/src/service/embedding/factory.js +1 -10
- package/dist/src/service/embedding/interface.d.ts +8 -1
- package/dist/src/service/embedding/local.js +16 -13
- package/dist/src/service/embedding/remote.d.ts +7 -0
- package/dist/src/service/embedding/remote.js +108 -7
- package/dist/src/service/metadata/interface.d.ts +1 -0
- package/dist/src/service/metadata/local.d.ts +1 -0
- package/dist/src/service/metadata/local.js +6 -0
- package/dist/src/skills/SKILL.md +174 -0
- package/dist/src/skills/contextlake-delete/SKILL.md +36 -0
- package/dist/src/skills/contextlake-ingest/SKILL.md +40 -0
- package/dist/src/skills/contextlake-list/SKILL.md +22 -0
- package/dist/src/skills/contextlake-retrieve/SKILL.md +37 -0
- package/dist/src/skills/las-data-profiler/SKILL.md +174 -0
- package/dist/src/utils/config.d.ts +34 -1
- package/dist/src/utils/config.js +16 -3
- package/dist/src/utils/credentials.d.ts +8 -0
- package/dist/src/utils/credentials.js +77 -0
- package/index.ts +8 -8
- package/openclaw.plugin.json +1 -1
- package/package.json +8 -7
- package/src/client/lancedb.ts +32 -21
- package/src/commands/cli.ts +105 -13
- package/src/commands/index.ts +45 -42
- package/src/commands/slashcmd.ts +69 -10
- package/src/commands/tools.ts +142 -117
- package/src/lib/actions/ingest.ts +151 -75
- package/src/lib/actions/las-api.ts +119 -0
- package/src/lib/actions/las-tools.ts +196 -0
- package/src/lib/actions/manage.ts +6 -5
- package/src/{skills/las-data-profiler/index.ts → lib/actions/profiler.ts} +21 -4
- package/src/lib/actions/retrieve.ts +16 -34
- package/src/lib/scripts/s3_catalog.py +617 -0
- package/src/processor/loader.ts +12 -4
- package/src/service/embedding/factory.ts +1 -8
- package/src/service/embedding/interface.ts +9 -1
- package/src/service/embedding/remote.ts +133 -13
- package/src/service/metadata/interface.ts +1 -0
- package/src/service/metadata/local.ts +7 -0
- package/src/service/storage/factory.ts +2 -2
- package/src/utils/config.ts +61 -8
- package/src/utils/credentials.ts +50 -0
- package/bin/contextlake-openclaw.js +0 -5
- package/dist/src/skills/las-data-profiler/register.d.ts +0 -1
- package/dist/src/skills/las-data-profiler/register.js +0 -19
- package/src/service/embedding/local.ts +0 -118
- package/src/skills/las-data-profiler/register.ts +0 -19
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// TypeScript-emitted module interop helpers. Each helper reuses an existing
// `this.__<name>` implementation when one is already defined on `this`.
//
// __createBinding(o, m, k, k2): exposes property `k` of `m` on `o` under the name
// `k2` (defaults to `k`). When Object.create exists, the property descriptor of
// `m[k]` is reused unless it is missing, is an accessor on a non-`__esModule`
// object, or is a writable/configurable data property; in those cases an
// enumerable getter returning `m[k]` is installed instead. Without Object.create
// the value is simply copied.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
// __setModuleDefault(o, v): stores `v` as the enumerable "default" property of `o`.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
// __importStar(mod): returns `mod` unchanged when it is flagged `__esModule`;
// otherwise builds a namespace object that re-exposes every own property of
// `mod` except "default" and sets "default" to `mod` itself.
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
// ownKeys replaces itself on first call with Object.getOwnPropertyNames, or
// with a for-in/hasOwnProperty fallback when that function is unavailable.
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.ingestSource = ingestSource;
|
|
37
|
+
const factory_1 = require("../../service/metadata/factory");
|
|
38
|
+
const las_api_1 = require("./las-api");
|
|
39
|
+
const lancedb = __importStar(require("@lancedb/lancedb"));
|
|
40
|
+
const path = __importStar(require("path"));
|
|
41
|
+
const fs = __importStar(require("fs"));
|
|
42
|
+
const os = __importStar(require("os"));
|
|
43
|
+
// @ts-ignore
|
|
44
|
+
const uuid_1 = require("uuid");
|
|
45
|
+
const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
|
|
46
|
+
/**
 * Ingest every file listed in a profiled data source's catalog into the metadata store.
 *
 * The catalog is read from the `file_catalog` table of the LanceDB database at
 * `<BASE_DIR>/<datasource_name>/catalog_db`. Each catalog row is dispatched on
 * `media_type`:
 *   - `pdf`   -> `las_pdf_parse_doubao`, the returned markdown is chunked and embedded
 *   - `image` -> embedded directly with one multimodal embedding call
 *   - `audio` -> `las_asr_pro`, the transcript is chunked and embedded
 *   - `video` -> `las_long_video_understand` + audio extraction + `las_asr_pro`,
 *                the combined text is chunked and embedded
 * Rows whose `category` is `structured` / `non-structured` (and that matched no
 * media type above) are only logged as skipped.
 *
 * Files are processed sequentially; a failure on one file is recorded and does not
 * abort the remaining files.
 *
 * @param {{ datasource_name: string }} params - Name of the profiled data source.
 * @param {object} config - Plugin configuration; `config.metadata_storage` selects the
 *   metadata provider (a local LanceDB default is used when it is absent).
 * @param {object} [logger] - Optional logger exposing `info` / `warn` / `error`.
 * @returns {Promise<Array<object>>} One entry per file that produced documents
 *   (`{ file, status: 'success', chunks }`) or failed (`{ file, status: 'error', message }`).
 *   Files that yield no documents without failing are not reported.
 * @throws {Error} If `datasource_name` is empty or resolves outside the profiler
 *   directory, if the catalog database does not exist, or if it has no
 *   `file_catalog` table.
 */
async function ingestSource(params, config, logger) {
    const callMessage = `[ContextLake-Action] Calling ingestSource with params: ${JSON.stringify(params)}`;
    if (logger) {
        logger.info(callMessage);
    }
    else {
        // eslint-disable-next-line no-console
        console.log(callMessage);
    }
    const datasourceName = params?.datasource_name;
    if (typeof datasourceName !== 'string' || datasourceName.trim() === '') {
        throw new Error('datasource_name must be a non-empty string.');
    }
    // The name becomes part of a filesystem path: refuse anything (e.g. "../..")
    // that would resolve outside the profiler base directory.
    const baseDir = path.resolve(BASE_DIR);
    const dsDir = path.resolve(baseDir, datasourceName);
    if (!dsDir.startsWith(baseDir + path.sep)) {
        throw new Error(`Invalid datasource_name "${datasourceName}": it must resolve to a directory inside ${baseDir}.`);
    }
    const dbPath = path.join(dsDir, 'catalog_db');
    if (!fs.existsSync(dbPath)) {
        throw new Error(`Data source database not found at ${dbPath}. Please run profiler connect first.`);
    }
    const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
    const metadataProvider = (0, factory_1.createMetadataProvider)(metaConfig);
    await metadataProvider.connect();
    // `logger` is optional here, so hand the LAS client a no-op logger rather than
    // `undefined`; the client may log without checking that a logger was supplied.
    const lasLogger = logger ?? { debug() { }, info() { }, warn() { }, error() { } };
    const lasClient = new las_api_1.LasApiClient(config, lasLogger);
    const results = [];
    // Connect to the profiler LanceDB to read the file catalog
    const profilerDb = await lancedb.connect(dbPath);
    const tableNames = await profilerDb.tableNames();
    if (!tableNames.includes('file_catalog')) {
        throw new Error(`table 'file_catalog' not found in ${dbPath}`);
    }
    const catalogTable = await profilerDb.openTable('file_catalog');
    const files = await catalogTable.query().toArray();
    logger?.info(`[ContextLake-Action] Found ${files.length} files in catalog`);
    // Catalog rows may carry an explicit URL; otherwise build a tos:// URL from bucket + key.
    const resolveUrl = (fileInfo) => fileInfo.url || `tos://${fileInfo.bucket}/${fileInfo.key}`;
    // Shared shape of every record handed to metadataProvider.addAssets().
    const datasourceMetadata = JSON.stringify({ datasource: datasourceName });
    const buildDoc = (vector, text, fileInfo, fileType) => ({
        id: (0, uuid_1.v4)(),
        vector,
        text,
        source: fileInfo.key,
        file_type: fileType,
        storage_type: 'source',
        url: resolveUrl(fileInfo),
        metadata: datasourceMetadata,
        created_at: Date.now(),
        binary_data: Buffer.from('')
    });
    // Simple fixed-size character chunking with overlap between consecutive chunks.
    const splitText = (text, chunkSize = 500, overlap = 50) => {
        const chunks = [];
        if (!text)
            return chunks;
        const size = Math.max(1, chunkSize);
        // Always advance by at least one character so overlap >= chunkSize cannot loop forever.
        const step = Math.max(1, size - overlap);
        for (let start = 0; start < text.length; start += step) {
            chunks.push(text.slice(start, start + size));
            // This chunk already reaches the end of the text; a further chunk would
            // only repeat characters that are already covered.
            if (start + size >= text.length)
                break;
        }
        return chunks;
    };
    // Chunk a text and embed each chunk, one embedding request at a time.
    const processText = async (text, fileInfo) => {
        const docs = [];
        for (const chunk of splitText(text)) {
            const vector = await metadataProvider.generateMultimodalEmbedding([{ type: 'text', text: chunk }]);
            docs.push(buildDoc(vector, chunk, fileInfo, fileInfo.category));
        }
        return docs;
    };
    for (const file of files) {
        try {
            logger?.info(`[ContextLake-Action] Processing file: ${file.key}, type: ${file.media_type}`);
            let docs = [];
            const fileUrl = resolveUrl(file);
            if (file.media_type === 'pdf') {
                // PDF Parse
                const result = await lasClient.submitAndPoll('las_pdf_parse_doubao', {
                    url: fileUrl
                });
                const markdown = result.data?.markdown || '';
                docs = await processText(markdown, file);
            }
            else if (file.media_type === 'image') {
                // Multimodal Embedding directly
                const vector = await metadataProvider.generateMultimodalEmbedding([
                    { type: 'image_url', image_url: { url: fileUrl } },
                    { type: 'text', text: 'This is an image from the dataset.' }
                ]);
                docs.push(buildDoc(vector, 'Image from dataset', file, 'image'));
            }
            else if (file.media_type === 'audio') {
                // ASR. Derive the format from the key's extension; keys without an
                // extension fall back to 'wav' instead of passing the whole key as format.
                const audioFormat = path.posix.extname(String(file.key)).slice(1) || 'wav';
                const result = await lasClient.submitAndPoll('las_asr_pro', {
                    audio: { url: fileUrl, format: audioFormat },
                    request: { model_name: 'bigmodel' }
                });
                const text = result.data?.result?.text || '';
                docs = await processText(text, file);
            }
            else if (file.media_type === 'video') {
                // Video understanding -> text -> embedding
                const result = await lasClient.submitAndPoll('las_long_video_understand', {
                    video_url: fileUrl,
                    query: "详细描述这个视频的内容",
                    model_name: "doubao-seed-2-0-lite-260215"
                });
                // The response structure is not pinned down here, so the whole `data`
                // payload is serialized and used as the description text.
                const text = JSON.stringify(result.data || '');
                // Also need audio extract and ASR for video
                // 1. Extract audio
                // The output_path_template needs a unique path per video
                const audioOutputPath = `tos://${file.bucket}/.tmp/audio/${(0, uuid_1.v4)()}.wav`;
                await lasClient.process('las_audio_extract_and_split', {
                    input_path: fileUrl,
                    output_path_template: audioOutputPath,
                    output_format: 'wav'
                });
                // 2. ASR on the extracted audio
                // NOTE(review): audioOutputPath contains no '{index}.{output_file_ext}'
                // placeholder, so this replace() currently leaves the URL unchanged.
                // Presumably the operator expects those placeholders in the template —
                // confirm against the las_audio_extract_and_split documentation.
                const asrResult = await lasClient.submitAndPoll('las_asr_pro', {
                    audio: { url: audioOutputPath.replace('{index}.{output_file_ext}', '0.wav'), format: 'wav' },
                    request: { model_name: 'bigmodel' }
                });
                const audioText = asrResult.data?.result?.text || '';
                // Combine video text and audio text
                const combinedText = `Video Description: ${text}\n\nAudio Transcription: ${audioText}`;
                docs = await processText(combinedText, file);
            }
            else if (file.category === 'structured' || file.category === 'non-structured') {
                // Raw text / structured files are not downloaded from TOS yet; only log the skip.
                logger?.warn(`[ContextLake-Action] Skipping raw text/structured download for ${file.key} - implement TOS download if needed`);
            }
            if (docs.length > 0) {
                await metadataProvider.addAssets(docs);
                results.push({ file: file.key, status: 'success', chunks: docs.length });
            }
        }
        catch (error) {
            // Non-Error throwables have no `.message`; fall back to their string form.
            const message = error instanceof Error ? error.message : String(error);
            logger?.error(`[ContextLake-Action] Error processing ${file.key}: ${message}`);
            results.push({ file: file.key, status: 'error', message });
        }
    }
    return results;
}
|
|
@@ -1,8 +1,15 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
chunkSize?: number;
|
|
5
|
-
overlap?: number;
|
|
1
|
+
import { ContextLakeConfig } from '../../utils/config';
|
|
2
|
+
export interface IngestSourceParams {
|
|
3
|
+
datasource_name: string;
|
|
6
4
|
}
|
|
7
|
-
export declare function
|
|
8
|
-
|
|
5
|
+
export declare function ingestSource(params: IngestSourceParams, config: ContextLakeConfig, logger?: any): Promise<({
|
|
6
|
+
file: any;
|
|
7
|
+
status: string;
|
|
8
|
+
chunks: number;
|
|
9
|
+
message?: undefined;
|
|
10
|
+
} | {
|
|
11
|
+
file: any;
|
|
12
|
+
status: string;
|
|
13
|
+
message: any;
|
|
14
|
+
chunks?: undefined;
|
|
15
|
+
})[]>;
|
|
@@ -33,91 +33,161 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
33
33
|
};
|
|
34
34
|
})();
|
|
35
35
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
-
exports.
|
|
37
|
-
const factory_1 = require("../../service/
|
|
38
|
-
const
|
|
39
|
-
const
|
|
36
|
+
exports.ingestSource = ingestSource;
|
|
37
|
+
const factory_1 = require("../../service/metadata/factory");
|
|
38
|
+
const las_api_1 = require("./las-api");
|
|
39
|
+
const lancedb = __importStar(require("@lancedb/lancedb"));
|
|
40
40
|
const path = __importStar(require("path"));
|
|
41
|
+
const fs = __importStar(require("fs"));
|
|
42
|
+
const os = __importStar(require("os"));
|
|
41
43
|
// @ts-ignore
|
|
42
44
|
const uuid_1 = require("uuid");
|
|
43
|
-
|
|
45
|
+
const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
|
|
46
|
+
async function ingestSource(params, config, logger) {
|
|
44
47
|
if (logger) {
|
|
45
|
-
logger.info(`[ContextLake-Action] Calling
|
|
48
|
+
logger.info(`[ContextLake-Action] Calling ingestSource with params: ${JSON.stringify(params)}`);
|
|
46
49
|
}
|
|
47
50
|
else {
|
|
48
51
|
// eslint-disable-next-line no-console
|
|
49
|
-
console.log(`[ContextLake-Action] Calling
|
|
52
|
+
console.log(`[ContextLake-Action] Calling ingestSource with params: ${JSON.stringify(params)}`);
|
|
53
|
+
}
|
|
54
|
+
const dsDir = path.join(BASE_DIR, params.datasource_name);
|
|
55
|
+
const dbPath = path.join(dsDir, 'catalog_db');
|
|
56
|
+
if (!fs.existsSync(dbPath)) {
|
|
57
|
+
throw new Error(`Data source database not found at ${dbPath}. Please run profiler connect first.`);
|
|
50
58
|
}
|
|
51
|
-
const storageConfig = config.file_storage || { type: 'local', local_base_dir: './data/files' };
|
|
52
59
|
const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
|
|
53
|
-
const
|
|
54
|
-
const metadataProvider = (0, factory_2.createMetadataProvider)(metaConfig);
|
|
60
|
+
const metadataProvider = (0, factory_1.createMetadataProvider)(metaConfig);
|
|
55
61
|
await metadataProvider.connect();
|
|
56
|
-
const
|
|
62
|
+
const lasClient = new las_api_1.LasApiClient(config, logger);
|
|
57
63
|
const results = [];
|
|
58
|
-
|
|
64
|
+
// Connect to the profiler LanceDB to read the file catalog
|
|
65
|
+
const profilerDb = await lancedb.connect(dbPath);
|
|
66
|
+
const tableNames = await profilerDb.tableNames();
|
|
67
|
+
if (!tableNames.includes('file_catalog')) {
|
|
68
|
+
throw new Error(`table 'file_catalog' not found in ${dbPath}`);
|
|
69
|
+
}
|
|
70
|
+
const catalogTable = await profilerDb.openTable('file_catalog');
|
|
71
|
+
const files = await catalogTable.query().toArray();
|
|
72
|
+
logger?.info(`[ContextLake-Action] Found ${files.length} files in catalog`);
|
|
73
|
+
// Simple chunking for text
|
|
74
|
+
const splitText = (text, chunkSize = 500, overlap = 50) => {
|
|
75
|
+
const chunks = [];
|
|
76
|
+
if (!text)
|
|
77
|
+
return chunks;
|
|
78
|
+
let i = 0;
|
|
79
|
+
while (i < text.length) {
|
|
80
|
+
chunks.push(text.slice(i, i + chunkSize));
|
|
81
|
+
i += chunkSize - overlap;
|
|
82
|
+
}
|
|
83
|
+
return chunks;
|
|
84
|
+
};
|
|
85
|
+
const processText = async (text, fileInfo) => {
|
|
86
|
+
const chunks = splitText(text);
|
|
87
|
+
const docs = [];
|
|
88
|
+
for (const chunk of chunks) {
|
|
89
|
+
const vector = await metadataProvider.generateMultimodalEmbedding([{ type: 'text', text: chunk }]);
|
|
90
|
+
docs.push({
|
|
91
|
+
id: (0, uuid_1.v4)(),
|
|
92
|
+
vector,
|
|
93
|
+
text: chunk,
|
|
94
|
+
source: fileInfo.key,
|
|
95
|
+
file_type: fileInfo.category,
|
|
96
|
+
storage_type: 'source',
|
|
97
|
+
url: fileInfo.url || `tos://${fileInfo.bucket}/${fileInfo.key}`,
|
|
98
|
+
metadata: JSON.stringify({ datasource: params.datasource_name }),
|
|
99
|
+
created_at: Date.now(),
|
|
100
|
+
binary_data: Buffer.from('')
|
|
101
|
+
});
|
|
102
|
+
}
|
|
103
|
+
return docs;
|
|
104
|
+
};
|
|
105
|
+
for (const file of files) {
|
|
59
106
|
try {
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
const
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
storageType = 'inline';
|
|
71
|
-
}
|
|
72
|
-
else {
|
|
73
|
-
fileUrl = await storageProvider.uploadFile(fileName, buffer);
|
|
74
|
-
storageType = storageConfig.type;
|
|
107
|
+
logger?.info(`[ContextLake-Action] Processing file: ${file.key}, type: ${file.media_type}`);
|
|
108
|
+
let docs = [];
|
|
109
|
+
const fileUrl = file.url || `tos://${file.bucket}/${file.key}`;
|
|
110
|
+
if (file.media_type === 'pdf') {
|
|
111
|
+
// PDF Parse
|
|
112
|
+
const result = await lasClient.submitAndPoll('las_pdf_parse_doubao', {
|
|
113
|
+
url: fileUrl
|
|
114
|
+
});
|
|
115
|
+
const markdown = result.data?.markdown || '';
|
|
116
|
+
docs = await processText(markdown, file);
|
|
75
117
|
}
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
118
|
+
else if (file.media_type === 'image') {
|
|
119
|
+
// Multimodal Embedding directly
|
|
120
|
+
const vector = await metadataProvider.generateMultimodalEmbedding([
|
|
121
|
+
{ type: 'image_url', image_url: { url: fileUrl } },
|
|
122
|
+
{ type: 'text', text: 'This is an image from the dataset.' }
|
|
123
|
+
]);
|
|
81
124
|
docs.push({
|
|
82
125
|
id: (0, uuid_1.v4)(),
|
|
83
126
|
vector,
|
|
84
|
-
text: '',
|
|
85
|
-
source:
|
|
86
|
-
file_type:
|
|
87
|
-
storage_type:
|
|
127
|
+
text: 'Image from dataset',
|
|
128
|
+
source: file.key,
|
|
129
|
+
file_type: 'image',
|
|
130
|
+
storage_type: 'source',
|
|
88
131
|
url: fileUrl,
|
|
89
|
-
metadata: JSON.stringify(params.
|
|
90
|
-
created_at:
|
|
91
|
-
binary_data:
|
|
132
|
+
metadata: JSON.stringify({ datasource: params.datasource_name }),
|
|
133
|
+
created_at: Date.now(),
|
|
134
|
+
binary_data: Buffer.from('')
|
|
92
135
|
});
|
|
93
136
|
}
|
|
94
|
-
else {
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
137
|
+
else if (file.media_type === 'audio') {
|
|
138
|
+
// ASR
|
|
139
|
+
const result = await lasClient.submitAndPoll('las_asr_pro', {
|
|
140
|
+
audio: { url: fileUrl, format: file.key.split('.').pop() || 'wav' },
|
|
141
|
+
request: { model_name: 'bigmodel' }
|
|
142
|
+
});
|
|
143
|
+
const text = result.data?.result?.text || '';
|
|
144
|
+
docs = await processText(text, file);
|
|
145
|
+
}
|
|
146
|
+
else if (file.media_type === 'video') {
|
|
147
|
+
// Video understanding -> text -> embedding
|
|
148
|
+
const result = await lasClient.submitAndPoll('las_long_video_understand', {
|
|
149
|
+
video_url: fileUrl,
|
|
150
|
+
query: "详细描述这个视频的内容",
|
|
151
|
+
model_name: "doubao-seed-2-0-lite-260215"
|
|
152
|
+
});
|
|
153
|
+
// Assuming video output is a text description somewhere in the response.
|
|
154
|
+
// Note: the exact structure depends on the API return, adjusting to generic text.
|
|
155
|
+
const text = JSON.stringify(result.data || '');
|
|
156
|
+
// Also need audio extract and ASR for video
|
|
157
|
+
// 1. Extract audio
|
|
158
|
+
// The output_path_template needs a unique path per video
|
|
159
|
+
const audioOutputPath = `tos://${file.bucket}/.tmp/audio/${(0, uuid_1.v4)()}.wav`;
|
|
160
|
+
await lasClient.process('las_audio_extract_and_split', {
|
|
161
|
+
input_path: fileUrl,
|
|
162
|
+
output_path_template: audioOutputPath,
|
|
163
|
+
output_format: 'wav'
|
|
164
|
+
});
|
|
165
|
+
// 2. ASR on the extracted audio
|
|
166
|
+
// Wait briefly for object to be available if needed (often synchronous but tos takes a ms)
|
|
167
|
+
const asrResult = await lasClient.submitAndPoll('las_asr_pro', {
|
|
168
|
+
audio: { url: audioOutputPath.replace('{index}.{output_file_ext}', '0.wav'), format: 'wav' },
|
|
169
|
+
request: { model_name: 'bigmodel' }
|
|
170
|
+
});
|
|
171
|
+
const audioText = asrResult.data?.result?.text || '';
|
|
172
|
+
// Combine video text and audio text
|
|
173
|
+
const combinedText = `Video Description: ${text}\n\nAudio Transcription: ${audioText}`;
|
|
174
|
+
docs = await processText(combinedText, file);
|
|
175
|
+
}
|
|
176
|
+
else if (file.category === 'structured' || file.category === 'non-structured') {
|
|
177
|
+
// If we had a direct text content, we could process it here.
|
|
178
|
+
// Assuming basic local download or similar is available, but for now we skip raw file reading from TOS in this demo script unless implemented.
|
|
179
|
+
// Fallback just logs
|
|
180
|
+
logger?.warn(`[ContextLake-Action] Skipping raw text/structured download for ${file.key} - implement TOS download if needed`);
|
|
181
|
+
}
|
|
182
|
+
if (docs.length > 0) {
|
|
183
|
+
await metadataProvider.addAssets(docs);
|
|
184
|
+
results.push({ file: file.key, status: 'success', chunks: docs.length });
|
|
112
185
|
}
|
|
113
|
-
await metadataProvider.addAssets(docs);
|
|
114
|
-
results.push({ file: fileName, status: 'success', chunks: docs.length });
|
|
115
186
|
}
|
|
116
187
|
catch (error) {
|
|
117
|
-
|
|
118
|
-
results.push({ file:
|
|
188
|
+
logger?.error(`[ContextLake-Action] Error processing ${file.key}: ${error.message}`);
|
|
189
|
+
results.push({ file: file.key, status: 'error', message: error.message });
|
|
119
190
|
}
|
|
120
191
|
}
|
|
121
|
-
|
|
122
|
-
return JSON.parse(JSON.stringify(results));
|
|
192
|
+
return results;
|
|
123
193
|
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { ContextLakeConfig } from '../../utils/config';
|
|
2
|
+
/**
 * HTTP client for the LAS operator API. All calls are JSON POST requests
 * authenticated with a Bearer API key.
 */
export declare class LasApiClient {
|
|
3
|
+
private endpoint;
|
|
4
|
+
private apiKey;
|
|
5
|
+
private logger;
|
|
6
|
+
/** Reads the API key and base URL from `config.metadata_storage.embedding`, falling back to the LAS_API_KEY / LAS_BASE_URL environment variables. */
constructor(config: ContextLakeConfig, logger: any);
|
|
7
|
+
private request;
|
|
8
|
+
/** POSTs to `/api/v1/process` and resolves with the parsed JSON response. */
process(operatorId: string, data: any, version?: string): Promise<any>;
|
|
9
|
+
/** POSTs to `/api/v1/submit` and resolves with the parsed JSON response. */
submit(operatorId: string, data: any, version?: string): Promise<any>;
|
|
10
|
+
/** POSTs to `/api/v1/poll` for the given task id and resolves with the parsed JSON response. */
poll(operatorId: string, taskId: string, version?: string): Promise<any>;
|
|
11
|
+
/** Submits a task, then polls it until it reports COMPLETED; rejects on FAILED / TIMEOUT or when `maxRetries` polls are exhausted. */
submitAndPoll(operatorId: string, data: any, version?: string, pollIntervalMs?: number, maxRetries?: number): Promise<any>;
|
|
12
|
+
/** POSTs to `/api/v1/embeddings/multimodal`; optional arguments are only sent when truthy. */
multimodalEmbedding(model: string, input: any[], encodingFormat?: string, dimensions?: number, instructions?: string, sparseEmbedding?: any): Promise<any>;
|
|
13
|
+
}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.LasApiClient = void 0;
|
|
4
|
+
/**
 * Minimal HTTP client for the LAS operator API.
 *
 * Every call is a JSON POST to `<endpoint><path>` authenticated with
 * `Authorization: Bearer <apiKey>`. The logger is optional: when none is supplied
 * (or it lacks a given level) logging is skipped instead of throwing.
 */
class LasApiClient {
    endpoint;
    apiKey;
    logger;
    /**
     * @param {object} config - Plugin configuration. The API key and base URL are read
     *   from `config.metadata_storage.embedding.api_key` / `.api_base`, then from the
     *   `LAS_API_KEY` / `LAS_BASE_URL` environment variables, then from built-in defaults.
     * @param {object} [logger] - Optional logger exposing `debug` / `info`.
     */
    constructor(config, logger) {
        this.logger = logger;
        const embedding = config?.metadata_storage?.embedding;
        this.apiKey = embedding?.api_key || process.env.LAS_API_KEY || '';
        const base = embedding?.api_base || process.env.LAS_BASE_URL || 'https://operator.las.cn-beijing.volces.com';
        // Remove every trailing slash so `${endpoint}${path}` never contains "//api/...".
        this.endpoint = base.replace(/\/+$/, '');
    }
    /**
     * POST `body` as JSON to `path` and return the parsed JSON response.
     *
     * @param {string} path - Path appended to the endpoint, starting with "/".
     * @param {object} body - JSON-serializable request payload.
     * @returns {Promise<any>} Parsed JSON response body.
     * @throws {Error} If no API key is configured or the response status is not OK.
     */
    async request(path, body) {
        if (!this.apiKey) {
            throw new Error('LAS_API_KEY is not configured. Please set it in config or environment variables.');
        }
        const url = `${this.endpoint}${path}`;
        const payload = JSON.stringify(body);
        this.logger?.debug?.(`[LasApiClient] Requesting ${url}`, { body: payload });
        const response = await fetch(url, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'Authorization': `Bearer ${this.apiKey}`
            },
            body: payload
        });
        if (!response.ok) {
            let errorText = '';
            try {
                errorText = await response.text();
            }
            catch (e) {
                // Best effort only: the status line below is still reported when the
                // error body cannot be read.
            }
            throw new Error(`LAS API Error: ${response.status} ${response.statusText} - ${errorText}`);
        }
        return await response.json();
    }
    /**
     * Run an operator through `/api/v1/process`.
     *
     * @param {string} operatorId - Operator identifier.
     * @param {any} data - Operator input.
     * @param {string} [version='v1'] - Operator version.
     * @returns {Promise<any>} Parsed response.
     */
    async process(operatorId, data, version = 'v1') {
        return await this.request('/api/v1/process', {
            operator_id: operatorId,
            operator_version: version,
            data
        });
    }
    /**
     * Submit an operator task through `/api/v1/submit`.
     *
     * @param {string} operatorId - Operator identifier.
     * @param {any} data - Operator input.
     * @param {string} [version='v1'] - Operator version.
     * @returns {Promise<any>} Parsed response; `submitAndPoll` reads `metadata.task_id` from it.
     */
    async submit(operatorId, data, version = 'v1') {
        return await this.request('/api/v1/submit', {
            operator_id: operatorId,
            operator_version: version,
            data
        });
    }
    /**
     * Query a submitted task through `/api/v1/poll`.
     *
     * @param {string} operatorId - Operator identifier.
     * @param {string} taskId - Task id returned by `submit`.
     * @param {string} [version='v1'] - Operator version.
     * @returns {Promise<any>} Parsed response; `submitAndPoll` reads `metadata.task_status` from it.
     */
    async poll(operatorId, taskId, version = 'v1') {
        return await this.request('/api/v1/poll', {
            operator_id: operatorId,
            operator_version: version,
            task_id: taskId
        });
    }
    /**
     * Submit a task and poll it until it finishes.
     *
     * Waits `pollIntervalMs` before each poll and polls at most `maxRetries` times.
     *
     * @param {string} operatorId - Operator identifier.
     * @param {any} data - Operator input.
     * @param {string} [version='v1'] - Operator version.
     * @param {number} [pollIntervalMs=3000] - Delay before each poll, in milliseconds.
     * @param {number} [maxRetries=200] - Maximum number of polls.
     * @returns {Promise<any>} The poll response whose status is `COMPLETED`.
     * @throws {Error} If the submit response has no task id, the task reports
     *   `FAILED` / `TIMEOUT`, or the poll budget is exhausted.
     */
    async submitAndPoll(operatorId, data, version = 'v1', pollIntervalMs = 3000, maxRetries = 200) {
        const submitResult = await this.submit(operatorId, data, version);
        const taskId = submitResult?.metadata?.task_id;
        if (!taskId) {
            throw new Error(`Failed to submit task for ${operatorId}. Response: ${JSON.stringify(submitResult)}`);
        }
        this.logger?.info?.(`[LasApiClient] Task submitted: ${taskId} for ${operatorId}`);
        for (let attempt = 0; attempt < maxRetries; attempt++) {
            await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
            const pollResult = await this.poll(operatorId, taskId, version);
            const status = pollResult?.metadata?.task_status;
            this.logger?.debug?.(`[LasApiClient] Task ${taskId} status: ${status}`);
            if (status === 'COMPLETED') {
                return pollResult;
            }
            if (status === 'FAILED' || status === 'TIMEOUT') {
                const errorMsg = pollResult?.metadata?.error_msg || 'Unknown error';
                throw new Error(`Task ${taskId} failed with status: ${status}. Message: ${errorMsg}`);
            }
            // Any other status is treated as "still running": keep polling.
        }
        throw new Error(`Task ${taskId} timed out after ${maxRetries} polling attempts.`);
    }
    /**
     * Request a multimodal embedding through `/api/v1/embeddings/multimodal`.
     *
     * @param {string} model - Embedding model name.
     * @param {any[]} input - Input items forwarded as-is.
     * @param {string} [encodingFormat='float'] - Sent as `encoding_format`.
     * @param {number} [dimensions] - Sent as `dimensions` when truthy.
     * @param {string} [instructions] - Sent as `instructions` when truthy.
     * @param {any} [sparseEmbedding] - Sent as `sparse_embedding` when truthy.
     * @returns {Promise<any>} Parsed response.
     */
    async multimodalEmbedding(model, input, encodingFormat = 'float', dimensions, instructions, sparseEmbedding) {
        const body = {
            model,
            input,
            encoding_format: encodingFormat
        };
        if (dimensions)
            body.dimensions = dimensions;
        if (instructions)
            body.instructions = instructions;
        if (sparseEmbedding)
            body.sparse_embedding = sparseEmbedding;
        return await this.request('/api/v1/embeddings/multimodal', body);
    }
}
|
|
105
|
+
exports.LasApiClient = LasApiClient;
|