@byted-las/contextlake-openclaw 1.0.0 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +2 -1
- package/dist/index.js +5 -5
- package/dist/src/client/lancedb.js +13 -4
- package/dist/src/commands/cli.d.ts +5 -2
- package/dist/src/commands/cli.js +94 -10
- package/dist/src/commands/index.d.ts +2 -1
- package/dist/src/commands/index.js +31 -35
- package/dist/src/commands/slashcmd.d.ts +8 -1
- package/dist/src/commands/slashcmd.js +90 -6
- package/dist/src/commands/tools.d.ts +10 -218
- package/dist/src/commands/tools.js +109 -104
- package/dist/src/lib/actions/ingest-source.d.ts +15 -0
- package/dist/src/lib/actions/ingest-source.js +193 -0
- package/dist/src/lib/actions/ingest.d.ts +14 -7
- package/dist/src/lib/actions/ingest.js +133 -63
- package/dist/src/lib/actions/las-api.d.ts +13 -0
- package/dist/src/lib/actions/las-api.js +105 -0
- package/dist/src/lib/actions/las-tools.d.ts +3 -0
- package/dist/src/lib/actions/las-tools.js +194 -0
- package/dist/src/lib/actions/las.d.ts +64 -0
- package/dist/src/lib/actions/las.js +72 -0
- package/dist/src/lib/actions/manage.d.ts +3 -2
- package/dist/src/{skills/las-data-profiler/index.d.ts → lib/actions/profiler.d.ts} +4 -2
- package/dist/src/{skills/las-data-profiler/index.js → lib/actions/profiler.js} +19 -3
- package/dist/src/lib/actions/retrieve.d.ts +2 -1
- package/dist/src/lib/actions/retrieve.js +2 -18
- package/{src/skills/las-data-profiler → dist/src/lib/scripts}/s3_catalog.py +10 -1
- package/dist/src/processor/loader.js +9 -2
- package/dist/src/service/embedding/factory.js +1 -10
- package/dist/src/service/embedding/interface.d.ts +8 -1
- package/dist/src/service/embedding/local.js +16 -13
- package/dist/src/service/embedding/remote.d.ts +7 -0
- package/dist/src/service/embedding/remote.js +108 -7
- package/dist/src/service/metadata/interface.d.ts +1 -0
- package/dist/src/service/metadata/local.d.ts +1 -0
- package/dist/src/service/metadata/local.js +6 -0
- package/dist/src/skills/SKILL.md +174 -0
- package/dist/src/skills/contextlake-delete/SKILL.md +36 -0
- package/dist/src/skills/contextlake-ingest/SKILL.md +40 -0
- package/dist/src/skills/contextlake-list/SKILL.md +22 -0
- package/dist/src/skills/contextlake-retrieve/SKILL.md +37 -0
- package/dist/src/skills/las-data-profiler/SKILL.md +174 -0
- package/dist/src/utils/config.d.ts +34 -1
- package/dist/src/utils/config.js +16 -3
- package/dist/src/utils/credentials.d.ts +8 -0
- package/dist/src/utils/credentials.js +77 -0
- package/index.ts +8 -8
- package/openclaw.plugin.json +1 -1
- package/package.json +8 -7
- package/src/client/lancedb.ts +32 -21
- package/src/commands/cli.ts +105 -13
- package/src/commands/index.ts +45 -42
- package/src/commands/slashcmd.ts +69 -10
- package/src/commands/tools.ts +142 -117
- package/src/lib/actions/ingest.ts +151 -75
- package/src/lib/actions/las-api.ts +119 -0
- package/src/lib/actions/las-tools.ts +196 -0
- package/src/lib/actions/manage.ts +6 -5
- package/src/{skills/las-data-profiler/index.ts → lib/actions/profiler.ts} +21 -4
- package/src/lib/actions/retrieve.ts +16 -34
- package/src/lib/scripts/s3_catalog.py +617 -0
- package/src/processor/loader.ts +12 -4
- package/src/service/embedding/factory.ts +1 -8
- package/src/service/embedding/interface.ts +9 -1
- package/src/service/embedding/remote.ts +133 -13
- package/src/service/metadata/interface.ts +1 -0
- package/src/service/metadata/local.ts +7 -0
- package/src/service/storage/factory.ts +2 -2
- package/src/utils/config.ts +61 -8
- package/src/utils/credentials.ts +50 -0
- package/bin/contextlake-openclaw.js +0 -5
- package/dist/src/skills/las-data-profiler/register.d.ts +0 -1
- package/dist/src/skills/las-data-profiler/register.js +0 -19
- package/src/service/embedding/local.ts +0 -118
- package/src/skills/las-data-profiler/register.ts +0 -19
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.getLasTools = getLasTools;
|
|
4
|
+
const las_api_1 = require("./las-api");
|
|
5
|
+
function getLasTools(pluginConfig, logger) {
|
|
6
|
+
const apiClient = new las_api_1.LasApiClient(pluginConfig, logger);
|
|
7
|
+
const callApi = async (method, args) => {
|
|
8
|
+
try {
|
|
9
|
+
// @ts-ignore
|
|
10
|
+
return await apiClient[method](...args);
|
|
11
|
+
}
|
|
12
|
+
catch (error) {
|
|
13
|
+
logger.error(`[LasTools] API ${method} failed`, { error: error.message });
|
|
14
|
+
return { error: error.message };
|
|
15
|
+
}
|
|
16
|
+
};
|
|
17
|
+
return [
|
|
18
|
+
{
|
|
19
|
+
name: 'las_image_resample',
|
|
20
|
+
label: 'LAS Image Resample',
|
|
21
|
+
description: `Resample/Resize an image and save it to TOS.
|
|
22
|
+
Parameters in data:
|
|
23
|
+
- image_src_type (string, default: "image_url"): "image_url" or "image_tos"
|
|
24
|
+
- image (string, required): URL or tos:// path
|
|
25
|
+
- tos_dir (string, required): tos:// output directory
|
|
26
|
+
- image_suffix (string): ".jpg" or ".png"
|
|
27
|
+
- target_size (array of integers): e.g. [1024, 1024]
|
|
28
|
+
- target_dpi (array of integers): e.g. [72, 72]
|
|
29
|
+
- method (string): "nearest", "bilinear", "bicubic", "lanczos"`,
|
|
30
|
+
parameters: {
|
|
31
|
+
type: 'object',
|
|
32
|
+
properties: { data: { type: 'object', additionalProperties: true } },
|
|
33
|
+
required: ['data']
|
|
34
|
+
},
|
|
35
|
+
async execute(toolCallId, params) {
|
|
36
|
+
return await callApi('process', ['las_image_resample', params.data]);
|
|
37
|
+
}
|
|
38
|
+
},
|
|
39
|
+
{
|
|
40
|
+
name: 'las_audio_extract_and_split',
|
|
41
|
+
label: 'LAS Audio Extract and Split',
|
|
42
|
+
description: `Extract audio from video and split it into chunks.
|
|
43
|
+
Parameters in data:
|
|
44
|
+
- input_path (string, required): tos:// video path
|
|
45
|
+
- output_path_template (string, required): e.g. tos://bucket/{index}.{output_file_ext}
|
|
46
|
+
- split_duration (number): duration in seconds, default 30.0
|
|
47
|
+
- output_format (string): "wav", "mp3", "flac"
|
|
48
|
+
- timeout (integer)
|
|
49
|
+
- extra_params (array of string): ffmpeg params`,
|
|
50
|
+
parameters: {
|
|
51
|
+
type: 'object',
|
|
52
|
+
properties: { data: { type: 'object', additionalProperties: true } },
|
|
53
|
+
required: ['data']
|
|
54
|
+
},
|
|
55
|
+
async execute(toolCallId, params) {
|
|
56
|
+
return await callApi('process', ['las_audio_extract_and_split', params.data]);
|
|
57
|
+
}
|
|
58
|
+
},
|
|
59
|
+
{
|
|
60
|
+
name: 'las_audio_convert',
|
|
61
|
+
label: 'LAS Audio Convert',
|
|
62
|
+
description: `Convert audio format.
|
|
63
|
+
Parameters in data:
|
|
64
|
+
- input_path (string, required): tos:// audio path
|
|
65
|
+
- output_path (string, required): tos:// output path
|
|
66
|
+
- output_format (string): "wav", "mp3", "flac"
|
|
67
|
+
- extra_params (array of string): ffmpeg params`,
|
|
68
|
+
parameters: {
|
|
69
|
+
type: 'object',
|
|
70
|
+
properties: { data: { type: 'object', additionalProperties: true } },
|
|
71
|
+
required: ['data']
|
|
72
|
+
},
|
|
73
|
+
async execute(toolCallId, params) {
|
|
74
|
+
return await callApi('process', ['las_audio_convert', params.data]);
|
|
75
|
+
}
|
|
76
|
+
},
|
|
77
|
+
{
|
|
78
|
+
name: 'las_asr_pro',
|
|
79
|
+
label: 'LAS ASR Pro (Speech Recognition)',
|
|
80
|
+
description: `Perform automatic speech recognition (ASR).
|
|
81
|
+
Parameters in data:
|
|
82
|
+
- resource: "bigasr" or "seedasr"
|
|
83
|
+
- audio (object, required): { url: string, language: string, format: string }
|
|
84
|
+
- request (object, required): { model_name: "bigmodel", ... }
|
|
85
|
+
- user (object): { uid: string }`,
|
|
86
|
+
parameters: {
|
|
87
|
+
type: 'object',
|
|
88
|
+
properties: { data: { type: 'object', additionalProperties: true } },
|
|
89
|
+
required: ['data']
|
|
90
|
+
},
|
|
91
|
+
async execute(toolCallId, params) {
|
|
92
|
+
return await callApi('submitAndPoll', ['las_asr_pro', params.data]);
|
|
93
|
+
}
|
|
94
|
+
},
|
|
95
|
+
{
|
|
96
|
+
name: 'las_seed_2_0',
|
|
97
|
+
label: 'LAS Seed 2.0 (Audio)',
|
|
98
|
+
description: `ASR with Seed 2.0.
|
|
99
|
+
Parameters in data:
|
|
100
|
+
- audio (object, required): { url, format, language }
|
|
101
|
+
- request (object, required): { model_name: "seedasr", ... }`,
|
|
102
|
+
parameters: {
|
|
103
|
+
type: 'object',
|
|
104
|
+
properties: { data: { type: 'object', additionalProperties: true } },
|
|
105
|
+
required: ['data']
|
|
106
|
+
},
|
|
107
|
+
async execute(toolCallId, params) {
|
|
108
|
+
return await callApi('submitAndPoll', ['las_seed_2_0', params.data]);
|
|
109
|
+
}
|
|
110
|
+
},
|
|
111
|
+
{
|
|
112
|
+
name: 'las_bare_image_text_embedding',
|
|
113
|
+
label: 'LAS Multimodal Embedding',
|
|
114
|
+
description: `Multimodal Embedding (image and text).
|
|
115
|
+
Parameters:
|
|
116
|
+
- model (string, required): "doubao-embedding-vision-250615"
|
|
117
|
+
- input (array of objects, required): [ { type: "image_url", image_url: { url: "..." } }, { type: "text", text: "..." } ]
|
|
118
|
+
- encoding_format (string): "float", "base64"
|
|
119
|
+
- dimensions (integer): 1024 or 2048`,
|
|
120
|
+
parameters: {
|
|
121
|
+
type: 'object',
|
|
122
|
+
properties: {
|
|
123
|
+
model: { type: 'string', default: 'doubao-embedding-vision-250615' },
|
|
124
|
+
input: { type: 'array', items: { type: 'object' } },
|
|
125
|
+
encoding_format: { type: 'string', default: 'float' },
|
|
126
|
+
dimensions: { type: 'integer' }
|
|
127
|
+
},
|
|
128
|
+
required: ['model', 'input']
|
|
129
|
+
},
|
|
130
|
+
async execute(toolCallId, params) {
|
|
131
|
+
return await callApi('multimodalEmbedding', [
|
|
132
|
+
params.model,
|
|
133
|
+
params.input,
|
|
134
|
+
params.encoding_format,
|
|
135
|
+
params.dimensions
|
|
136
|
+
]);
|
|
137
|
+
}
|
|
138
|
+
},
|
|
139
|
+
{
|
|
140
|
+
name: 'las_long_video_understand',
|
|
141
|
+
label: 'LAS Long Video Understand',
|
|
142
|
+
description: `Long Video Understanding.
|
|
143
|
+
Parameters in data:
|
|
144
|
+
- video_url (string, required)
|
|
145
|
+
- query (string, required)
|
|
146
|
+
- model_name (string): default "doubao-seed-2-0-lite-260215"
|
|
147
|
+
- ...other params like fps, target_tokens_per_clip`,
|
|
148
|
+
parameters: {
|
|
149
|
+
type: 'object',
|
|
150
|
+
properties: { data: { type: 'object', additionalProperties: true } },
|
|
151
|
+
required: ['data']
|
|
152
|
+
},
|
|
153
|
+
async execute(toolCallId, params) {
|
|
154
|
+
return await callApi('submitAndPoll', ['las_long_video_understand', params.data]);
|
|
155
|
+
}
|
|
156
|
+
},
|
|
157
|
+
{
|
|
158
|
+
name: 'las_pdf_parse_doubao',
|
|
159
|
+
label: 'LAS PDF Parse Doubao',
|
|
160
|
+
description: `Parse PDF documents to Markdown.
|
|
161
|
+
Parameters in data:
|
|
162
|
+
- url (string, required): PDF URL or tos://
|
|
163
|
+
- start_page (integer): default 1
|
|
164
|
+
- num_pages (integer): default to end
|
|
165
|
+
- parse_mode (string): "normal" or "detail"`,
|
|
166
|
+
parameters: {
|
|
167
|
+
type: 'object',
|
|
168
|
+
properties: { data: { type: 'object', additionalProperties: true } },
|
|
169
|
+
required: ['data']
|
|
170
|
+
},
|
|
171
|
+
async execute(toolCallId, params) {
|
|
172
|
+
return await callApi('submitAndPoll', ['las_pdf_parse_doubao', params.data]);
|
|
173
|
+
}
|
|
174
|
+
},
|
|
175
|
+
{
|
|
176
|
+
name: 'las_video_resize',
|
|
177
|
+
label: 'LAS Video Resize',
|
|
178
|
+
description: `Resize video.
|
|
179
|
+
Parameters in data:
|
|
180
|
+
- video_url (string, required): URL or tos://
|
|
181
|
+
- target_width (integer)
|
|
182
|
+
- target_height (integer)
|
|
183
|
+
- output_dir (string, required): tos://`,
|
|
184
|
+
parameters: {
|
|
185
|
+
type: 'object',
|
|
186
|
+
properties: { data: { type: 'object', additionalProperties: true } },
|
|
187
|
+
required: ['data']
|
|
188
|
+
},
|
|
189
|
+
async execute(toolCallId, params) {
|
|
190
|
+
return await callApi('submitAndPoll', ['las_video_resize', params.data]);
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
];
|
|
194
|
+
}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { ContextLakeConfig } from '../../utils/config';
|
|
2
|
+
/**
 * Submit a `las_pdf_parse_doubao` operator request.
 *
 * The compiled implementation posts `params` as the operator's `data`
 * payload to `/api/v1/submit` and resolves with the parsed JSON response.
 *
 * @param params - Operator input; `url` is the only required field.
 * @param config - Optional plugin configuration used to resolve the LAS endpoint and API key.
 */
export declare function lasPdfParseDoubao(
    params: {
        url: string;
        start_page?: number;
        num_pages?: number;
        parse_mode?: string;
    },
    config?: ContextLakeConfig
): Promise<any>;
|
|
8
|
+
/**
 * Submit a `las_long_video_understand` operator request.
 *
 * The compiled implementation posts `params` as the operator's `data`
 * payload to `/api/v1/submit` and resolves with the parsed JSON response.
 *
 * @param params - Operator input; `video_url` and `prompt` are required.
 * @param config - Optional plugin configuration used to resolve the LAS endpoint and API key.
 */
export declare function lasLongVideoUnderstand(
    params: {
        video_url: string;
        prompt: string;
        system_prompt?: string;
        return_chunk_text?: boolean;
        max_tokens?: number;
        temperature?: number;
        top_p?: number;
    },
    config?: ContextLakeConfig
): Promise<any>;
|
|
17
|
+
/**
 * Request a multimodal (image and/or text) embedding.
 *
 * The compiled implementation posts `input` and `encoding_format` to
 * `/api/v1/embeddings/multimodal` with the model fixed to
 * `doubao-embedding-vision`, and resolves with the parsed JSON response.
 *
 * @param params - `input` is the list of items to embed; `encoding_format` is optional.
 * @param config - Optional plugin configuration used to resolve the LAS endpoint and API key.
 */
export declare function lasBareImageTextEmbedding(
    params: {
        input: Array<{
            type: string;
            text?: string;
            image_url?: string;
        }>;
        encoding_format?: string;
    },
    config?: ContextLakeConfig
): Promise<any>;
|
|
25
|
+
/**
 * Send a chat-completions request.
 *
 * The compiled implementation posts `params` unchanged as the request body
 * to `/api/v1/chat/completions` and resolves with the parsed JSON response.
 *
 * @param params - Request body; `model` and `messages` are required.
 * @param config - Optional plugin configuration used to resolve the LAS endpoint and API key.
 */
export declare function lasSeed20(
    params: {
        model: string;
        messages: Array<any>;
        stream?: boolean;
        max_tokens?: number;
        temperature?: number;
        top_p?: number;
        frequency_penalty?: number;
        presence_penalty?: number;
        tools?: Array<any>;
        tool_choice?: any;
        user?: string;
        logprobs?: boolean;
        top_logprobs?: number;
    },
    config?: ContextLakeConfig
): Promise<any>;
|
|
40
|
+
/**
 * Submit a `las_asr_pro` (speech recognition) operator request.
 *
 * The compiled implementation posts `params` as the operator's `data`
 * payload to `/api/v1/submit` and resolves with the parsed JSON response.
 *
 * @param params - Operator input; every field is optional at the type level.
 * @param config - Optional plugin configuration used to resolve the LAS endpoint and API key.
 */
export declare function lasAsrPro(
    params: {
        url?: string;
        format?: string;
        language?: string;
        resource?: string;
        use_itn?: boolean;
        use_sn?: boolean;
        enable_alignment?: boolean;
        channel_id?: number;
        use_word_info?: boolean;
        text_format?: number;
        enable_semantic_sentence_detection?: boolean;
        boost_words?: Array<{
            word: string;
            weight: number;
        }>;
    },
    config?: ContextLakeConfig
): Promise<any>;
|
|
57
|
+
/**
 * Run the `las_audio_extract_and_split` operator.
 *
 * The compiled implementation posts `params` as the operator's `data`
 * payload to `/api/v1/process` and resolves with the parsed JSON response.
 *
 * @param params - Operator input; `input_path` and `output_path_template` are required.
 * @param config - Optional plugin configuration used to resolve the LAS endpoint and API key.
 */
export declare function lasAudioExtractAndSplit(
    params: {
        input_path: string;
        output_path_template: string;
        split_duration?: number;
        output_format?: string;
        timeout?: number;
        extra_params?: string[];
    },
    config?: ContextLakeConfig
): Promise<any>;
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.lasPdfParseDoubao = lasPdfParseDoubao;
|
|
4
|
+
exports.lasLongVideoUnderstand = lasLongVideoUnderstand;
|
|
5
|
+
exports.lasBareImageTextEmbedding = lasBareImageTextEmbedding;
|
|
6
|
+
exports.lasSeed20 = lasSeed20;
|
|
7
|
+
exports.lasAsrPro = lasAsrPro;
|
|
8
|
+
exports.lasAudioExtractAndSplit = lasAudioExtractAndSplit;
|
|
9
|
+
function getLASConfig(config) {
|
|
10
|
+
// Attempt to get from env vars or config
|
|
11
|
+
const endpoint = process.env.LAS_ENDPOINT || (config?.las)?.endpoint;
|
|
12
|
+
const apiKey = process.env.LAS_API_KEY || (config?.las)?.api_key;
|
|
13
|
+
if (!endpoint || !apiKey) {
|
|
14
|
+
throw new Error("LAS_ENDPOINT and LAS_API_KEY must be set in environment variables or config");
|
|
15
|
+
}
|
|
16
|
+
return { endpoint, apiKey };
|
|
17
|
+
}
|
|
18
|
+
async function lasFetch(path, payload, config) {
|
|
19
|
+
const { endpoint, apiKey } = getLASConfig(config);
|
|
20
|
+
const url = `${endpoint.replace(/\/$/, '')}${path}`;
|
|
21
|
+
const response = await fetch(url, {
|
|
22
|
+
method: 'POST',
|
|
23
|
+
headers: {
|
|
24
|
+
'Content-Type': 'application/json',
|
|
25
|
+
'Authorization': `Bearer ${apiKey}`
|
|
26
|
+
},
|
|
27
|
+
body: JSON.stringify(payload)
|
|
28
|
+
});
|
|
29
|
+
if (!response.ok) {
|
|
30
|
+
let errorText = await response.text().catch(() => '');
|
|
31
|
+
throw new Error(`LAS API Error: ${response.status} ${response.statusText} - ${errorText}`);
|
|
32
|
+
}
|
|
33
|
+
return await response.json();
|
|
34
|
+
}
|
|
35
|
+
/**
 * Post a `las_pdf_parse_doubao` (v1) operator request to `/api/v1/submit`.
 *
 * @param params - Passed through unchanged as the operator's `data`.
 * @param config - Forwarded to `lasFetch` for endpoint/key resolution.
 * @returns Whatever `lasFetch` resolves with.
 */
async function lasPdfParseDoubao(params, config) {
    const submission = {
        operator_id: 'las_pdf_parse_doubao',
        operator_version: 'v1',
        data: params
    };
    return lasFetch('/api/v1/submit', submission, config);
}
|
|
42
|
+
/**
 * Post a `las_long_video_understand` (v1) operator request to `/api/v1/submit`.
 *
 * @param params - Passed through unchanged as the operator's `data`.
 * @param config - Forwarded to `lasFetch` for endpoint/key resolution.
 * @returns Whatever `lasFetch` resolves with.
 */
async function lasLongVideoUnderstand(params, config) {
    const submission = {
        operator_id: 'las_long_video_understand',
        operator_version: 'v1',
        data: params
    };
    return lasFetch('/api/v1/submit', submission, config);
}
|
|
49
|
+
/**
 * Post a multimodal embedding request to `/api/v1/embeddings/multimodal`.
 *
 * The model name is fixed to `doubao-embedding-vision`; only `input` and
 * `encoding_format` are taken from `params`.
 *
 * @param params - Source of `input` and `encoding_format`.
 * @param config - Forwarded to `lasFetch` for endpoint/key resolution.
 * @returns Whatever `lasFetch` resolves with.
 */
async function lasBareImageTextEmbedding(params, config) {
    const { input, encoding_format } = params;
    const body = {
        model: 'doubao-embedding-vision',
        input,
        encoding_format
    };
    return lasFetch('/api/v1/embeddings/multimodal', body, config);
}
|
|
56
|
+
/**
 * Post `params` unchanged to `/api/v1/chat/completions`.
 *
 * @param params - Request body sent as-is.
 * @param config - Forwarded to `lasFetch` for endpoint/key resolution.
 * @returns Whatever `lasFetch` resolves with.
 */
async function lasSeed20(params, config) {
    const chatCompletionsPath = '/api/v1/chat/completions';
    return lasFetch(chatCompletionsPath, params, config);
}
|
|
59
|
+
/**
 * Post a `las_asr_pro` (v1) operator request to `/api/v1/submit`.
 *
 * @param params - Passed through unchanged as the operator's `data`.
 * @param config - Forwarded to `lasFetch` for endpoint/key resolution.
 * @returns Whatever `lasFetch` resolves with.
 */
async function lasAsrPro(params, config) {
    const submission = {
        operator_id: 'las_asr_pro',
        operator_version: 'v1',
        data: params
    };
    return lasFetch('/api/v1/submit', submission, config);
}
|
|
66
|
+
/**
 * Post a `las_audio_extract_and_split` (v1) operator request to `/api/v1/process`.
 *
 * @param params - Passed through unchanged as the operator's `data`.
 * @param config - Forwarded to `lasFetch` for endpoint/key resolution.
 * @returns Whatever `lasFetch` resolves with.
 */
async function lasAudioExtractAndSplit(params, config) {
    const request = {
        operator_id: 'las_audio_extract_and_split',
        operator_version: 'v1',
        data: params
    };
    return lasFetch('/api/v1/process', request, config);
}
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { ContextLakeConfig } from '../../utils/config';
|
|
1
2
|
interface ListParams {
|
|
2
3
|
limit?: number;
|
|
3
4
|
}
|
|
@@ -5,8 +6,8 @@ interface DeleteParams {
|
|
|
5
6
|
file_ids?: string[];
|
|
6
7
|
filter?: string;
|
|
7
8
|
}
|
|
8
|
-
export declare function listAssets(params: ListParams, config:
|
|
9
|
-
export declare function deleteAssets(params: DeleteParams, config:
|
|
9
|
+
export declare function listAssets(params: ListParams, config: ContextLakeConfig, logger?: any): Promise<any>;
|
|
10
|
+
export declare function deleteAssets(params: DeleteParams, config: ContextLakeConfig, logger?: any): Promise<{
|
|
10
11
|
status: string;
|
|
11
12
|
message: string;
|
|
12
13
|
deleted_count: number;
|
|
@@ -9,7 +9,7 @@ export interface ConnectParams {
|
|
|
9
9
|
prefix: string;
|
|
10
10
|
sample_rows?: number;
|
|
11
11
|
}
|
|
12
|
-
interface ConnectResult {
|
|
12
|
+
export interface ConnectResult {
|
|
13
13
|
status: 'success' | 'error';
|
|
14
14
|
datasource_name: string;
|
|
15
15
|
db_path: string;
|
|
@@ -23,4 +23,6 @@ interface ConnectResult {
|
|
|
23
23
|
error?: string;
|
|
24
24
|
}
|
|
25
25
|
export declare function connectDataSource(params: ConnectParams, _ctx?: any): Promise<ConnectResult>;
|
|
26
|
-
export {
|
|
26
|
+
export declare function listDataSources(_ctx?: any): Promise<{
|
|
27
|
+
datasources: string[];
|
|
28
|
+
}>;
|
|
@@ -34,6 +34,7 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
34
34
|
})();
|
|
35
35
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
36
|
exports.connectDataSource = connectDataSource;
|
|
37
|
+
exports.listDataSources = listDataSources;
|
|
37
38
|
const path = __importStar(require("path"));
|
|
38
39
|
const fs = __importStar(require("fs"));
|
|
39
40
|
const os = __importStar(require("os"));
|
|
@@ -41,7 +42,7 @@ const child_process_1 = require("child_process");
|
|
|
41
42
|
// ---------------------------------------------------------------------------
|
|
42
43
|
// Constants
|
|
43
44
|
// ---------------------------------------------------------------------------
|
|
44
|
-
const BASE_DIR = path.join(os.homedir(), '.openclaw', '
|
|
45
|
+
const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
|
|
45
46
|
const PYTHON_DEPS = ['boto3', 'lancedb', 'pyarrow', 'pandas', 'Pillow', 'mutagen', 'pymupdf'];
|
|
46
47
|
// ---------------------------------------------------------------------------
|
|
47
48
|
// Helpers
|
|
@@ -109,8 +110,8 @@ function ensurePythonDeps() {
|
|
|
109
110
|
* Get the path to the bundled Python script.
|
|
110
111
|
*/
|
|
111
112
|
function getScriptPath() {
|
|
112
|
-
// The Python script is
|
|
113
|
-
return path.join(__dirname, 's3_catalog.py');
|
|
113
|
+
// The Python script is located in the scripts directory
|
|
114
|
+
return path.join(__dirname, '../scripts', 's3_catalog.py');
|
|
114
115
|
}
|
|
115
116
|
// ---------------------------------------------------------------------------
|
|
116
117
|
// Main Entry
|
|
@@ -229,3 +230,18 @@ async function connectDataSource(params, _ctx) {
|
|
|
229
230
|
});
|
|
230
231
|
});
|
|
231
232
|
}
|
|
233
|
+
async function listDataSources(_ctx) {
|
|
234
|
+
try {
|
|
235
|
+
if (!fs.existsSync(BASE_DIR)) {
|
|
236
|
+
return { datasources: [] };
|
|
237
|
+
}
|
|
238
|
+
const entries = fs.readdirSync(BASE_DIR, { withFileTypes: true });
|
|
239
|
+
const datasources = entries
|
|
240
|
+
.filter(entry => entry.isDirectory())
|
|
241
|
+
.map(entry => entry.name);
|
|
242
|
+
return { datasources };
|
|
243
|
+
}
|
|
244
|
+
catch (error) {
|
|
245
|
+
throw new Error(`Failed to list data sources: ${error.message}`);
|
|
246
|
+
}
|
|
247
|
+
}
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
+
import { ContextLakeConfig } from '../../utils/config';
|
|
1
2
|
interface RetrieveParams {
|
|
2
3
|
query: string;
|
|
3
4
|
top_k?: number;
|
|
4
5
|
filter?: string;
|
|
5
6
|
include_binary?: boolean;
|
|
6
7
|
}
|
|
7
|
-
export declare function retrieveAssets(params: RetrieveParams, config:
|
|
8
|
+
export declare function retrieveAssets(params: RetrieveParams, config: ContextLakeConfig, logger?: any): Promise<any[]>;
|
|
8
9
|
export {};
|
|
@@ -19,17 +19,11 @@ async function retrieveAssets(params, config, logger) {
|
|
|
19
19
|
}
|
|
20
20
|
await metadataProvider.connect();
|
|
21
21
|
const results = await metadataProvider.search(params.query, params.top_k || 5, params.filter);
|
|
22
|
-
// Post-process to handle binary data retrieval if requested
|
|
23
22
|
if (params.include_binary) {
|
|
24
23
|
const enrichedResults = await Promise.all(results.map(async (doc) => {
|
|
25
|
-
|
|
26
|
-
const { binary_data, ...rest } = doc;
|
|
27
|
-
if (rest.vector) {
|
|
28
|
-
rest.vector = Array.from(rest.vector);
|
|
29
|
-
}
|
|
24
|
+
const { binary_data, vector, ...rest } = doc;
|
|
30
25
|
try {
|
|
31
26
|
if (rest.storage_type === 'inline') {
|
|
32
|
-
// Return inline binary data (Base64)
|
|
33
27
|
return {
|
|
34
28
|
...rest,
|
|
35
29
|
binary_content: binary_data ? binary_data.toString('base64') : null
|
|
@@ -37,7 +31,6 @@ async function retrieveAssets(params, config, logger) {
|
|
|
37
31
|
}
|
|
38
32
|
else if (rest.storage_type === 'local' || rest.storage_type === 'tos') {
|
|
39
33
|
if (storageProvider) {
|
|
40
|
-
// Download from storage provider
|
|
41
34
|
const buffer = await storageProvider.downloadFile(rest.url);
|
|
42
35
|
return {
|
|
43
36
|
...rest,
|
|
@@ -56,17 +49,8 @@ async function retrieveAssets(params, config, logger) {
|
|
|
56
49
|
}));
|
|
57
50
|
return enrichedResults;
|
|
58
51
|
}
|
|
59
|
-
// Remove binary_data from default response to keep it light and serializable
|
|
60
52
|
return results.map((doc) => {
|
|
61
|
-
const { binary_data, ...rest } = doc;
|
|
62
|
-
// Ensure all fields are standard serializable types (remove complex objects/buffers)
|
|
63
|
-
if (rest.vector) {
|
|
64
|
-
// Keep vector as standard array of numbers if it exists, but some clients might choke on large arrays.
|
|
65
|
-
// Usually vector is fine, but LanceDB might return custom objects like Float32Array which fail structuredClone in postMessage
|
|
66
|
-
rest.vector = Array.from(rest.vector);
|
|
67
|
-
}
|
|
68
|
-
// LanceDB records might be Proxy objects or contain non-clonable getters.
|
|
69
|
-
// We stringify and parse to get a clean, plain Javascript Object.
|
|
53
|
+
const { binary_data, vector, ...rest } = doc;
|
|
70
54
|
const safeDoc = JSON.parse(JSON.stringify(rest));
|
|
71
55
|
return safeDoc;
|
|
72
56
|
});
|
|
@@ -523,10 +523,19 @@ def main():
|
|
|
523
523
|
parser.add_argument('--region', default='')
|
|
524
524
|
parser.add_argument('--bucket', required=True)
|
|
525
525
|
parser.add_argument('--prefix', required=True)
|
|
526
|
-
parser.add_argument('--db-path', default='
|
|
526
|
+
parser.add_argument('--db-path', default=None, help='Path to LanceDB database. Defaults to ~/.openclaw/las-data-profiler/{datasource_name}/catalog_db if datasource_name is provided.')
|
|
527
|
+
parser.add_argument('--datasource-name', default='', help='Name of the datasource. Used to determine default db-path if not explicitly provided.')
|
|
527
528
|
parser.add_argument('--sample-rows', type=int, default=100)
|
|
528
529
|
args = parser.parse_args()
|
|
529
530
|
|
|
531
|
+
if not args.db_path:
|
|
532
|
+
if args.datasource_name:
|
|
533
|
+
import os
|
|
534
|
+
home_dir = os.path.expanduser('~')
|
|
535
|
+
args.db_path = os.path.join(home_dir, '.openclaw', 'las-data-profiler', args.datasource_name, 'catalog_db')
|
|
536
|
+
else:
|
|
537
|
+
args.db_path = './catalog_db'
|
|
538
|
+
|
|
530
539
|
print(f"[las-data-profiler] vendor={args.vendor}, bucket={args.bucket}, prefix={args.prefix}")
|
|
531
540
|
print(f"[las-data-profiler] db_path={args.db_path}")
|
|
532
541
|
|
|
@@ -72,12 +72,19 @@ function splitText(text, chunkSize = 500, overlap = 50) {
|
|
|
72
72
|
const chunks = [];
|
|
73
73
|
if (!text)
|
|
74
74
|
return chunks;
|
|
75
|
+
const safeChunkSize = Number.isFinite(chunkSize) ? Math.max(1, Math.floor(chunkSize)) : 500;
|
|
76
|
+
const safeOverlapRaw = Number.isFinite(overlap) ? Math.max(0, Math.floor(overlap)) : 50;
|
|
77
|
+
const safeOverlap = Math.min(safeOverlapRaw, safeChunkSize - 1);
|
|
78
|
+
const step = Math.max(1, safeChunkSize - safeOverlap);
|
|
75
79
|
let start = 0;
|
|
76
80
|
while (start < text.length) {
|
|
77
|
-
const end = Math.min(start +
|
|
81
|
+
const end = Math.min(start + safeChunkSize, text.length);
|
|
78
82
|
const chunk = text.slice(start, end);
|
|
83
|
+
if (!chunk) {
|
|
84
|
+
break;
|
|
85
|
+
}
|
|
79
86
|
chunks.push(chunk);
|
|
80
|
-
start +=
|
|
87
|
+
start += step;
|
|
81
88
|
}
|
|
82
89
|
return chunks;
|
|
83
90
|
}
|
|
@@ -1,16 +1,7 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.createEmbeddingProvider = createEmbeddingProvider;
|
|
4
|
-
const local_1 = require("./local");
|
|
5
4
|
const remote_1 = require("./remote");
|
|
6
5
|
function createEmbeddingProvider(config) {
|
|
7
|
-
|
|
8
|
-
return new local_1.LocalEmbeddingProvider(config);
|
|
9
|
-
}
|
|
10
|
-
else if (config.provider === 'openai' || config.provider === 'remote') {
|
|
11
|
-
return new remote_1.RemoteEmbeddingProvider(config);
|
|
12
|
-
}
|
|
13
|
-
else {
|
|
14
|
-
throw new Error(`Unsupported embedding provider: ${config.provider}`);
|
|
15
|
-
}
|
|
6
|
+
return new remote_1.RemoteEmbeddingProvider(config);
|
|
16
7
|
}
|
|
@@ -9,10 +9,17 @@ export interface EmbeddingProvider {
|
|
|
9
9
|
* @param texts - Array of input texts
|
|
10
10
|
*/
|
|
11
11
|
generateEmbeddings(texts: string[]): Promise<number[][]>;
|
|
12
|
+
/**
|
|
13
|
+
* Generate embedding for multimodal input (LAS specific)
|
|
14
|
+
* @param input - Multimodal input array
|
|
15
|
+
*/
|
|
16
|
+
generateMultimodalEmbedding?(input: any[]): Promise<number[]>;
|
|
12
17
|
}
|
|
13
18
|
export interface EmbeddingConfig {
|
|
14
|
-
provider: 'local' | 'remote' | 'openai';
|
|
19
|
+
provider: 'local' | 'remote' | 'openai' | 'las';
|
|
15
20
|
model_name: string;
|
|
16
21
|
api_key?: string;
|
|
17
22
|
api_base?: string;
|
|
23
|
+
dimensions?: number;
|
|
24
|
+
encoding_format?: 'float' | 'base64';
|
|
18
25
|
}
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.LocalEmbeddingProvider = exports.setNodeLlamaCppImporter = void 0;
|
|
4
4
|
// import type { Llama, LlamaEmbeddingContext, LlamaModel } from 'node-llama-cpp';
|
|
5
|
-
const DEFAULT_LOCAL_MODEL =
|
|
5
|
+
const DEFAULT_LOCAL_MODEL = 'hf:CompendiumLabs/bge-small-zh-v1.5-gguf/bge-small-zh-v1.5-f16.gguf';
|
|
6
6
|
let nodeLlamaImportPromise = null;
|
|
7
7
|
const setNodeLlamaCppImporter = (importer) => {
|
|
8
8
|
nodeLlamaImportPromise = importer();
|
|
@@ -10,7 +10,7 @@ const setNodeLlamaCppImporter = (importer) => {
|
|
|
10
10
|
exports.setNodeLlamaCppImporter = setNodeLlamaCppImporter;
|
|
11
11
|
const importNodeLlamaCpp = async () => {
|
|
12
12
|
if (!nodeLlamaImportPromise) {
|
|
13
|
-
nodeLlamaImportPromise = import(
|
|
13
|
+
nodeLlamaImportPromise = import('node-llama-cpp');
|
|
14
14
|
}
|
|
15
15
|
return nodeLlamaImportPromise;
|
|
16
16
|
};
|
|
@@ -33,7 +33,13 @@ class LocalEmbeddingProvider {
|
|
|
33
33
|
if (this.initPromise) {
|
|
34
34
|
return this.initPromise;
|
|
35
35
|
}
|
|
36
|
-
this.initPromise = this.doInitialize()
|
|
36
|
+
this.initPromise = this.doInitialize().catch((err) => {
|
|
37
|
+
this.initPromise = null;
|
|
38
|
+
this.context = null;
|
|
39
|
+
this.model = null;
|
|
40
|
+
this.llama = null;
|
|
41
|
+
throw err;
|
|
42
|
+
});
|
|
37
43
|
return this.initPromise;
|
|
38
44
|
}
|
|
39
45
|
async doInitialize() {
|
|
@@ -58,15 +64,15 @@ class LocalEmbeddingProvider {
|
|
|
58
64
|
}
|
|
59
65
|
}
|
|
60
66
|
async generateEmbedding(text) {
|
|
67
|
+
if (!text || !text.trim()) {
|
|
68
|
+
throw new Error('Embedding input text must be a non-empty string');
|
|
69
|
+
}
|
|
61
70
|
await this.ensureInitialized();
|
|
62
71
|
const embedding = await this.context.getEmbeddingFor(text);
|
|
63
72
|
const vector = embedding.vector; // TypedArray
|
|
64
73
|
// Optimized normalization loop
|
|
65
74
|
let sumSq = 0;
|
|
66
75
|
const len = vector.length;
|
|
67
|
-
// First pass: Calculate magnitude and sanitize (implicitly handled by JS numbers usually, but keeping finite check if needed)
|
|
68
|
-
// For performance, we assume node-llama-cpp returns valid floats.
|
|
69
|
-
// If strict sanitization is needed, it can be combined.
|
|
70
76
|
for (let i = 0; i < len; i++) {
|
|
71
77
|
const val = vector[i];
|
|
72
78
|
if (Number.isFinite(val)) {
|
|
@@ -85,19 +91,16 @@ class LocalEmbeddingProvider {
|
|
|
85
91
|
}
|
|
86
92
|
}
|
|
87
93
|
else {
|
|
88
|
-
// Zero vector case
|
|
89
94
|
for (let i = 0; i < len; i++) {
|
|
90
|
-
result[i] = vector[i];
|
|
95
|
+
result[i] = vector[i];
|
|
91
96
|
}
|
|
92
97
|
}
|
|
93
98
|
return result;
|
|
94
99
|
}
|
|
95
|
-
// Optimized batch processing for local embedding
|
|
96
100
|
async generateEmbeddings(texts) {
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
// If newer version supports batch, we should use it. For now, we parallelize with limit.
|
|
101
|
+
if (!Array.isArray(texts)) {
|
|
102
|
+
throw new Error('Embedding input must be an array of strings');
|
|
103
|
+
}
|
|
101
104
|
return Promise.all(texts.map(text => this.generateEmbedding(text)));
|
|
102
105
|
}
|
|
103
106
|
}
|
|
@@ -3,7 +3,14 @@ export declare class RemoteEmbeddingProvider implements EmbeddingProvider {
|
|
|
3
3
|
private apiKey;
|
|
4
4
|
private modelName;
|
|
5
5
|
private apiBase;
|
|
6
|
+
private dimensions?;
|
|
7
|
+
private encodingFormat;
|
|
8
|
+
private mode;
|
|
6
9
|
constructor(config: EmbeddingConfig);
|
|
10
|
+
private detectMode;
|
|
7
11
|
generateEmbedding(text: string): Promise<number[]>;
|
|
8
12
|
generateEmbeddings(texts: string[]): Promise<number[][]>;
|
|
13
|
+
private generateOpenAICompatibleEmbeddings;
|
|
14
|
+
private generateLasEmbeddings;
|
|
15
|
+
generateMultimodalEmbedding(input: any[]): Promise<number[]>;
|
|
9
16
|
}
|