@byted-las/contextlake-openclaw 1.0.0 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/dist/index.d.ts +2 -1
  2. package/dist/index.js +5 -5
  3. package/dist/src/client/lancedb.js +13 -4
  4. package/dist/src/commands/cli.d.ts +5 -2
  5. package/dist/src/commands/cli.js +94 -10
  6. package/dist/src/commands/index.d.ts +2 -1
  7. package/dist/src/commands/index.js +31 -35
  8. package/dist/src/commands/slashcmd.d.ts +8 -1
  9. package/dist/src/commands/slashcmd.js +90 -6
  10. package/dist/src/commands/tools.d.ts +10 -218
  11. package/dist/src/commands/tools.js +109 -104
  12. package/dist/src/lib/actions/ingest-source.d.ts +15 -0
  13. package/dist/src/lib/actions/ingest-source.js +193 -0
  14. package/dist/src/lib/actions/ingest.d.ts +14 -7
  15. package/dist/src/lib/actions/ingest.js +133 -63
  16. package/dist/src/lib/actions/las-api.d.ts +13 -0
  17. package/dist/src/lib/actions/las-api.js +105 -0
  18. package/dist/src/lib/actions/las-tools.d.ts +3 -0
  19. package/dist/src/lib/actions/las-tools.js +194 -0
  20. package/dist/src/lib/actions/las.d.ts +64 -0
  21. package/dist/src/lib/actions/las.js +72 -0
  22. package/dist/src/lib/actions/manage.d.ts +3 -2
  23. package/dist/src/{skills/las-data-profiler/index.d.ts → lib/actions/profiler.d.ts} +4 -2
  24. package/dist/src/{skills/las-data-profiler/index.js → lib/actions/profiler.js} +19 -3
  25. package/dist/src/lib/actions/retrieve.d.ts +2 -1
  26. package/dist/src/lib/actions/retrieve.js +2 -18
  27. package/{src/skills/las-data-profiler → dist/src/lib/scripts}/s3_catalog.py +10 -1
  28. package/dist/src/processor/loader.js +9 -2
  29. package/dist/src/service/embedding/factory.js +1 -10
  30. package/dist/src/service/embedding/interface.d.ts +8 -1
  31. package/dist/src/service/embedding/local.js +16 -13
  32. package/dist/src/service/embedding/remote.d.ts +7 -0
  33. package/dist/src/service/embedding/remote.js +108 -7
  34. package/dist/src/service/metadata/interface.d.ts +1 -0
  35. package/dist/src/service/metadata/local.d.ts +1 -0
  36. package/dist/src/service/metadata/local.js +6 -0
  37. package/dist/src/skills/SKILL.md +174 -0
  38. package/dist/src/skills/contextlake-delete/SKILL.md +36 -0
  39. package/dist/src/skills/contextlake-ingest/SKILL.md +40 -0
  40. package/dist/src/skills/contextlake-list/SKILL.md +22 -0
  41. package/dist/src/skills/contextlake-retrieve/SKILL.md +37 -0
  42. package/dist/src/skills/las-data-profiler/SKILL.md +174 -0
  43. package/dist/src/utils/config.d.ts +34 -1
  44. package/dist/src/utils/config.js +16 -3
  45. package/dist/src/utils/credentials.d.ts +8 -0
  46. package/dist/src/utils/credentials.js +77 -0
  47. package/index.ts +8 -8
  48. package/openclaw.plugin.json +1 -1
  49. package/package.json +8 -7
  50. package/src/client/lancedb.ts +32 -21
  51. package/src/commands/cli.ts +105 -13
  52. package/src/commands/index.ts +45 -42
  53. package/src/commands/slashcmd.ts +69 -10
  54. package/src/commands/tools.ts +142 -117
  55. package/src/lib/actions/ingest.ts +151 -75
  56. package/src/lib/actions/las-api.ts +119 -0
  57. package/src/lib/actions/las-tools.ts +196 -0
  58. package/src/lib/actions/manage.ts +6 -5
  59. package/src/{skills/las-data-profiler/index.ts → lib/actions/profiler.ts} +21 -4
  60. package/src/lib/actions/retrieve.ts +16 -34
  61. package/src/lib/scripts/s3_catalog.py +617 -0
  62. package/src/processor/loader.ts +12 -4
  63. package/src/service/embedding/factory.ts +1 -8
  64. package/src/service/embedding/interface.ts +9 -1
  65. package/src/service/embedding/remote.ts +133 -13
  66. package/src/service/metadata/interface.ts +1 -0
  67. package/src/service/metadata/local.ts +7 -0
  68. package/src/service/storage/factory.ts +2 -2
  69. package/src/utils/config.ts +61 -8
  70. package/src/utils/credentials.ts +50 -0
  71. package/bin/contextlake-openclaw.js +0 -5
  72. package/dist/src/skills/las-data-profiler/register.d.ts +0 -1
  73. package/dist/src/skills/las-data-profiler/register.js +0 -19
  74. package/src/service/embedding/local.ts +0 -118
  75. package/src/skills/las-data-profiler/register.ts +0 -19
@@ -0,0 +1,194 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.getLasTools = getLasTools;
4
+ const las_api_1 = require("./las-api");
5
/**
 * Build the list of LAS operator tool definitions.
 *
 * Each returned entry has `name`, `label`, `description`, a JSON-schema style
 * `parameters` object and an async `execute(toolCallId, params)` method that
 * forwards to a method of `LasApiClient` (`process`, `submitAndPoll` or
 * `multimodalEmbedding`).
 *
 * API failures never reject: they are logged through `logger.error` and
 * surfaced to the caller as `{ error: <message> }`.
 *
 * @param pluginConfig plugin configuration, passed straight to `LasApiClient`
 * @param logger logger exposing `error(message, meta)`
 * @returns array of tool definitions, in a fixed order
 */
function getLasTools(pluginConfig, logger) {
    const apiClient = new las_api_1.LasApiClient(pluginConfig, logger);
    // Anything can be thrown (Error, string, null, ...). Reading `.message`
    // blindly would either yield `undefined` or throw again inside the handler.
    const describeError = (error) => {
        const message = error?.message;
        return typeof message === 'string' ? message : String(error);
    };
    // Invoke a client method by name and turn any failure into an error payload.
    const callApi = async (method, args) => {
        try {
            return await apiClient[method](...args);
        }
        catch (error) {
            const message = describeError(error);
            logger.error(`[LasTools] API ${method} failed`, { error: message });
            return { error: message };
        }
    };
    // Schema shared by every operator that takes a free-form `data` payload.
    // A fresh object is built per tool so no two tools share mutable state.
    const dataSchema = () => ({
        type: 'object',
        properties: { data: { type: 'object', additionalProperties: true } },
        required: ['data']
    });
    // Factory for operators invoked as apiClient[method](operatorName, params.data).
    const operatorTool = (name, label, description, method) => ({
        name,
        label,
        description,
        parameters: dataSchema(),
        async execute(toolCallId, params) {
            return await callApi(method, [name, params.data]);
        }
    });
    // NOTE: the description template literals are deliberately flush-left;
    // any indentation inside them would become part of the runtime string.
    return [
        operatorTool('las_image_resample', 'LAS Image Resample', `Resample/Resize an image and save it to TOS.
Parameters in data:
- image_src_type (string, default: "image_url"): "image_url" or "image_tos"
- image (string, required): URL or tos:// path
- tos_dir (string, required): tos:// output directory
- image_suffix (string): ".jpg" or ".png"
- target_size (array of integers): e.g. [1024, 1024]
- target_dpi (array of integers): e.g. [72, 72]
- method (string): "nearest", "bilinear", "bicubic", "lanczos"`, 'process'),
        operatorTool('las_audio_extract_and_split', 'LAS Audio Extract and Split', `Extract audio from video and split it into chunks.
Parameters in data:
- input_path (string, required): tos:// video path
- output_path_template (string, required): e.g. tos://bucket/{index}.{output_file_ext}
- split_duration (number): duration in seconds, default 30.0
- output_format (string): "wav", "mp3", "flac"
- timeout (integer)
- extra_params (array of string): ffmpeg params`, 'process'),
        operatorTool('las_audio_convert', 'LAS Audio Convert', `Convert audio format.
Parameters in data:
- input_path (string, required): tos:// audio path
- output_path (string, required): tos:// output path
- output_format (string): "wav", "mp3", "flac"
- extra_params (array of string): ffmpeg params`, 'process'),
        operatorTool('las_asr_pro', 'LAS ASR Pro (Speech Recognition)', `Perform automatic speech recognition (ASR).
Parameters in data:
- resource: "bigasr" or "seedasr"
- audio (object, required): { url: string, language: string, format: string }
- request (object, required): { model_name: "bigmodel", ... }
- user (object): { uid: string }`, 'submitAndPoll'),
        operatorTool('las_seed_2_0', 'LAS Seed 2.0 (Audio)', `ASR with Seed 2.0.
Parameters in data:
- audio (object, required): { url, format, language }
- request (object, required): { model_name: "seedasr", ... }`, 'submitAndPoll'),
        {
            // The embedding endpoint takes top-level arguments rather than a
            // `data` envelope, so it is spelled out instead of using the factory.
            name: 'las_bare_image_text_embedding',
            label: 'LAS Multimodal Embedding',
            description: `Multimodal Embedding (image and text).
Parameters:
- model (string, required): "doubao-embedding-vision-250615"
- input (array of objects, required): [ { type: "image_url", image_url: { url: "..." } }, { type: "text", text: "..." } ]
- encoding_format (string): "float", "base64"
- dimensions (integer): 1024 or 2048`,
            parameters: {
                type: 'object',
                properties: {
                    model: { type: 'string', default: 'doubao-embedding-vision-250615' },
                    input: { type: 'array', items: { type: 'object' } },
                    encoding_format: { type: 'string', default: 'float' },
                    dimensions: { type: 'integer' }
                },
                required: ['model', 'input']
            },
            async execute(toolCallId, params) {
                return await callApi('multimodalEmbedding', [
                    params.model,
                    params.input,
                    params.encoding_format,
                    params.dimensions
                ]);
            }
        },
        operatorTool('las_long_video_understand', 'LAS Long Video Understand', `Long Video Understanding.
Parameters in data:
- video_url (string, required)
- query (string, required)
- model_name (string): default "doubao-seed-2-0-lite-260215"
- ...other params like fps, target_tokens_per_clip`, 'submitAndPoll'),
        operatorTool('las_pdf_parse_doubao', 'LAS PDF Parse Doubao', `Parse PDF documents to Markdown.
Parameters in data:
- url (string, required): PDF URL or tos://
- start_page (integer): default 1
- num_pages (integer): default to end
- parse_mode (string): "normal" or "detail"`, 'submitAndPoll'),
        operatorTool('las_video_resize', 'LAS Video Resize', `Resize video.
Parameters in data:
- video_url (string, required): URL or tos://
- target_width (integer)
- target_height (integer)
- output_dir (string, required): tos://`, 'submitAndPoll')
    ];
}
@@ -0,0 +1,64 @@
1
+ import { ContextLakeConfig } from '../../utils/config';
2
/**
 * Submit a `las_pdf_parse_doubao` (operator version `v1`) job via
 * `POST /api/v1/submit`; `params` is sent as the job's `data`.
 */
export declare function lasPdfParseDoubao(params: {
    url: string;
    start_page?: number;
    num_pages?: number;
    parse_mode?: string;
}, config?: ContextLakeConfig): Promise<any>;
/**
 * Submit a `las_long_video_understand` (operator version `v1`) job via
 * `POST /api/v1/submit`; `params` is sent as the job's `data`.
 */
export declare function lasLongVideoUnderstand(params: {
    video_url: string;
    prompt: string;
    system_prompt?: string;
    return_chunk_text?: boolean;
    max_tokens?: number;
    temperature?: number;
    top_p?: number;
}, config?: ContextLakeConfig): Promise<any>;
/**
 * Request a multimodal embedding via `POST /api/v1/embeddings/multimodal`.
 * The implementation always sends model `doubao-embedding-vision`.
 */
export declare function lasBareImageTextEmbedding(params: {
    input: {
        type: string;
        text?: string;
        image_url?: string;
    }[];
    encoding_format?: string;
}, config?: ContextLakeConfig): Promise<any>;
/**
 * Forward `params` unchanged as the JSON body of
 * `POST /api/v1/chat/completions`.
 */
export declare function lasSeed20(params: {
    model: string;
    messages: any[];
    stream?: boolean;
    max_tokens?: number;
    temperature?: number;
    top_p?: number;
    frequency_penalty?: number;
    presence_penalty?: number;
    tools?: any[];
    tool_choice?: any;
    user?: string;
    logprobs?: boolean;
    top_logprobs?: number;
}, config?: ContextLakeConfig): Promise<any>;
/**
 * Submit a `las_asr_pro` (operator version `v1`) job via
 * `POST /api/v1/submit`; `params` is sent as the job's `data`.
 */
export declare function lasAsrPro(params: {
    url?: string;
    format?: string;
    language?: string;
    resource?: string;
    use_itn?: boolean;
    use_sn?: boolean;
    enable_alignment?: boolean;
    channel_id?: number;
    use_word_info?: boolean;
    text_format?: number;
    enable_semantic_sentence_detection?: boolean;
    boost_words?: {
        word: string;
        weight: number;
    }[];
}, config?: ContextLakeConfig): Promise<any>;
/**
 * Run the `las_audio_extract_and_split` (operator version `v1`) operator via
 * `POST /api/v1/process`; `params` is sent as the request's `data`.
 */
export declare function lasAudioExtractAndSplit(params: {
    input_path: string;
    output_path_template: string;
    split_duration?: number;
    output_format?: string;
    timeout?: number;
    extra_params?: string[];
}, config?: ContextLakeConfig): Promise<any>;
@@ -0,0 +1,72 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.lasPdfParseDoubao = lasPdfParseDoubao;
4
+ exports.lasLongVideoUnderstand = lasLongVideoUnderstand;
5
+ exports.lasBareImageTextEmbedding = lasBareImageTextEmbedding;
6
+ exports.lasSeed20 = lasSeed20;
7
+ exports.lasAsrPro = lasAsrPro;
8
+ exports.lasAudioExtractAndSplit = lasAudioExtractAndSplit;
9
+ function getLASConfig(config) {
10
+ // Attempt to get from env vars or config
11
+ const endpoint = process.env.LAS_ENDPOINT || (config?.las)?.endpoint;
12
+ const apiKey = process.env.LAS_API_KEY || (config?.las)?.api_key;
13
+ if (!endpoint || !apiKey) {
14
+ throw new Error("LAS_ENDPOINT and LAS_API_KEY must be set in environment variables or config");
15
+ }
16
+ return { endpoint, apiKey };
17
+ }
18
+ async function lasFetch(path, payload, config) {
19
+ const { endpoint, apiKey } = getLASConfig(config);
20
+ const url = `${endpoint.replace(/\/$/, '')}${path}`;
21
+ const response = await fetch(url, {
22
+ method: 'POST',
23
+ headers: {
24
+ 'Content-Type': 'application/json',
25
+ 'Authorization': `Bearer ${apiKey}`
26
+ },
27
+ body: JSON.stringify(payload)
28
+ });
29
+ if (!response.ok) {
30
+ let errorText = await response.text().catch(() => '');
31
+ throw new Error(`LAS API Error: ${response.status} ${response.statusText} - ${errorText}`);
32
+ }
33
+ return await response.json();
34
+ }
35
+ async function lasPdfParseDoubao(params, config) {
36
+ return lasFetch('/api/v1/submit', {
37
+ operator_id: 'las_pdf_parse_doubao',
38
+ operator_version: 'v1',
39
+ data: params
40
+ }, config);
41
+ }
42
+ async function lasLongVideoUnderstand(params, config) {
43
+ return lasFetch('/api/v1/submit', {
44
+ operator_id: 'las_long_video_understand',
45
+ operator_version: 'v1',
46
+ data: params
47
+ }, config);
48
+ }
49
+ async function lasBareImageTextEmbedding(params, config) {
50
+ return lasFetch('/api/v1/embeddings/multimodal', {
51
+ model: 'doubao-embedding-vision',
52
+ input: params.input,
53
+ encoding_format: params.encoding_format
54
+ }, config);
55
+ }
56
+ async function lasSeed20(params, config) {
57
+ return lasFetch('/api/v1/chat/completions', params, config);
58
+ }
59
+ async function lasAsrPro(params, config) {
60
+ return lasFetch('/api/v1/submit', {
61
+ operator_id: 'las_asr_pro',
62
+ operator_version: 'v1',
63
+ data: params
64
+ }, config);
65
+ }
66
+ async function lasAudioExtractAndSplit(params, config) {
67
+ return lasFetch('/api/v1/process', {
68
+ operator_id: 'las_audio_extract_and_split',
69
+ operator_version: 'v1',
70
+ data: params
71
+ }, config);
72
+ }
@@ -1,3 +1,4 @@
1
+ import { ContextLakeConfig } from '../../utils/config';
1
2
  interface ListParams {
2
3
  limit?: number;
3
4
  }
@@ -5,8 +6,8 @@ interface DeleteParams {
5
6
  file_ids?: string[];
6
7
  filter?: string;
7
8
  }
8
- export declare function listAssets(params: ListParams, config: any, logger?: any): Promise<any>;
9
- export declare function deleteAssets(params: DeleteParams, config: any, logger?: any): Promise<{
9
+ export declare function listAssets(params: ListParams, config: ContextLakeConfig, logger?: any): Promise<any>;
10
+ export declare function deleteAssets(params: DeleteParams, config: ContextLakeConfig, logger?: any): Promise<{
10
11
  status: string;
11
12
  message: string;
12
13
  deleted_count: number;
@@ -9,7 +9,7 @@ export interface ConnectParams {
9
9
  prefix: string;
10
10
  sample_rows?: number;
11
11
  }
12
- interface ConnectResult {
12
+ export interface ConnectResult {
13
13
  status: 'success' | 'error';
14
14
  datasource_name: string;
15
15
  db_path: string;
@@ -23,4 +23,6 @@ interface ConnectResult {
23
23
  error?: string;
24
24
  }
25
25
  export declare function connectDataSource(params: ConnectParams, _ctx?: any): Promise<ConnectResult>;
26
- export {};
26
+ export declare function listDataSources(_ctx?: any): Promise<{
27
+ datasources: string[];
28
+ }>;
@@ -34,6 +34,7 @@ var __importStar = (this && this.__importStar) || (function () {
34
34
  })();
35
35
  Object.defineProperty(exports, "__esModule", { value: true });
36
36
  exports.connectDataSource = connectDataSource;
37
+ exports.listDataSources = listDataSources;
37
38
  const path = __importStar(require("path"));
38
39
  const fs = __importStar(require("fs"));
39
40
  const os = __importStar(require("os"));
@@ -41,7 +42,7 @@ const child_process_1 = require("child_process");
41
42
  // ---------------------------------------------------------------------------
42
43
  // Constants
43
44
  // ---------------------------------------------------------------------------
44
- const BASE_DIR = path.join(os.homedir(), '.openclaw', 'las-data-profiler');
45
+ const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
45
46
  const PYTHON_DEPS = ['boto3', 'lancedb', 'pyarrow', 'pandas', 'Pillow', 'mutagen', 'pymupdf'];
46
47
  // ---------------------------------------------------------------------------
47
48
  // Helpers
@@ -109,8 +110,8 @@ function ensurePythonDeps() {
109
110
  * Get the path to the bundled Python script.
110
111
  */
111
112
  function getScriptPath() {
112
- // The Python script is co-located with this module
113
- return path.join(__dirname, 's3_catalog.py');
113
+ // The Python script is located in the scripts directory
114
+ return path.join(__dirname, '../scripts', 's3_catalog.py');
114
115
  }
115
116
  // ---------------------------------------------------------------------------
116
117
  // Main Entry
@@ -229,3 +230,18 @@ async function connectDataSource(params, _ctx) {
229
230
  });
230
231
  });
231
232
  }
233
/**
 * List the data sources that have a directory under `BASE_DIR`.
 *
 * Every sub-directory name of `BASE_DIR` is reported as a data source; plain
 * files are ignored. A missing `BASE_DIR` yields an empty list.
 *
 * @param _ctx unused
 * @returns `{ datasources }` with the directory names
 * @throws {Error} `Failed to list data sources: <reason>` for any failure
 *         other than `BASE_DIR` not existing
 */
async function listDataSources(_ctx) {
    try {
        // One non-blocking readdir instead of existsSync + readdirSync: no
        // window in which the directory can vanish between check and read,
        // and the event loop is not stalled inside an async function.
        const entries = await fs.promises.readdir(BASE_DIR, { withFileTypes: true });
        const datasources = entries
            .filter((entry) => entry.isDirectory())
            .map((entry) => entry.name);
        return { datasources };
    }
    catch (error) {
        // Nothing has been profiled yet: not an error, just an empty result.
        if (error?.code === 'ENOENT') {
            return { datasources: [] };
        }
        const reason = typeof error?.message === 'string' ? error.message : String(error);
        throw new Error(`Failed to list data sources: ${reason}`);
    }
}
@@ -1,8 +1,9 @@
1
+ import { ContextLakeConfig } from '../../utils/config';
1
2
  interface RetrieveParams {
2
3
  query: string;
3
4
  top_k?: number;
4
5
  filter?: string;
5
6
  include_binary?: boolean;
6
7
  }
7
- export declare function retrieveAssets(params: RetrieveParams, config: any, logger?: any): Promise<any[]>;
8
+ export declare function retrieveAssets(params: RetrieveParams, config: ContextLakeConfig, logger?: any): Promise<any[]>;
8
9
  export {};
@@ -19,17 +19,11 @@ async function retrieveAssets(params, config, logger) {
19
19
  }
20
20
  await metadataProvider.connect();
21
21
  const results = await metadataProvider.search(params.query, params.top_k || 5, params.filter);
22
- // Post-process to handle binary data retrieval if requested
23
22
  if (params.include_binary) {
24
23
  const enrichedResults = await Promise.all(results.map(async (doc) => {
25
- // Remove raw binary_data buffer to avoid clone errors in postMessage
26
- const { binary_data, ...rest } = doc;
27
- if (rest.vector) {
28
- rest.vector = Array.from(rest.vector);
29
- }
24
+ const { binary_data, vector, ...rest } = doc;
30
25
  try {
31
26
  if (rest.storage_type === 'inline') {
32
- // Return inline binary data (Base64)
33
27
  return {
34
28
  ...rest,
35
29
  binary_content: binary_data ? binary_data.toString('base64') : null
@@ -37,7 +31,6 @@ async function retrieveAssets(params, config, logger) {
37
31
  }
38
32
  else if (rest.storage_type === 'local' || rest.storage_type === 'tos') {
39
33
  if (storageProvider) {
40
- // Download from storage provider
41
34
  const buffer = await storageProvider.downloadFile(rest.url);
42
35
  return {
43
36
  ...rest,
@@ -56,17 +49,8 @@ async function retrieveAssets(params, config, logger) {
56
49
  }));
57
50
  return enrichedResults;
58
51
  }
59
- // Remove binary_data from default response to keep it light and serializable
60
52
  return results.map((doc) => {
61
- const { binary_data, ...rest } = doc;
62
- // Ensure all fields are standard serializable types (remove complex objects/buffers)
63
- if (rest.vector) {
64
- // Keep vector as standard array of numbers if it exists, but some clients might choke on large arrays.
65
- // Usually vector is fine, but LanceDB might return custom objects like Float32Array which fail structuredClone in postMessage
66
- rest.vector = Array.from(rest.vector);
67
- }
68
- // LanceDB records might be Proxy objects or contain non-clonable getters.
69
- // We stringify and parse to get a clean, plain Javascript Object.
53
+ const { binary_data, vector, ...rest } = doc;
70
54
  const safeDoc = JSON.parse(JSON.stringify(rest));
71
55
  return safeDoc;
72
56
  });
@@ -523,10 +523,19 @@ def main():
523
523
  parser.add_argument('--region', default='')
524
524
  parser.add_argument('--bucket', required=True)
525
525
  parser.add_argument('--prefix', required=True)
526
- parser.add_argument('--db-path', default='./catalog_db')
526
+ parser.add_argument('--db-path', default=None, help='Path to LanceDB database. Defaults to ~/.openclaw/las-data-profiler/{datasource_name}/catalog_db if datasource_name is provided.')
527
+ parser.add_argument('--datasource-name', default='', help='Name of the datasource. Used to determine default db-path if not explicitly provided.')
527
528
  parser.add_argument('--sample-rows', type=int, default=100)
528
529
  args = parser.parse_args()
529
530
 
531
+ if not args.db_path:
532
+ if args.datasource_name:
533
+ import os
534
+ home_dir = os.path.expanduser('~')
535
+ args.db_path = os.path.join(home_dir, '.openclaw', 'las-data-profiler', args.datasource_name, 'catalog_db')
536
+ else:
537
+ args.db_path = './catalog_db'
538
+
530
539
  print(f"[las-data-profiler] vendor={args.vendor}, bucket={args.bucket}, prefix={args.prefix}")
531
540
  print(f"[las-data-profiler] db_path={args.db_path}")
532
541
 
@@ -72,12 +72,19 @@ function splitText(text, chunkSize = 500, overlap = 50) {
72
72
  const chunks = [];
73
73
  if (!text)
74
74
  return chunks;
75
+ const safeChunkSize = Number.isFinite(chunkSize) ? Math.max(1, Math.floor(chunkSize)) : 500;
76
+ const safeOverlapRaw = Number.isFinite(overlap) ? Math.max(0, Math.floor(overlap)) : 50;
77
+ const safeOverlap = Math.min(safeOverlapRaw, safeChunkSize - 1);
78
+ const step = Math.max(1, safeChunkSize - safeOverlap);
75
79
  let start = 0;
76
80
  while (start < text.length) {
77
- const end = Math.min(start + chunkSize, text.length);
81
+ const end = Math.min(start + safeChunkSize, text.length);
78
82
  const chunk = text.slice(start, end);
83
+ if (!chunk) {
84
+ break;
85
+ }
79
86
  chunks.push(chunk);
80
- start += chunkSize - overlap;
87
+ start += step;
81
88
  }
82
89
  return chunks;
83
90
  }
@@ -1,16 +1,7 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.createEmbeddingProvider = createEmbeddingProvider;
4
- const local_1 = require("./local");
5
4
  const remote_1 = require("./remote");
6
5
  function createEmbeddingProvider(config) {
7
- if (config.provider === 'local') {
8
- return new local_1.LocalEmbeddingProvider(config);
9
- }
10
- else if (config.provider === 'openai' || config.provider === 'remote') {
11
- return new remote_1.RemoteEmbeddingProvider(config);
12
- }
13
- else {
14
- throw new Error(`Unsupported embedding provider: ${config.provider}`);
15
- }
6
+ return new remote_1.RemoteEmbeddingProvider(config);
16
7
  }
@@ -9,10 +9,17 @@ export interface EmbeddingProvider {
9
9
  * @param texts - Array of input texts
10
10
  */
11
11
  generateEmbeddings(texts: string[]): Promise<number[][]>;
12
+ /**
13
+ * Generate embedding for multimodal input (LAS specific)
14
+ * @param input - Multimodal input array
15
+ */
16
+ generateMultimodalEmbedding?(input: any[]): Promise<number[]>;
12
17
  }
13
18
  export interface EmbeddingConfig {
14
- provider: 'local' | 'remote' | 'openai';
19
+ provider: 'local' | 'remote' | 'openai' | 'las';
15
20
  model_name: string;
16
21
  api_key?: string;
17
22
  api_base?: string;
23
+ dimensions?: number;
24
+ encoding_format?: 'float' | 'base64';
18
25
  }
@@ -2,7 +2,7 @@
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.LocalEmbeddingProvider = exports.setNodeLlamaCppImporter = void 0;
4
4
  // import type { Llama, LlamaEmbeddingContext, LlamaModel } from 'node-llama-cpp';
5
- const DEFAULT_LOCAL_MODEL = "hf:CompendiumLabs/bge-small-zh-v1.5-gguf/bge-small-zh-v1.5-f16.gguf";
5
+ const DEFAULT_LOCAL_MODEL = 'hf:CompendiumLabs/bge-small-zh-v1.5-gguf/bge-small-zh-v1.5-f16.gguf';
6
6
  let nodeLlamaImportPromise = null;
7
7
  const setNodeLlamaCppImporter = (importer) => {
8
8
  nodeLlamaImportPromise = importer();
@@ -10,7 +10,7 @@ const setNodeLlamaCppImporter = (importer) => {
10
10
  exports.setNodeLlamaCppImporter = setNodeLlamaCppImporter;
11
11
  const importNodeLlamaCpp = async () => {
12
12
  if (!nodeLlamaImportPromise) {
13
- nodeLlamaImportPromise = import("node-llama-cpp");
13
+ nodeLlamaImportPromise = import('node-llama-cpp');
14
14
  }
15
15
  return nodeLlamaImportPromise;
16
16
  };
@@ -33,7 +33,13 @@ class LocalEmbeddingProvider {
33
33
  if (this.initPromise) {
34
34
  return this.initPromise;
35
35
  }
36
- this.initPromise = this.doInitialize();
36
+ this.initPromise = this.doInitialize().catch((err) => {
37
+ this.initPromise = null;
38
+ this.context = null;
39
+ this.model = null;
40
+ this.llama = null;
41
+ throw err;
42
+ });
37
43
  return this.initPromise;
38
44
  }
39
45
  async doInitialize() {
@@ -58,15 +64,15 @@ class LocalEmbeddingProvider {
58
64
  }
59
65
  }
60
66
  async generateEmbedding(text) {
67
+ if (!text || !text.trim()) {
68
+ throw new Error('Embedding input text must be a non-empty string');
69
+ }
61
70
  await this.ensureInitialized();
62
71
  const embedding = await this.context.getEmbeddingFor(text);
63
72
  const vector = embedding.vector; // TypedArray
64
73
  // Optimized normalization loop
65
74
  let sumSq = 0;
66
75
  const len = vector.length;
67
- // First pass: Calculate magnitude and sanitize (implicitly handled by JS numbers usually, but keeping finite check if needed)
68
- // For performance, we assume node-llama-cpp returns valid floats.
69
- // If strict sanitization is needed, it can be combined.
70
76
  for (let i = 0; i < len; i++) {
71
77
  const val = vector[i];
72
78
  if (Number.isFinite(val)) {
@@ -85,19 +91,16 @@ class LocalEmbeddingProvider {
85
91
  }
86
92
  }
87
93
  else {
88
- // Zero vector case
89
94
  for (let i = 0; i < len; i++) {
90
- result[i] = vector[i]; // or 0
95
+ result[i] = vector[i];
91
96
  }
92
97
  }
93
98
  return result;
94
99
  }
95
- // Optimized batch processing for local embedding
96
100
  async generateEmbeddings(texts) {
97
- await this.ensureInitialized();
98
- // node-llama-cpp's createEmbeddingContext might not support batch directly yet depending on version,
99
- // but we can at least optimize the loop.
100
- // If newer version supports batch, we should use it. For now, we parallelize with limit.
101
+ if (!Array.isArray(texts)) {
102
+ throw new Error('Embedding input must be an array of strings');
103
+ }
101
104
  return Promise.all(texts.map(text => this.generateEmbedding(text)));
102
105
  }
103
106
  }
@@ -3,7 +3,14 @@ export declare class RemoteEmbeddingProvider implements EmbeddingProvider {
3
3
  private apiKey;
4
4
  private modelName;
5
5
  private apiBase;
6
+ private dimensions?;
7
+ private encodingFormat;
8
+ private mode;
6
9
  constructor(config: EmbeddingConfig);
10
+ private detectMode;
7
11
  generateEmbedding(text: string): Promise<number[]>;
8
12
  generateEmbeddings(texts: string[]): Promise<number[][]>;
13
+ private generateOpenAICompatibleEmbeddings;
14
+ private generateLasEmbeddings;
15
+ generateMultimodalEmbedding(input: any[]): Promise<number[]>;
9
16
  }