@byted-las/contextlake-openclaw 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +64 -0
- package/bin/contextlake-openclaw.js +5 -0
- package/dist/index.d.ts +113 -0
- package/dist/index.js +73 -0
- package/dist/src/client/lancedb.d.ts +30 -0
- package/dist/src/client/lancedb.js +113 -0
- package/dist/src/client/tos.d.ts +19 -0
- package/dist/src/client/tos.js +81 -0
- package/dist/src/commands/cli.d.ts +6 -0
- package/dist/src/commands/cli.js +78 -0
- package/dist/src/commands/index.d.ts +1 -0
- package/dist/src/commands/index.js +139 -0
- package/dist/src/commands/slashcmd.d.ts +14 -0
- package/dist/src/commands/slashcmd.js +91 -0
- package/dist/src/commands/tools.d.ts +219 -0
- package/dist/src/commands/tools.js +286 -0
- package/dist/src/lib/actions/ingest.d.ts +8 -0
- package/dist/src/lib/actions/ingest.js +123 -0
- package/dist/src/lib/actions/manage.d.ts +15 -0
- package/dist/src/lib/actions/manage.js +91 -0
- package/dist/src/lib/actions/retrieve.d.ts +8 -0
- package/dist/src/lib/actions/retrieve.js +73 -0
- package/dist/src/processor/loader.d.ts +7 -0
- package/dist/src/processor/loader.js +83 -0
- package/dist/src/service/embedding/factory.d.ts +2 -0
- package/dist/src/service/embedding/factory.js +16 -0
- package/dist/src/service/embedding/interface.d.ts +18 -0
- package/dist/src/service/embedding/interface.js +2 -0
- package/dist/src/service/embedding/local.d.ts +14 -0
- package/dist/src/service/embedding/local.js +104 -0
- package/dist/src/service/embedding/remote.d.ts +9 -0
- package/dist/src/service/embedding/remote.js +42 -0
- package/dist/src/service/metadata/factory.d.ts +13 -0
- package/dist/src/service/metadata/factory.js +48 -0
- package/dist/src/service/metadata/interface.d.ts +17 -0
- package/dist/src/service/metadata/interface.js +2 -0
- package/dist/src/service/metadata/local.d.ts +13 -0
- package/dist/src/service/metadata/local.js +49 -0
- package/dist/src/service/storage/factory.d.ts +2 -0
- package/dist/src/service/storage/factory.js +19 -0
- package/dist/src/service/storage/interface.d.ts +32 -0
- package/dist/src/service/storage/interface.js +2 -0
- package/dist/src/service/storage/local.d.ts +9 -0
- package/dist/src/service/storage/local.js +72 -0
- package/dist/src/skills/las-data-profiler/index.d.ts +26 -0
- package/dist/src/skills/las-data-profiler/index.js +231 -0
- package/dist/src/skills/las-data-profiler/register.d.ts +1 -0
- package/dist/src/skills/las-data-profiler/register.js +19 -0
- package/dist/src/utils/config.d.ts +1 -0
- package/dist/src/utils/config.js +16 -0
- package/index.ts +78 -0
- package/openclaw.plugin.json +57 -0
- package/package.json +52 -0
- package/src/client/lancedb.ts +102 -0
- package/src/client/tos.ts +100 -0
- package/src/commands/cli.ts +77 -0
- package/src/commands/index.ts +156 -0
- package/src/commands/slashcmd.ts +95 -0
- package/src/commands/tools.ts +286 -0
- package/src/lib/actions/ingest.ts +103 -0
- package/src/lib/actions/manage.ts +107 -0
- package/src/lib/actions/retrieve.ts +90 -0
- package/src/processor/loader.ts +58 -0
- package/src/service/embedding/factory.ts +13 -0
- package/src/service/embedding/interface.ts +21 -0
- package/src/service/embedding/local.ts +118 -0
- package/src/service/embedding/remote.ts +45 -0
- package/src/service/metadata/factory.ts +52 -0
- package/src/service/metadata/interface.ts +19 -0
- package/src/service/metadata/local.ts +60 -0
- package/src/service/storage/factory.ts +16 -0
- package/src/service/storage/interface.ts +36 -0
- package/src/service/storage/local.ts +42 -0
- package/src/skills/contextlake-delete/SKILL.md +36 -0
- package/src/skills/contextlake-ingest/SKILL.md +40 -0
- package/src/skills/contextlake-list/SKILL.md +22 -0
- package/src/skills/contextlake-retrieve/SKILL.md +37 -0
- package/src/skills/las-data-profiler/SKILL.md +174 -0
- package/src/skills/las-data-profiler/index.ts +254 -0
- package/src/skills/las-data-profiler/register.ts +19 -0
- package/src/skills/las-data-profiler/s3_catalog.py +608 -0
- package/src/utils/config.ts +13 -0
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
import * as path from 'path';
|
|
2
|
+
import * as fs from 'fs';
|
|
3
|
+
import * as os from 'os';
|
|
4
|
+
import { execSync, spawn } from 'child_process';
|
|
5
|
+
|
|
6
|
+
// ---------------------------------------------------------------------------
|
|
7
|
+
// Types
|
|
8
|
+
// ---------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
/**
 * Input accepted by `connectDataSource`.
 */
export interface ConnectParams {
  /** Name of the datasource; also used as its directory name under the profiler base directory. */
  datasource_name: string;
  /** Storage vendor. `'local'` skips the endpoint and credential checks. */
  vendor: 'volcengine' | 'alibaba' | 'tencent' | 'aws' | 'local';
  /** Bucket to profile. */
  bucket: string;
  /** Key prefix to profile. May be an empty string, but must not be undefined or null. */
  prefix: string;

  /** Service endpoint. Required for every vendor except `'aws'` and `'local'`. */
  endpoint?: string;
  /** Access key. When omitted for a non-local vendor it is looked up in the environment. */
  access_key?: string;
  /** Secret key. When omitted for a non-local vendor it is looked up in the environment. */
  secret_key?: string;
  /** Region, forwarded to the profiling script as `--region` when set. */
  region?: string;
  /** Forwarded to the profiling script as `--sample-rows` when set to a non-zero value. */
  sample_rows?: number;
}
|
|
21
|
+
|
|
22
|
+
/**
 * Outcome reported by `connectDataSource`.
 */
interface ConnectResult {
  /** `'error'` when the profiling script exits with a non-zero code, otherwise `'success'`. */
  status: 'success' | 'error';
  /** Echo of the requested datasource name. */
  datasource_name: string;
  /** Location of the `catalog_db` directory inside the datasource directory. */
  db_path: string;
  /** Location of the generated `env.sh`. */
  env_path: string;
  /** Catalog table names reported on success; empty on error. */
  tables: string[];
  /** File counts parsed from the script output; absent when the output could not be parsed. */
  summary?: {
    total_files: number;
    structured_files: number;
    media_files: number;
  };
  /** Failure detail (script stderr or an exit-code message); only set on error. */
  error?: string;
}
|
|
35
|
+
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
// Constants
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
/** Root directory under the user's home where each datasource gets its own sub-directory. */
const BASE_DIR = path.join(os.homedir(), '.openclaw', 'las-data-profiler');

/** pip package names installed when the Python import probe fails. */
const PYTHON_DEPS = [
  'boto3',
  'lancedb',
  'pyarrow',
  'pandas',
  'Pillow',
  'mutagen',
  'pymupdf',
];
|
|
42
|
+
|
|
43
|
+
// ---------------------------------------------------------------------------
|
|
44
|
+
// Helpers
|
|
45
|
+
// ---------------------------------------------------------------------------
|
|
46
|
+
|
|
47
|
+
/**
 * Resolve the working directory for a datasource.
 *
 * The name comes from caller-supplied parameters, so it is checked to make
 * sure the resulting directory is strictly inside BASE_DIR. Without the
 * check a name such as `../../somewhere` would make the profiler create
 * directories and write env.sh (which holds credentials) outside its own tree.
 *
 * @param name Datasource name (may contain sub-directories, e.g. `team/ds1`).
 * @returns The normalised directory path for the datasource.
 * @throws Error when the name resolves to BASE_DIR itself or escapes it.
 */
function getDataSourceDir(name: string): string {
  const dir = path.join(BASE_DIR, name);
  const relative = path.relative(BASE_DIR, dir);

  // '' means the datasource would share BASE_DIR itself; '..' or '../x' means it escaped.
  const escapes = relative === '..' || relative.startsWith(`..${path.sep}`);
  if (relative === '' || escapes) {
    throw new Error(
      `Invalid datasource_name "${name}": it must resolve to a directory inside ${BASE_DIR}`
    );
  }
  return dir;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Create `dir` together with any missing parent directories.
 * Does nothing when the directory already exists.
 */
function ensureDir(dir: string): void {
  const options = { recursive: true };
  fs.mkdirSync(dir, options);
}
|
|
54
|
+
|
|
55
|
+
/**
 * Generate env.sh with all connection parameters for this datasource.
 * This file can be sourced to re-run the profiler or for debugging.
 *
 * Every value is written as a single-quoted shell word (embedded single
 * quotes are emitted as `'\''`), so characters such as `"`, `$`, backticks or
 * newlines in a parameter cannot be interpreted by the shell when the file is
 * sourced. The file holds credentials and is therefore restricted to the
 * owner (0600), including when an older env.sh already existed.
 *
 * @param dir    Datasource directory; env.sh is written directly inside it.
 * @param params Connection parameters to export. Optional values are only
 *               exported when they are set (truthy).
 * @returns The path of the written env.sh.
 */
function writeEnvFile(dir: string, params: ConnectParams): string {
  // Nothing is expanded inside '...'; a literal ' is closed, escaped, reopened.
  const shellQuote = (value: string | number): string =>
    `'${String(value).replace(/'/g, "'\\''")}'`;
  // Header comments must stay on one line, otherwise a newline in the name
  // would start a new (executable) line in the script.
  const singleLine = (value: string): string => value.replace(/[\r\n]+/g, ' ');

  const envPath = path.join(dir, 'env.sh');

  const optional: Array<[string, string | number | undefined]> = [
    ['LAS_ENDPOINT', params.endpoint],
    ['LAS_ACCESS_KEY', params.access_key],
    ['LAS_SECRET_KEY', params.secret_key],
    ['LAS_REGION', params.region],
    ['LAS_SAMPLE_ROWS', params.sample_rows],
  ];

  const lines: string[] = [
    '#!/usr/bin/env bash',
    '# Auto-generated by las-data-profiler connect',
    `# Datasource: ${singleLine(String(params.datasource_name))}`,
    `# Created: ${new Date().toISOString()}`,
    '',
    `export LAS_VENDOR=${shellQuote(params.vendor)}`,
    `export LAS_BUCKET=${shellQuote(params.bucket)}`,
    `export LAS_PREFIX=${shellQuote(params.prefix)}`,
  ];

  for (const [key, value] of optional) {
    if (value) {
      lines.push(`export ${key}=${shellQuote(value)}`);
    }
  }

  lines.push(`export LAS_DB_PATH=${shellQuote(path.join(dir, 'catalog_db'))}`);
  lines.push(`export LAS_DATASOURCE_NAME=${shellQuote(params.datasource_name)}`);
  lines.push('');

  fs.writeFileSync(envPath, lines.join('\n'), { mode: 0o600 });
  // The mode option only applies when the file is created; tighten an
  // existing file explicitly so credentials never sit in a wider-readable file.
  fs.chmodSync(envPath, 0o600);
  return envPath;
}
|
|
95
|
+
|
|
96
|
+
/**
 * Install Python dependencies if not already available.
 *
 * Availability is probed by importing every required module with `python3`.
 * When the probe fails for any reason the packages listed in PYTHON_DEPS are
 * installed with `python3 -m pip`, so they land in the same interpreter that
 * was probed (a bare `pip3` on PATH may belong to a different Python).
 *
 * @throws Error when the installation command itself fails.
 */
function ensurePythonDeps(): void {
  try {
    execSync(`python3 -c "import boto3, lancedb, pyarrow, pandas, PIL, mutagen, fitz"`, {
      stdio: 'pipe',
    });
  } catch {
    console.log('[las-data-profiler] Installing Python dependencies...');
    execSync(`python3 -m pip install --user ${PYTHON_DEPS.join(' ')}`, {
      stdio: 'inherit',
    });
  }
}
|
|
111
|
+
|
|
112
|
+
/**
 * Get the path to the bundled Python script.
 *
 * The script is looked up next to this module first. The published package
 * ships `s3_catalog.py` only under `src/skills/las-data-profiler`, not next to
 * the compiled output in `dist/src/skills/las-data-profiler`, so when the
 * co-located file is missing the `src` copy (four directories up from the
 * compiled module) is tried as well.
 *
 * @returns The first candidate that exists, or the co-located path when none
 *          does (so the caller's error still names the expected location).
 */
function getScriptPath(): string {
  const scriptName = 's3_catalog.py';
  const coLocated = path.join(__dirname, scriptName);
  const candidates = [
    coLocated,
    path.join(__dirname, '..', '..', '..', '..', 'src', 'skills', 'las-data-profiler', scriptName),
  ];
  return candidates.find((candidate) => fs.existsSync(candidate)) ?? coLocated;
}
|
|
119
|
+
|
|
120
|
+
// ---------------------------------------------------------------------------
|
|
121
|
+
// Main Entry
|
|
122
|
+
// ---------------------------------------------------------------------------
|
|
123
|
+
|
|
124
|
+
/**
 * Connect to a datasource and profile it into a local catalog database.
 *
 * Steps: validate `params`, resolve credentials (explicit parameters first,
 * then the TOS_*, S3_* and AWS_* environment variables), write env.sh into the
 * datasource directory, make sure the Python dependencies are importable, and
 * run the bundled `s3_catalog.py` with `python3`.
 *
 * The caller's `params` object is never modified; resolved credentials are
 * kept on an internal copy.
 *
 * @param params Connection parameters.
 * @param _ctx   Unused; accepted for the caller's convenience.
 * @returns Once the script has been launched the promise always resolves:
 *          with `status: 'error'` when python3 cannot be started or exits
 *          with a non-zero code, otherwise with `status: 'success'` (plus the
 *          summary when it could be parsed from the script output).
 * @throws Error (as a rejection) when a required parameter is missing, or
 *         when preparing the directory, env.sh or Python dependencies fails.
 */
export async function connectDataSource(
  params: ConnectParams,
  _ctx?: any
): Promise<ConnectResult> {
  // Validate required params
  if (!params.datasource_name) {
    throw new Error('datasource_name is required');
  }
  if (!params.vendor) {
    throw new Error('vendor is required');
  }
  if (!params.bucket) {
    throw new Error('bucket is required');
  }
  if (params.prefix === undefined || params.prefix === null) {
    throw new Error('prefix is required');
  }

  // Work on a copy so credentials picked up from the environment are not
  // written back into the object owned by the caller.
  const resolved: ConnectParams = { ...params };

  // For non-local vendors, validate credentials
  if (resolved.vendor !== 'local') {
    if (!resolved.endpoint && resolved.vendor !== 'aws') {
      throw new Error(`endpoint is required for vendor "${resolved.vendor}"`);
    }
    const ak = resolved.access_key || process.env.TOS_ACCESS_KEY || process.env.S3_ACCESS_KEY || process.env.AWS_ACCESS_KEY_ID;
    const sk = resolved.secret_key || process.env.TOS_SECRET_KEY || process.env.S3_SECRET_KEY || process.env.AWS_SECRET_ACCESS_KEY;
    if (!ak || !sk) {
      throw new Error(
        'access_key and secret_key are required (via params or env vars TOS_ACCESS_KEY/TOS_SECRET_KEY, S3_ACCESS_KEY/S3_SECRET_KEY, AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY)'
      );
    }
    // Normalise into the copy so env.sh and the CLI args pick them up
    resolved.access_key = ak;
    resolved.secret_key = sk;
  }

  const dsDir = getDataSourceDir(resolved.datasource_name);
  const dbPath = path.join(dsDir, 'catalog_db');

  ensureDir(dsDir);

  // 1. Write env.sh
  const envPath = writeEnvFile(dsDir, resolved);

  // 2. Ensure Python dependencies
  ensurePythonDeps();

  // 3. Build CLI args for the Python script
  const args: string[] = [
    getScriptPath(),
    '--vendor', resolved.vendor,
    '--bucket', resolved.bucket,
    '--prefix', resolved.prefix,
    '--db-path', dbPath,
  ];

  if (resolved.endpoint) {
    args.push('--endpoint', resolved.endpoint);
  }
  // NOTE(review): credentials passed as arguments are visible in process
  // listings; consider passing them through the environment if the script
  // supports it — confirm against s3_catalog.py.
  if (resolved.access_key) {
    args.push('--ak', resolved.access_key);
  }
  if (resolved.secret_key) {
    args.push('--sk', resolved.secret_key);
  }
  if (resolved.region) {
    args.push('--region', resolved.region);
  }
  if (resolved.sample_rows) {
    args.push('--sample-rows', String(resolved.sample_rows));
  }

  // 4. Execute the profiling script
  return new Promise<ConnectResult>((resolve) => {
    let stdout = '';
    let stderr = '';

    const fail = (message: string): void => {
      resolve({
        status: 'error',
        datasource_name: resolved.datasource_name,
        db_path: dbPath,
        env_path: envPath,
        tables: [],
        error: message,
      });
    };

    const proc = spawn('python3', args, {
      cwd: dsDir,
      env: { ...process.env },
    });

    proc.stdout.on('data', (data: Buffer) => {
      stdout += data.toString();
    });

    proc.stderr.on('data', (data: Buffer) => {
      stderr += data.toString();
    });

    // Without this listener a spawn failure (e.g. python3 not on PATH) is an
    // unhandled 'error' event and the promise would never settle. If 'close'
    // follows, its resolve() is a no-op because the promise is already settled.
    proc.on('error', (err: Error) => {
      fail(`Failed to run python3: ${err.message}`);
    });

    proc.on('close', (code: number | null) => {
      if (code !== 0) {
        fail(stderr || `Python script exited with code ${code}`);
        return;
      }

      // Try to parse structured output from the script; the summary is
      // best-effort and is simply omitted when the output is not valid JSON.
      let summary: ConnectResult['summary'];
      try {
        const jsonMatch = stdout.match(/\{[\s\S]*"summary"[\s\S]*\}/);
        const parsed = jsonMatch ? JSON.parse(jsonMatch[0]) : {};
        summary = parsed.summary || {
          total_files: 0,
          structured_files: 0,
          media_files: 0,
        };
      } catch {
        summary = undefined;
      }

      resolve({
        status: 'success',
        datasource_name: resolved.datasource_name,
        db_path: dbPath,
        env_path: envPath,
        tables: ['file_catalog', 'structured_schemas', 'media_metadata'],
        ...(summary ? { summary } : {}),
      });
    });
  });
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
// @ts-ignore
|
|
2
|
+
import { PluginContext } from 'openclaw/plugin-sdk';
|
|
3
|
+
import { connectDataSource } from './index';
|
|
4
|
+
|
|
5
|
+
/**
 * Register the las-data-profiler tool with the host.
 *
 * `ctx.registerTool` is used when it is a function; otherwise
 * `ctx.registerSkill` is tried. When the context offers neither, nothing is
 * registered.
 *
 * @param ctx Host context; also handed to `connectDataSource` on execution.
 */
export function registerLasDataProfilerSkill(ctx: any) {
  const skill = {
    name: 'las-data-profiler',
    description: 'Connect to a data source (TOS/OSS/COS/S3/Local) and profile its structure, schemas, and media metadata into LanceDB',
    execute: async (params: any) => await connectDataSource(params, ctx),
  };

  if (typeof ctx.registerTool === 'function') {
    ctx.registerTool(skill);
    return;
  }

  if (typeof ctx.registerSkill === 'function') {
    ctx.registerSkill(skill);
  }
}
|