@byted-las/contextlake-openclaw 1.0.3 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +45 -23
- package/dist/src/commands/cli.d.ts +1 -1
- package/dist/src/commands/cli.js +10 -14
- package/dist/src/commands/index.js +11 -4
- package/dist/src/commands/slashcmd.js +4 -9
- package/dist/src/commands/tools.d.ts +5 -0
- package/dist/src/commands/tools.js +180 -10
- package/dist/src/lib/actions/lance-tools.d.ts +13 -0
- package/dist/src/lib/actions/lance-tools.js +73 -0
- package/dist/src/lib/actions/las-tools.js +58 -0
- package/dist/src/lib/actions/profiler.d.ts +4 -3
- package/dist/src/lib/actions/profiler.js +156 -141
- package/dist/src/lib/actions/s3-tools.d.ts +21 -0
- package/dist/src/lib/actions/s3-tools.js +221 -0
- package/dist/src/skills/SKILL.md +14 -151
- package/dist/src/skills/las-data-profiler/SKILL.md +14 -151
- package/dist/src/utils/config.js +5 -4
- package/dist/src/utils/credentials.d.ts +4 -0
- package/openclaw.plugin.json +1 -1
- package/package.json +3 -1
- package/src/commands/cli.ts +10 -14
- package/src/commands/index.ts +16 -4
- package/src/commands/slashcmd.ts +4 -10
- package/src/commands/tools.ts +177 -12
- package/src/lib/actions/lance-tools.ts +58 -0
- package/src/lib/actions/las-tools.ts +56 -0
- package/src/lib/actions/profiler.ts +148 -157
- package/src/lib/actions/s3-tools.ts +203 -0
- package/src/skills/las-data-profiler/SKILL.md +14 -151
- package/src/utils/config.ts +5 -4
- package/src/utils/credentials.ts +6 -0
- package/src/lib/scripts/s3_catalog.py +0 -617
|
@@ -2,8 +2,35 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.getLasTools = getLasTools;
|
|
4
4
|
const las_api_1 = require("./las-api");
|
|
5
|
+
const s3_tools_1 = require("./s3-tools");
|
|
5
6
|
function getLasTools(pluginConfig, logger) {
|
|
6
7
|
const apiClient = new las_api_1.LasApiClient(pluginConfig, logger);
|
|
8
|
+
const processUrl = async (url) => {
|
|
9
|
+
if (!url)
|
|
10
|
+
return url;
|
|
11
|
+
if (url.startsWith('tos://')) {
|
|
12
|
+
// LAS operators prefer native tos:// paths when supported, leave as is
|
|
13
|
+
return url;
|
|
14
|
+
}
|
|
15
|
+
else if (url.startsWith('oss://') || url.startsWith('s3://') || url.startsWith('cos://') || url.startsWith('file://')) {
|
|
16
|
+
// Need presigned HTTP url for other vendors
|
|
17
|
+
logger.info(`[LasTools] Presigning URL for vendor: ${url}`);
|
|
18
|
+
try {
|
|
19
|
+
// If it's a file:// we also presign it to file:// which might not be supported by remote LAS,
|
|
20
|
+
// but local files typically need to be uploaded to TOS first. We'll leave file:// to fail or be handled elsewhere.
|
|
21
|
+
if (url.startsWith('file://'))
|
|
22
|
+
return url;
|
|
23
|
+
const urlParts = new URL(url);
|
|
24
|
+
const key = urlParts.pathname.replace(/^\//, '');
|
|
25
|
+
return await (0, s3_tools_1.getPresignedUrl)({ url }, key, 3600);
|
|
26
|
+
}
|
|
27
|
+
catch (e) {
|
|
28
|
+
logger.warn(`[LasTools] Failed to presign URL: ${url}`, { error: e.message });
|
|
29
|
+
return url; // fallback to original
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
return url;
|
|
33
|
+
};
|
|
7
34
|
const callApi = async (method, args) => {
|
|
8
35
|
try {
|
|
9
36
|
// @ts-ignore
|
|
@@ -33,6 +60,9 @@ Parameters in data:
|
|
|
33
60
|
required: ['data']
|
|
34
61
|
},
|
|
35
62
|
async execute(toolCallId, params) {
|
|
63
|
+
if (params.data?.image) {
|
|
64
|
+
params.data.image = await processUrl(params.data.image);
|
|
65
|
+
}
|
|
36
66
|
return await callApi('process', ['las_image_resample', params.data]);
|
|
37
67
|
}
|
|
38
68
|
},
|
|
@@ -53,6 +83,9 @@ Parameters in data:
|
|
|
53
83
|
required: ['data']
|
|
54
84
|
},
|
|
55
85
|
async execute(toolCallId, params) {
|
|
86
|
+
if (params.data?.input_path) {
|
|
87
|
+
params.data.input_path = await processUrl(params.data.input_path);
|
|
88
|
+
}
|
|
56
89
|
return await callApi('process', ['las_audio_extract_and_split', params.data]);
|
|
57
90
|
}
|
|
58
91
|
},
|
|
@@ -71,6 +104,9 @@ Parameters in data:
|
|
|
71
104
|
required: ['data']
|
|
72
105
|
},
|
|
73
106
|
async execute(toolCallId, params) {
|
|
107
|
+
if (params.data?.input_path) {
|
|
108
|
+
params.data.input_path = await processUrl(params.data.input_path);
|
|
109
|
+
}
|
|
74
110
|
return await callApi('process', ['las_audio_convert', params.data]);
|
|
75
111
|
}
|
|
76
112
|
},
|
|
@@ -89,6 +125,9 @@ Parameters in data:
|
|
|
89
125
|
required: ['data']
|
|
90
126
|
},
|
|
91
127
|
async execute(toolCallId, params) {
|
|
128
|
+
if (params.data?.audio?.url) {
|
|
129
|
+
params.data.audio.url = await processUrl(params.data.audio.url);
|
|
130
|
+
}
|
|
92
131
|
return await callApi('submitAndPoll', ['las_asr_pro', params.data]);
|
|
93
132
|
}
|
|
94
133
|
},
|
|
@@ -105,6 +144,9 @@ Parameters in data:
|
|
|
105
144
|
required: ['data']
|
|
106
145
|
},
|
|
107
146
|
async execute(toolCallId, params) {
|
|
147
|
+
if (params.data?.audio?.url) {
|
|
148
|
+
params.data.audio.url = await processUrl(params.data.audio.url);
|
|
149
|
+
}
|
|
108
150
|
return await callApi('submitAndPoll', ['las_seed_2_0', params.data]);
|
|
109
151
|
}
|
|
110
152
|
},
|
|
@@ -128,6 +170,13 @@ Parameters:
|
|
|
128
170
|
required: ['model', 'input']
|
|
129
171
|
},
|
|
130
172
|
async execute(toolCallId, params) {
|
|
173
|
+
if (params.input && Array.isArray(params.input)) {
|
|
174
|
+
for (const item of params.input) {
|
|
175
|
+
if (item.type === 'image_url' && item.image_url?.url) {
|
|
176
|
+
item.image_url.url = await processUrl(item.image_url.url);
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
}
|
|
131
180
|
return await callApi('multimodalEmbedding', [
|
|
132
181
|
params.model,
|
|
133
182
|
params.input,
|
|
@@ -151,6 +200,9 @@ Parameters in data:
|
|
|
151
200
|
required: ['data']
|
|
152
201
|
},
|
|
153
202
|
async execute(toolCallId, params) {
|
|
203
|
+
if (params.data?.video_url) {
|
|
204
|
+
params.data.video_url = await processUrl(params.data.video_url);
|
|
205
|
+
}
|
|
154
206
|
return await callApi('submitAndPoll', ['las_long_video_understand', params.data]);
|
|
155
207
|
}
|
|
156
208
|
},
|
|
@@ -169,6 +221,9 @@ Parameters in data:
|
|
|
169
221
|
required: ['data']
|
|
170
222
|
},
|
|
171
223
|
async execute(toolCallId, params) {
|
|
224
|
+
if (params.data?.url) {
|
|
225
|
+
params.data.url = await processUrl(params.data.url);
|
|
226
|
+
}
|
|
172
227
|
return await callApi('submitAndPoll', ['las_pdf_parse_doubao', params.data]);
|
|
173
228
|
}
|
|
174
229
|
},
|
|
@@ -187,6 +242,9 @@ Parameters in data:
|
|
|
187
242
|
required: ['data']
|
|
188
243
|
},
|
|
189
244
|
async execute(toolCallId, params) {
|
|
245
|
+
if (params.data?.video_url) {
|
|
246
|
+
params.data.video_url = await processUrl(params.data.video_url);
|
|
247
|
+
}
|
|
190
248
|
return await callApi('submitAndPoll', ['las_video_resize', params.data]);
|
|
191
249
|
}
|
|
192
250
|
}
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
/** Input accepted when connecting (profiling) a data source. */
export interface ConnectParams {
    /** Name of the data source; required. */
    datasource_name: string;

    /** Location to scan, e.g. `tos://bucket/prefix`; required. */
    url: string;

    /** Storage vendor. */
    vendor?: 'volcengine' | 'alibaba' | 'tencent' | 'aws' | 'local';

    /** Storage service endpoint. */
    endpoint?: string;

    /** Access key for the storage service. */
    access_key?: string;

    /** Secret key for the storage service. */
    secret_key?: string;

    /** Storage region. */
    region?: string;

    /** Bucket name. */
    bucket?: string;

    /** Key prefix inside the bucket. */
    prefix?: string;

    /** NOTE(review): presumably the number of rows to sample per file — confirm against the implementation. */
    sample_rows?: number;
}
|
|
12
13
|
export interface ConnectResult {
|
|
@@ -38,25 +38,16 @@ exports.listDataSources = listDataSources;
|
|
|
38
38
|
const path = __importStar(require("path"));
|
|
39
39
|
const fs = __importStar(require("fs"));
|
|
40
40
|
const os = __importStar(require("os"));
|
|
41
|
-
const
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
// ---------------------------------------------------------------------------
|
|
41
|
+
const s3_tools_1 = require("./s3-tools");
|
|
42
|
+
const lance_tools_1 = require("./lance-tools");
|
|
43
|
+
const mime = __importStar(require("mime-types"));
|
|
45
44
|
const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
|
|
46
|
-
const PYTHON_DEPS = ['boto3', 'lancedb', 'pyarrow', 'pandas', 'Pillow', 'mutagen', 'pymupdf'];
|
|
47
|
-
// ---------------------------------------------------------------------------
|
|
48
|
-
// Helpers
|
|
49
|
-
// ---------------------------------------------------------------------------
|
|
50
45
|
function getDataSourceDir(name) {
|
|
51
46
|
return path.join(BASE_DIR, name);
|
|
52
47
|
}
|
|
53
48
|
/** Creates `dir` together with any missing parent directories (recursive mkdir). */
function ensureDir(dir) {
    const mkdirOptions = { recursive: true };
    fs.mkdirSync(dir, mkdirOptions);
}
|
|
56
|
-
/**
|
|
57
|
-
* Generate env.sh with all connection parameters for this datasource.
|
|
58
|
-
* This file can be sourced to re-run the profiler or for debugging.
|
|
59
|
-
*/
|
|
60
51
|
function writeEnvFile(dir, params) {
|
|
61
52
|
const envPath = path.join(dir, 'env.sh');
|
|
62
53
|
const lines = [
|
|
@@ -69,166 +60,190 @@ function writeEnvFile(dir, params) {
|
|
|
69
60
|
`export LAS_BUCKET="${params.bucket}"`,
|
|
70
61
|
`export LAS_PREFIX="${params.prefix}"`,
|
|
71
62
|
];
|
|
72
|
-
if (params.endpoint)
|
|
63
|
+
if (params.endpoint)
|
|
73
64
|
lines.push(`export LAS_ENDPOINT="${params.endpoint}"`);
|
|
74
|
-
|
|
75
|
-
if (params.access_key) {
|
|
65
|
+
if (params.access_key)
|
|
76
66
|
lines.push(`export LAS_ACCESS_KEY="${params.access_key}"`);
|
|
77
|
-
|
|
78
|
-
if (params.secret_key) {
|
|
67
|
+
if (params.secret_key)
|
|
79
68
|
lines.push(`export LAS_SECRET_KEY="${params.secret_key}"`);
|
|
80
|
-
|
|
81
|
-
if (params.region) {
|
|
69
|
+
if (params.region)
|
|
82
70
|
lines.push(`export LAS_REGION="${params.region}"`);
|
|
83
|
-
|
|
84
|
-
if (params.sample_rows) {
|
|
71
|
+
if (params.sample_rows)
|
|
85
72
|
lines.push(`export LAS_SAMPLE_ROWS="${params.sample_rows}"`);
|
|
86
|
-
}
|
|
87
73
|
lines.push(`export LAS_DB_PATH="${path.join(dir, 'catalog_db')}"`);
|
|
88
74
|
lines.push(`export LAS_DATASOURCE_NAME="${params.datasource_name}"`);
|
|
89
75
|
lines.push('');
|
|
90
76
|
fs.writeFileSync(envPath, lines.join('\n'), { mode: 0o600 });
|
|
91
77
|
return envPath;
|
|
92
78
|
}
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
*/
|
|
112
|
-
function getScriptPath() {
|
|
113
|
-
// The Python script is located in the scripts directory
|
|
114
|
-
return path.join(__dirname, '../scripts', 's3_catalog.py');
|
|
79
|
+
/**
 * File-type groups recognised by the profiler. Each extension is lower-case
 * and carries its leading dot.
 */
const FILE_TYPE_GROUPS = [
    { category: 'structured', mediaType: '', exts: ['.json', '.jsonl', '.ndjson', '.csv', '.tsv', '.parquet', '.pq'] },
    { category: 'non-structured', mediaType: 'image', exts: ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.tiff', '.tif', '.svg', '.ico', '.heic', '.heif'] },
    { category: 'non-structured', mediaType: 'audio', exts: ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a', '.wma', '.opus'] },
    { category: 'non-structured', mediaType: 'video', exts: ['.mp4', '.avi', '.mov', '.mkv', '.webm', '.wmv', '.flv', '.m4v', '.3gp'] },
    { category: 'non-structured', mediaType: 'pdf', exts: ['.pdf'] },
];
/**
 * Extension → group lookup, built once at module load. classifyFile runs once
 * per scanned object, so the tables should not be rebuilt on every call.
 */
const FILE_TYPE_BY_EXT = new Map();
for (const group of FILE_TYPE_GROUPS) {
    for (const groupExt of group.exts) {
        FILE_TYPE_BY_EXT.set(groupExt, group);
    }
}
/**
 * Classifies a file by its extension.
 *
 * @param ext File extension including the leading dot (any letter case), e.g. `.CSV`.
 * @returns `{ category, mediaType }`: `category` is `'structured'` or
 *   `'non-structured'`; `mediaType` is `'image'`, `'audio'`, `'video'`, `'pdf'`
 *   or `''`. Unknown extensions yield `{ category: 'non-structured', mediaType: '' }`.
 */
function classifyFile(ext) {
    const match = FILE_TYPE_BY_EXT.get(ext.toLowerCase());
    if (!match)
        return { category: 'non-structured', mediaType: '' };
    // Hand back a fresh object so callers can never mutate the shared table.
    return { category: match.category, mediaType: match.mediaType };
}
|
|
116
|
-
// ---------------------------------------------------------------------------
|
|
117
|
-
// Main Entry
|
|
118
|
-
// ---------------------------------------------------------------------------
|
|
119
98
|
async function connectDataSource(params, _ctx) {
|
|
120
|
-
|
|
121
|
-
if (!params.datasource_name) {
|
|
99
|
+
if (!params.datasource_name)
|
|
122
100
|
throw new Error('datasource_name is required');
|
|
101
|
+
if (!params.url)
|
|
102
|
+
throw new Error('url is required (e.g. tos://bucket/prefix)');
|
|
103
|
+
// Parse URL: tos://bucket/prefix
|
|
104
|
+
try {
|
|
105
|
+
if (params.url.startsWith('file://') || params.url.startsWith('/')) {
|
|
106
|
+
params.vendor = 'local';
|
|
107
|
+
const localPath = params.url.startsWith('file://') ? params.url.slice(7) : params.url;
|
|
108
|
+
params.bucket = localPath;
|
|
109
|
+
params.prefix = '.';
|
|
110
|
+
}
|
|
111
|
+
else {
|
|
112
|
+
const parsedUrl = new URL(params.url);
|
|
113
|
+
const protocol = parsedUrl.protocol.replace(':', '');
|
|
114
|
+
if (['tos', 'oss', 'cos', 's3'].includes(protocol)) {
|
|
115
|
+
if (protocol === 'tos')
|
|
116
|
+
params.vendor = 'volcengine';
|
|
117
|
+
else if (protocol === 'oss')
|
|
118
|
+
params.vendor = 'alibaba';
|
|
119
|
+
else if (protocol === 'cos')
|
|
120
|
+
params.vendor = 'tencent';
|
|
121
|
+
else if (protocol === 's3')
|
|
122
|
+
params.vendor = 'aws';
|
|
123
|
+
params.bucket = parsedUrl.hostname;
|
|
124
|
+
params.prefix = parsedUrl.pathname.replace(/^\//, ''); // Remove leading slash
|
|
125
|
+
}
|
|
126
|
+
else {
|
|
127
|
+
throw new Error(`Unsupported protocol: ${protocol}`);
|
|
128
|
+
}
|
|
129
|
+
}
|
|
123
130
|
}
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
throw new Error('bucket is required');
|
|
129
|
-
}
|
|
130
|
-
if (params.prefix === undefined || params.prefix === null) {
|
|
131
|
-
throw new Error('prefix is required');
|
|
131
|
+
catch (e) {
|
|
132
|
+
if (!params.vendor || !params.bucket || params.prefix === undefined) {
|
|
133
|
+
throw new Error(`Invalid url format: ${e.message}`);
|
|
134
|
+
}
|
|
132
135
|
}
|
|
133
|
-
// For non-local vendors, validate credentials
|
|
134
136
|
if (params.vendor !== 'local') {
|
|
135
|
-
if (!params.endpoint && params.vendor !== 'aws')
|
|
137
|
+
if (!params.endpoint && params.vendor !== 'aws')
|
|
136
138
|
throw new Error(`endpoint is required for vendor "${params.vendor}"`);
|
|
139
|
+
let ak = params.access_key;
|
|
140
|
+
let sk = params.secret_key;
|
|
141
|
+
if (!ak || !sk) {
|
|
142
|
+
try {
|
|
143
|
+
const { loadCredentials } = require('../../utils/credentials');
|
|
144
|
+
const creds = loadCredentials();
|
|
145
|
+
ak = ak || creds.ACCESS_KEY || creds.VOLCENGINE_ACCESS_KEY;
|
|
146
|
+
sk = sk || creds.SECRET_KEY || creds.VOLCENGINE_SECRET_KEY;
|
|
147
|
+
}
|
|
148
|
+
catch (e) {
|
|
149
|
+
// ignore
|
|
150
|
+
}
|
|
137
151
|
}
|
|
138
|
-
const ak = params.access_key || process.env.TOS_ACCESS_KEY || process.env.S3_ACCESS_KEY || process.env.AWS_ACCESS_KEY_ID;
|
|
139
|
-
const sk = params.secret_key || process.env.TOS_SECRET_KEY || process.env.S3_SECRET_KEY || process.env.AWS_SECRET_ACCESS_KEY;
|
|
140
152
|
if (!ak || !sk) {
|
|
141
|
-
throw new Error('access_key and secret_key are required
|
|
153
|
+
throw new Error('access_key and secret_key are required');
|
|
142
154
|
}
|
|
143
|
-
// Normalise into params so env.sh picks them up
|
|
144
155
|
params.access_key = ak;
|
|
145
156
|
params.secret_key = sk;
|
|
146
157
|
}
|
|
147
158
|
const dsDir = getDataSourceDir(params.datasource_name);
|
|
148
159
|
const dbPath = path.join(dsDir, 'catalog_db');
|
|
149
160
|
ensureDir(dsDir);
|
|
150
|
-
// 1. Write env.sh
|
|
151
161
|
const envPath = writeEnvFile(dsDir, params);
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
'
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
// Try to parse structured output from the script
|
|
205
|
-
try {
|
|
206
|
-
const jsonMatch = stdout.match(/\{[\s\S]*"summary"[\s\S]*\}/);
|
|
207
|
-
const result = jsonMatch ? JSON.parse(jsonMatch[0]) : {};
|
|
208
|
-
resolve({
|
|
209
|
-
status: 'success',
|
|
210
|
-
datasource_name: params.datasource_name,
|
|
211
|
-
db_path: dbPath,
|
|
212
|
-
env_path: envPath,
|
|
213
|
-
tables: ['file_catalog', 'structured_schemas', 'media_metadata'],
|
|
214
|
-
summary: result.summary || {
|
|
215
|
-
total_files: 0,
|
|
216
|
-
structured_files: 0,
|
|
217
|
-
media_files: 0,
|
|
218
|
-
},
|
|
219
|
-
});
|
|
220
|
-
}
|
|
221
|
-
catch {
|
|
222
|
-
resolve({
|
|
223
|
-
status: 'success',
|
|
224
|
-
datasource_name: params.datasource_name,
|
|
225
|
-
db_path: dbPath,
|
|
226
|
-
env_path: envPath,
|
|
227
|
-
tables: ['file_catalog', 'structured_schemas', 'media_metadata'],
|
|
162
|
+
try {
|
|
163
|
+
let isTruncated = true;
|
|
164
|
+
let continuationToken = undefined;
|
|
165
|
+
let total_files = 0;
|
|
166
|
+
let structured_files = 0;
|
|
167
|
+
let media_files = 0;
|
|
168
|
+
const allRecords = [];
|
|
169
|
+
const scan_ts = new Date().toISOString() + 'Z';
|
|
170
|
+
while (isTruncated) {
|
|
171
|
+
const response = await (0, s3_tools_1.listS3Objects)({
|
|
172
|
+
vendor: params.vendor,
|
|
173
|
+
bucket: params.bucket,
|
|
174
|
+
endpoint: params.endpoint,
|
|
175
|
+
access_key: params.access_key,
|
|
176
|
+
secret_key: params.secret_key,
|
|
177
|
+
region: params.region
|
|
178
|
+
}, params.prefix || '', 1000, continuationToken);
|
|
179
|
+
for (const obj of response.Contents) {
|
|
180
|
+
const key = obj.Key || '';
|
|
181
|
+
if (key.endsWith('/'))
|
|
182
|
+
continue;
|
|
183
|
+
const name = path.basename(key);
|
|
184
|
+
const ext = path.extname(name).toLowerCase();
|
|
185
|
+
const mimeType = mime.lookup(name) || '';
|
|
186
|
+
const { category, mediaType } = classifyFile(ext);
|
|
187
|
+
const depth = (key.match(/\//g) || []).length;
|
|
188
|
+
const parentDir = key.includes('/') ? path.basename(path.dirname(key)) : '';
|
|
189
|
+
total_files++;
|
|
190
|
+
if (category === 'structured')
|
|
191
|
+
structured_files++;
|
|
192
|
+
if (mediaType)
|
|
193
|
+
media_files++;
|
|
194
|
+
allRecords.push({
|
|
195
|
+
file_path: key,
|
|
196
|
+
file_name: name,
|
|
197
|
+
extension: ext,
|
|
198
|
+
mime_type: mimeType,
|
|
199
|
+
category: category,
|
|
200
|
+
media_type: mediaType,
|
|
201
|
+
size_bytes: obj.Size || 0,
|
|
202
|
+
last_modified: obj.LastModified ? String(obj.LastModified) : '',
|
|
203
|
+
created_time: obj._created_time ? String(obj._created_time) : '',
|
|
204
|
+
etag: (obj.ETag || '').replace(/"/g, ''),
|
|
205
|
+
storage_class: obj.StorageClass || '',
|
|
206
|
+
is_multipart: (obj.ETag || '').includes('-'),
|
|
207
|
+
depth: depth,
|
|
208
|
+
parent_dir: parentDir,
|
|
209
|
+
vendor: params.vendor,
|
|
210
|
+
bucket: params.bucket,
|
|
211
|
+
has_schema: false,
|
|
212
|
+
has_media_meta: false,
|
|
213
|
+
scan_timestamp: scan_ts
|
|
228
214
|
});
|
|
229
215
|
}
|
|
216
|
+
isTruncated = response.IsTruncated || false;
|
|
217
|
+
continuationToken = response.NextContinuationToken;
|
|
218
|
+
}
|
|
219
|
+
await (0, lance_tools_1.writeLanceCatalog)({
|
|
220
|
+
db_path: dbPath,
|
|
221
|
+
table_name: 'file_catalog',
|
|
222
|
+
records: allRecords
|
|
230
223
|
});
|
|
231
|
-
|
|
224
|
+
return {
|
|
225
|
+
status: 'success',
|
|
226
|
+
datasource_name: params.datasource_name,
|
|
227
|
+
db_path: dbPath,
|
|
228
|
+
env_path: envPath,
|
|
229
|
+
tables: ['file_catalog'],
|
|
230
|
+
summary: {
|
|
231
|
+
total_files,
|
|
232
|
+
structured_files,
|
|
233
|
+
media_files
|
|
234
|
+
}
|
|
235
|
+
};
|
|
236
|
+
}
|
|
237
|
+
catch (error) {
|
|
238
|
+
return {
|
|
239
|
+
status: 'error',
|
|
240
|
+
datasource_name: params.datasource_name,
|
|
241
|
+
db_path: dbPath,
|
|
242
|
+
env_path: envPath,
|
|
243
|
+
tables: [],
|
|
244
|
+
error: error.message
|
|
245
|
+
};
|
|
246
|
+
}
|
|
232
247
|
}
|
|
233
248
|
async function listDataSources(_ctx) {
|
|
234
249
|
try {
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/** Connection details accepted by the S3 helper functions; every field is optional. */
export interface S3Params {
    /** Storage URL. NOTE(review): presumably of the form `<scheme>://bucket/key` — confirm against `parseS3Url`. */
    url?: string;

    /** Storage vendor. */
    vendor?: 'volcengine' | 'alibaba' | 'tencent' | 'aws' | 'local';

    /** Storage service endpoint. */
    endpoint?: string;

    /** Access key for the storage service. */
    access_key?: string;

    /** Secret key for the storage service. */
    secret_key?: string;

    /** Storage region. */
    region?: string;

    /** Bucket name. */
    bucket?: string;
}
|
|
10
|
+
/**
 * Takes a set of S3 parameters and returns a set of S3 parameters.
 *
 * NOTE(review): presumably fills in fields such as `vendor` / `bucket` from
 * `params.url` — the implementation is not visible here; confirm.
 */
export declare function parseS3Url(params: S3Params): S3Params;
|
|
11
|
+
/**
 * Lists one page of objects under `prefix`.
 *
 * @param params Connection details of the store to list.
 * @param prefix Key prefix to list under.
 * @param maxKeys Optional page size.
 * @param continuationToken Optional token from a previous page's
 *   `NextContinuationToken`, used to request the following page.
 * @returns A page with `Contents`, `IsTruncated` and `NextContinuationToken`.
 *   One result shape never carries a continuation token; the other uses the
 *   `@aws-sdk/client-s3` object type and may leave `IsTruncated` /
 *   `NextContinuationToken` undefined.
 */
export declare function listS3Objects(
    params: S3Params,
    prefix: string,
    maxKeys?: number,
    continuationToken?: string
): Promise<{
    Contents: any[];
    IsTruncated: boolean;
    NextContinuationToken: undefined;
} | {
    Contents: import("@aws-sdk/client-s3")._Object[];
    IsTruncated: boolean | undefined;
    NextContinuationToken: string | undefined;
}>;
|
|
20
|
+
/**
 * Reads the object stored under `key` and resolves with its bytes.
 *
 * @param params Connection details of the store to read from.
 * @param key Object key.
 * @param maxBytes Optional limit. NOTE(review): presumably caps how many bytes
 *   are read — confirm against the implementation.
 */
export declare function readS3Object(params: S3Params, key: string, maxBytes?: number): Promise<Buffer>;
|
|
21
|
+
/**
 * Resolves with a presigned URL for the object stored under `key`.
 *
 * @param params Connection details of the store holding the object.
 * @param key Object key.
 * @param expiresIn Optional lifetime of the URL. NOTE(review): presumably in
 *   seconds — confirm against the implementation.
 */
export declare function getPresignedUrl(params: S3Params, key: string, expiresIn?: number): Promise<string>;
|