@byted-las/contextlake-openclaw 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/client/lancedb.js +1 -1
- package/dist/src/commands/cli.d.ts +2 -1
- package/dist/src/commands/cli.js +31 -8
- package/dist/src/commands/index.js +25 -6
- package/dist/src/commands/slashcmd.d.ts +6 -0
- package/dist/src/commands/slashcmd.js +90 -6
- package/dist/src/commands/tools.d.ts +2 -0
- package/dist/src/commands/tools.js +44 -37
- package/dist/src/lib/actions/ingest-source.d.ts +15 -0
- package/dist/src/lib/actions/ingest-source.js +193 -0
- package/dist/src/lib/actions/ingest.d.ts +13 -7
- package/dist/src/lib/actions/ingest.js +133 -58
- package/dist/src/lib/actions/las-api.d.ts +13 -0
- package/dist/src/lib/actions/las-api.js +105 -0
- package/dist/src/lib/actions/las-tools.d.ts +3 -0
- package/dist/src/lib/actions/las-tools.js +194 -0
- package/dist/src/lib/actions/las.d.ts +64 -0
- package/dist/src/lib/actions/las.js +72 -0
- package/dist/src/lib/actions/profiler.d.ts +3 -0
- package/dist/src/lib/actions/profiler.js +17 -1
- package/dist/src/lib/actions/retrieve.js +2 -8
- package/dist/src/lib/scripts/s3_catalog.py +10 -1
- package/dist/src/service/embedding/factory.js +1 -10
- package/dist/src/service/embedding/interface.d.ts +5 -0
- package/dist/src/service/embedding/remote.d.ts +1 -0
- package/dist/src/service/embedding/remote.js +31 -0
- package/dist/src/service/metadata/interface.d.ts +1 -0
- package/dist/src/service/metadata/local.d.ts +1 -0
- package/dist/src/service/metadata/local.js +6 -0
- package/dist/src/utils/config.js +11 -2
- package/dist/src/utils/credentials.d.ts +8 -0
- package/dist/src/utils/credentials.js +77 -0
- package/openclaw.plugin.json +1 -1
- package/package.json +1 -5
- package/src/client/lancedb.ts +1 -1
- package/src/commands/cli.ts +35 -9
- package/src/commands/index.ts +30 -6
- package/src/commands/slashcmd.ts +67 -7
- package/src/commands/tools.ts +49 -41
- package/src/lib/actions/ingest.ts +151 -71
- package/src/lib/actions/las-api.ts +119 -0
- package/src/lib/actions/las-tools.ts +196 -0
- package/src/lib/actions/profiler.ts +18 -1
- package/src/lib/actions/retrieve.ts +2 -10
- package/src/lib/scripts/s3_catalog.py +10 -1
- package/src/service/embedding/factory.ts +1 -8
- package/src/service/embedding/interface.ts +6 -0
- package/src/service/embedding/remote.ts +36 -0
- package/src/service/metadata/interface.ts +1 -0
- package/src/service/metadata/local.ts +7 -0
- package/src/utils/config.ts +13 -2
- package/src/utils/credentials.ts +50 -0
- package/bin/contextlake-openclaw.js +0 -5
- package/src/service/embedding/local.ts +0 -121
|
@@ -97,7 +97,7 @@ class ContextLakeLanceDBClient {
|
|
|
97
97
|
}
|
|
98
98
|
return await fallbackQuery.toArray();
|
|
99
99
|
}
|
|
100
|
-
const vector = await this.embeddingProvider.
|
|
100
|
+
const vector = await this.embeddingProvider.generateMultimodalEmbedding([{ type: 'text', text: query }]);
|
|
101
101
|
// @ts-ignore
|
|
102
102
|
let search = table.vectorSearch(vector).limit(normalizedLimit);
|
|
103
103
|
if (filter) {
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import { ContextLakeConfig } from '../utils/config';
|
|
2
2
|
export declare function getCliCommands(pluginConfig: ContextLakeConfig, logger: any): {
|
|
3
3
|
connectAction: (datasource_name: string, options: any) => Promise<void>;
|
|
4
|
-
ingestAction: (
|
|
4
|
+
ingestAction: (datasource_name: string) => Promise<void>;
|
|
5
5
|
searchAction: (query: any, options: any) => Promise<void>;
|
|
6
6
|
listAction: (options: any) => Promise<void>;
|
|
7
7
|
deleteAction: (options: any) => Promise<void>;
|
|
8
|
+
onboardAction: () => Promise<void>;
|
|
8
9
|
};
|
package/dist/src/commands/cli.js
CHANGED
|
@@ -6,6 +6,7 @@ const ingest_1 = require("../lib/actions/ingest");
|
|
|
6
6
|
const retrieve_1 = require("../lib/actions/retrieve");
|
|
7
7
|
const manage_1 = require("../lib/actions/manage");
|
|
8
8
|
const profiler_1 = require("../lib/actions/profiler");
|
|
9
|
+
const credentials_1 = require("../utils/credentials");
|
|
9
10
|
function parseOptionalInt(value, fallback) {
|
|
10
11
|
const parsed = Number.parseInt(String(value), 10);
|
|
11
12
|
return Number.isFinite(parsed) ? parsed : fallback;
|
|
@@ -67,15 +68,11 @@ function getCliCommands(pluginConfig, logger) {
|
|
|
67
68
|
process.exitCode = 1;
|
|
68
69
|
}
|
|
69
70
|
},
|
|
70
|
-
ingestAction: async (
|
|
71
|
-
logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest started`, {
|
|
71
|
+
ingestAction: async (datasource_name) => {
|
|
72
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest started`, { datasource_name });
|
|
72
73
|
try {
|
|
73
|
-
const
|
|
74
|
-
|
|
75
|
-
files,
|
|
76
|
-
metadata,
|
|
77
|
-
chunkSize: parseOptionalInt(options.chunkSize, 500),
|
|
78
|
-
overlap: parseOptionalInt(options.overlap, 50)
|
|
74
|
+
const result = await (0, ingest_1.ingestSource)({
|
|
75
|
+
datasource_name
|
|
79
76
|
}, pluginConfig, logger);
|
|
80
77
|
// eslint-disable-next-line no-console
|
|
81
78
|
console.log(JSON.stringify(result, null, 2));
|
|
@@ -134,6 +131,32 @@ function getCliCommands(pluginConfig, logger) {
|
|
|
134
131
|
console.error('Error:', e.message);
|
|
135
132
|
logger.error(`[${new Date().toISOString()}] [ContextLake] CLI delete failed`, { error: e.message, stack: e.stack });
|
|
136
133
|
}
|
|
134
|
+
},
|
|
135
|
+
/**
 * CLI `onboard` action: interactively collects the ContextLake credentials
 * and persists them.
 *
 * Flow: load the stored credentials, prompt for each key in turn (the stored
 * value is handed to the prompt as its second argument), then save the
 * collected answers in one call.
 *
 * Never rejects: any failure is printed to stderr, logged with its stack, and
 * reported to the shell through a non-zero exit code.
 */
onboardAction: async () => {
    logger.info(`[${new Date().toISOString()}] [ContextLake] CLI onboard started`);
    try {
        const currentCreds = (0, credentials_1.loadCredentials)();
        // eslint-disable-next-line no-console
        console.log('Welcome to ContextLake Onboarding!');
        // eslint-disable-next-line no-console
        console.log('Please provide your credentials below. Press enter to keep the current value.');
        // Prompts run strictly one after another, in this order.
        const credentialKeys = ['LAS_API_KEY', 'VOLCENGINE_ACCESS_KEY', 'VOLCENGINE_SECRET_KEY'];
        const newCreds = {};
        for (const key of credentialKeys) {
            // NOTE(review): promptForInput presumably returns the current value
            // on empty input (that is what the banner promises) - confirm in
            // utils/credentials.
            newCreds[key] = await (0, credentials_1.promptForInput)(key, currentCreds[key]);
        }
        (0, credentials_1.saveCredentials)(newCreds);
        // eslint-disable-next-line no-console
        console.log('Credentials saved successfully!');
        logger.info(`[${new Date().toISOString()}] [ContextLake] CLI onboard success`);
    }
    catch (e) {
        console.error('Error during onboarding:', e.message);
        logger.error(`[${new Date().toISOString()}] [ContextLake] CLI onboard failed`, { error: e.message, stack: e.stack });
        // Without this the process exits 0 and scripted callers cannot tell
        // that no credentials were saved.
        process.exitCode = 1;
    }
}
|
|
138
161
|
};
|
|
139
162
|
}
|
|
@@ -20,6 +20,12 @@ function registerAll(ctx, logger) {
|
|
|
20
20
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.deleteTool.name}`);
|
|
21
21
|
ctx.registerTool(tools.lasDataProfilerTool);
|
|
22
22
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.lasDataProfilerTool.name}`);
|
|
23
|
+
ctx.registerTool(tools.listDatasourceTool);
|
|
24
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listDatasourceTool.name}`);
|
|
25
|
+
for (const lasTool of tools.lasTools) {
|
|
26
|
+
ctx.registerTool(lasTool);
|
|
27
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${lasTool.name}`);
|
|
28
|
+
}
|
|
23
29
|
}
|
|
24
30
|
catch (error) {
|
|
25
31
|
logger.error(`[${new Date().toISOString()}] [ContextLake] Error registering agent tools: ${error.message}${error.stack ? '\\n' + error.stack : ''}`);
|
|
@@ -46,11 +52,8 @@ function registerAll(ctx, logger) {
|
|
|
46
52
|
.option('--sample-rows <number>', 'Number of rows to sample per structured file', '100')
|
|
47
53
|
.action(commands.connectAction);
|
|
48
54
|
// Ingest
|
|
49
|
-
contextlake.command('ingest <
|
|
50
|
-
.description('
|
|
51
|
-
.option('-c, --chunk-size <number>', 'Chunk size for text splitting', '500')
|
|
52
|
-
.option('-o, --overlap <number>', 'Chunk overlap size', '50')
|
|
53
|
-
.option('-m, --metadata <json>', 'JSON metadata to attach to the documents')
|
|
55
|
+
contextlake.command('ingest <datasource_name>')
|
|
56
|
+
.description('Process and ingest all files from a connected data source into the knowledge base')
|
|
54
57
|
.action(commands.ingestAction);
|
|
55
58
|
// Search
|
|
56
59
|
contextlake.command('search <query>')
|
|
@@ -70,6 +73,10 @@ function registerAll(ctx, logger) {
|
|
|
70
73
|
.option('--ids <ids...>', 'List of specific file IDs to delete')
|
|
71
74
|
.option('-f, --filter <string>', 'Filter string to match documents for deletion')
|
|
72
75
|
.action(commands.deleteAction);
|
|
76
|
+
// Onboard
|
|
77
|
+
contextlake.command('onboard')
|
|
78
|
+
.description('Configure credentials for ContextLake')
|
|
79
|
+
.action(commands.onboardAction);
|
|
73
80
|
}, { commands: ['contextlake'] });
|
|
74
81
|
logger.info(`[${new Date().toISOString()}] [ContextLake] CLI commands registered`);
|
|
75
82
|
}
|
|
@@ -86,7 +93,7 @@ function registerAll(ctx, logger) {
|
|
|
86
93
|
const slashCommands = (0, slashcmd_1.getSlashCommands)(pluginConfig, logger);
|
|
87
94
|
ctx.registerCommand({
|
|
88
95
|
name: 'contextlake-ingest',
|
|
89
|
-
description: '
|
|
96
|
+
description: 'Process and ingest all files from a connected data source (usage: /contextlake-ingest <datasource_name>)',
|
|
90
97
|
acceptsArgs: true,
|
|
91
98
|
handler: slashCommands.ingestHandler
|
|
92
99
|
});
|
|
@@ -108,6 +115,18 @@ function registerAll(ctx, logger) {
|
|
|
108
115
|
acceptsArgs: true,
|
|
109
116
|
handler: slashCommands.deleteHandler
|
|
110
117
|
});
|
|
118
|
+
ctx.registerCommand({
|
|
119
|
+
name: 'contextlake-profiler',
|
|
120
|
+
description: 'Connect to a data source and profile its structure (usage: /contextlake-profiler <datasource_name> <vendor> <bucket> <prefix>)',
|
|
121
|
+
acceptsArgs: true,
|
|
122
|
+
handler: slashCommands.profilerHandler
|
|
123
|
+
});
|
|
124
|
+
ctx.registerCommand({
|
|
125
|
+
name: 'contextlake-list-datasource',
|
|
126
|
+
description: 'List all connected and profiled data sources (usage: /contextlake-list-datasource)',
|
|
127
|
+
acceptsArgs: false,
|
|
128
|
+
handler: slashCommands.listDatasourceHandler
|
|
129
|
+
});
|
|
111
130
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash commands registered`);
|
|
112
131
|
}
|
|
113
132
|
catch (error) {
|
|
@@ -12,4 +12,10 @@ export declare function getSlashCommands(pluginConfig: ContextLakeConfig, logger
|
|
|
12
12
|
deleteHandler: (commandCtx: any) => Promise<{
|
|
13
13
|
text: string;
|
|
14
14
|
}>;
|
|
15
|
+
profilerHandler: (commandCtx: any) => Promise<{
|
|
16
|
+
text: string;
|
|
17
|
+
}>;
|
|
18
|
+
listDatasourceHandler: (commandCtx: any) => Promise<{
|
|
19
|
+
text: string;
|
|
20
|
+
}>;
|
|
15
21
|
};
|
|
@@ -1,9 +1,46 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
2
35
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
36
|
exports.getSlashCommands = getSlashCommands;
|
|
4
37
|
const ingest_1 = require("../lib/actions/ingest");
|
|
5
38
|
const retrieve_1 = require("../lib/actions/retrieve");
|
|
6
39
|
const manage_1 = require("../lib/actions/manage");
|
|
40
|
+
const profiler_1 = require("../lib/actions/profiler");
|
|
41
|
+
const fs = __importStar(require("fs"));
|
|
42
|
+
const path = __importStar(require("path"));
|
|
43
|
+
const os = __importStar(require("os"));
|
|
7
44
|
function getSlashCommands(pluginConfig, logger) {
|
|
8
45
|
return {
|
|
9
46
|
ingestHandler: async (commandCtx) => {
|
|
@@ -12,12 +49,16 @@ function getSlashCommands(pluginConfig, logger) {
|
|
|
12
49
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest started`, { args });
|
|
13
50
|
try {
|
|
14
51
|
if (args.length === 0) {
|
|
15
|
-
return { text: `**Error:** Missing
|
|
52
|
+
return { text: `**Error:** Missing datasource_name. Usage: /contextlake-ingest <datasource_name>` };
|
|
16
53
|
}
|
|
17
|
-
const
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
54
|
+
const datasource_name = args[0];
|
|
55
|
+
const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
|
|
56
|
+
const dsDir = path.join(BASE_DIR, datasource_name);
|
|
57
|
+
const dbPath = path.join(dsDir, 'catalog_db');
|
|
58
|
+
if (!fs.existsSync(dbPath)) {
|
|
59
|
+
return { text: `**Error:** Data source "${datasource_name}" has not been profiled yet.\n\nPlease run the profiler first using:\n\`/contextlake-profiler <datasource_name> <vendor> <bucket> <prefix> [endpoint] [ak] [sk] [region]\`` };
|
|
60
|
+
}
|
|
61
|
+
const result = await (0, ingest_1.ingestSource)({ datasource_name }, pluginConfig, logger);
|
|
21
62
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest completed`, { resultCount: result.length });
|
|
22
63
|
return { text: `**Ingest Results (${result.length} files processed):**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
|
|
23
64
|
}
|
|
@@ -86,6 +127,49 @@ function getSlashCommands(pluginConfig, logger) {
|
|
|
86
127
|
logger.error(`[ContextLake] Slash delete failed`, { error: e.message });
|
|
87
128
|
return { text: `**Error executing delete:** ${e.message}` };
|
|
88
129
|
}
|
|
89
|
-
}
|
|
130
|
+
},
|
|
131
|
+
profilerHandler: async (commandCtx) => {
|
|
132
|
+
const rawArgs = commandCtx.args || "";
|
|
133
|
+
const args = rawArgs.split(' ').filter((arg) => arg.trim() !== '');
|
|
134
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler started`, { args });
|
|
135
|
+
try {
|
|
136
|
+
if (args.length < 4) {
|
|
137
|
+
return { text: `**Error:** Missing arguments. Usage: /contextlake-profiler <datasource_name> <vendor> <bucket> <prefix> [endpoint] [ak] [sk] [region]` };
|
|
138
|
+
}
|
|
139
|
+
const [datasource_name, vendor, bucket, prefix, endpoint, access_key, secret_key, region] = args;
|
|
140
|
+
if (!['volcengine', 'alibaba', 'tencent', 'aws', 'local'].includes(vendor)) {
|
|
141
|
+
return { text: `**Error:** Invalid vendor. Must be one of: volcengine, alibaba, tencent, aws, local` };
|
|
142
|
+
}
|
|
143
|
+
const params = {
|
|
144
|
+
datasource_name,
|
|
145
|
+
vendor: vendor,
|
|
146
|
+
bucket,
|
|
147
|
+
prefix,
|
|
148
|
+
endpoint,
|
|
149
|
+
access_key,
|
|
150
|
+
secret_key,
|
|
151
|
+
region,
|
|
152
|
+
};
|
|
153
|
+
const result = await (0, profiler_1.connectDataSource)(params);
|
|
154
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler completed`, { result });
|
|
155
|
+
return { text: `**Profiler Results:**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
|
|
156
|
+
}
|
|
157
|
+
catch (e) {
|
|
158
|
+
logger.error(`[ContextLake] Slash profiler failed`, { error: e.message });
|
|
159
|
+
return { text: `**Error executing profiler:** ${e.message}` };
|
|
160
|
+
}
|
|
161
|
+
},
|
|
162
|
+
listDatasourceHandler: async (commandCtx) => {
|
|
163
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command list-datasource started`);
|
|
164
|
+
try {
|
|
165
|
+
const result = await (0, profiler_1.listDataSources)();
|
|
166
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command list-datasource completed`, { result });
|
|
167
|
+
return { text: `**Data Sources:**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
|
|
168
|
+
}
|
|
169
|
+
catch (e) {
|
|
170
|
+
logger.error(`[ContextLake] Slash list-datasource failed`, { error: e.message });
|
|
171
|
+
return { text: `**Error executing list-datasource:** ${e.message}` };
|
|
172
|
+
}
|
|
173
|
+
},
|
|
90
174
|
};
|
|
91
175
|
}
|
|
@@ -5,44 +5,52 @@ const ingest_1 = require("../lib/actions/ingest");
|
|
|
5
5
|
const retrieve_1 = require("../lib/actions/retrieve");
|
|
6
6
|
const manage_1 = require("../lib/actions/manage");
|
|
7
7
|
const profiler_1 = require("../lib/actions/profiler");
|
|
8
|
+
const las_tools_1 = require("../lib/actions/las-tools");
|
|
8
9
|
function getAgentTools(pluginConfig, logger) {
|
|
10
|
+
const lasTools = (0, las_tools_1.getLasTools)(pluginConfig, logger);
|
|
9
11
|
return {
|
|
12
|
+
lasTools,
|
|
13
|
+
listDatasourceTool: {
|
|
14
|
+
name: 'contextlake-list-datasource',
|
|
15
|
+
label: 'ContextLake List Datasources',
|
|
16
|
+
description: `List all connected and profiled data sources.`,
|
|
17
|
+
parameters: {
|
|
18
|
+
type: 'object',
|
|
19
|
+
properties: {},
|
|
20
|
+
required: [],
|
|
21
|
+
additionalProperties: false
|
|
22
|
+
},
|
|
23
|
+
async execute(toolCallId, params) {
|
|
24
|
+
logger.info(`[${new Date().toISOString()}] [ContextLake] Executing list-datasource skill, toolCallId: ${toolCallId}`);
|
|
25
|
+
try {
|
|
26
|
+
const result = await (0, profiler_1.listDataSources)();
|
|
27
|
+
return {
|
|
28
|
+
content: [{ type: "text", text: JSON.stringify(result) }],
|
|
29
|
+
details: result
|
|
30
|
+
};
|
|
31
|
+
}
|
|
32
|
+
catch (error) {
|
|
33
|
+
logger.error(`[${new Date().toISOString()}] [ContextLake] list-datasource skill failed`, { error: error.message });
|
|
34
|
+
return {
|
|
35
|
+
content: [{ type: "text", text: String(error.message) }],
|
|
36
|
+
details: { error: error.message }
|
|
37
|
+
};
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
},
|
|
10
41
|
ingestTool: {
|
|
11
42
|
name: 'contextlake-ingest',
|
|
12
43
|
label: 'ContextLake Ingest',
|
|
13
|
-
description: `
|
|
44
|
+
description: `Process and ingest all files from a connected data source into the knowledge base.
|
|
14
45
|
Use this tool when the user wants to "将知识注入", "上传文件", "入库", "添加文档", "ingest files", or "add knowledge".
|
|
15
|
-
Supports
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
Example User Queries:
|
|
19
|
-
- "帮我把这个文档注入到知识湖中"
|
|
20
|
-
- "上传这份 PDF 到知识库"
|
|
21
|
-
- "Please ingest these documents into ContextLake"
|
|
22
|
-
- "将 /path/to/doc.txt 添加到知识库"`,
|
|
46
|
+
Supports multimodal files (text, images, audio, video, pdf) by using LAS models to understand and embed them.
|
|
47
|
+
Must be called after a data source has been successfully profiled via \`las-data-profiler\`.`,
|
|
23
48
|
parameters: {
|
|
24
49
|
type: 'object',
|
|
25
50
|
properties: {
|
|
26
|
-
|
|
27
|
-
type: 'array',
|
|
28
|
-
items: { type: 'string' },
|
|
29
|
-
description: 'List of file paths to ingest'
|
|
30
|
-
},
|
|
31
|
-
metadata: {
|
|
32
|
-
type: 'object',
|
|
33
|
-
description: 'Optional JSON metadata to attach to documents',
|
|
34
|
-
additionalProperties: true
|
|
35
|
-
},
|
|
36
|
-
chunkSize: {
|
|
37
|
-
type: 'integer',
|
|
38
|
-
description: 'Chunk size for text splitting'
|
|
39
|
-
},
|
|
40
|
-
overlap: {
|
|
41
|
-
type: 'integer',
|
|
42
|
-
description: 'Overlap size for text splitting'
|
|
43
|
-
}
|
|
51
|
+
datasource_name: { type: 'string', description: 'Name of the data source previously profiled' }
|
|
44
52
|
},
|
|
45
|
-
required: ['
|
|
53
|
+
required: ['datasource_name'],
|
|
46
54
|
additionalProperties: false
|
|
47
55
|
},
|
|
48
56
|
async execute(toolCallId, params) {
|
|
@@ -56,21 +64,21 @@ Example User Queries:
|
|
|
56
64
|
catch (e) {
|
|
57
65
|
logger.warn(`[ContextLake] Received string params, possibly toolCallId?`, { params });
|
|
58
66
|
return {
|
|
59
|
-
content: [{ type: "text", text: `Invalid params format: received string "${params}", expected object with '
|
|
60
|
-
details: { error: `Invalid params format: received string "${params}", expected object with '
|
|
67
|
+
content: [{ type: "text", text: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }],
|
|
68
|
+
details: { error: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }
|
|
61
69
|
};
|
|
62
70
|
}
|
|
63
71
|
}
|
|
64
|
-
if (!actualParams.
|
|
72
|
+
if (!actualParams.datasource_name && actualParams.params && actualParams.params.datasource_name) {
|
|
65
73
|
actualParams = actualParams.params;
|
|
66
74
|
}
|
|
67
|
-
if (!actualParams.
|
|
75
|
+
if (!actualParams.datasource_name) {
|
|
68
76
|
return {
|
|
69
|
-
content: [{ type: "text", text: `Invalid params: '
|
|
70
|
-
details: { error: `Invalid params: '
|
|
77
|
+
content: [{ type: "text", text: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }],
|
|
78
|
+
details: { error: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }
|
|
71
79
|
};
|
|
72
80
|
}
|
|
73
|
-
const result = await (0, ingest_1.
|
|
81
|
+
const result = await (0, ingest_1.ingestSource)(actualParams, pluginConfig, logger);
|
|
74
82
|
logger.info(`[${new Date().toISOString()}] [ContextLake] Ingest skill completed successfully`, { resultSummary: Array.isArray(result) ? `Processed ${result.length} items` : 'Success' });
|
|
75
83
|
return {
|
|
76
84
|
content: [{ type: "text", text: JSON.stringify(result) }],
|
|
@@ -81,8 +89,7 @@ Example User Queries:
|
|
|
81
89
|
logger.error(`[${new Date().toISOString()}] [ContextLake] Ingest skill failed`, { error: error.message, stack: error.stack });
|
|
82
90
|
return {
|
|
83
91
|
content: [{ type: "text", text: String(error.message) }],
|
|
84
|
-
details: { error: error.message
|
|
85
|
-
}
|
|
92
|
+
details: { error: error.message }
|
|
86
93
|
};
|
|
87
94
|
}
|
|
88
95
|
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { ContextLakeConfig } from '../../utils/config';
|
|
2
|
+
/** Input accepted by {@link ingestSource}. */
export interface IngestSourceParams {
    /** Name of the data source whose profiler catalog should be ingested. */
    datasource_name: string;
}
/**
 * Ingests the files listed in a data source's profiler catalog.
 *
 * @param params identifies the data source.
 * @param config plugin configuration.
 * @param logger optional logger.
 * @returns one entry per reported file: a success entry carrying the number of
 *          stored chunks, or an error entry carrying the failure message.
 */
export declare function ingestSource(params: IngestSourceParams, config: ContextLakeConfig, logger?: any): Promise<Array<{
    file: any;
    status: string;
    chunks: number;
    message?: undefined;
} | {
    file: any;
    status: string;
    message: any;
    chunks?: undefined;
}>>;
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.ingestSource = ingestSource;
|
|
37
|
+
const factory_1 = require("../../service/metadata/factory");
|
|
38
|
+
const las_api_1 = require("./las-api");
|
|
39
|
+
const lancedb = __importStar(require("@lancedb/lancedb"));
|
|
40
|
+
const path = __importStar(require("path"));
|
|
41
|
+
const fs = __importStar(require("fs"));
|
|
42
|
+
const os = __importStar(require("os"));
|
|
43
|
+
// @ts-ignore
|
|
44
|
+
const uuid_1 = require("uuid");
|
|
45
|
+
const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
|
|
46
|
+
async function ingestSource(params, config, logger) {
|
|
47
|
+
if (logger) {
|
|
48
|
+
logger.info(`[ContextLake-Action] Calling ingestSource with params: ${JSON.stringify(params)}`);
|
|
49
|
+
}
|
|
50
|
+
else {
|
|
51
|
+
// eslint-disable-next-line no-console
|
|
52
|
+
console.log(`[ContextLake-Action] Calling ingestSource with params: ${JSON.stringify(params)}`);
|
|
53
|
+
}
|
|
54
|
+
const dsDir = path.join(BASE_DIR, params.datasource_name);
|
|
55
|
+
const dbPath = path.join(dsDir, 'catalog_db');
|
|
56
|
+
if (!fs.existsSync(dbPath)) {
|
|
57
|
+
throw new Error(`Data source database not found at ${dbPath}. Please run profiler connect first.`);
|
|
58
|
+
}
|
|
59
|
+
const metaConfig = config.metadata_storage || { type: 'local', lancedb_uri: './data/contextlake' };
|
|
60
|
+
const metadataProvider = (0, factory_1.createMetadataProvider)(metaConfig);
|
|
61
|
+
await metadataProvider.connect();
|
|
62
|
+
const lasClient = new las_api_1.LasApiClient(config, logger);
|
|
63
|
+
const results = [];
|
|
64
|
+
// Connect to the profiler LanceDB to read the file catalog
|
|
65
|
+
const profilerDb = await lancedb.connect(dbPath);
|
|
66
|
+
const tableNames = await profilerDb.tableNames();
|
|
67
|
+
if (!tableNames.includes('file_catalog')) {
|
|
68
|
+
throw new Error(`table 'file_catalog' not found in ${dbPath}`);
|
|
69
|
+
}
|
|
70
|
+
const catalogTable = await profilerDb.openTable('file_catalog');
|
|
71
|
+
const files = await catalogTable.query().toArray();
|
|
72
|
+
logger?.info(`[ContextLake-Action] Found ${files.length} files in catalog`);
|
|
73
|
+
// Simple chunking for text
|
|
74
|
+
const splitText = (text, chunkSize = 500, overlap = 50) => {
|
|
75
|
+
const chunks = [];
|
|
76
|
+
if (!text)
|
|
77
|
+
return chunks;
|
|
78
|
+
let i = 0;
|
|
79
|
+
while (i < text.length) {
|
|
80
|
+
chunks.push(text.slice(i, i + chunkSize));
|
|
81
|
+
i += chunkSize - overlap;
|
|
82
|
+
}
|
|
83
|
+
return chunks;
|
|
84
|
+
};
|
|
85
|
+
const processText = async (text, fileInfo) => {
|
|
86
|
+
const chunks = splitText(text);
|
|
87
|
+
const docs = [];
|
|
88
|
+
for (const chunk of chunks) {
|
|
89
|
+
const vector = await metadataProvider.generateMultimodalEmbedding([{ type: 'text', text: chunk }]);
|
|
90
|
+
docs.push({
|
|
91
|
+
id: (0, uuid_1.v4)(),
|
|
92
|
+
vector,
|
|
93
|
+
text: chunk,
|
|
94
|
+
source: fileInfo.key,
|
|
95
|
+
file_type: fileInfo.category,
|
|
96
|
+
storage_type: 'source',
|
|
97
|
+
url: fileInfo.url || `tos://${fileInfo.bucket}/${fileInfo.key}`,
|
|
98
|
+
metadata: JSON.stringify({ datasource: params.datasource_name }),
|
|
99
|
+
created_at: Date.now(),
|
|
100
|
+
binary_data: Buffer.from('')
|
|
101
|
+
});
|
|
102
|
+
}
|
|
103
|
+
return docs;
|
|
104
|
+
};
|
|
105
|
+
for (const file of files) {
|
|
106
|
+
try {
|
|
107
|
+
logger?.info(`[ContextLake-Action] Processing file: ${file.key}, type: ${file.media_type}`);
|
|
108
|
+
let docs = [];
|
|
109
|
+
const fileUrl = file.url || `tos://${file.bucket}/${file.key}`;
|
|
110
|
+
if (file.media_type === 'pdf') {
|
|
111
|
+
// PDF Parse
|
|
112
|
+
const result = await lasClient.submitAndPoll('las_pdf_parse_doubao', {
|
|
113
|
+
url: fileUrl
|
|
114
|
+
});
|
|
115
|
+
const markdown = result.data?.markdown || '';
|
|
116
|
+
docs = await processText(markdown, file);
|
|
117
|
+
}
|
|
118
|
+
else if (file.media_type === 'image') {
|
|
119
|
+
// Multimodal Embedding directly
|
|
120
|
+
const vector = await metadataProvider.generateMultimodalEmbedding([
|
|
121
|
+
{ type: 'image_url', image_url: { url: fileUrl } },
|
|
122
|
+
{ type: 'text', text: 'This is an image from the dataset.' }
|
|
123
|
+
]);
|
|
124
|
+
docs.push({
|
|
125
|
+
id: (0, uuid_1.v4)(),
|
|
126
|
+
vector,
|
|
127
|
+
text: 'Image from dataset',
|
|
128
|
+
source: file.key,
|
|
129
|
+
file_type: 'image',
|
|
130
|
+
storage_type: 'source',
|
|
131
|
+
url: fileUrl,
|
|
132
|
+
metadata: JSON.stringify({ datasource: params.datasource_name }),
|
|
133
|
+
created_at: Date.now(),
|
|
134
|
+
binary_data: Buffer.from('')
|
|
135
|
+
});
|
|
136
|
+
}
|
|
137
|
+
else if (file.media_type === 'audio') {
|
|
138
|
+
// ASR
|
|
139
|
+
const result = await lasClient.submitAndPoll('las_asr_pro', {
|
|
140
|
+
audio: { url: fileUrl, format: file.key.split('.').pop() || 'wav' },
|
|
141
|
+
request: { model_name: 'bigmodel' }
|
|
142
|
+
});
|
|
143
|
+
const text = result.data?.result?.text || '';
|
|
144
|
+
docs = await processText(text, file);
|
|
145
|
+
}
|
|
146
|
+
else if (file.media_type === 'video') {
|
|
147
|
+
// Video understanding -> text -> embedding
|
|
148
|
+
const result = await lasClient.submitAndPoll('las_long_video_understand', {
|
|
149
|
+
video_url: fileUrl,
|
|
150
|
+
query: "详细描述这个视频的内容",
|
|
151
|
+
model_name: "doubao-seed-2-0-lite-260215"
|
|
152
|
+
});
|
|
153
|
+
// Assuming video output is a text description somewhere in the response.
|
|
154
|
+
// Note: the exact structure depends on the API return, adjusting to generic text.
|
|
155
|
+
const text = JSON.stringify(result.data || '');
|
|
156
|
+
// Also need audio extract and ASR for video
|
|
157
|
+
// 1. Extract audio
|
|
158
|
+
// The output_path_template needs a unique path per video
|
|
159
|
+
const audioOutputPath = `tos://${file.bucket}/.tmp/audio/${(0, uuid_1.v4)()}.wav`;
|
|
160
|
+
await lasClient.process('las_audio_extract_and_split', {
|
|
161
|
+
input_path: fileUrl,
|
|
162
|
+
output_path_template: audioOutputPath,
|
|
163
|
+
output_format: 'wav'
|
|
164
|
+
});
|
|
165
|
+
// 2. ASR on the extracted audio
|
|
166
|
+
// Wait briefly for object to be available if needed (often synchronous but tos takes a ms)
|
|
167
|
+
const asrResult = await lasClient.submitAndPoll('las_asr_pro', {
|
|
168
|
+
audio: { url: audioOutputPath.replace('{index}.{output_file_ext}', '0.wav'), format: 'wav' },
|
|
169
|
+
request: { model_name: 'bigmodel' }
|
|
170
|
+
});
|
|
171
|
+
const audioText = asrResult.data?.result?.text || '';
|
|
172
|
+
// Combine video text and audio text
|
|
173
|
+
const combinedText = `Video Description: ${text}\n\nAudio Transcription: ${audioText}`;
|
|
174
|
+
docs = await processText(combinedText, file);
|
|
175
|
+
}
|
|
176
|
+
else if (file.category === 'structured' || file.category === 'non-structured') {
|
|
177
|
+
// If we had a direct text content, we could process it here.
|
|
178
|
+
// Assuming basic local download or similar is available, but for now we skip raw file reading from TOS in this demo script unless implemented.
|
|
179
|
+
// Fallback just logs
|
|
180
|
+
logger?.warn(`[ContextLake-Action] Skipping raw text/structured download for ${file.key} - implement TOS download if needed`);
|
|
181
|
+
}
|
|
182
|
+
if (docs.length > 0) {
|
|
183
|
+
await metadataProvider.addAssets(docs);
|
|
184
|
+
results.push({ file: file.key, status: 'success', chunks: docs.length });
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
catch (error) {
|
|
188
|
+
logger?.error(`[ContextLake-Action] Error processing ${file.key}: ${error.message}`);
|
|
189
|
+
results.push({ file: file.key, status: 'error', message: error.message });
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
return results;
|
|
193
|
+
}
|
|
@@ -1,9 +1,15 @@
|
|
|
1
1
|
import { ContextLakeConfig } from '../../utils/config';
|
|
2
|
-
interface
|
|
3
|
-
|
|
4
|
-
metadata?: Record<string, any>;
|
|
5
|
-
chunkSize?: number;
|
|
6
|
-
overlap?: number;
|
|
2
|
+
export interface IngestSourceParams {
|
|
3
|
+
datasource_name: string;
|
|
7
4
|
}
|
|
8
|
-
export declare function
|
|
9
|
-
|
|
5
|
+
export declare function ingestSource(params: IngestSourceParams, config: ContextLakeConfig, logger?: any): Promise<({
|
|
6
|
+
file: any;
|
|
7
|
+
status: string;
|
|
8
|
+
chunks: number;
|
|
9
|
+
message?: undefined;
|
|
10
|
+
} | {
|
|
11
|
+
file: any;
|
|
12
|
+
status: string;
|
|
13
|
+
message: any;
|
|
14
|
+
chunks?: undefined;
|
|
15
|
+
})[]>;
|