npm - @byted-las/contextlake-openclaw - Versions diffs - 1.0.6 → 1.0.7 - Mend

@byted-las/contextlake-openclaw 1.0.6 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/dist/src/commands/cli.d.ts +0 -2
package/dist/src/commands/cli.js +0 -46
package/dist/src/commands/index.js +0 -29
package/dist/src/commands/slashcmd.d.ts +0 -6
package/dist/src/commands/slashcmd.js +0 -87
package/dist/src/commands/tools.d.ts +0 -2
package/dist/src/commands/tools.js +0 -94
package/dist/src/skills/SKILL.md +24 -18
package/dist/src/skills/contextlake-ingest/SKILL.md +55 -38
package/dist/src/skills/las-data-profiler/SKILL.md +24 -18
package/openclaw.plugin.json +1 -1
package/package.json +1 -1
package/src/commands/cli.ts +0 -45
package/src/commands/index.ts +0 -35
package/src/commands/slashcmd.ts +0 -59
package/src/commands/tools.ts +0 -99
package/src/skills/contextlake-ingest/SKILL.md +55 -38
package/src/skills/las-data-profiler/SKILL.md +24 -18

package/dist/src/commands/cli.d.ts CHANGED Viewed

@@ -1,7 +1,5 @@
 import { ContextLakeConfig } from '../utils/config';
 export declare function getCliCommands(pluginConfig: ContextLakeConfig, logger: any): {
-    connectAction: (datasource_name: string, url: string, options: any) => Promise<void>;
-    ingestAction: (datasource_name: string) => Promise<void>;
     searchAction: (query: any, options: any) => Promise<void>;
     listAction: (options: any) => Promise<void>;
     deleteAction: (options: any) => Promise<void>;

package/dist/src/commands/cli.js CHANGED Viewed

@@ -1,11 +1,8 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.getCliCommands = getCliCommands;
-// @ts-ignore
-const ingest_1 = require("../lib/actions/ingest");
 const retrieve_1 = require("../lib/actions/retrieve");
 const manage_1 = require("../lib/actions/manage");
-const profiler_1 = require("../lib/actions/profiler");
 const credentials_1 = require("../utils/credentials");
 function parseOptionalInt(value, fallback) {
     const parsed = Number.parseInt(String(value), 10);
@@ -34,49 +31,6 @@ function parseMetadata(metadata) {
 }
 function getCliCommands(pluginConfig, logger) {
     return {
-        connectAction: async (datasource_name, url, options) => {
-            logger.info(`[${new Date().toISOString()}] [ContextLake] CLI connect started`, { datasource_name, url, options });
-            try {
-                const params = {
-                    datasource_name,
-                    url,
-                    endpoint: options.endpoint,
-                    access_key: options.ak,
-                    secret_key: options.sk,
-                    region: options.region,
-                    sample_rows: parseInt(options.sampleRows),
-                };
-                // eslint-disable-next-line no-console
-                console.log(`[contextlake connect] Connecting to datasource "${datasource_name}"...`);
-                // eslint-disable-next-line no-console
-                console.log(`  url: ${params.url}`);
-                const result = await (0, profiler_1.connectDataSource)(params);
-                // eslint-disable-next-line no-console
-                console.log(JSON.stringify(result, null, 2));
-                logger.info(`[${new Date().toISOString()}] [ContextLake] CLI connect success`);
-            }
-            catch (e) {
-                // eslint-disable-next-line no-console
-                console.error('Error:', e.message);
-                logger.error(`[${new Date().toISOString()}] [ContextLake] CLI connect failed`, { error: e.message, stack: e.stack });
-                process.exitCode = 1;
-            }
-        },
-        ingestAction: async (datasource_name) => {
-            logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest started`, { datasource_name });
-            try {
-                const result = await (0, ingest_1.ingestSource)({
-                    datasource_name
-                }, pluginConfig, logger);
-                // eslint-disable-next-line no-console
-                console.log(JSON.stringify(result, null, 2));
-                logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest success`);
-            }
-            catch (e) {
-                console.error('Error:', e.message);
-                logger.error(`[${new Date().toISOString()}] [ContextLake] CLI ingest failed`, { error: e.message, stack: e.stack });
-            }
-        },
         searchAction: async (query, options) => {
             logger.info(`[${new Date().toISOString()}] [ContextLake] CLI search started`, { query, options });
             try {

package/dist/src/commands/index.js CHANGED Viewed

@@ -10,16 +10,12 @@ function registerAll(ctx, logger) {
     // Register Agent Tools
     try {
         const tools = (0, tools_1.getAgentTools)(pluginConfig, logger);
-        ctx.registerTool(tools.ingestTool);
-        logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.ingestTool.name}`);
         ctx.registerTool(tools.retrieveTool);
         logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.retrieveTool.name}`);
         ctx.registerTool(tools.listTool);
         logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listTool.name}`);
         ctx.registerTool(tools.deleteTool);
         logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.deleteTool.name}`);
-        ctx.registerTool(tools.lasDataProfilerTool);
-        logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.lasDataProfilerTool.name}`);
         ctx.registerTool(tools.listS3ObjectsTool);
         logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listS3ObjectsTool.name}`);
         ctx.registerTool(tools.readS3ObjectTool);
@@ -49,19 +45,6 @@ function registerAll(ctx, logger) {
             const contextlake = program.command('contextlake')
                 .description('Manage ContextLake knowledge base');
             const commands = (0, cli_1.getCliCommands)(pluginConfig, logger);
-            // connect -- data source profiling (las-data-profiler)
-            contextlake.command('connect <datasource_name> <url>')
-                .description('Connect to a data source and profile its structure, schemas, and media metadata into LanceDB')
-                .option('--endpoint <url>', 'S3 Endpoint URL (not needed for local)')
-                .option('--ak <credential_id>', 'Credential ID for the data source')
-                .option('--sk <credential_value>', 'Credential value for the data source')
-                .option('--region <region>', 'Region identifier (e.g. cn-beijing)')
-                .option('--sample-rows <number>', 'Number of rows to sample per structured file', '100')
-                .action(commands.connectAction);
-            // Ingest
-            contextlake.command('ingest <datasource_name>')
-                .description('Process and ingest all files from a connected data source into the knowledge base')
-                .action(commands.ingestAction);
             // Search
             contextlake.command('search <query>')
                 .description('Search the knowledge base for relevant documents')
@@ -98,12 +81,6 @@ function registerAll(ctx, logger) {
             return;
         }
         const slashCommands = (0, slashcmd_1.getSlashCommands)(pluginConfig, logger);
-        ctx.registerCommand({
-            name: 'contextlake-ingest',
-            description: 'Process and ingest all files from a connected data source (usage: /contextlake-ingest <datasource_name>)',
-            acceptsArgs: true,
-            handler: slashCommands.ingestHandler
-        });
         ctx.registerCommand({
             name: 'contextlake-list',
             description: 'List documents currently in the knowledge base',
@@ -122,12 +99,6 @@ function registerAll(ctx, logger) {
             acceptsArgs: true,
             handler: slashCommands.deleteHandler
         });
-        ctx.registerCommand({
-            name: 'contextlake-profiler',
-            description: 'Connect to a data source and profile its structure (usage: /contextlake-profiler <datasource_name> <vendor> <bucket> <prefix>)',
-            acceptsArgs: true,
-            handler: slashCommands.profilerHandler
-        });
         ctx.registerCommand({
             name: 'contextlake-list-datasource',
             description: 'List all connected and profiled data sources (usage: /contextlake-list-datasource)',

package/dist/src/commands/slashcmd.d.ts CHANGED Viewed

@@ -1,8 +1,5 @@
 import { ContextLakeConfig } from '../utils/config';
 export declare function getSlashCommands(pluginConfig: ContextLakeConfig, logger: any): {
-    ingestHandler: (commandCtx: any) => Promise<{
-        text: string;
-    }>;
     listHandler: (commandCtx: any) => Promise<{
         text: string;
     }>;
@@ -12,9 +9,6 @@ export declare function getSlashCommands(pluginConfig: ContextLakeConfig, logger
     deleteHandler: (commandCtx: any) => Promise<{
         text: string;
     }>;
-    profilerHandler: (commandCtx: any) => Promise<{
-        text: string;
-    }>;
     listDatasourceHandler: (commandCtx: any) => Promise<{
         text: string;
     }>;

package/dist/src/commands/slashcmd.js CHANGED Viewed

@@ -1,72 +1,11 @@
 "use strict";
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    var desc = Object.getOwnPropertyDescriptor(m, k);
-    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
-      desc = { enumerable: true, get: function() { return m[k]; } };
-    }
-    Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
-    Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
-    o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
-    var ownKeys = function(o) {
-        ownKeys = Object.getOwnPropertyNames || function (o) {
-            var ar = [];
-            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
-            return ar;
-        };
-        return ownKeys(o);
-    };
-    return function (mod) {
-        if (mod && mod.__esModule) return mod;
-        var result = {};
-        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
-        __setModuleDefault(result, mod);
-        return result;
-    };
-})();
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.getSlashCommands = getSlashCommands;
-const ingest_1 = require("../lib/actions/ingest");
 const retrieve_1 = require("../lib/actions/retrieve");
 const manage_1 = require("../lib/actions/manage");
 const profiler_1 = require("../lib/actions/profiler");
-const fs = __importStar(require("fs"));
-const path = __importStar(require("path"));
-const os = __importStar(require("os"));
 function getSlashCommands(pluginConfig, logger) {
     return {
-        ingestHandler: async (commandCtx) => {
-            const rawArgs = commandCtx.args || "";
-            const args = rawArgs.split(' ').filter((arg) => arg.trim() !== '');
-            logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest started`, { args });
-            try {
-                if (args.length === 0) {
-                    return { text: `**Error:** Missing datasource_name. Usage: /contextlake-ingest <datasource_name>` };
-                }
-                const datasource_name = args[0];
-                const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
-                const dsDir = path.join(BASE_DIR, datasource_name);
-                const dbPath = path.join(dsDir, 'catalog_db');
-                if (!fs.existsSync(dbPath)) {
-                    return { text: `**Error:** Data source "${datasource_name}" has not been profiled yet.\n\nPlease run the profiler first using:\n\`/contextlake-profiler <datasource_name> <vendor> <bucket> <prefix> [endpoint] [ak] [sk] [region]\`` };
-                }
-                const result = await (0, ingest_1.ingestSource)({ datasource_name }, pluginConfig, logger);
-                logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest completed`, { resultCount: result.length });
-                return { text: `**Ingest Results (${result.length} files processed):**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
-            }
-            catch (e) {
-                logger.error(`[ContextLake] Slash ingest failed`, { error: e.message });
-                return { text: `**Error executing ingest:** ${e.message}` };
-            }
-        },
         listHandler: async (commandCtx) => {
             const rawArgs = commandCtx.args || "";
             const args = rawArgs.split(' ').filter((arg) => arg.trim() !== '');
@@ -128,32 +67,6 @@ function getSlashCommands(pluginConfig, logger) {
                 return { text: `**Error executing delete:** ${e.message}` };
             }
         },
-        profilerHandler: async (commandCtx) => {
-            const rawArgs = commandCtx.args || "";
-            const args = rawArgs.split(' ').filter((arg) => arg.trim() !== '');
-            logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler started`, { args });
-            try {
-                if (args.length < 2) {
-                    return { text: `**Error:** Missing arguments. Usage: /contextlake-profiler <datasource_name> <url> [endpoint] [ak] [sk] [region]` };
-                }
-                const [datasource_name, url, endpoint, access_key, secret_key, region] = args;
-                const params = {
-                    datasource_name,
-                    url,
-                    endpoint,
-                    access_key,
-                    secret_key,
-                    region,
-                };
-                const result = await (0, profiler_1.connectDataSource)(params);
-                logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler completed`, { result });
-                return { text: `**Profiler Results:**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
-            }
-            catch (e) {
-                logger.error(`[ContextLake] Slash profiler failed`, { error: e.message });
-                return { text: `**Error executing profiler:** ${e.message}` };
-            }
-        },
         listDatasourceHandler: async (commandCtx) => {
             logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command list-datasource started`);
             try {

package/dist/src/commands/tools.d.ts CHANGED Viewed

@@ -1,11 +1,9 @@
 import { ContextLakeConfig } from '../utils/config';
 import type { AnyAgentTool } from 'openclaw/plugin-sdk';
 export declare function getAgentTools(pluginConfig: ContextLakeConfig, logger: any): {
-    ingestTool: AnyAgentTool;
     retrieveTool: AnyAgentTool;
     listTool: AnyAgentTool;
     deleteTool: AnyAgentTool;
-    lasDataProfilerTool: AnyAgentTool;
     listDatasourceTool: AnyAgentTool;
     listS3ObjectsTool: AnyAgentTool;
     readS3ObjectTool: AnyAgentTool;

package/dist/src/commands/tools.js CHANGED Viewed

@@ -1,7 +1,6 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.getAgentTools = getAgentTools;
-const ingest_1 = require("../lib/actions/ingest");
 const retrieve_1 = require("../lib/actions/retrieve");
 const manage_1 = require("../lib/actions/manage");
 const profiler_1 = require("../lib/actions/profiler");
@@ -40,62 +39,6 @@ function getAgentTools(pluginConfig, logger) {
                 }
             }
         },
-        ingestTool: {
-            name: 'contextlake-ingest',
-            label: 'ContextLake Ingest',
-            description: `Process and ingest all files from a connected data source into the knowledge base.
-Use this tool when the user wants to "将知识注入", "上传文件", "入库", "添加文档", "ingest files", or "add knowledge".
-Supports multimodal files (text, images, audio, video, pdf) by using LAS models to understand and embed them.
-Must be called after a data source has been successfully profiled via \`las-data-profiler\`.`,
-            parameters: {
-                type: 'object',
-                properties: {
-                    datasource_name: { type: 'string', description: 'Name of the data source previously profiled' }
-                },
-                required: ['datasource_name'],
-                additionalProperties: false
-            },
-            async execute(toolCallId, params) {
-                logger.info(`[${new Date().toISOString()}] [ContextLake] Executing ingest skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
-                try {
-                    let actualParams = params;
-                    if (typeof params === 'string') {
-                        try {
-                            actualParams = JSON.parse(params);
-                        }
-                        catch (e) {
-                            logger.warn(`[ContextLake] Received string params, possibly toolCallId?`, { params });
-                            return {
-                                content: [{ type: "text", text: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }],
-                                details: { error: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }
-                            };
-                        }
-                    }
-                    if (!actualParams.datasource_name && actualParams.params && actualParams.params.datasource_name) {
-                        actualParams = actualParams.params;
-                    }
-                    if (!actualParams.datasource_name) {
-                        return {
-                            content: [{ type: "text", text: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }],
-                            details: { error: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }
-                        };
-                    }
-                    const result = await (0, ingest_1.ingestSource)(actualParams, pluginConfig, logger);
-                    logger.info(`[${new Date().toISOString()}] [ContextLake] Ingest skill completed successfully`, { resultSummary: Array.isArray(result) ? `Processed ${result.length} items` : 'Success' });
-                    return {
-                        content: [{ type: "text", text: JSON.stringify(result) }],
-                        details: result
-                    };
-                }
-                catch (error) {
-                    logger.error(`[${new Date().toISOString()}] [ContextLake] Ingest skill failed`, { error: error.message, stack: error.stack });
-                    return {
-                        content: [{ type: "text", text: String(error.message) }],
-                        details: { error: error.message }
-                    };
-                }
-            }
-        },
         retrieveTool: {
             name: 'contextlake-retrieve',
             label: 'ContextLake Retrieve',
@@ -245,43 +188,6 @@ Example User Queries:
                 }
             }
         },
-        lasDataProfilerTool: {
-            name: 'las-data-profiler',
-            label: 'LAS Data Profiler',
-            description: 'Connect to a data source (TOS/OSS/COS/S3/Local) and profile its structure, schemas, and media metadata into LanceDB',
-            parameters: {
-                type: 'object',
-                properties: {
-                    datasource_name: { type: 'string', description: 'Name of the data source' },
-                    url: { type: 'string', description: 'Data source URL (e.g. tos://bucket/prefix, oss://..., s3://..., file:///path)' },
-                    sample_rows: { type: 'integer', description: 'Number of rows to sample per structured file' }
-                },
-                required: ['datasource_name', 'url'],
-                additionalProperties: false
-            },
-            async execute(toolCallId, params) {
-                logger.info(`[${new Date().toISOString()}] [ContextLake] Executing las-data-profiler skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
-                try {
-                    let actualParams = params;
-                    if (params && params.params) {
-                        actualParams = params.params;
-                    }
-                    const result = await (0, profiler_1.connectDataSource)(actualParams);
-                    logger.info(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill completed`, { result });
-                    return {
-                        content: [{ type: "text", text: JSON.stringify(result) }],
-                        details: result
-                    };
-                }
-                catch (error) {
-                    logger.error(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill failed`, { error: error.message, stack: error.stack });
-                    return {
-                        content: [{ type: "text", text: String(error.message) }],
-                        details: { error: error.message }
-                    };
-                }
-            }
-        },
         listS3ObjectsTool: {
             name: 'list-s3-objects',
             label: 'List S3 Objects',

package/dist/src/skills/SKILL.md CHANGED Viewed

@@ -1,11 +1,13 @@
 ---
 name: byted-las-data-profiler
 description: |
-  Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them.
-  It writes the catalog index to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
+  Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them,
+  performs schema inference and column semantic analysis on structured data (JSONL/CSV/Parquet/JSON),
+  extracts key meta-information for media files (Image/Audio/Video/PDF) by reading only header bytes,
+  and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
   IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
-  You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`, `las-data-profiler`) to accomplish the profiling tasks.
+  You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) to accomplish the profiling tasks.
 ---
 ## Trigger Scenarios
@@ -16,24 +18,28 @@ Be sure to use this Skill when the user mentions the following scenarios:
 - Need to understand what a batch of data files contains and what their schema looks like
 - Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
 - Need to write the meta-information of object storage or local files into LanceDB
+- Mentions TOS, boto3, or object storage data profiling
 - Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
+- Need to batch identify the type and size of remote/local files and build an index
+- Need to quickly understand the structure of an unfamiliar dataset (what files it contains, what the schema looks like, and what the fields mean)
+- Need to connect/dock a data source for profiling
+- Mentions "connecting" or "docking" a data source
-## Overall Workflow
-When instructed to profile a dataset, you should prefer using the `las-data-profiler` tool directly, which automatically handles the S3 listing and LanceDB writing using internal TypeScript logic.
-If you need to perform custom exploration, you can use `list-s3-objects` to traverse the bucket and `read-s3-object` to read file headers, and `write-lance-catalog` to save results.
+## Overview
+This Skill acts as a Dataset Profiling Guide. You should use the `list-s3-objects` tool to traverse the S3 bucket or local directory, use `read-s3-object` to read file contents or headers, parse the schema or media metadata, and finally use `write-lance-catalog` to save the catalog into a local LanceDB.
-## Parameter Description (for `las-data-profiler` tool)
-| Parameter | Description | Example |
-|-----------|-------------|---------|
-| datasource_name | The name of the data source | my_tos_data |
-| vendor | volcengine / alibaba / tencent / aws / local | volcengine |
-| endpoint | S3 Endpoint URL (not required for local) | https://tos-s3-cn-beijing.volces.com |
-| access_key | AK | - |
-| secret_key | SK | - |
-| region | Region identifier | cn-beijing |
-| bucket | Bucket name (root directory path when local) | my-data-bucket |
-| prefix | Path prefix to limit the scan scope | datasets/2024/ |
+1. **Cataloging**: Use `list-s3-objects` to record the meta-information (path, size, etc.) of files.
+2. **Understanding Structured Data**: Use `read-s3-object` to sample JSONL / CSV / TSV / Parquet / JSON.
+3. **Extracting Media Meta-information**: Use `read-s3-object` with `maxBytes` to read only the file header (without downloading the full file) for images, audio, video, and PDFs to extract key attributes.
+4. **Writing to LanceDB**: Use `write-lance-catalog` to save the results.
 ## Output Location
 - LanceDB table storage path: `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`
-- Configuration file: `~/.openclaw/contextlake/profiler/{datasource_name}/env.sh`
+- Table names: `files`, `structured_schemas`, `media_metadata`
+## Available Tools for this Skill
+- `list-s3-objects`: To traverse and list files in the bucket/directory.
+- `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction.
+- `write-lance-catalog`: To write the profiling results to the LanceDB catalog.
+Always report the final profiling summary back to the user once the `write-lance-catalog` completes successfully.

package/dist/src/skills/contextlake-ingest/SKILL.md CHANGED Viewed

@@ -1,42 +1,59 @@
 ---
 name: contextlake-ingest
 description: |
-  Upload, ingest, and index documents into the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
-  Use this tool when the user wants to "将知识注入", "上传文件", "入库", "添加文档", "ingest files", or "add knowledge".
-  Supports processing of various file types including PDF, Word, Markdown, and Text.
-  Automatically handles text extraction, cleaning, chunking, embedding generation, and storage.
-  Example User Queries:
-  - "帮我把这个文档注入到知识湖中"
-  - "上传这份 PDF 到知识库"
-  - "Please ingest these documents into ContextLake"
-  - "将 /path/to/doc.txt 添加到知识库"
-  Example Tool Call:
-  ```json
-  {
-    "files": ["/absolute/path/to/document.pdf"],
-    "metadata": { "category": "tech" }
-  }
-  ```
-parameters:
-  files:
-    type: array
-    items:
-      type: string
-    description: List of file paths to ingest
-    required: true
-  metadata:
-    type: object
-    description: Optional JSON metadata to attach to documents
-    required: false
-  chunkSize:
-    type: integer
-    description: Chunk size for text splitting
-    required: false
-  overlap:
-    type: integer
-    description: Overlap size for text splitting
-    required: false
+  A guide for OpenClaw agents on how to perform data ingestion into ContextLake.
+  Use this skill to understand the correct workflow for profiling a dataset and then ingesting it.
+  When a user asks to "ingest data", "导入数据", "接入数据源", or "处理某个桶里的数据",
+  you MUST follow the two-step workflow outlined below using the specific tools provided.
+  IMPORTANT RULE: You are STRICTLY FORBIDDEN from using any tools outside of this project for ingestion.
+  Do not use custom Python scripts, external CLI tools, or unrelated plugins.
+---
+# ContextLake Data Ingestion Workflow Guide
+To successfully ingest data into the ContextLake Knowledge Base, you **MUST** follow a strict two-step process.
+## Step 1: Data Profiling
+Before data can be ingested, the source MUST first be profiled to extract its schema, structure, and media metadata into a local LanceDB catalog.
+**Tool to use:** the `byted-las-data-profiler` skill, which guides the OpenClaw agent through the profiling process.
+### How it works:
+- **Purpose:** Connects to an S3-compatible source (TOS, OSS, AWS S3, etc.) or local file system and catalogs the files.
+- **Underlying tools used:** The agent will use `list-s3-objects`, `read-s3-object`, and `write-lance-catalog` to perform the profiling step by step.
+*Wait for Step 1 to complete successfully before proceeding to Step 2.*
 ---
+## Step 2: Data Ingestion
+Once the data source is successfully profiled and the catalog is created, you can proceed to ingest the data into ContextLake.
+**Tool to use:** OpenClaw Agent using basic tools guided by this skill.
+### How it works:
+- **Purpose:** Reads the LanceDB catalog created in Step 1, processes the multimodal files (text, images, audio, video, PDF) using LAS models, chunks the data, generates embeddings, and indexes them into the ContextLake Knowledge Base.
+- **Underlying tools used:**
+  1. Use `read-lance-catalog` to read the catalog of files from `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`.
+  2. For each file, use appropriate LAS tools (like `las_pdf_parse_doubao`, `las_image_resample`, `las_long_video_understand`) to extract text and features.
+  3. Chunk and process the text.
+  4. Use the embedding tool or model to generate vectors.
+  5. Save the final chunks and vectors into the main ContextLake knowledge base.
+*Note: You are acting as the ingestion pipeline. You must coordinate the reading of the catalog and the processing of each file type using the available LAS tools.*
+---
+## Auxiliary Tools (Use only when necessary)
+If you need to verify the catalog contents between Step 1 and Step 2, or if ingestion fails and you need to debug:
+- **`read-lance-catalog`**: Use this tool to read the records from the catalog database created in Step 1.
+  - Requires `db_path` (e.g., `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`) and `table_name` (usually "files").
+- **`list-s3-objects` / `read-s3-object`**: Use these tools to manually inspect the raw source files if profiling fails.
+- **`contextlake-list-datasource`**: Use this tool to see all data sources that have been connected/profiled.
+## Error Handling
+- If ingestion fails because the datasource cannot be found, ensure that the data profiling step completed successfully for that exact `datasource_name`.
+- Always report the results of both steps to the user clearly.

package/dist/src/skills/las-data-profiler/SKILL.md CHANGED Viewed

@@ -1,11 +1,13 @@
 ---
 name: byted-las-data-profiler
 description: |
-  Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them.
-  It writes the catalog index to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
+  Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them,
+  performs schema inference and column semantic analysis on structured data (JSONL/CSV/Parquet/JSON),
+  extracts key meta-information for media files (Image/Audio/Video/PDF) by reading only header bytes,
+  and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
   IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
-  You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`, `las-data-profiler`) to accomplish the profiling tasks.
+  You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) to accomplish the profiling tasks.
 ---
 ## Trigger Scenarios
@@ -16,24 +18,28 @@ Be sure to use this Skill when the user mentions the following scenarios:
 - Need to understand what a batch of data files contains and what their schema looks like
 - Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
 - Need to write the meta-information of object storage or local files into LanceDB
+- Mentions TOS, boto3, or object storage data profiling
 - Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
+- Need to batch identify the type and size of remote/local files and build an index
+- Need to quickly understand the structure of an unfamiliar dataset (what files it contains, what the schema looks like, and what the fields mean)
+- Need to connect/dock a data source for profiling
+- Mentions "connecting" or "docking" a data source
-## Overall Workflow
-When instructed to profile a dataset, you should prefer using the `las-data-profiler` tool directly, which automatically handles the S3 listing and LanceDB writing using internal TypeScript logic.
-If you need to perform custom exploration, you can use `list-s3-objects` to traverse the bucket and `read-s3-object` to read file headers, and `write-lance-catalog` to save results.
+## Overview
+This Skill acts as a Dataset Profiling Guide. You should use the `list-s3-objects` tool to traverse the S3 bucket or local directory, use `read-s3-object` to read file contents or headers, parse the schema or media metadata, and finally use `write-lance-catalog` to save the catalog into a local LanceDB.
-## Parameter Description (for `las-data-profiler` tool)
-| Parameter | Description | Example |
-|-----------|-------------|---------|
-| datasource_name | The name of the data source | my_tos_data |
-| vendor | volcengine / alibaba / tencent / aws / local | volcengine |
-| endpoint | S3 Endpoint URL (not required for local) | https://tos-s3-cn-beijing.volces.com |
-| access_key | AK | - |
-| secret_key | SK | - |
-| region | Region identifier | cn-beijing |
-| bucket | Bucket name (root directory path when local) | my-data-bucket |
-| prefix | Path prefix to limit the scan scope | datasets/2024/ |
+1. **Cataloging**: Use `list-s3-objects` to record the meta-information (path, size, etc.) of files.
+2. **Understanding Structured Data**: Use `read-s3-object` to sample JSONL / CSV / TSV / Parquet / JSON.
+3. **Extracting Media Meta-information**: Use `read-s3-object` with `maxBytes` to read only the file header (without downloading the full file) for images, audio, video, and PDFs to extract key attributes.
+4. **Writing to LanceDB**: Use `write-lance-catalog` to save the results.
 ## Output Location
 - LanceDB table storage path: `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`
-- Configuration file: `~/.openclaw/contextlake/profiler/{datasource_name}/env.sh`
+- Table names: `files`, `structured_schemas`, `media_metadata`
+## Available Tools for this Skill
+- `list-s3-objects`: To traverse and list files in the bucket/directory.
+- `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction.
+- `write-lance-catalog`: To write the profiling results to the LanceDB catalog.
+Always report the final profiling summary back to the user once `write-lance-catalog` completes successfully.

package/openclaw.plugin.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "id": "contextlake-openclaw",
   "name": "ContextLake",
-  "version": "1.0.6",
+  "version": "1.0.7",
   "description": "A lightweight knowledge base plugin for OpenClaw using LanceDB and TOS, with data profiling support",
   "skills": ["./src/skills"],
   "configSchema": {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@byted-las/contextlake-openclaw",
-  "version": "1.0.6",
+  "version": "1.0.7",
   "description": "ContextLake OpenClaw Plugin for managing knowledge base",
   "main": "index.ts",
   "files": [

package/src/commands/cli.ts CHANGED Viewed

@@ -37,51 +37,6 @@ function parseMetadata(metadata: any): Record<string, any> {
 export function getCliCommands(pluginConfig: ContextLakeConfig, logger: any) {
     return {
-        connectAction: async (datasource_name: string, url: string, options: any) => {
-            logger.info(`[${new Date().toISOString()}] [ContextLake] CLI connect started`, { datasource_name, url, options });
-            try {
-                const params: ConnectParams = {
-                    datasource_name,
-                    url,
-                    endpoint: options.endpoint,
-                    access_key: options.ak,
-                    secret_key: options.sk,
-                    region: options.region,
-                    sample_rows: parseInt(options.sampleRows),
-                };
-                // eslint-disable-next-line no-console
-                console.log(`[contextlake connect] Connecting to datasource "${datasource_name}"...`);
-                // eslint-disable-next-line no-console
-                console.log(`  url: ${params.url}`);
-                const result = await connectDataSource(params);
-                // eslint-disable-next-line no-console
-                console.log(JSON.stringify(result, null, 2));
-                logger.info(`[${new Date().toISOString()}] [ContextLake] CLI connect success`);
-            } catch (e: any) {
-                // eslint-disable-next-line no-console
-                console.error('Error:', e.message);
-                logger.error(`[${new Date().toISOString()}] [ContextLake] CLI connect failed`, { error: e.message, stack: e.stack });
-                process.exitCode = 1;
-            }
-        },
-        ingestAction: async (datasource_name: string) => {
-            logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest started`, { datasource_name });
-            try {
-                const result = await ingestSource({
-                    datasource_name
-                }, pluginConfig, logger);
-                // eslint-disable-next-line no-console
-                console.log(JSON.stringify(result, null, 2));
-                logger.info(`[${new Date().toISOString()}] [ContextLake] CLI ingest success`);
-            } catch (e: any) {
-                console.error('Error:', e.message);
-                logger.error(`[${new Date().toISOString()}] [ContextLake] CLI ingest failed`, { error: e.message, stack: e.stack });
-            }
-        },
         searchAction: async (query: any, options: any) => {
             logger.info(`[${new Date().toISOString()}] [ContextLake] CLI search started`, { query, options });
             try {

package/src/commands/index.ts CHANGED Viewed

@@ -12,9 +12,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
     try {
         const tools = getAgentTools(pluginConfig, logger);
-        ctx.registerTool(tools.ingestTool );
-        logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.ingestTool.name}`);
         ctx.registerTool(tools.retrieveTool );
         logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.retrieveTool.name}`);
@@ -23,9 +20,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
         ctx.registerTool(tools.deleteTool );
         logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.deleteTool.name}`);
-        ctx.registerTool(tools.lasDataProfilerTool );
-        logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.lasDataProfilerTool.name}`);
         ctx.registerTool(tools.listS3ObjectsTool );
         logger.info(`[${new Date().toISOString()}] [ContextLake] Tool registered: ${tools.listS3ObjectsTool.name}`);
@@ -65,21 +59,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
             const commands = getCliCommands(pluginConfig, logger);
-            // connect -- data source profiling (las-data-profiler)
-            contextlake.command('connect <datasource_name> <url>')
-                .description('Connect to a data source and profile its structure, schemas, and media metadata into LanceDB')
-                .option('--endpoint <url>', 'S3 Endpoint URL (not needed for local)')
-                .option('--ak <credential_id>', 'Credential ID for the data source')
-                .option('--sk <credential_value>', 'Credential value for the data source')
-                .option('--region <region>', 'Region identifier (e.g. cn-beijing)')
-                .option('--sample-rows <number>', 'Number of rows to sample per structured file', '100')
-                .action(commands.connectAction);
-            // Ingest
-            contextlake.command('ingest <datasource_name>')
-                .description('Process and ingest all files from a connected data source into the knowledge base')
-                .action(commands.ingestAction);
             // Search
             contextlake.command('search <query>')
                 .description('Search the knowledge base for relevant documents')
@@ -122,13 +101,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
         const slashCommands = getSlashCommands(pluginConfig, logger);
-        ctx.registerCommand({
-            name: 'contextlake-ingest',
-            description: 'Process and ingest all files from a connected data source (usage: /contextlake-ingest <datasource_name>)',
-            acceptsArgs: true,
-            handler: slashCommands.ingestHandler
-        });
         ctx.registerCommand({
             name: 'contextlake-list',
             description: 'List documents currently in the knowledge base',
@@ -150,13 +122,6 @@ export function registerAll(ctx: OpenClawPluginApi, logger: PluginLogger) {
             handler: slashCommands.deleteHandler
         });
-        ctx.registerCommand({
-            name: 'contextlake-profiler',
-            description: 'Connect to a data source and profile its structure (usage: /contextlake-profiler <datasource_name> <vendor> <bucket> <prefix>)',
-            acceptsArgs: true,
-            handler: slashCommands.profilerHandler
-        });
         ctx.registerCommand({
             name: 'contextlake-list-datasource',
             description: 'List all connected and profiled data sources (usage: /contextlake-list-datasource)',

package/src/commands/slashcmd.ts CHANGED Viewed

@@ -9,36 +9,6 @@ import * as os from 'os';
 export function getSlashCommands(pluginConfig: ContextLakeConfig, logger: any) {
     return {
-        ingestHandler: async (commandCtx: any) => {
-            const rawArgs = commandCtx.args || "";
-            const args = rawArgs.split(' ').filter((arg: string) => arg.trim() !== '');
-            logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest started`, { args });
-            try {
-                if (args.length === 0) {
-                    return { text: `**Error:** Missing datasource_name. Usage: /contextlake-ingest <datasource_name>` };
-                }
-                const datasource_name = args[0];
-                const BASE_DIR = path.join(os.homedir(), '.openclaw', 'contextlake', 'profiler');
-                const dsDir = path.join(BASE_DIR, datasource_name);
-                const dbPath = path.join(dsDir, 'catalog_db');
-                if (!fs.existsSync(dbPath)) {
-                    return { text: `**Error:** Data source "${datasource_name}" has not been profiled yet.\n\nPlease run the profiler first using:\n\`/contextlake-profiler <datasource_name> <vendor> <bucket> <prefix> [endpoint] [ak] [sk] [region]\`` };
-                }
-                const result = await ingestSource({ datasource_name }, pluginConfig, logger);
-                logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command ingest completed`, { resultCount: result.length });
-                return { text: `**Ingest Results (${result.length} files processed):**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
-            } catch (e: any) {
-                logger.error(`[ContextLake] Slash ingest failed`, { error: e.message });
-                return { text: `**Error executing ingest:** ${e.message}` };
-            }
-        },
         listHandler: async (commandCtx: any) => {
             const rawArgs = commandCtx.args || "";
             const args = rawArgs.split(' ').filter((arg: string) => arg.trim() !== '');
@@ -102,35 +72,6 @@ export function getSlashCommands(pluginConfig: ContextLakeConfig, logger: any) {
                 return { text: `**Error executing delete:** ${e.message}` };
             }
         },
-        profilerHandler: async (commandCtx: any) => {
-            const rawArgs = commandCtx.args || "";
-            const args = rawArgs.split(' ').filter((arg: string) => arg.trim() !== '');
-            logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler started`, { args });
-            try {
-                if (args.length < 2) {
-                    return { text: `**Error:** Missing arguments. Usage: /contextlake-profiler <datasource_name> <url> [endpoint] [ak] [sk] [region]` };
-                }
-                const [datasource_name, url, endpoint, access_key, secret_key, region] = args;
-                const params: ConnectParams = {
-                    datasource_name,
-                    url,
-                    endpoint,
-                    access_key,
-                    secret_key,
-                    region,
-                };
-                const result = await connectDataSource(params);
-                logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command profiler completed`, { result });
-                return { text: `**Profiler Results:**\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\`` };
-            } catch (e: any) {
-                logger.error(`[ContextLake] Slash profiler failed`, { error: e.message });
-                return { text: `**Error executing profiler:** ${e.message}` };
-            }
-        },
         listDatasourceHandler: async (commandCtx: any) => {
             logger.info(`[${new Date().toISOString()}] [ContextLake] Slash command list-datasource started`);

package/src/commands/tools.ts CHANGED Viewed

@@ -10,11 +10,9 @@ import { ContextLakeConfig } from '../utils/config';
 import type { AnyAgentTool } from 'openclaw/plugin-sdk';
 export function getAgentTools(pluginConfig: ContextLakeConfig, logger: any): {
-    ingestTool: AnyAgentTool;
     retrieveTool: AnyAgentTool;
     listTool: AnyAgentTool;
     deleteTool: AnyAgentTool;
-    lasDataProfilerTool: AnyAgentTool;
     listDatasourceTool: AnyAgentTool;
     listS3ObjectsTool: AnyAgentTool;
     readS3ObjectTool: AnyAgentTool;
@@ -54,65 +52,6 @@ export function getAgentTools(pluginConfig: ContextLakeConfig, logger: any): {
                 }
             }
         },
-        ingestTool: {
-            name: 'contextlake-ingest',
-            label: 'ContextLake Ingest',
-            description: `Process and ingest all files from a connected data source into the knowledge base.
-Use this tool when the user wants to "将知识注入", "上传文件", "入库", "添加文档", "ingest files", or "add knowledge".
-Supports multimodal files (text, images, audio, video, pdf) by using LAS models to understand and embed them.
-Must be called after a data source has been successfully profiled via \`las-data-profiler\`.`,
-            parameters: {
-                type: 'object',
-                properties: {
-                    datasource_name: { type: 'string', description: 'Name of the data source previously profiled' }
-                },
-                required: ['datasource_name'],
-                additionalProperties: false
-            },
-            async execute(toolCallId: string, params: any) {
-                logger.info(`[${new Date().toISOString()}] [ContextLake] Executing ingest skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
-                try {
-                    let actualParams = params;
-                    if (typeof params === 'string') {
-                        try {
-                            actualParams = JSON.parse(params);
-                        } catch (e) {
-                            logger.warn(`[ContextLake] Received string params, possibly toolCallId?`, { params });
-                            return {
-                                content: [{ type: "text", text: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }],
-                                details: { error: `Invalid params format: received string "${params}", expected object with 'datasource_name'.` }
-                            } as any;
-                        }
-                    }
-                    if (!actualParams.datasource_name && actualParams.params && actualParams.params.datasource_name) {
-                        actualParams = actualParams.params;
-                    }
-                    if (!actualParams.datasource_name) {
-                        return {
-                                content: [{ type: "text", text: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }],
-                                details: { error: `Invalid params: 'datasource_name' is required. Received keys: ${Object.keys(actualParams)}` }
-                            } as any;
-                    }
-                    const result = await ingestSource(actualParams, pluginConfig, logger);
-                    logger.info(`[${new Date().toISOString()}] [ContextLake] Ingest skill completed successfully`, { resultSummary: Array.isArray(result) ? `Processed ${result.length} items` : 'Success' });
-                    return {
-                        content: [{ type: "text", text: JSON.stringify(result) }],
-                        details: result
-                    } as any;
-                } catch (error: any) {
-                    logger.error(`[${new Date().toISOString()}] [ContextLake] Ingest skill failed`, { error: error.message, stack: error.stack });
-                    return {
-                        content: [{ type: "text", text: String(error.message) }],
-                        details: { error: error.message }
-                    } as any;
-                }
-            }
-        },
         retrieveTool: {
             name: 'contextlake-retrieve',
             label: 'ContextLake Retrieve',
@@ -268,44 +207,6 @@ Example User Queries:
                 }
             }
         },
-        lasDataProfilerTool: {
-            name: 'las-data-profiler',
-            label: 'LAS Data Profiler',
-            description: 'Connect to a data source (TOS/OSS/COS/S3/Local) and profile its structure, schemas, and media metadata into LanceDB',
-            parameters: {
-                type: 'object',
-                properties: {
-                    datasource_name: { type: 'string', description: 'Name of the data source' },
-                    url: { type: 'string', description: 'Data source URL (e.g. tos://bucket/prefix, oss://..., s3://..., file:///path)' },
-                    sample_rows: { type: 'integer', description: 'Number of rows to sample per structured file' }
-                },
-                required: ['datasource_name', 'url'],
-                additionalProperties: false
-            },
-            async execute(toolCallId: string, params: any) {
-                logger.info(`[${new Date().toISOString()}] [ContextLake] Executing las-data-profiler skill, toolCallId: ${toolCallId}`, { params: JSON.stringify(params) });
-                try {
-                    let actualParams = params;
-                    if (params && params.params) {
-                        actualParams = params.params;
-                    }
-                    const result = await connectDataSource(actualParams);
-                    logger.info(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill completed`, { result });
-                    return {
-                        content: [{ type: "text", text: JSON.stringify(result) }],
-                        details: result
-                    } as any;
-                } catch (error: any) {
-                    logger.error(`[${new Date().toISOString()}] [ContextLake] las-data-profiler skill failed`, { error: error.message, stack: error.stack });
-                    return {
-                        content: [{ type: "text", text: String(error.message) }],
-                        details: { error: error.message }
-                    } as any;
-                }
-            }
-        },
         listS3ObjectsTool: {
             name: 'list-s3-objects',
             label: 'List S3 Objects',

package/src/skills/contextlake-ingest/SKILL.md CHANGED Viewed

@@ -1,42 +1,59 @@
 ---
 name: contextlake-ingest
 description: |
-  Upload, ingest, and index documents into the ContextLake Knowledge Base (知识库) / Knowledge Lake (知识湖).
-  Use this tool when the user wants to "将知识注入", "上传文件", "入库", "添加文档", "ingest files", or "add knowledge".
-  Supports processing of various file types including PDF, Word, Markdown, and Text.
-  Automatically handles text extraction, cleaning, chunking, embedding generation, and storage.
-  Example User Queries:
-  - "帮我把这个文档注入到知识湖中"
-  - "上传这份 PDF 到知识库"
-  - "Please ingest these documents into ContextLake"
-  - "将 /path/to/doc.txt 添加到知识库"
-  Example Tool Call:
-  ```json
-  {
-    "files": ["/absolute/path/to/document.pdf"],
-    "metadata": { "category": "tech" }
-  }
-  ```
-parameters:
-  files:
-    type: array
-    items:
-      type: string
-    description: List of file paths to ingest
-    required: true
-  metadata:
-    type: object
-    description: Optional JSON metadata to attach to documents
-    required: false
-  chunkSize:
-    type: integer
-    description: Chunk size for text splitting
-    required: false
-  overlap:
-    type: integer
-    description: Overlap size for text splitting
-    required: false
+  A guide for OpenClaw agents on how to perform data ingestion into ContextLake.
+  Use this skill to understand the correct workflow for profiling a dataset and then ingesting it.
+  When a user asks to "ingest data", "导入数据", "接入数据源", or "处理某个桶里的数据",
+  you MUST follow the two-step workflow outlined below using the specific tools provided.
+  IMPORTANT RULE: You are STRICTLY FORBIDDEN from using any tools outside of this project for ingestion.
+  Do not use custom Python scripts, external CLI tools, or unrelated plugins.
+---
+# ContextLake Data Ingestion Workflow Guide
+To successfully ingest data into the ContextLake Knowledge Base, you **MUST** follow a strict two-step process.
+## Step 1: Data Profiling
+Before data can be ingested, the source MUST first be profiled to extract its schema, structure, and media metadata into a local LanceDB catalog.
+**Tool to use:** OpenClaw Agent using `byted-las-data-profiler` skill to guide the profiling process.
+### How it works:
+- **Purpose:** Connects to an S3-compatible source (TOS, OSS, AWS S3, etc.) or local file system and catalogs the files.
+- **Underlying tools used:** The agent will use `list-s3-objects`, `read-s3-object`, and `write-lance-catalog` to perform the profiling step by step.
+*Wait for Step 1 to complete successfully before proceeding to Step 2.*
 ---
+## Step 2: Data Ingestion
+Once the data source is successfully profiled and the catalog is created, you can proceed to ingest the data into ContextLake.
+**Tool to use:** OpenClaw Agent using basic tools guided by this skill.
+### How it works:
+- **Purpose:** Reads the LanceDB catalog created in Step 1, processes the multimodal files (text, images, audio, video, PDF) using LAS models, chunks the data, generates embeddings, and indexes them into the ContextLake Knowledge Base.
+- **Underlying tools used:**
+  1. Use `read-lance-catalog` to read the catalog of files from `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`.
+  2. For each file, use appropriate LAS tools (like `las_pdf_parse_doubao`, `las_image_resample`, `las_long_video_understand`) to extract text and features.
+  3. Chunk and process the text.
+  4. Use the embedding tool or model to generate vectors.
+  5. Save the final chunks and vectors into the main ContextLake knowledge base.
+*Note: You are acting as the ingestion pipeline. You must coordinate the reading of the catalog and the processing of each file type using the available LAS tools.*
+---
+## Auxiliary Tools (Use only when necessary)
+If you need to verify the catalog contents between Step 1 and Step 2, or if ingestion fails and you need to debug:
+- **`read-lance-catalog`**: Use this tool to read the records from the catalog database created in Step 1.
+  - Requires `db_path` (e.g., `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`) and `table_name` (usually "files").
+- **`list-s3-objects` / `read-s3-object`**: Use these tools to manually inspect the raw source files if profiling fails.
+- **`contextlake-list-datasource`**: Use this tool to see all data sources that have been connected/profiled.
+## Error Handling
+- If ingestion (Step 2) fails because the datasource is not found, ensure that the data profiling step completed successfully for that exact `datasource_name`.
+- Always report the results of both steps to the user clearly.

package/src/skills/las-data-profiler/SKILL.md CHANGED Viewed

@@ -1,11 +1,13 @@
 ---
 name: byted-las-data-profiler
 description: |
-  Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them.
-  It writes the catalog index to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
+  Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them,
+  performs schema inference and column semantic analysis on structured data (JSONL/CSV/Parquet/JSON),
+  extracts key meta-information for media files (Image/Audio/Video/PDF) by reading only header bytes,
+  and writes all results to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
   IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
-  You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`, `las-data-profiler`) to accomplish the profiling tasks.
+  You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`) to accomplish the profiling tasks.
 ---
 ## Trigger Scenarios
@@ -16,24 +18,28 @@ Be sure to use this Skill when the user mentions the following scenarios:
 - Need to understand what a batch of data files contains and what their schema looks like
 - Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
 - Need to write the meta-information of object storage or local files into LanceDB
+- Mentions TOS, boto3, or object storage data profiling
 - Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
+- Need to batch identify the type and size of remote/local files and build an index
+- Need to quickly understand the structure of an unfamiliar dataset (what files it contains, what the schema looks like, and what the fields mean)
+- Need to connect/dock a data source for profiling
+- Mentions "connecting" or "docking" a data source
-## Overall Workflow
-When instructed to profile a dataset, you should prefer using the `las-data-profiler` tool directly, which automatically handles the S3 listing and LanceDB writing using internal TypeScript logic.
-If you need to perform custom exploration, you can use `list-s3-objects` to traverse the bucket and `read-s3-object` to read file headers, and `write-lance-catalog` to save results.
+## Overview
+This Skill acts as a Dataset Profiling Guide. You should use the `list-s3-objects` tool to traverse the S3 bucket or local directory, use `read-s3-object` to read file contents or headers, parse the schema or media metadata, and finally use `write-lance-catalog` to save the catalog into a local LanceDB.
-## Parameter Description (for `las-data-profiler` tool)
-| Parameter | Description | Example |
-|-----------|-------------|---------|
-| datasource_name | The name of the data source | my_tos_data |
-| vendor | volcengine / alibaba / tencent / aws / local | volcengine |
-| endpoint | S3 Endpoint URL (not required for local) | https://tos-s3-cn-beijing.volces.com |
-| access_key | AK | - |
-| secret_key | SK | - |
-| region | Region identifier | cn-beijing |
-| bucket | Bucket name (root directory path when local) | my-data-bucket |
-| prefix | Path prefix to limit the scan scope | datasets/2024/ |
+1. **Cataloging**: Use `list-s3-objects` to record the meta-information (path, size, etc.) of files.
+2. **Understanding Structured Data**: Use `read-s3-object` to sample JSONL / CSV / TSV / Parquet / JSON.
+3. **Extracting Media Meta-information**: Use `read-s3-object` with `maxBytes` to read only the file header (without downloading the full file) for images, audio, video, and PDFs to extract key attributes.
+4. **Writing to LanceDB**: Use `write-lance-catalog` to save the results.
 ## Output Location
 - LanceDB table storage path: `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`
-- Configuration file: `~/.openclaw/contextlake/profiler/{datasource_name}/env.sh`
+- Table names: `files`, `structured_schemas`, `media_metadata`
+## Available Tools for this Skill
+- `list-s3-objects`: To traverse and list files in the bucket/directory.
+- `read-s3-object`: To read specific bytes of a file for schema inference or metadata extraction.
+- `write-lance-catalog`: To write the profiling results to the LanceDB catalog.
+Always report the final profiling summary back to the user once `write-lance-catalog` completes successfully.