@byted-las/contextlake-openclaw 1.0.6 → 1.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/commands/cli.d.ts +0 -2
- package/dist/src/commands/cli.js +0 -46
- package/dist/src/commands/index.js +0 -29
- package/dist/src/commands/slashcmd.d.ts +0 -6
- package/dist/src/commands/slashcmd.js +0 -87
- package/dist/src/commands/tools.d.ts +0 -2
- package/dist/src/commands/tools.js +0 -94
- package/dist/src/skills/contextlake-ingest/SKILL.md +55 -38
- package/dist/src/skills/las-data-profiler/SKILL.md +24 -18
- package/openclaw.plugin.json +1 -1
- package/package.json +2 -2
- package/src/commands/cli.ts +0 -45
- package/src/commands/index.ts +0 -35
- package/src/commands/slashcmd.ts +0 -59
- package/src/commands/tools.ts +0 -99
- package/src/skills/contextlake-ingest/SKILL.md +55 -38
- package/src/skills/las-data-profiler/SKILL.md +24 -18
- package/dist/src/lib/actions/ingest-source.d.ts +0 -15
- package/dist/src/lib/actions/ingest-source.js +0 -193
- package/dist/src/lib/actions/las.d.ts +0 -64
- package/dist/src/lib/actions/las.js +0 -72
- package/dist/src/lib/scripts/s3_catalog.py +0 -617
- package/dist/src/service/embedding/local.d.ts +0 -14
- package/dist/src/service/embedding/local.js +0 -107
- package/dist/src/skills/SKILL.md +0 -39
|
@@ -1,107 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.LocalEmbeddingProvider = exports.setNodeLlamaCppImporter = void 0;
|
|
4
|
-
// import type { Llama, LlamaEmbeddingContext, LlamaModel } from 'node-llama-cpp';
|
|
5
|
-
// Model reference used when the configuration names no model, or names the
// 'Xenova/all-MiniLM-L6-v2' value that LocalEmbeddingProvider's constructor remaps.
const DEFAULT_LOCAL_MODEL = 'hf:CompendiumLabs/bge-small-zh-v1.5-gguf/bge-small-zh-v1.5-f16.gguf';
|
|
6
|
-
// Cached result of loading the node-llama-cpp module: assigned either by
// setNodeLlamaCppImporter() or lazily on the first importNodeLlamaCpp() call.
let nodeLlamaImportPromise = null;
|
|
7
|
-
/**
 * Replaces the cached node-llama-cpp module promise with the result of a
 * caller-supplied importer.
 *
 * The importer is invoked immediately (not lazily), and whatever it returns is
 * what later `importNodeLlamaCpp()` calls hand back.
 *
 * @param {Function} importer - Zero-argument function whose return value is
 *   stored as the module promise.
 */
const setNodeLlamaCppImporter = (importer) => {
    const pendingModule = importer();
    nodeLlamaImportPromise = pendingModule;
};
|
|
10
|
-
exports.setNodeLlamaCppImporter = setNodeLlamaCppImporter;
|
|
11
|
-
/**
 * Returns the cached node-llama-cpp module promise, starting the dynamic
 * import on first use when nothing truthy has been cached yet.
 *
 * @returns {Promise<any>} Resolves to whatever the cached import resolves to.
 */
const importNodeLlamaCpp = async () => {
    // `||=` keeps the original truthiness check: only a falsy cache triggers the import.
    nodeLlamaImportPromise ||= import('node-llama-cpp');
    return nodeLlamaImportPromise;
};
|
|
17
|
-
/**
 * Embedding provider that runs a local model through node-llama-cpp.
 *
 * The llama runtime, model and embedding context are created lazily on the
 * first embedding request and reused afterwards. Returned vectors are plain
 * arrays scaled to unit length (an all-zero vector is returned unchanged).
 */
class LocalEmbeddingProvider {
    llama = null;
    model = null;
    context = null;
    // In-flight (or completed) initialization, shared by concurrent callers.
    initPromise = null;
    modelPath;
    /**
     * @param {{ model_name?: string }} config - Provider configuration; only
     *   `model_name` is read here.
     */
    constructor(config) {
        // Override transformers.js default with node-llama-cpp default
        this.modelPath = config.model_name === 'Xenova/all-MiniLM-L6-v2'
            ? DEFAULT_LOCAL_MODEL
            : (config.model_name || DEFAULT_LOCAL_MODEL);
    }
    /**
     * Makes sure the embedding context exists, initializing it at most once
     * even when called concurrently.
     *
     * On failure all cached handles are cleared so that a later call starts a
     * fresh initialization attempt; the error is rethrown to the caller.
     *
     * @returns {Promise<void>}
     */
    async ensureInitialized() {
        if (this.context) {
            return;
        }
        if (this.initPromise) {
            return this.initPromise;
        }
        this.initPromise = this.doInitialize().catch((err) => {
            this.initPromise = null;
            this.context = null;
            this.model = null;
            this.llama = null;
            throw err;
        });
        return this.initPromise;
    }
    /**
     * Loads node-llama-cpp, then creates whichever of the runtime, model and
     * embedding context are still missing, in that order.
     *
     * @returns {Promise<void>}
     * @throws {Error} "Local embeddings unavailable. Reason: ..." with the
     *   underlying failure attached as `cause`.
     */
    async doInitialize() {
        try {
            const { getLlama, resolveModelFile, LlamaLogLevel } = await importNodeLlamaCpp();
            if (!this.llama) {
                this.llama = await getLlama({ logLevel: LlamaLogLevel.error });
            }
            if (!this.model) {
                const resolved = await resolveModelFile(this.modelPath);
                this.model = await this.llama.loadModel({ modelPath: resolved });
            }
            if (!this.context) {
                this.context = await this.model.createEmbeddingContext();
            }
        }
        catch (err) {
            const detail = err instanceof Error ? err.message : String(err);
            throw new Error(`Local embeddings unavailable. Reason: ${detail}`, {
                cause: err,
            });
        }
    }
    /**
     * Computes a unit-length embedding for a single text.
     *
     * Non-finite components (NaN, +/-Infinity) are treated as 0 before
     * normalization.
     *
     * @param {string} text - Text to embed; must contain non-whitespace characters.
     * @returns {Promise<number[]>} A new plain array; the vector object returned
     *   by the embedding context is never modified.
     * @throws {Error} If `text` is not a non-empty string, or initialization fails.
     */
    async generateEmbedding(text) {
        // Check the type first so a non-string input gets the documented error
        // instead of a "trim is not a function" TypeError.
        if (typeof text !== 'string' || !text.trim()) {
            throw new Error('Embedding input text must be a non-empty string');
        }
        await this.ensureInitialized();
        const embedding = await this.context.getEmbeddingFor(text);
        // Sanitize into a fresh array instead of writing zeros into
        // `embedding.vector`: that object belongs to the library and may be
        // read-only/frozen, in which case an in-place write throws in strict mode.
        const result = Array.from(embedding.vector, (value) => (Number.isFinite(value) ? value : 0));
        let sumSq = 0;
        for (const value of result) {
            sumSq += value * value;
        }
        const magnitude = Math.sqrt(sumSq);
        if (magnitude > 0) {
            const scale = 1.0 / magnitude;
            for (let i = 0; i < result.length; i++) {
                result[i] *= scale;
            }
        }
        return result;
    }
    /**
     * Computes embeddings for several texts; results keep the input order.
     *
     * @param {string[]} texts - Texts to embed.
     * @returns {Promise<number[][]>} One normalized vector per input text.
     * @throws {Error} If `texts` is not an array, or any single embedding fails.
     */
    async generateEmbeddings(texts) {
        if (!Array.isArray(texts)) {
            throw new Error('Embedding input must be an array of strings');
        }
        return Promise.all(texts.map(text => this.generateEmbedding(text)));
    }
}
|
|
107
|
-
exports.LocalEmbeddingProvider = LocalEmbeddingProvider;
|
package/dist/src/skills/SKILL.md
DELETED
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: byted-las-data-profiler
|
|
3
|
-
description: |
|
|
4
|
-
Volcengine TOS Dataset Profiling Tool. Based on the S3-compatible protocol, it scans the file structure in TOS buckets and catalogs them.
|
|
5
|
-
It writes the catalog index to a local LanceDB. It is also compatible with Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, and the local file system.
|
|
6
|
-
|
|
7
|
-
IMPORTANT RULE: You are STRICTLY FORBIDDEN from writing or executing Python scripts to access S3/TOS or LanceDB.
|
|
8
|
-
You MUST exclusively use the provided tools (`list-s3-objects`, `read-s3-object`, `write-lance-catalog`, `las-data-profiler`) to accomplish the profiling tasks.
|
|
9
|
-
---
|
|
10
|
-
|
|
11
|
-
## Trigger Scenarios
|
|
12
|
-
Be sure to use this Skill when the user mentions the following scenarios:
|
|
13
|
-
- Need to scan the file structure in a TOS bucket or understand the dataset composition
|
|
14
|
-
- Need to connect to object storage (TOS/OSS/COS/S3) using the S3 protocol
|
|
15
|
-
- Need to scan, traverse, or catalog the file structure of a specific bucket or local directory
|
|
16
|
-
- Need to understand what a batch of data files contains and what their schema looks like
|
|
17
|
-
- Need to extract meta-information such as image resolution, audio/video duration, PDF page count, etc.
|
|
18
|
-
- Need to write the meta-information of object storage or local files into LanceDB
|
|
19
|
-
- Mentions keywords like "dataset scanning", "file cataloging", "data catalog", "data profiling", etc.
|
|
20
|
-
|
|
21
|
-
## Overall Workflow
|
|
22
|
-
When instructed to profile a dataset, you should prefer using the `las-data-profiler` tool directly, which automatically handles the S3 listing and LanceDB writing using internal TypeScript logic.
|
|
23
|
-
If you need to perform custom exploration, you can use `list-s3-objects` to traverse the bucket, `read-s3-object` to read file headers, and `write-lance-catalog` to save the results.
|
|
24
|
-
|
|
25
|
-
## Parameter Description (for `las-data-profiler` tool)
|
|
26
|
-
| Parameter | Description | Example |
|
|
27
|
-
|-----------|-------------|---------|
|
|
28
|
-
| datasource_name | The name of the data source | my_tos_data |
|
|
29
|
-
| vendor | volcengine / alibaba / tencent / aws / local | volcengine |
|
|
30
|
-
| endpoint | S3 Endpoint URL (not required for local) | https://tos-s3-cn-beijing.volces.com |
|
|
31
|
-
| access_key | AK | - |
|
|
32
|
-
| secret_key | SK | - |
|
|
33
|
-
| region | Region identifier | cn-beijing |
|
|
34
|
-
| bucket | Bucket name (root directory path when local) | my-data-bucket |
|
|
35
|
-
| prefix | Path prefix to limit the scan scope | datasets/2024/ |
|
|
36
|
-
|
|
37
|
-
## Output Location
|
|
38
|
-
- LanceDB table storage path: `~/.openclaw/contextlake/profiler/{datasource_name}/catalog_db`
|
|
39
|
-
- Configuration file: `~/.openclaw/contextlake/profiler/{datasource_name}/env.sh`
|