lightrag 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,79 @@
1
+ # LightRAG
2
+
3
+ Lightweight Retrieval-Augmented Generation library with Knowledge Graph.
4
+ Runs in Node.js (CommonJS). Browser usage requires a bundler.
5
+
6
+ ## Install
7
+
8
+ ```bash
9
+ npm install lightrag
10
+ ```
11
+
12
+ ## Usage
13
+
14
+ ```js
15
+ const { pipeline } = require('@huggingface/transformers');
16
+ const { LightRAG, Embedder } = require('lightrag');
17
+
18
+ (async () => {
19
+ const _pipe = await pipeline('feature-extraction', 'Xenova/multilingual-e5-small', { dtype: 'fp32' });
20
+ const _embedder = new Embedder(_pipe);
21
+ const _rag = new LightRAG({
22
+ embedder: _embedder,
23
+ tokenizer: _pipe.tokenizer,
24
+ llmFunc: async (prompt, opts = {}) => {
25
+ // call your LLM API here, return string
26
+ },
27
+ });
28
+
29
+ await _rag.insert('Your document text here...');
30
+
31
+ const _answer = await _rag.query('What is this about?', { mode: 'hybrid' });
32
+ console.log(_answer);
33
+ })();
34
+ ```
35
+
36
+ ## API
37
+
38
+ ### `new LightRAG(opts)`
39
+
40
+ | Option | Type | Default | Description |
41
+ |--------|------|---------|-------------|
42
+ | `opts.llmFunc` | `async (prompt, options) => string` | **required** | LLM call function |
43
+ | `opts.embedder` | `Embedder` | **required** | Transformers.js embedder instance |
44
+ | `opts.tokenizer` | tokenizer | — | Tokenizer for token-counting chunker (falls back to character-based) |
45
+ | `opts.embeddingDim` | `number` | `384` | Embedding vector dimension |
46
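+ | `opts.chunkSize` | `number` | `480` | Max tokens per chunk when a tokenizer is supplied. Without a tokenizer this option is ignored and fixed 3000-character chunks are used |
47
+ | `opts.chunkOverlap` | `number` | `50` | Overlap in tokens between consecutive chunks when a tokenizer is supplied. Without a tokenizer a fixed 150-character overlap is used |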
48
+ | `opts.maxEntityTokens` | `number` | `96000` | Max characters (despite the name, not tokens) of each chunk sent to the LLM for entity extraction |
49
+
50
+ ### `rag.insert(text)`
51
+
52
+ Inserts document text: tokenizes → chunks → embeds each chunk into vector DB → extracts entities & relations into knowledge graph.
53
+
54
+ ### `rag.query(question, options)`
55
+
56
+ Query with RAG.
57
+
58
+ | Option | Type | Default | Description |
59
+ |--------|------|---------|-------------|
60
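+ | `options.mode` | `string` | `'hybrid'` | `'naive'` (no retrieval, question goes straight to the LLM) \| `'local'` (document chunks) \| `'global'` (knowledge-graph entities) \| `'hybrid'` \| `'mix'` (both chunks and entities) |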
61
+ | `options.systemPrompt` | `string` | `''` | System prompt prepended to LLM context |
62
+ | `options.history` | `array` | `[]` | Conversation history `[{role, content}]` |
63
+ | `options.stream` | `boolean` | `false` | Whether the LLM should stream (passed to `llmFunc`) |
64
+
65
+ ### `rag.getGraphData()`
66
+
67
+ Returns `{ nodes, edges }` — nodes with `id, entity_type, description, degree`; edges with `source, target, keywords, weight`.
68
+
69
+ ### `rag.getProgress()`
70
+
71
+ Returns `{ total, ready, isInserting, isReady }` — insertion progress.
72
+
73
+ ### `rag.toJSON()` / `LightRAG.fromJSON(data, opts)`
74
+
75
+ Serialize/deserialize the entire RAG state (vector DB + knowledge graph).
76
+
77
+ ## License
78
+
79
+ MIT
package/package.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "name": "lightrag",
3
+ "version": "1.0.0",
4
+ "description": "Lightweight RAG library with knowledge graph — browser and Node.js, MIT licensed",
5
+ "main": "src/index.js",
6
+ "files": [
7
+ "src/"
8
+ ],
9
+ "scripts": {
10
+ "test": "echo \"Error: no test specified\" && exit 1"
11
+ },
12
+ "keywords": [
13
+ "rag",
14
+ "knowledge-graph",
15
+ "vector-db",
16
+ "llm",
17
+ "transformers.js",
18
+ "lightrag",
19
+ "entity-extraction"
20
+ ],
21
+ "author": "BlackboardLM",
22
+ "license": "MIT",
23
+ "repository": {
24
+ "type": "git",
25
+ "url": "https://github.com/winterist/BlackboardLM"
26
+ },
27
+ "type": "commonjs",
28
+ "dependencies": {
29
+ "@huggingface/transformers": "^4.2.0"
30
+ }
31
+ }
package/src/chunker.js ADDED
@@ -0,0 +1,33 @@
1
/**
 * Splits text into overlapping windows, measured in tokens when a tokenizer
 * is available and in characters otherwise.
 */
class TokenChunker {
  /**
   * @param {object} [tokenizer] - Object exposing `encode(text)` (returning a
   *   sliceable sequence of tokens) and `decode(tokens, options)`. When falsy,
   *   `chunk()` falls back to fixed-size character windows.
   * @param {number} [tokenLimit=480] - Maximum tokens per chunk (tokenizer path only).
   * @param {number} [overlapTokens=50] - Tokens shared by consecutive chunks (tokenizer path only).
   */
  constructor(tokenizer, tokenLimit = 480, overlapTokens = 50) {
    this._tokenizer = tokenizer;
    this._tokenLimit = tokenLimit;
    this._overlapTokens = overlapTokens;
  }

  /**
   * Cuts `text` into overlapping chunks.
   *
   * Without a tokenizer the text is cut into 3000-character windows that
   * overlap by 150 characters; `tokenLimit` / `overlapTokens` are not used.
   *
   * @param {string} text - Text to split.
   * @returns {string[]} Chunks in document order; empty when there is nothing to split.
   */
  chunk(text) {
    if (!this._tokenizer) {
      return TokenChunker._windows(text.length, 3000, 150)
        .map(([from, to]) => text.slice(from, to));
    }
    const tokens = this._tokenizer.encode(text);
    return TokenChunker._windows(tokens.length, this._tokenLimit, this._overlapTokens)
      .map(([from, to]) => this._tokenizer.decode(tokens.slice(from, to), { skip_special_tokens: true }));
  }

  /**
   * Computes the `[start, end)` index pairs of a sliding window over a
   * sequence of `length` items.
   *
   * @param {number} length - Number of items in the sequence.
   * @param {number} limit - Maximum window size.
   * @param {number} overlap - Items shared between consecutive windows.
   * @returns {Array<[number, number]>} Window bounds in order.
   */
  static _windows(length, limit, overlap) {
    const spans = [];
    let start = 0;
    while (start < length) {
      const end = Math.min(start + limit, length);
      spans.push([start, end]);
      if (end >= length) break;
      // Always move forward by at least one item: with overlap >= limit the
      // plain `end - overlap` step would stand still (or go backwards) and
      // the loop would never terminate.
      start = Math.max(end - overlap, start + 1);
    }
    return spans;
  }
}
33
+ module.exports = { TokenChunker };
@@ -0,0 +1,18 @@
1
/**
 * Thin wrapper around a feature-extraction pipeline that adds the
 * `query: ` / `passage: ` prefixes before embedding.
 */
class Embedder {
  /**
   * @param {Function} pipeline - Async callable invoked as
   *   `pipeline(texts, { pooling: 'mean', normalize: true })` whose result
   *   exposes `tolist()`.
   */
  constructor(pipeline) {
    this._pipe = pipeline;
  }

  /**
   * Embeds a batch of texts.
   *
   * @param {string[]} texts - Texts to embed.
   * @param {string} [context='document'] - `'query'` prefixes each text with
   *   `query: `; any other value prefixes it with `passage: `.
   * @returns {Promise<Array>} Whatever the pipeline output's `tolist()`
   *   returns (one entry per input text is expected); `[]` for an empty batch.
   */
  async embed(texts, context = 'document') {
    // Nothing to embed: skip the model call instead of sending it an empty batch.
    if (texts.length === 0) return [];
    const prefix = context === 'query' ? 'query: ' : 'passage: ';
    const prefixed = texts.map((text) => prefix + text);
    const output = await this._pipe(prefixed, { pooling: 'mean', normalize: true });
    return output.tolist();
  }

  /**
   * Embeds a single search query.
   *
   * @param {string} query - Query text.
   * @returns {Promise<*>} First entry of the embedded batch.
   */
  async embedQuery(query) {
    const [vector] = await this.embed([query], 'query');
    return vector;
  }

  /**
   * Embeds document passages.
   *
   * @param {string[]} docs - Passages to embed.
   * @returns {Promise<Array>} Same as `embed(docs, 'document')`.
   */
  async embedDocuments(docs) {
    return this.embed(docs, 'document');
  }
}
18
+ module.exports = { Embedder };
package/src/graph.js ADDED
@@ -0,0 +1,67 @@
1
/**
 * In-memory knowledge graph: entity nodes, directed edge records, and an
 * undirected adjacency view used for neighbour lookup and degree counts.
 */
class KnowledgeGraph {
  constructor() {
    /** node id -> { id, entity_type, description } */
    this._nodes = new Map();
    /** Edge records in insertion order. */
    this._edges = [];
    /** node id -> Map(neighbour id -> most recently upserted edge between the two). */
    this._adj = new Map();
    /** Directed (source, target) key -> edge record, so upserts update instead of appending. */
    this._edgeIndex = new Map();
  }

  /**
   * Builds a collision-free lookup key for a directed edge.
   *
   * @param {string} source - Source node id.
   * @param {string} target - Target node id.
   * @returns {string} Index key.
   */
  static _edgeKey(source, target) {
    return JSON.stringify([source, target]);
  }

  /**
   * Creates a node or refines an existing one.
   *
   * For an existing node the description is replaced only by a longer one,
   * and the entity type only by a non-empty one.
   *
   * @param {string} id - Node id.
   * @param {{entity_type?: string, description?: string}} [data] - Node attributes.
   */
  upsertNode(id, data = {}) {
    const existing = this._nodes.get(id);
    if (!existing) {
      this._nodes.set(id, { id, entity_type: data.entity_type || '', description: data.description || '' });
      this._adj.set(id, new Map());
      return;
    }
    if (data.description && data.description.length > existing.description.length) {
      existing.description = data.description;
    }
    if (data.entity_type) existing.entity_type = data.entity_type;
  }

  /**
   * Creates or updates the directed edge `source -> target`, creating missing
   * endpoint nodes on the way.
   *
   * Re-upserting an existing directed edge updates its record in place
   * (non-empty `keywords` / truthy `weight` win) instead of appending a
   * duplicate to the edge list.
   *
   * @param {string} source - Source node id.
   * @param {string} target - Target node id.
   * @param {{keywords?: string, weight?: number}} [data] - Edge attributes;
   *   a new edge defaults to `keywords: ''` and `weight: 1`.
   */
  upsertEdge(source, target, data = {}) {
    if (!this._nodes.has(source)) this.upsertNode(source);
    if (!this._nodes.has(target)) this.upsertNode(target);

    const key = KnowledgeGraph._edgeKey(source, target);
    let edge = this._edgeIndex.get(key);
    if (edge) {
      if (data.keywords) edge.keywords = data.keywords;
      if (data.weight) edge.weight = data.weight;
    } else {
      edge = { source, target, keywords: data.keywords || '', weight: data.weight || 1 };
      this._edges.push(edge);
      this._edgeIndex.set(key, edge);
    }

    // Adjacency is undirected: both endpoints point at the same edge record.
    this._adj.get(source)?.set(target, edge);
    this._adj.get(target)?.set(source, edge);
  }

  /**
   * Lists a node's neighbours, heaviest edge first.
   *
   * @param {string} id - Node id.
   * @param {number} [maxDegree=10] - Maximum number of neighbours returned.
   * @returns {Array<object>} `{ nodeId, source, target, keywords, weight }` entries;
   *   empty when the node has no adjacency entry.
   */
  getNeighbors(id, maxDegree = 10) {
    const neighbours = this._adj.get(id);
    if (!neighbours) return [];
    return Array.from(neighbours.entries())
      .sort((left, right) => right[1].weight - left[1].weight)
      .slice(0, maxDegree)
      .map(([nodeId, edge]) => ({ nodeId, ...edge }));
  }

  /**
   * @returns {Array<object>} Copies of all nodes as
   *   `{ id, entity_type, description, degree }`, where `degree` is the
   *   number of distinct neighbours.
   */
  getAllNodes() {
    return Array.from(this._nodes.values(), (node) => ({
      id: node.id,
      entity_type: node.entity_type,
      description: node.description,
      degree: this._adj.get(node.id)?.size || 0,
    }));
  }

  /**
   * @returns {Array<object>} Copies of all edges whose endpoints both exist,
   *   as `{ source, target, keywords, weight }`.
   */
  getAllEdges() {
    return this._edges
      .filter((edge) => this._nodes.has(edge.source) && this._nodes.has(edge.target))
      .map(({ source, target, keywords, weight }) => ({ source, target, keywords, weight }));
  }

  /**
   * @param {string} id - Node id.
   * @returns {object|null} The stored node record, or `null` when unknown.
   */
  getNode(id) {
    return this._nodes.get(id) || null;
  }

  /** Number of nodes in the graph. */
  get size() {
    return this._nodes.size;
  }

  /**
   * @returns {{nodes: Array, edges: Array, adj: Array}} JSON-serialisable
   *   snapshot: `nodes` and `adj` as entry arrays, `edges` as the edge list.
   */
  toJSON() {
    return {
      nodes: Array.from(this._nodes.entries()),
      edges: this._edges,
      adj: Array.from(this._adj.entries()).map(([id, neighbours]) => [id, Array.from(neighbours.entries())]),
    };
  }

  /**
   * Rebuilds a graph from a `toJSON()` snapshot.
   *
   * Nodes and edges are copied so later upserts never mutate `data`. When
   * `data.adj` is missing, adjacency is rebuilt from the edge list.
   *
   * @param {{nodes?: Array, edges?: Array, adj?: Array}} data - Snapshot.
   * @returns {KnowledgeGraph} Restored graph.
   */
  static fromJSON(data) {
    const graph = new KnowledgeGraph();

    for (const [id, node] of data.nodes || []) {
      graph._nodes.set(id, { ...node });
      // Every node gets an adjacency map so later upsertEdge calls can link it.
      graph._adj.set(id, new Map());
    }

    for (const edge of data.edges || []) {
      const copy = { ...edge };
      graph._edges.push(copy);
      graph._edgeIndex.set(KnowledgeGraph._edgeKey(copy.source, copy.target), copy);
    }

    if (data.adj) {
      for (const [id, entries] of data.adj) graph._adj.set(id, new Map(entries));
    } else {
      for (const edge of graph._edges) {
        graph._adj.get(edge.source)?.set(edge.target, edge);
        graph._adj.get(edge.target)?.set(edge.source, edge);
      }
    }
    return graph;
  }
}
67
+ module.exports = { KnowledgeGraph };
package/src/index.js ADDED
@@ -0,0 +1,8 @@
1
+ const { LightRAG } = require('./lightrag');
2
+ const { Embedder } = require('./embedder');
3
+ const { KnowledgeGraph } = require('./graph');
4
+ const { VectorDB } = require('./vector_db');
5
+ const { TokenChunker } = require('./chunker');
6
+ const { ENTITY_EXTRACTION_PROMPT } = require('./prompts');
7
+
8
+ module.exports = { LightRAG, Embedder, KnowledgeGraph, VectorDB, TokenChunker, ENTITY_EXTRACTION_PROMPT };
@@ -0,0 +1,130 @@
1
+ const { TokenChunker } = require('./chunker');
2
+ const { VectorDB } = require('./vector_db');
3
+ const { KnowledgeGraph } = require('./graph');
4
+ const { ENTITY_EXTRACTION_PROMPT } = require('./prompts');
5
+
6
/**
 * Lightweight retrieval-augmented generation pipeline: chunks documents,
 * stores chunk embeddings in a vector DB, extracts entities/relations into a
 * knowledge graph via an LLM, and builds retrieval context for questions.
 */
class LightRAG {
  /**
   * @param {object} [opts]
   * @param {Function} opts.llmFunc - `async (prompt, options) => string`.
   * @param {object} opts.embedder - Object exposing `embedDocuments(texts)` and `embedQuery(text)`.
   * @param {object} [opts.tokenizer] - Tokenizer handed to the chunker.
   * @param {number} [opts.embeddingDim=384] - Embedding dimension passed to the vector DB.
   * @param {number} [opts.chunkSize=480] - Chunk size passed to the chunker.
   * @param {number} [opts.chunkOverlap=50] - Chunk overlap passed to the chunker; `0` disables overlap.
   * @param {number} [opts.maxEntityTokens=96000] - Max characters of a chunk sent for entity extraction.
   */
  constructor(opts = {}) {
    this._llmFunc = opts.llmFunc;
    this._embedder = opts.embedder;
    this._embeddingDim = opts.embeddingDim || 384;
    this._chunkSize = opts.chunkSize || 480;
    // `??` rather than `||`: an explicit overlap of 0 is a valid setting.
    this._chunkOverlap = opts.chunkOverlap ?? 50;
    this._maxEntityTokens = opts.maxEntityTokens || 96000;
    this._chunker = new TokenChunker(opts.tokenizer, this._chunkSize, this._chunkOverlap);
    this._vdb = new VectorDB(this._embeddingDim);
    this._graph = new KnowledgeGraph();
    this._progress = { total: 0, ready: 0, isInserting: false, isReady: true };
    // Per-instance counter that keeps chunk ids unique even when several
    // inserts start within the same millisecond.
    this._chunkSeq = 0;
  }

  /**
   * Indexes a document: chunk, embed and store each chunk, then extract its
   * entities and relations into the graph.
   *
   * Progress flags are reset even when embedding or storage throws, so
   * `getProgress()` never reports a stuck insertion.
   *
   * @param {string} text - Document text.
   * @returns {Promise<void>}
   */
  async insert(text) {
    const chunks = this._chunker.chunk(text);
    console.error(`[LightRAG] Chunked text (token_limit=${this._chunkSize}, overlap=${this._chunkOverlap}) into ${chunks.length} chunks`);
    this._progress = { total: chunks.length, ready: 0, isInserting: true, isReady: false };
    try {
      for (const [position, chunk] of chunks.entries()) {
        const id = `chunk_${Date.now()}_${this._chunkSeq++}`;
        console.error(`[LightRAG] Embedding chunk ${position + 1}/${chunks.length} (${chunk.length} chars)`);
        const vectors = await this._embedder.embedDocuments([chunk]);
        // Stored vectors are normalised to float32 precision.
        const vector = Array.from(new Float32Array(vectors[0]));
        await this._vdb.upsert([{ id, vector, text: chunk }]);
        console.error(`[LightRAG] Extracting entities from chunk ${position + 1}/${chunks.length}`);
        await this._extractEntities(chunk, id);
        this._progress.ready = position + 1;
      }
      console.error(`[LightRAG] Insert complete: ${this._vdb.size} vectors, ${this._graph.size} entities`);
    } finally {
      this._progress.isInserting = false;
      this._progress.isReady = true;
    }
  }

  /**
   * Normalises an entity name coming from the LLM: trims it, collapses every
   * whitespace run to one space and upper-cases the first ASCII letter of
   * each word.
   *
   * @param {*} value - Raw name; only strings and finite numbers are accepted.
   * @returns {string} Normalised name, or `''` when unusable.
   */
  _normalizeName(value) {
    const isUsable = typeof value === 'string' || (typeof value === 'number' && Number.isFinite(value));
    if (!isUsable) return '';
    return String(value)
      .trim()
      .replace(/\s+/g, ' ')
      .replace(/(^| )([a-z])/g, (match, gap, letter) => gap + letter.toUpperCase());
  }

  /**
   * Asks the LLM for entities/relations in `text` and upserts them into the
   * graph. Best-effort: any failure is logged and swallowed so a bad LLM
   * response never aborts an insert.
   *
   * @param {string} text - Chunk text.
   * @param {string} sourceId - Id of the chunk the entities come from.
   * @returns {Promise<void>}
   */
  async _extractEntities(text, sourceId) {
    console.error(`[LightRAG] Calling LLM for entity extraction (${text.length} chars)`);
    const excerpt = text.slice(0, this._maxEntityTokens);
    // Replacer function instead of a replacement string: document text
    // containing `$&`, `$$`, "$`" or `$'` must be inserted literally.
    const prompt = ENTITY_EXTRACTION_PROMPT.replace('{text}', () => excerpt);
    try {
      const raw = await this._llmFunc(prompt);
      if (typeof raw !== 'string') throw new TypeError('llmFunc must return a string');
      const parsed = this._parseJson(raw);
      const entities = Array.isArray(parsed?.entities) ? parsed.entities : [];
      const relations = Array.isArray(parsed?.relations) ? parsed.relations : [];

      for (const entity of entities) {
        const name = this._normalizeName(entity?.name);
        if (!name) continue;
        this._graph.upsertNode(name, {
          entity_type: entity.type || '',
          description: entity.description || '',
          source_id: sourceId,
        });
      }

      for (const relation of relations) {
        const source = this._normalizeName(relation?.source);
        const target = this._normalizeName(relation?.target);
        if (!source || !target) continue;
        this._graph.upsertEdge(source, target, {
          keywords: relation.keywords || '',
          weight: relation.weight || 1,
          source_id: sourceId,
        });
      }

      console.error(`[LightRAG] Entity extraction done: ${entities.length} entities, ${relations.length} relations`);
    } catch (err) {
      console.error(`[LightRAG] Entity extraction failed: ${err.message}`);
    }
  }

  /**
   * Leniently parses an LLM response as JSON: strips Markdown code fences,
   * then falls back to the outermost `{ ... }` span.
   *
   * @param {string} raw - LLM response.
   * @returns {*} Parsed value, or `null` when nothing parses.
   */
  _parseJson(raw) {
    if (typeof raw !== 'string') return null;
    let body = raw.trim();
    if (body.startsWith('```')) body = body.replace(/```\w*\n?/g, '').trim();
    try {
      return JSON.parse(body);
    } catch {
      // Not pure JSON; try the embedded object below.
    }
    const braced = body.match(/\{[\s\S]*\}/);
    if (braced) {
      try {
        return JSON.parse(braced[0]);
      } catch {
        // Still not JSON; give up.
      }
    }
    return null;
  }

  /**
   * Answers a question, optionally augmenting the system prompt with
   * retrieved context.
   *
   * In `'naive'` mode, or when nothing has been indexed, the question is
   * forwarded to the LLM without retrieval.
   *
   * @param {string} question - User question.
   * @param {object} [options]
   * @param {string} [options.mode='hybrid'] - `'naive'`, `'local'`, `'global'`, `'hybrid'` or `'mix'`.
   * @param {string} [options.systemPrompt=''] - Text placed before the retrieved context.
   * @param {Array} [options.history=[]] - Forwarded to `llmFunc`.
   * @param {boolean} [options.stream] - Forwarded to `llmFunc`.
   * @returns {Promise<*>} Whatever `llmFunc` resolves to.
   */
  async query(question, options = {}) {
    const mode = options.mode || 'hybrid';
    const history = options.history || [];
    console.error(`[LightRAG] Query in "${mode}" mode: "${question.slice(0, 80)}..."`);
    if (mode === 'naive' || this._vdb.size === 0) {
      console.error(`[LightRAG] No retrieval needed (mode=${mode}, vdb_size=${this._vdb.size})`);
      return await this._llmFunc(question, { system_prompt: options.systemPrompt || '', history, stream: options.stream });
    }
    const queryVector = await this._embedder.embedQuery(question);
    const context = await this._buildContext(new Float32Array(queryVector), mode);
    console.error(`[LightRAG] Context built: ${context.length} chars`);
    const contextSuffix = context ? `\n\n---Context from your documents---\n${context}` : '';
    const systemPrompt = (options.systemPrompt || '') + contextSuffix;
    return await this._llmFunc(question, { system_prompt: systemPrompt, history, stream: options.stream });
  }

  /**
   * Assembles retrieval context for a query vector.
   *
   * `'local'`, `'hybrid'` and `'mix'` add the 5 nearest chunks; `'global'`,
   * `'hybrid'` and `'mix'` add the 20 highest-degree graph entities.
   *
   * @param {ArrayLike<number>} _qVec - Query embedding.
   * @param {string} mode - Retrieval mode.
   * @returns {Promise<string>} Context text; `''` when nothing was retrieved.
   */
  async _buildContext(_qVec, mode) {
    const sections = [];
    const wantsChunks = mode === 'local' || mode === 'hybrid' || mode === 'mix';
    const wantsGraph = mode === 'global' || mode === 'hybrid' || mode === 'mix';

    if (wantsChunks) {
      const hits = await this._vdb.query(Array.from(_qVec), 5);
      if (hits.length) {
        sections.push('=== Relevant Document Chunks ===\n' + hits.map((hit) => `[${hit.id}] ${hit.text}`).join('\n\n'));
      }
    }

    if (wantsGraph) {
      const top = this._graph.getAllNodes()
        .sort((left, right) => right.degree - left.degree)
        .slice(0, 20);
      if (top.length) {
        sections.push('=== Knowledge Graph Entities ===\n' + top.map((node) => `- ${node.id} [${node.entity_type}]: ${node.description}`).join('\n'));
      }
    }

    return sections.join('\n\n');
  }

  /**
   * @returns {{nodes: Array, edges: Array}} All graph nodes and edges.
   */
  getGraphData() {
    return { nodes: this._graph.getAllNodes(), edges: this._graph.getAllEdges() };
  }

  /**
   * @returns {{total: number, ready: number, isInserting: boolean, isReady: boolean}}
   *   Copy of the current insertion progress.
   */
  getProgress() {
    return { ...this._progress };
  }

  /**
   * @returns {{embeddingDim: number, vdb: *, graph: *}} Serialisable state
   *   (vector DB and knowledge graph snapshots).
   */
  toJSON() {
    return {
      embeddingDim: this._embeddingDim,
      vdb: this._vdb.toJSON(),
      graph: this._graph.toJSON(),
    };
  }

  /**
   * Restores an instance from a `toJSON()` snapshot.
   *
   * @param {{embeddingDim?: number, vdb?: object, graph?: object}} data - Snapshot.
   * @param {object} [opts] - Constructor options (`llmFunc`, `embedder`, ...).
   *   The snapshot's `embeddingDim` takes precedence over `opts.embeddingDim`.
   * @returns {LightRAG} Restored instance.
   */
  static fromJSON(data, opts = {}) {
    const rag = new LightRAG({ ...opts, embeddingDim: data.embeddingDim || opts.embeddingDim || 384 });
    // Keep the freshly constructed (correctly dimensioned) stores when a part
    // of the snapshot is missing.
    if (data.vdb) rag._vdb = VectorDB.fromJSON(data.vdb);
    if (data.graph) rag._graph = KnowledgeGraph.fromJSON(data.graph);
    return rag;
  }
}
129
+
130
+ module.exports = { LightRAG };
package/src/prompts.js ADDED
@@ -0,0 +1,27 @@
1
// Prompt template sent to the LLM to extract entities and relations from one
// text chunk. It asks for a single JSON object with "entities" and
// "relations" arrays. The trailing `{text}` placeholder is substituted with
// the chunk text by the caller before the prompt is sent.
+ const ENTITY_EXTRACTION_PROMPT = `---Role---
2
+ You are an expert knowledge graph builder. Extract entities and relationships from the text.
3
+
4
+ ---Goal---
5
+ Given a text chunk, identify ALL named entities (people, places, organizations, concepts, events, dates, technologies) and the relationships between them.
6
+
7
+ ---Output Format---
8
+ Return a JSON object with "entities" and "relations":
9
+ {
10
+ "entities": [
11
+ {"name": "Entity", "type": "PERSON|ORGANIZATION|LOCATION|CONCEPT|EVENT|DATE|TECH", "description": "Brief description"}
12
+ ],
13
+ "relations": [
14
+ {"source": "Entity A", "target": "Entity B", "keywords": "relationship", "weight": 1.0}
15
+ ]
16
+ }
17
+
18
+ ---Rules---
19
+ - Entity name under 60 chars, description under 120 chars
20
+ - Extract at least 3 entities per chunk
21
+ - Weight between 0.5 (weak) and 1.5 (strong)
22
+ - Return ONLY valid JSON, no other text.
23
+
24
+ ---Text---
25
+ {text}`;
26
+
27
+ module.exports = { ENTITY_EXTRACTION_PROMPT };
@@ -0,0 +1,47 @@
1
/**
 * Minimal in-memory vector store with brute-force cosine-similarity search.
 */
class VectorDB {
  /**
   * @param {number} dim - Embedding dimension; recorded and serialised, not enforced.
   */
  constructor(dim) {
    this._dim = dim;
    /** Stored vectors; row i belongs to `_meta[i]`. */
    this._vectors = [];
    /** `{ id, text }` records parallel to `_vectors`. */
    this._meta = [];
    /** id -> row index, so upserts do not scan `_meta`. */
    this._rowById = new Map();
  }

  /**
   * Inserts entries, replacing the vector and text of any entry whose id is
   * already stored.
   *
   * @param {Array<{id: string, vector: number[], text?: string}>} entries - Entries to store.
   * @returns {Promise<void>}
   */
  async upsert(entries) {
    for (const { id, vector, text } of entries) {
      const record = { id, text: text || '' };
      const row = this._rowById.get(id);
      if (row !== undefined) {
        this._vectors[row] = vector;
        this._meta[row] = record;
      } else {
        this._rowById.set(id, this._vectors.length);
        this._vectors.push(vector);
        this._meta.push(record);
      }
    }
  }

  /**
   * Returns the stored entries most similar to `vector`.
   *
   * @param {number[]} vector - Query embedding.
   * @param {number} [topK=10] - Maximum number of results.
   * @returns {Promise<Array<{index: number, score: number, id: string, text: string}>>}
   *   Matches ordered by descending cosine similarity.
   */
  async query(vector, topK = 10) {
    if (this._vectors.length === 0) return [];
    const scored = this._vectors.map((stored, index) => ({
      index,
      score: this._cosine(vector, stored),
      id: this._meta[index].id,
      text: this._meta[index].text,
    }));
    scored.sort((left, right) => right.score - left.score);
    return scored.slice(0, topK);
  }

  /**
   * Cosine similarity of two vectors.
   *
   * @param {ArrayLike<number>} a - First vector.
   * @param {ArrayLike<number>} b - Second vector.
   * @returns {number} Similarity, or `0` when either vector has zero norm or
   *   the two vectors differ in length.
   */
  _cosine(a, b) {
    // Similarity is undefined across different dimensions; report "no match"
    // rather than scoring a truncated prefix.
    if (a.length !== b.length) return 0;
    let dot = 0;
    let sumSqA = 0;
    let sumSqB = 0;
    for (let i = 0; i < a.length; i++) {
      dot += a[i] * b[i];
      sumSqA += a[i] * a[i];
      sumSqB += b[i] * b[i];
    }
    const normA = Math.sqrt(sumSqA);
    const normB = Math.sqrt(sumSqB);
    return (normA && normB) ? dot / (normA * normB) : 0;
  }

  /** Number of stored vectors. */
  get size() {
    return this._vectors.length;
  }

  /**
   * @returns {{dim: number, vectors: Array, meta: Array}} Serialisable state.
   */
  toJSON() {
    return { dim: this._dim, vectors: this._vectors, meta: this._meta };
  }

  /**
   * Restores a store from a `toJSON()` snapshot. The snapshot's arrays are
   * copied so later upserts never mutate `data`.
   *
   * @param {{dim?: number, vectors?: Array, meta?: Array}} data - Snapshot.
   * @returns {VectorDB} Restored store.
   */
  static fromJSON(data) {
    const db = new VectorDB(data.dim);
    db._vectors = [...(data.vectors || [])];
    db._meta = [...(data.meta || [])];
    db._meta.forEach((record, row) => {
      // First occurrence wins, matching a front-to-back id lookup.
      if (!db._rowById.has(record.id)) db._rowById.set(record.id, row);
    });
    return db;
  }
}
47
+ module.exports = { VectorDB };