raglite-chromadb 1.0.1 (raglite_chromadb-1.0.1-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- /dev/null
+++ b/raglite/vector_index.py
@@ -0,0 +1,325 @@
+ from __future__ import annotations
+
+ import json
+ import urllib.error
+ import subprocess
+ import time
+ import urllib.request
+ import hashlib
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Iterable
+
+ try:
+     # Package context
+     from .chroma_rest import (
+         ChromaLoc,
+         get_or_create_collection,
+         query as chroma_query,
+         upsert as chroma_upsert,
+     )
+ except ImportError:  # pragma: no cover
+     # Script context
+     from chroma_rest import (
+         ChromaLoc,
+         get_or_create_collection,
+         query as chroma_query,
+         upsert as chroma_upsert,
+     )
+
+
+ DEFAULT_CHROMA_URL = "http://127.0.0.1:8100"
+ DEFAULT_OLLAMA_URL = "http://127.0.0.1:11434"
+ DEFAULT_EMBED_MODEL = "nomic-embed-text"
+
+
+ @dataclass
+ class Chunk:
+     id: str
+     text: str
+     meta: dict
+
+
+ def _sha256_text(s: str) -> str:
+     return hashlib.sha256(s.encode("utf-8", errors="ignore")).hexdigest()
+
+
+ def post_json(url: str, data: dict, timeout: int = 120) -> dict:
+     body = json.dumps(data).encode("utf-8")
+     req = urllib.request.Request(url, data=body, headers={"Content-Type": "application/json"})
+     try:
+         with urllib.request.urlopen(req, timeout=timeout) as resp:
+             return json.loads(resp.read().decode("utf-8"))
+     except urllib.error.HTTPError as e:  # type: ignore[attr-defined]
+         detail = ""
+         try:
+             detail = e.read().decode("utf-8", errors="ignore")
+         except Exception:
+             detail = ""
+         raise RuntimeError(f"HTTP {e.code} calling {url}: {detail[:500]}")
+
+
+ def _embed_input(text: str, *, max_chars: int) -> str:
+     if len(text) > max_chars:
+         return text[:max_chars] + "\n[TRUNCATED_FOR_EMBEDDING]"
+     return text
+
+
+ def ollama_embed(*, ollama_url: str, model: str, text: str, max_chars: int = 800) -> list[float]:
+     """Embed text with Ollama.
+
+     Embedding models have a context limit; we defensively truncate.
+     """
+     prompt = _embed_input(text, max_chars=max_chars)
+     data = post_json(
+         f"{ollama_url}/api/embeddings",
+         {"model": model, "prompt": prompt},
+         timeout=600,
+     )
+     emb = data.get("embedding")
+     if not isinstance(emb, list):
+         raise RuntimeError("ollama embeddings: missing embedding")
+     return emb
+
+
+ def iter_md_files(
+     root: Path,
+     *,
+     include_outlines: bool = False,
+     include_kinds: set[str] | None = None,
+     exclude_kinds: set[str] | None = None,
+ ) -> Iterable[Path]:
+     """Yield markdown files that are good retrieval units.
+
+     Default behavior: index nodes + tool summaries + execution notes + indices + root index.
+     Skip outlines by default (large + noisy).
+
+     You can further filter by kind using include_kinds/exclude_kinds.
+     Kinds: node, tool-summary, execution-notes, index, root-index, outline, md
+     """
+
+     def kind_for(p: Path) -> str:
+         if "/nodes/" in p.as_posix():
+             return "node"
+         n = p.name
+         if n.endswith(".tool-summary.md"):
+             return "tool-summary"
+         if n.endswith(".execution-notes.md"):
+             return "execution-notes"
+         if n.endswith(".outline.md"):
+             return "outline"
+         if n.endswith(".index.md"):
+             return "index"
+         if n == "index.md":
+             return "root-index"
+         return "md"
+
+     for p in root.rglob("*.md"):
+         if not p.is_file() or ".raglite" in p.parts:
+             continue
+
+         k = kind_for(p)
+         if k == "outline" and not include_outlines:
+             continue
+
+         # Default allowlist (outlines were already filtered above unless include_outlines=True)
+         default_ok = k in {"node", "tool-summary", "execution-notes", "index", "root-index"} or k == "outline"
+         if not default_ok:
+             continue
+
+         if include_kinds is not None and k not in include_kinds:
+             continue
+         if exclude_kinds is not None and k in exclude_kinds:
+             continue
+
+         yield p
+
+
+ def kind_from_name(name: str) -> str:
+     if name.endswith(".tool-summary.md"):
+         return "tool-summary"
+     if name.endswith(".execution-notes.md"):
+         return "execution-notes"
+     if name.endswith(".outline.md"):
+         return "outline"
+     if name.endswith(".index.md"):
+         return "index"
+     if name == "index.md":
+         return "root-index"
+     return "md"
+
+
+ def file_to_chunks(*, distilled_root: Path, file_path: Path, collection: str) -> list[Chunk]:
+     text = file_path.read_text(encoding="utf-8", errors="ignore")
+     max_chars = 3500
+
+     # Split by H1, otherwise hard split.
+     chunks: list[str] = []
+     cur: list[str] = []
+     for line in text.splitlines():
+         if line.startswith("# ") and cur:
+             chunks.append("\n".join(cur).strip())
+             cur = [line]
+         else:
+             cur.append(line)
+     if cur:
+         chunks.append("\n".join(cur).strip())
+
+     final: list[str] = []
+     for c in chunks:
+         if len(c) <= max_chars:
+             final.append(c)
+         else:
+             for i in range(0, len(c), max_chars):
+                 final.append(c[i : i + max_chars])
+
+     rel_path = file_path.relative_to(distilled_root).as_posix()
+
+     out: list[Chunk] = []
+     for i, c in enumerate(final):
+         cid = f"{collection}:{rel_path}::{i}"
+         out.append(
+             Chunk(
+                 id=cid,
+                 text=c,
+                 meta={
+                     "path": str(file_path),
+                     "rel_path": rel_path,
+                     "name": file_path.name,
+                     "kind": kind_from_name(file_path.name),
+                 },
+             )
+         )
+     return out
+
+
+
+ def keyword_search(*, root: Path, query: str, top_k: int = 10) -> list[dict]:
+     try:
+         proc = subprocess.run(
+             ["rg", "-n", "--no-heading", "--smart-case", query, str(root)],
+             capture_output=True,
+             text=True,
+             check=False,
+         )
+         out = proc.stdout.strip().splitlines() if proc.stdout else []
+         hits: list[dict] = []
+         for line in out:
+             parts = line.split(":", 2)
+             if len(parts) == 3:
+                 hits.append({"path": parts[0], "line": int(parts[1]), "text": parts[2]})
+             if len(hits) >= top_k:
+                 break
+         return hits
+     except FileNotFoundError:
+         return []
+
+
+ def index_distilled_dir(
+     *,
+     distilled_root: Path,
+     chroma_url: str,
+     collection: str,
+     ollama_url: str,
+     embed_model: str,
+     embed_max_chars: int = 800,
+     sleep_ms: int = 0,
+     include_outlines: bool = False,
+     include_kinds: set[str] | None = None,
+     exclude_kinds: set[str] | None = None,
+     index_cache_path: Path | None = None,
+     skip_indexed: bool = False,
+ ) -> dict:
+     loc = ChromaLoc(base_url=chroma_url)
+     c = get_or_create_collection(loc, collection, space="cosine")
+     cid = str(c["id"])
+
+     files = list(
+         iter_md_files(
+             distilled_root,
+             include_outlines=include_outlines,
+             include_kinds=include_kinds,
+             exclude_kinds=exclude_kinds,
+         )
+     )
+
+     cache: dict = {}
+     if index_cache_path is not None and index_cache_path.exists():
+         try:
+             cache = json.loads(index_cache_path.read_text(encoding="utf-8"))
+         except Exception:
+             cache = {}
+
+     # Invalidate cache if it was generated for a different target.
+     cache_meta = cache.get("__meta__") if isinstance(cache, dict) else None
+     if isinstance(cache_meta, dict):
+         if cache_meta.get("collection") != collection or cache_meta.get("chroma_url") != chroma_url:
+             cache = {}
+
+     added = 0
+     skipped = 0
+     for p in files:
+         chunks = file_to_chunks(distilled_root=distilled_root, file_path=p, collection=collection)
+         for ch in chunks:
+             key = ch.id
+             prompt = _embed_input(ch.text, max_chars=embed_max_chars)
+             h = _sha256_text(prompt)
+             cached = cache.get(key) if isinstance(cache, dict) else None
+             if (
+                 skip_indexed
+                 and isinstance(cached, dict)
+                 and cached.get("sha256") == h
+                 and cached.get("embed_model") == embed_model
+                 and int(cached.get("embed_max_chars") or embed_max_chars) == embed_max_chars
+             ):
+                 skipped += 1
+                 continue
+
+             emb = ollama_embed(ollama_url=ollama_url, model=embed_model, text=ch.text, max_chars=embed_max_chars)
+             chroma_upsert(loc, cid, ids=[ch.id], documents=[ch.text], embeddings=[emb], metadatas=[ch.meta])
+             added += 1
+
+             if index_cache_path is not None:
+                 cache[key] = {
+                     "sha256": h,
+                     "embed_model": embed_model,
+                     "embed_max_chars": embed_max_chars,
+                     "updatedAt": int(time.time()),
+                 }
+
+         if sleep_ms:
+             time.sleep(sleep_ms / 1000.0)
+
+     if index_cache_path is not None:
+         cache["__meta__"] = {
+             "collection": collection,
+             "chroma_url": chroma_url,
+             "updatedAt": int(time.time()),
+         }
+         index_cache_path.parent.mkdir(parents=True, exist_ok=True)
+         tmp = index_cache_path.with_suffix(index_cache_path.suffix + ".tmp")
+         tmp.write_text(json.dumps(cache, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+         tmp.replace(index_cache_path)
+
+     return {"files": len(files), "chunksAdded": added, "chunksSkipped": skipped, "collection": collection}
+
+
+ def query_distilled(
+     *,
+     query: str,
+     distilled_root: Path,
+     chroma_url: str,
+     collection: str,
+     ollama_url: str,
+     embed_model: str,
+     embed_max_chars: int = 800,
+     top_k: int = 10,
+     keyword_top_k: int = 10,
+ ) -> dict:
+     loc = ChromaLoc(base_url=chroma_url)
+     c = get_or_create_collection(loc, collection, space="cosine")
+     cid = str(c["id"])
+
+     qemb = ollama_embed(ollama_url=ollama_url, model=embed_model, text=query, max_chars=embed_max_chars)
+     vec = chroma_query(loc, cid, query_embeddings=[qemb], n_results=top_k, include=["documents", "metadatas", "distances"])  # type: ignore
+     kw = keyword_search(root=distilled_root, query=query, top_k=keyword_top_k)
+     return {"vector": vec, "keyword": kw}
--- /dev/null
+++ b/raglite_chromadb-1.0.1.dist-info/METADATA
@@ -0,0 +1,167 @@
+ Metadata-Version: 2.4
+ Name: raglite-chromadb
+ Version: 1.0.1
+ Summary: Local-first RAG-lite CLI: condense docs into structured Markdown, then index/query with Chroma + hybrid search
+ Author: Viraj Sanghvi
+ License: MIT
+ Keywords: rag,docs,chroma,ollama,openclaw,summarization,local-first
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ Requires-Dist: beautifulsoup4==4.12.3
+ Requires-Dist: lxml==5.3.0
+ Requires-Dist: pypdf==5.2.0
+
+ # RAGLite
+
+ <p align="center">
+   <img src="assets/hero.svg" alt="RAGLite: Compress first. Index second." width="900" />
+ </p>
+
+ RAGLite is a local-first CLI that turns a folder of docs (PDF/HTML/TXT/MD) into **structured, low-fluff Markdown** — and then makes it searchable with **Chroma** 🧠 + **ripgrep** 🔎.
+
+ Core idea: **compression-before-embeddings** ✂️➡️🧠
+
+ <p align="center">
+   <img src="assets/diagram.svg" alt="RAGLite workflow: condense, index, query" width="900" />
+ </p>
+
+ ## What you get
+
+ For each input file:
+ - `*.execution-notes.md` — practical run/operate notes (checks, failure modes, commands)
+ - `*.tool-summary.md` — compact index entry (purpose, capabilities, entrypoints, footguns)
+
+ Optionally:
+ - `raglite index` stores embeddings in **Chroma** 🧠 (one DB, many collections)
+ - `raglite query` runs **hybrid search** 🔎 (vector + keyword)
+
+ ## Why local + open-source?
+
+ If you want a private, local setup (no managed “fancy vector DB” required), RAGLite keeps everything on your machine:
+ - Distilled Markdown artifacts are plain files you can audit + version control
+ - Indexing uses **Chroma** (open-source, local) and keyword search uses **ripgrep**
+ - You can still swap in a hosted vector DB later if you outgrow local
+
+ ## Engines
+
+ RAGLite supports two backends:
+
+ - **OpenClaw (recommended):** uses your local OpenClaw Gateway `/v1/responses` endpoint for higher-quality, format-following condensation.
+ - **Ollama:** uses `POST /api/generate` for fully local inference (often less reliable at strict templates).
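+
+ For a concrete sense of what the Ollama backend receives, condensation is ultimately a `POST /api/generate` call per document. A minimal sketch (the prompt below is a placeholder, not RAGLite's actual template):
+
+ ```python
+ import json
+ import urllib.request
+
+ # Illustration only: RAGLite builds its own condensation prompts.
+ body = json.dumps({
+     "model": "llama3.2:3b",
+     "prompt": "Condense the following document into terse Markdown:\n...",
+     "stream": False,
+ }).encode("utf-8")
+ req = urllib.request.Request(
+     "http://127.0.0.1:11434/api/generate",
+     data=body,
+     headers={"Content-Type": "application/json"},
+ )
+ with urllib.request.urlopen(req, timeout=600) as resp:
+     print(json.loads(resp.read())["response"])
+ ```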
+
+ ## Prereqs
+
+ - **Python 3.11+**
+ - An LLM engine:
+   - **OpenClaw** (recommended) 🦞, or
+   - **Ollama** 🦙
+ - For search:
+   - **Chroma** (open-source, local) 🧠 at `http://127.0.0.1:8100`
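+
+ Before a long run it can be worth checking that both services answer. A rough preflight sketch (a lightweight stand-in for the planned `raglite doctor`); `/api/tags` is Ollama's model-list endpoint, and the Chroma heartbeat path is an assumption that varies across Chroma versions:
+
+ ```python
+ import urllib.request
+
+ CHECKS = {
+     "ollama": "http://127.0.0.1:11434/api/tags",
+     "chroma": "http://127.0.0.1:8100/api/v1/heartbeat",  # may be /api/v2/heartbeat on newer Chroma
+ }
+
+ for name, url in CHECKS.items():
+     try:
+         with urllib.request.urlopen(url, timeout=5) as resp:
+             print(f"{name}: ok ({resp.status})")
+     except Exception as exc:  # connection refused, 404, etc.
+         print(f"{name}: unreachable at {url}: {exc}")
+ ```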
+
+ ## Install
+
+ ```bash
+ # from repo root
+ python3 -m venv .venv
+ source .venv/bin/activate
+ pip install -e .
+ ```
+
+ ## Quickstart (60s)
+
+ ```bash
+ # 0) Setup
+ cd ~/Projects/raglite
+ source .venv/bin/activate
+
+ # 1) Condense → Index (one command)
+ raglite run /path/to/docs \
+   --out ./raglite_out \
+   --engine ollama --ollama-model llama3.2:3b \
+   --collection my-docs \
+   --chroma-url http://127.0.0.1:8100 \
+   --skip-indexed
+
+ # 2) Query
+ raglite query ./raglite_out \
+   --collection my-docs \
+   "rollback procedure"
+ ```
+
+ ## Usage
+
+ ### 1) Distill docs ✍️
+
+ ```bash
+ raglite condense /path/to/docs \
+   --out ./raglite_out \
+   --engine openclaw
+ ```
+
+ (Or fully local: `--engine ollama --ollama-model llama3.2:3b`.)
+
+ ### 2) Index distilled output (Chroma)
+
+ ```bash
+ raglite index ./raglite_out \
+   --collection my-docs \
+   --chroma-url http://127.0.0.1:8100
+ ```
+
+ ### 3) Query (hybrid)
+
+ ```bash
+ raglite query ./raglite_out \
+   --collection my-docs \
+   --top-k 5 \
+   --keyword-top-k 5 \
+   "rollback procedure"
+ ```
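+
+ The CLI covers the common cases, but the same index and query operations are importable from `raglite.vector_index` (the module shown above) if you want to drive them from Python. A minimal sketch, assuming Chroma and Ollama are running on the default ports; the cache path is a made-up location:
+
+ ```python
+ from pathlib import Path
+
+ from raglite.vector_index import (
+     DEFAULT_CHROMA_URL,
+     DEFAULT_EMBED_MODEL,
+     DEFAULT_OLLAMA_URL,
+     index_distilled_dir,
+     query_distilled,
+ )
+
+ out = Path("./raglite_out")
+
+ # Index the distilled artifacts; skip_indexed is the library-level counterpart of --skip-indexed.
+ stats = index_distilled_dir(
+     distilled_root=out,
+     chroma_url=DEFAULT_CHROMA_URL,
+     collection="my-docs",
+     ollama_url=DEFAULT_OLLAMA_URL,
+     embed_model=DEFAULT_EMBED_MODEL,
+     index_cache_path=out / ".raglite" / "index-cache.json",  # hypothetical cache location
+     skip_indexed=True,
+ )
+ print(stats)  # {"files": ..., "chunksAdded": ..., "chunksSkipped": ..., "collection": "my-docs"}
+
+ # Hybrid retrieval: vector hits from Chroma plus keyword hits from ripgrep.
+ res = query_distilled(
+     query="rollback procedure",
+     distilled_root=out,
+     chroma_url=DEFAULT_CHROMA_URL,
+     collection="my-docs",
+     ollama_url=DEFAULT_OLLAMA_URL,
+     embed_model=DEFAULT_EMBED_MODEL,
+     top_k=5,
+ )
+ print(res["keyword"])
+ ```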
+
+ ### Useful flags
+
+ - `--skip-existing` : don’t redo files that already have both outputs
+ - `--skip-indexed` : don’t re-embed chunks that are already indexed
+ - `--nodes` : write per-section nodes + per-doc/root indices
+ - `--node-max-chars 1200` : keep nodes embed-friendly
+ - `--sleep-ms 200` : throttle between files (helps avoid timeouts)
+ - `--max-chars 180000` : cap extracted text per file before summarizing
+
+ ## Output layout
+
+ RAGLite preserves folder structure under your `--out` dir:
+
+ ```text
+ <out>/
+   some/subdir/file.execution-notes.md
+   some/subdir/file.tool-summary.md
+ ```
+
+ (Default output folder is `./raglite_out`.)
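+
+ In other words, each pair of artifacts sits next to the mirrored relative path of its source file. A small illustration of the mapping (the paths here are hypothetical):
+
+ ```python
+ from pathlib import Path
+
+ docs_root = Path("/path/to/docs")   # what you passed to `raglite condense` / `raglite run`
+ out_root = Path("./raglite_out")    # the --out directory
+
+ src = docs_root / "some/subdir/file.pdf"
+ rel = src.relative_to(docs_root)    # some/subdir/file.pdf
+
+ notes = out_root / rel.parent / f"{src.stem}.execution-notes.md"
+ summary = out_root / rel.parent / f"{src.stem}.tool-summary.md"
+ print(notes)    # raglite_out/some/subdir/file.execution-notes.md
+ print(summary)  # raglite_out/some/subdir/file.tool-summary.md
+ ```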
+
+ ## Notes / gotchas
+
+ - PDF extraction is best-effort: scanned PDFs without embedded text won’t be great.
+ - If you use `--engine openclaw`, pass `--gateway-token` or set `OPENCLAW_GATEWAY_TOKEN`.
+ - Indexing defaults to high-signal artifacts (nodes/summaries/notes) and skips `*.outline.md` unless you opt in.
+
+ ## Roadmap
+
+ ### Current (implemented)
+ - `condense` — condense/summarize documents into Markdown artifacts
+ - `index` — chunk + embed + store in **Chroma** collections
+ - `query` — retrieve relevant chunks (vector + keyword)
+ - `run` — one-command pipeline (condense → index)
+ - Outline + nodes + indices: `--outline`, `--nodes`, root `index.md` + per-doc `*.index.md`
+
+ ### Next (near-term)
+ - Detect deletions (prune removed chunks from Chroma)
+ - Batch upserts to Chroma for speed
+ - Better query output formatting (snippets + anchors)
+ - `raglite doctor` (dependency checks)
+
+ (Full: [ROADMAP.md](ROADMAP.md))
+
+ ---
+
+ Built to turn “docs” into **usable, searchable tool knowledge**.
--- /dev/null
+++ b/raglite_chromadb-1.0.1.dist-info/RECORD
@@ -0,0 +1,11 @@
+ raglite/__init__.py,sha256=da1PTClDMl-IBkrSvq6JC1lnS-K_BASzCvxVhNxN5Ls,13
+ raglite/chroma_rest.py,sha256=jpIqfjPwbjsgIQtH3BG3vElwYY9KTKCne0b05YqhRcM,3452
+ raglite/extract.py,sha256=lhVcT-S5gn3MMp6vuUuQeOq9FIlajL0yPEN9Tif61MQ,1641
+ raglite/prompts.py,sha256=8jWECjSlf6FqDocxhQo-S8Sf_DkjAYreBuTk4TFAnqo,1737
+ raglite/raglite_cli.py,sha256=i-EgNLKR9ukyV6fkrlxPectRNRxapReRpgWL5NKsQ6U,37413
+ raglite/vector_index.py,sha256=VXUQ0gUYZJ66SEUk6BEZUG0dyJ91O74xLS7ig3g8TYc,10177
+ raglite_chromadb-1.0.1.dist-info/METADATA,sha256=hFtFlfdH6ROlqCAzPKsJW6AK-jxYUOaWD5GYKuFwpko,4816
+ raglite_chromadb-1.0.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ raglite_chromadb-1.0.1.dist-info/entry_points.txt,sha256=5WcIbYwLaI82HMrzBXsg0dnmQJUGBMY3kDKW8lYEVIo,52
+ raglite_chromadb-1.0.1.dist-info/top_level.txt,sha256=cFZneANtWpFMj7NcIF9ajaFVUXuALD_RS0MNiiUVODA,8
+ raglite_chromadb-1.0.1.dist-info/RECORD,,
--- /dev/null
+++ b/raglite_chromadb-1.0.1.dist-info/WHEEL
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.10.2)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
--- /dev/null
+++ b/raglite_chromadb-1.0.1.dist-info/entry_points.txt
@@ -0,0 +1,2 @@
+ [console_scripts]
+ raglite = raglite.raglite_cli:cli
--- /dev/null
+++ b/raglite_chromadb-1.0.1.dist-info/top_level.txt
@@ -0,0 +1 @@
+ raglite