raglite-chromadb 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- raglite/__init__.py +1 -0
- raglite/chroma_rest.py +111 -0
- raglite/extract.py +63 -0
- raglite/prompts.py +56 -0
- raglite/raglite_cli.py +953 -0
- raglite/vector_index.py +325 -0
- raglite_chromadb-1.0.1.dist-info/METADATA +167 -0
- raglite_chromadb-1.0.1.dist-info/RECORD +11 -0
- raglite_chromadb-1.0.1.dist-info/WHEEL +5 -0
- raglite_chromadb-1.0.1.dist-info/entry_points.txt +2 -0
- raglite_chromadb-1.0.1.dist-info/top_level.txt +1 -0
raglite/__init__.py
ADDED
@@ -0,0 +1 @@
+__all__ = []
raglite/chroma_rest.py
ADDED
@@ -0,0 +1,111 @@
+from __future__ import annotations
+
+import json
+import urllib.error
+import urllib.request
+from dataclasses import dataclass
+
+
+@dataclass
+class ChromaLoc:
+    base_url: str = "http://127.0.0.1:8100"
+    tenant: str = "default_tenant"
+    database: str = "default_database"
+
+    def collections_url(self) -> str:
+        return f"{self.base_url}/api/v2/tenants/{self.tenant}/databases/{self.database}/collections"
+
+
+def _req_json(method: str, url: str, body: dict | None = None, timeout: int = 120) -> dict | list:
+    data = None if body is None else json.dumps(body).encode("utf-8")
+    req = urllib.request.Request(url, data=data, method=method)
+    req.add_header("Content-Type", "application/json")
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            raw = resp.read().decode("utf-8")
+            if not raw:
+                return {}
+            return json.loads(raw)
+    except urllib.error.HTTPError as e:  # type: ignore[attr-defined]
+        detail = ""
+        try:
+            detail = e.read().decode("utf-8", errors="ignore")
+        except Exception:
+            detail = ""
+        raise RuntimeError(f"Chroma HTTP {e.code} calling {url}: {detail[:500]}")
+
+
+def list_collections(loc: ChromaLoc) -> list[dict]:
+    res = _req_json("GET", loc.collections_url())
+    if not isinstance(res, list):
+        raise RuntimeError(f"Expected list from Chroma list_collections, got {type(res).__name__}")
+    return res
+
+
+def get_or_create_collection(loc: ChromaLoc, name: str, *, space: str = "cosine") -> dict:
+    cols = list_collections(loc)
+    for c in cols:
+        if c.get("name") == name:
+            return c
+
+    created = _req_json(
+        "POST",
+        loc.collections_url(),
+        {"name": name, "metadata": {"hnsw:space": space}},
+    )
+    if not isinstance(created, dict):
+        raise RuntimeError(f"Expected dict from Chroma create_collection, got {type(created).__name__}")
+    return created
+
+
+def add(
+    loc: ChromaLoc,
+    collection_id: str,
+    *,
+    ids: list[str],
+    documents: list[str],
+    embeddings: list[list[float]],
+    metadatas: list[dict] | None = None,
+) -> None:
+    url = f"{loc.collections_url()}/{collection_id}/add"
+    body: dict = {"ids": ids, "documents": documents, "embeddings": embeddings}
+    if metadatas is not None:
+        body["metadatas"] = metadatas
+    _req_json("POST", url, body, timeout=600)
+
+
+def upsert(
+    loc: ChromaLoc,
+    collection_id: str,
+    *,
+    ids: list[str],
+    documents: list[str],
+    embeddings: list[list[float]],
+    metadatas: list[dict] | None = None,
+) -> None:
+    """Upsert records into a collection.
+
+    Chroma's /upsert updates existing ids and inserts new ones.
+    """
+    url = f"{loc.collections_url()}/{collection_id}/upsert"
+    body: dict = {"ids": ids, "documents": documents, "embeddings": embeddings}
+    if metadatas is not None:
+        body["metadatas"] = metadatas
+    _req_json("POST", url, body, timeout=600)
+
+
+def query(
+    loc: ChromaLoc,
+    collection_id: str,
+    *,
+    query_embeddings: list[list[float]],
+    n_results: int = 10,
+    include: list[str] | None = None,
+) -> dict:
+    url = f"{loc.collections_url()}/{collection_id}/query"
+    body: dict = {"query_embeddings": query_embeddings, "n_results": n_results}
+    if include is not None:
+        body["include"] = include
+    res = _req_json("POST", url, body, timeout=600)
+    assert isinstance(res, dict)
+    return res
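Taken together, these helpers form a dependency-free REST client for Chroma's v2 API. A minimal usage sketch, assuming a Chroma server on the default base URL and that the collection record returned by the API carries its UUID under the "id" key; the 3-dimensional vectors are placeholders for real embeddings:

# Sketch only: assumes a Chroma server at http://127.0.0.1:8100 and that the
# collection record exposes its UUID as "id"; vectors are placeholder values.
from raglite.chroma_rest import ChromaLoc, get_or_create_collection, query, upsert

loc = ChromaLoc()
col = get_or_create_collection(loc, "docs", space="cosine")

upsert(
    loc,
    col["id"],
    ids=["doc-1"],
    documents=["hello world"],
    embeddings=[[0.1, 0.2, 0.3]],
    metadatas=[{"source": "example.txt"}],
)

hits = query(loc, col["id"], query_embeddings=[[0.1, 0.2, 0.3]], n_results=5)
print(hits.get("ids"), hits.get("distances"))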
raglite/extract.py
ADDED
@@ -0,0 +1,63 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Literal
+
+from bs4 import BeautifulSoup
+from pypdf import PdfReader
+
+
+FileKind = Literal["pdf", "html", "htm", "txt"]
+
+
+@dataclass
+class ExtractResult:
+    kind: FileKind
+    text: str
+
+
+def _clean_text(s: str) -> str:
+    s = s.replace("\r\n", "\n").replace("\r", "\n")
+    s = re.sub(r"[ \t]+", " ", s)
+    s = re.sub(r"\n{3,}", "\n\n", s)
+    return s.strip()
+
+
+def extract_pdf(path: Path) -> ExtractResult:
+    reader = PdfReader(str(path))
+    parts = []
+    for i, page in enumerate(reader.pages):
+        try:
+            txt = page.extract_text() or ""
+        except Exception:
+            txt = ""
+        if txt.strip():
+            parts.append(f"\n\n--- Page {i+1} ---\n\n{txt}")
+    return ExtractResult(kind="pdf", text=_clean_text("\n".join(parts)))
+
+
+def extract_html(path: Path) -> ExtractResult:
+    html = path.read_text(encoding="utf-8", errors="ignore")
+    soup = BeautifulSoup(html, "lxml")
+
+    # Remove scripts/styles
+    for tag in soup(["script", "style", "noscript"]):
+        tag.decompose()
+
+    text = soup.get_text("\n")
+    return ExtractResult(kind="html", text=_clean_text(text))
+
+
+def extract_txt(path: Path) -> ExtractResult:
+    return ExtractResult(kind="txt", text=_clean_text(path.read_text(encoding="utf-8", errors="ignore")))
+
+
+def extract_file(path: Path) -> ExtractResult:
+    suffix = path.suffix.lower().lstrip(".")
+    if suffix == "pdf":
+        return extract_pdf(path)
+    if suffix in ("html", "htm"):
+        return extract_html(path)
+    return extract_txt(path)
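A minimal sketch of the extraction entry point, assuming the module's dependencies (pypdf, beautifulsoup4, lxml) are installed; the file path is hypothetical:

# Sketch only: "manual.pdf" is a hypothetical input file.
from pathlib import Path

from raglite.extract import extract_file

result = extract_file(Path("manual.pdf"))  # routed to extract_pdf by suffix
print(result.kind)        # "pdf"
print(result.text[:200])  # normalized text with "--- Page N ---" markers

Note that unrecognized suffixes fall through to extract_txt, so binary files should be filtered out before calling extract_file.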
raglite/prompts.py
ADDED
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass
+class PromptPair:
+    execution_notes_prompt: str
+    tool_summary_prompt: str
+
+
+def build_prompts(*, token_cap_hint: str = "~1200 tokens max") -> PromptPair:
+    # These prompts are designed to be copy/pasted into Cosmo.
+    execution_notes = f"""You are an expert at converting documentation into EXECUTION-RELEVANT notes for an AI agent that can run tools (CLI commands, HTTP calls, scripts, and functions).
+
+OUTPUT FORMAT (Markdown):
+- Title
+- What this tool/service is
+- When to use
+- Inputs (required/optional)
+- Outputs
+- Preconditions / assumptions
+- Step-by-step 'golden path' (numbered)
+- Verification checks (how to confirm success)
+- Common errors + fixes
+- Safety/rollback notes (what not to do / how to undo)
+
+RULES:
+- Be concise and operational; no marketing.
+- Prefer concrete commands, flags, endpoints, and example payloads.
+- If the doc is long, extract only what is needed to execute.
+- Keep the final output within {token_cap_hint}.
+
+SOURCE DOCUMENT (extracted text) is below. Use it as the only source of truth.
+---
+"""
+
+    tool_summary = f"""You are an expert at writing ultra-condensed TOOL INDEX summaries for an agent tool library.
+
+Write a short Markdown file with:
+- Tool name
+- 1-sentence purpose
+- Capabilities (3-7 bullets)
+- Required environment/dependencies
+- Primary entrypoints (commands/endpoints)
+- Key limitations / footguns (1-3 bullets)
+
+RULES:
+- No fluff. Assume the reader is an executor agent.
+- Keep within ~250-400 tokens.
+
+SOURCE DOCUMENT (extracted text) is below. Use it as the only source of truth.
+---
+"""
+
+    return PromptPair(execution_notes_prompt=execution_notes, tool_summary_prompt=tool_summary)
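Each prompt ends with a "---" separator, so a caller can append the extracted document text directly after it. A sketch, assuming this concatenation pattern (the CLI in raglite_cli.py may combine them differently):

# Sketch only: appending the document after the prompt is an assumed usage
# pattern; raglite_cli.py may wire this up differently.
from pathlib import Path

from raglite.extract import extract_file
from raglite.prompts import build_prompts

prompts = build_prompts(token_cap_hint="~800 tokens max")
doc = extract_file(Path("tool_docs.html"))  # hypothetical input file

llm_input = prompts.execution_notes_prompt + doc.text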