raglite-chromadb 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
raglite/__init__.py ADDED
@@ -0,0 +1 @@
+ __all__ = []
raglite/chroma_rest.py ADDED
@@ -0,0 +1,111 @@
+ from __future__ import annotations
+
+ import json
+ import urllib.error
+ import urllib.request
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class ChromaLoc:
+     base_url: str = "http://127.0.0.1:8100"
+     tenant: str = "default_tenant"
+     database: str = "default_database"
+
+     def collections_url(self) -> str:
+         return f"{self.base_url}/api/v2/tenants/{self.tenant}/databases/{self.database}/collections"
+
+
+ def _req_json(method: str, url: str, body: dict | None = None, timeout: int = 120) -> dict | list:
+     data = None if body is None else json.dumps(body).encode("utf-8")
+     req = urllib.request.Request(url, data=data, method=method)
+     req.add_header("Content-Type", "application/json")
+     try:
+         with urllib.request.urlopen(req, timeout=timeout) as resp:
+             raw = resp.read().decode("utf-8")
+             if not raw:
+                 return {}
+             return json.loads(raw)
+     except urllib.error.HTTPError as e:  # type: ignore[attr-defined]
+         detail = ""
+         try:
+             detail = e.read().decode("utf-8", errors="ignore")
+         except Exception:
+             detail = ""
+         raise RuntimeError(f"Chroma HTTP {e.code} calling {url}: {detail[:500]}")
+
+
+ def list_collections(loc: ChromaLoc) -> list[dict]:
+     res = _req_json("GET", loc.collections_url())
+     if not isinstance(res, list):
+         raise RuntimeError(f"Expected list from Chroma list_collections, got {type(res).__name__}")
+     return res
+
+
+ def get_or_create_collection(loc: ChromaLoc, name: str, *, space: str = "cosine") -> dict:
+     cols = list_collections(loc)
+     for c in cols:
+         if c.get("name") == name:
+             return c
+
+     created = _req_json(
+         "POST",
+         loc.collections_url(),
+         {"name": name, "metadata": {"hnsw:space": space}},
+     )
+     if not isinstance(created, dict):
+         raise RuntimeError(f"Expected dict from Chroma create_collection, got {type(created).__name__}")
+     return created
+
+
+ def add(
+     loc: ChromaLoc,
+     collection_id: str,
+     *,
+     ids: list[str],
+     documents: list[str],
+     embeddings: list[list[float]],
+     metadatas: list[dict] | None = None,
+ ) -> None:
+     url = f"{loc.collections_url()}/{collection_id}/add"
+     body: dict = {"ids": ids, "documents": documents, "embeddings": embeddings}
+     if metadatas is not None:
+         body["metadatas"] = metadatas
+     _req_json("POST", url, body, timeout=600)
+
+
+ def upsert(
+     loc: ChromaLoc,
+     collection_id: str,
+     *,
+     ids: list[str],
+     documents: list[str],
+     embeddings: list[list[float]],
+     metadatas: list[dict] | None = None,
+ ) -> None:
+     """Upsert records into a collection.
+
+     Chroma's /upsert updates existing ids and inserts new ones.
+     """
+     url = f"{loc.collections_url()}/{collection_id}/upsert"
+     body: dict = {"ids": ids, "documents": documents, "embeddings": embeddings}
+     if metadatas is not None:
+         body["metadatas"] = metadatas
+     _req_json("POST", url, body, timeout=600)
+
+
+ def query(
+     loc: ChromaLoc,
+     collection_id: str,
+     *,
+     query_embeddings: list[list[float]],
+     n_results: int = 10,
+     include: list[str] | None = None,
+ ) -> dict:
+     url = f"{loc.collections_url()}/{collection_id}/query"
+     body: dict = {"query_embeddings": query_embeddings, "n_results": n_results}
+     if include is not None:
+         body["include"] = include
+     res = _req_json("POST", url, body, timeout=600)
+     assert isinstance(res, dict)
+     return res
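Taken together, these helpers cover the collection lifecycle. A minimal usage sketch, assuming a Chroma v2 REST server is already listening on the default base_url, that the collection dict returned by get_or_create_collection carries an "id" field (as the collection_id parameters above imply), and with toy 3-dimensional embeddings standing in for real model output ("docs" is a placeholder name):

from raglite.chroma_rest import ChromaLoc, get_or_create_collection, query, upsert

loc = ChromaLoc()  # http://127.0.0.1:8100, default tenant/database
col = get_or_create_collection(loc, "docs", space="cosine")

# Upsert two records; /upsert updates existing ids and inserts new ones.
upsert(
    loc,
    col["id"],
    ids=["doc-1", "doc-2"],
    documents=["hello world", "goodbye world"],
    embeddings=[[0.1, 0.2, 0.3], [0.3, 0.2, 0.1]],
    metadatas=[{"source": "demo"}, {"source": "demo"}],
)

# Nearest-neighbour query with a single query embedding.
res = query(loc, col["id"], query_embeddings=[[0.1, 0.2, 0.3]], n_results=2)
print(res.get("ids"), res.get("distances"))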
raglite/extract.py ADDED
@@ -0,0 +1,63 @@
+ from __future__ import annotations
+
+ import re
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Literal
+
+ from bs4 import BeautifulSoup
+ from pypdf import PdfReader
+
+
+ FileKind = Literal["pdf", "html", "htm", "txt"]
+
+
+ @dataclass
+ class ExtractResult:
+     kind: FileKind
+     text: str
+
+
+ def _clean_text(s: str) -> str:
+     s = s.replace("\r\n", "\n").replace("\r", "\n")
+     s = re.sub(r"[ \t]+", " ", s)
+     s = re.sub(r"\n{3,}", "\n\n", s)
+     return s.strip()
+
+
+ def extract_pdf(path: Path) -> ExtractResult:
+     reader = PdfReader(str(path))
+     parts = []
+     for i, page in enumerate(reader.pages):
+         try:
+             txt = page.extract_text() or ""
+         except Exception:
+             txt = ""
+         if txt.strip():
+             parts.append(f"\n\n--- Page {i+1} ---\n\n{txt}")
+     return ExtractResult(kind="pdf", text=_clean_text("\n".join(parts)))
+
+
+ def extract_html(path: Path) -> ExtractResult:
+     html = path.read_text(encoding="utf-8", errors="ignore")
+     soup = BeautifulSoup(html, "lxml")
+
+     # Remove scripts/styles
+     for tag in soup(["script", "style", "noscript"]):
+         tag.decompose()
+
+     text = soup.get_text("\n")
+     return ExtractResult(kind="html", text=_clean_text(text))
+
+
+ def extract_txt(path: Path) -> ExtractResult:
+     return ExtractResult(kind="txt", text=_clean_text(path.read_text(encoding="utf-8", errors="ignore")))
+
+
+ def extract_file(path: Path) -> ExtractResult:
+     suffix = path.suffix.lower().lstrip(".")
+     if suffix == "pdf":
+         return extract_pdf(path)
+     if suffix in ("html", "htm"):
+         return extract_html(path)
+     return extract_txt(path)
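A short sketch of the dispatcher in use (the file names here are hypothetical; pypdf, beautifulsoup4, and lxml must be installed for the PDF and HTML branches):

from pathlib import Path

from raglite.extract import extract_file

# extract_file dispatches on the suffix; anything unrecognized falls back to txt.
for p in [Path("manual.pdf"), Path("guide.html"), Path("notes.txt")]:
    result = extract_file(p)
    print(p.name, result.kind, len(result.text), "chars")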
raglite/prompts.py ADDED
@@ -0,0 +1,56 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class PromptPair:
+     execution_notes_prompt: str
+     tool_summary_prompt: str
+
+
+ def build_prompts(*, token_cap_hint: str = "~1200 tokens max") -> PromptPair:
+     # These prompts are designed to be copy/pasted into Cosmo.
+     execution_notes = f"""You are an expert at converting documentation into EXECUTION-RELEVANT notes for an AI agent that can run tools (CLI commands, HTTP calls, scripts, and functions).
+
+ OUTPUT FORMAT (Markdown):
+ - Title
+ - What this tool/service is
+ - When to use
+ - Inputs (required/optional)
+ - Outputs
+ - Preconditions / assumptions
+ - Step-by-step 'golden path' (numbered)
+ - Verification checks (how to confirm success)
+ - Common errors + fixes
+ - Safety/rollback notes (what not to do / how to undo)
+
+ RULES:
+ - Be concise and operational; no marketing.
+ - Prefer concrete commands, flags, endpoints, and example payloads.
+ - If the doc is long, extract only what is needed to execute.
+ - Keep the final output within {token_cap_hint}.
+
+ SOURCE DOCUMENT (extracted text) is below. Use it as the only source of truth.
+ ---
+ """
+
+     tool_summary = f"""You are an expert at writing ultra-condensed TOOL INDEX summaries for an agent tool library.
+
+ Write a short Markdown file with:
+ - Tool name
+ - 1-sentence purpose
+ - Capabilities (3-7 bullets)
+ - Required environment/dependencies
+ - Primary entrypoints (commands/endpoints)
+ - Key limitations / footguns (1-3 bullets)
+
+ RULES:
+ - No fluff. Assume the reader is an executor agent.
+ - Keep within ~250-400 tokens.
+
+ SOURCE DOCUMENT (extracted text) is below. Use it as the only source of truth.
+ ---
+ """
+
+     return PromptPair(execution_notes_prompt=execution_notes, tool_summary_prompt=tool_summary)
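Since both prompts end with a "---" separator and expect the source document after it, a plausible pairing with extract.py looks like this sketch (the input and output paths are illustrative, and the concatenation step is an assumption about intended use, not an API of the package):

from pathlib import Path

from raglite.extract import extract_file
from raglite.prompts import build_prompts

prompts = build_prompts(token_cap_hint="~800 tokens max")
doc = extract_file(Path("manual.pdf"))  # hypothetical input file

# Concatenate prompt + extracted text; the result is what gets pasted into Cosmo.
request = prompts.execution_notes_prompt + doc.text
Path("execution_notes_request.txt").write_text(request, encoding="utf-8")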