biller_cli-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biller_cli/__init__.py +0 -0
- biller_cli/ai/__init__.py +397 -0
- biller_cli/ai/ingest.py +394 -0
- biller_cli/commands/down.py +96 -0
- biller_cli/commands/downgrade.py +142 -0
- biller_cli/commands/init.py +210 -0
- biller_cli/commands/up.py +138 -0
- biller_cli/commands/upgrade.py +271 -0
- biller_cli/main.py +34 -0
- biller_cli/templates/biller-user-layer/pom.xml +100 -0
- biller_cli/templates/biller-user-layer/src/main/java/bharat/connect/biller/ApplicationContext.java +13 -0
- biller_cli/templates/biller-user-layer/src/main/java/bharat/connect/biller/config/DatabaseConfig.java +35 -0
- biller_cli/templates/biller-user-layer/src/main/java/bharat/connect/biller/config/WebCorsConfig.java +21 -0
- biller_cli/templates/biller-user-layer/src/main/java/bharat/connect/biller/controller/BbpsRequestController.java +98 -0
- biller_cli/templates/biller-user-layer/src/main/java/bharat/connect/biller/dao/impl/BillFetchDaoImpl.java +135 -0
- biller_cli/templates/biller-user-layer/src/main/java/bharat/connect/biller/dao/impl/BillPaymentDaoImpl.java +100 -0
- biller_cli/templates/biller-user-layer/src/main/java/bharat/connect/biller/provider/BillingProviderConfig.java +34 -0
- biller_cli/templates/biller-user-layer/src/main/java/bharat/connect/biller/provider/BillingProviderProperties.java +92 -0
- biller_cli/templates/biller-user-layer/src/main/java/bharat/connect/biller/provider/impl/CsvBillingProvider.java +257 -0
- biller_cli/templates/biller-user-layer/src/main/java/bharat/connect/biller/provider/impl/ExcelBillingProvider.java +261 -0
- biller_cli/templates/biller-user-layer/src/main/java/bharat/connect/biller/provider/impl/PostgresBillingProvider.java +210 -0
- biller_cli/templates/biller-user-layer/src/main/java/bharat/connect/biller/service/impl/BillFetchServiceImpl.java +131 -0
- biller_cli/templates/biller-user-layer/src/main/java/bharat/connect/biller/service/impl/BillPaymentServiceImpl.java +175 -0
- biller_cli/templates/biller-user-layer/src/main/java/bharat/connect/biller/service/impl/HeartbeatServiceImpl.java +136 -0
- biller_cli/templates/biller-user-layer/src/main/resources/application.properties +88 -0
- biller_cli/templates/pom.xml +33 -0
- biller_cli/utils/preflight.py +151 -0
- biller_cli/utils/secrets.py +37 -0
- biller_cli/utils/version_pin.py +67 -0
- biller_cli-0.1.0.dist-info/METADATA +17 -0
- biller_cli-0.1.0.dist-info/RECORD +33 -0
- biller_cli-0.1.0.dist-info/WHEEL +4 -0
- biller_cli-0.1.0.dist-info/entry_points.txt +2 -0
biller_cli/__init__.py
ADDED
File without changes

biller_cli/ai/__init__.py
ADDED
@@ -0,0 +1,397 @@
"""
biller_cli/ai/ingest.py

Chunks bbps_biller_integrator_codebase.md and writes embeddings to ChromaDB.

Split strategy:
  Pass 1 — split on file boundary markers (^## src/)
  Pass 2 — split oversized chunks (>400 tokens) at method/blank boundaries
           with 50-token overlap

Exclusions — dead scaffolding deleted in Phase 0:
  BillerController, UserService, UserServiceImpl,
  UserDao, UserDaoImpl, User.java

Run:
  biller-cli ingest --source ~/Desktop/bbps/AIGateway/docs/bbps_biller_integrator_codebase.md
"""

from __future__ import annotations

import re
import subprocess
import sys
from pathlib import Path
from typing import Generator

import chromadb
import tiktoken

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

CHROMA_DIR = Path.home() / ".biller-cli" / "chroma"
COLLECTION_NAME = "biller_codebase"
EMBED_MODEL = "nomic-embed-text"
MAX_TOKENS = 400
OVERLAP_TOKENS = 50

# Regex that matches file-level headers only.
# Matches: ## src/main/java/... and ## src/main/resources/...
# Does NOT match: ## Java Source Files, ## Resource Files, ## AGENT_REBUILD_GUIDE, etc.
FILE_HEADER_RE = re.compile(r"^## (src/\S+)", re.MULTILINE)
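
# --- Illustrative sketch, not part of the shipped module ---------------------
# A quick probe of which headings Pass 1 treats as file boundaries: paths under
# "## src/" match, prose headings do not.
_demo_dump = (
    "## src/main/java/Foo.java\n"
    "## Java Source Files\n"
    "## src/main/resources/application.properties\n"
)
assert FILE_HEADER_RE.findall(_demo_dump) == [
    "src/main/java/Foo.java",
    "src/main/resources/application.properties",
]
# ------------------------------------------------------------------------------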

# Dead scaffolding — explicitly deleted in Phase 0.
# These paths exist in the dump (generated from biller-audit, not the clean tree).
# They must never be indexed.
EXCLUDE_PATHS: frozenset[str] = frozenset(
    {
        "src/main/java/bharat/connect/biller/controller/BillerController.java",
        "src/main/java/bharat/connect/biller/service/UserService.java",
        "src/main/java/bharat/connect/biller/service/impl/UserServiceImpl.java",
        "src/main/java/bharat/connect/biller/dao/UserDao.java",
        "src/main/java/bharat/connect/biller/dao/impl/UserDaoImpl.java",
        "src/main/java/bharat/connect/biller/model/User.java",
    }
)

# tiktoken encoder — cl100k_base is accurate enough for token budgeting on Java.
# It is NOT the nomic-embed-text tokeniser, but the counts are close enough for
# a 400-token ceiling. Do not use len(text.split()) — that undercounts by ~30%.
_ENC = tiktoken.get_encoding("cl100k_base")
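
# --- Illustrative sketch, not part of the shipped module ---------------------
# Why whitespace splitting undercounts: a CamelCase-heavy, punctuation-dense
# Java line tokenises into noticeably more BPE tokens than split() words, so
# the 400-token ceiling is budgeted with the encoder.
_java_line = "public BillFetchResponse fetchBill(BillFetchRequest request) { return dao.find(request); }"
assert len(_ENC.encode(_java_line)) > len(_java_line.split())
# ------------------------------------------------------------------------------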


# ---------------------------------------------------------------------------
# Pre-flight checks
# ---------------------------------------------------------------------------


def _check_ollama_model(model: str) -> None:
    """Exit with an actionable error if the required Ollama model is not pulled."""
    try:
        result = subprocess.run(
            ["ollama", "list"], capture_output=True, text=True, timeout=10
        )
        if model not in result.stdout:
            print(
                f"\nError: Ollama model '{model}' is not available.\n"
                f"Pull it with: ollama pull {model}\n"
            )
            sys.exit(1)
    except FileNotFoundError:
        print(
            "\nError: Ollama is not installed or not on PATH.\n"
            "Install from: https://ollama.com\n"
            f"Then run: ollama pull {model}\n"
        )
        sys.exit(1)
    except subprocess.TimeoutExpired:
        print(
            "\nError: Ollama did not respond within 10 seconds.\n"
            "Start it with: ollama serve\n"
        )
        sys.exit(1)
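
# --- Illustrative sketch, not part of the shipped module ---------------------
# `model not in result.stdout` above is a plain substring test, so it would
# also succeed on a similarly named model (e.g. a hypothetical
# "nomic-embed-text-v2"). A stricter variant compares the NAME column of
# `ollama list`, ignoring the ":latest" tag:
#
#     def _model_listed(stdout: str, model: str) -> bool:
#         names = (line.split()[0] for line in stdout.splitlines()[1:] if line.strip())
#         return any(name.split(":")[0] == model for name in names)
# ------------------------------------------------------------------------------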


# ---------------------------------------------------------------------------
# Parsing — Pass 1
# ---------------------------------------------------------------------------


def _doc_type(file_path: str) -> str:
    """Return 'source' for Java files, 'resource' for everything else."""
    return "source" if file_path.endswith(".java") else "resource"


def _parse_file_sections(text: str) -> Generator[tuple[str, str], None, None]:
    """
    Yield (file_path, content) pairs for every ## src/ section in the dump.

    Skips sections whose file_path is in EXCLUDE_PATHS.
    Content includes everything between the file header and the next ## src/ header
    (or end of document), minus the header line itself.
    """
    matches = list(FILE_HEADER_RE.finditer(text))
    for i, match in enumerate(matches):
        file_path = match.group(1)

        if file_path in EXCLUDE_PATHS:
            continue

        # Content runs from end of this header line to start of next ## src/ header.
        content_start = match.end()
        content_end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        content = text[content_start:content_end].strip()

        if content:
            yield file_path, content
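
# --- Illustrative sketch, not part of the shipped module ---------------------
# Pass 1 on a toy dump: the User.java section is dropped because its path sits
# in EXCLUDE_PATHS; the other two sections come back header-free and stripped.
_toy = (
    "## src/main/java/A.java\ncode A\n"
    "## src/main/java/bharat/connect/biller/model/User.java\ndead scaffolding\n"
    "## src/main/resources/app.properties\nkey=value\n"
)
assert list(_parse_file_sections(_toy)) == [
    ("src/main/java/A.java", "code A"),
    ("src/main/resources/app.properties", "key=value"),
]
# ------------------------------------------------------------------------------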


# ---------------------------------------------------------------------------
# Chunking — Pass 2
# ---------------------------------------------------------------------------


def _token_count(text: str) -> int:
    return len(_ENC.encode(text))


def _split_oversized(content: str, file_path: str) -> list[str]:
    """
    Split content that exceeds MAX_TOKENS.

    Split preference order:
      1. Method/block boundaries: a blank line after a closing brace (}\\n\\n)
      2. Any blank line (\\n\\n)
      3. Hard token split as last resort (preserves OVERLAP_TOKENS of context)

    Returns a list of sub-chunks, each under MAX_TOKENS where possible.
    A single method that exceeds MAX_TOKENS will not be split mid-line — it is
    kept intact and logged as an oversized chunk rather than producing incoherent
    fragments.
    """
    # Try splitting at method boundaries first, then blank lines.
    for delimiter in (r"\}\n\n", r"\n\n"):
        parts = re.split(delimiter, content)
        if len(parts) > 1:
            return _merge_parts(parts, file_path)

    # No natural boundary found — hard token split with overlap.
    return _hard_split(content)
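
# --- Illustrative sketch, not part of the shipped module ---------------------
# re.split consumes the "}\n\n" delimiter, so each part loses its closing brace
# before merging. Embeddings tolerate that, but a lookbehind split would keep
# the parts syntactically whole:
#
#     parts = re.split(r"(?<=\})\n\n", content)
# ------------------------------------------------------------------------------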


def _merge_parts(parts: list[str], file_path: str) -> list[str]:
    """
    Greedily merge split parts into chunks that stay under MAX_TOKENS.
    When a chunk would exceed MAX_TOKENS, close it and start a new one
    seeded with OVERLAP_TOKENS of the previous chunk's tail for context.
    """
    chunks: list[str] = []
    current = ""

    for part in parts:
        candidate = (current + "\n\n" + part).strip() if current else part.strip()
        if _token_count(candidate) <= MAX_TOKENS:
            current = candidate
        else:
            if current:
                chunks.append(current)
                # Seed next chunk with overlap from tail of current.
                tail_tokens = _ENC.encode(current)[-OVERLAP_TOKENS:]
                overlap_text = _ENC.decode(tail_tokens)
                current = (overlap_text + "\n\n" + part.strip()).strip()
            else:
                # Single part already exceeds MAX_TOKENS — keep it intact.
                # Splitting a single method mid-line is worse than an oversized chunk.
                chunks.append(part.strip())
                current = ""

    if current:
        chunks.append(current)

    return [c for c in chunks if c]


def _hard_split(content: str) -> list[str]:
    """Token-boundary split for content with no natural delimiters."""
    tokens = _ENC.encode(content)
    chunks: list[str] = []
    step = MAX_TOKENS - OVERLAP_TOKENS
    for start in range(0, len(tokens), step):
        chunk_tokens = tokens[start : start + MAX_TOKENS]
        chunks.append(_ENC.decode(chunk_tokens))
    return chunks
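
# --- Illustrative sketch, not part of the shipped module ---------------------
# Window arithmetic for the hard split: the stride is
# MAX_TOKENS - OVERLAP_TOKENS = 350, so windows start at 0, 350, 700, ... and
# consecutive 400-token windows share exactly OVERLAP_TOKENS tokens.
assert list(range(0, 1000, MAX_TOKENS - OVERLAP_TOKENS)) == [0, 350, 700]
# ------------------------------------------------------------------------------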


def _chunk_file(file_path: str, content: str) -> list[dict]:
    """
    Return a list of chunk dicts ready for ChromaDB insertion.

    Each dict has keys: text, file_path, chunk_index, language, token_count, doc_type
    """
    token_count = _token_count(content)

    if token_count <= MAX_TOKENS:
        sub_chunks = [content]
    else:
        sub_chunks = _split_oversized(content, file_path)

    language = "java" if file_path.endswith(".java") else (
        "xml" if file_path.endswith(".xsd") else (
            "sql" if file_path.endswith(".sql") else "properties"
        )
    )

    result = []
    for idx, chunk_text in enumerate(sub_chunks):
        result.append(
            {
                "text": chunk_text,
                "file_path": file_path,
                "chunk_index": idx,
                "language": language,
                "token_count": _token_count(chunk_text),
                "doc_type": _doc_type(file_path),
            }
        )
    return result
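
# --- Illustrative sketch, not part of the shipped module ---------------------
# Chunk-dict shape for a small resource file: it fits in one chunk and is
# classified as "properties"/"resource" by the extension fallbacks above.
_demo_chunks = _chunk_file("src/main/resources/app.properties", "server.port=8080")
assert _demo_chunks[0]["chunk_index"] == 0
assert _demo_chunks[0]["language"] == "properties"
assert _demo_chunks[0]["doc_type"] == "resource"
# ------------------------------------------------------------------------------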


# ---------------------------------------------------------------------------
# Embedding — via Ollama HTTP API
# ---------------------------------------------------------------------------


def _embed_batch(texts: list[str]) -> list[list[float]]:
    """
    Embed a batch of texts using nomic-embed-text via the Ollama Python library.
    Returns a list of embedding vectors in the same order as input texts.
    """
    import ollama  # imported here so the rest of the module is importable without ollama

    embeddings = []
    for text in texts:
        response = ollama.embeddings(model=EMBED_MODEL, prompt=text)
        embeddings.append(response["embedding"])
    return embeddings
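
# --- Note, not part of the shipped module -------------------------------------
# Each chunk above costs one round trip to Ollama. Newer ollama clients expose
# a batched call (an assumption about the installed client version; verify
# before relying on it):
#
#     resp = ollama.embed(model=EMBED_MODEL, input=texts)
#     embeddings = list(resp.embeddings)
#
# The sequential loop is the conservative, version-safe choice.
# ------------------------------------------------------------------------------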


# ---------------------------------------------------------------------------
# ChromaDB write
# ---------------------------------------------------------------------------


def _get_collection(client: chromadb.PersistentClient) -> chromadb.Collection:
    """
    Delete and recreate the collection on every ingest run.
    This prevents duplicate chunk accumulation when re-ingesting
    after a codebase update. No partial-update strategy — full rebuild only.
    """
    try:
        client.delete_collection(COLLECTION_NAME)
    except Exception:
        pass  # Collection did not exist — first run.
    return client.create_collection(
        name=COLLECTION_NAME,
        metadata={"hnsw:space": "cosine"},
    )


BATCH_SIZE = 50  # ChromaDB add() performance degrades with very large single batches.


def _write_to_chroma(
    collection: chromadb.Collection, chunks: list[dict]
) -> None:
    """Write all chunks to ChromaDB in batches."""
    total = len(chunks)
    for batch_start in range(0, total, BATCH_SIZE):
        batch = chunks[batch_start : batch_start + BATCH_SIZE]
        texts = [c["text"] for c in batch]
        embeddings = _embed_batch(texts)
        ids = [
            f"{c['file_path']}::chunk_{c['chunk_index']}" for c in batch
        ]
        metadatas = [
            {
                "file_path": c["file_path"],
                "chunk_index": c["chunk_index"],
                "language": c["language"],
                "token_count": c["token_count"],
                "doc_type": c["doc_type"],
            }
            for c in batch
        ]
        collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=texts,
            metadatas=metadatas,
        )
        done = min(batch_start + BATCH_SIZE, total)
        print(f" Embedded and stored {done}/{total} chunks...", end="\r")

    print()  # newline after the progress line
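
# --- Illustrative sketch, not part of the shipped module ---------------------
# IDs are deterministic per (file_path, chunk_index), e.g.
# "src/main/java/A.java::chunk_0", so a full re-ingest reproduces the same IDs.
# Batch slicing for a hypothetical 120-chunk ingest yields batches of 50/50/20:
assert [len(range(120)[s : s + BATCH_SIZE]) for s in range(0, 120, BATCH_SIZE)] == [50, 50, 20]
# ------------------------------------------------------------------------------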


# ---------------------------------------------------------------------------
# Public entry point
# ---------------------------------------------------------------------------


def run_ingest(source_path: Path) -> None:
    """
    Full ingest pipeline. Called by `biller-cli ingest --source <path>`.

    Steps:
      1. Pre-flight: verify Ollama + nomic-embed-text available
      2. Read source file
      3. Parse file sections (Pass 1)
      4. Chunk oversized sections (Pass 2)
      5. Write to ChromaDB (delete + recreate collection)
    """
    # --- 1. Pre-flight ---
    print("Checking Ollama availability...")
    _check_ollama_model(EMBED_MODEL)
    print(f" OK: '{EMBED_MODEL}' is available.\n")

    # --- 2. Read source ---
    if not source_path.exists():
        print(f"Error: Source file not found: {source_path}")
        sys.exit(1)

    print(f"Reading: {source_path}")
    text = source_path.read_text(encoding="utf-8")
    print(f" {len(text):,} characters loaded.\n")

    # --- 3. Parse ---
    print("Parsing file sections...")
    all_chunks: list[dict] = []
    excluded_count = 0
    section_count = 0

    for file_path, content in _parse_file_sections(text):
        section_count += 1
        file_chunks = _chunk_file(file_path, content)
        all_chunks.extend(file_chunks)

    # Count excluded sections separately for the summary.
    for match in FILE_HEADER_RE.finditer(text):
        if match.group(1) in EXCLUDE_PATHS:
            excluded_count += 1

    print(f" {section_count} sections ingested, {excluded_count} excluded (dead scaffolding).")
    print(f" {len(all_chunks)} total chunks after Pass 2 splitting.\n")

    if not all_chunks:
        print("Error: No chunks produced. Verify the source file format.")
        print("Expected headers matching: ## src/<path>")
        sys.exit(1)

    # --- 4 & 5. Embed + write ---
    print(f"Initialising ChromaDB at: {CHROMA_DIR}")
    CHROMA_DIR.mkdir(parents=True, exist_ok=True)
    client = chromadb.PersistentClient(path=str(CHROMA_DIR))
    collection = _get_collection(client)
    print(f" Collection '{COLLECTION_NAME}' ready (previous data cleared).\n")

    print(f"Embedding and storing {len(all_chunks)} chunks...")
    print(" This will take several minutes on first run.\n")
    _write_to_chroma(collection, all_chunks)

    # --- Summary ---
    final_count = collection.count()
    print("\nIngest complete.")
    print(f" Collection : {COLLECTION_NAME}")
    print(f" Location   : {CHROMA_DIR}")
    print(f" Chunks     : {final_count}")
    print(f" Excluded   : {excluded_count} dead scaffolding files\n")
    print("Smoke test:")
    print(' python -c "')
    print(' import chromadb')
    print(f' c = chromadb.PersistentClient(path=\\"{CHROMA_DIR}\\")')
    print(f' col = c.get_collection(\\"{COLLECTION_NAME}\\")')
    print(' r = col.query(query_texts=[\\"BillFetchService implementation\\"], n_results=3)')
    print(' [print(m[\\"file_path\\"]) for m in r[\\"metadatas\\"][0]]')
    print(' "')
|