PyPI - docforge-cli - Versions diffs - 0.4.0__tar.gz → 0.5.0__tar.gz - Mend

docforge-cli 0.4.0.tar.gz → 0.5.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

{docforge_cli-0.4.0 → docforge_cli-0.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docforge-cli
-Version: 0.4.0
+Version: 0.5.0
 Summary: Forge searchable context from Confluence and git repos for AI coding assistants
 License: MIT
 Project-URL: Homepage, https://GranatenUdo.github.io/docforge/

{docforge_cli-0.4.0 → docforge_cli-0.5.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "docforge-cli"
-version = "0.4.0"
+version = "0.5.0"
 description = "Forge searchable context from Confluence and git repos for AI coding assistants"
 readme = "README.md"
 license = {text = "MIT"}

{docforge_cli-0.4.0 → docforge_cli-0.5.0}/src/docforge/api.py RENAMED Viewed

@@ -209,31 +209,63 @@ async def search(
         async with pool.acquire() as conn:
             rows = await conn.fetch(
                 """
-                SELECT
-                    c.text,
-                    c.section_title,
-                    s.title AS source_title,
-                    s.url AS source_url,
-                    s.tags AS source_tags,
-                    1 - (c.embedding <=> $1::vector) AS similarity,
-                    (1 - (c.embedding <=> $1::vector)) *
-                        (1
-                         + $2::float * cardinality(
-                             ARRAY(SELECT unnest(s.tags) INTERSECT SELECT unnest($3::text[]))
-                           )
-                         + $4::float * (CASE WHEN 'org' = ANY(s.tags) THEN 1 ELSE 0 END)
-                        ) AS boosted_score
-                FROM chunks c
-                JOIN sources s ON c.source_id = s.id
-                WHERE s.status = 'active'
+                WITH q_tsq AS (SELECT websearch_to_tsquery($8::regconfig, $2::text) AS q),
+                     dense AS (
+                         SELECT id, source_id, text, section_title,
+                                ROW_NUMBER() OVER (ORDER BY dist) AS rank
+                         FROM (
+                             SELECT c.id, c.source_id, c.text, c.section_title,
+                                    c.embedding <=> $1::vector AS dist
+                             FROM chunks c JOIN sources s ON c.source_id = s.id
+                             WHERE s.status = 'active'
+                             ORDER BY c.embedding <=> $1::vector
+                             LIMIT $3
+                         ) AS t
+                     ),
+                     sparse AS (
+                         SELECT id, source_id, text, section_title,
+                                ROW_NUMBER() OVER (ORDER BY rk DESC) AS rank
+                         FROM (
+                             SELECT c.id, c.source_id, c.text, c.section_title,
+                                    ts_rank_cd(c.text_tsv, (SELECT q FROM q_tsq)) AS rk
+                             FROM chunks c JOIN sources s ON c.source_id = s.id
+                             WHERE s.status = 'active'
+                               AND c.text_tsv @@ (SELECT q FROM q_tsq)
+                             ORDER BY ts_rank_cd(c.text_tsv, (SELECT q FROM q_tsq)) DESC
+                             LIMIT $3
+                         ) AS t
+                     ),
+                     fused AS (
+                         SELECT COALESCE(d.id, sp.id) AS id,
+                                COALESCE(d.source_id, sp.source_id) AS source_id,
+                                COALESCE(d.text, sp.text) AS text,
+                                COALESCE(d.section_title, sp.section_title) AS section_title,
+                                COALESCE(1.0/($9 + d.rank), 0)
+                                  + COALESCE(1.0/($9 + sp.rank), 0) AS rrf
+                         FROM dense d FULL OUTER JOIN sparse sp ON d.id = sp.id
+                     )
+                SELECT f.text, f.section_title,
+                       s.title AS source_title, s.url AS source_url, s.tags AS source_tags,
+                       f.rrf AS similarity,
+                       f.rrf * (1
+                                + $4::float * cardinality(
+                                    ARRAY(SELECT unnest(s.tags) INTERSECT SELECT unnest($5::text[]))
+                                  )
+                                + $6::float * (CASE WHEN 'org' = ANY(s.tags) THEN 1 ELSE 0 END)
+                       ) AS boosted_score
+                FROM fused f JOIN sources s ON f.source_id = s.id
                 ORDER BY boosted_score DESC
-                LIMIT $5
+                LIMIT $7
                 """,
-                np.array(query_vector, dtype=np.float32),
-                settings.tag_match_weight,
-                user_tags,
-                settings.org_tag_weight,
-                req.limit,
+                np.array(query_vector, dtype=np.float32),  # $1
+                req.query,  # $2
+                settings.hybrid_pool_size,  # $3
+                settings.tag_match_weight,  # $4
+                user_tags,  # $5
+                settings.org_tag_weight,  # $6
+                req.limit,  # $7
+                settings.fts_language,  # $8
+                settings.rrf_k,  # $9
             )
     except Exception as e:
         logger.error("Database error during search: %s", e)

{docforge_cli-0.4.0 → docforge_cli-0.5.0}/src/docforge/cli.py RENAMED Viewed

@@ -8,6 +8,8 @@ from pathlib import Path
 import typer
+from docforge.remote_client import AuthName
 app = typer.Typer(
     help="Forge searchable context from Confluence and git repos for AI coding assistants.",
 )
@@ -125,24 +127,26 @@ def serve(
         help="Run MCP backed by a remote search API at this URL",
         envvar="DOCFORGE_API_URL",
     ),
-    auth: str = typer.Option(
-        "none",
+    auth: AuthName = typer.Option(
+        AuthName.none,
         "--auth",
-        help="Auth provider for --remote-api: none | bearer | azure",
+        help="Auth provider for --remote-api",
         envvar="DOCFORGE_AUTH",
     ),
 ) -> None:
     """Run the MCP server (or FastAPI API with --api, or remote-backed MCP with --remote-api)."""
     _setup_logging()
+    if remote_api and api:
+        typer.echo("Error: --api and --remote-api are mutually exclusive.", err=True)
+        raise typer.Exit(1)
+    if auth is not AuthName.none and not remote_api:
+        typer.echo("Warning: --auth has no effect without --remote-api.", err=True)
     if remote_api:
-        if api:
-            typer.echo("Error: --api and --remote-api are mutually exclusive.", err=True)
-            raise typer.Exit(1)
         from docforge.remote_client import run_remote_mcp
         run_remote_mcp(url=remote_api, auth_name=auth)
-        return
-    if api:
+    elif api:
         import uvicorn
         from docforge.api import app as fastapi_app

{docforge_cli-0.4.0 → docforge_cli-0.5.0}/src/docforge/config.py RENAMED Viewed

@@ -66,6 +66,17 @@ class Settings(BaseSettings):
     tag_match_weight: float = 0.1
     org_tag_weight: float = 0.05
+    # Hybrid retrieval (RRF over dense + sparse). rrf_k=60 matches the universal
+    # default (Azure AI Search, Elasticsearch, OpenSearch); higher k flattens
+    # the rank distribution, lower amplifies. hybrid_pool_size is the top-N
+    # from each retriever feeding RRF — 4-10x req.limit is the standard rule,
+    # and req.limit caps at 50 so 100 covers under-recalled queries with margin.
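+    # fts_language is the Postgres text-search config used to parse the query.
+    # text_tsv is generated with 'english' (migration 007), so switching to
+    # 'simple' for non-English content also requires rebuilding that column.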
+    rrf_k: int = 60
+    hybrid_pool_size: int = 100
+    fts_language: str = "english"
     # Default identity (used as CLI flag defaults when set via env/yml)
     default_user_name: str = ""
     default_team_name: str = ""

{docforge_cli-0.4.0 → docforge_cli-0.5.0}/src/docforge/mcp_server.py RENAMED Viewed

@@ -49,6 +49,32 @@ def _get_embedder() -> EmbedderProtocol:
     return _embedder
+def format_search_results_markdown(
+    results: list[dict],
+    *,
+    empty_message: str = "No documentation found matching your query.",
+) -> str:
+    """Render a list of search-result dicts as the canonical Markdown shape.
+    Each result must have keys: similarity, source_title, source_url, text.
+    Optional: section_title, source_tags.
+    """
+    if not results:
+        return empty_message
+    parts: list[str] = []
+    for i, r in enumerate(results, 1):
+        header = f"**Result {i}** (relevance: {r['similarity']:.2f}) -- {r['source_title']}"
+        if r.get("section_title"):
+            header += f" > {r['section_title']}"
+        header += f"\nSource: {r['source_url']}"
+        tags = r.get("source_tags") or []
+        if tags:
+            header += f"\nTags: {', '.join(tags)}"
+        parts.append(f"{header}\n\n{r['text']}")
+    return "\n\n---\n\n".join(parts)
 @mcp.tool()
 async def search_documentation(
     query: Annotated[str, Field(max_length=8000)],
@@ -115,31 +141,23 @@ async def search_documentation(
     await log_query(pool, user_name, team_name, area_name, query, len(rows))
-    if not rows:
-        return (
+    return format_search_results_markdown(
+        [
+            {
+                "similarity": row["similarity"],
+                "source_title": row["source_title"],
+                "source_url": row["source_url"],
+                "section_title": row["section_title"],
+                "source_tags": list(row["source_tags"] or []),
+                "text": row["text"],
+            }
+            for row in rows
+        ],
+        empty_message=(
             "No documentation found matching your query. "
             "The index may be empty -- run `python -m docforge ingest` to populate it."
-        )
-    parts: list[str] = []
-    for i, row in enumerate(rows, 1):
-        similarity = row["similarity"]
-        source = row["source_title"]
-        url = row["source_url"]
-        section = row["section_title"]
-        text = row["text"]
-        tags = list(row["source_tags"] or [])
-        header = f"**Result {i}** (relevance: {similarity:.2f}) — {source}"
-        if section:
-            header += f" > {section}"
-        header += f"\nSource: {url}"
-        if tags:
-            header += f"\nTags: {', '.join(tags)}"
-        parts.append(f"{header}\n\n{text}")
-    return "\n\n---\n\n".join(parts)
+        ),
+    )
 @mcp.tool()

{docforge_cli-0.4.0 → docforge_cli-0.5.0}/src/docforge/remote_client.py RENAMED Viewed

@@ -7,12 +7,21 @@ Used by `docforge serve --remote-api $URL --auth ...`. See the
 from __future__ import annotations
 import os
+from enum import Enum
 from typing import Protocol
 import httpx
 from fastmcp import FastMCP
+class AuthName(str, Enum):
+    """Selectable auth providers for the --remote-api mode."""
+    none = "none"
+    bearer = "bearer"
+    azure = "azure"
 class AuthProvider(Protocol):
     """Async source of HTTP headers attached to each remote request."""
@@ -63,15 +72,19 @@ class AzureAuth:
         return {"Authorization": f"Bearer {token.token}"}
-def make_auth_provider(name: str) -> AuthProvider:
+def make_auth_provider(name: AuthName | str) -> AuthProvider:
     """Return an AuthProvider instance for the given name."""
-    if name == "none":
+    try:
+        name = AuthName(name) if isinstance(name, str) else name
+    except ValueError as e:
+        raise ValueError(f"Unknown auth provider: {name!r}. Valid: none, bearer, azure.") from e
+    if name is AuthName.none:
         return NoneAuth()
-    if name == "bearer":
+    if name is AuthName.bearer:
         return BearerAuth()
-    if name == "azure":
+    if name is AuthName.azure:
         return AzureAuth()
-    raise ValueError(f"Unknown auth provider: {name!r}. Valid: none, bearer, azure.")
+    raise ValueError(f"Unknown auth provider: {name!r}.")
 class RemoteBackend:
@@ -86,7 +99,18 @@ class RemoteBackend:
     ) -> None:
         self._url = url.rstrip("/")
         self._auth = auth
-        self._transport = transport  # for tests
+        self._transport = transport
+        self._client: httpx.AsyncClient | None = None
+    async def _ensure_client(self) -> httpx.AsyncClient:
+        if self._client is None:
+            self._client = httpx.AsyncClient(transport=self._transport, timeout=30.0)
+        return self._client
+    async def aclose(self) -> None:
+        if self._client is not None:
+            await self._client.aclose()
+            self._client = None
     def _identity_body(self) -> dict[str, str]:
         out: dict[str, str] = {}
@@ -100,18 +124,25 @@ class RemoteBackend:
                 out[body_key] = val
         return out
-    async def search(self, *, query: str, limit: int = 5) -> str:
-        """Search the remote API and return Markdown-formatted results."""
-        body: dict[str, object] = {"query": query, "limit": limit}
-        body.update(self._identity_body())
+    async def _request(
+        self,
+        method: str,
+        path: str,
+        *,
+        json: dict[str, object] | None = None,
+    ) -> httpx.Response | str:
+        """Perform an HTTP request with auth and uniform error handling.
+        Returns the Response on 2xx; an already-formatted error string otherwise.
+        """
         try:
             headers = await self._auth.headers()
         except Exception as e:
             return f"Auth provider error: {e}"
+        client = await self._ensure_client()
         try:
-            async with httpx.AsyncClient(transport=self._transport, timeout=30.0) as client:
-                resp = await client.post(f"{self._url}/search", json=body, headers=headers)
+            resp = await client.request(method, f"{self._url}{path}", json=json, headers=headers)
         except httpx.ConnectError:
             return f"Could not reach remote API at {self._url}."
         except httpx.HTTPError as e:
@@ -123,45 +154,28 @@ class RemoteBackend:
             return f"Remote API error ({resp.status_code}). Try again in a moment."
         if resp.status_code != 200:
             return f"Remote API returned {resp.status_code}: {resp.text[:200]}"
+        return resp
-        data = resp.json()
-        results = data.get("results", [])
-        if not results:
-            return "No documentation found matching your query."
-        parts: list[str] = []
-        for i, r in enumerate(results, 1):
-            header = f"**Result {i}** (relevance: {r['similarity']:.2f}) -- {r['source_title']}"
-            if r.get("section_title"):
-                header += f" > {r['section_title']}"
-            header += f"\nSource: {r['source_url']}"
-            tags = r.get("source_tags") or []
-            if tags:
-                header += f"\nTags: {', '.join(tags)}"
-            parts.append(f"{header}\n\n{r['text']}")
-        return "\n\n---\n\n".join(parts)
+    async def search(self, *, query: str, limit: int = 5) -> str:
+        """Search the remote API and return Markdown-formatted results."""
+        body: dict[str, object] = {"query": query, "limit": limit}
+        body.update(self._identity_body())
+        result = await self._request("POST", "/search", json=body)
+        if isinstance(result, str):
+            return result
-    async def list_sources(self) -> str:
-        """List indexed sources from the remote API."""
-        try:
-            headers = await self._auth.headers()
-        except Exception as e:
-            return f"Auth provider error: {e}"
+        from docforge.mcp_server import format_search_results_markdown
-        try:
-            async with httpx.AsyncClient(transport=self._transport, timeout=10.0) as client:
-                resp = await client.get(f"{self._url}/sources", headers=headers)
-        except httpx.ConnectError:
-            return f"Could not reach remote API at {self._url}."
-        except httpx.HTTPError as e:
-            return f"Remote API error: {e}"
+        data = result.json()
+        return format_search_results_markdown(data.get("results", []))
-        if resp.status_code == 401:
-            return "Auth failed (401). Check DOCFORGE_API_URL and the --auth provider."
-        if resp.status_code != 200:
-            return f"Remote API returned {resp.status_code}: {resp.text[:200]}"
+    async def list_sources(self) -> str:
+        """List indexed sources from the remote API."""
+        result = await self._request("GET", "/sources")
+        if isinstance(result, str):
+            return result
-        data = resp.json()
+        data = result.json()
         sources = data.get("sources", [])
         if not sources:
             return "No sources indexed."
@@ -180,7 +194,7 @@ INSTRUCTIONS = (
 )
-def run_remote_mcp(*, url: str, auth_name: str = "none") -> None:
+def run_remote_mcp(*, url: str, auth_name: AuthName | str = AuthName.none) -> None:
     """Run an MCP server proxying tool calls to a remote docforge search-api."""
     auth = make_auth_provider(auth_name)
     backend = RemoteBackend(url=url, auth=auth)

docforge_cli-0.5.0/src/docforge/sql/migrations/007_add_chunks_text_tsv.sql ADDED Viewed

@@ -0,0 +1,16 @@
+-- Migration 007: add tsvector column and GIN index for hybrid retrieval.
+--
+-- text_tsv is GENERATED ALWAYS AS STORED, so Postgres backfills existing
+-- rows as part of the ALTER TABLE and auto-populates on every INSERT.
+-- No application changes required for ingest.
+--
+-- The GIN index is built non-concurrently. For the current chunk count
+-- (~tens of thousands) this is sub-second. If chunks grows past ~1M
+-- rows, switch a future migration to CREATE INDEX CONCURRENTLY (which
+-- requires running outside a transaction).
+ALTER TABLE chunks
+    ADD COLUMN IF NOT EXISTS text_tsv tsvector
+        GENERATED ALWAYS AS (to_tsvector('english', text)) STORED;
+CREATE INDEX IF NOT EXISTS chunks_text_tsv_idx ON chunks USING GIN (text_tsv);

{docforge_cli-0.4.0 → docforge_cli-0.5.0}/src/docforge_cli.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docforge-cli
-Version: 0.4.0
+Version: 0.5.0
 Summary: Forge searchable context from Confluence and git repos for AI coding assistants
 License: MIT
 Project-URL: Homepage, https://GranatenUdo.github.io/docforge/

{docforge_cli-0.4.0 → docforge_cli-0.5.0}/src/docforge_cli.egg-info/SOURCES.txt RENAMED Viewed

@@ -32,6 +32,7 @@ src/docforge/sql/migrations/003_add_source_tags.sql
 src/docforge/sql/migrations/004_add_query_log.sql
 src/docforge/sql/migrations/005_add_query_log_user_oid.sql
 src/docforge/sql/migrations/006_add_query_log_request_ms.sql
+src/docforge/sql/migrations/007_add_chunks_text_tsv.sql
 src/docforge/templates/docforge.yml
 src/docforge/templates/docker-compose.yml
 src/docforge/templates/mcp_client.py