docforge-cli 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. docforge_cli-0.2.0/LICENSE +21 -0
  2. docforge_cli-0.2.0/PKG-INFO +178 -0
  3. docforge_cli-0.2.0/README.md +145 -0
  4. docforge_cli-0.2.0/docforge/__init__.py +0 -0
  5. docforge_cli-0.2.0/docforge/__main__.py +5 -0
  6. docforge_cli-0.2.0/docforge/api.py +266 -0
  7. docforge_cli-0.2.0/docforge/cli.py +296 -0
  8. docforge_cli-0.2.0/docforge/config.py +99 -0
  9. docforge_cli-0.2.0/docforge/crawlers/__init__.py +1 -0
  10. docforge_cli-0.2.0/docforge/crawlers/confluence.py +109 -0
  11. docforge_cli-0.2.0/docforge/crawlers/git.py +79 -0
  12. docforge_cli-0.2.0/docforge/db.py +57 -0
  13. docforge_cli-0.2.0/docforge/ingest.py +401 -0
  14. docforge_cli-0.2.0/docforge/lint.py +92 -0
  15. docforge_cli-0.2.0/docforge/mcp_server.py +188 -0
  16. docforge_cli-0.2.0/docforge/processors/__init__.py +1 -0
  17. docforge_cli-0.2.0/docforge/processors/chunker.py +141 -0
  18. docforge_cli-0.2.0/docforge/processors/embedder.py +78 -0
  19. docforge_cli-0.2.0/docforge/processors/parser.py +143 -0
  20. docforge_cli-0.2.0/docforge/query_log.py +45 -0
  21. docforge_cli-0.2.0/docforge/ranking.py +20 -0
  22. docforge_cli-0.2.0/docforge/scripts/__init__.py +1 -0
  23. docforge_cli-0.2.0/docforge/scripts/eval_search.py +226 -0
  24. docforge_cli-0.2.0/docforge/scripts/latency_report.py +142 -0
  25. docforge_cli-0.2.0/docforge/sources.py +46 -0
  26. docforge_cli-0.2.0/docforge/sql/migrations/001_add_source_identifier.sql +3 -0
  27. docforge_cli-0.2.0/docforge/sql/migrations/002_add_status_index.sql +1 -0
  28. docforge_cli-0.2.0/docforge/sql/migrations/003_add_source_tags.sql +4 -0
  29. docforge_cli-0.2.0/docforge/sql/migrations/004_add_query_log.sql +11 -0
  30. docforge_cli-0.2.0/docforge/sql/migrations/005_add_query_log_user_oid.sql +2 -0
  31. docforge_cli-0.2.0/docforge/sql/migrations/006_add_query_log_request_ms.sql +1 -0
  32. docforge_cli-0.2.0/docforge/sql/schema.sql +29 -0
  33. docforge_cli-0.2.0/docforge/templates/docforge.yml +11 -0
  34. docforge_cli-0.2.0/docforge/templates/docker-compose.yml +14 -0
  35. docforge_cli-0.2.0/docforge/templates/mcp_client.py +83 -0
  36. docforge_cli-0.2.0/docforge/templates/sources.yml +21 -0
  37. docforge_cli-0.2.0/docforge_cli.egg-info/PKG-INFO +178 -0
  38. docforge_cli-0.2.0/docforge_cli.egg-info/SOURCES.txt +42 -0
  39. docforge_cli-0.2.0/docforge_cli.egg-info/dependency_links.txt +1 -0
  40. docforge_cli-0.2.0/docforge_cli.egg-info/entry_points.txt +2 -0
  41. docforge_cli-0.2.0/docforge_cli.egg-info/requires.txt +25 -0
  42. docforge_cli-0.2.0/docforge_cli.egg-info/top_level.txt +1 -0
  43. docforge_cli-0.2.0/pyproject.toml +73 -0
  44. docforge_cli-0.2.0/setup.cfg +4 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Tobias Ens
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,178 @@
1
+ Metadata-Version: 2.4
2
+ Name: docforge-cli
3
+ Version: 0.2.0
4
+ Summary: Forge searchable context from Confluence and git repos for AI coding assistants
5
+ License: MIT
6
+ Requires-Python: >=3.12
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: typer>=0.12
10
+ Requires-Dist: asyncpg>=0.30
11
+ Requires-Dist: httpx>=0.27
12
+ Requires-Dist: pydantic>=2.9
13
+ Requires-Dist: pydantic-settings>=2.6
14
+ Requires-Dist: beautifulsoup4>=4.12
15
+ Requires-Dist: sentence-transformers>=5.0
16
+ Requires-Dist: pgvector>=0.3
17
+ Requires-Dist: pyyaml>=6.0
18
+ Requires-Dist: fastmcp>=2.0
19
+ Requires-Dist: fastapi>=0.115
20
+ Requires-Dist: uvicorn>=0.34
21
+ Requires-Dist: numpy>=1.26
22
+ Provides-Extra: dev
23
+ Requires-Dist: pytest>=8.0; extra == "dev"
24
+ Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
25
+ Requires-Dist: pytest-cov>=7.0; extra == "dev"
26
+ Requires-Dist: ruff>=0.8; extra == "dev"
27
+ Requires-Dist: testcontainers[postgres]>=4.0; extra == "dev"
28
+ Provides-Extra: entra
29
+ Requires-Dist: fastapi-azure-auth>=5.0; extra == "entra"
30
+ Requires-Dist: azure-identity>=1.19; extra == "entra"
31
+ Requires-Dist: aiohttp>=3.10; extra == "entra"
32
+ Dynamic: license-file
33
+
34
+ # docforge
35
+
36
+ **The self-hosted context engine for AI coding assistants.**
37
+
38
+ Point docforge at your Confluence spaces and local git repositories. It indexes, embeds, and serves them over MCP — so Claude Code, Cursor, Copilot, and any assistant that speaks MCP can search your team's knowledge without your data leaving your infrastructure.
39
+
40
+ docforge doesn't replace your AI assistant. It feeds it — turning Claude Code, Cursor, Copilot, and anything else that speaks MCP into tools that actually know your team's docs and code.
41
+
42
+ [![CI](https://github.com/GranatenUdo/docforge/actions/workflows/ci.yml/badge.svg)](https://github.com/GranatenUdo/docforge/actions/workflows/ci.yml)
43
+ [![PyPI](https://img.shields.io/pypi/v/docforge-cli.svg)](https://pypi.org/project/docforge-cli/)
44
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
45
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
46
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
47
+
48
+ ## Why docforge
49
+
50
+ | Tool | Self-hosted | Integration | Confluence + code | Footprint | Complements AI assistants? |
51
+ |---|---|---|---|---|---|
52
+ | **docforge** | ✓ | MCP server | ✓ (Confluence + local git) | Minimal (PG + 1 container) | ✓ (any MCP client) |
53
+ | [Atlassian Rovo MCP](https://www.atlassian.com/blog/announcements/atlassian-rovo-mcp-ga) | ✗ (Cloud-only) | MCP server | Confluence only (Cloud) | SaaS | ✓ |
54
+ | [zilliztech/claude-context](https://github.com/zilliztech/claude-context) | ✓ | MCP server | Code only | Minimal | ✓ |
55
+ | [Onyx](https://github.com/onyx-dot-app/onyx) | ✓ | MCP + chat UI | ✓ (50+ connectors) | Heavy (Standard) / Minimal (Lite) | ✓ (+ its own UI) |
56
+ | Cursor codebase index + @Docs | ✗ | Proprietary | Code + public web docs | SaaS | — (built into Cursor only) |
57
+ | [Copilot Spaces](https://github.com/orgs/community/discussions/180894) | ✗ | Proprietary (MCP for actions) | Code + attachments | SaaS | — (built into Copilot only) |
58
+ | [Sourcegraph Cody](https://sourcegraph.com/docs/cody/enterprise/features) | ✓ (Enterprise) | OpenCtx / MCP | ✓ (via OpenCtx) | Heavy (Sourcegraph platform) | — (built into Cody only) |
59
+ | LangChain / LlamaIndex DIY | ✓ | Whatever you build | You wire it | Depends | Depends |
60
+
61
+ docforge is the narrow, focused option in this landscape: minimal footprint, MCP-native so it works with every assistant, and combines Confluence + code out of the box. It doesn't compete on connector count (Onyx wins there), visual UX (Cursor and Cody win), or SaaS convenience (Rovo). It competes on being **small, legible, vendor-neutral, and self-hosted** — four properties no commercial option offers together.
62
+
63
+ ### ✅ When docforge fits
64
+
65
+ - You run Confluence Data Center/Server, or you want to self-host.
66
+ - Your team uses MCP-capable assistants (Claude Code, Cursor with MCP, Copilot with MCP, etc.).
67
+ - You want Confluence + git repos indexed together with one tool.
68
+ - Operational simplicity matters — one Postgres, one container, MIT-licensed code you can audit in an afternoon.
69
+
70
+ ### ❌ When docforge is the wrong choice
71
+
72
+ - You need 50+ connectors (Slack, Jira, Gmail, Drive, Notion) → use **[Onyx](https://github.com/onyx-dot-app/onyx)** or **[Glean](https://www.glean.com/)**.
73
+ - You need per-document ACLs enforced at query time → not yet supported; use **Onyx**.
74
+ - You need a chat UI for non-developers → docforge has no UI; use **Onyx**, **Glean**, or **Cody**.
75
+ - You're on Atlassian Cloud and happy with SaaS → **[Atlassian Rovo MCP](https://www.atlassian.com/blog/announcements/atlassian-rovo-mcp-ga)** is free and official.
76
+ - You need SSO / SCIM / RBAC → out of scope; docforge authenticates but doesn't authorize per-resource.
77
+ - Your corpus is very large (>100K pages/chunks) → dense-only retrieval without hybrid starts to degrade; on the [roadmap](ROADMAP.md).
78
+ - You need near-real-time updates → ingest is batch; no webhook-driven continuous sync yet.
79
+ - You need multilingual search evaluated → EmbeddingGemma is multilingual, but docforge has no eval coverage on non-English corpora yet.
80
+
81
+ ## Quick Start
82
+
83
+ ```bash
84
+ pip install docforge-cli
85
+ docforge init my-project
86
+ cd my-project
87
+ # Edit docforge.yml with your Confluence URL
88
+ # Edit sources.yml with your page IDs and local git repo paths
89
+ # Edit .env with your credentials
90
+ docker compose up -d db
91
+ docforge init-db
92
+ docforge ingest
93
+ docforge serve
94
+ ```
95
+
96
+ **Note:** The git crawler indexes **local filesystem paths** — docforge does not clone GitHub URLs. Clone first, then point docforge at the checkout path in `sources.yml`.
97
+
98
+ ## How It Works
99
+
100
+ 1. **Configure** your Confluence URL, page IDs, and local git repo paths in `sources.yml`.
101
+ 2. **Ingest** crawls pages and files, chunks text (~500 tokens), generates vector embeddings (768-dim).
102
+ 3. **Serve** exposes an MCP server that AI assistants query automatically.
103
+
104
+ When an AI assistant needs cross-team context, it calls docforge's `search_documentation` MCP tool behind the scenes and gets relevant documentation chunks with source attribution.
105
+
106
+ ### Architecture
107
+
108
+ ![docforge architecture: Confluence and local git repos flow through docforge ingest into Postgres with pgvector, then docforge serve exposes an MCP server consumed by Claude Code, Cursor, and Copilot](docs/assets/architecture.svg)
109
+
110
+ ## Commands
111
+
112
+ | Command | Description |
113
+ |---------|-------------|
114
+ | `docforge init <name>` | Scaffold a new project with config templates |
115
+ | `docforge init-db` | Initialize the PostgreSQL database schema |
116
+ | `docforge ingest` | Crawl all sources, embed, store in PostgreSQL |
117
+ | `docforge search "<query>"` | Test search from terminal |
118
+ | `docforge serve` | Run MCP server for AI assistants |
119
+ | `docforge serve --api` | Run FastAPI search API (for hosted deployment) |
120
+ | `docforge status` | Show index stats and health |
121
+
122
+ ## Deploy to your infrastructure
123
+
124
+ For team-wide use, deploy the search API to Azure (~$35/month at default SKUs):
125
+
126
+ - PostgreSQL Flexible Server (Burstable B1ms, 32 GB) with pgvector.
127
+ - Container App running the FastAPI search API.
128
+ - Container Registry, Key Vault, Log Analytics, managed environment.
129
+ - Team members use a lightweight MCP client that calls the hosted API.
130
+
131
+ See [`deploy/azure/`](deploy/azure/) for Bicep templates and a full cost breakdown.
132
+
133
+ ## Configuration
134
+
135
+ See `docs/` for the full configuration reference, including `docforge.yml` and `sources.yml` schemas.
136
+
137
+ ## Contributing
138
+
139
+ Contributions welcome. See [`CONTRIBUTING.md`](CONTRIBUTING.md) for development setup, branch conventions, and PR expectations. Bug reports and feature requests go through [GitHub Issues](https://github.com/GranatenUdo/docforge/issues); open-ended questions and ideas live in [Discussions](https://github.com/GranatenUdo/docforge/discussions).
140
+
141
+ ## Evaluation & retrieval quality
142
+
143
+ docforge ships with a retrieval-quality eval harness at [`docforge/scripts/eval_search.py`](docforge/scripts/eval_search.py). It measures recall@1, recall@k, and MRR against a ground-truth query set you maintain. The harness is designed for **drift detection** — run it after `sources.yml` changes, embedding-model updates, or ranking tweaks, and compare against your baseline. There is no absolute quality threshold; the metric magnitude depends on how closely your ground-truth queries match source titles. See [`docforge/scripts/README.md`](docforge/scripts/README.md) for details.
144
+
145
+ ## FAQ
146
+
147
+ ### "Cannot connect to PostgreSQL"
148
+
149
+ Check that the database is running: `docker compose up -d db`. Verify `DATABASE_URL` in `.env` points to `postgresql://docforge:localdev@localhost:5432/docforge` (or your custom value).
150
+
151
+ ### "HF_TOKEN required" or model download fails
152
+
153
+ The embedding model `google/embeddinggemma-300m` requires a Hugging Face token with access to the gated model. Create one at https://huggingface.co/settings/tokens, accept the model license at https://huggingface.co/google/embeddinggemma-300m, and set `HF_TOKEN=hf_...` in `.env`.
154
+
155
+ ### "No results found" after ingest
156
+
157
+ Run `docforge status` to confirm sources and chunks exist. If counts are zero, check the ingest logs for per-source failures — the summary at the end lists sources that failed.
158
+
159
+ ### First ingest / first container start is very slow
160
+
161
+ The first run downloads the 300M embedding model (~1.2 GB) from Hugging Face. Locally, the model is cached at `~/.cache/huggingface/`. In the Docker image, it is cached at `/app/.cache/huggingface/` — **mount this as a volume** so container restarts do not re-download: `docker run -v docforge-hf-cache:/app/.cache/huggingface ...`.
162
+
163
+ ### "Ingest skipped everything"
164
+
165
+ docforge skips sources whose `content_hash` matches the stored hash (no changes detected). To force re-ingest, clear the hash: `UPDATE sources SET content_hash = NULL;` then run `docforge ingest`.
166
+
167
+ ## License
168
+
169
+ MIT. See [LICENSE](LICENSE).
170
+
171
+ ## Credits
172
+
173
+ docforge stands on open shoulders:
174
+
175
+ - [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m) — open-weights embedding model under the Gemma license.
176
+ - [pgvector](https://github.com/pgvector/pgvector) — vector similarity for Postgres.
177
+ - [FastMCP](https://github.com/PrefectHQ/fastmcp) — MCP server framework.
178
+ - [FastAPI](https://fastapi.tiangolo.com/), [Typer](https://typer.tiangolo.com/), [asyncpg](https://magicstack.github.io/asyncpg/), [sentence-transformers](https://www.sbert.net/) — core infrastructure.
@@ -0,0 +1,145 @@
1
+ # docforge
2
+
3
+ **The self-hosted context engine for AI coding assistants.**
4
+
5
+ Point docforge at your Confluence spaces and local git repositories. It indexes, embeds, and serves them over MCP — so Claude Code, Cursor, Copilot, and any assistant that speaks MCP can search your team's knowledge without your data leaving your infrastructure.
6
+
7
+ docforge doesn't replace your AI assistant. It feeds it — turning Claude Code, Cursor, Copilot, and anything else that speaks MCP into tools that actually know your team's docs and code.
8
+
9
+ [![CI](https://github.com/GranatenUdo/docforge/actions/workflows/ci.yml/badge.svg)](https://github.com/GranatenUdo/docforge/actions/workflows/ci.yml)
10
+ [![PyPI](https://img.shields.io/pypi/v/docforge-cli.svg)](https://pypi.org/project/docforge-cli/)
11
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
12
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
13
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
14
+
15
+ ## Why docforge
16
+
17
+ | Tool | Self-hosted | Integration | Confluence + code | Footprint | Complements AI assistants? |
18
+ |---|---|---|---|---|---|
19
+ | **docforge** | ✓ | MCP server | ✓ (Confluence + local git) | Minimal (PG + 1 container) | ✓ (any MCP client) |
20
+ | [Atlassian Rovo MCP](https://www.atlassian.com/blog/announcements/atlassian-rovo-mcp-ga) | ✗ (Cloud-only) | MCP server | Confluence only (Cloud) | SaaS | ✓ |
21
+ | [zilliztech/claude-context](https://github.com/zilliztech/claude-context) | ✓ | MCP server | Code only | Minimal | ✓ |
22
+ | [Onyx](https://github.com/onyx-dot-app/onyx) | ✓ | MCP + chat UI | ✓ (50+ connectors) | Heavy (Standard) / Minimal (Lite) | ✓ (+ its own UI) |
23
+ | Cursor codebase index + @Docs | ✗ | Proprietary | Code + public web docs | SaaS | — (built into Cursor only) |
24
+ | [Copilot Spaces](https://github.com/orgs/community/discussions/180894) | ✗ | Proprietary (MCP for actions) | Code + attachments | SaaS | — (built into Copilot only) |
25
+ | [Sourcegraph Cody](https://sourcegraph.com/docs/cody/enterprise/features) | ✓ (Enterprise) | OpenCtx / MCP | ✓ (via OpenCtx) | Heavy (Sourcegraph platform) | — (built into Cody only) |
26
+ | LangChain / LlamaIndex DIY | ✓ | Whatever you build | You wire it | Depends | Depends |
27
+
28
+ docforge is the narrow, focused option in this landscape: minimal footprint, MCP-native so it works with every assistant, and combines Confluence + code out of the box. It doesn't compete on connector count (Onyx wins there), visual UX (Cursor and Cody win), or SaaS convenience (Rovo). It competes on being **small, legible, vendor-neutral, and self-hosted** — four properties no commercial option offers together.
29
+
30
+ ### ✅ When docforge fits
31
+
32
+ - You run Confluence Data Center/Server, or you want to self-host.
33
+ - Your team uses MCP-capable assistants (Claude Code, Cursor with MCP, Copilot with MCP, etc.).
34
+ - You want Confluence + git repos indexed together with one tool.
35
+ - Operational simplicity matters — one Postgres, one container, MIT-licensed code you can audit in an afternoon.
36
+
37
+ ### ❌ When docforge is the wrong choice
38
+
39
+ - You need 50+ connectors (Slack, Jira, Gmail, Drive, Notion) → use **[Onyx](https://github.com/onyx-dot-app/onyx)** or **[Glean](https://www.glean.com/)**.
40
+ - You need per-document ACLs enforced at query time → not yet supported; use **Onyx**.
41
+ - You need a chat UI for non-developers → docforge has no UI; use **Onyx**, **Glean**, or **Cody**.
42
+ - You're on Atlassian Cloud and happy with SaaS → **[Atlassian Rovo MCP](https://www.atlassian.com/blog/announcements/atlassian-rovo-mcp-ga)** is free and official.
43
+ - You need SSO / SCIM / RBAC → out of scope; docforge authenticates but doesn't authorize per-resource.
44
+ - Your corpus is very large (>100K pages/chunks) → dense-only retrieval without hybrid starts to degrade; on the [roadmap](ROADMAP.md).
45
+ - You need near-real-time updates → ingest is batch; no webhook-driven continuous sync yet.
46
+ - You need multilingual search evaluated → EmbeddingGemma is multilingual, but docforge has no eval coverage on non-English corpora yet.
47
+
48
+ ## Quick Start
49
+
50
+ ```bash
51
+ pip install docforge-cli
52
+ docforge init my-project
53
+ cd my-project
54
+ # Edit docforge.yml with your Confluence URL
55
+ # Edit sources.yml with your page IDs and local git repo paths
56
+ # Edit .env with your credentials
57
+ docker compose up -d db
58
+ docforge init-db
59
+ docforge ingest
60
+ docforge serve
61
+ ```
62
+
63
+ **Note:** The git crawler indexes **local filesystem paths** — docforge does not clone GitHub URLs. Clone first, then point docforge at the checkout path in `sources.yml`.
64
+
65
+ ## How It Works
66
+
67
+ 1. **Configure** your Confluence URL, page IDs, and local git repo paths in `sources.yml`.
68
+ 2. **Ingest** crawls pages and files, chunks text (~500 tokens), generates vector embeddings (768-dim).
69
+ 3. **Serve** exposes an MCP server that AI assistants query automatically.
70
+
71
+ When an AI assistant needs cross-team context, it calls docforge's `search_documentation` MCP tool behind the scenes and gets relevant documentation chunks with source attribution.
72
+
73
+ ### Architecture
74
+
75
+ ![docforge architecture: Confluence and local git repos flow through docforge ingest into Postgres with pgvector, then docforge serve exposes an MCP server consumed by Claude Code, Cursor, and Copilot](docs/assets/architecture.svg)
76
+
77
+ ## Commands
78
+
79
+ | Command | Description |
80
+ |---------|-------------|
81
+ | `docforge init <name>` | Scaffold a new project with config templates |
82
+ | `docforge init-db` | Initialize the PostgreSQL database schema |
83
+ | `docforge ingest` | Crawl all sources, embed, store in PostgreSQL |
84
+ | `docforge search "<query>"` | Test search from terminal |
85
+ | `docforge serve` | Run MCP server for AI assistants |
86
+ | `docforge serve --api` | Run FastAPI search API (for hosted deployment) |
87
+ | `docforge status` | Show index stats and health |
88
+
89
+ ## Deploy to your infrastructure
90
+
91
+ For team-wide use, deploy the search API to Azure (~$35/month at default SKUs):
92
+
93
+ - PostgreSQL Flexible Server (Burstable B1ms, 32 GB) with pgvector.
94
+ - Container App running the FastAPI search API.
95
+ - Container Registry, Key Vault, Log Analytics, managed environment.
96
+ - Team members use a lightweight MCP client that calls the hosted API.
97
+
98
+ See [`deploy/azure/`](deploy/azure/) for Bicep templates and a full cost breakdown.
99
+
100
+ ## Configuration
101
+
102
+ See `docs/` for the full configuration reference, including `docforge.yml` and `sources.yml` schemas.
103
+
104
+ ## Contributing
105
+
106
+ Contributions welcome. See [`CONTRIBUTING.md`](CONTRIBUTING.md) for development setup, branch conventions, and PR expectations. Bug reports and feature requests go through [GitHub Issues](https://github.com/GranatenUdo/docforge/issues); open-ended questions and ideas live in [Discussions](https://github.com/GranatenUdo/docforge/discussions).
107
+
108
+ ## Evaluation & retrieval quality
109
+
110
+ docforge ships with a retrieval-quality eval harness at [`docforge/scripts/eval_search.py`](docforge/scripts/eval_search.py). It measures recall@1, recall@k, and MRR against a ground-truth query set you maintain. The harness is designed for **drift detection** — run it after `sources.yml` changes, embedding-model updates, or ranking tweaks, and compare against your baseline. There is no absolute quality threshold; the metric magnitude depends on how closely your ground-truth queries match source titles. See [`docforge/scripts/README.md`](docforge/scripts/README.md) for details.
111
+
112
+ ## FAQ
113
+
114
+ ### "Cannot connect to PostgreSQL"
115
+
116
+ Check that the database is running: `docker compose up -d db`. Verify `DATABASE_URL` in `.env` points to `postgresql://docforge:localdev@localhost:5432/docforge` (or your custom value).
117
+
118
+ ### "HF_TOKEN required" or model download fails
119
+
120
+ The embedding model `google/embeddinggemma-300m` requires a Hugging Face token with access to the gated model. Create one at https://huggingface.co/settings/tokens, accept the model license at https://huggingface.co/google/embeddinggemma-300m, and set `HF_TOKEN=hf_...` in `.env`.
121
+
122
+ ### "No results found" after ingest
123
+
124
+ Run `docforge status` to confirm sources and chunks exist. If counts are zero, check the ingest logs for per-source failures — the summary at the end lists sources that failed.
125
+
126
+ ### First ingest / first container start is very slow
127
+
128
+ The first run downloads the 300M embedding model (~1.2 GB) from Hugging Face. Locally, the model is cached at `~/.cache/huggingface/`. In the Docker image, it is cached at `/app/.cache/huggingface/` — **mount this as a volume** so container restarts do not re-download: `docker run -v docforge-hf-cache:/app/.cache/huggingface ...`.
129
+
130
+ ### "Ingest skipped everything"
131
+
132
+ docforge skips sources whose `content_hash` matches the stored hash (no changes detected). To force re-ingest, clear the hash: `UPDATE sources SET content_hash = NULL;` then run `docforge ingest`.
133
+
134
+ ## License
135
+
136
+ MIT. See [LICENSE](LICENSE).
137
+
138
+ ## Credits
139
+
140
+ docforge stands on open shoulders:
141
+
142
+ - [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m) — open-weights embedding model under the Gemma license.
143
+ - [pgvector](https://github.com/pgvector/pgvector) — vector similarity for Postgres.
144
+ - [FastMCP](https://github.com/PrefectHQ/fastmcp) — MCP server framework.
145
+ - [FastAPI](https://fastapi.tiangolo.com/), [Typer](https://typer.tiangolo.com/), [asyncpg](https://magicstack.github.io/asyncpg/), [sentence-transformers](https://www.sbert.net/) — core infrastructure.
File without changes
@@ -0,0 +1,5 @@
1
+ """Module entrypoint — `python -m docforge` dispatches to the Typer app."""
2
+
3
+ from docforge.cli import app
4
+
5
+ app()
@@ -0,0 +1,266 @@
1
+ """FastAPI search API for docforge.
2
+
3
+ Runs on Azure Container Apps. Loads embedding model at startup,
4
+ serves search queries over HTTP.
5
+
6
+ Run locally: uvicorn docforge.api:app --reload
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import asyncio
12
+ import logging
13
+ import time
14
+ from contextlib import asynccontextmanager
15
+ from typing import Any
16
+
17
+ import numpy as np
18
+ from fastapi import Depends, FastAPI, HTTPException, Request
19
+ from fastapi.security import SecurityScopes
20
+ from pydantic import BaseModel
21
+
22
+ from docforge.config import Settings
23
+ from docforge.db import close_pool, get_pool
24
+ from docforge.processors.embedder import Embedder
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
# Process-wide singletons, populated by lifespan() at startup. Module-level
# (rather than app.state) so route handlers and dependencies can reach them
# without plumbing the app object through.
_embedder: Embedder | None = None
_settings: Settings | None = None
_azure_scheme = None  # Populated in lifespan when auth.mode == "entra"
_cleanup_task: asyncio.Task | None = None

_CLEANUP_INTERVAL_SECONDS = 3600  # one hour — overridable in tests
34
+
35
+
36
async def _query_log_cleanup_loop(database_url: str, retention_days: int) -> None:
    """Periodically purge query_log rows older than *retention_days*.

    Runs forever, sleeping _CLEANUP_INTERVAL_SECONDS between passes. The
    DELETE is idempotent, so running one loop per replica is safe.
    """
    # int() coercion keeps the interpolated SQL literal injection-safe.
    # asyncpg's $1::interval parameter binding doesn't accept str, which is
    # why the interval is baked into the statement instead of bound.
    retention = int(retention_days)
    statement = (
        f"DELETE FROM query_log WHERE created_at < now() - interval '{retention} days'"
    )
    while True:
        try:
            pool = await get_pool(database_url)
            async with pool.acquire() as conn:
                outcome = await conn.execute(statement)
            logger.info("query_log cleanup: %s", outcome)
        except Exception as e:
            logger.exception("query_log cleanup failed: %s", e)
        await asyncio.sleep(_CLEANUP_INTERVAL_SECONDS)
53
+
54
+
55
def _get_settings() -> Settings:
    """Return the process-wide Settings, constructing it on first use."""
    global _settings
    settings = _settings
    if settings is None:
        # First call: build once and cache for every later caller.
        settings = _settings = Settings()
    return settings
60
+
61
+
62
def _build_auth_scheme(settings: Settings):
    """Build the Entra bearer scheme when auth.mode == "entra"; otherwise None."""
    auth = settings.auth
    if auth.mode != "entra":
        return None
    # Imported lazily: fastapi-azure-auth is an optional ("entra" extra)
    # dependency and must not be required for unauthenticated deployments.
    from fastapi_azure_auth import SingleTenantAzureAuthorizationCodeBearer

    return SingleTenantAzureAuthorizationCodeBearer(
        app_client_id=auth.audience.removeprefix("api://"),
        tenant_id=auth.tenant_id,
        scopes={f"{auth.audience}/search": "Search docforge"},
    )
74
+
75
+
76
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the embedding model at startup; close the DB pool on shutdown.

    Also wires up Entra auth (when configured) and starts the background
    query_log retention task. Everything before ``yield`` is startup;
    everything after is shutdown.
    """
    global _embedder, _azure_scheme, _cleanup_task
    settings = _get_settings()
    _azure_scheme = _build_auth_scheme(settings)
    if _azure_scheme is not None:
        # Fetch the tenant's OpenID configuration up front so the first
        # authenticated request doesn't pay the discovery round-trip.
        await _azure_scheme.openid_config.load_config()
        logger.info(
            "Entra auth enabled (tenant=%s, audience=%s)",
            settings.auth.tenant_id,
            settings.auth.audience,
        )
    logger.info("Loading embedding model...")
    _embedder = Embedder(settings.embedding_model, hf_token=settings.hf_token.get_secret_value())
    logger.info("Model loaded: %s (%dd)", _embedder.model_name, _embedder.dimensions)

    # Background retention job for the query_log table; the loop itself is
    # idempotent, so one task per replica is fine.
    _cleanup_task = asyncio.create_task(
        _query_log_cleanup_loop(settings.database_url, settings.query_log_retention_days)
    )

    yield  # Application serves requests between startup and shutdown.

    if _cleanup_task is not None:
        _cleanup_task.cancel()
        try:
            await _cleanup_task
        except asyncio.CancelledError:
            pass  # Expected: cancellation is the normal shutdown path.
    await close_pool()
106
+
107
+
108
# Module-level ASGI application; uvicorn targets "docforge.api:app".
app = FastAPI(title="docforge", lifespan=lifespan)
109
+
110
+
111
async def _auth_dependency(request: Request):
    """Validate the bearer token when Entra auth is active.

    Returns the authenticated User under auth.mode=entra; otherwise None
    (auth disabled, every request treated as anonymous).
    """
    scheme = _azure_scheme
    if scheme is None:
        return None
    # SecurityScopes() is deliberately empty: the scheme already validates
    # the token, and no scope-level authorization is layered on top of it.
    # Omitting the argument would mismatch fastapi-azure-auth's call signature.
    return await scheme(request, SecurityScopes())
119
+
120
+
121
class SearchRequest(BaseModel):
    """Request body for POST /search."""

    query: str
    user_name: str  # Self-declared caller name; token claims win when Entra auth is on.
    team_name: str  # Routing hint used for tag-boosted ranking, not identity.
    area_name: str | None = None  # Optional second routing tag.
    limit: int = 5  # Maximum number of chunks to return.
127
+
128
+
129
class SearchResult(BaseModel):
    """One matched chunk with source attribution."""

    text: str
    section_title: str | None  # Section heading for the chunk, when one exists.
    source_title: str
    source_url: str
    source_tags: list[str]
    similarity: float  # 1 − pgvector cosine distance, before tag boosting.
136
+
137
+
138
class SearchResponse(BaseModel):
    """Envelope for POST /search results."""

    results: list[SearchResult]
    query: str  # Echo of the request query.
    count: int  # len(results), for client convenience.
142
+
143
+
144
@app.get("/health")
async def health() -> dict[str, Any]:
    """Liveness probe: reports whether the embedding model has been loaded."""
    payload: dict[str, Any] = {"status": "ok"}
    payload["model"] = _embedder.model_name if _embedder else "not loaded"
    return payload
151
+
152
+
153
@app.post("/search", response_model=SearchResponse)
async def search(req: SearchRequest, user=Depends(_auth_dependency)) -> SearchResponse:
    """Search indexed documentation by semantic similarity.

    Embeds the query, runs a cosine-similarity scan over chunks of active
    sources with tag-based score boosting, logs the query for analytics
    (best-effort), and returns up to ``req.limit`` chunks with attribution.

    Raises:
        HTTPException: 503 if the model isn't loaded or the DB is down,
            500 if embedding the query fails.
    """
    start = time.perf_counter()
    if not _embedder:
        raise HTTPException(status_code=503, detail="Embedding model not loaded yet")

    try:
        query_vector = _embedder.embed_query(req.query)
    except Exception as e:
        logger.error("Embedding failed: %s", e)
        raise HTTPException(status_code=500, detail="Failed to embed query") from e

    settings = _get_settings()
    # Tags that earn ranking boosts: the caller's team plus the optional area.
    user_tags = [req.team_name] + ([req.area_name] if req.area_name else [])

    try:
        pool = await get_pool(settings.database_url)
        async with pool.acquire() as conn:
            rows = await conn.fetch(
                """
                SELECT
                    c.text,
                    c.section_title,
                    s.title AS source_title,
                    s.url AS source_url,
                    s.tags AS source_tags,
                    1 - (c.embedding <=> $1::vector) AS similarity,
                    (1 - (c.embedding <=> $1::vector)) *
                    (1
                     + $2::float * cardinality(
                           ARRAY(SELECT unnest(s.tags) INTERSECT SELECT unnest($3::text[]))
                       )
                     + $4::float * (CASE WHEN 'org' = ANY(s.tags) THEN 1 ELSE 0 END)
                    ) AS boosted_score
                FROM chunks c
                JOIN sources s ON c.source_id = s.id
                WHERE s.status = 'active'
                ORDER BY boosted_score DESC
                LIMIT $5
                """,
                np.array(query_vector, dtype=np.float32),
                settings.tag_match_weight,
                user_tags,
                settings.org_tag_weight,
                req.limit,
            )
    except Exception as e:
        logger.error("Database error during search: %s", e)
        raise HTTPException(status_code=503, detail="Database unavailable") from e

    from docforge.query_log import log_query

    request_ms = int((time.perf_counter() - start) * 1000)

    # team_name and area_name remain self-declared (routing hints, not identity).
    # user_name and user_oid come from the token when present.
    try:
        await log_query(
            pool,
            user.preferred_username if user else req.user_name,
            req.team_name,
            req.area_name,
            req.query,
            len(rows),
            user_oid=user.oid if user else None,
            request_ms=request_ms,
        )
    except Exception:
        # Telemetry is best-effort: a failed analytics write must not fail
        # an otherwise-successful search.
        logger.exception("query_log write failed")

    results = [
        SearchResult(
            text=row["text"],
            section_title=row["section_title"],
            source_title=row["source_title"],
            source_url=row["source_url"],
            source_tags=list(row["source_tags"] or []),
            similarity=float(row["similarity"]),
        )
        for row in rows
    ]

    return SearchResponse(results=results, query=req.query, count=len(results))
234
+
235
+
236
@app.get("/sources")
async def list_sources(user=Depends(_auth_dependency)) -> dict[str, Any]:
    """List all indexed documentation sources with per-source chunk counts.

    Raises:
        HTTPException: 503 if the database is unavailable.
    """
    settings = _get_settings()
    try:
        pool = await get_pool(settings.database_url)
        async with pool.acquire() as conn:
            rows = await conn.fetch(
                """
                SELECT title, url, status, last_crawled_at,
                       (SELECT count(*) FROM chunks WHERE source_id = s.id) AS chunk_count
                FROM sources s
                ORDER BY title
                """
            )
    except Exception as e:
        logger.error("Database error listing sources: %s", e)
        raise HTTPException(status_code=503, detail="Database unavailable") from e

    return {
        "count": len(rows),
        "sources": [
            {
                "title": row["title"],
                "url": row["url"],
                "status": row["status"],
                # Fix: this column was fetched but dropped from the response.
                # Surface it so clients can judge index freshness; FastAPI's
                # JSON encoder serializes datetime values to ISO 8601.
                "last_crawled_at": row["last_crawled_at"],
                "chunk_count": row["chunk_count"],
            }
            for row in rows
        ],
    }