docforge-cli 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docforge_cli-0.2.0/LICENSE +21 -0
- docforge_cli-0.2.0/PKG-INFO +178 -0
- docforge_cli-0.2.0/README.md +145 -0
- docforge_cli-0.2.0/docforge/__init__.py +0 -0
- docforge_cli-0.2.0/docforge/__main__.py +5 -0
- docforge_cli-0.2.0/docforge/api.py +266 -0
- docforge_cli-0.2.0/docforge/cli.py +296 -0
- docforge_cli-0.2.0/docforge/config.py +99 -0
- docforge_cli-0.2.0/docforge/crawlers/__init__.py +1 -0
- docforge_cli-0.2.0/docforge/crawlers/confluence.py +109 -0
- docforge_cli-0.2.0/docforge/crawlers/git.py +79 -0
- docforge_cli-0.2.0/docforge/db.py +57 -0
- docforge_cli-0.2.0/docforge/ingest.py +401 -0
- docforge_cli-0.2.0/docforge/lint.py +92 -0
- docforge_cli-0.2.0/docforge/mcp_server.py +188 -0
- docforge_cli-0.2.0/docforge/processors/__init__.py +1 -0
- docforge_cli-0.2.0/docforge/processors/chunker.py +141 -0
- docforge_cli-0.2.0/docforge/processors/embedder.py +78 -0
- docforge_cli-0.2.0/docforge/processors/parser.py +143 -0
- docforge_cli-0.2.0/docforge/query_log.py +45 -0
- docforge_cli-0.2.0/docforge/ranking.py +20 -0
- docforge_cli-0.2.0/docforge/scripts/__init__.py +1 -0
- docforge_cli-0.2.0/docforge/scripts/eval_search.py +226 -0
- docforge_cli-0.2.0/docforge/scripts/latency_report.py +142 -0
- docforge_cli-0.2.0/docforge/sources.py +46 -0
- docforge_cli-0.2.0/docforge/sql/migrations/001_add_source_identifier.sql +3 -0
- docforge_cli-0.2.0/docforge/sql/migrations/002_add_status_index.sql +1 -0
- docforge_cli-0.2.0/docforge/sql/migrations/003_add_source_tags.sql +4 -0
- docforge_cli-0.2.0/docforge/sql/migrations/004_add_query_log.sql +11 -0
- docforge_cli-0.2.0/docforge/sql/migrations/005_add_query_log_user_oid.sql +2 -0
- docforge_cli-0.2.0/docforge/sql/migrations/006_add_query_log_request_ms.sql +1 -0
- docforge_cli-0.2.0/docforge/sql/schema.sql +29 -0
- docforge_cli-0.2.0/docforge/templates/docforge.yml +11 -0
- docforge_cli-0.2.0/docforge/templates/docker-compose.yml +14 -0
- docforge_cli-0.2.0/docforge/templates/mcp_client.py +83 -0
- docforge_cli-0.2.0/docforge/templates/sources.yml +21 -0
- docforge_cli-0.2.0/docforge_cli.egg-info/PKG-INFO +178 -0
- docforge_cli-0.2.0/docforge_cli.egg-info/SOURCES.txt +42 -0
- docforge_cli-0.2.0/docforge_cli.egg-info/dependency_links.txt +1 -0
- docforge_cli-0.2.0/docforge_cli.egg-info/entry_points.txt +2 -0
- docforge_cli-0.2.0/docforge_cli.egg-info/requires.txt +25 -0
- docforge_cli-0.2.0/docforge_cli.egg-info/top_level.txt +1 -0
- docforge_cli-0.2.0/pyproject.toml +73 -0
- docforge_cli-0.2.0/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Tobias Ens
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docforge-cli
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Forge searchable context from Confluence and git repos for AI coding assistants
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: typer>=0.12
|
|
10
|
+
Requires-Dist: asyncpg>=0.30
|
|
11
|
+
Requires-Dist: httpx>=0.27
|
|
12
|
+
Requires-Dist: pydantic>=2.9
|
|
13
|
+
Requires-Dist: pydantic-settings>=2.6
|
|
14
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
15
|
+
Requires-Dist: sentence-transformers>=5.0
|
|
16
|
+
Requires-Dist: pgvector>=0.3
|
|
17
|
+
Requires-Dist: pyyaml>=6.0
|
|
18
|
+
Requires-Dist: fastmcp>=2.0
|
|
19
|
+
Requires-Dist: fastapi>=0.115
|
|
20
|
+
Requires-Dist: uvicorn>=0.34
|
|
21
|
+
Requires-Dist: numpy>=1.26
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
24
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
|
|
25
|
+
Requires-Dist: pytest-cov>=7.0; extra == "dev"
|
|
26
|
+
Requires-Dist: ruff>=0.8; extra == "dev"
|
|
27
|
+
Requires-Dist: testcontainers[postgres]>=4.0; extra == "dev"
|
|
28
|
+
Provides-Extra: entra
|
|
29
|
+
Requires-Dist: fastapi-azure-auth>=5.0; extra == "entra"
|
|
30
|
+
Requires-Dist: azure-identity>=1.19; extra == "entra"
|
|
31
|
+
Requires-Dist: aiohttp>=3.10; extra == "entra"
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
# docforge
|
|
35
|
+
|
|
36
|
+
**The self-hosted context engine for AI coding assistants.**
|
|
37
|
+
|
|
38
|
+
Point docforge at your Confluence spaces and local git repositories. It indexes, embeds, and serves them over MCP — so Claude Code, Cursor, Copilot, and any assistant that speaks MCP can search your team's knowledge without your data leaving your infrastructure.
|
|
39
|
+
|
|
40
|
+
docforge doesn't replace your AI assistant. It feeds it — turning Claude Code, Cursor, Copilot, and anything else that speaks MCP into tools that actually know your team's docs and code.
|
|
41
|
+
|
|
42
|
+
[](https://github.com/GranatenUdo/docforge/actions/workflows/ci.yml)
|
|
43
|
+
[](https://pypi.org/project/docforge-cli/)
|
|
44
|
+
[](https://www.python.org/downloads/)
|
|
45
|
+
[](LICENSE)
|
|
46
|
+
[](https://github.com/astral-sh/ruff)
|
|
47
|
+
|
|
48
|
+
## Why docforge
|
|
49
|
+
|
|
50
|
+
| Tool | Self-hosted | Integration | Confluence + code | Footprint | Complements AI assistants? |
|
|
51
|
+
|---|---|---|---|---|---|
|
|
52
|
+
| **docforge** | ✓ | MCP server | ✓ (Confluence + local git) | Minimal (PG + 1 container) | ✓ (any MCP client) |
|
|
53
|
+
| [Atlassian Rovo MCP](https://www.atlassian.com/blog/announcements/atlassian-rovo-mcp-ga) | ✗ (Cloud-only) | MCP server | Confluence only (Cloud) | SaaS | ✓ |
|
|
54
|
+
| [zilliztech/claude-context](https://github.com/zilliztech/claude-context) | ✓ | MCP server | Code only | Minimal | ✓ |
|
|
55
|
+
| [Onyx](https://github.com/onyx-dot-app/onyx) | ✓ | MCP + chat UI | ✓ (50+ connectors) | Heavy (Standard) / Minimal (Lite) | ✓ (+ its own UI) |
|
|
56
|
+
| Cursor codebase index + @Docs | ✗ | Proprietary | Code + public web docs | SaaS | — (built into Cursor only) |
|
|
57
|
+
| [Copilot Spaces](https://github.com/orgs/community/discussions/180894) | ✗ | Proprietary (MCP for actions) | Code + attachments | SaaS | — (built into Copilot only) |
|
|
58
|
+
| [Sourcegraph Cody](https://sourcegraph.com/docs/cody/enterprise/features) | ✓ (Enterprise) | OpenCtx / MCP | ✓ (via OpenCtx) | Heavy (Sourcegraph platform) | — (built into Cody only) |
|
|
59
|
+
| LangChain / LlamaIndex DIY | ✓ | Whatever you build | You wire it | Depends | Depends |
|
|
60
|
+
|
|
61
|
+
docforge is the narrow, focused option in this landscape: minimal footprint, MCP-native so it works with every assistant, and combines Confluence + code out of the box. It doesn't compete on connector count (Onyx wins there), visual UX (Cursor and Cody win), or SaaS convenience (Rovo). It competes on being **small, legible, vendor-neutral, and self-hosted** — four properties no commercial option offers together.
|
|
62
|
+
|
|
63
|
+
### ✅ When docforge fits
|
|
64
|
+
|
|
65
|
+
- You run Confluence Data Center/Server, or you want to self-host.
|
|
66
|
+
- Your team uses MCP-capable assistants (Claude Code, Cursor with MCP, Copilot with MCP, etc.).
|
|
67
|
+
- You want Confluence + git repos indexed together with one tool.
|
|
68
|
+
- Operational simplicity matters — one Postgres, one container, MIT-licensed code you can audit in an afternoon.
|
|
69
|
+
|
|
70
|
+
### ❌ When docforge is the wrong choice
|
|
71
|
+
|
|
72
|
+
- You need 50+ connectors (Slack, Jira, Gmail, Drive, Notion) → use **[Onyx](https://github.com/onyx-dot-app/onyx)** or **[Glean](https://www.glean.com/)**.
|
|
73
|
+
- You need per-document ACLs enforced at query time → not yet supported; use **Onyx**.
|
|
74
|
+
- You need a chat UI for non-developers → docforge has no UI; use **Onyx**, **Glean**, or **Cody**.
|
|
75
|
+
- You're on Atlassian Cloud and happy with SaaS → **[Atlassian Rovo MCP](https://www.atlassian.com/blog/announcements/atlassian-rovo-mcp-ga)** is free and official.
|
|
76
|
+
- You need SSO / SCIM / RBAC → out of scope; docforge authenticates but doesn't authorize per-resource.
|
|
77
|
+
- Your corpus is very large (>100K pages/chunks) → dense-only retrieval without hybrid starts to degrade; on the [roadmap](ROADMAP.md).
|
|
78
|
+
- You need near-real-time updates → ingest is batch; no webhook-driven continuous sync yet.
|
|
79
|
+
- You need multilingual search evaluated → EmbeddingGemma is multilingual, but docforge has no eval coverage on non-English corpora yet.
|
|
80
|
+
|
|
81
|
+
## Quick Start
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
pip install docforge-cli
|
|
85
|
+
docforge init my-project
|
|
86
|
+
cd my-project
|
|
87
|
+
# Edit docforge.yml with your Confluence URL
|
|
88
|
+
# Edit sources.yml with your page IDs and local git repo paths
|
|
89
|
+
# Edit .env with your credentials
|
|
90
|
+
docker compose up -d db
|
|
91
|
+
docforge init-db
|
|
92
|
+
docforge ingest
|
|
93
|
+
docforge serve
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
**Note:** The git crawler indexes **local filesystem paths** — docforge does not clone GitHub URLs. Clone first, then point docforge at the checkout path in `sources.yml`.
|
|
97
|
+
|
|
98
|
+
## How It Works
|
|
99
|
+
|
|
100
|
+
1. **Configure** your Confluence URL, page IDs, and local git repo paths in `sources.yml`.
|
|
101
|
+
2. **Ingest** crawls pages and files, chunks text (~500 tokens), generates vector embeddings (768-dim).
|
|
102
|
+
3. **Serve** exposes an MCP server that AI assistants query automatically.
|
|
103
|
+
|
|
104
|
+
When an AI assistant needs cross-team context, it calls docforge's `search_documentation` MCP tool behind the scenes and gets relevant documentation chunks with source attribution.
|
|
105
|
+
|
|
106
|
+
### Architecture
|
|
107
|
+
|
|
108
|
+

|
|
109
|
+
|
|
110
|
+
## Commands
|
|
111
|
+
|
|
112
|
+
| Command | Description |
|
|
113
|
+
|---------|-------------|
|
|
114
|
+
| `docforge init <name>` | Scaffold a new project with config templates |
|
|
115
|
+
| `docforge init-db` | Initialize the PostgreSQL database schema |
|
|
116
|
+
| `docforge ingest` | Crawl all sources, embed, store in PostgreSQL |
|
|
117
|
+
| `docforge search "<query>"` | Test search from terminal |
|
|
118
|
+
| `docforge serve` | Run MCP server for AI assistants |
|
|
119
|
+
| `docforge serve --api` | Run FastAPI search API (for hosted deployment) |
|
|
120
|
+
| `docforge status` | Show index stats and health |
|
|
121
|
+
|
|
122
|
+
## Deploy to your infrastructure
|
|
123
|
+
|
|
124
|
+
For team-wide use, deploy the search API to Azure (~$35/month at default SKUs):
|
|
125
|
+
|
|
126
|
+
- PostgreSQL Flexible Server (Burstable B1ms, 32 GB) with pgvector.
|
|
127
|
+
- Container App running the FastAPI search API.
|
|
128
|
+
- Container Registry, Key Vault, Log Analytics, managed environment.
|
|
129
|
+
- Team members use a lightweight MCP client that calls the hosted API.
|
|
130
|
+
|
|
131
|
+
See [`deploy/azure/`](deploy/azure/) for Bicep templates and a full cost breakdown.
|
|
132
|
+
|
|
133
|
+
## Configuration
|
|
134
|
+
|
|
135
|
+
See `docs/` for the full configuration reference, including `docforge.yml` and `sources.yml` schemas.
|
|
136
|
+
|
|
137
|
+
## Contributing
|
|
138
|
+
|
|
139
|
+
Contributions welcome. See [`CONTRIBUTING.md`](CONTRIBUTING.md) for development setup, branch conventions, and PR expectations. Bug reports and feature requests go through [GitHub Issues](https://github.com/GranatenUdo/docforge/issues); open-ended questions and ideas live in [Discussions](https://github.com/GranatenUdo/docforge/discussions).
|
|
140
|
+
|
|
141
|
+
## Evaluation & retrieval quality
|
|
142
|
+
|
|
143
|
+
docforge ships with a retrieval-quality eval harness at [`docforge/scripts/eval_search.py`](docforge/scripts/eval_search.py). It measures recall@1, recall@k, and MRR against a ground-truth query set you maintain. The harness is designed for **drift detection** — run it after `sources.yml` changes, embedding-model updates, or ranking tweaks, and compare against your baseline. There is no absolute quality threshold; the metric magnitude depends on how closely your ground-truth queries match source titles. See [`docforge/scripts/README.md`](docforge/scripts/README.md) for details.
|
|
144
|
+
|
|
145
|
+
## FAQ
|
|
146
|
+
|
|
147
|
+
### "Cannot connect to PostgreSQL"
|
|
148
|
+
|
|
149
|
+
Check that the database is running: `docker compose up -d db`. Verify `DATABASE_URL` in `.env` points to `postgresql://docforge:localdev@localhost:5432/docforge` (or your custom value).
|
|
150
|
+
|
|
151
|
+
### "HF_TOKEN required" or model download fails
|
|
152
|
+
|
|
153
|
+
The embedding model `google/embeddinggemma-300m` requires a Hugging Face token with access to the gated model. Create one at https://huggingface.co/settings/tokens, accept the model license at https://huggingface.co/google/embeddinggemma-300m, and set `HF_TOKEN=hf_...` in `.env`.
|
|
154
|
+
|
|
155
|
+
### "No results found" after ingest
|
|
156
|
+
|
|
157
|
+
Run `docforge status` to confirm sources and chunks exist. If counts are zero, check the ingest logs for per-source failures — the summary at the end lists sources that failed.
|
|
158
|
+
|
|
159
|
+
### First ingest / first container start is very slow
|
|
160
|
+
|
|
161
|
+
The first run downloads the 300M embedding model (~1.2 GB) from Hugging Face. Locally, the model is cached at `~/.cache/huggingface/`. In the Docker image, it is cached at `/app/.cache/huggingface/` — **mount this as a volume** so container restarts do not re-download: `docker run -v docforge-hf-cache:/app/.cache/huggingface ...`.
|
|
162
|
+
|
|
163
|
+
### "Ingest skipped everything"
|
|
164
|
+
|
|
165
|
+
docforge skips sources whose `content_hash` matches the stored hash (no changes detected). To force re-ingest, clear the hash: `UPDATE sources SET content_hash = NULL;` then run `docforge ingest`.
|
|
166
|
+
|
|
167
|
+
## License
|
|
168
|
+
|
|
169
|
+
MIT. See [LICENSE](LICENSE).
|
|
170
|
+
|
|
171
|
+
## Credits
|
|
172
|
+
|
|
173
|
+
docforge stands on open shoulders:
|
|
174
|
+
|
|
175
|
+
- [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m) — open-weights embedding model under the Gemma license.
|
|
176
|
+
- [pgvector](https://github.com/pgvector/pgvector) — vector similarity for Postgres.
|
|
177
|
+
- [FastMCP](https://github.com/PrefectHQ/fastmcp) — MCP server framework.
|
|
178
|
+
- [FastAPI](https://fastapi.tiangolo.com/), [Typer](https://typer.tiangolo.com/), [asyncpg](https://magicstack.github.io/asyncpg/), [sentence-transformers](https://www.sbert.net/) — core infrastructure.
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
# docforge
|
|
2
|
+
|
|
3
|
+
**The self-hosted context engine for AI coding assistants.**
|
|
4
|
+
|
|
5
|
+
Point docforge at your Confluence spaces and local git repositories. It indexes, embeds, and serves them over MCP — so Claude Code, Cursor, Copilot, and any assistant that speaks MCP can search your team's knowledge without your data leaving your infrastructure.
|
|
6
|
+
|
|
7
|
+
docforge doesn't replace your AI assistant. It feeds it — turning Claude Code, Cursor, Copilot, and anything else that speaks MCP into tools that actually know your team's docs and code.
|
|
8
|
+
|
|
9
|
+
[](https://github.com/GranatenUdo/docforge/actions/workflows/ci.yml)
|
|
10
|
+
[](https://pypi.org/project/docforge-cli/)
|
|
11
|
+
[](https://www.python.org/downloads/)
|
|
12
|
+
[](LICENSE)
|
|
13
|
+
[](https://github.com/astral-sh/ruff)
|
|
14
|
+
|
|
15
|
+
## Why docforge
|
|
16
|
+
|
|
17
|
+
| Tool | Self-hosted | Integration | Confluence + code | Footprint | Complements AI assistants? |
|
|
18
|
+
|---|---|---|---|---|---|
|
|
19
|
+
| **docforge** | ✓ | MCP server | ✓ (Confluence + local git) | Minimal (PG + 1 container) | ✓ (any MCP client) |
|
|
20
|
+
| [Atlassian Rovo MCP](https://www.atlassian.com/blog/announcements/atlassian-rovo-mcp-ga) | ✗ (Cloud-only) | MCP server | Confluence only (Cloud) | SaaS | ✓ |
|
|
21
|
+
| [zilliztech/claude-context](https://github.com/zilliztech/claude-context) | ✓ | MCP server | Code only | Minimal | ✓ |
|
|
22
|
+
| [Onyx](https://github.com/onyx-dot-app/onyx) | ✓ | MCP + chat UI | ✓ (50+ connectors) | Heavy (Standard) / Minimal (Lite) | ✓ (+ its own UI) |
|
|
23
|
+
| Cursor codebase index + @Docs | ✗ | Proprietary | Code + public web docs | SaaS | — (built into Cursor only) |
|
|
24
|
+
| [Copilot Spaces](https://github.com/orgs/community/discussions/180894) | ✗ | Proprietary (MCP for actions) | Code + attachments | SaaS | — (built into Copilot only) |
|
|
25
|
+
| [Sourcegraph Cody](https://sourcegraph.com/docs/cody/enterprise/features) | ✓ (Enterprise) | OpenCtx / MCP | ✓ (via OpenCtx) | Heavy (Sourcegraph platform) | — (built into Cody only) |
|
|
26
|
+
| LangChain / LlamaIndex DIY | ✓ | Whatever you build | You wire it | Depends | Depends |
|
|
27
|
+
|
|
28
|
+
docforge is the narrow, focused option in this landscape: minimal footprint, MCP-native so it works with every assistant, and combines Confluence + code out of the box. It doesn't compete on connector count (Onyx wins there), visual UX (Cursor and Cody win), or SaaS convenience (Rovo). It competes on being **small, legible, vendor-neutral, and self-hosted** — four properties no commercial option offers together.
|
|
29
|
+
|
|
30
|
+
### ✅ When docforge fits
|
|
31
|
+
|
|
32
|
+
- You run Confluence Data Center/Server, or you want to self-host.
|
|
33
|
+
- Your team uses MCP-capable assistants (Claude Code, Cursor with MCP, Copilot with MCP, etc.).
|
|
34
|
+
- You want Confluence + git repos indexed together with one tool.
|
|
35
|
+
- Operational simplicity matters — one Postgres, one container, MIT-licensed code you can audit in an afternoon.
|
|
36
|
+
|
|
37
|
+
### ❌ When docforge is the wrong choice
|
|
38
|
+
|
|
39
|
+
- You need 50+ connectors (Slack, Jira, Gmail, Drive, Notion) → use **[Onyx](https://github.com/onyx-dot-app/onyx)** or **[Glean](https://www.glean.com/)**.
|
|
40
|
+
- You need per-document ACLs enforced at query time → not yet supported; use **Onyx**.
|
|
41
|
+
- You need a chat UI for non-developers → docforge has no UI; use **Onyx**, **Glean**, or **Cody**.
|
|
42
|
+
- You're on Atlassian Cloud and happy with SaaS → **[Atlassian Rovo MCP](https://www.atlassian.com/blog/announcements/atlassian-rovo-mcp-ga)** is free and official.
|
|
43
|
+
- You need SSO / SCIM / RBAC → out of scope; docforge authenticates but doesn't authorize per-resource.
|
|
44
|
+
- Your corpus is very large (>100K pages/chunks) → dense-only retrieval without hybrid starts to degrade; on the [roadmap](ROADMAP.md).
|
|
45
|
+
- You need near-real-time updates → ingest is batch; no webhook-driven continuous sync yet.
|
|
46
|
+
- You need multilingual search evaluated → EmbeddingGemma is multilingual, but docforge has no eval coverage on non-English corpora yet.
|
|
47
|
+
|
|
48
|
+
## Quick Start
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install docforge-cli
|
|
52
|
+
docforge init my-project
|
|
53
|
+
cd my-project
|
|
54
|
+
# Edit docforge.yml with your Confluence URL
|
|
55
|
+
# Edit sources.yml with your page IDs and local git repo paths
|
|
56
|
+
# Edit .env with your credentials
|
|
57
|
+
docker compose up -d db
|
|
58
|
+
docforge init-db
|
|
59
|
+
docforge ingest
|
|
60
|
+
docforge serve
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
**Note:** The git crawler indexes **local filesystem paths** — docforge does not clone GitHub URLs. Clone first, then point docforge at the checkout path in `sources.yml`.
|
|
64
|
+
|
|
65
|
+
## How It Works
|
|
66
|
+
|
|
67
|
+
1. **Configure** your Confluence URL, page IDs, and local git repo paths in `sources.yml`.
|
|
68
|
+
2. **Ingest** crawls pages and files, chunks text (~500 tokens), generates vector embeddings (768-dim).
|
|
69
|
+
3. **Serve** exposes an MCP server that AI assistants query automatically.
|
|
70
|
+
|
|
71
|
+
When an AI assistant needs cross-team context, it calls docforge's `search_documentation` MCP tool behind the scenes and gets relevant documentation chunks with source attribution.
|
|
72
|
+
|
|
73
|
+
### Architecture
|
|
74
|
+
|
|
75
|
+

|
|
76
|
+
|
|
77
|
+
## Commands
|
|
78
|
+
|
|
79
|
+
| Command | Description |
|
|
80
|
+
|---------|-------------|
|
|
81
|
+
| `docforge init <name>` | Scaffold a new project with config templates |
|
|
82
|
+
| `docforge init-db` | Initialize the PostgreSQL database schema |
|
|
83
|
+
| `docforge ingest` | Crawl all sources, embed, store in PostgreSQL |
|
|
84
|
+
| `docforge search "<query>"` | Test search from terminal |
|
|
85
|
+
| `docforge serve` | Run MCP server for AI assistants |
|
|
86
|
+
| `docforge serve --api` | Run FastAPI search API (for hosted deployment) |
|
|
87
|
+
| `docforge status` | Show index stats and health |
|
|
88
|
+
|
|
89
|
+
## Deploy to your infrastructure
|
|
90
|
+
|
|
91
|
+
For team-wide use, deploy the search API to Azure (~$35/month at default SKUs):
|
|
92
|
+
|
|
93
|
+
- PostgreSQL Flexible Server (Burstable B1ms, 32 GB) with pgvector.
|
|
94
|
+
- Container App running the FastAPI search API.
|
|
95
|
+
- Container Registry, Key Vault, Log Analytics, managed environment.
|
|
96
|
+
- Team members use a lightweight MCP client that calls the hosted API.
|
|
97
|
+
|
|
98
|
+
See [`deploy/azure/`](deploy/azure/) for Bicep templates and a full cost breakdown.
|
|
99
|
+
|
|
100
|
+
## Configuration
|
|
101
|
+
|
|
102
|
+
See `docs/` for the full configuration reference, including `docforge.yml` and `sources.yml` schemas.
|
|
103
|
+
|
|
104
|
+
## Contributing
|
|
105
|
+
|
|
106
|
+
Contributions welcome. See [`CONTRIBUTING.md`](CONTRIBUTING.md) for development setup, branch conventions, and PR expectations. Bug reports and feature requests go through [GitHub Issues](https://github.com/GranatenUdo/docforge/issues); open-ended questions and ideas live in [Discussions](https://github.com/GranatenUdo/docforge/discussions).
|
|
107
|
+
|
|
108
|
+
## Evaluation & retrieval quality
|
|
109
|
+
|
|
110
|
+
docforge ships with a retrieval-quality eval harness at [`docforge/scripts/eval_search.py`](docforge/scripts/eval_search.py). It measures recall@1, recall@k, and MRR against a ground-truth query set you maintain. The harness is designed for **drift detection** — run it after `sources.yml` changes, embedding-model updates, or ranking tweaks, and compare against your baseline. There is no absolute quality threshold; the metric magnitude depends on how closely your ground-truth queries match source titles. See [`docforge/scripts/README.md`](docforge/scripts/README.md) for details.
|
|
111
|
+
|
|
112
|
+
## FAQ
|
|
113
|
+
|
|
114
|
+
### "Cannot connect to PostgreSQL"
|
|
115
|
+
|
|
116
|
+
Check that the database is running: `docker compose up -d db`. Verify `DATABASE_URL` in `.env` points to `postgresql://docforge:localdev@localhost:5432/docforge` (or your custom value).
|
|
117
|
+
|
|
118
|
+
### "HF_TOKEN required" or model download fails
|
|
119
|
+
|
|
120
|
+
The embedding model `google/embeddinggemma-300m` requires a Hugging Face token with access to the gated model. Create one at https://huggingface.co/settings/tokens, accept the model license at https://huggingface.co/google/embeddinggemma-300m, and set `HF_TOKEN=hf_...` in `.env`.
|
|
121
|
+
|
|
122
|
+
### "No results found" after ingest
|
|
123
|
+
|
|
124
|
+
Run `docforge status` to confirm sources and chunks exist. If counts are zero, check the ingest logs for per-source failures — the summary at the end lists sources that failed.
|
|
125
|
+
|
|
126
|
+
### First ingest / first container start is very slow
|
|
127
|
+
|
|
128
|
+
The first run downloads the 300M embedding model (~1.2 GB) from Hugging Face. Locally, the model is cached at `~/.cache/huggingface/`. In the Docker image, it is cached at `/app/.cache/huggingface/` — **mount this as a volume** so container restarts do not re-download: `docker run -v docforge-hf-cache:/app/.cache/huggingface ...`.
|
|
129
|
+
|
|
130
|
+
### "Ingest skipped everything"
|
|
131
|
+
|
|
132
|
+
docforge skips sources whose `content_hash` matches the stored hash (no changes detected). To force re-ingest, clear the hash: `UPDATE sources SET content_hash = NULL;` then run `docforge ingest`.
|
|
133
|
+
|
|
134
|
+
## License
|
|
135
|
+
|
|
136
|
+
MIT. See [LICENSE](LICENSE).
|
|
137
|
+
|
|
138
|
+
## Credits
|
|
139
|
+
|
|
140
|
+
docforge stands on open shoulders:
|
|
141
|
+
|
|
142
|
+
- [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m) — open-weights embedding model under the Gemma license.
|
|
143
|
+
- [pgvector](https://github.com/pgvector/pgvector) — vector similarity for Postgres.
|
|
144
|
+
- [FastMCP](https://github.com/PrefectHQ/fastmcp) — MCP server framework.
|
|
145
|
+
- [FastAPI](https://fastapi.tiangolo.com/), [Typer](https://typer.tiangolo.com/), [asyncpg](https://magicstack.github.io/asyncpg/), [sentence-transformers](https://www.sbert.net/) — core infrastructure.
|
|
File without changes
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
"""FastAPI search API for docforge.
|
|
2
|
+
|
|
3
|
+
Runs on Azure Container Apps. Loads embedding model at startup,
|
|
4
|
+
serves search queries over HTTP.
|
|
5
|
+
|
|
6
|
+
Run locally: uvicorn docforge.api:app --reload
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import asyncio
|
|
12
|
+
import logging
|
|
13
|
+
import time
|
|
14
|
+
from contextlib import asynccontextmanager
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
from fastapi import Depends, FastAPI, HTTPException, Request
|
|
19
|
+
from fastapi.security import SecurityScopes
|
|
20
|
+
from pydantic import BaseModel
|
|
21
|
+
|
|
22
|
+
from docforge.config import Settings
|
|
23
|
+
from docforge.db import close_pool, get_pool
|
|
24
|
+
from docforge.processors.embedder import Embedder
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)

# Module-level state shared across requests. _embedder, _azure_scheme, and
# _cleanup_task are populated by lifespan() at startup; _settings is filled
# lazily by _get_settings() on first access.
_embedder: Embedder | None = None
_settings: Settings | None = None
_azure_scheme = None  # Populated in lifespan when auth.mode == "entra"
_cleanup_task: asyncio.Task | None = None

_CLEANUP_INTERVAL_SECONDS = 3600  # one hour — overridable in tests
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
async def _query_log_cleanup_loop(database_url: str, retention_days: int) -> None:
    """Periodically purge old rows from the query_log table.

    Deletes query_log rows older than ``retention_days`` once every
    ``_CLEANUP_INTERVAL_SECONDS``. The DELETE is idempotent, so running
    multiple replicas concurrently is safe.

    Args:
        database_url: asyncpg-compatible connection URL passed to get_pool().
        retention_days: age threshold in days; older rows are removed.
    """
    # Bind the retention window through make_interval(days => $1) so the SQL
    # stays fully parameterized — asyncpg can bind a plain int here, which
    # sidesteps the "can't bind a str to ::interval" problem the previous
    # f-string literal worked around.
    days = int(retention_days)
    while True:
        try:
            pool = await get_pool(database_url)
            async with pool.acquire() as conn:
                result = await conn.execute(
                    "DELETE FROM query_log"
                    " WHERE created_at < now() - make_interval(days => $1)",
                    days,
                )
                logger.info("query_log cleanup: %s", result)
        except Exception as e:
            # Best-effort maintenance: log and keep the loop alive so a
            # transient DB outage doesn't kill retention for good.
            logger.exception("query_log cleanup failed: %s", e)
        await asyncio.sleep(_CLEANUP_INTERVAL_SECONDS)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _get_settings() -> Settings:
    """Return the process-wide Settings, constructing it on first use."""
    global _settings
    if _settings is not None:
        return _settings
    _settings = Settings()
    return _settings
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _build_auth_scheme(settings: Settings):
    """Return a SingleTenantAzureAuthorizationCodeBearer if mode==entra, else None."""
    if settings.auth.mode != "entra":
        return None
    # Imported lazily: fastapi-azure-auth is only present with the "entra" extra.
    from fastapi_azure_auth import SingleTenantAzureAuthorizationCodeBearer

    audience = settings.auth.audience
    scheme = SingleTenantAzureAuthorizationCodeBearer(
        app_client_id=audience.removeprefix("api://"),
        tenant_id=settings.auth.tenant_id,
        scopes={f"{audience}/search": "Search docforge"},
    )
    return scheme
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the embedding model at startup; close the DB pool on shutdown.

    Startup order matters: the Entra OpenID configuration (when auth is
    enabled) and the embedding model must both be ready before the first
    request is served. A background task handles query_log retention for
    the app's whole lifetime.
    """
    global _embedder, _azure_scheme, _cleanup_task
    settings = _get_settings()
    # Auth is optional: _build_auth_scheme returns None unless auth.mode == "entra".
    _azure_scheme = _build_auth_scheme(settings)
    if _azure_scheme is not None:
        # Fetch the tenant's OpenID configuration eagerly so token validation
        # doesn't pay that network cost on the first authenticated request.
        await _azure_scheme.openid_config.load_config()
        logger.info(
            "Entra auth enabled (tenant=%s, audience=%s)",
            settings.auth.tenant_id,
            settings.auth.audience,
        )
    logger.info("Loading embedding model...")
    _embedder = Embedder(settings.embedding_model, hf_token=settings.hf_token.get_secret_value())
    logger.info("Model loaded: %s (%dd)", _embedder.model_name, _embedder.dimensions)

    # Retention loop runs until shutdown; cancelled below rather than joined.
    _cleanup_task = asyncio.create_task(
        _query_log_cleanup_loop(settings.database_url, settings.query_log_retention_days)
    )

    yield

    # Shutdown: cancel the cleanup loop, swallow the expected CancelledError,
    # then release all pooled DB connections.
    if _cleanup_task is not None:
        _cleanup_task.cancel()
        try:
            await _cleanup_task
        except asyncio.CancelledError:
            pass
    await close_pool()
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# Single ASGI application instance; lifespan() handles model load and pool teardown.
app = FastAPI(title="docforge", lifespan=lifespan)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
async def _auth_dependency(request: Request):
    """Return the authenticated User under auth.mode=entra, None otherwise."""
    scheme = _azure_scheme
    if scheme is None:
        # Auth disabled — every request is treated as anonymous.
        return None
    # An empty SecurityScopes is passed because we rely only on the token
    # validation the scheme performs, not on scope-level authorization; the
    # argument itself is required by fastapi-azure-auth's call signature.
    return await scheme(request, SecurityScopes())
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class SearchRequest(BaseModel):
    """Request body for POST /search."""

    query: str  # free-text query to embed and match against chunks
    user_name: str  # requesting user — presumably recorded in query_log; confirm against logging code
    team_name: str  # team tag; included in the tag-overlap ranking boost
    area_name: str | None = None  # optional extra tag for the ranking boost
    limit: int = 5  # maximum number of results to return
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class SearchResult(BaseModel):
    """One matched chunk with its source attribution."""

    text: str  # chunk text returned to the caller
    section_title: str | None  # chunk's section heading, when one exists
    source_title: str  # title of the originating source document
    source_url: str  # URL of the originating source document
    source_tags: list[str]  # tags attached to the source
    similarity: float  # cosine similarity, computed as 1 - (embedding <=> query)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
class SearchResponse(BaseModel):
    """Response body for ``POST /search``."""

    results: list[SearchResult]
    query: str  # echo of the submitted query
    count: int  # number of entries in ``results``
|
143
|
+
|
|
144
|
+
@app.get("/health")
async def health() -> dict[str, Any]:
    """Health check endpoint: reports status and the loaded model name."""
    if _embedder:
        model = _embedder.model_name
    else:
        model = "not loaded"
    return {"status": "ok", "model": model}
|
152
|
+
|
|
153
|
+
@app.post("/search", response_model=SearchResponse)
async def search(req: SearchRequest, user=Depends(_auth_dependency)) -> SearchResponse:
    """Search indexed documentation by semantic similarity.

    Embeds the query, then ranks chunks of active sources by vector
    similarity multiplied by a boost for overlap between the source's tags
    and the caller's team/area tags, plus a bonus for the ``org`` tag.
    Every request is recorded in the query log.

    Args:
        req: The search request (query text, routing tags, result limit).
        user: Authenticated user from the Entra token, or ``None`` when
            auth is disabled.

    Raises:
        HTTPException: 503 if the model is not loaded or the database is
            unavailable; 500 if embedding the query fails.
    """
    start = time.perf_counter()
    if not _embedder:
        raise HTTPException(status_code=503, detail="Embedding model not loaded yet")

    try:
        query_vector = _embedder.embed_query(req.query)
    except Exception as e:
        logger.error("Embedding failed: %s", e)
        # Chain the cause (PEP 3134) so the original traceback is preserved.
        raise HTTPException(status_code=500, detail="Failed to embed query") from e

    settings = _get_settings()
    # Tags the caller claims membership of; used only for ranking boosts.
    user_tags = [req.team_name] + ([req.area_name] if req.area_name else [])

    try:
        pool = await get_pool(settings.database_url)
        async with pool.acquire() as conn:
            rows = await conn.fetch(
                """
                SELECT
                    c.text,
                    c.section_title,
                    s.title AS source_title,
                    s.url AS source_url,
                    s.tags AS source_tags,
                    1 - (c.embedding <=> $1::vector) AS similarity,
                    (1 - (c.embedding <=> $1::vector)) *
                    (1
                        + $2::float * cardinality(
                            ARRAY(SELECT unnest(s.tags) INTERSECT SELECT unnest($3::text[]))
                        )
                        + $4::float * (CASE WHEN 'org' = ANY(s.tags) THEN 1 ELSE 0 END)
                    ) AS boosted_score
                FROM chunks c
                JOIN sources s ON c.source_id = s.id
                WHERE s.status = 'active'
                ORDER BY boosted_score DESC
                LIMIT $5
                """,
                np.array(query_vector, dtype=np.float32),
                settings.tag_match_weight,
                user_tags,
                settings.org_tag_weight,
                req.limit,
            )
    except Exception as e:
        logger.error("Database error during search: %s", e)
        # Chain the cause so the underlying DB error is attached.
        raise HTTPException(status_code=503, detail="Database unavailable") from e

    # Imported lazily here rather than at module top — presumably to avoid a
    # circular import with docforge.query_log; TODO confirm before hoisting.
    from docforge.query_log import log_query

    request_ms = int((time.perf_counter() - start) * 1000)

    # team_name and area_name remain self-declared (routing hints, not identity).
    # user_name and user_oid come from the token when present.
    await log_query(
        pool,
        user.preferred_username if user else req.user_name,
        req.team_name,
        req.area_name,
        req.query,
        len(rows),
        user_oid=user.oid if user else None,
        request_ms=request_ms,
    )

    results = [
        SearchResult(
            text=row["text"],
            section_title=row["section_title"],
            source_title=row["source_title"],
            source_url=row["source_url"],
            source_tags=list(row["source_tags"] or []),
            similarity=float(row["similarity"]),
        )
        for row in rows
    ]

    return SearchResponse(results=results, query=req.query, count=len(results))
|
235
|
+
|
|
236
|
+
@app.get("/sources")
async def list_sources(user=Depends(_auth_dependency)) -> dict[str, Any]:
    """List all indexed documentation sources with their chunk counts.

    Args:
        user: Authenticated user from the Entra token, or ``None`` when
            auth is disabled (unused beyond enforcing authentication).

    Raises:
        HTTPException: 503 when the database cannot be reached.
    """
    settings = _get_settings()
    try:
        pool = await get_pool(settings.database_url)
        async with pool.acquire() as conn:
            # last_crawled_at was previously selected but never returned;
            # dropped from the projection to avoid fetching dead data.
            rows = await conn.fetch(
                """
                SELECT title, url, status,
                       (SELECT count(*) FROM chunks WHERE source_id = s.id) AS chunk_count
                FROM sources s
                ORDER BY title
                """
            )
    except Exception as e:
        logger.error("Database error listing sources: %s", e)
        # Chain the cause (PEP 3134) so the DB error stays in the traceback.
        raise HTTPException(status_code=503, detail="Database unavailable") from e

    return {
        "count": len(rows),
        "sources": [
            {
                "title": row["title"],
                "url": row["url"],
                "status": row["status"],
                "chunk_count": row["chunk_count"],
            }
            for row in rows
        ],
    }
|