docforge-cli 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docforge_cli-0.2.0/docforge_cli.egg-info → docforge_cli-0.3.0}/PKG-INFO +108 -35
- docforge_cli-0.2.0/PKG-INFO → docforge_cli-0.3.0/README.md +81 -46
- docforge_cli-0.3.0/pyproject.toml +81 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/api.py +107 -72
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/cli.py +18 -3
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/config.py +22 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/db.py +15 -4
- docforge_cli-0.3.0/src/docforge/embedder_api.py +86 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/ingest.py +8 -4
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/mcp_server.py +23 -15
- docforge_cli-0.3.0/src/docforge/processors/embedder.py +246 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/scripts/eval_search.py +2 -2
- docforge_cli-0.2.0/README.md → docforge_cli-0.3.0/src/docforge_cli.egg-info/PKG-INFO +119 -13
- docforge_cli-0.3.0/src/docforge_cli.egg-info/SOURCES.txt +43 -0
- docforge_cli-0.3.0/src/docforge_cli.egg-info/requires.txt +25 -0
- docforge_cli-0.2.0/docforge/processors/embedder.py +0 -78
- docforge_cli-0.2.0/docforge_cli.egg-info/SOURCES.txt +0 -42
- docforge_cli-0.2.0/docforge_cli.egg-info/requires.txt +0 -25
- docforge_cli-0.2.0/pyproject.toml +0 -73
- {docforge_cli-0.2.0 → docforge_cli-0.3.0}/LICENSE +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0}/setup.cfg +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/__init__.py +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/__main__.py +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/crawlers/__init__.py +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/crawlers/confluence.py +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/crawlers/git.py +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/lint.py +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/processors/__init__.py +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/processors/chunker.py +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/processors/parser.py +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/query_log.py +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/ranking.py +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/scripts/__init__.py +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/scripts/latency_report.py +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/sources.py +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/sql/migrations/001_add_source_identifier.sql +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/sql/migrations/002_add_status_index.sql +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/sql/migrations/003_add_source_tags.sql +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/sql/migrations/004_add_query_log.sql +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/sql/migrations/005_add_query_log_user_oid.sql +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/sql/migrations/006_add_query_log_request_ms.sql +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/sql/schema.sql +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/templates/docforge.yml +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/templates/docker-compose.yml +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/templates/mcp_client.py +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/templates/sources.yml +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge_cli.egg-info/dependency_links.txt +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge_cli.egg-info/entry_points.txt +0 -0
- {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge_cli.egg-info/top_level.txt +0 -0
|
@@ -1,34 +1,39 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docforge-cli
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Forge searchable context from Confluence and git repos for AI coding assistants
|
|
5
5
|
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://GranatenUdo.github.io/docforge/
|
|
7
|
+
Project-URL: Source, https://github.com/GranatenUdo/docforge
|
|
8
|
+
Project-URL: Issues, https://github.com/GranatenUdo/docforge/issues
|
|
9
|
+
Project-URL: Changelog, https://github.com/GranatenUdo/docforge/blob/master/CHANGELOG.md
|
|
10
|
+
Project-URL: Documentation, https://GranatenUdo.github.io/docforge/
|
|
6
11
|
Requires-Python: >=3.12
|
|
7
12
|
Description-Content-Type: text/markdown
|
|
8
13
|
License-File: LICENSE
|
|
9
|
-
Requires-Dist: typer>=0.12
|
|
10
|
-
Requires-Dist: asyncpg>=0.30
|
|
11
|
-
Requires-Dist: httpx>=0.27
|
|
12
|
-
Requires-Dist: pydantic>=2.9
|
|
13
|
-
Requires-Dist: pydantic-settings>=2.6
|
|
14
|
-
Requires-Dist: beautifulsoup4>=4.12
|
|
15
|
-
Requires-Dist: sentence-transformers>=5.0
|
|
16
|
-
Requires-Dist: pgvector>=0.3
|
|
17
|
-
Requires-Dist: pyyaml>=6.0
|
|
18
|
-
Requires-Dist: fastmcp>=2.0
|
|
19
|
-
Requires-Dist: fastapi>=0.115
|
|
20
|
-
Requires-Dist: uvicorn>=0.34
|
|
21
|
-
Requires-Dist: numpy>=1.26
|
|
14
|
+
Requires-Dist: typer<1.0,>=0.12
|
|
15
|
+
Requires-Dist: asyncpg<1.0,>=0.30
|
|
16
|
+
Requires-Dist: httpx<1.0,>=0.27
|
|
17
|
+
Requires-Dist: pydantic<3.0,>=2.9
|
|
18
|
+
Requires-Dist: pydantic-settings<3.0,>=2.6
|
|
19
|
+
Requires-Dist: beautifulsoup4<5.0,>=4.12
|
|
20
|
+
Requires-Dist: sentence-transformers<6.0,>=5.0
|
|
21
|
+
Requires-Dist: pgvector<1.0,>=0.3
|
|
22
|
+
Requires-Dist: pyyaml<7.0,>=6.0
|
|
23
|
+
Requires-Dist: fastmcp<4.0,>=3.0
|
|
24
|
+
Requires-Dist: fastapi<1.0,>=0.115
|
|
25
|
+
Requires-Dist: uvicorn<1.0,>=0.34
|
|
26
|
+
Requires-Dist: numpy<3.0,>=1.26
|
|
22
27
|
Provides-Extra: dev
|
|
23
|
-
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
24
|
-
Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
|
|
25
|
-
Requires-Dist: pytest-cov>=7.0; extra == "dev"
|
|
26
|
-
Requires-Dist: ruff>=0.8; extra == "dev"
|
|
27
|
-
Requires-Dist: testcontainers[postgres]>=4.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest<10.0,>=9.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-asyncio<2.0,>=1.0; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest-cov<8.0,>=7.0; extra == "dev"
|
|
31
|
+
Requires-Dist: ruff<1.0,>=0.8; extra == "dev"
|
|
32
|
+
Requires-Dist: testcontainers[postgres]<5.0,>=4.0; extra == "dev"
|
|
28
33
|
Provides-Extra: entra
|
|
29
|
-
Requires-Dist: fastapi-azure-auth>=5.0; extra == "entra"
|
|
30
|
-
Requires-Dist: azure-identity>=1.19; extra == "entra"
|
|
31
|
-
Requires-Dist: aiohttp>=3.10; extra == "entra"
|
|
34
|
+
Requires-Dist: fastapi-azure-auth<6.0,>=5.0; extra == "entra"
|
|
35
|
+
Requires-Dist: azure-identity<2.0,>=1.19; extra == "entra"
|
|
36
|
+
Requires-Dist: aiohttp<4.0,>=3.10; extra == "entra"
|
|
32
37
|
Dynamic: license-file
|
|
33
38
|
|
|
34
39
|
# docforge
|
|
@@ -78,15 +83,22 @@ docforge is the narrow, focused option in this landscape: minimal footprint, MCP
|
|
|
78
83
|
- You need near-real-time updates → ingest is batch; no webhook-driven continuous sync yet.
|
|
79
84
|
- You need multilingual search evaluated → EmbeddingGemma is multilingual, but docforge has no eval coverage on non-English corpora yet.
|
|
80
85
|
|
|
86
|
+
For the full trust model, accepted risks, and assumptions docforge makes about its operating environment, see [`docs/threat-model.md`](docs/threat-model.md).
|
|
87
|
+
|
|
81
88
|
## Quick Start
|
|
82
89
|
|
|
90
|
+
**Prerequisites:**
|
|
91
|
+
- Python 3.12+
|
|
92
|
+
- Docker (for the local Postgres + pgvector container)
|
|
93
|
+
- A [Hugging Face token](https://huggingface.co/settings/tokens) with access to the gated [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m) model. Accept the model license on the model page first.
|
|
94
|
+
|
|
83
95
|
```bash
|
|
84
96
|
pip install docforge-cli
|
|
85
97
|
docforge init my-project
|
|
86
98
|
cd my-project
|
|
87
99
|
# Edit docforge.yml with your Confluence URL
|
|
88
100
|
# Edit sources.yml with your page IDs and local git repo paths
|
|
89
|
-
# Edit .env with your credentials
|
|
101
|
+
# Edit .env with your credentials (CONFLUENCE_API_TOKEN, HF_TOKEN, DATABASE_URL)
|
|
90
102
|
docker compose up -d db
|
|
91
103
|
docforge init-db
|
|
92
104
|
docforge ingest
|
|
@@ -121,15 +133,69 @@ When an AI assistant needs cross-team context, it calls docforge's `search_docum
|
|
|
121
133
|
|
|
122
134
|
## Deploy to your infrastructure
|
|
123
135
|
|
|
124
|
-
For team-wide use, deploy the search API to Azure (~$
|
|
136
|
+
For team-wide use, deploy the search API to Azure (~$90/month at default SKUs with embedder always-on for production; ~$55/month with the default scale-to-zero embedder):
|
|
125
137
|
|
|
126
138
|
- PostgreSQL Flexible Server (Burstable B1ms, 32 GB) with pgvector.
|
|
127
139
|
- Container App running the FastAPI search API.
|
|
128
|
-
- Container
|
|
140
|
+
- Container App running the embedder service (EmbeddingGemma-300M, model baked into the image).
|
|
141
|
+
- Container Registry (Standard), Key Vault, Log Analytics, managed environment.
|
|
129
142
|
- Team members use a lightweight MCP client that calls the hosted API.
|
|
130
143
|
|
|
131
144
|
See [`deploy/azure/`](deploy/azure/) for Bicep templates and a full cost breakdown.
|
|
132
145
|
|
|
146
|
+
## Self-hosting / forking
|
|
147
|
+
|
|
148
|
+
The embedder image bakes the EmbeddingGemma-300M model at build time,
|
|
149
|
+
which requires a HuggingFace access token. Forks and adopters need to:
|
|
150
|
+
|
|
151
|
+
1. Get an HF token at https://huggingface.co/settings/tokens.
|
|
152
|
+
2. Accept the EmbeddingGemma license at
|
|
153
|
+
https://huggingface.co/google/embeddinggemma-300m.
|
|
154
|
+
3. Add a repo secret `HF_TOKEN` under
|
|
155
|
+
`Settings → Secrets and variables → Actions`.
|
|
156
|
+
|
|
157
|
+
The CI workflow forwards the secret to BuildKit via
|
|
158
|
+
`--mount=type=secret,id=hf_token`; the token never enters any image
|
|
159
|
+
layer. If you fork this repo and run the CI workflow, it will build the
|
|
160
|
+
embedder image automatically on commits to `master` and PRs (without
|
|
161
|
+
pushing unless on `master`). To enable pushes to a registry, also add
|
|
162
|
+
secrets `ACR_LOGIN_SERVER`, `ACR_USERNAME`, and `ACR_PASSWORD`.
|
|
163
|
+
|
|
164
|
+
## Upgrading the embedding model
|
|
165
|
+
|
|
166
|
+
The dimension-mismatch guard in `RemoteEmbedder` makes an
|
|
167
|
+
embedder/search API mismatch loud (`HTTP 503` with a clear log line)
|
|
168
|
+
rather than silent. Upgrade procedure:
|
|
169
|
+
|
|
170
|
+
1. **Pick the new model.** Note its output dimensionality `D` (e.g.
|
|
171
|
+
`768` for EmbeddingGemma, `1024` for many newer models).
|
|
172
|
+
|
|
173
|
+
2. **Update config.** Set `embedding_model: <new>` and
|
|
174
|
+
`embedding_dimensions: D` in the search API's deployment config
|
|
175
|
+
(Bicep parameters + Key Vault, or `docforge.yml` for self-hosters).
|
|
176
|
+
|
|
177
|
+
3. **Build the embedder image** with the new model:
|
|
178
|
+
```bash
|
|
179
|
+
docker build \
|
|
180
|
+
--build-arg EMBEDDING_MODEL=<new> \
|
|
181
|
+
--secret id=hf_token,env=HF_TOKEN \
|
|
182
|
+
-f Dockerfile.embedder \
|
|
183
|
+
-t docforge-embedder:<tag> .
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
4. **Apply schema migration.** Add a new vector column:
|
|
187
|
+
```sql
|
|
188
|
+
ALTER TABLE chunks ADD COLUMN embedding_new vector(D);
|
|
189
|
+
```
|
|
190
|
+
Re-ingest to populate the new column. Until backfill completes, the
|
|
191
|
+
search API serves from the old column.
|
|
192
|
+
|
|
193
|
+
5. **Cut over.** Deploy the new embedder image first, then the new
|
|
194
|
+
search API. The dim-mismatch guard ensures search refuses to serve
|
|
195
|
+
wrong-dim vectors.
|
|
196
|
+
|
|
197
|
+
6. **Drop the old column** after a confidence interval.
|
|
198
|
+
|
|
133
199
|
## Configuration
|
|
134
200
|
|
|
135
201
|
See `docs/` for the full configuration reference, including `docforge.yml` and `sources.yml` schemas.
|
|
@@ -140,34 +206,41 @@ Contributions welcome. See [`CONTRIBUTING.md`](CONTRIBUTING.md) for development
|
|
|
140
206
|
|
|
141
207
|
## Evaluation & retrieval quality
|
|
142
208
|
|
|
143
|
-
docforge ships with a retrieval-quality eval harness at [`docforge/scripts/eval_search.py`](docforge/scripts/eval_search.py). It measures recall@1, recall@k, and MRR against a ground-truth query set you maintain. The harness is designed for **drift detection** — run it after `sources.yml` changes, embedding-model updates, or ranking tweaks, and compare against your baseline. There is no absolute quality threshold; the metric magnitude depends on how closely your ground-truth queries match source titles. See [`docforge/scripts/README.md`](docforge/scripts/README.md) for details.
|
|
209
|
+
docforge ships with a retrieval-quality eval harness at [`src/docforge/scripts/eval_search.py`](src/docforge/scripts/eval_search.py). It measures recall@1, recall@k, and MRR against a ground-truth query set you maintain. The harness is designed for **drift detection** — run it after `sources.yml` changes, embedding-model updates, or ranking tweaks, and compare against your baseline. There is no absolute quality threshold; the metric magnitude depends on how closely your ground-truth queries match source titles. See [`src/docforge/scripts/README.md`](src/docforge/scripts/README.md) for details.
|
|
144
210
|
|
|
145
211
|
## FAQ
|
|
146
212
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
213
|
+
The three install-time issues new users hit most often are inline below. The
|
|
214
|
+
full FAQ — including "no results found", "ingest skipped everything", removing
|
|
215
|
+
sources, swapping embedding models, and where to file issues — lives on the
|
|
216
|
+
[microsite FAQ](https://GranatenUdo.github.io/docforge/faq/).
|
|
150
217
|
|
|
151
218
|
### "HF_TOKEN required" or model download fails
|
|
152
219
|
|
|
153
220
|
The embedding model `google/embeddinggemma-300m` requires a Hugging Face token with access to the gated model. Create one at https://huggingface.co/settings/tokens, accept the model license at https://huggingface.co/google/embeddinggemma-300m, and set `HF_TOKEN=hf_...` in `.env`.
|
|
154
221
|
|
|
155
|
-
### "No results found" after ingest
|
|
156
|
-
|
|
157
|
-
Run `docforge status` to confirm sources and chunks exist. If counts are zero, check the ingest logs for per-source failures — the summary at the end lists sources that failed.
|
|
158
|
-
|
|
159
222
|
### First ingest / first container start is very slow
|
|
160
223
|
|
|
161
224
|
The first run downloads the 300M embedding model (~1.2 GB) from Hugging Face. Locally, the model is cached at `~/.cache/huggingface/`. In the Docker image, it is cached at `/app/.cache/huggingface/` — **mount this as a volume** so container restarts do not re-download: `docker run -v docforge-hf-cache:/app/.cache/huggingface ...`.
|
|
162
225
|
|
|
163
|
-
### "
|
|
226
|
+
### "Cannot connect to PostgreSQL"
|
|
164
227
|
|
|
165
|
-
|
|
228
|
+
Check that the database is running: `docker compose up -d db`. Verify `DATABASE_URL` in `.env` points to `postgresql://docforge:localdev@localhost:5432/docforge` (or your custom value).
|
|
166
229
|
|
|
167
230
|
## License
|
|
168
231
|
|
|
169
232
|
MIT. See [LICENSE](LICENSE).
|
|
170
233
|
|
|
234
|
+
## License compatibility
|
|
235
|
+
|
|
236
|
+
docforge is MIT-licensed; the default embedding model,
|
|
237
|
+
[EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m), is
|
|
238
|
+
distributed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms),
|
|
239
|
+
which restrict harmful use and building products that compete with Gemma. Swap
|
|
240
|
+
to a permissively-licensed alternative via `embedding_model` in `docforge.yml`
|
|
241
|
+
if those constraints don't fit your use case (see
|
|
242
|
+
[microsite FAQ — Can I use a different embedding model?](https://GranatenUdo.github.io/docforge/faq/#can-i-use-a-different-embedding-model)).
|
|
243
|
+
|
|
171
244
|
## Credits
|
|
172
245
|
|
|
173
246
|
docforge stands on open shoulders:
|
|
@@ -1,36 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: docforge-cli
|
|
3
|
-
Version: 0.2.0
|
|
4
|
-
Summary: Forge searchable context from Confluence and git repos for AI coding assistants
|
|
5
|
-
License: MIT
|
|
6
|
-
Requires-Python: >=3.12
|
|
7
|
-
Description-Content-Type: text/markdown
|
|
8
|
-
License-File: LICENSE
|
|
9
|
-
Requires-Dist: typer>=0.12
|
|
10
|
-
Requires-Dist: asyncpg>=0.30
|
|
11
|
-
Requires-Dist: httpx>=0.27
|
|
12
|
-
Requires-Dist: pydantic>=2.9
|
|
13
|
-
Requires-Dist: pydantic-settings>=2.6
|
|
14
|
-
Requires-Dist: beautifulsoup4>=4.12
|
|
15
|
-
Requires-Dist: sentence-transformers>=5.0
|
|
16
|
-
Requires-Dist: pgvector>=0.3
|
|
17
|
-
Requires-Dist: pyyaml>=6.0
|
|
18
|
-
Requires-Dist: fastmcp>=2.0
|
|
19
|
-
Requires-Dist: fastapi>=0.115
|
|
20
|
-
Requires-Dist: uvicorn>=0.34
|
|
21
|
-
Requires-Dist: numpy>=1.26
|
|
22
|
-
Provides-Extra: dev
|
|
23
|
-
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
24
|
-
Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
|
|
25
|
-
Requires-Dist: pytest-cov>=7.0; extra == "dev"
|
|
26
|
-
Requires-Dist: ruff>=0.8; extra == "dev"
|
|
27
|
-
Requires-Dist: testcontainers[postgres]>=4.0; extra == "dev"
|
|
28
|
-
Provides-Extra: entra
|
|
29
|
-
Requires-Dist: fastapi-azure-auth>=5.0; extra == "entra"
|
|
30
|
-
Requires-Dist: azure-identity>=1.19; extra == "entra"
|
|
31
|
-
Requires-Dist: aiohttp>=3.10; extra == "entra"
|
|
32
|
-
Dynamic: license-file
|
|
33
|
-
|
|
34
1
|
# docforge
|
|
35
2
|
|
|
36
3
|
**The self-hosted context engine for AI coding assistants.**
|
|
@@ -78,15 +45,22 @@ docforge is the narrow, focused option in this landscape: minimal footprint, MCP
|
|
|
78
45
|
- You need near-real-time updates → ingest is batch; no webhook-driven continuous sync yet.
|
|
79
46
|
- You need multilingual search evaluated → EmbeddingGemma is multilingual, but docforge has no eval coverage on non-English corpora yet.
|
|
80
47
|
|
|
48
|
+
For the full trust model, accepted risks, and assumptions docforge makes about its operating environment, see [`docs/threat-model.md`](docs/threat-model.md).
|
|
49
|
+
|
|
81
50
|
## Quick Start
|
|
82
51
|
|
|
52
|
+
**Prerequisites:**
|
|
53
|
+
- Python 3.12+
|
|
54
|
+
- Docker (for the local Postgres + pgvector container)
|
|
55
|
+
- A [Hugging Face token](https://huggingface.co/settings/tokens) with access to the gated [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m) model. Accept the model license on the model page first.
|
|
56
|
+
|
|
83
57
|
```bash
|
|
84
58
|
pip install docforge-cli
|
|
85
59
|
docforge init my-project
|
|
86
60
|
cd my-project
|
|
87
61
|
# Edit docforge.yml with your Confluence URL
|
|
88
62
|
# Edit sources.yml with your page IDs and local git repo paths
|
|
89
|
-
# Edit .env with your credentials
|
|
63
|
+
# Edit .env with your credentials (CONFLUENCE_API_TOKEN, HF_TOKEN, DATABASE_URL)
|
|
90
64
|
docker compose up -d db
|
|
91
65
|
docforge init-db
|
|
92
66
|
docforge ingest
|
|
@@ -121,15 +95,69 @@ When an AI assistant needs cross-team context, it calls docforge's `search_docum
|
|
|
121
95
|
|
|
122
96
|
## Deploy to your infrastructure
|
|
123
97
|
|
|
124
|
-
For team-wide use, deploy the search API to Azure (~$
|
|
98
|
+
For team-wide use, deploy the search API to Azure (~$90/month at default SKUs with embedder always-on for production; ~$55/month with the default scale-to-zero embedder):
|
|
125
99
|
|
|
126
100
|
- PostgreSQL Flexible Server (Burstable B1ms, 32 GB) with pgvector.
|
|
127
101
|
- Container App running the FastAPI search API.
|
|
128
|
-
- Container
|
|
102
|
+
- Container App running the embedder service (EmbeddingGemma-300M, model baked into the image).
|
|
103
|
+
- Container Registry (Standard), Key Vault, Log Analytics, managed environment.
|
|
129
104
|
- Team members use a lightweight MCP client that calls the hosted API.
|
|
130
105
|
|
|
131
106
|
See [`deploy/azure/`](deploy/azure/) for Bicep templates and a full cost breakdown.
|
|
132
107
|
|
|
108
|
+
## Self-hosting / forking
|
|
109
|
+
|
|
110
|
+
The embedder image bakes the EmbeddingGemma-300M model at build time,
|
|
111
|
+
which requires a HuggingFace access token. Forks and adopters need to:
|
|
112
|
+
|
|
113
|
+
1. Get an HF token at https://huggingface.co/settings/tokens.
|
|
114
|
+
2. Accept the EmbeddingGemma license at
|
|
115
|
+
https://huggingface.co/google/embeddinggemma-300m.
|
|
116
|
+
3. Add a repo secret `HF_TOKEN` under
|
|
117
|
+
`Settings → Secrets and variables → Actions`.
|
|
118
|
+
|
|
119
|
+
The CI workflow forwards the secret to BuildKit via
|
|
120
|
+
`--mount=type=secret,id=hf_token`; the token never enters any image
|
|
121
|
+
layer. If you fork this repo and run the CI workflow, it will build the
|
|
122
|
+
embedder image automatically on commits to `master` and PRs (without
|
|
123
|
+
pushing unless on `master`). To enable pushes to a registry, also add
|
|
124
|
+
secrets `ACR_LOGIN_SERVER`, `ACR_USERNAME`, and `ACR_PASSWORD`.
|
|
125
|
+
|
|
126
|
+
## Upgrading the embedding model
|
|
127
|
+
|
|
128
|
+
The dimension-mismatch guard in `RemoteEmbedder` makes an
|
|
129
|
+
embedder/search API mismatch loud (`HTTP 503` with a clear log line)
|
|
130
|
+
rather than silent. Upgrade procedure:
|
|
131
|
+
|
|
132
|
+
1. **Pick the new model.** Note its output dimensionality `D` (e.g.
|
|
133
|
+
`768` for EmbeddingGemma, `1024` for many newer models).
|
|
134
|
+
|
|
135
|
+
2. **Update config.** Set `embedding_model: <new>` and
|
|
136
|
+
`embedding_dimensions: D` in the search API's deployment config
|
|
137
|
+
(Bicep parameters + Key Vault, or `docforge.yml` for self-hosters).
|
|
138
|
+
|
|
139
|
+
3. **Build the embedder image** with the new model:
|
|
140
|
+
```bash
|
|
141
|
+
docker build \
|
|
142
|
+
--build-arg EMBEDDING_MODEL=<new> \
|
|
143
|
+
--secret id=hf_token,env=HF_TOKEN \
|
|
144
|
+
-f Dockerfile.embedder \
|
|
145
|
+
-t docforge-embedder:<tag> .
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
4. **Apply schema migration.** Add a new vector column:
|
|
149
|
+
```sql
|
|
150
|
+
ALTER TABLE chunks ADD COLUMN embedding_new vector(D);
|
|
151
|
+
```
|
|
152
|
+
Re-ingest to populate the new column. Until backfill completes, the
|
|
153
|
+
search API serves from the old column.
|
|
154
|
+
|
|
155
|
+
5. **Cut over.** Deploy the new embedder image first, then the new
|
|
156
|
+
search API. The dim-mismatch guard ensures search refuses to serve
|
|
157
|
+
wrong-dim vectors.
|
|
158
|
+
|
|
159
|
+
6. **Drop the old column** after a confidence interval.
|
|
160
|
+
|
|
133
161
|
## Configuration
|
|
134
162
|
|
|
135
163
|
See `docs/` for the full configuration reference, including `docforge.yml` and `sources.yml` schemas.
|
|
@@ -140,34 +168,41 @@ Contributions welcome. See [`CONTRIBUTING.md`](CONTRIBUTING.md) for development
|
|
|
140
168
|
|
|
141
169
|
## Evaluation & retrieval quality
|
|
142
170
|
|
|
143
|
-
docforge ships with a retrieval-quality eval harness at [`docforge/scripts/eval_search.py`](docforge/scripts/eval_search.py). It measures recall@1, recall@k, and MRR against a ground-truth query set you maintain. The harness is designed for **drift detection** — run it after `sources.yml` changes, embedding-model updates, or ranking tweaks, and compare against your baseline. There is no absolute quality threshold; the metric magnitude depends on how closely your ground-truth queries match source titles. See [`docforge/scripts/README.md`](docforge/scripts/README.md) for details.
|
|
171
|
+
docforge ships with a retrieval-quality eval harness at [`src/docforge/scripts/eval_search.py`](src/docforge/scripts/eval_search.py). It measures recall@1, recall@k, and MRR against a ground-truth query set you maintain. The harness is designed for **drift detection** — run it after `sources.yml` changes, embedding-model updates, or ranking tweaks, and compare against your baseline. There is no absolute quality threshold; the metric magnitude depends on how closely your ground-truth queries match source titles. See [`src/docforge/scripts/README.md`](src/docforge/scripts/README.md) for details.
|
|
144
172
|
|
|
145
173
|
## FAQ
|
|
146
174
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
175
|
+
The three install-time issues new users hit most often are inline below. The
|
|
176
|
+
full FAQ — including "no results found", "ingest skipped everything", removing
|
|
177
|
+
sources, swapping embedding models, and where to file issues — lives on the
|
|
178
|
+
[microsite FAQ](https://GranatenUdo.github.io/docforge/faq/).
|
|
150
179
|
|
|
151
180
|
### "HF_TOKEN required" or model download fails
|
|
152
181
|
|
|
153
182
|
The embedding model `google/embeddinggemma-300m` requires a Hugging Face token with access to the gated model. Create one at https://huggingface.co/settings/tokens, accept the model license at https://huggingface.co/google/embeddinggemma-300m, and set `HF_TOKEN=hf_...` in `.env`.
|
|
154
183
|
|
|
155
|
-
### "No results found" after ingest
|
|
156
|
-
|
|
157
|
-
Run `docforge status` to confirm sources and chunks exist. If counts are zero, check the ingest logs for per-source failures — the summary at the end lists sources that failed.
|
|
158
|
-
|
|
159
184
|
### First ingest / first container start is very slow
|
|
160
185
|
|
|
161
186
|
The first run downloads the 300M embedding model (~1.2 GB) from Hugging Face. Locally, the model is cached at `~/.cache/huggingface/`. In the Docker image, it is cached at `/app/.cache/huggingface/` — **mount this as a volume** so container restarts do not re-download: `docker run -v docforge-hf-cache:/app/.cache/huggingface ...`.
|
|
162
187
|
|
|
163
|
-
### "
|
|
188
|
+
### "Cannot connect to PostgreSQL"
|
|
164
189
|
|
|
165
|
-
|
|
190
|
+
Check that the database is running: `docker compose up -d db`. Verify `DATABASE_URL` in `.env` points to `postgresql://docforge:localdev@localhost:5432/docforge` (or your custom value).
|
|
166
191
|
|
|
167
192
|
## License
|
|
168
193
|
|
|
169
194
|
MIT. See [LICENSE](LICENSE).
|
|
170
195
|
|
|
196
|
+
## License compatibility
|
|
197
|
+
|
|
198
|
+
docforge is MIT-licensed; the default embedding model,
|
|
199
|
+
[EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m), is
|
|
200
|
+
distributed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms),
|
|
201
|
+
which restrict harmful use and building products that compete with Gemma. Swap
|
|
202
|
+
to a permissively-licensed alternative via `embedding_model` in `docforge.yml`
|
|
203
|
+
if those constraints don't fit your use case (see
|
|
204
|
+
[microsite FAQ — Can I use a different embedding model?](https://GranatenUdo.github.io/docforge/faq/#can-i-use-a-different-embedding-model)).
|
|
205
|
+
|
|
171
206
|
## Credits
|
|
172
207
|
|
|
173
208
|
docforge stands on open shoulders:
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=75.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "docforge-cli"
|
|
7
|
+
version = "0.3.0"
|
|
8
|
+
description = "Forge searchable context from Confluence and git repos for AI coding assistants"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.12"
|
|
12
|
+
dependencies = [
|
|
13
|
+
"typer>=0.12,<1.0",
|
|
14
|
+
"asyncpg>=0.30,<1.0",
|
|
15
|
+
"httpx>=0.27,<1.0",
|
|
16
|
+
"pydantic>=2.9,<3.0",
|
|
17
|
+
"pydantic-settings>=2.6,<3.0",
|
|
18
|
+
"beautifulsoup4>=4.12,<5.0",
|
|
19
|
+
"sentence-transformers>=5.0,<6.0",
|
|
20
|
+
"pgvector>=0.3,<1.0",
|
|
21
|
+
"pyyaml>=6.0,<7.0",
|
|
22
|
+
"fastmcp>=3.0,<4.0",
|
|
23
|
+
"fastapi>=0.115,<1.0",
|
|
24
|
+
"uvicorn>=0.34,<1.0",
|
|
25
|
+
"numpy>=1.26,<3.0", # both 1.x and 2.x tested
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.urls]
|
|
29
|
+
Homepage = "https://GranatenUdo.github.io/docforge/"
|
|
30
|
+
Source = "https://github.com/GranatenUdo/docforge"
|
|
31
|
+
Issues = "https://github.com/GranatenUdo/docforge/issues"
|
|
32
|
+
Changelog = "https://github.com/GranatenUdo/docforge/blob/master/CHANGELOG.md"
|
|
33
|
+
Documentation = "https://GranatenUdo.github.io/docforge/"
|
|
34
|
+
|
|
35
|
+
[project.scripts]
|
|
36
|
+
docforge = "docforge.cli:app"
|
|
37
|
+
|
|
38
|
+
[project.optional-dependencies]
|
|
39
|
+
dev = [
|
|
40
|
+
"pytest>=9.0,<10.0",
|
|
41
|
+
"pytest-asyncio>=1.0,<2.0",
|
|
42
|
+
"pytest-cov>=7.0,<8.0",
|
|
43
|
+
"ruff>=0.8,<1.0",
|
|
44
|
+
"testcontainers[postgres]>=4.0,<5.0",
|
|
45
|
+
]
|
|
46
|
+
entra = [
|
|
47
|
+
"fastapi-azure-auth>=5.0,<6.0",
|
|
48
|
+
"azure-identity>=1.19,<2.0",
|
|
49
|
+
# aiohttp is required by azure-identity.aio's async pipeline
|
|
50
|
+
"aiohttp>=3.10,<4.0",
|
|
51
|
+
]
|
|
52
|
+
|
|
53
|
+
[tool.setuptools.packages.find]
|
|
54
|
+
where = ["src"]
|
|
55
|
+
include = ["docforge*"]
|
|
56
|
+
|
|
57
|
+
[tool.setuptools.package-data]
|
|
58
|
+
docforge = ["templates/**/*", "sql/**/*"]
|
|
59
|
+
|
|
60
|
+
[tool.ruff]
|
|
61
|
+
line-length = 100
|
|
62
|
+
target-version = "py312"
|
|
63
|
+
|
|
64
|
+
[tool.ruff.lint]
|
|
65
|
+
select = ["E", "F", "I", "W"]
|
|
66
|
+
|
|
67
|
+
[tool.pytest.ini_options]
|
|
68
|
+
asyncio_mode = "auto"
|
|
69
|
+
testpaths = ["tests"]
|
|
70
|
+
markers = [
|
|
71
|
+
"integration: tests requiring real external resources (Docker for Postgres, network for embedding model)",
|
|
72
|
+
]
|
|
73
|
+
addopts = "--cov=src/docforge"
|
|
74
|
+
|
|
75
|
+
[tool.coverage.report]
|
|
76
|
+
fail_under = 60
|
|
77
|
+
exclude_also = [
|
|
78
|
+
"if __name__ == \"__main__\":",
|
|
79
|
+
"pragma: no cover",
|
|
80
|
+
"raise NotImplementedError",
|
|
81
|
+
]
|