docforge-cli 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. {docforge_cli-0.2.0/docforge_cli.egg-info → docforge_cli-0.3.0}/PKG-INFO +108 -35
  2. docforge_cli-0.2.0/PKG-INFO → docforge_cli-0.3.0/README.md +81 -46
  3. docforge_cli-0.3.0/pyproject.toml +81 -0
  4. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/api.py +107 -72
  5. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/cli.py +18 -3
  6. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/config.py +22 -0
  7. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/db.py +15 -4
  8. docforge_cli-0.3.0/src/docforge/embedder_api.py +86 -0
  9. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/ingest.py +8 -4
  10. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/mcp_server.py +23 -15
  11. docforge_cli-0.3.0/src/docforge/processors/embedder.py +246 -0
  12. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/scripts/eval_search.py +2 -2
  13. docforge_cli-0.2.0/README.md → docforge_cli-0.3.0/src/docforge_cli.egg-info/PKG-INFO +119 -13
  14. docforge_cli-0.3.0/src/docforge_cli.egg-info/SOURCES.txt +43 -0
  15. docforge_cli-0.3.0/src/docforge_cli.egg-info/requires.txt +25 -0
  16. docforge_cli-0.2.0/docforge/processors/embedder.py +0 -78
  17. docforge_cli-0.2.0/docforge_cli.egg-info/SOURCES.txt +0 -42
  18. docforge_cli-0.2.0/docforge_cli.egg-info/requires.txt +0 -25
  19. docforge_cli-0.2.0/pyproject.toml +0 -73
  20. {docforge_cli-0.2.0 → docforge_cli-0.3.0}/LICENSE +0 -0
  21. {docforge_cli-0.2.0 → docforge_cli-0.3.0}/setup.cfg +0 -0
  22. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/__init__.py +0 -0
  23. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/__main__.py +0 -0
  24. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/crawlers/__init__.py +0 -0
  25. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/crawlers/confluence.py +0 -0
  26. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/crawlers/git.py +0 -0
  27. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/lint.py +0 -0
  28. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/processors/__init__.py +0 -0
  29. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/processors/chunker.py +0 -0
  30. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/processors/parser.py +0 -0
  31. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/query_log.py +0 -0
  32. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/ranking.py +0 -0
  33. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/scripts/__init__.py +0 -0
  34. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/scripts/latency_report.py +0 -0
  35. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/sources.py +0 -0
  36. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/sql/migrations/001_add_source_identifier.sql +0 -0
  37. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/sql/migrations/002_add_status_index.sql +0 -0
  38. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/sql/migrations/003_add_source_tags.sql +0 -0
  39. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/sql/migrations/004_add_query_log.sql +0 -0
  40. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/sql/migrations/005_add_query_log_user_oid.sql +0 -0
  41. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/sql/migrations/006_add_query_log_request_ms.sql +0 -0
  42. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/sql/schema.sql +0 -0
  43. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/templates/docforge.yml +0 -0
  44. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/templates/docker-compose.yml +0 -0
  45. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/templates/mcp_client.py +0 -0
  46. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge/templates/sources.yml +0 -0
  47. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge_cli.egg-info/dependency_links.txt +0 -0
  48. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge_cli.egg-info/entry_points.txt +0 -0
  49. {docforge_cli-0.2.0 → docforge_cli-0.3.0/src}/docforge_cli.egg-info/top_level.txt +0 -0
@@ -1,34 +1,39 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docforge-cli
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Forge searchable context from Confluence and git repos for AI coding assistants
5
5
  License: MIT
6
+ Project-URL: Homepage, https://GranatenUdo.github.io/docforge/
7
+ Project-URL: Source, https://github.com/GranatenUdo/docforge
8
+ Project-URL: Issues, https://github.com/GranatenUdo/docforge/issues
9
+ Project-URL: Changelog, https://github.com/GranatenUdo/docforge/blob/master/CHANGELOG.md
10
+ Project-URL: Documentation, https://GranatenUdo.github.io/docforge/
6
11
  Requires-Python: >=3.12
7
12
  Description-Content-Type: text/markdown
8
13
  License-File: LICENSE
9
- Requires-Dist: typer>=0.12
10
- Requires-Dist: asyncpg>=0.30
11
- Requires-Dist: httpx>=0.27
12
- Requires-Dist: pydantic>=2.9
13
- Requires-Dist: pydantic-settings>=2.6
14
- Requires-Dist: beautifulsoup4>=4.12
15
- Requires-Dist: sentence-transformers>=5.0
16
- Requires-Dist: pgvector>=0.3
17
- Requires-Dist: pyyaml>=6.0
18
- Requires-Dist: fastmcp>=2.0
19
- Requires-Dist: fastapi>=0.115
20
- Requires-Dist: uvicorn>=0.34
21
- Requires-Dist: numpy>=1.26
14
+ Requires-Dist: typer<1.0,>=0.12
15
+ Requires-Dist: asyncpg<1.0,>=0.30
16
+ Requires-Dist: httpx<1.0,>=0.27
17
+ Requires-Dist: pydantic<3.0,>=2.9
18
+ Requires-Dist: pydantic-settings<3.0,>=2.6
19
+ Requires-Dist: beautifulsoup4<5.0,>=4.12
20
+ Requires-Dist: sentence-transformers<6.0,>=5.0
21
+ Requires-Dist: pgvector<1.0,>=0.3
22
+ Requires-Dist: pyyaml<7.0,>=6.0
23
+ Requires-Dist: fastmcp<4.0,>=3.0
24
+ Requires-Dist: fastapi<1.0,>=0.115
25
+ Requires-Dist: uvicorn<1.0,>=0.34
26
+ Requires-Dist: numpy<3.0,>=1.26
22
27
  Provides-Extra: dev
23
- Requires-Dist: pytest>=8.0; extra == "dev"
24
- Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
25
- Requires-Dist: pytest-cov>=7.0; extra == "dev"
26
- Requires-Dist: ruff>=0.8; extra == "dev"
27
- Requires-Dist: testcontainers[postgres]>=4.0; extra == "dev"
28
+ Requires-Dist: pytest<10.0,>=9.0; extra == "dev"
29
+ Requires-Dist: pytest-asyncio<2.0,>=1.0; extra == "dev"
30
+ Requires-Dist: pytest-cov<8.0,>=7.0; extra == "dev"
31
+ Requires-Dist: ruff<1.0,>=0.8; extra == "dev"
32
+ Requires-Dist: testcontainers[postgres]<5.0,>=4.0; extra == "dev"
28
33
  Provides-Extra: entra
29
- Requires-Dist: fastapi-azure-auth>=5.0; extra == "entra"
30
- Requires-Dist: azure-identity>=1.19; extra == "entra"
31
- Requires-Dist: aiohttp>=3.10; extra == "entra"
34
+ Requires-Dist: fastapi-azure-auth<6.0,>=5.0; extra == "entra"
35
+ Requires-Dist: azure-identity<2.0,>=1.19; extra == "entra"
36
+ Requires-Dist: aiohttp<4.0,>=3.10; extra == "entra"
32
37
  Dynamic: license-file
33
38
 
34
39
  # docforge
@@ -78,15 +83,22 @@ docforge is the narrow, focused option in this landscape: minimal footprint, MCP
78
83
  - You need near-real-time updates → ingest is batch; no webhook-driven continuous sync yet.
79
84
  - You need multilingual search evaluated → EmbeddingGemma is multilingual, but docforge has no eval coverage on non-English corpora yet.
80
85
 
86
+ For the full trust model, accepted risks, and assumptions docforge makes about its operating environment, see [`docs/threat-model.md`](docs/threat-model.md).
87
+
81
88
  ## Quick Start
82
89
 
90
+ **Prerequisites:**
91
+ - Python 3.12+
92
+ - Docker (for the local Postgres + pgvector container)
93
+ - A [Hugging Face token](https://huggingface.co/settings/tokens) with access to the gated [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m) model. Accept the model license on the model page first.
94
+
83
95
  ```bash
84
96
  pip install docforge-cli
85
97
  docforge init my-project
86
98
  cd my-project
87
99
  # Edit docforge.yml with your Confluence URL
88
100
  # Edit sources.yml with your page IDs and local git repo paths
89
- # Edit .env with your credentials
101
+ # Edit .env with your credentials (CONFLUENCE_API_TOKEN, HF_TOKEN, DATABASE_URL)
90
102
  docker compose up -d db
91
103
  docforge init-db
92
104
  docforge ingest
@@ -121,15 +133,69 @@ When an AI assistant needs cross-team context, it calls docforge's `search_docum
121
133
 
122
134
  ## Deploy to your infrastructure
123
135
 
124
- For team-wide use, deploy the search API to Azure (~$35/month at default SKUs):
136
+ For team-wide use, deploy the search API to Azure (~$90/month at default SKUs with embedder always-on for production; ~$55/month with the default scale-to-zero embedder):
125
137
 
126
138
  - PostgreSQL Flexible Server (Burstable B1ms, 32 GB) with pgvector.
127
139
  - Container App running the FastAPI search API.
128
- - Container Registry, Key Vault, Log Analytics, managed environment.
140
+ - Container App running the embedder service (EmbeddingGemma-300M, model baked into the image).
141
+ - Container Registry (Standard), Key Vault, Log Analytics, managed environment.
129
142
  - Team members use a lightweight MCP client that calls the hosted API.
130
143
 
131
144
  See [`deploy/azure/`](deploy/azure/) for Bicep templates and a full cost breakdown.
132
145
 
146
+ ## Self-hosting / forking
147
+
148
+ The embedder image bakes the EmbeddingGemma-300M model at build time,
149
+ which requires a Hugging Face access token. Forks and adopters need to:
150
+
151
+ 1. Get an HF token at https://huggingface.co/settings/tokens.
152
+ 2. Accept the EmbeddingGemma license at
153
+ https://huggingface.co/google/embeddinggemma-300m.
154
+ 3. Add a repo secret `HF_TOKEN` under
155
+ `Settings → Secrets and variables → Actions`.
156
+
157
+ The CI workflow forwards the secret to BuildKit via
158
+ `--mount=type=secret,id=hf_token`; the token never enters any image
159
+ layer. If you fork this repo and run the CI workflow, it will build the
160
+ embedder image automatically on commits to `master` and PRs (without
161
+ pushing unless on `master`). To enable pushes to a registry, also add
162
+ secrets `ACR_LOGIN_SERVER`, `ACR_USERNAME`, and `ACR_PASSWORD`.
163
+
164
+ ## Upgrading the embedding model
165
+
166
+ The dimension-mismatch guard in `RemoteEmbedder` makes an
167
+ embedder/search API mismatch loud (`HTTP 503` with a clear log line)
168
+ rather than silent. Upgrade procedure:
169
+
170
+ 1. **Pick the new model.** Note its output dimensionality `D` (e.g.
171
+ `768` for EmbeddingGemma, `1024` for many newer models).
172
+
173
+ 2. **Update config.** Set `embedding_model: <new>` and
174
+ `embedding_dimensions: D` in the search API's deployment config
175
+ (Bicep parameters + Key Vault, or `docforge.yml` for self-hosters).
176
+
177
+ 3. **Build the embedder image** with the new model:
178
+ ```bash
179
+ docker build \
180
+ --build-arg EMBEDDING_MODEL=<new> \
181
+ --secret id=hf_token,env=HF_TOKEN \
182
+ -f Dockerfile.embedder \
183
+ -t docforge-embedder:<tag> .
184
+ ```
185
+
186
+ 4. **Apply schema migration.** Add a new vector column:
187
+ ```sql
188
+ ALTER TABLE chunks ADD COLUMN embedding_new vector(D);
189
+ ```
190
+ Re-ingest to populate the new column. Until backfill completes, the
191
+ search API serves from the old column.
192
+
193
+ 5. **Cut over.** Deploy the new embedder image first, then the new
194
+ search API. The dim-mismatch guard ensures search refuses to serve
195
+ wrong-dim vectors.
196
+
197
+ 6. **Drop the old column** after a soak period, once you are confident you will not need to roll back.
198
+
133
199
  ## Configuration
134
200
 
135
201
  See `docs/` for the full configuration reference, including `docforge.yml` and `sources.yml` schemas.
@@ -140,34 +206,41 @@ Contributions welcome. See [`CONTRIBUTING.md`](CONTRIBUTING.md) for development
140
206
 
141
207
  ## Evaluation & retrieval quality
142
208
 
143
- docforge ships with a retrieval-quality eval harness at [`docforge/scripts/eval_search.py`](docforge/scripts/eval_search.py). It measures recall@1, recall@k, and MRR against a ground-truth query set you maintain. The harness is designed for **drift detection** — run it after `sources.yml` changes, embedding-model updates, or ranking tweaks, and compare against your baseline. There is no absolute quality threshold; the metric magnitude depends on how closely your ground-truth queries match source titles. See [`docforge/scripts/README.md`](docforge/scripts/README.md) for details.
209
+ docforge ships with a retrieval-quality eval harness at [`src/docforge/scripts/eval_search.py`](src/docforge/scripts/eval_search.py). It measures recall@1, recall@k, and MRR against a ground-truth query set you maintain. The harness is designed for **drift detection** — run it after `sources.yml` changes, embedding-model updates, or ranking tweaks, and compare against your baseline. There is no absolute quality threshold; the metric magnitude depends on how closely your ground-truth queries match source titles. See [`src/docforge/scripts/README.md`](src/docforge/scripts/README.md) for details.
144
210
 
145
211
  ## FAQ
146
212
 
147
- ### "Cannot connect to PostgreSQL"
148
-
149
- Check that the database is running: `docker compose up -d db`. Verify `DATABASE_URL` in `.env` points to `postgresql://docforge:localdev@localhost:5432/docforge` (or your custom value).
213
+ The three install-time issues new users hit most often are inline below. The
214
+ full FAQ — including "no results found", "ingest skipped everything", removing
215
+ sources, swapping embedding models, and where to file issues — lives on the
216
+ [microsite FAQ](https://GranatenUdo.github.io/docforge/faq/).
150
217
 
151
218
  ### "HF_TOKEN required" or model download fails
152
219
 
153
220
  The embedding model `google/embeddinggemma-300m` requires a Hugging Face token with access to the gated model. Create one at https://huggingface.co/settings/tokens, accept the model license at https://huggingface.co/google/embeddinggemma-300m, and set `HF_TOKEN=hf_...` in `.env`.
154
221
 
155
- ### "No results found" after ingest
156
-
157
- Run `docforge status` to confirm sources and chunks exist. If counts are zero, check the ingest logs for per-source failures — the summary at the end lists sources that failed.
158
-
159
222
  ### First ingest / first container start is very slow
160
223
 
161
224
  The first run downloads the 300M embedding model (~1.2 GB) from Hugging Face. Locally, the model is cached at `~/.cache/huggingface/`. In the Docker image, it is cached at `/app/.cache/huggingface/` — **mount this as a volume** so container restarts do not re-download: `docker run -v docforge-hf-cache:/app/.cache/huggingface ...`.
162
225
 
163
- ### "Ingest skipped everything"
226
+ ### "Cannot connect to PostgreSQL"
164
227
 
165
- docforge skips sources whose `content_hash` matches the stored hash (no changes detected). To force re-ingest, clear the hash: `UPDATE sources SET content_hash = NULL;` then run `docforge ingest`.
228
+ Check that the database is running: `docker compose up -d db`. Verify `DATABASE_URL` in `.env` points to `postgresql://docforge:localdev@localhost:5432/docforge` (or your custom value).
166
229
 
167
230
  ## License
168
231
 
169
232
  MIT. See [LICENSE](LICENSE).
170
233
 
234
+ ## License compatibility
235
+
236
+ docforge is MIT-licensed; the default embedding model,
237
+ [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m), is
238
+ distributed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms),
239
+ which restrict harmful use and building products that compete with Gemma. Swap
240
+ to a permissively-licensed alternative via `embedding_model` in `docforge.yml`
241
+ if those constraints don't fit your use case (see
242
+ [microsite FAQ — Can I use a different embedding model?](https://GranatenUdo.github.io/docforge/faq/#can-i-use-a-different-embedding-model)).
243
+
171
244
  ## Credits
172
245
 
173
246
  docforge stands on open shoulders:
@@ -1,36 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: docforge-cli
3
- Version: 0.2.0
4
- Summary: Forge searchable context from Confluence and git repos for AI coding assistants
5
- License: MIT
6
- Requires-Python: >=3.12
7
- Description-Content-Type: text/markdown
8
- License-File: LICENSE
9
- Requires-Dist: typer>=0.12
10
- Requires-Dist: asyncpg>=0.30
11
- Requires-Dist: httpx>=0.27
12
- Requires-Dist: pydantic>=2.9
13
- Requires-Dist: pydantic-settings>=2.6
14
- Requires-Dist: beautifulsoup4>=4.12
15
- Requires-Dist: sentence-transformers>=5.0
16
- Requires-Dist: pgvector>=0.3
17
- Requires-Dist: pyyaml>=6.0
18
- Requires-Dist: fastmcp>=2.0
19
- Requires-Dist: fastapi>=0.115
20
- Requires-Dist: uvicorn>=0.34
21
- Requires-Dist: numpy>=1.26
22
- Provides-Extra: dev
23
- Requires-Dist: pytest>=8.0; extra == "dev"
24
- Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
25
- Requires-Dist: pytest-cov>=7.0; extra == "dev"
26
- Requires-Dist: ruff>=0.8; extra == "dev"
27
- Requires-Dist: testcontainers[postgres]>=4.0; extra == "dev"
28
- Provides-Extra: entra
29
- Requires-Dist: fastapi-azure-auth>=5.0; extra == "entra"
30
- Requires-Dist: azure-identity>=1.19; extra == "entra"
31
- Requires-Dist: aiohttp>=3.10; extra == "entra"
32
- Dynamic: license-file
33
-
34
1
  # docforge
35
2
 
36
3
  **The self-hosted context engine for AI coding assistants.**
@@ -78,15 +45,22 @@ docforge is the narrow, focused option in this landscape: minimal footprint, MCP
78
45
  - You need near-real-time updates → ingest is batch; no webhook-driven continuous sync yet.
79
46
  - You need multilingual search evaluated → EmbeddingGemma is multilingual, but docforge has no eval coverage on non-English corpora yet.
80
47
 
48
+ For the full trust model, accepted risks, and assumptions docforge makes about its operating environment, see [`docs/threat-model.md`](docs/threat-model.md).
49
+
81
50
  ## Quick Start
82
51
 
52
+ **Prerequisites:**
53
+ - Python 3.12+
54
+ - Docker (for the local Postgres + pgvector container)
55
+ - A [Hugging Face token](https://huggingface.co/settings/tokens) with access to the gated [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m) model. Accept the model license on the model page first.
56
+
83
57
  ```bash
84
58
  pip install docforge-cli
85
59
  docforge init my-project
86
60
  cd my-project
87
61
  # Edit docforge.yml with your Confluence URL
88
62
  # Edit sources.yml with your page IDs and local git repo paths
89
- # Edit .env with your credentials
63
+ # Edit .env with your credentials (CONFLUENCE_API_TOKEN, HF_TOKEN, DATABASE_URL)
90
64
  docker compose up -d db
91
65
  docforge init-db
92
66
  docforge ingest
@@ -121,15 +95,69 @@ When an AI assistant needs cross-team context, it calls docforge's `search_docum
121
95
 
122
96
  ## Deploy to your infrastructure
123
97
 
124
- For team-wide use, deploy the search API to Azure (~$35/month at default SKUs):
98
+ For team-wide use, deploy the search API to Azure (~$90/month at default SKUs with embedder always-on for production; ~$55/month with the default scale-to-zero embedder):
125
99
 
126
100
  - PostgreSQL Flexible Server (Burstable B1ms, 32 GB) with pgvector.
127
101
  - Container App running the FastAPI search API.
128
- - Container Registry, Key Vault, Log Analytics, managed environment.
102
+ - Container App running the embedder service (EmbeddingGemma-300M, model baked into the image).
103
+ - Container Registry (Standard), Key Vault, Log Analytics, managed environment.
129
104
  - Team members use a lightweight MCP client that calls the hosted API.
130
105
 
131
106
  See [`deploy/azure/`](deploy/azure/) for Bicep templates and a full cost breakdown.
132
107
 
108
+ ## Self-hosting / forking
109
+
110
+ The embedder image bakes the EmbeddingGemma-300M model at build time,
111
+ which requires a Hugging Face access token. Forks and adopters need to:
112
+
113
+ 1. Get an HF token at https://huggingface.co/settings/tokens.
114
+ 2. Accept the EmbeddingGemma license at
115
+ https://huggingface.co/google/embeddinggemma-300m.
116
+ 3. Add a repo secret `HF_TOKEN` under
117
+ `Settings → Secrets and variables → Actions`.
118
+
119
+ The CI workflow forwards the secret to BuildKit via
120
+ `--mount=type=secret,id=hf_token`; the token never enters any image
121
+ layer. If you fork this repo and run the CI workflow, it will build the
122
+ embedder image automatically on commits to `master` and PRs (without
123
+ pushing unless on `master`). To enable pushes to a registry, also add
124
+ secrets `ACR_LOGIN_SERVER`, `ACR_USERNAME`, and `ACR_PASSWORD`.
125
+
126
+ ## Upgrading the embedding model
127
+
128
+ The dimension-mismatch guard in `RemoteEmbedder` makes an
129
+ embedder/search API mismatch loud (`HTTP 503` with a clear log line)
130
+ rather than silent. Upgrade procedure:
131
+
132
+ 1. **Pick the new model.** Note its output dimensionality `D` (e.g.
133
+ `768` for EmbeddingGemma, `1024` for many newer models).
134
+
135
+ 2. **Update config.** Set `embedding_model: <new>` and
136
+ `embedding_dimensions: D` in the search API's deployment config
137
+ (Bicep parameters + Key Vault, or `docforge.yml` for self-hosters).
138
+
139
+ 3. **Build the embedder image** with the new model:
140
+ ```bash
141
+ docker build \
142
+ --build-arg EMBEDDING_MODEL=<new> \
143
+ --secret id=hf_token,env=HF_TOKEN \
144
+ -f Dockerfile.embedder \
145
+ -t docforge-embedder:<tag> .
146
+ ```
147
+
148
+ 4. **Apply schema migration.** Add a new vector column:
149
+ ```sql
150
+ ALTER TABLE chunks ADD COLUMN embedding_new vector(D);
151
+ ```
152
+ Re-ingest to populate the new column. Until backfill completes, the
153
+ search API serves from the old column.
154
+
155
+ 5. **Cut over.** Deploy the new embedder image first, then the new
156
+ search API. The dim-mismatch guard ensures search refuses to serve
157
+ wrong-dim vectors.
158
+
159
+ 6. **Drop the old column** after a soak period, once you are confident you will not need to roll back.
160
+
133
161
  ## Configuration
134
162
 
135
163
  See `docs/` for the full configuration reference, including `docforge.yml` and `sources.yml` schemas.
@@ -140,34 +168,41 @@ Contributions welcome. See [`CONTRIBUTING.md`](CONTRIBUTING.md) for development
140
168
 
141
169
  ## Evaluation & retrieval quality
142
170
 
143
- docforge ships with a retrieval-quality eval harness at [`docforge/scripts/eval_search.py`](docforge/scripts/eval_search.py). It measures recall@1, recall@k, and MRR against a ground-truth query set you maintain. The harness is designed for **drift detection** — run it after `sources.yml` changes, embedding-model updates, or ranking tweaks, and compare against your baseline. There is no absolute quality threshold; the metric magnitude depends on how closely your ground-truth queries match source titles. See [`docforge/scripts/README.md`](docforge/scripts/README.md) for details.
171
+ docforge ships with a retrieval-quality eval harness at [`src/docforge/scripts/eval_search.py`](src/docforge/scripts/eval_search.py). It measures recall@1, recall@k, and MRR against a ground-truth query set you maintain. The harness is designed for **drift detection** — run it after `sources.yml` changes, embedding-model updates, or ranking tweaks, and compare against your baseline. There is no absolute quality threshold; the metric magnitude depends on how closely your ground-truth queries match source titles. See [`src/docforge/scripts/README.md`](src/docforge/scripts/README.md) for details.
144
172
 
145
173
  ## FAQ
146
174
 
147
- ### "Cannot connect to PostgreSQL"
148
-
149
- Check that the database is running: `docker compose up -d db`. Verify `DATABASE_URL` in `.env` points to `postgresql://docforge:localdev@localhost:5432/docforge` (or your custom value).
175
+ The three install-time issues new users hit most often are inline below. The
176
+ full FAQ — including "no results found", "ingest skipped everything", removing
177
+ sources, swapping embedding models, and where to file issues — lives on the
178
+ [microsite FAQ](https://GranatenUdo.github.io/docforge/faq/).
150
179
 
151
180
  ### "HF_TOKEN required" or model download fails
152
181
 
153
182
  The embedding model `google/embeddinggemma-300m` requires a Hugging Face token with access to the gated model. Create one at https://huggingface.co/settings/tokens, accept the model license at https://huggingface.co/google/embeddinggemma-300m, and set `HF_TOKEN=hf_...` in `.env`.
154
183
 
155
- ### "No results found" after ingest
156
-
157
- Run `docforge status` to confirm sources and chunks exist. If counts are zero, check the ingest logs for per-source failures — the summary at the end lists sources that failed.
158
-
159
184
  ### First ingest / first container start is very slow
160
185
 
161
186
  The first run downloads the 300M embedding model (~1.2 GB) from Hugging Face. Locally, the model is cached at `~/.cache/huggingface/`. In the Docker image, it is cached at `/app/.cache/huggingface/` — **mount this as a volume** so container restarts do not re-download: `docker run -v docforge-hf-cache:/app/.cache/huggingface ...`.
162
187
 
163
- ### "Ingest skipped everything"
188
+ ### "Cannot connect to PostgreSQL"
164
189
 
165
- docforge skips sources whose `content_hash` matches the stored hash (no changes detected). To force re-ingest, clear the hash: `UPDATE sources SET content_hash = NULL;` then run `docforge ingest`.
190
+ Check that the database is running: `docker compose up -d db`. Verify `DATABASE_URL` in `.env` points to `postgresql://docforge:localdev@localhost:5432/docforge` (or your custom value).
166
191
 
167
192
  ## License
168
193
 
169
194
  MIT. See [LICENSE](LICENSE).
170
195
 
196
+ ## License compatibility
197
+
198
+ docforge is MIT-licensed; the default embedding model,
199
+ [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m), is
200
+ distributed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms),
201
+ which restrict harmful use and building products that compete with Gemma. Swap
202
+ to a permissively-licensed alternative via `embedding_model` in `docforge.yml`
203
+ if those constraints don't fit your use case (see
204
+ [microsite FAQ — Can I use a different embedding model?](https://GranatenUdo.github.io/docforge/faq/#can-i-use-a-different-embedding-model)).
205
+
171
206
  ## Credits
172
207
 
173
208
  docforge stands on open shoulders:
@@ -0,0 +1,81 @@
1
+ [build-system]
2
+ requires = ["setuptools>=75.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "docforge-cli"
7
+ version = "0.3.0"
8
+ description = "Forge searchable context from Confluence and git repos for AI coding assistants"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.12"
12
+ dependencies = [
13
+ "typer>=0.12,<1.0",
14
+ "asyncpg>=0.30,<1.0",
15
+ "httpx>=0.27,<1.0",
16
+ "pydantic>=2.9,<3.0",
17
+ "pydantic-settings>=2.6,<3.0",
18
+ "beautifulsoup4>=4.12,<5.0",
19
+ "sentence-transformers>=5.0,<6.0",
20
+ "pgvector>=0.3,<1.0",
21
+ "pyyaml>=6.0,<7.0",
22
+ "fastmcp>=3.0,<4.0",
23
+ "fastapi>=0.115,<1.0",
24
+ "uvicorn>=0.34,<1.0",
25
+ "numpy>=1.26,<3.0", # both 1.x and 2.x tested
26
+ ]
27
+
28
+ [project.urls]
29
+ Homepage = "https://GranatenUdo.github.io/docforge/"
30
+ Source = "https://github.com/GranatenUdo/docforge"
31
+ Issues = "https://github.com/GranatenUdo/docforge/issues"
32
+ Changelog = "https://github.com/GranatenUdo/docforge/blob/master/CHANGELOG.md"
33
+ Documentation = "https://GranatenUdo.github.io/docforge/"
34
+
35
+ [project.scripts]
36
+ docforge = "docforge.cli:app"
37
+
38
+ [project.optional-dependencies]
39
+ dev = [
40
+ "pytest>=9.0,<10.0",
41
+ "pytest-asyncio>=1.0,<2.0",
42
+ "pytest-cov>=7.0,<8.0",
43
+ "ruff>=0.8,<1.0",
44
+ "testcontainers[postgres]>=4.0,<5.0",
45
+ ]
46
+ entra = [
47
+ "fastapi-azure-auth>=5.0,<6.0",
48
+ "azure-identity>=1.19,<2.0",
49
+ # aiohttp is required by azure-identity.aio's async pipeline
50
+ "aiohttp>=3.10,<4.0",
51
+ ]
52
+
53
+ [tool.setuptools.packages.find]
54
+ where = ["src"]
55
+ include = ["docforge*"]
56
+
57
+ [tool.setuptools.package-data]
58
+ docforge = ["templates/**/*", "sql/**/*"]
59
+
60
+ [tool.ruff]
61
+ line-length = 100
62
+ target-version = "py312"
63
+
64
+ [tool.ruff.lint]
65
+ select = ["E", "F", "I", "W"]
66
+
67
+ [tool.pytest.ini_options]
68
+ asyncio_mode = "auto"
69
+ testpaths = ["tests"]
70
+ markers = [
71
+ "integration: tests requiring real external resources (Docker for Postgres, network for embedding model)",
72
+ ]
73
+ addopts = "--cov=src/docforge"
74
+
75
+ [tool.coverage.report]
76
+ fail_under = 60
77
+ exclude_also = [
78
+ "if __name__ == \"__main__\":",
79
+ "pragma: no cover",
80
+ "raise NotImplementedError",
81
+ ]