docforge-cli 0.2.1__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {docforge_cli-0.2.1/src/docforge_cli.egg-info → docforge_cli-0.4.0}/PKG-INFO +132 -25
  2. docforge_cli-0.2.1/PKG-INFO → docforge_cli-0.4.0/README.md +107 -41
  3. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/pyproject.toml +27 -23
  4. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/api.py +112 -76
  5. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/cli.py +42 -5
  6. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/config.py +23 -1
  7. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/db.py +15 -4
  8. docforge_cli-0.4.0/src/docforge/embedder_api.py +86 -0
  9. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/ingest.py +8 -4
  10. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/mcp_server.py +23 -15
  11. docforge_cli-0.4.0/src/docforge/processors/embedder.py +246 -0
  12. docforge_cli-0.4.0/src/docforge/remote_client.py +199 -0
  13. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/scripts/eval_search.py +2 -2
  14. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/sources.py +1 -1
  15. docforge_cli-0.2.1/README.md → docforge_cli-0.4.0/src/docforge_cli.egg-info/PKG-INFO +148 -3
  16. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge_cli.egg-info/SOURCES.txt +2 -0
  17. docforge_cli-0.4.0/src/docforge_cli.egg-info/requires.txt +29 -0
  18. docforge_cli-0.2.1/src/docforge/processors/embedder.py +0 -78
  19. docforge_cli-0.2.1/src/docforge_cli.egg-info/requires.txt +0 -25
  20. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/LICENSE +0 -0
  21. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/setup.cfg +0 -0
  22. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/__init__.py +0 -0
  23. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/__main__.py +0 -0
  24. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/crawlers/__init__.py +0 -0
  25. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/crawlers/confluence.py +0 -0
  26. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/crawlers/git.py +0 -0
  27. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/lint.py +0 -0
  28. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/processors/__init__.py +0 -0
  29. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/processors/chunker.py +0 -0
  30. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/processors/parser.py +0 -0
  31. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/query_log.py +0 -0
  32. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/ranking.py +0 -0
  33. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/scripts/__init__.py +0 -0
  34. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/scripts/latency_report.py +0 -0
  35. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/sql/migrations/001_add_source_identifier.sql +0 -0
  36. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/sql/migrations/002_add_status_index.sql +0 -0
  37. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/sql/migrations/003_add_source_tags.sql +0 -0
  38. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/sql/migrations/004_add_query_log.sql +0 -0
  39. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/sql/migrations/005_add_query_log_user_oid.sql +0 -0
  40. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/sql/migrations/006_add_query_log_request_ms.sql +0 -0
  41. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/sql/schema.sql +0 -0
  42. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/templates/docforge.yml +0 -0
  43. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/templates/docker-compose.yml +0 -0
  44. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/templates/mcp_client.py +0 -0
  45. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge/templates/sources.yml +0 -0
  46. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge_cli.egg-info/dependency_links.txt +0 -0
  47. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge_cli.egg-info/entry_points.txt +0 -0
  48. {docforge_cli-0.2.1 → docforge_cli-0.4.0}/src/docforge_cli.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docforge-cli
3
- Version: 0.2.1
3
+ Version: 0.4.0
4
4
  Summary: Forge searchable context from Confluence and git repos for AI coding assistants
5
5
  License: MIT
6
6
  Project-URL: Homepage, https://GranatenUdo.github.io/docforge/
@@ -11,29 +11,32 @@ Project-URL: Documentation, https://GranatenUdo.github.io/docforge/
11
11
  Requires-Python: >=3.12
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
- Requires-Dist: typer>=0.12
15
- Requires-Dist: asyncpg>=0.30
16
- Requires-Dist: httpx>=0.27
17
- Requires-Dist: pydantic>=2.9
18
- Requires-Dist: pydantic-settings>=2.6
19
- Requires-Dist: beautifulsoup4>=4.12
20
- Requires-Dist: sentence-transformers>=5.0
21
- Requires-Dist: pgvector>=0.3
22
- Requires-Dist: pyyaml>=6.0
23
- Requires-Dist: fastmcp>=2.0
24
- Requires-Dist: fastapi>=0.115
25
- Requires-Dist: uvicorn>=0.34
26
- Requires-Dist: numpy>=1.26
14
+ Requires-Dist: typer<1.0,>=0.12
15
+ Requires-Dist: asyncpg<1.0,>=0.30
16
+ Requires-Dist: httpx<1.0,>=0.27
17
+ Requires-Dist: pydantic<3.0,>=2.9
18
+ Requires-Dist: pydantic-settings<3.0,>=2.6
19
+ Requires-Dist: beautifulsoup4<5.0,>=4.12
20
+ Requires-Dist: sentence-transformers<6.0,>=5.0
21
+ Requires-Dist: pgvector<1.0,>=0.3
22
+ Requires-Dist: pyyaml<7.0,>=6.0
23
+ Requires-Dist: fastmcp<4.0,>=3.0
24
+ Requires-Dist: fastapi<1.0,>=0.115
25
+ Requires-Dist: uvicorn<1.0,>=0.34
26
+ Requires-Dist: numpy<3.0,>=1.26
27
27
  Provides-Extra: dev
28
- Requires-Dist: pytest>=8.0; extra == "dev"
29
- Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
30
- Requires-Dist: pytest-cov>=7.0; extra == "dev"
31
- Requires-Dist: ruff>=0.8; extra == "dev"
32
- Requires-Dist: testcontainers[postgres]>=4.0; extra == "dev"
28
+ Requires-Dist: pytest<10.0,>=9.0; extra == "dev"
29
+ Requires-Dist: pytest-asyncio<2.0,>=1.0; extra == "dev"
30
+ Requires-Dist: pytest-cov<8.0,>=7.0; extra == "dev"
31
+ Requires-Dist: ruff<1.0,>=0.8; extra == "dev"
32
+ Requires-Dist: testcontainers[postgres]<5.0,>=4.0; extra == "dev"
33
33
  Provides-Extra: entra
34
- Requires-Dist: fastapi-azure-auth>=5.0; extra == "entra"
35
- Requires-Dist: azure-identity>=1.19; extra == "entra"
36
- Requires-Dist: aiohttp>=3.10; extra == "entra"
34
+ Requires-Dist: fastapi-azure-auth<6.0,>=5.0; extra == "entra"
35
+ Requires-Dist: azure-identity<2.0,>=1.19; extra == "entra"
36
+ Requires-Dist: aiohttp<4.0,>=3.10; extra == "entra"
37
+ Provides-Extra: azure
38
+ Requires-Dist: azure-identity<2.0,>=1.19; extra == "azure"
39
+ Requires-Dist: aiohttp<4.0,>=3.10; extra == "azure"
37
40
  Dynamic: license-file
38
41
 
39
42
  # docforge
@@ -83,15 +86,22 @@ docforge is the narrow, focused option in this landscape: minimal footprint, MCP
83
86
  - You need near-real-time updates → ingest is batch; no webhook-driven continuous sync yet.
84
87
  - You need multilingual search evaluated → EmbeddingGemma is multilingual, but docforge has no eval coverage on non-English corpora yet.
85
88
 
89
+ For the full trust model, accepted risks, and assumptions docforge makes about its operating environment, see [`docs/threat-model.md`](docs/threat-model.md).
90
+
86
91
  ## Quick Start
87
92
 
93
+ **Prerequisites:**
94
+ - Python 3.12+
95
+ - Docker (for the local Postgres + pgvector container)
96
+ - A [Hugging Face token](https://huggingface.co/settings/tokens) with access to the gated [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m) model. Accept the model license on the model page first.
97
+
88
98
  ```bash
89
99
  pip install docforge-cli
90
100
  docforge init my-project
91
101
  cd my-project
92
102
  # Edit docforge.yml with your Confluence URL
93
103
  # Edit sources.yml with your page IDs and local git repo paths
94
- # Edit .env with your credentials
104
+ # Edit .env with your credentials (CONFLUENCE_API_TOKEN, HF_TOKEN, DATABASE_URL)
95
105
  docker compose up -d db
96
106
  docforge init-db
97
107
  docforge ingest
@@ -126,15 +136,102 @@ When an AI assistant needs cross-team context, it calls docforge's `search_docum
126
136
 
127
137
  ## Deploy to your infrastructure
128
138
 
129
- For team-wide use, deploy the search API to Azure (~$35/month at default SKUs):
139
+ For team-wide use, deploy the search API to Azure (~$90/month at default SKUs with embedder always-on for production; ~$55/month with the default scale-to-zero embedder):
130
140
 
131
141
  - PostgreSQL Flexible Server (Burstable B1ms, 32 GB) with pgvector.
132
142
  - Container App running the FastAPI search API.
133
- - Container Registry, Key Vault, Log Analytics, managed environment.
143
+ - Container App running the embedder service (EmbeddingGemma-300M, model baked into the image).
144
+ - Container Registry (Standard), Key Vault, Log Analytics, managed environment.
134
145
  - Team members use a lightweight MCP client that calls the hosted API.
135
146
 
136
147
  See [`deploy/azure/`](deploy/azure/) for Bicep templates and a full cost breakdown.
137
148
 
149
+ ## Use a hosted instance (no local DB required)
150
+
151
+ If your team already operates a docforge deployment and you only want to *use* it from your editor (Claude Code, etc.), you don't need to clone, ingest, or run Postgres locally:
152
+
153
+ ```bash
154
+ # Generic (no auth)
155
+ pip install docforge-cli
156
+ claude mcp add -s user -e DOCFORGE_API_URL=https://docforge.example.com \
157
+ docforge -- docforge serve --remote-api $DOCFORGE_API_URL
158
+
159
+ # Static Bearer token
160
+ pip install docforge-cli
161
+ claude mcp add -s user \
162
+ -e DOCFORGE_API_URL=https://docforge.example.com \
163
+ -e DOCFORGE_API_TOKEN=eyJ... \
164
+ -e DOCFORGE_AUTH=bearer \
165
+ docforge -- docforge serve --remote-api $DOCFORGE_API_URL --auth bearer
166
+
167
+ # Entra (Azure AD)
168
+ pip install docforge-cli[azure]
169
+ az login --tenant <your-tenant-id>
170
+ claude mcp add -s user \
171
+ -e DOCFORGE_API_URL=https://docforge.example.com \
172
+ -e DOCFORGE_AUDIENCE=api://<app-registration-uri> \
173
+ -e DOCFORGE_AUTH=azure \
174
+ -e DOCFORGE_TEAM=your-team \
175
+ docforge -- docforge serve --remote-api $DOCFORGE_API_URL --auth azure
176
+ ```
177
+
178
+ With `--auth azure`, `user_name` is bound to your Entra JWT subject — you can't (and don't need to) configure it.
179
+
180
+ `DOCFORGE_TEAM` is optional but recommended for team-tag relevance boosting in search results.
181
+
182
+ ## Self-hosting / forking
183
+
184
+ The embedder image bakes the EmbeddingGemma-300M model at build time,
185
+ which requires a Hugging Face access token. Forks and adopters need to:
186
+
187
+ 1. Get an HF token at https://huggingface.co/settings/tokens.
188
+ 2. Accept the EmbeddingGemma license at
189
+ https://huggingface.co/google/embeddinggemma-300m.
190
+ 3. Add a repo secret `HF_TOKEN` under
191
+ `Settings → Secrets and variables → Actions`.
192
+
193
+ The CI workflow forwards the secret to BuildKit via
194
+ `--mount=type=secret,id=hf_token`; the token never enters any image
195
+ layer. If you fork this repo and run the CI workflow, it will build the
196
+ embedder image automatically on commits to `master` and PRs (without
197
+ pushing unless on `master`). To enable pushes to a registry, also add
198
+ secrets `ACR_LOGIN_SERVER`, `ACR_USERNAME`, and `ACR_PASSWORD`.
199
+
200
+ ## Upgrading the embedding model
201
+
202
+ The dimension-mismatch guard in `RemoteEmbedder` makes an
203
+ embedder/search API mismatch loud (`HTTP 503` with a clear log line)
204
+ rather than silent. Upgrade procedure:
205
+
206
+ 1. **Pick the new model.** Note its output dimensionality `D` (e.g.
207
+ `768` for EmbeddingGemma, `1024` for many newer models).
208
+
209
+ 2. **Update config.** Set `embedding_model: <new>` and
210
+ `embedding_dimensions: D` in the search API's deployment config
211
+ (Bicep parameters + Key Vault, or `docforge.yml` for self-hosters).
212
+
213
+ 3. **Build the embedder image** with the new model:
214
+ ```bash
215
+ docker build \
216
+ --build-arg EMBEDDING_MODEL=<new> \
217
+ --secret id=hf_token,env=HF_TOKEN \
218
+ -f Dockerfile.embedder \
219
+ -t docforge-embedder:<tag> .
220
+ ```
221
+
222
+ 4. **Apply schema migration.** Add a new vector column:
223
+ ```sql
224
+ ALTER TABLE chunks ADD COLUMN embedding_new vector(D);
225
+ ```
226
+ Re-ingest to populate the new column. Until backfill completes, the
227
+ search API serves from the old column.
228
+
229
+ 5. **Cut over.** Deploy the new embedder image first, then the new
230
+ search API. The dim-mismatch guard ensures search refuses to serve
231
+ wrong-dim vectors.
232
+
233
+ 6. **Drop the old column** once the cutover has run long enough that you are confident it is stable.
234
+
138
235
  ## Configuration
139
236
 
140
237
  See `docs/` for the full configuration reference, including `docforge.yml` and `sources.yml` schemas.
@@ -170,6 +267,16 @@ Check that the database is running: `docker compose up -d db`. Verify `DATABASE_
170
267
 
171
268
  MIT. See [LICENSE](LICENSE).
172
269
 
270
+ ## License compatibility
271
+
272
+ docforge is MIT-licensed; the default embedding model,
273
+ [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m), is
274
+ distributed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms),
275
+ which restrict harmful use and building products that compete with Gemma. Swap
276
+ to a permissively-licensed alternative via `embedding_model` in `docforge.yml`
277
+ if those constraints don't fit your use case (see
278
+ [microsite FAQ — Can I use a different embedding model?](https://GranatenUdo.github.io/docforge/faq/#can-i-use-a-different-embedding-model)).
279
+
173
280
  ## Credits
174
281
 
175
282
  docforge stands on open shoulders:
@@ -1,41 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: docforge-cli
3
- Version: 0.2.1
4
- Summary: Forge searchable context from Confluence and git repos for AI coding assistants
5
- License: MIT
6
- Project-URL: Homepage, https://GranatenUdo.github.io/docforge/
7
- Project-URL: Source, https://github.com/GranatenUdo/docforge
8
- Project-URL: Issues, https://github.com/GranatenUdo/docforge/issues
9
- Project-URL: Changelog, https://github.com/GranatenUdo/docforge/blob/master/CHANGELOG.md
10
- Project-URL: Documentation, https://GranatenUdo.github.io/docforge/
11
- Requires-Python: >=3.12
12
- Description-Content-Type: text/markdown
13
- License-File: LICENSE
14
- Requires-Dist: typer>=0.12
15
- Requires-Dist: asyncpg>=0.30
16
- Requires-Dist: httpx>=0.27
17
- Requires-Dist: pydantic>=2.9
18
- Requires-Dist: pydantic-settings>=2.6
19
- Requires-Dist: beautifulsoup4>=4.12
20
- Requires-Dist: sentence-transformers>=5.0
21
- Requires-Dist: pgvector>=0.3
22
- Requires-Dist: pyyaml>=6.0
23
- Requires-Dist: fastmcp>=2.0
24
- Requires-Dist: fastapi>=0.115
25
- Requires-Dist: uvicorn>=0.34
26
- Requires-Dist: numpy>=1.26
27
- Provides-Extra: dev
28
- Requires-Dist: pytest>=8.0; extra == "dev"
29
- Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
30
- Requires-Dist: pytest-cov>=7.0; extra == "dev"
31
- Requires-Dist: ruff>=0.8; extra == "dev"
32
- Requires-Dist: testcontainers[postgres]>=4.0; extra == "dev"
33
- Provides-Extra: entra
34
- Requires-Dist: fastapi-azure-auth>=5.0; extra == "entra"
35
- Requires-Dist: azure-identity>=1.19; extra == "entra"
36
- Requires-Dist: aiohttp>=3.10; extra == "entra"
37
- Dynamic: license-file
38
-
39
1
  # docforge
40
2
 
41
3
  **The self-hosted context engine for AI coding assistants.**
@@ -83,15 +45,22 @@ docforge is the narrow, focused option in this landscape: minimal footprint, MCP
83
45
  - You need near-real-time updates → ingest is batch; no webhook-driven continuous sync yet.
84
46
  - You need multilingual search evaluated → EmbeddingGemma is multilingual, but docforge has no eval coverage on non-English corpora yet.
85
47
 
48
+ For the full trust model, accepted risks, and assumptions docforge makes about its operating environment, see [`docs/threat-model.md`](docs/threat-model.md).
49
+
86
50
  ## Quick Start
87
51
 
52
+ **Prerequisites:**
53
+ - Python 3.12+
54
+ - Docker (for the local Postgres + pgvector container)
55
+ - A [Hugging Face token](https://huggingface.co/settings/tokens) with access to the gated [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m) model. Accept the model license on the model page first.
56
+
88
57
  ```bash
89
58
  pip install docforge-cli
90
59
  docforge init my-project
91
60
  cd my-project
92
61
  # Edit docforge.yml with your Confluence URL
93
62
  # Edit sources.yml with your page IDs and local git repo paths
94
- # Edit .env with your credentials
63
+ # Edit .env with your credentials (CONFLUENCE_API_TOKEN, HF_TOKEN, DATABASE_URL)
95
64
  docker compose up -d db
96
65
  docforge init-db
97
66
  docforge ingest
@@ -126,15 +95,102 @@ When an AI assistant needs cross-team context, it calls docforge's `search_docum
126
95
 
127
96
  ## Deploy to your infrastructure
128
97
 
129
- For team-wide use, deploy the search API to Azure (~$35/month at default SKUs):
98
+ For team-wide use, deploy the search API to Azure (~$90/month at default SKUs with embedder always-on for production; ~$55/month with the default scale-to-zero embedder):
130
99
 
131
100
  - PostgreSQL Flexible Server (Burstable B1ms, 32 GB) with pgvector.
132
101
  - Container App running the FastAPI search API.
133
- - Container Registry, Key Vault, Log Analytics, managed environment.
102
+ - Container App running the embedder service (EmbeddingGemma-300M, model baked into the image).
103
+ - Container Registry (Standard), Key Vault, Log Analytics, managed environment.
134
104
  - Team members use a lightweight MCP client that calls the hosted API.
135
105
 
136
106
  See [`deploy/azure/`](deploy/azure/) for Bicep templates and a full cost breakdown.
137
107
 
108
+ ## Use a hosted instance (no local DB required)
109
+
110
+ If your team already operates a docforge deployment and you only want to *use* it from your editor (Claude Code, etc.), you don't need to clone, ingest, or run Postgres locally:
111
+
112
+ ```bash
113
+ # Generic (no auth)
114
+ pip install docforge-cli
115
+ claude mcp add -s user -e DOCFORGE_API_URL=https://docforge.example.com \
116
+ docforge -- docforge serve --remote-api $DOCFORGE_API_URL
117
+
118
+ # Static Bearer token
119
+ pip install docforge-cli
120
+ claude mcp add -s user \
121
+ -e DOCFORGE_API_URL=https://docforge.example.com \
122
+ -e DOCFORGE_API_TOKEN=eyJ... \
123
+ -e DOCFORGE_AUTH=bearer \
124
+ docforge -- docforge serve --remote-api $DOCFORGE_API_URL --auth bearer
125
+
126
+ # Entra (Azure AD)
127
+ pip install docforge-cli[azure]
128
+ az login --tenant <your-tenant-id>
129
+ claude mcp add -s user \
130
+ -e DOCFORGE_API_URL=https://docforge.example.com \
131
+ -e DOCFORGE_AUDIENCE=api://<app-registration-uri> \
132
+ -e DOCFORGE_AUTH=azure \
133
+ -e DOCFORGE_TEAM=your-team \
134
+ docforge -- docforge serve --remote-api $DOCFORGE_API_URL --auth azure
135
+ ```
136
+
137
+ With `--auth azure`, `user_name` is bound to your Entra JWT subject — you can't (and don't need to) configure it.
138
+
139
+ `DOCFORGE_TEAM` is optional but recommended for team-tag relevance boosting in search results.
140
+
141
+ ## Self-hosting / forking
142
+
143
+ The embedder image bakes the EmbeddingGemma-300M model at build time,
144
+ which requires a Hugging Face access token. Forks and adopters need to:
145
+
146
+ 1. Get an HF token at https://huggingface.co/settings/tokens.
147
+ 2. Accept the EmbeddingGemma license at
148
+ https://huggingface.co/google/embeddinggemma-300m.
149
+ 3. Add a repo secret `HF_TOKEN` under
150
+ `Settings → Secrets and variables → Actions`.
151
+
152
+ The CI workflow forwards the secret to BuildKit via
153
+ `--mount=type=secret,id=hf_token`; the token never enters any image
154
+ layer. If you fork this repo and run the CI workflow, it will build the
155
+ embedder image automatically on commits to `master` and PRs (without
156
+ pushing unless on `master`). To enable pushes to a registry, also add
157
+ secrets `ACR_LOGIN_SERVER`, `ACR_USERNAME`, and `ACR_PASSWORD`.
158
+
159
+ ## Upgrading the embedding model
160
+
161
+ The dimension-mismatch guard in `RemoteEmbedder` makes an
162
+ embedder/search API mismatch loud (`HTTP 503` with a clear log line)
163
+ rather than silent. Upgrade procedure:
164
+
165
+ 1. **Pick the new model.** Note its output dimensionality `D` (e.g.
166
+ `768` for EmbeddingGemma, `1024` for many newer models).
167
+
168
+ 2. **Update config.** Set `embedding_model: <new>` and
169
+ `embedding_dimensions: D` in the search API's deployment config
170
+ (Bicep parameters + Key Vault, or `docforge.yml` for self-hosters).
171
+
172
+ 3. **Build the embedder image** with the new model:
173
+ ```bash
174
+ docker build \
175
+ --build-arg EMBEDDING_MODEL=<new> \
176
+ --secret id=hf_token,env=HF_TOKEN \
177
+ -f Dockerfile.embedder \
178
+ -t docforge-embedder:<tag> .
179
+ ```
180
+
181
+ 4. **Apply schema migration.** Add a new vector column:
182
+ ```sql
183
+ ALTER TABLE chunks ADD COLUMN embedding_new vector(D);
184
+ ```
185
+ Re-ingest to populate the new column. Until backfill completes, the
186
+ search API serves from the old column.
187
+
188
+ 5. **Cut over.** Deploy the new embedder image first, then the new
189
+ search API. The dim-mismatch guard ensures search refuses to serve
190
+ wrong-dim vectors.
191
+
192
+ 6. **Drop the old column** once the cutover has run long enough that you are confident it is stable.
193
+
138
194
  ## Configuration
139
195
 
140
196
  See `docs/` for the full configuration reference, including `docforge.yml` and `sources.yml` schemas.
@@ -170,6 +226,16 @@ Check that the database is running: `docker compose up -d db`. Verify `DATABASE_
170
226
 
171
227
  MIT. See [LICENSE](LICENSE).
172
228
 
229
+ ## License compatibility
230
+
231
+ docforge is MIT-licensed; the default embedding model,
232
+ [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m), is
233
+ distributed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms),
234
+ which restrict harmful use and building products that compete with Gemma. Swap
235
+ to a permissively-licensed alternative via `embedding_model` in `docforge.yml`
236
+ if those constraints don't fit your use case (see
237
+ [microsite FAQ — Can I use a different embedding model?](https://GranatenUdo.github.io/docforge/faq/#can-i-use-a-different-embedding-model)).
238
+
173
239
  ## Credits
174
240
 
175
241
  docforge stands on open shoulders:
@@ -4,25 +4,25 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docforge-cli"
7
- version = "0.2.1"
7
+ version = "0.4.0"
8
8
  description = "Forge searchable context from Confluence and git repos for AI coding assistants"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
11
11
  requires-python = ">=3.12"
12
12
  dependencies = [
13
- "typer>=0.12",
14
- "asyncpg>=0.30",
15
- "httpx>=0.27",
16
- "pydantic>=2.9",
17
- "pydantic-settings>=2.6",
18
- "beautifulsoup4>=4.12",
19
- "sentence-transformers>=5.0",
20
- "pgvector>=0.3",
21
- "pyyaml>=6.0",
22
- "fastmcp>=2.0",
23
- "fastapi>=0.115",
24
- "uvicorn>=0.34",
25
- "numpy>=1.26",
13
+ "typer>=0.12,<1.0",
14
+ "asyncpg>=0.30,<1.0",
15
+ "httpx>=0.27,<1.0",
16
+ "pydantic>=2.9,<3.0",
17
+ "pydantic-settings>=2.6,<3.0",
18
+ "beautifulsoup4>=4.12,<5.0",
19
+ "sentence-transformers>=5.0,<6.0",
20
+ "pgvector>=0.3,<1.0",
21
+ "pyyaml>=6.0,<7.0",
22
+ "fastmcp>=3.0,<4.0",
23
+ "fastapi>=0.115,<1.0",
24
+ "uvicorn>=0.34,<1.0",
25
+ "numpy>=1.26,<3.0", # both 1.x and 2.x tested
26
26
  ]
27
27
 
28
28
  [project.urls]
@@ -37,17 +37,21 @@ docforge = "docforge.cli:app"
37
37
 
38
38
  [project.optional-dependencies]
39
39
  dev = [
40
- "pytest>=8.0",
41
- "pytest-asyncio>=0.24",
42
- "pytest-cov>=7.0",
43
- "ruff>=0.8",
44
- "testcontainers[postgres]>=4.0",
40
+ "pytest>=9.0,<10.0",
41
+ "pytest-asyncio>=1.0,<2.0",
42
+ "pytest-cov>=7.0,<8.0",
43
+ "ruff>=0.8,<1.0",
44
+ "testcontainers[postgres]>=4.0,<5.0",
45
45
  ]
46
46
  entra = [
47
- "fastapi-azure-auth>=5.0",
48
- "azure-identity>=1.19",
47
+ "fastapi-azure-auth>=5.0,<6.0",
48
+ "azure-identity>=1.19,<2.0",
49
49
  # aiohttp is required by azure-identity.aio's async pipeline
50
- "aiohttp>=3.10",
50
+ "aiohttp>=3.10,<4.0",
51
+ ]
52
+ azure = [
53
+ "azure-identity>=1.19,<2.0",
54
+ "aiohttp>=3.10,<4.0", # required by azure-identity.aio
51
55
  ]
52
56
 
53
57
  [tool.setuptools.packages.find]
@@ -68,7 +72,7 @@ select = ["E", "F", "I", "W"]
68
72
  asyncio_mode = "auto"
69
73
  testpaths = ["tests"]
70
74
  markers = [
71
- "integration: requires Docker (pgvector container)",
75
+ "integration: tests requiring real external resources (Docker for Postgres, network for embedding model)",
72
76
  ]
73
77
  addopts = "--cov=src/docforge"
74
78