docforge-cli 0.2.1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. {docforge_cli-0.2.1/src/docforge_cli.egg-info → docforge_cli-0.3.0}/PKG-INFO +96 -25
  2. docforge_cli-0.2.1/PKG-INFO → docforge_cli-0.3.0/README.md +74 -41
  3. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/pyproject.toml +23 -23
  4. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/api.py +107 -72
  5. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/cli.py +18 -3
  6. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/config.py +22 -0
  7. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/db.py +15 -4
  8. docforge_cli-0.3.0/src/docforge/embedder_api.py +86 -0
  9. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/ingest.py +8 -4
  10. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/mcp_server.py +23 -15
  11. docforge_cli-0.3.0/src/docforge/processors/embedder.py +246 -0
  12. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/scripts/eval_search.py +2 -2
  13. docforge_cli-0.2.1/README.md → docforge_cli-0.3.0/src/docforge_cli.egg-info/PKG-INFO +112 -3
  14. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge_cli.egg-info/SOURCES.txt +1 -0
  15. docforge_cli-0.3.0/src/docforge_cli.egg-info/requires.txt +25 -0
  16. docforge_cli-0.2.1/src/docforge/processors/embedder.py +0 -78
  17. docforge_cli-0.2.1/src/docforge_cli.egg-info/requires.txt +0 -25
  18. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/LICENSE +0 -0
  19. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/setup.cfg +0 -0
  20. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/__init__.py +0 -0
  21. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/__main__.py +0 -0
  22. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/crawlers/__init__.py +0 -0
  23. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/crawlers/confluence.py +0 -0
  24. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/crawlers/git.py +0 -0
  25. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/lint.py +0 -0
  26. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/processors/__init__.py +0 -0
  27. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/processors/chunker.py +0 -0
  28. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/processors/parser.py +0 -0
  29. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/query_log.py +0 -0
  30. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/ranking.py +0 -0
  31. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/scripts/__init__.py +0 -0
  32. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/scripts/latency_report.py +0 -0
  33. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/sources.py +0 -0
  34. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/sql/migrations/001_add_source_identifier.sql +0 -0
  35. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/sql/migrations/002_add_status_index.sql +0 -0
  36. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/sql/migrations/003_add_source_tags.sql +0 -0
  37. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/sql/migrations/004_add_query_log.sql +0 -0
  38. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/sql/migrations/005_add_query_log_user_oid.sql +0 -0
  39. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/sql/migrations/006_add_query_log_request_ms.sql +0 -0
  40. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/sql/schema.sql +0 -0
  41. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/templates/docforge.yml +0 -0
  42. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/templates/docker-compose.yml +0 -0
  43. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/templates/mcp_client.py +0 -0
  44. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/templates/sources.yml +0 -0
  45. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge_cli.egg-info/dependency_links.txt +0 -0
  46. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge_cli.egg-info/entry_points.txt +0 -0
  47. {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge_cli.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docforge-cli
3
- Version: 0.2.1
3
+ Version: 0.3.0
4
4
  Summary: Forge searchable context from Confluence and git repos for AI coding assistants
5
5
  License: MIT
6
6
  Project-URL: Homepage, https://GranatenUdo.github.io/docforge/
@@ -11,29 +11,29 @@ Project-URL: Documentation, https://GranatenUdo.github.io/docforge/
11
11
  Requires-Python: >=3.12
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
- Requires-Dist: typer>=0.12
15
- Requires-Dist: asyncpg>=0.30
16
- Requires-Dist: httpx>=0.27
17
- Requires-Dist: pydantic>=2.9
18
- Requires-Dist: pydantic-settings>=2.6
19
- Requires-Dist: beautifulsoup4>=4.12
20
- Requires-Dist: sentence-transformers>=5.0
21
- Requires-Dist: pgvector>=0.3
22
- Requires-Dist: pyyaml>=6.0
23
- Requires-Dist: fastmcp>=2.0
24
- Requires-Dist: fastapi>=0.115
25
- Requires-Dist: uvicorn>=0.34
26
- Requires-Dist: numpy>=1.26
14
+ Requires-Dist: typer<1.0,>=0.12
15
+ Requires-Dist: asyncpg<1.0,>=0.30
16
+ Requires-Dist: httpx<1.0,>=0.27
17
+ Requires-Dist: pydantic<3.0,>=2.9
18
+ Requires-Dist: pydantic-settings<3.0,>=2.6
19
+ Requires-Dist: beautifulsoup4<5.0,>=4.12
20
+ Requires-Dist: sentence-transformers<6.0,>=5.0
21
+ Requires-Dist: pgvector<1.0,>=0.3
22
+ Requires-Dist: pyyaml<7.0,>=6.0
23
+ Requires-Dist: fastmcp<4.0,>=3.0
24
+ Requires-Dist: fastapi<1.0,>=0.115
25
+ Requires-Dist: uvicorn<1.0,>=0.34
26
+ Requires-Dist: numpy<3.0,>=1.26
27
27
  Provides-Extra: dev
28
- Requires-Dist: pytest>=8.0; extra == "dev"
29
- Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
30
- Requires-Dist: pytest-cov>=7.0; extra == "dev"
31
- Requires-Dist: ruff>=0.8; extra == "dev"
32
- Requires-Dist: testcontainers[postgres]>=4.0; extra == "dev"
28
+ Requires-Dist: pytest<10.0,>=9.0; extra == "dev"
29
+ Requires-Dist: pytest-asyncio<2.0,>=1.0; extra == "dev"
30
+ Requires-Dist: pytest-cov<8.0,>=7.0; extra == "dev"
31
+ Requires-Dist: ruff<1.0,>=0.8; extra == "dev"
32
+ Requires-Dist: testcontainers[postgres]<5.0,>=4.0; extra == "dev"
33
33
  Provides-Extra: entra
34
- Requires-Dist: fastapi-azure-auth>=5.0; extra == "entra"
35
- Requires-Dist: azure-identity>=1.19; extra == "entra"
36
- Requires-Dist: aiohttp>=3.10; extra == "entra"
34
+ Requires-Dist: fastapi-azure-auth<6.0,>=5.0; extra == "entra"
35
+ Requires-Dist: azure-identity<2.0,>=1.19; extra == "entra"
36
+ Requires-Dist: aiohttp<4.0,>=3.10; extra == "entra"
37
37
  Dynamic: license-file
38
38
 
39
39
  # docforge
@@ -83,15 +83,22 @@ docforge is the narrow, focused option in this landscape: minimal footprint, MCP
83
83
  - You need near-real-time updates → ingest is batch; no webhook-driven continuous sync yet.
84
84
  - You need multilingual search evaluated → EmbeddingGemma is multilingual, but docforge has no eval coverage on non-English corpora yet.
85
85
 
86
+ For the full trust model, accepted risks, and assumptions docforge makes about its operating environment, see [`docs/threat-model.md`](docs/threat-model.md).
87
+
86
88
  ## Quick Start
87
89
 
90
+ **Prerequisites:**
91
+ - Python 3.12+
92
+ - Docker (for the local Postgres + pgvector container)
93
+ - A [Hugging Face token](https://huggingface.co/settings/tokens) with access to the gated [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m) model. Accept the model license on the model page first.
94
+
88
95
  ```bash
89
96
  pip install docforge-cli
90
97
  docforge init my-project
91
98
  cd my-project
92
99
  # Edit docforge.yml with your Confluence URL
93
100
  # Edit sources.yml with your page IDs and local git repo paths
94
- # Edit .env with your credentials
101
+ # Edit .env with your credentials (CONFLUENCE_API_TOKEN, HF_TOKEN, DATABASE_URL)
95
102
  docker compose up -d db
96
103
  docforge init-db
97
104
  docforge ingest
@@ -126,15 +133,69 @@ When an AI assistant needs cross-team context, it calls docforge's `search_docum
126
133
 
127
134
  ## Deploy to your infrastructure
128
135
 
129
- For team-wide use, deploy the search API to Azure (~$35/month at default SKUs):
136
+ For team-wide use, deploy the search API to Azure (~$55/month at default SKUs with the default scale-to-zero embedder; ~$90/month if the embedder is kept always-on for production):
130
137
 
131
138
  - PostgreSQL Flexible Server (Burstable B1ms, 32 GB) with pgvector.
132
139
  - Container App running the FastAPI search API.
133
- - Container Registry, Key Vault, Log Analytics, managed environment.
140
+ - Container App running the embedder service (EmbeddingGemma-300M, model baked into the image).
141
+ - Container Registry (Standard), Key Vault, Log Analytics, managed environment.
134
142
  - Team members use a lightweight MCP client that calls the hosted API.
135
143
 
136
144
  See [`deploy/azure/`](deploy/azure/) for Bicep templates and a full cost breakdown.
137
145
 
146
+ ## Self-hosting / forking
147
+
148
+ The embedder image bakes the EmbeddingGemma-300M model at build time,
149
+ which requires a Hugging Face access token. Forks and adopters need to:
150
+
151
+ 1. Get an HF token at https://huggingface.co/settings/tokens.
152
+ 2. Accept the EmbeddingGemma license at
153
+ https://huggingface.co/google/embeddinggemma-300m.
154
+ 3. Add a repo secret `HF_TOKEN` under
155
+ `Settings → Secrets and variables → Actions`.
156
+
157
+ The CI workflow forwards the secret to BuildKit via
158
+ `--mount=type=secret,id=hf_token`; the token never enters any image
159
+ layer. If you fork this repo and run the CI workflow, it will build the
160
+ embedder image automatically on commits to `master` and PRs (without
161
+ pushing unless on `master`). To enable pushes to a registry, also add
162
+ secrets `ACR_LOGIN_SERVER`, `ACR_USERNAME`, and `ACR_PASSWORD`.
163
+
164
+ ## Upgrading the embedding model
165
+
166
+ The dimension-mismatch guard in `RemoteEmbedder` makes an
167
+ embedder/search API mismatch loud (`HTTP 503` with a clear log line)
168
+ rather than silent. Upgrade procedure:
169
+
170
+ 1. **Pick the new model.** Note its output dimensionality `D` (e.g.
171
+ `768` for EmbeddingGemma, `1024` for many newer models).
172
+
173
+ 2. **Update config.** Set `embedding_model: <new>` and
174
+ `embedding_dimensions: D` in the search API's deployment config
175
+ (Bicep parameters + Key Vault, or `docforge.yml` for self-hosters).
176
+
177
+ 3. **Build the embedder image** with the new model:
178
+ ```bash
179
+ docker build \
180
+ --build-arg EMBEDDING_MODEL=<new> \
181
+ --secret id=hf_token,env=HF_TOKEN \
182
+ -f Dockerfile.embedder \
183
+ -t docforge-embedder:<tag> .
184
+ ```
185
+
186
+ 4. **Apply schema migration.** Add a new vector column:
187
+ ```sql
188
+ ALTER TABLE chunks ADD COLUMN embedding_new vector(D);
189
+ ```
190
+ Re-ingest to populate the new column. Until backfill completes, the
191
+ search API serves from the old column.
192
+
193
+ 5. **Cut over.** Deploy the new embedder image first, then the new
194
+ search API. The dim-mismatch guard ensures search refuses to serve
195
+ wrong-dim vectors.
196
+
197
+ 6. **Drop the old column** after a soak period, once you are confident the new column is serving correctly.
198
+
138
199
  ## Configuration
139
200
 
140
201
  See `docs/` for the full configuration reference, including `docforge.yml` and `sources.yml` schemas.
@@ -170,6 +231,16 @@ Check that the database is running: `docker compose up -d db`. Verify `DATABASE_
170
231
 
171
232
  MIT. See [LICENSE](LICENSE).
172
233
 
234
+ ## License compatibility
235
+
236
+ docforge is MIT-licensed; the default embedding model,
237
+ [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m), is
238
+ distributed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms),
239
+ which restrict harmful use and building products that compete with Gemma. Swap
240
+ to a permissively-licensed alternative via `embedding_model` in `docforge.yml`
241
+ if those constraints don't fit your use case (see
242
+ [microsite FAQ — Can I use a different embedding model?](https://GranatenUdo.github.io/docforge/faq/#can-i-use-a-different-embedding-model)).
243
+
173
244
  ## Credits
174
245
 
175
246
  docforge stands on open shoulders:
@@ -1,41 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: docforge-cli
3
- Version: 0.2.1
4
- Summary: Forge searchable context from Confluence and git repos for AI coding assistants
5
- License: MIT
6
- Project-URL: Homepage, https://GranatenUdo.github.io/docforge/
7
- Project-URL: Source, https://github.com/GranatenUdo/docforge
8
- Project-URL: Issues, https://github.com/GranatenUdo/docforge/issues
9
- Project-URL: Changelog, https://github.com/GranatenUdo/docforge/blob/master/CHANGELOG.md
10
- Project-URL: Documentation, https://GranatenUdo.github.io/docforge/
11
- Requires-Python: >=3.12
12
- Description-Content-Type: text/markdown
13
- License-File: LICENSE
14
- Requires-Dist: typer>=0.12
15
- Requires-Dist: asyncpg>=0.30
16
- Requires-Dist: httpx>=0.27
17
- Requires-Dist: pydantic>=2.9
18
- Requires-Dist: pydantic-settings>=2.6
19
- Requires-Dist: beautifulsoup4>=4.12
20
- Requires-Dist: sentence-transformers>=5.0
21
- Requires-Dist: pgvector>=0.3
22
- Requires-Dist: pyyaml>=6.0
23
- Requires-Dist: fastmcp>=2.0
24
- Requires-Dist: fastapi>=0.115
25
- Requires-Dist: uvicorn>=0.34
26
- Requires-Dist: numpy>=1.26
27
- Provides-Extra: dev
28
- Requires-Dist: pytest>=8.0; extra == "dev"
29
- Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
30
- Requires-Dist: pytest-cov>=7.0; extra == "dev"
31
- Requires-Dist: ruff>=0.8; extra == "dev"
32
- Requires-Dist: testcontainers[postgres]>=4.0; extra == "dev"
33
- Provides-Extra: entra
34
- Requires-Dist: fastapi-azure-auth>=5.0; extra == "entra"
35
- Requires-Dist: azure-identity>=1.19; extra == "entra"
36
- Requires-Dist: aiohttp>=3.10; extra == "entra"
37
- Dynamic: license-file
38
-
39
1
  # docforge
40
2
 
41
3
  **The self-hosted context engine for AI coding assistants.**
@@ -83,15 +45,22 @@ docforge is the narrow, focused option in this landscape: minimal footprint, MCP
83
45
  - You need near-real-time updates → ingest is batch; no webhook-driven continuous sync yet.
84
46
  - You need multilingual search evaluated → EmbeddingGemma is multilingual, but docforge has no eval coverage on non-English corpora yet.
85
47
 
48
+ For the full trust model, accepted risks, and assumptions docforge makes about its operating environment, see [`docs/threat-model.md`](docs/threat-model.md).
49
+
86
50
  ## Quick Start
87
51
 
52
+ **Prerequisites:**
53
+ - Python 3.12+
54
+ - Docker (for the local Postgres + pgvector container)
55
+ - A [Hugging Face token](https://huggingface.co/settings/tokens) with access to the gated [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m) model. Accept the model license on the model page first.
56
+
88
57
  ```bash
89
58
  pip install docforge-cli
90
59
  docforge init my-project
91
60
  cd my-project
92
61
  # Edit docforge.yml with your Confluence URL
93
62
  # Edit sources.yml with your page IDs and local git repo paths
94
- # Edit .env with your credentials
63
+ # Edit .env with your credentials (CONFLUENCE_API_TOKEN, HF_TOKEN, DATABASE_URL)
95
64
  docker compose up -d db
96
65
  docforge init-db
97
66
  docforge ingest
@@ -126,15 +95,69 @@ When an AI assistant needs cross-team context, it calls docforge's `search_docum
126
95
 
127
96
  ## Deploy to your infrastructure
128
97
 
129
- For team-wide use, deploy the search API to Azure (~$35/month at default SKUs):
98
+ For team-wide use, deploy the search API to Azure (~$55/month at default SKUs with the default scale-to-zero embedder; ~$90/month if the embedder is kept always-on for production):
130
99
 
131
100
  - PostgreSQL Flexible Server (Burstable B1ms, 32 GB) with pgvector.
132
101
  - Container App running the FastAPI search API.
133
- - Container Registry, Key Vault, Log Analytics, managed environment.
102
+ - Container App running the embedder service (EmbeddingGemma-300M, model baked into the image).
103
+ - Container Registry (Standard), Key Vault, Log Analytics, managed environment.
134
104
  - Team members use a lightweight MCP client that calls the hosted API.
135
105
 
136
106
  See [`deploy/azure/`](deploy/azure/) for Bicep templates and a full cost breakdown.
137
107
 
108
+ ## Self-hosting / forking
109
+
110
+ The embedder image bakes the EmbeddingGemma-300M model at build time,
111
+ which requires a Hugging Face access token. Forks and adopters need to:
112
+
113
+ 1. Get an HF token at https://huggingface.co/settings/tokens.
114
+ 2. Accept the EmbeddingGemma license at
115
+ https://huggingface.co/google/embeddinggemma-300m.
116
+ 3. Add a repo secret `HF_TOKEN` under
117
+ `Settings → Secrets and variables → Actions`.
118
+
119
+ The CI workflow forwards the secret to BuildKit via
120
+ `--mount=type=secret,id=hf_token`; the token never enters any image
121
+ layer. If you fork this repo and run the CI workflow, it will build the
122
+ embedder image automatically on commits to `master` and PRs (without
123
+ pushing unless on `master`). To enable pushes to a registry, also add
124
+ secrets `ACR_LOGIN_SERVER`, `ACR_USERNAME`, and `ACR_PASSWORD`.
125
+
126
+ ## Upgrading the embedding model
127
+
128
+ The dimension-mismatch guard in `RemoteEmbedder` makes an
129
+ embedder/search API mismatch loud (`HTTP 503` with a clear log line)
130
+ rather than silent. Upgrade procedure:
131
+
132
+ 1. **Pick the new model.** Note its output dimensionality `D` (e.g.
133
+ `768` for EmbeddingGemma, `1024` for many newer models).
134
+
135
+ 2. **Update config.** Set `embedding_model: <new>` and
136
+ `embedding_dimensions: D` in the search API's deployment config
137
+ (Bicep parameters + Key Vault, or `docforge.yml` for self-hosters).
138
+
139
+ 3. **Build the embedder image** with the new model:
140
+ ```bash
141
+ docker build \
142
+ --build-arg EMBEDDING_MODEL=<new> \
143
+ --secret id=hf_token,env=HF_TOKEN \
144
+ -f Dockerfile.embedder \
145
+ -t docforge-embedder:<tag> .
146
+ ```
147
+
148
+ 4. **Apply schema migration.** Add a new vector column:
149
+ ```sql
150
+ ALTER TABLE chunks ADD COLUMN embedding_new vector(D);
151
+ ```
152
+ Re-ingest to populate the new column. Until backfill completes, the
153
+ search API serves from the old column.
154
+
155
+ 5. **Cut over.** Deploy the new embedder image first, then the new
156
+ search API. The dim-mismatch guard ensures search refuses to serve
157
+ wrong-dim vectors.
158
+
159
+ 6. **Drop the old column** after a soak period, once you are confident the new column is serving correctly.
160
+
138
161
  ## Configuration
139
162
 
140
163
  See `docs/` for the full configuration reference, including `docforge.yml` and `sources.yml` schemas.
@@ -170,6 +193,16 @@ Check that the database is running: `docker compose up -d db`. Verify `DATABASE_
170
193
 
171
194
  MIT. See [LICENSE](LICENSE).
172
195
 
196
+ ## License compatibility
197
+
198
+ docforge is MIT-licensed; the default embedding model,
199
+ [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m), is
200
+ distributed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms),
201
+ which restrict harmful use and building products that compete with Gemma. Swap
202
+ to a permissively-licensed alternative via `embedding_model` in `docforge.yml`
203
+ if those constraints don't fit your use case (see
204
+ [microsite FAQ — Can I use a different embedding model?](https://GranatenUdo.github.io/docforge/faq/#can-i-use-a-different-embedding-model)).
205
+
173
206
  ## Credits
174
207
 
175
208
  docforge stands on open shoulders:
@@ -4,25 +4,25 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docforge-cli"
7
- version = "0.2.1"
7
+ version = "0.3.0"
8
8
  description = "Forge searchable context from Confluence and git repos for AI coding assistants"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
11
11
  requires-python = ">=3.12"
12
12
  dependencies = [
13
- "typer>=0.12",
14
- "asyncpg>=0.30",
15
- "httpx>=0.27",
16
- "pydantic>=2.9",
17
- "pydantic-settings>=2.6",
18
- "beautifulsoup4>=4.12",
19
- "sentence-transformers>=5.0",
20
- "pgvector>=0.3",
21
- "pyyaml>=6.0",
22
- "fastmcp>=2.0",
23
- "fastapi>=0.115",
24
- "uvicorn>=0.34",
25
- "numpy>=1.26",
13
+ "typer>=0.12,<1.0",
14
+ "asyncpg>=0.30,<1.0",
15
+ "httpx>=0.27,<1.0",
16
+ "pydantic>=2.9,<3.0",
17
+ "pydantic-settings>=2.6,<3.0",
18
+ "beautifulsoup4>=4.12,<5.0",
19
+ "sentence-transformers>=5.0,<6.0",
20
+ "pgvector>=0.3,<1.0",
21
+ "pyyaml>=6.0,<7.0",
22
+ "fastmcp>=3.0,<4.0",
23
+ "fastapi>=0.115,<1.0",
24
+ "uvicorn>=0.34,<1.0",
25
+ "numpy>=1.26,<3.0", # both 1.x and 2.x tested
26
26
  ]
27
27
 
28
28
  [project.urls]
@@ -37,17 +37,17 @@ docforge = "docforge.cli:app"
37
37
 
38
38
  [project.optional-dependencies]
39
39
  dev = [
40
- "pytest>=8.0",
41
- "pytest-asyncio>=0.24",
42
- "pytest-cov>=7.0",
43
- "ruff>=0.8",
44
- "testcontainers[postgres]>=4.0",
40
+ "pytest>=9.0,<10.0",
41
+ "pytest-asyncio>=1.0,<2.0",
42
+ "pytest-cov>=7.0,<8.0",
43
+ "ruff>=0.8,<1.0",
44
+ "testcontainers[postgres]>=4.0,<5.0",
45
45
  ]
46
46
  entra = [
47
- "fastapi-azure-auth>=5.0",
48
- "azure-identity>=1.19",
47
+ "fastapi-azure-auth>=5.0,<6.0",
48
+ "azure-identity>=1.19,<2.0",
49
49
  # aiohttp is required by azure-identity.aio's async pipeline
50
- "aiohttp>=3.10",
50
+ "aiohttp>=3.10,<4.0",
51
51
  ]
52
52
 
53
53
  [tool.setuptools.packages.find]
@@ -68,7 +68,7 @@ select = ["E", "F", "I", "W"]
68
68
  asyncio_mode = "auto"
69
69
  testpaths = ["tests"]
70
70
  markers = [
71
- "integration: requires Docker (pgvector container)",
71
+ "integration: tests requiring real external resources (Docker for Postgres, network for embedding model)",
72
72
  ]
73
73
  addopts = "--cov=src/docforge"
74
74
 
@@ -14,51 +14,50 @@ import time
14
14
  from contextlib import asynccontextmanager
15
15
  from typing import Any
16
16
 
17
+ import asyncpg
17
18
  import numpy as np
18
19
  from fastapi import Depends, FastAPI, HTTPException, Request
19
20
  from fastapi.security import SecurityScopes
20
- from pydantic import BaseModel
21
+ from pydantic import BaseModel, Field
21
22
 
22
23
  from docforge.config import Settings
23
- from docforge.db import close_pool, get_pool
24
- from docforge.processors.embedder import Embedder
24
+ from docforge.db import _init_connection # registers pgvector codec on each new pool conn
25
+ from docforge.processors.embedder import Embedder, EmbedderProtocol
26
+ from docforge.query_log import log_query
25
27
 
26
28
  logger = logging.getLogger(__name__)
27
29
 
28
- _embedder: Embedder | None = None
29
- _settings: Settings | None = None
30
- _azure_scheme = None # Populated in lifespan when auth.mode == "entra"
31
- _cleanup_task: asyncio.Task | None = None
32
-
33
30
  _CLEANUP_INTERVAL_SECONDS = 3600 # one hour — overridable in tests
31
+ CLEANUP_LOCK_ID = 0xD0CF0001 # decimal 3,503,226,881 — stable across replicas
34
32
 
35
33
 
36
- async def _query_log_cleanup_loop(database_url: str, retention_days: int) -> None:
37
- """Deletes query_log rows older than retention_days every
38
- _CLEANUP_INTERVAL_SECONDS. Idempotent, so multi-replica is safe."""
39
- # int() coercion makes the f-string SQL below injection-safe. asyncpg's
34
+ async def _query_log_cleanup_loop(pool: asyncpg.Pool, retention_days: int) -> None:
35
+ """Each iteration takes a transaction-scoped advisory lock. A replica
36
+ that can't acquire it skips this iteration. The lock auto-releases at
37
+ COMMIT/ROLLBACK and on connection drop, so there is no manual unlock to forget."""
38
+ # int() coercion makes the f-string SQL below injection-safe; asyncpg's
40
39
  # $1::interval parameter binding doesn't accept str, hence the literal.
41
40
  days = int(retention_days)
42
41
  while True:
43
42
  try:
44
- pool = await get_pool(database_url)
45
43
  async with pool.acquire() as conn:
46
- result = await conn.execute(
47
- f"DELETE FROM query_log WHERE created_at < now() - interval '{days} days'"
48
- )
49
- logger.info("query_log cleanup: %s", result)
44
+ async with conn.transaction():
45
+ got_lock = await conn.fetchval(
46
+ "SELECT pg_try_advisory_xact_lock($1)", CLEANUP_LOCK_ID
47
+ )
48
+ if got_lock:
49
+ result = await conn.execute(
50
+ f"DELETE FROM query_log "
51
+ f"WHERE created_at < now() - interval '{days} days'"
52
+ )
53
+ logger.info("query_log cleanup: %s", result)
54
+ else:
55
+ logger.debug("query_log cleanup: another replica holds the lock")
50
56
  except Exception as e:
51
57
  logger.exception("query_log cleanup failed: %s", e)
52
58
  await asyncio.sleep(_CLEANUP_INTERVAL_SECONDS)
53
59
 
54
60
 
55
- def _get_settings() -> Settings:
56
- global _settings
57
- if _settings is None:
58
- _settings = Settings()
59
- return _settings
60
-
61
-
62
61
  def _build_auth_scheme(settings: Settings):
63
62
  """Return a SingleTenantAzureAuthorizationCodeBearer if mode==entra, else None."""
64
63
  if settings.auth.mode != "entra":
@@ -75,55 +74,91 @@ def _build_auth_scheme(settings: Settings):
75
74
 
76
75
  @asynccontextmanager
77
76
  async def lifespan(app: FastAPI):
78
- """Load the embedding model at startup; close the DB pool on shutdown."""
79
- global _embedder, _azure_scheme, _cleanup_task
80
- settings = _get_settings()
81
- _azure_scheme = _build_auth_scheme(settings)
82
- if _azure_scheme is not None:
83
- await _azure_scheme.openid_config.load_config()
84
- logger.info(
85
- "Entra auth enabled (tenant=%s, audience=%s)",
86
- settings.auth.tenant_id,
87
- settings.auth.audience,
88
- )
89
- logger.info("Loading embedding model...")
90
- _embedder = Embedder(settings.embedding_model, hf_token=settings.hf_token.get_secret_value())
91
- logger.info("Model loaded: %s (%dd)", _embedder.model_name, _embedder.dimensions)
92
-
93
- _cleanup_task = asyncio.create_task(
94
- _query_log_cleanup_loop(settings.database_url, settings.query_log_retention_days)
77
+ """Build per-process resources at startup; tear them down on shutdown.
78
+
79
+ Yields a dict whose entries flow into request.state for handler access
80
+ via the Depends getters below."""
81
+ settings = Settings()
82
+ embedder: EmbedderProtocol | None = None # set inside try; outer finally reads it
83
+ pool = await asyncpg.create_pool(
84
+ settings.database_url,
85
+ min_size=settings.pool_min_size,
86
+ max_size=settings.pool_max_size,
87
+ init=_init_connection,
95
88
  )
89
+ try:
90
+ # Embedder construction can raise (Phase 1 dimension guard); the
91
+ # outer finally still closes the pool in that case. Offloaded to a
92
+ # thread so the model-load file I/O doesn't stall the event loop.
93
+ embedder = await asyncio.to_thread(Embedder.from_settings, settings)
94
+ logger.info("Model loaded: %s (%dd)", embedder.model_name, embedder.dimensions)
95
+
96
+ azure_scheme = _build_auth_scheme(settings)
97
+ if azure_scheme is not None:
98
+ await azure_scheme.openid_config.load_config()
99
+ logger.info(
100
+ "Entra auth enabled (tenant=%s, audience=%s)",
101
+ settings.auth.tenant_id,
102
+ settings.auth.audience,
103
+ )
96
104
 
97
- yield
98
-
99
- if _cleanup_task is not None:
100
- _cleanup_task.cancel()
105
+ cleanup_task = asyncio.create_task(
106
+ _query_log_cleanup_loop(pool, settings.query_log_retention_days)
107
+ )
101
108
  try:
102
- await _cleanup_task
103
- except asyncio.CancelledError:
104
- pass
105
- await close_pool()
109
+ yield {
110
+ "settings": settings,
111
+ "pool": pool,
112
+ "embedder": embedder,
113
+ "azure_scheme": azure_scheme,
114
+ }
115
+ finally:
116
+ cleanup_task.cancel()
117
+ try:
118
+ await cleanup_task
119
+ except asyncio.CancelledError:
120
+ pass
121
+ finally:
122
+ if embedder is not None and hasattr(embedder, "aclose"):
123
+ await embedder.aclose()
124
+ await pool.close()
106
125
 
107
126
 
108
127
  app = FastAPI(title="docforge", lifespan=lifespan)
109
128
 
110
129
 
111
- async def _auth_dependency(request: Request):
130
+ def get_settings(request: Request) -> Settings:
131
+ return request.state.settings
132
+
133
+
134
+ def get_pool_dep(request: Request) -> asyncpg.Pool:
135
+ return request.state.pool
136
+
137
+
138
+ def get_embedder(request: Request) -> EmbedderProtocol:
139
+ return request.state.embedder
140
+
141
+
142
+ def get_azure_scheme(request: Request):
143
+ return request.state.azure_scheme
144
+
145
+
146
+ async def _auth_dependency(
147
+ request: Request,
148
+ azure_scheme=Depends(get_azure_scheme),
149
+ ):
112
150
  """Return the authenticated User under auth.mode=entra, None otherwise."""
113
- if _azure_scheme is None:
151
+ if azure_scheme is None:
114
152
  return None
115
- # Empty SecurityScopes: we don't enforce scope-level authorization beyond
116
- # the token validation the scheme itself does. Without this arg the call
117
- # signature mismatches what fastapi-azure-auth expects.
118
- return await _azure_scheme(request, SecurityScopes())
153
+ return await azure_scheme(request, SecurityScopes())
119
154
 
120
155
 
121
156
  class SearchRequest(BaseModel):
122
- query: str
157
+ query: str = Field(..., max_length=8000)
123
158
  user_name: str
124
159
  team_name: str
125
160
  area_name: str | None = None
126
- limit: int = 5
161
+ limit: int = Field(5, ge=1, le=50)
127
162
 
128
163
 
129
164
  class SearchResult(BaseModel):
@@ -142,32 +177,35 @@ class SearchResponse(BaseModel):
142
177
 
143
178
 
144
179
  @app.get("/health")
145
- async def health() -> dict[str, Any]:
180
+ async def health(request: Request) -> dict[str, Any]:
146
181
  """Health check endpoint."""
182
+ embedder = getattr(request.state, "embedder", None)
147
183
  return {
148
184
  "status": "ok",
149
- "model": _embedder.model_name if _embedder else "not loaded",
185
+ "model": embedder.model_name if embedder else "not loaded",
150
186
  }
151
187
 
152
188
 
153
189
  @app.post("/search", response_model=SearchResponse)
154
- async def search(req: SearchRequest, user=Depends(_auth_dependency)) -> SearchResponse:
190
+ async def search(
191
+ req: SearchRequest,
192
+ settings: Settings = Depends(get_settings),
193
+ pool: asyncpg.Pool = Depends(get_pool_dep),
194
+ embedder: EmbedderProtocol = Depends(get_embedder),
195
+ user=Depends(_auth_dependency),
196
+ ) -> SearchResponse:
155
197
  """Search indexed documentation by semantic similarity."""
156
198
  start = time.perf_counter()
157
- if not _embedder:
158
- raise HTTPException(status_code=503, detail="Embedding model not loaded yet")
159
199
 
160
200
  try:
161
- query_vector = _embedder.embed_query(req.query)
201
+ query_vector = await embedder.aembed_query(req.query)
162
202
  except Exception as e:
163
203
  logger.error("Embedding failed: %s", e)
164
204
  raise HTTPException(status_code=500, detail="Failed to embed query")
165
205
 
166
- settings = _get_settings()
167
206
  user_tags = [req.team_name] + ([req.area_name] if req.area_name else [])
168
207
 
169
208
  try:
170
- pool = await get_pool(settings.database_url)
171
209
  async with pool.acquire() as conn:
172
210
  rows = await conn.fetch(
173
211
  """
@@ -201,12 +239,8 @@ async def search(req: SearchRequest, user=Depends(_auth_dependency)) -> SearchRe
201
239
  logger.error("Database error during search: %s", e)
202
240
  raise HTTPException(status_code=503, detail="Database unavailable")
203
241
 
204
- from docforge.query_log import log_query
205
-
206
242
  request_ms = int((time.perf_counter() - start) * 1000)
207
243
 
208
- # team_name and area_name remain self-declared (routing hints, not identity).
209
- # user_name and user_oid come from the token when present.
210
244
  await log_query(
211
245
  pool,
212
246
  user.preferred_username if user else req.user_name,
@@ -234,11 +268,12 @@ async def search(req: SearchRequest, user=Depends(_auth_dependency)) -> SearchRe
234
268
 
235
269
 
236
270
  @app.get("/sources")
237
- async def list_sources(user=Depends(_auth_dependency)) -> dict[str, Any]:
271
+ async def list_sources(
272
+ pool: asyncpg.Pool = Depends(get_pool_dep),
273
+ user=Depends(_auth_dependency),
274
+ ) -> dict[str, Any]:
238
275
  """List all indexed documentation sources."""
239
- settings = _get_settings()
240
276
  try:
241
- pool = await get_pool(settings.database_url)
242
277
  async with pool.acquire() as conn:
243
278
  rows = await conn.fetch(
244
279
  """