docforge-cli 0.2.1__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docforge_cli-0.2.1/src/docforge_cli.egg-info → docforge_cli-0.3.0}/PKG-INFO +96 -25
- docforge_cli-0.2.1/PKG-INFO → docforge_cli-0.3.0/README.md +74 -41
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/pyproject.toml +23 -23
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/api.py +107 -72
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/cli.py +18 -3
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/config.py +22 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/db.py +15 -4
- docforge_cli-0.3.0/src/docforge/embedder_api.py +86 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/ingest.py +8 -4
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/mcp_server.py +23 -15
- docforge_cli-0.3.0/src/docforge/processors/embedder.py +246 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/scripts/eval_search.py +2 -2
- docforge_cli-0.2.1/README.md → docforge_cli-0.3.0/src/docforge_cli.egg-info/PKG-INFO +112 -3
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge_cli.egg-info/SOURCES.txt +1 -0
- docforge_cli-0.3.0/src/docforge_cli.egg-info/requires.txt +25 -0
- docforge_cli-0.2.1/src/docforge/processors/embedder.py +0 -78
- docforge_cli-0.2.1/src/docforge_cli.egg-info/requires.txt +0 -25
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/LICENSE +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/setup.cfg +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/__init__.py +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/__main__.py +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/crawlers/__init__.py +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/crawlers/confluence.py +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/crawlers/git.py +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/lint.py +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/processors/__init__.py +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/processors/chunker.py +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/processors/parser.py +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/query_log.py +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/ranking.py +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/scripts/__init__.py +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/scripts/latency_report.py +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/sources.py +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/sql/migrations/001_add_source_identifier.sql +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/sql/migrations/002_add_status_index.sql +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/sql/migrations/003_add_source_tags.sql +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/sql/migrations/004_add_query_log.sql +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/sql/migrations/005_add_query_log_user_oid.sql +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/sql/migrations/006_add_query_log_request_ms.sql +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/sql/schema.sql +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/templates/docforge.yml +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/templates/docker-compose.yml +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/templates/mcp_client.py +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge/templates/sources.yml +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge_cli.egg-info/dependency_links.txt +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge_cli.egg-info/entry_points.txt +0 -0
- {docforge_cli-0.2.1 → docforge_cli-0.3.0}/src/docforge_cli.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docforge-cli
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Forge searchable context from Confluence and git repos for AI coding assistants
|
|
5
5
|
License: MIT
|
|
6
6
|
Project-URL: Homepage, https://GranatenUdo.github.io/docforge/
|
|
@@ -11,29 +11,29 @@ Project-URL: Documentation, https://GranatenUdo.github.io/docforge/
|
|
|
11
11
|
Requires-Python: >=3.12
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENSE
|
|
14
|
-
Requires-Dist: typer>=0.12
|
|
15
|
-
Requires-Dist: asyncpg>=0.30
|
|
16
|
-
Requires-Dist: httpx>=0.27
|
|
17
|
-
Requires-Dist: pydantic>=2.9
|
|
18
|
-
Requires-Dist: pydantic-settings>=2.6
|
|
19
|
-
Requires-Dist: beautifulsoup4>=4.12
|
|
20
|
-
Requires-Dist: sentence-transformers>=5.0
|
|
21
|
-
Requires-Dist: pgvector>=0.3
|
|
22
|
-
Requires-Dist: pyyaml>=6.0
|
|
23
|
-
Requires-Dist: fastmcp>=2.0
|
|
24
|
-
Requires-Dist: fastapi>=0.115
|
|
25
|
-
Requires-Dist: uvicorn>=0.34
|
|
26
|
-
Requires-Dist: numpy>=1.26
|
|
14
|
+
Requires-Dist: typer<1.0,>=0.12
|
|
15
|
+
Requires-Dist: asyncpg<1.0,>=0.30
|
|
16
|
+
Requires-Dist: httpx<1.0,>=0.27
|
|
17
|
+
Requires-Dist: pydantic<3.0,>=2.9
|
|
18
|
+
Requires-Dist: pydantic-settings<3.0,>=2.6
|
|
19
|
+
Requires-Dist: beautifulsoup4<5.0,>=4.12
|
|
20
|
+
Requires-Dist: sentence-transformers<6.0,>=5.0
|
|
21
|
+
Requires-Dist: pgvector<1.0,>=0.3
|
|
22
|
+
Requires-Dist: pyyaml<7.0,>=6.0
|
|
23
|
+
Requires-Dist: fastmcp<4.0,>=3.0
|
|
24
|
+
Requires-Dist: fastapi<1.0,>=0.115
|
|
25
|
+
Requires-Dist: uvicorn<1.0,>=0.34
|
|
26
|
+
Requires-Dist: numpy<3.0,>=1.26
|
|
27
27
|
Provides-Extra: dev
|
|
28
|
-
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
29
|
-
Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
|
|
30
|
-
Requires-Dist: pytest-cov>=7.0; extra == "dev"
|
|
31
|
-
Requires-Dist: ruff>=0.8; extra == "dev"
|
|
32
|
-
Requires-Dist: testcontainers[postgres]>=4.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest<10.0,>=9.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-asyncio<2.0,>=1.0; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest-cov<8.0,>=7.0; extra == "dev"
|
|
31
|
+
Requires-Dist: ruff<1.0,>=0.8; extra == "dev"
|
|
32
|
+
Requires-Dist: testcontainers[postgres]<5.0,>=4.0; extra == "dev"
|
|
33
33
|
Provides-Extra: entra
|
|
34
|
-
Requires-Dist: fastapi-azure-auth>=5.0; extra == "entra"
|
|
35
|
-
Requires-Dist: azure-identity>=1.19; extra == "entra"
|
|
36
|
-
Requires-Dist: aiohttp>=3.10; extra == "entra"
|
|
34
|
+
Requires-Dist: fastapi-azure-auth<6.0,>=5.0; extra == "entra"
|
|
35
|
+
Requires-Dist: azure-identity<2.0,>=1.19; extra == "entra"
|
|
36
|
+
Requires-Dist: aiohttp<4.0,>=3.10; extra == "entra"
|
|
37
37
|
Dynamic: license-file
|
|
38
38
|
|
|
39
39
|
# docforge
|
|
@@ -83,15 +83,22 @@ docforge is the narrow, focused option in this landscape: minimal footprint, MCP
|
|
|
83
83
|
- You need near-real-time updates → ingest is batch; no webhook-driven continuous sync yet.
|
|
84
84
|
- You need multilingual search evaluated → EmbeddingGemma is multilingual, but docforge has no eval coverage on non-English corpora yet.
|
|
85
85
|
|
|
86
|
+
For the full trust model, accepted risks, and assumptions docforge makes about its operating environment, see [`docs/threat-model.md`](docs/threat-model.md).
|
|
87
|
+
|
|
86
88
|
## Quick Start
|
|
87
89
|
|
|
90
|
+
**Prerequisites:**
|
|
91
|
+
- Python 3.12+
|
|
92
|
+
- Docker (for the local Postgres + pgvector container)
|
|
93
|
+
- A [Hugging Face token](https://huggingface.co/settings/tokens) with access to the gated [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m) model. Accept the model license on the model page first.
|
|
94
|
+
|
|
88
95
|
```bash
|
|
89
96
|
pip install docforge-cli
|
|
90
97
|
docforge init my-project
|
|
91
98
|
cd my-project
|
|
92
99
|
# Edit docforge.yml with your Confluence URL
|
|
93
100
|
# Edit sources.yml with your page IDs and local git repo paths
|
|
94
|
-
# Edit .env with your credentials
|
|
101
|
+
# Edit .env with your credentials (CONFLUENCE_API_TOKEN, HF_TOKEN, DATABASE_URL)
|
|
95
102
|
docker compose up -d db
|
|
96
103
|
docforge init-db
|
|
97
104
|
docforge ingest
|
|
@@ -126,15 +133,69 @@ When an AI assistant needs cross-team context, it calls docforge's `search_docum
|
|
|
126
133
|
|
|
127
134
|
## Deploy to your infrastructure
|
|
128
135
|
|
|
129
|
-
For team-wide use, deploy the search API to Azure (~$
|
|
136
|
+
For team-wide use, deploy the search API to Azure (~$90/month at default SKUs with embedder always-on for production; ~$55/month with the default scale-to-zero embedder):
|
|
130
137
|
|
|
131
138
|
- PostgreSQL Flexible Server (Burstable B1ms, 32 GB) with pgvector.
|
|
132
139
|
- Container App running the FastAPI search API.
|
|
133
|
-
- Container
|
|
140
|
+
- Container App running the embedder service (EmbeddingGemma-300M, model baked into the image).
|
|
141
|
+
- Container Registry (Standard), Key Vault, Log Analytics, managed environment.
|
|
134
142
|
- Team members use a lightweight MCP client that calls the hosted API.
|
|
135
143
|
|
|
136
144
|
See [`deploy/azure/`](deploy/azure/) for Bicep templates and a full cost breakdown.
|
|
137
145
|
|
|
146
|
+
## Self-hosting / forking
|
|
147
|
+
|
|
148
|
+
The embedder image bakes the EmbeddingGemma-300M model at build time,
|
|
149
|
+
which requires a HuggingFace access token. Forks and adopters need to:
|
|
150
|
+
|
|
151
|
+
1. Get an HF token at https://huggingface.co/settings/tokens.
|
|
152
|
+
2. Accept the EmbeddingGemma license at
|
|
153
|
+
https://huggingface.co/google/embeddinggemma-300m.
|
|
154
|
+
3. Add a repo secret `HF_TOKEN` under
|
|
155
|
+
`Settings → Secrets and variables → Actions`.
|
|
156
|
+
|
|
157
|
+
The CI workflow forwards the secret to BuildKit via
|
|
158
|
+
`--mount=type=secret,id=hf_token`; the token never enters any image
|
|
159
|
+
layer. If you fork this repo and run the CI workflow, it will build the
|
|
160
|
+
embedder image automatically on commits to `master` and PRs (without
|
|
161
|
+
pushing unless on `master`). To enable pushes to a registry, also add
|
|
162
|
+
secrets `ACR_LOGIN_SERVER`, `ACR_USERNAME`, and `ACR_PASSWORD`.
|
|
163
|
+
|
|
164
|
+
## Upgrading the embedding model
|
|
165
|
+
|
|
166
|
+
The dimension-mismatch guard in `RemoteEmbedder` makes an
|
|
167
|
+
embedder/search API mismatch loud (`HTTP 503` with a clear log line)
|
|
168
|
+
rather than silent. Upgrade procedure:
|
|
169
|
+
|
|
170
|
+
1. **Pick the new model.** Note its output dimensionality `D` (e.g.
|
|
171
|
+
`768` for EmbeddingGemma, `1024` for many newer models).
|
|
172
|
+
|
|
173
|
+
2. **Update config.** Set `embedding_model: <new>` and
|
|
174
|
+
`embedding_dimensions: D` in the search API's deployment config
|
|
175
|
+
(Bicep parameters + Key Vault, or `docforge.yml` for self-hosters).
|
|
176
|
+
|
|
177
|
+
3. **Build the embedder image** with the new model:
|
|
178
|
+
```bash
|
|
179
|
+
docker build \
|
|
180
|
+
--build-arg EMBEDDING_MODEL=<new> \
|
|
181
|
+
--secret id=hf_token,env=HF_TOKEN \
|
|
182
|
+
-f Dockerfile.embedder \
|
|
183
|
+
-t docforge-embedder:<tag> .
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
4. **Apply schema migration.** Add a new vector column:
|
|
187
|
+
```sql
|
|
188
|
+
ALTER TABLE chunks ADD COLUMN embedding_new vector(D);
|
|
189
|
+
```
|
|
190
|
+
Re-ingest to populate the new column. Until backfill completes, the
|
|
191
|
+
search API serves from the old column.
|
|
192
|
+
|
|
193
|
+
5. **Cut over.** Deploy the new embedder image first, then the new
|
|
194
|
+
search API. The dim-mismatch guard ensures search refuses to serve
|
|
195
|
+
wrong-dim vectors.
|
|
196
|
+
|
|
197
|
+
6. **Drop the old column** after a confidence interval.
|
|
198
|
+
|
|
138
199
|
## Configuration
|
|
139
200
|
|
|
140
201
|
See `docs/` for the full configuration reference, including `docforge.yml` and `sources.yml` schemas.
|
|
@@ -170,6 +231,16 @@ Check that the database is running: `docker compose up -d db`. Verify `DATABASE_
|
|
|
170
231
|
|
|
171
232
|
MIT. See [LICENSE](LICENSE).
|
|
172
233
|
|
|
234
|
+
## License compatibility
|
|
235
|
+
|
|
236
|
+
docforge is MIT-licensed; the default embedding model,
|
|
237
|
+
[EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m), is
|
|
238
|
+
distributed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms),
|
|
239
|
+
which restrict harmful use and building products that compete with Gemma. Swap
|
|
240
|
+
to a permissively-licensed alternative via `embedding_model` in `docforge.yml`
|
|
241
|
+
if those constraints don't fit your use case (see
|
|
242
|
+
[microsite FAQ — Can I use a different embedding model?](https://GranatenUdo.github.io/docforge/faq/#can-i-use-a-different-embedding-model)).
|
|
243
|
+
|
|
173
244
|
## Credits
|
|
174
245
|
|
|
175
246
|
docforge stands on open shoulders:
|
|
@@ -1,41 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: docforge-cli
|
|
3
|
-
Version: 0.2.1
|
|
4
|
-
Summary: Forge searchable context from Confluence and git repos for AI coding assistants
|
|
5
|
-
License: MIT
|
|
6
|
-
Project-URL: Homepage, https://GranatenUdo.github.io/docforge/
|
|
7
|
-
Project-URL: Source, https://github.com/GranatenUdo/docforge
|
|
8
|
-
Project-URL: Issues, https://github.com/GranatenUdo/docforge/issues
|
|
9
|
-
Project-URL: Changelog, https://github.com/GranatenUdo/docforge/blob/master/CHANGELOG.md
|
|
10
|
-
Project-URL: Documentation, https://GranatenUdo.github.io/docforge/
|
|
11
|
-
Requires-Python: >=3.12
|
|
12
|
-
Description-Content-Type: text/markdown
|
|
13
|
-
License-File: LICENSE
|
|
14
|
-
Requires-Dist: typer>=0.12
|
|
15
|
-
Requires-Dist: asyncpg>=0.30
|
|
16
|
-
Requires-Dist: httpx>=0.27
|
|
17
|
-
Requires-Dist: pydantic>=2.9
|
|
18
|
-
Requires-Dist: pydantic-settings>=2.6
|
|
19
|
-
Requires-Dist: beautifulsoup4>=4.12
|
|
20
|
-
Requires-Dist: sentence-transformers>=5.0
|
|
21
|
-
Requires-Dist: pgvector>=0.3
|
|
22
|
-
Requires-Dist: pyyaml>=6.0
|
|
23
|
-
Requires-Dist: fastmcp>=2.0
|
|
24
|
-
Requires-Dist: fastapi>=0.115
|
|
25
|
-
Requires-Dist: uvicorn>=0.34
|
|
26
|
-
Requires-Dist: numpy>=1.26
|
|
27
|
-
Provides-Extra: dev
|
|
28
|
-
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
29
|
-
Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
|
|
30
|
-
Requires-Dist: pytest-cov>=7.0; extra == "dev"
|
|
31
|
-
Requires-Dist: ruff>=0.8; extra == "dev"
|
|
32
|
-
Requires-Dist: testcontainers[postgres]>=4.0; extra == "dev"
|
|
33
|
-
Provides-Extra: entra
|
|
34
|
-
Requires-Dist: fastapi-azure-auth>=5.0; extra == "entra"
|
|
35
|
-
Requires-Dist: azure-identity>=1.19; extra == "entra"
|
|
36
|
-
Requires-Dist: aiohttp>=3.10; extra == "entra"
|
|
37
|
-
Dynamic: license-file
|
|
38
|
-
|
|
39
1
|
# docforge
|
|
40
2
|
|
|
41
3
|
**The self-hosted context engine for AI coding assistants.**
|
|
@@ -83,15 +45,22 @@ docforge is the narrow, focused option in this landscape: minimal footprint, MCP
|
|
|
83
45
|
- You need near-real-time updates → ingest is batch; no webhook-driven continuous sync yet.
|
|
84
46
|
- You need multilingual search evaluated → EmbeddingGemma is multilingual, but docforge has no eval coverage on non-English corpora yet.
|
|
85
47
|
|
|
48
|
+
For the full trust model, accepted risks, and assumptions docforge makes about its operating environment, see [`docs/threat-model.md`](docs/threat-model.md).
|
|
49
|
+
|
|
86
50
|
## Quick Start
|
|
87
51
|
|
|
52
|
+
**Prerequisites:**
|
|
53
|
+
- Python 3.12+
|
|
54
|
+
- Docker (for the local Postgres + pgvector container)
|
|
55
|
+
- A [Hugging Face token](https://huggingface.co/settings/tokens) with access to the gated [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m) model. Accept the model license on the model page first.
|
|
56
|
+
|
|
88
57
|
```bash
|
|
89
58
|
pip install docforge-cli
|
|
90
59
|
docforge init my-project
|
|
91
60
|
cd my-project
|
|
92
61
|
# Edit docforge.yml with your Confluence URL
|
|
93
62
|
# Edit sources.yml with your page IDs and local git repo paths
|
|
94
|
-
# Edit .env with your credentials
|
|
63
|
+
# Edit .env with your credentials (CONFLUENCE_API_TOKEN, HF_TOKEN, DATABASE_URL)
|
|
95
64
|
docker compose up -d db
|
|
96
65
|
docforge init-db
|
|
97
66
|
docforge ingest
|
|
@@ -126,15 +95,69 @@ When an AI assistant needs cross-team context, it calls docforge's `search_docum
|
|
|
126
95
|
|
|
127
96
|
## Deploy to your infrastructure
|
|
128
97
|
|
|
129
|
-
For team-wide use, deploy the search API to Azure (~$
|
|
98
|
+
For team-wide use, deploy the search API to Azure (~$90/month at default SKUs with embedder always-on for production; ~$55/month with the default scale-to-zero embedder):
|
|
130
99
|
|
|
131
100
|
- PostgreSQL Flexible Server (Burstable B1ms, 32 GB) with pgvector.
|
|
132
101
|
- Container App running the FastAPI search API.
|
|
133
|
-
- Container
|
|
102
|
+
- Container App running the embedder service (EmbeddingGemma-300M, model baked into the image).
|
|
103
|
+
- Container Registry (Standard), Key Vault, Log Analytics, managed environment.
|
|
134
104
|
- Team members use a lightweight MCP client that calls the hosted API.
|
|
135
105
|
|
|
136
106
|
See [`deploy/azure/`](deploy/azure/) for Bicep templates and a full cost breakdown.
|
|
137
107
|
|
|
108
|
+
## Self-hosting / forking
|
|
109
|
+
|
|
110
|
+
The embedder image bakes the EmbeddingGemma-300M model at build time,
|
|
111
|
+
which requires a HuggingFace access token. Forks and adopters need to:
|
|
112
|
+
|
|
113
|
+
1. Get an HF token at https://huggingface.co/settings/tokens.
|
|
114
|
+
2. Accept the EmbeddingGemma license at
|
|
115
|
+
https://huggingface.co/google/embeddinggemma-300m.
|
|
116
|
+
3. Add a repo secret `HF_TOKEN` under
|
|
117
|
+
`Settings → Secrets and variables → Actions`.
|
|
118
|
+
|
|
119
|
+
The CI workflow forwards the secret to BuildKit via
|
|
120
|
+
`--mount=type=secret,id=hf_token`; the token never enters any image
|
|
121
|
+
layer. If you fork this repo and run the CI workflow, it will build the
|
|
122
|
+
embedder image automatically on commits to `master` and PRs (without
|
|
123
|
+
pushing unless on `master`). To enable pushes to a registry, also add
|
|
124
|
+
secrets `ACR_LOGIN_SERVER`, `ACR_USERNAME`, and `ACR_PASSWORD`.
|
|
125
|
+
|
|
126
|
+
## Upgrading the embedding model
|
|
127
|
+
|
|
128
|
+
The dimension-mismatch guard in `RemoteEmbedder` makes an
|
|
129
|
+
embedder/search API mismatch loud (`HTTP 503` with a clear log line)
|
|
130
|
+
rather than silent. Upgrade procedure:
|
|
131
|
+
|
|
132
|
+
1. **Pick the new model.** Note its output dimensionality `D` (e.g.
|
|
133
|
+
`768` for EmbeddingGemma, `1024` for many newer models).
|
|
134
|
+
|
|
135
|
+
2. **Update config.** Set `embedding_model: <new>` and
|
|
136
|
+
`embedding_dimensions: D` in the search API's deployment config
|
|
137
|
+
(Bicep parameters + Key Vault, or `docforge.yml` for self-hosters).
|
|
138
|
+
|
|
139
|
+
3. **Build the embedder image** with the new model:
|
|
140
|
+
```bash
|
|
141
|
+
docker build \
|
|
142
|
+
--build-arg EMBEDDING_MODEL=<new> \
|
|
143
|
+
--secret id=hf_token,env=HF_TOKEN \
|
|
144
|
+
-f Dockerfile.embedder \
|
|
145
|
+
-t docforge-embedder:<tag> .
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
4. **Apply schema migration.** Add a new vector column:
|
|
149
|
+
```sql
|
|
150
|
+
ALTER TABLE chunks ADD COLUMN embedding_new vector(D);
|
|
151
|
+
```
|
|
152
|
+
Re-ingest to populate the new column. Until backfill completes, the
|
|
153
|
+
search API serves from the old column.
|
|
154
|
+
|
|
155
|
+
5. **Cut over.** Deploy the new embedder image first, then the new
|
|
156
|
+
search API. The dim-mismatch guard ensures search refuses to serve
|
|
157
|
+
wrong-dim vectors.
|
|
158
|
+
|
|
159
|
+
6. **Drop the old column** after a confidence interval.
|
|
160
|
+
|
|
138
161
|
## Configuration
|
|
139
162
|
|
|
140
163
|
See `docs/` for the full configuration reference, including `docforge.yml` and `sources.yml` schemas.
|
|
@@ -170,6 +193,16 @@ Check that the database is running: `docker compose up -d db`. Verify `DATABASE_
|
|
|
170
193
|
|
|
171
194
|
MIT. See [LICENSE](LICENSE).
|
|
172
195
|
|
|
196
|
+
## License compatibility
|
|
197
|
+
|
|
198
|
+
docforge is MIT-licensed; the default embedding model,
|
|
199
|
+
[EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m), is
|
|
200
|
+
distributed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms),
|
|
201
|
+
which restrict harmful use and building products that compete with Gemma. Swap
|
|
202
|
+
to a permissively-licensed alternative via `embedding_model` in `docforge.yml`
|
|
203
|
+
if those constraints don't fit your use case (see
|
|
204
|
+
[microsite FAQ — Can I use a different embedding model?](https://GranatenUdo.github.io/docforge/faq/#can-i-use-a-different-embedding-model)).
|
|
205
|
+
|
|
173
206
|
## Credits
|
|
174
207
|
|
|
175
208
|
docforge stands on open shoulders:
|
|
@@ -4,25 +4,25 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docforge-cli"
|
|
7
|
-
version = "0.2.1"
|
|
7
|
+
version = "0.3.0"
|
|
8
8
|
description = "Forge searchable context from Confluence and git repos for AI coding assistants"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
11
11
|
requires-python = ">=3.12"
|
|
12
12
|
dependencies = [
|
|
13
|
-
"typer>=0.12",
|
|
14
|
-
"asyncpg>=0.30",
|
|
15
|
-
"httpx>=0.27",
|
|
16
|
-
"pydantic>=2.9",
|
|
17
|
-
"pydantic-settings>=2.6",
|
|
18
|
-
"beautifulsoup4>=4.12",
|
|
19
|
-
"sentence-transformers>=5.0",
|
|
20
|
-
"pgvector>=0.3",
|
|
21
|
-
"pyyaml>=6.0",
|
|
22
|
-
"fastmcp>=2.0",
|
|
23
|
-
"fastapi>=0.115",
|
|
24
|
-
"uvicorn>=0.34",
|
|
25
|
-
"numpy>=1.26",
|
|
13
|
+
"typer>=0.12,<1.0",
|
|
14
|
+
"asyncpg>=0.30,<1.0",
|
|
15
|
+
"httpx>=0.27,<1.0",
|
|
16
|
+
"pydantic>=2.9,<3.0",
|
|
17
|
+
"pydantic-settings>=2.6,<3.0",
|
|
18
|
+
"beautifulsoup4>=4.12,<5.0",
|
|
19
|
+
"sentence-transformers>=5.0,<6.0",
|
|
20
|
+
"pgvector>=0.3,<1.0",
|
|
21
|
+
"pyyaml>=6.0,<7.0",
|
|
22
|
+
"fastmcp>=3.0,<4.0",
|
|
23
|
+
"fastapi>=0.115,<1.0",
|
|
24
|
+
"uvicorn>=0.34,<1.0",
|
|
25
|
+
"numpy>=1.26,<3.0", # both 1.x and 2.x tested
|
|
26
26
|
]
|
|
27
27
|
|
|
28
28
|
[project.urls]
|
|
@@ -37,17 +37,17 @@ docforge = "docforge.cli:app"
|
|
|
37
37
|
|
|
38
38
|
[project.optional-dependencies]
|
|
39
39
|
dev = [
|
|
40
|
-
"pytest>=8.0",
|
|
41
|
-
"pytest-asyncio>=0.24",
|
|
42
|
-
"pytest-cov>=7.0",
|
|
43
|
-
"ruff>=0.8",
|
|
44
|
-
"testcontainers[postgres]>=4.0",
|
|
40
|
+
"pytest>=9.0,<10.0",
|
|
41
|
+
"pytest-asyncio>=1.0,<2.0",
|
|
42
|
+
"pytest-cov>=7.0,<8.0",
|
|
43
|
+
"ruff>=0.8,<1.0",
|
|
44
|
+
"testcontainers[postgres]>=4.0,<5.0",
|
|
45
45
|
]
|
|
46
46
|
entra = [
|
|
47
|
-
"fastapi-azure-auth>=5.0",
|
|
48
|
-
"azure-identity>=1.19",
|
|
47
|
+
"fastapi-azure-auth>=5.0,<6.0",
|
|
48
|
+
"azure-identity>=1.19,<2.0",
|
|
49
49
|
# aiohttp is required by azure-identity.aio's async pipeline
|
|
50
|
-
"aiohttp>=3.10",
|
|
50
|
+
"aiohttp>=3.10,<4.0",
|
|
51
51
|
]
|
|
52
52
|
|
|
53
53
|
[tool.setuptools.packages.find]
|
|
@@ -68,7 +68,7 @@ select = ["E", "F", "I", "W"]
|
|
|
68
68
|
asyncio_mode = "auto"
|
|
69
69
|
testpaths = ["tests"]
|
|
70
70
|
markers = [
|
|
71
|
-
"integration:
|
|
71
|
+
"integration: tests requiring real external resources (Docker for Postgres, network for embedding model)",
|
|
72
72
|
]
|
|
73
73
|
addopts = "--cov=src/docforge"
|
|
74
74
|
|
|
@@ -14,51 +14,50 @@ import time
|
|
|
14
14
|
from contextlib import asynccontextmanager
|
|
15
15
|
from typing import Any
|
|
16
16
|
|
|
17
|
+
import asyncpg
|
|
17
18
|
import numpy as np
|
|
18
19
|
from fastapi import Depends, FastAPI, HTTPException, Request
|
|
19
20
|
from fastapi.security import SecurityScopes
|
|
20
|
-
from pydantic import BaseModel
|
|
21
|
+
from pydantic import BaseModel, Field
|
|
21
22
|
|
|
22
23
|
from docforge.config import Settings
|
|
23
|
-
from docforge.db import
|
|
24
|
-
from docforge.processors.embedder import Embedder
|
|
24
|
+
from docforge.db import _init_connection # registers pgvector codec on each new pool conn
|
|
25
|
+
from docforge.processors.embedder import Embedder, EmbedderProtocol
|
|
26
|
+
from docforge.query_log import log_query
|
|
25
27
|
|
|
26
28
|
logger = logging.getLogger(__name__)
|
|
27
29
|
|
|
28
|
-
_embedder: Embedder | None = None
|
|
29
|
-
_settings: Settings | None = None
|
|
30
|
-
_azure_scheme = None # Populated in lifespan when auth.mode == "entra"
|
|
31
|
-
_cleanup_task: asyncio.Task | None = None
|
|
32
|
-
|
|
33
30
|
_CLEANUP_INTERVAL_SECONDS = 3600 # one hour — overridable in tests
|
|
31
|
+
CLEANUP_LOCK_ID = 0xD0CF0001 # decimal 3,503,226,881 — stable across replicas
|
|
34
32
|
|
|
35
33
|
|
|
36
|
-
async def _query_log_cleanup_loop(
|
|
37
|
-
"""
|
|
38
|
-
|
|
39
|
-
|
|
34
|
+
async def _query_log_cleanup_loop(pool: asyncpg.Pool, retention_days: int) -> None:
|
|
35
|
+
"""Each iteration takes a transaction-scoped advisory lock. A replica
|
|
36
|
+
that can't acquire it skips this iteration. The lock auto-releases at
|
|
37
|
+
COMMIT/ROLLBACK and on connection drop — no manual unlock to forget."""
|
|
38
|
+
# int() coercion makes the f-string SQL below injection-safe; asyncpg's
|
|
40
39
|
# $1::interval parameter binding doesn't accept str, hence the literal.
|
|
41
40
|
days = int(retention_days)
|
|
42
41
|
while True:
|
|
43
42
|
try:
|
|
44
|
-
pool = await get_pool(database_url)
|
|
45
43
|
async with pool.acquire() as conn:
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
44
|
+
async with conn.transaction():
|
|
45
|
+
got_lock = await conn.fetchval(
|
|
46
|
+
"SELECT pg_try_advisory_xact_lock($1)", CLEANUP_LOCK_ID
|
|
47
|
+
)
|
|
48
|
+
if got_lock:
|
|
49
|
+
result = await conn.execute(
|
|
50
|
+
f"DELETE FROM query_log "
|
|
51
|
+
f"WHERE created_at < now() - interval '{days} days'"
|
|
52
|
+
)
|
|
53
|
+
logger.info("query_log cleanup: %s", result)
|
|
54
|
+
else:
|
|
55
|
+
logger.debug("query_log cleanup: another replica holds the lock")
|
|
50
56
|
except Exception as e:
|
|
51
57
|
logger.exception("query_log cleanup failed: %s", e)
|
|
52
58
|
await asyncio.sleep(_CLEANUP_INTERVAL_SECONDS)
|
|
53
59
|
|
|
54
60
|
|
|
55
|
-
def _get_settings() -> Settings:
|
|
56
|
-
global _settings
|
|
57
|
-
if _settings is None:
|
|
58
|
-
_settings = Settings()
|
|
59
|
-
return _settings
|
|
60
|
-
|
|
61
|
-
|
|
62
61
|
def _build_auth_scheme(settings: Settings):
|
|
63
62
|
"""Return a SingleTenantAzureAuthorizationCodeBearer if mode==entra, else None."""
|
|
64
63
|
if settings.auth.mode != "entra":
|
|
@@ -75,55 +74,91 @@ def _build_auth_scheme(settings: Settings):
|
|
|
75
74
|
|
|
76
75
|
@asynccontextmanager
|
|
77
76
|
async def lifespan(app: FastAPI):
|
|
78
|
-
"""
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
logger.info("Loading embedding model...")
|
|
90
|
-
_embedder = Embedder(settings.embedding_model, hf_token=settings.hf_token.get_secret_value())
|
|
91
|
-
logger.info("Model loaded: %s (%dd)", _embedder.model_name, _embedder.dimensions)
|
|
92
|
-
|
|
93
|
-
_cleanup_task = asyncio.create_task(
|
|
94
|
-
_query_log_cleanup_loop(settings.database_url, settings.query_log_retention_days)
|
|
77
|
+
"""Build per-process resources at startup; tear them down on shutdown.
|
|
78
|
+
|
|
79
|
+
Yields a dict whose entries flow into request.state for handler access
|
|
80
|
+
via the Depends getters below."""
|
|
81
|
+
settings = Settings()
|
|
82
|
+
embedder: EmbedderProtocol | None = None # set inside try; outer finally reads it
|
|
83
|
+
pool = await asyncpg.create_pool(
|
|
84
|
+
settings.database_url,
|
|
85
|
+
min_size=settings.pool_min_size,
|
|
86
|
+
max_size=settings.pool_max_size,
|
|
87
|
+
init=_init_connection,
|
|
95
88
|
)
|
|
89
|
+
try:
|
|
90
|
+
# Embedder construction can raise (Phase 1 dimension guard); the
|
|
91
|
+
# outer finally still closes the pool in that case. Offloaded to a
|
|
92
|
+
# thread so the model-load file I/O doesn't stall the event loop.
|
|
93
|
+
embedder = await asyncio.to_thread(Embedder.from_settings, settings)
|
|
94
|
+
logger.info("Model loaded: %s (%dd)", embedder.model_name, embedder.dimensions)
|
|
95
|
+
|
|
96
|
+
azure_scheme = _build_auth_scheme(settings)
|
|
97
|
+
if azure_scheme is not None:
|
|
98
|
+
await azure_scheme.openid_config.load_config()
|
|
99
|
+
logger.info(
|
|
100
|
+
"Entra auth enabled (tenant=%s, audience=%s)",
|
|
101
|
+
settings.auth.tenant_id,
|
|
102
|
+
settings.auth.audience,
|
|
103
|
+
)
|
|
96
104
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
_cleanup_task.cancel()
|
|
105
|
+
cleanup_task = asyncio.create_task(
|
|
106
|
+
_query_log_cleanup_loop(pool, settings.query_log_retention_days)
|
|
107
|
+
)
|
|
101
108
|
try:
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
109
|
+
yield {
|
|
110
|
+
"settings": settings,
|
|
111
|
+
"pool": pool,
|
|
112
|
+
"embedder": embedder,
|
|
113
|
+
"azure_scheme": azure_scheme,
|
|
114
|
+
}
|
|
115
|
+
finally:
|
|
116
|
+
cleanup_task.cancel()
|
|
117
|
+
try:
|
|
118
|
+
await cleanup_task
|
|
119
|
+
except asyncio.CancelledError:
|
|
120
|
+
pass
|
|
121
|
+
finally:
|
|
122
|
+
if embedder is not None and hasattr(embedder, "aclose"):
|
|
123
|
+
await embedder.aclose()
|
|
124
|
+
await pool.close()
|
|
106
125
|
|
|
107
126
|
|
|
108
127
|
app = FastAPI(title="docforge", lifespan=lifespan)
|
|
109
128
|
|
|
110
129
|
|
|
111
|
-
|
|
130
|
+
def get_settings(request: Request) -> Settings:
|
|
131
|
+
return request.state.settings
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def get_pool_dep(request: Request) -> asyncpg.Pool:
|
|
135
|
+
return request.state.pool
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def get_embedder(request: Request) -> EmbedderProtocol:
|
|
139
|
+
return request.state.embedder
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def get_azure_scheme(request: Request):
|
|
143
|
+
return request.state.azure_scheme
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
async def _auth_dependency(
|
|
147
|
+
request: Request,
|
|
148
|
+
azure_scheme=Depends(get_azure_scheme),
|
|
149
|
+
):
|
|
112
150
|
"""Return the authenticated User under auth.mode=entra, None otherwise."""
|
|
113
|
-
if _azure_scheme is None:
|
|
151
|
+
if azure_scheme is None:
|
|
114
152
|
return None
|
|
115
|
-
|
|
116
|
-
# the token validation the scheme itself does. Without this arg the call
|
|
117
|
-
# signature mismatches what fastapi-azure-auth expects.
|
|
118
|
-
return await _azure_scheme(request, SecurityScopes())
|
|
153
|
+
return await azure_scheme(request, SecurityScopes())
|
|
119
154
|
|
|
120
155
|
|
|
121
156
|
class SearchRequest(BaseModel):
|
|
122
|
-
query: str
|
|
157
|
+
query: str = Field(..., max_length=8000)
|
|
123
158
|
user_name: str
|
|
124
159
|
team_name: str
|
|
125
160
|
area_name: str | None = None
|
|
126
|
-
limit: int = 5
|
|
161
|
+
limit: int = Field(5, ge=1, le=50)
|
|
127
162
|
|
|
128
163
|
|
|
129
164
|
class SearchResult(BaseModel):
|
|
@@ -142,32 +177,35 @@ class SearchResponse(BaseModel):
|
|
|
142
177
|
|
|
143
178
|
|
|
144
179
|
@app.get("/health")
|
|
145
|
-
async def health() -> dict[str, Any]:
|
|
180
|
+
async def health(request: Request) -> dict[str, Any]:
|
|
146
181
|
"""Health check endpoint."""
|
|
182
|
+
embedder = getattr(request.state, "embedder", None)
|
|
147
183
|
return {
|
|
148
184
|
"status": "ok",
|
|
149
|
-
"model":
|
|
185
|
+
"model": embedder.model_name if embedder else "not loaded",
|
|
150
186
|
}
|
|
151
187
|
|
|
152
188
|
|
|
153
189
|
@app.post("/search", response_model=SearchResponse)
|
|
154
|
-
async def search(req: SearchRequest, user=Depends(_auth_dependency)) -> SearchResponse:
|
|
190
|
+
async def search(
|
|
191
|
+
req: SearchRequest,
|
|
192
|
+
settings: Settings = Depends(get_settings),
|
|
193
|
+
pool: asyncpg.Pool = Depends(get_pool_dep),
|
|
194
|
+
embedder: EmbedderProtocol = Depends(get_embedder),
|
|
195
|
+
user=Depends(_auth_dependency),
|
|
196
|
+
) -> SearchResponse:
|
|
155
197
|
"""Search indexed documentation by semantic similarity."""
|
|
156
198
|
start = time.perf_counter()
|
|
157
|
-
if not _embedder:
|
|
158
|
-
raise HTTPException(status_code=503, detail="Embedding model not loaded yet")
|
|
159
199
|
|
|
160
200
|
try:
|
|
161
|
-
query_vector =
|
|
201
|
+
query_vector = await embedder.aembed_query(req.query)
|
|
162
202
|
except Exception as e:
|
|
163
203
|
logger.error("Embedding failed: %s", e)
|
|
164
204
|
raise HTTPException(status_code=500, detail="Failed to embed query")
|
|
165
205
|
|
|
166
|
-
settings = _get_settings()
|
|
167
206
|
user_tags = [req.team_name] + ([req.area_name] if req.area_name else [])
|
|
168
207
|
|
|
169
208
|
try:
|
|
170
|
-
pool = await get_pool(settings.database_url)
|
|
171
209
|
async with pool.acquire() as conn:
|
|
172
210
|
rows = await conn.fetch(
|
|
173
211
|
"""
|
|
@@ -201,12 +239,8 @@ async def search(req: SearchRequest, user=Depends(_auth_dependency)) -> SearchRe
|
|
|
201
239
|
logger.error("Database error during search: %s", e)
|
|
202
240
|
raise HTTPException(status_code=503, detail="Database unavailable")
|
|
203
241
|
|
|
204
|
-
from docforge.query_log import log_query
|
|
205
|
-
|
|
206
242
|
request_ms = int((time.perf_counter() - start) * 1000)
|
|
207
243
|
|
|
208
|
-
# team_name and area_name remain self-declared (routing hints, not identity).
|
|
209
|
-
# user_name and user_oid come from the token when present.
|
|
210
244
|
await log_query(
|
|
211
245
|
pool,
|
|
212
246
|
user.preferred_username if user else req.user_name,
|
|
@@ -234,11 +268,12 @@ async def search(req: SearchRequest, user=Depends(_auth_dependency)) -> SearchRe
|
|
|
234
268
|
|
|
235
269
|
|
|
236
270
|
@app.get("/sources")
|
|
237
|
-
async def list_sources(
|
|
271
|
+
async def list_sources(
|
|
272
|
+
pool: asyncpg.Pool = Depends(get_pool_dep),
|
|
273
|
+
user=Depends(_auth_dependency),
|
|
274
|
+
) -> dict[str, Any]:
|
|
238
275
|
"""List all indexed documentation sources."""
|
|
239
|
-
settings = _get_settings()
|
|
240
276
|
try:
|
|
241
|
-
pool = await get_pool(settings.database_url)
|
|
242
277
|
async with pool.acquire() as conn:
|
|
243
278
|
rows = await conn.fetch(
|
|
244
279
|
"""
|