PyPI - docforge-cli - Versions diffs - 0.2.0__tar.gz → 0.3.0__tar.gz - Mend

docforge-cli 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

{docforge_cli-0.2.0/docforge_cli.egg-info → docforge_cli-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,34 +1,39 @@
 Metadata-Version: 2.4
 Name: docforge-cli
-Version: 0.2.0
+Version: 0.3.0
 Summary: Forge searchable context from Confluence and git repos for AI coding assistants
 License: MIT
+Project-URL: Homepage, https://GranatenUdo.github.io/docforge/
+Project-URL: Source, https://github.com/GranatenUdo/docforge
+Project-URL: Issues, https://github.com/GranatenUdo/docforge/issues
+Project-URL: Changelog, https://github.com/GranatenUdo/docforge/blob/master/CHANGELOG.md
+Project-URL: Documentation, https://GranatenUdo.github.io/docforge/
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: typer>=0.12
-Requires-Dist: asyncpg>=0.30
-Requires-Dist: httpx>=0.27
-Requires-Dist: pydantic>=2.9
-Requires-Dist: pydantic-settings>=2.6
-Requires-Dist: beautifulsoup4>=4.12
-Requires-Dist: sentence-transformers>=5.0
-Requires-Dist: pgvector>=0.3
-Requires-Dist: pyyaml>=6.0
-Requires-Dist: fastmcp>=2.0
-Requires-Dist: fastapi>=0.115
-Requires-Dist: uvicorn>=0.34
-Requires-Dist: numpy>=1.26
+Requires-Dist: typer<1.0,>=0.12
+Requires-Dist: asyncpg<1.0,>=0.30
+Requires-Dist: httpx<1.0,>=0.27
+Requires-Dist: pydantic<3.0,>=2.9
+Requires-Dist: pydantic-settings<3.0,>=2.6
+Requires-Dist: beautifulsoup4<5.0,>=4.12
+Requires-Dist: sentence-transformers<6.0,>=5.0
+Requires-Dist: pgvector<1.0,>=0.3
+Requires-Dist: pyyaml<7.0,>=6.0
+Requires-Dist: fastmcp<4.0,>=3.0
+Requires-Dist: fastapi<1.0,>=0.115
+Requires-Dist: uvicorn<1.0,>=0.34
+Requires-Dist: numpy<3.0,>=1.26
 Provides-Extra: dev
-Requires-Dist: pytest>=8.0; extra == "dev"
-Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
-Requires-Dist: pytest-cov>=7.0; extra == "dev"
-Requires-Dist: ruff>=0.8; extra == "dev"
-Requires-Dist: testcontainers[postgres]>=4.0; extra == "dev"
+Requires-Dist: pytest<10.0,>=9.0; extra == "dev"
+Requires-Dist: pytest-asyncio<2.0,>=1.0; extra == "dev"
+Requires-Dist: pytest-cov<8.0,>=7.0; extra == "dev"
+Requires-Dist: ruff<1.0,>=0.8; extra == "dev"
+Requires-Dist: testcontainers[postgres]<5.0,>=4.0; extra == "dev"
 Provides-Extra: entra
-Requires-Dist: fastapi-azure-auth>=5.0; extra == "entra"
-Requires-Dist: azure-identity>=1.19; extra == "entra"
-Requires-Dist: aiohttp>=3.10; extra == "entra"
+Requires-Dist: fastapi-azure-auth<6.0,>=5.0; extra == "entra"
+Requires-Dist: azure-identity<2.0,>=1.19; extra == "entra"
+Requires-Dist: aiohttp<4.0,>=3.10; extra == "entra"
 Dynamic: license-file
 # docforge
@@ -78,15 +83,22 @@ docforge is the narrow, focused option in this landscape: minimal footprint, MCP
 - You need near-real-time updates → ingest is batch; no webhook-driven continuous sync yet.
 - You need multilingual search evaluated → EmbeddingGemma is multilingual, but docforge has no eval coverage on non-English corpora yet.
+For the full trust model, accepted risks, and assumptions docforge makes about its operating environment, see [`docs/threat-model.md`](docs/threat-model.md).
 ## Quick Start
+**Prerequisites:**
+- Python 3.12+
+- Docker (for the local Postgres + pgvector container)
+- A [Hugging Face token](https://huggingface.co/settings/tokens) with access to the gated [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m) model. Accept the model license on the model page first.
 ```bash
 pip install docforge-cli
 docforge init my-project
 cd my-project
 # Edit docforge.yml with your Confluence URL
 # Edit sources.yml with your page IDs and local git repo paths
-# Edit .env with your credentials
+# Edit .env with your credentials (CONFLUENCE_API_TOKEN, HF_TOKEN, DATABASE_URL)
 docker compose up -d db
 docforge init-db
 docforge ingest
@@ -121,15 +133,69 @@ When an AI assistant needs cross-team context, it calls docforge's `search_docum
 ## Deploy to your infrastructure
-For team-wide use, deploy the search API to Azure (~$35/month at default SKUs):
+For team-wide use, deploy the search API to Azure (~$90/month at default SKUs with embedder always-on for production; ~$55/month with the default scale-to-zero embedder):
 - PostgreSQL Flexible Server (Burstable B1ms, 32 GB) with pgvector.
 - Container App running the FastAPI search API.
-- Container Registry, Key Vault, Log Analytics, managed environment.
+- Container App running the embedder service (EmbeddingGemma-300M, model baked into the image).
+- Container Registry (Standard), Key Vault, Log Analytics, managed environment.
 - Team members use a lightweight MCP client that calls the hosted API.
 See [`deploy/azure/`](deploy/azure/) for Bicep templates and a full cost breakdown.
+## Self-hosting / forking
+The embedder image bakes the EmbeddingGemma-300M model at build time,
+which requires a HuggingFace access token. Forks and adopters need to:
+1. Get an HF token at https://huggingface.co/settings/tokens.
+2. Accept the EmbeddingGemma license at
+   https://huggingface.co/google/embeddinggemma-300m.
+3. Add a repo secret `HF_TOKEN` under
+   `Settings → Secrets and variables → Actions`.
+The CI workflow forwards the secret to BuildKit via
+`--mount=type=secret,id=hf_token`; the token never enters any image
+layer. If you fork this repo and run the CI workflow, it will build the
+embedder image automatically on commits to `master` and PRs (without
+pushing unless on `master`). To enable pushes to a registry, also add
+secrets `ACR_LOGIN_SERVER`, `ACR_USERNAME`, and `ACR_PASSWORD`.
+## Upgrading the embedding model
+The dimension-mismatch guard in `RemoteEmbedder` makes an
+embedder/search API mismatch loud (`HTTP 503` with a clear log line)
+rather than silent. Upgrade procedure:
+1. **Pick the new model.** Note its output dimensionality `D` (e.g.
+   `768` for EmbeddingGemma, `1024` for many newer models).
+2. **Update config.** Set `embedding_model: <new>` and
+   `embedding_dimensions: D` in the search API's deployment config
+   (Bicep parameters + Key Vault, or `docforge.yml` for self-hosters).
+3. **Build the embedder image** with the new model:
+   ```bash
+   docker build \
+     --build-arg EMBEDDING_MODEL=<new> \
+     --secret id=hf_token,env=HF_TOKEN \
+     -f Dockerfile.embedder \
+     -t docforge-embedder:<tag> .
+   ```
+4. **Apply schema migration.** Add a new vector column:
+   ```sql
+   ALTER TABLE chunks ADD COLUMN embedding_new vector(D);
+   ```
+   Re-ingest to populate the new column. Until backfill completes, the
+   search API serves from the old column.
+5. **Cut over.** Deploy the new embedder image first, then the new
+   search API. The dim-mismatch guard ensures search refuses to serve
+   wrong-dim vectors.
+6. **Drop the old column** after a confidence interval.
 ## Configuration
 See `docs/` for the full configuration reference, including `docforge.yml` and `sources.yml` schemas.
@@ -140,34 +206,41 @@ Contributions welcome. See [`CONTRIBUTING.md`](CONTRIBUTING.md) for development
 ## Evaluation & retrieval quality
-docforge ships with a retrieval-quality eval harness at [`docforge/scripts/eval_search.py`](docforge/scripts/eval_search.py). It measures recall@1, recall@k, and MRR against a ground-truth query set you maintain. The harness is designed for **drift detection** — run it after `sources.yml` changes, embedding-model updates, or ranking tweaks, and compare against your baseline. There is no absolute quality threshold; the metric magnitude depends on how closely your ground-truth queries match source titles. See [`docforge/scripts/README.md`](docforge/scripts/README.md) for details.
+docforge ships with a retrieval-quality eval harness at [`src/docforge/scripts/eval_search.py`](src/docforge/scripts/eval_search.py). It measures recall@1, recall@k, and MRR against a ground-truth query set you maintain. The harness is designed for **drift detection** — run it after `sources.yml` changes, embedding-model updates, or ranking tweaks, and compare against your baseline. There is no absolute quality threshold; the metric magnitude depends on how closely your ground-truth queries match source titles. See [`src/docforge/scripts/README.md`](src/docforge/scripts/README.md) for details.
 ## FAQ
-### "Cannot connect to PostgreSQL"
-Check that the database is running: `docker compose up -d db`. Verify `DATABASE_URL` in `.env` points to `postgresql://docforge:localdev@localhost:5432/docforge` (or your custom value).
+The three install-time issues new users hit most often are inline below. The
+full FAQ — including "no results found", "ingest skipped everything", removing
+sources, swapping embedding models, and where to file issues — lives on the
+[microsite FAQ](https://GranatenUdo.github.io/docforge/faq/).
 ### "HF_TOKEN required" or model download fails
 The embedding model `google/embeddinggemma-300m` requires a Hugging Face token with access to the gated model. Create one at https://huggingface.co/settings/tokens, accept the model license at https://huggingface.co/google/embeddinggemma-300m, and set `HF_TOKEN=hf_...` in `.env`.
-### "No results found" after ingest
-Run `docforge status` to confirm sources and chunks exist. If counts are zero, check the ingest logs for per-source failures — the summary at the end lists sources that failed.
 ### First ingest / first container start is very slow
 The first run downloads the 300M embedding model (~1.2 GB) from Hugging Face. Locally, the model is cached at `~/.cache/huggingface/`. In the Docker image, it is cached at `/app/.cache/huggingface/` — **mount this as a volume** so container restarts do not re-download: `docker run -v docforge-hf-cache:/app/.cache/huggingface ...`.
-### "Ingest skipped everything"
+### "Cannot connect to PostgreSQL"
-docforge skips sources whose `content_hash` matches the stored hash (no changes detected). To force re-ingest, clear the hash: `UPDATE sources SET content_hash = NULL;` then run `docforge ingest`.
+Check that the database is running: `docker compose up -d db`. Verify `DATABASE_URL` in `.env` points to `postgresql://docforge:localdev@localhost:5432/docforge` (or your custom value).
 ## License
 MIT. See [LICENSE](LICENSE).
+## License compatibility
+docforge is MIT-licensed; the default embedding model,
+[EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m), is
+distributed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms),
+which restrict harmful use and building products that compete with Gemma. Swap
+to a permissively-licensed alternative via `embedding_model` in `docforge.yml`
+if those constraints don't fit your use case (see
+[microsite FAQ — Can I use a different embedding model?](https://GranatenUdo.github.io/docforge/faq/#can-i-use-a-different-embedding-model)).
 ## Credits
 docforge stands on open shoulders:

docforge_cli-0.2.0/PKG-INFO → docforge_cli-0.3.0/README.md RENAMED Viewed

@@ -1,36 +1,3 @@
-Metadata-Version: 2.4
-Name: docforge-cli
-Version: 0.2.0
-Summary: Forge searchable context from Confluence and git repos for AI coding assistants
-License: MIT
-Requires-Python: >=3.12
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: typer>=0.12
-Requires-Dist: asyncpg>=0.30
-Requires-Dist: httpx>=0.27
-Requires-Dist: pydantic>=2.9
-Requires-Dist: pydantic-settings>=2.6
-Requires-Dist: beautifulsoup4>=4.12
-Requires-Dist: sentence-transformers>=5.0
-Requires-Dist: pgvector>=0.3
-Requires-Dist: pyyaml>=6.0
-Requires-Dist: fastmcp>=2.0
-Requires-Dist: fastapi>=0.115
-Requires-Dist: uvicorn>=0.34
-Requires-Dist: numpy>=1.26
-Provides-Extra: dev
-Requires-Dist: pytest>=8.0; extra == "dev"
-Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
-Requires-Dist: pytest-cov>=7.0; extra == "dev"
-Requires-Dist: ruff>=0.8; extra == "dev"
-Requires-Dist: testcontainers[postgres]>=4.0; extra == "dev"
-Provides-Extra: entra
-Requires-Dist: fastapi-azure-auth>=5.0; extra == "entra"
-Requires-Dist: azure-identity>=1.19; extra == "entra"
-Requires-Dist: aiohttp>=3.10; extra == "entra"
-Dynamic: license-file
 # docforge
 **The self-hosted context engine for AI coding assistants.**
@@ -78,15 +45,22 @@ docforge is the narrow, focused option in this landscape: minimal footprint, MCP
 - You need near-real-time updates → ingest is batch; no webhook-driven continuous sync yet.
 - You need multilingual search evaluated → EmbeddingGemma is multilingual, but docforge has no eval coverage on non-English corpora yet.
+For the full trust model, accepted risks, and assumptions docforge makes about its operating environment, see [`docs/threat-model.md`](docs/threat-model.md).
 ## Quick Start
+**Prerequisites:**
+- Python 3.12+
+- Docker (for the local Postgres + pgvector container)
+- A [Hugging Face token](https://huggingface.co/settings/tokens) with access to the gated [EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m) model. Accept the model license on the model page first.
 ```bash
 pip install docforge-cli
 docforge init my-project
 cd my-project
 # Edit docforge.yml with your Confluence URL
 # Edit sources.yml with your page IDs and local git repo paths
-# Edit .env with your credentials
+# Edit .env with your credentials (CONFLUENCE_API_TOKEN, HF_TOKEN, DATABASE_URL)
 docker compose up -d db
 docforge init-db
 docforge ingest
@@ -121,15 +95,69 @@ When an AI assistant needs cross-team context, it calls docforge's `search_docum
 ## Deploy to your infrastructure
-For team-wide use, deploy the search API to Azure (~$35/month at default SKUs):
+For team-wide use, deploy the search API to Azure (~$90/month at default SKUs with embedder always-on for production; ~$55/month with the default scale-to-zero embedder):
 - PostgreSQL Flexible Server (Burstable B1ms, 32 GB) with pgvector.
 - Container App running the FastAPI search API.
-- Container Registry, Key Vault, Log Analytics, managed environment.
+- Container App running the embedder service (EmbeddingGemma-300M, model baked into the image).
+- Container Registry (Standard), Key Vault, Log Analytics, managed environment.
 - Team members use a lightweight MCP client that calls the hosted API.
 See [`deploy/azure/`](deploy/azure/) for Bicep templates and a full cost breakdown.
+## Self-hosting / forking
+The embedder image bakes the EmbeddingGemma-300M model at build time,
+which requires a HuggingFace access token. Forks and adopters need to:
+1. Get an HF token at https://huggingface.co/settings/tokens.
+2. Accept the EmbeddingGemma license at
+   https://huggingface.co/google/embeddinggemma-300m.
+3. Add a repo secret `HF_TOKEN` under
+   `Settings → Secrets and variables → Actions`.
+The CI workflow forwards the secret to BuildKit via
+`--mount=type=secret,id=hf_token`; the token never enters any image
+layer. If you fork this repo and run the CI workflow, it will build the
+embedder image automatically on commits to `master` and PRs (without
+pushing unless on `master`). To enable pushes to a registry, also add
+secrets `ACR_LOGIN_SERVER`, `ACR_USERNAME`, and `ACR_PASSWORD`.
+## Upgrading the embedding model
+The dimension-mismatch guard in `RemoteEmbedder` makes an
+embedder/search API mismatch loud (`HTTP 503` with a clear log line)
+rather than silent. Upgrade procedure:
+1. **Pick the new model.** Note its output dimensionality `D` (e.g.
+   `768` for EmbeddingGemma, `1024` for many newer models).
+2. **Update config.** Set `embedding_model: <new>` and
+   `embedding_dimensions: D` in the search API's deployment config
+   (Bicep parameters + Key Vault, or `docforge.yml` for self-hosters).
+3. **Build the embedder image** with the new model:
+   ```bash
+   docker build \
+     --build-arg EMBEDDING_MODEL=<new> \
+     --secret id=hf_token,env=HF_TOKEN \
+     -f Dockerfile.embedder \
+     -t docforge-embedder:<tag> .
+   ```
+4. **Apply schema migration.** Add a new vector column:
+   ```sql
+   ALTER TABLE chunks ADD COLUMN embedding_new vector(D);
+   ```
+   Re-ingest to populate the new column. Until backfill completes, the
+   search API serves from the old column.
+5. **Cut over.** Deploy the new embedder image first, then the new
+   search API. The dim-mismatch guard ensures search refuses to serve
+   wrong-dim vectors.
+6. **Drop the old column** after a confidence interval.
 ## Configuration
 See `docs/` for the full configuration reference, including `docforge.yml` and `sources.yml` schemas.
@@ -140,34 +168,41 @@ Contributions welcome. See [`CONTRIBUTING.md`](CONTRIBUTING.md) for development
 ## Evaluation & retrieval quality
-docforge ships with a retrieval-quality eval harness at [`docforge/scripts/eval_search.py`](docforge/scripts/eval_search.py). It measures recall@1, recall@k, and MRR against a ground-truth query set you maintain. The harness is designed for **drift detection** — run it after `sources.yml` changes, embedding-model updates, or ranking tweaks, and compare against your baseline. There is no absolute quality threshold; the metric magnitude depends on how closely your ground-truth queries match source titles. See [`docforge/scripts/README.md`](docforge/scripts/README.md) for details.
+docforge ships with a retrieval-quality eval harness at [`src/docforge/scripts/eval_search.py`](src/docforge/scripts/eval_search.py). It measures recall@1, recall@k, and MRR against a ground-truth query set you maintain. The harness is designed for **drift detection** — run it after `sources.yml` changes, embedding-model updates, or ranking tweaks, and compare against your baseline. There is no absolute quality threshold; the metric magnitude depends on how closely your ground-truth queries match source titles. See [`src/docforge/scripts/README.md`](src/docforge/scripts/README.md) for details.
 ## FAQ
-### "Cannot connect to PostgreSQL"
-Check that the database is running: `docker compose up -d db`. Verify `DATABASE_URL` in `.env` points to `postgresql://docforge:localdev@localhost:5432/docforge` (or your custom value).
+The three install-time issues new users hit most often are inline below. The
+full FAQ — including "no results found", "ingest skipped everything", removing
+sources, swapping embedding models, and where to file issues — lives on the
+[microsite FAQ](https://GranatenUdo.github.io/docforge/faq/).
 ### "HF_TOKEN required" or model download fails
 The embedding model `google/embeddinggemma-300m` requires a Hugging Face token with access to the gated model. Create one at https://huggingface.co/settings/tokens, accept the model license at https://huggingface.co/google/embeddinggemma-300m, and set `HF_TOKEN=hf_...` in `.env`.
-### "No results found" after ingest
-Run `docforge status` to confirm sources and chunks exist. If counts are zero, check the ingest logs for per-source failures — the summary at the end lists sources that failed.
 ### First ingest / first container start is very slow
 The first run downloads the 300M embedding model (~1.2 GB) from Hugging Face. Locally, the model is cached at `~/.cache/huggingface/`. In the Docker image, it is cached at `/app/.cache/huggingface/` — **mount this as a volume** so container restarts do not re-download: `docker run -v docforge-hf-cache:/app/.cache/huggingface ...`.
-### "Ingest skipped everything"
+### "Cannot connect to PostgreSQL"
-docforge skips sources whose `content_hash` matches the stored hash (no changes detected). To force re-ingest, clear the hash: `UPDATE sources SET content_hash = NULL;` then run `docforge ingest`.
+Check that the database is running: `docker compose up -d db`. Verify `DATABASE_URL` in `.env` points to `postgresql://docforge:localdev@localhost:5432/docforge` (or your custom value).
 ## License
 MIT. See [LICENSE](LICENSE).
+## License compatibility
+docforge is MIT-licensed; the default embedding model,
+[EmbeddingGemma-300M](https://huggingface.co/google/embeddinggemma-300m), is
+distributed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms),
+which restrict harmful use and building products that compete with Gemma. Swap
+to a permissively-licensed alternative via `embedding_model` in `docforge.yml`
+if those constraints don't fit your use case (see
+[microsite FAQ — Can I use a different embedding model?](https://GranatenUdo.github.io/docforge/faq/#can-i-use-a-different-embedding-model)).
 ## Credits
 docforge stands on open shoulders:

docforge_cli-0.3.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,81 @@
+[build-system]
+requires = ["setuptools>=75.0"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "docforge-cli"
+version = "0.3.0"
+description = "Forge searchable context from Confluence and git repos for AI coding assistants"
+readme = "README.md"
+license = {text = "MIT"}
+requires-python = ">=3.12"
+dependencies = [
+    "typer>=0.12,<1.0",
+    "asyncpg>=0.30,<1.0",
+    "httpx>=0.27,<1.0",
+    "pydantic>=2.9,<3.0",
+    "pydantic-settings>=2.6,<3.0",
+    "beautifulsoup4>=4.12,<5.0",
+    "sentence-transformers>=5.0,<6.0",
+    "pgvector>=0.3,<1.0",
+    "pyyaml>=6.0,<7.0",
+    "fastmcp>=3.0,<4.0",
+    "fastapi>=0.115,<1.0",
+    "uvicorn>=0.34,<1.0",
+    "numpy>=1.26,<3.0",   # both 1.x and 2.x tested
+]
+[project.urls]
+Homepage = "https://GranatenUdo.github.io/docforge/"
+Source = "https://github.com/GranatenUdo/docforge"
+Issues = "https://github.com/GranatenUdo/docforge/issues"
+Changelog = "https://github.com/GranatenUdo/docforge/blob/master/CHANGELOG.md"
+Documentation = "https://GranatenUdo.github.io/docforge/"
+[project.scripts]
+docforge = "docforge.cli:app"
+[project.optional-dependencies]
+dev = [
+    "pytest>=9.0,<10.0",
+    "pytest-asyncio>=1.0,<2.0",
+    "pytest-cov>=7.0,<8.0",
+    "ruff>=0.8,<1.0",
+    "testcontainers[postgres]>=4.0,<5.0",
+]
+entra = [
+    "fastapi-azure-auth>=5.0,<6.0",
+    "azure-identity>=1.19,<2.0",
+    # aiohttp is required by azure-identity.aio's async pipeline
+    "aiohttp>=3.10,<4.0",
+]
+[tool.setuptools.packages.find]
+where = ["src"]
+include = ["docforge*"]
+[tool.setuptools.package-data]
+docforge = ["templates/**/*", "sql/**/*"]
+[tool.ruff]
+line-length = 100
+target-version = "py312"
+[tool.ruff.lint]
+select = ["E", "F", "I", "W"]
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+testpaths = ["tests"]
+markers = [
+    "integration: tests requiring real external resources (Docker for Postgres, network for embedding model)",
+]
+addopts = "--cov=src/docforge"
+[tool.coverage.report]
+fail_under = 60
+exclude_also = [
+    "if __name__ == \"__main__\":",
+    "pragma: no cover",
+    "raise NotImplementedError",
+]

docforge-cli 0.2.0__tar.gz → 0.3.0__tar.gz

docforge-cli 0.2.0__tar.gz → 0.3.0__tar.gz