docforge-cli 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. docforge/__init__.py +0 -0
  2. docforge/__main__.py +5 -0
  3. docforge/api.py +266 -0
  4. docforge/cli.py +296 -0
  5. docforge/config.py +99 -0
  6. docforge/crawlers/__init__.py +1 -0
  7. docforge/crawlers/confluence.py +109 -0
  8. docforge/crawlers/git.py +79 -0
  9. docforge/db.py +57 -0
  10. docforge/ingest.py +401 -0
  11. docforge/lint.py +92 -0
  12. docforge/mcp_server.py +188 -0
  13. docforge/processors/__init__.py +1 -0
  14. docforge/processors/chunker.py +141 -0
  15. docforge/processors/embedder.py +78 -0
  16. docforge/processors/parser.py +143 -0
  17. docforge/query_log.py +45 -0
  18. docforge/ranking.py +20 -0
  19. docforge/scripts/__init__.py +1 -0
  20. docforge/scripts/eval_search.py +226 -0
  21. docforge/scripts/latency_report.py +142 -0
  22. docforge/sources.py +46 -0
  23. docforge/sql/migrations/001_add_source_identifier.sql +3 -0
  24. docforge/sql/migrations/002_add_status_index.sql +1 -0
  25. docforge/sql/migrations/003_add_source_tags.sql +4 -0
  26. docforge/sql/migrations/004_add_query_log.sql +11 -0
  27. docforge/sql/migrations/005_add_query_log_user_oid.sql +2 -0
  28. docforge/sql/migrations/006_add_query_log_request_ms.sql +1 -0
  29. docforge/sql/schema.sql +29 -0
  30. docforge/templates/docforge.yml +11 -0
  31. docforge/templates/docker-compose.yml +14 -0
  32. docforge/templates/mcp_client.py +83 -0
  33. docforge/templates/sources.yml +21 -0
  34. docforge_cli-0.2.0.dist-info/METADATA +178 -0
  35. docforge_cli-0.2.0.dist-info/RECORD +39 -0
  36. docforge_cli-0.2.0.dist-info/WHEEL +5 -0
  37. docforge_cli-0.2.0.dist-info/entry_points.txt +2 -0
  38. docforge_cli-0.2.0.dist-info/licenses/LICENSE +21 -0
  39. docforge_cli-0.2.0.dist-info/top_level.txt +1 -0
docforge/__init__.py ADDED
File without changes
docforge/__main__.py ADDED
@@ -0,0 +1,5 @@
1
"""Module entrypoint — `python -m docforge` dispatches to the Typer app."""

from docforge.cli import app

# Guarded so that importing docforge.__main__ (e.g. from tooling or tests)
# does not launch the CLI; `python -m docforge` still runs it, because the
# -m runner executes this module with __name__ == "__main__".
if __name__ == "__main__":
    app()
docforge/api.py ADDED
@@ -0,0 +1,266 @@
1
+ """FastAPI search API for docforge.
2
+
3
+ Runs on Azure Container Apps. Loads embedding model at startup,
4
+ serves search queries over HTTP.
5
+
6
+ Run locally: uvicorn docforge.api:app --reload
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import asyncio
12
+ import logging
13
+ import time
14
+ from contextlib import asynccontextmanager
15
+ from typing import Any
16
+
17
+ import numpy as np
18
+ from fastapi import Depends, FastAPI, HTTPException, Request
19
+ from fastapi.security import SecurityScopes
20
+ from pydantic import BaseModel
21
+
22
+ from docforge.config import Settings
23
+ from docforge.db import close_pool, get_pool
24
+ from docforge.processors.embedder import Embedder
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
# Process-wide singletons, populated during application startup (lifespan).
_embedder: Embedder | None = None  # embedding model; None until lifespan loads it
_settings: Settings | None = None  # lazily constructed by _get_settings()
_azure_scheme = None  # Populated in lifespan when auth.mode == "entra"
_cleanup_task: asyncio.Task | None = None  # background query_log retention loop

_CLEANUP_INTERVAL_SECONDS = 3600  # one hour — overridable in tests
34
+
35
+
36
async def _query_log_cleanup_loop(database_url: str, retention_days: int) -> None:
    """Periodically purge old rows from query_log.

    Every _CLEANUP_INTERVAL_SECONDS, deletes rows older than ``retention_days``
    days. The DELETE is idempotent, so multi-replica deployments are safe.
    Failures are logged and the loop continues — the next tick retries.
    """
    # int() coercion makes the f-string SQL below injection-safe. asyncpg's
    # $1::interval parameter binding doesn't accept str, hence the literal.
    days = int(retention_days)
    while True:
        try:
            pool = await get_pool(database_url)
            async with pool.acquire() as conn:
                result = await conn.execute(
                    f"DELETE FROM query_log WHERE created_at < now() - interval '{days} days'"
                )
                logger.info("query_log cleanup: %s", result)
        except Exception as e:
            # Broad catch is deliberate: a transient DB outage must not kill
            # this background task for the rest of the process lifetime.
            logger.exception("query_log cleanup failed: %s", e)
        await asyncio.sleep(_CLEANUP_INTERVAL_SECONDS)
53
+
54
+
55
def _get_settings() -> Settings:
    """Lazily build and cache the process-wide Settings instance."""
    global _settings
    if _settings is not None:
        return _settings
    _settings = Settings()
    return _settings
60
+
61
+
62
def _build_auth_scheme(settings: Settings):
    """Return a SingleTenantAzureAuthorizationCodeBearer if mode==entra, else None."""
    if settings.auth.mode != "entra":
        return None
    # Imported lazily so the dependency is only required when Entra auth is on.
    from fastapi_azure_auth import SingleTenantAzureAuthorizationCodeBearer

    audience = settings.auth.audience
    return SingleTenantAzureAuthorizationCodeBearer(
        app_client_id=audience.removeprefix("api://"),
        tenant_id=settings.auth.tenant_id,
        scopes={f"{audience}/search": "Search docforge"},
    )
74
+
75
+
76
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan.

    Startup: build the (optional) Entra auth scheme, load the embedding
    model, and launch the query_log retention task.
    Shutdown: cancel the retention task and close the DB pool.
    """
    global _embedder, _azure_scheme, _cleanup_task
    settings = _get_settings()
    _azure_scheme = _build_auth_scheme(settings)
    if _azure_scheme is not None:
        # Pre-fetch the OpenID configuration so the first authenticated
        # request doesn't pay the discovery round-trip.
        await _azure_scheme.openid_config.load_config()
        logger.info(
            "Entra auth enabled (tenant=%s, audience=%s)",
            settings.auth.tenant_id,
            settings.auth.audience,
        )
    logger.info("Loading embedding model...")
    _embedder = Embedder(settings.embedding_model, hf_token=settings.hf_token.get_secret_value())
    logger.info("Model loaded: %s (%dd)", _embedder.model_name, _embedder.dimensions)

    # Background retention cleanup; runs for the life of the app.
    _cleanup_task = asyncio.create_task(
        _query_log_cleanup_loop(settings.database_url, settings.query_log_retention_days)
    )

    yield

    # Shutdown: cancel the loop and swallow only the expected CancelledError.
    if _cleanup_task is not None:
        _cleanup_task.cancel()
        try:
            await _cleanup_task
        except asyncio.CancelledError:
            pass
    await close_pool()
106
+
107
+
108
# Module-level ASGI application; uvicorn targets `docforge.api:app`.
app = FastAPI(title="docforge", lifespan=lifespan)
109
+
110
+
111
async def _auth_dependency(request: Request):
    """Validate the bearer token when Entra auth is enabled.

    Returns the authenticated User under auth.mode=entra, None otherwise.
    """
    scheme = _azure_scheme
    if scheme is None:
        return None
    # Empty SecurityScopes: we don't enforce scope-level authorization beyond
    # the token validation the scheme itself does. Without this arg the call
    # signature mismatches what fastapi-azure-auth expects.
    return await scheme(request, SecurityScopes())
119
+
120
+
121
class SearchRequest(BaseModel):
    # Request body for POST /search.
    query: str
    user_name: str  # self-declared; overridden by the token identity when auth is on
    team_name: str  # routing hint used for tag-based score boosting
    area_name: str | None = None  # optional second boosting tag
    limit: int = 5  # max results to return


class SearchResult(BaseModel):
    # One matched chunk plus its source metadata.
    text: str
    section_title: str | None
    source_title: str
    source_url: str
    source_tags: list[str]
    similarity: float  # 1 - cosine distance (pgvector <=>)


class SearchResponse(BaseModel):
    results: list[SearchResult]
    query: str  # echoed back for client-side correlation
    count: int  # len(results)
142
+
143
+
144
@app.get("/health")
async def health() -> dict[str, Any]:
    """Health check endpoint: reports whether the embedding model is loaded."""
    model_name = _embedder.model_name if _embedder else "not loaded"
    return {"status": "ok", "model": model_name}
151
+
152
+
153
@app.post("/search", response_model=SearchResponse)
async def search(req: SearchRequest, user=Depends(_auth_dependency)) -> SearchResponse:
    """Search indexed documentation by semantic similarity.

    Embeds the query, runs a pgvector cosine search with tag-based score
    boosting, logs the query, and returns the top ``req.limit`` chunks.

    Raises:
        HTTPException: 503 when the model isn't loaded or the DB is down;
            500 when embedding the query fails.
    """
    start = time.perf_counter()
    if not _embedder:
        raise HTTPException(status_code=503, detail="Embedding model not loaded yet")

    try:
        query_vector = _embedder.embed_query(req.query)
    except Exception as e:
        logger.error("Embedding failed: %s", e)
        # `from e` keeps the original traceback chained for debugging (B904).
        raise HTTPException(status_code=500, detail="Failed to embed query") from e

    settings = _get_settings()
    # Tags used for score boosting: team always, area when provided.
    user_tags = [req.team_name] + ([req.area_name] if req.area_name else [])

    try:
        pool = await get_pool(settings.database_url)
        async with pool.acquire() as conn:
            rows = await conn.fetch(
                """
                SELECT
                    c.text,
                    c.section_title,
                    s.title AS source_title,
                    s.url AS source_url,
                    s.tags AS source_tags,
                    1 - (c.embedding <=> $1::vector) AS similarity,
                    (1 - (c.embedding <=> $1::vector)) *
                    (1
                     + $2::float * cardinality(
                         ARRAY(SELECT unnest(s.tags) INTERSECT SELECT unnest($3::text[]))
                     )
                     + $4::float * (CASE WHEN 'org' = ANY(s.tags) THEN 1 ELSE 0 END)
                    ) AS boosted_score
                FROM chunks c
                JOIN sources s ON c.source_id = s.id
                WHERE s.status = 'active'
                ORDER BY boosted_score DESC
                LIMIT $5
                """,
                np.array(query_vector, dtype=np.float32),
                settings.tag_match_weight,
                user_tags,
                settings.org_tag_weight,
                req.limit,
            )
    except Exception as e:
        logger.error("Database error during search: %s", e)
        # Chain the cause so the real DB error survives in logs/tracebacks.
        raise HTTPException(status_code=503, detail="Database unavailable") from e

    from docforge.query_log import log_query

    request_ms = int((time.perf_counter() - start) * 1000)

    # team_name and area_name remain self-declared (routing hints, not identity).
    # user_name and user_oid come from the token when present.
    await log_query(
        pool,
        user.preferred_username if user else req.user_name,
        req.team_name,
        req.area_name,
        req.query,
        len(rows),
        user_oid=user.oid if user else None,
        request_ms=request_ms,
    )

    results = [
        SearchResult(
            text=row["text"],
            section_title=row["section_title"],
            source_title=row["source_title"],
            source_url=row["source_url"],
            source_tags=list(row["source_tags"] or []),
            similarity=float(row["similarity"]),
        )
        for row in rows
    ]

    return SearchResponse(results=results, query=req.query, count=len(results))
234
+
235
+
236
@app.get("/sources")
async def list_sources(user=Depends(_auth_dependency)) -> dict[str, Any]:
    """List all indexed documentation sources with their chunk counts.

    Raises:
        HTTPException: 503 when the database is unreachable.
    """
    settings = _get_settings()
    try:
        pool = await get_pool(settings.database_url)
        async with pool.acquire() as conn:
            rows = await conn.fetch(
                """
                SELECT title, url, status, last_crawled_at,
                       (SELECT count(*) FROM chunks WHERE source_id = s.id) AS chunk_count
                FROM sources s
                ORDER BY title
                """
            )
    except Exception as e:
        logger.error("Database error listing sources: %s", e)
        # `from e` chains the real DB error into the HTTP error (B904).
        raise HTTPException(status_code=503, detail="Database unavailable") from e

    return {
        "count": len(rows),
        "sources": [
            {
                "title": row["title"],
                "url": row["url"],
                "status": row["status"],
                "chunk_count": row["chunk_count"],
            }
            for row in rows
        ],
    }
docforge/cli.py ADDED
@@ -0,0 +1,296 @@
1
+ """docforge CLI — forge searchable context from documentation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ from pathlib import Path
8
+
9
+ import typer
10
+
11
# Root Typer application; commands below register themselves via @app.command.
app = typer.Typer(
    help="Forge searchable context from Confluence and git repos for AI coding assistants.",
)
14
+
15
+
16
@app.command()
def init(name: str = typer.Argument(help="Project directory name")):
    """Scaffold a new docforge project with config templates."""
    target = Path(name)
    if target.exists():
        typer.echo(f"Error: directory '{name}' already exists.", err=True)
        raise typer.Exit(1)

    import importlib.resources as resources

    templates_dir = resources.files("docforge") / "templates"
    target.mkdir(parents=True)

    for item in templates_dir.iterdir():
        dest = target / item.name
        # BUGFIX: hasattr(item, "read_bytes") was also true for directories
        # (pathlib.Path defines read_bytes unconditionally), so a subdirectory
        # under templates/ would raise IsADirectoryError. Traversable.is_file()
        # is the correct check and simply skips non-files.
        if item.is_file():
            dest.write_bytes(item.read_bytes())
            typer.echo(f" Created {dest}")

    typer.echo(f"\nProject scaffolded in {target}/")
    typer.echo("Next steps:")
    typer.echo(f" cd {name}")
    typer.echo(" # Edit docforge.yml with your Confluence URL")
    typer.echo(" # Edit sources.yml with your page IDs")
    typer.echo(" # Edit .env with your credentials")
    typer.echo(" docker compose up -d db")
    typer.echo(" docforge init-db")
    typer.echo(" docforge ingest")
    typer.echo(" docforge serve")
45
+
46
+
47
@app.command(name="init-db")
def init_db():
    """Initialize the database schema."""
    # Typer commands are sync; bridge into the async implementation here.
    asyncio.run(_init_db())
51
+
52
+
53
@app.command()
def ingest(
    purge_orphans: bool = typer.Option(
        False,
        "--purge-orphans",
        help="Report DB sources absent from sources.yml. Dry-run; use --confirm to delete.",
    ),
    confirm: bool = typer.Option(
        False,
        "--confirm",
        help="Required alongside --purge-orphans to actually delete orphans.",
    ),
):
    """Crawl all sources, embed, and store in PostgreSQL."""
    _setup_logging()
    # --confirm without --purge-orphans is almost certainly a typo; fail fast
    # rather than silently ignoring the flag.
    if confirm and not purge_orphans:
        typer.echo("Error: --confirm only applies to --purge-orphans", err=True)
        raise typer.Exit(1)
    asyncio.run(_ingest(purge_orphans=purge_orphans, confirm=confirm))
72
+
73
+
74
@app.command()
def search(
    query: str = typer.Argument(help="Search query"),
    user_name: str = typer.Option(
        None,
        "--user",
        help="Your name (required; falls back to default_user_name setting)",
    ),
    team_name: str = typer.Option(
        None,
        "--team",
        help="Your team tag (required; falls back to default_team_name setting)",
    ),
    area_name: str = typer.Option(
        None,
        "--area",
        help="Your area tag (optional; falls back to default_area_name setting)",
    ),
    limit: int = typer.Option(5, help="Max results"),
):
    """Search the documentation index."""
    _setup_logging()
    from docforge.config import Settings

    settings = Settings()
    resolved_user = user_name or settings.default_user_name
    resolved_team = team_name or settings.default_team_name
    # Empty-string defaults collapse to None: area is genuinely optional.
    resolved_area = area_name or settings.default_area_name or None

    # user and team are mandatory, via flag or configured default.
    required = (
        (resolved_user, "Error: --user is required (or set default_user_name in docforge.yml)."),
        (resolved_team, "Error: --team is required (or set default_team_name in docforge.yml)."),
    )
    for value, message in required:
        if not value:
            typer.echo(message, err=True)
            raise typer.Exit(1)

    asyncio.run(_search(query, resolved_user, resolved_team, resolved_area, limit))
117
+
118
+
119
@app.command()
def serve(api: bool = typer.Option(False, help="Run FastAPI search API instead of MCP")):
    """Run the MCP server (or FastAPI API with --api)."""
    _setup_logging()
    # Default path: stdio MCP server for AI coding assistants.
    if not api:
        from docforge.mcp_server import mcp

        mcp.run()
        return

    # --api path: HTTP search API via uvicorn.
    import uvicorn

    from docforge.api import app as fastapi_app

    uvicorn.run(fastapi_app, host="0.0.0.0", port=8000)
133
+
134
+
135
@app.command()
def status():
    """Show index statistics and health."""
    # Typer commands are sync; bridge into the async implementation here.
    asyncio.run(_status())
139
+
140
+
141
@app.command(name="lint-docs")
def lint_docs(
    repo_path: Path = typer.Argument(..., help="Path to the repo root to lint"),
) -> None:
    """Lint a repo's README + CLAUDE.md + docs/ for banned-content rules."""
    from docforge.lint import format_report, lint_repo

    if not repo_path.is_dir():
        typer.echo(f"Error: {repo_path} is not a directory", err=True)
        raise typer.Exit(1)

    lint_report = lint_repo(repo_path)
    typer.echo(format_report(lint_report, repo_path))
    # Exit non-zero when any rule fired so CI can gate on it.
    if lint_report.findings:
        raise typer.Exit(1)
156
+
157
+
158
+ def _setup_logging():
159
+ logging.basicConfig(
160
+ level=logging.INFO,
161
+ format="%(asctime)s %(levelname)-8s %(name)s: %(message)s",
162
+ datefmt="%H:%M:%S",
163
+ )
164
+
165
+
166
async def _init_db():
    """Create the schema, mapping failures to CLI-friendly error messages."""
    from docforge.config import Settings
    from docforge.db import init_db as do_init_db

    settings = Settings()
    # split('@')[-1] strips the credential part of the URL before printing.
    typer.echo(f"Initializing database: {settings.database_url.split('@')[-1]}")
    try:
        await do_init_db(settings.database_url)
    except OSError as e:
        # Connection-level failures get a targeted hint.
        typer.echo(
            f"Error: Cannot connect to database. Is PostgreSQL running?\n{e}",
            err=True,
        )
        raise typer.Exit(1)
    except Exception as e:
        typer.echo(f"Error initializing database: {e}", err=True)
        raise typer.Exit(1)
    typer.echo("Database initialized successfully.")
184
+
185
+
186
async def _ingest(purge_orphans: bool = False, confirm: bool = False):
    """Run the full ingest pipeline, mapping failures to CLI-friendly errors."""
    from docforge.config import Settings
    from docforge.db import close_pool
    from docforge.ingest import ingest_all

    settings = Settings()
    try:
        await ingest_all(settings, purge_orphans=purge_orphans, confirm=confirm)
    except OSError as e:
        # Connection-level failures get a targeted hint.
        typer.echo(
            f"Error: Cannot connect to database. Is PostgreSQL running?\n{e}",
            err=True,
        )
        raise typer.Exit(1)
    except RuntimeError as e:
        # Expected pipeline errors carry their own message.
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1)
    except Exception as e:
        typer.echo(f"Error during ingest: {e}", err=True)
        raise typer.Exit(1)
    finally:
        # Always release the pool, on success or failure.
        await close_pool()
208
+
209
+
210
async def _search(query: str, user_name: str, team_name: str, area_name: str | None, limit: int):
    """Embed `query`, run the boosted pgvector search, and print the top hits.

    Mirrors the SQL in docforge.api.search: cosine similarity multiplied by
    a boost for tag overlap with the user's tags and for the 'org' tag.
    """
    import numpy as np

    from docforge.config import Settings
    from docforge.db import close_pool, get_pool
    from docforge.processors.embedder import Embedder
    from docforge.query_log import log_query

    settings = Settings()
    try:
        embedder = Embedder(settings.embedding_model, hf_token=settings.hf_token.get_secret_value())
    except RuntimeError as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1)

    query_vector = embedder.embed_query(query)
    # Tags used for score boosting: team always, area when provided.
    user_tags = [team_name] + ([area_name] if area_name else [])

    try:
        pool = await get_pool(settings.database_url)
        async with pool.acquire() as conn:
            rows = await conn.fetch(
                """
                SELECT c.text, c.section_title, s.title AS source_title,
                       s.tags AS source_tags,
                       1 - (c.embedding <=> $1::vector) AS similarity,
                       (1 - (c.embedding <=> $1::vector)) *
                       (1
                        + $2::float * cardinality(
                            ARRAY(SELECT unnest(s.tags) INTERSECT SELECT unnest($3::text[]))
                        )
                        + $4::float * (CASE WHEN 'org' = ANY(s.tags) THEN 1 ELSE 0 END)
                       ) AS boosted_score
                FROM chunks c JOIN sources s ON c.source_id = s.id
                WHERE s.status = 'active'
                ORDER BY boosted_score DESC LIMIT $5
                """,
                np.array(query_vector, dtype=np.float32),
                settings.tag_match_weight,
                user_tags,
                settings.org_tag_weight,
                limit,
            )
        await log_query(pool, user_name, team_name, area_name, query, len(rows))
    except OSError as e:
        typer.echo(
            f"Error: Cannot connect to database. Is PostgreSQL running?\n{e}",
            err=True,
        )
        raise typer.Exit(1)
    finally:
        # Always release the pool before printing results.
        await close_pool()

    if not rows:
        typer.echo("No results found.")
        return

    for i, row in enumerate(rows, 1):
        sim = row["similarity"]
        src = row["source_title"]
        sec = row["section_title"] or ""
        tags = list(row["source_tags"] or [])
        typer.echo(f"\n--- Result {i} (relevance: {sim:.2f}) --- {src}")
        if sec:
            typer.echo(f"Section: {sec}")
        if tags:
            typer.echo(f"Tags: {', '.join(tags)}")
        # Truncate long chunks for terminal readability.
        typer.echo(row["text"][:500])
278
+
279
+
280
async def _status():
    """Fetch and print row counts for sources and chunks, plus the DB host."""
    from docforge.config import Settings
    from docforge.db import close_pool, get_pool

    settings = Settings()
    try:
        pool = await get_pool(settings.database_url)
        async with pool.acquire() as conn:
            source_count = await conn.fetchval("SELECT count(*) FROM sources")
            chunk_count = await conn.fetchval("SELECT count(*) FROM chunks")
            typer.echo(f"Sources: {source_count}")
            typer.echo(f"Chunks: {chunk_count}")
            typer.echo(f"DB: {settings.database_url.split('@')[-1]}")
    except Exception as e:
        typer.echo(f"Error connecting to database: {e}", err=True)
    finally:
        await close_pool()
docforge/config.py ADDED
@@ -0,0 +1,99 @@
1
+ """Settings loading — merges defaults, docforge.yml, .env, env vars, and kwargs.
2
+
3
+ Precedence: kwargs > yml > env > .env > defaults. yml values are passed to
4
+ pydantic-settings via `super().__init__(**merged)`, which treats them as
5
+ init-kwargs (highest priority after explicit kwargs).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from pathlib import Path
11
+ from typing import Literal
12
+
13
+ import yaml
14
+ from pydantic import BaseModel, SecretStr, model_validator
15
+ from pydantic_settings import BaseSettings, SettingsConfigDict
16
+
17
+
18
class AuthSettings(BaseModel):
    """Optional Entra ID auth config; tenant_id and audience are mandatory in entra mode."""

    mode: Literal["none", "entra"] = "none"
    tenant_id: str = ""
    audience: str = ""

    @model_validator(mode="after")
    def _validate_entra_fields(self):
        # In "none" mode the remaining fields are irrelevant.
        if self.mode != "entra":
            return self
        checks = (
            (
                self.tenant_id,
                "auth.mode=entra requires auth.tenant_id to be set "
                "(via docforge.yml or AUTH__TENANT_ID env var)",
            ),
            (
                self.audience,
                "auth.mode=entra requires auth.audience to be set "
                "(via docforge.yml or AUTH__AUDIENCE env var)",
            ),
        )
        for value, message in checks:
            if not value:
                raise ValueError(message)
        return self
37
+
38
+
39
class Settings(BaseSettings):
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        env_nested_delimiter="__",  # e.g. AUTH__TENANT_ID maps to auth.tenant_id
    )

    # Database
    database_url: str = "postgresql://docforge:localdev@localhost:5432/docforge"

    # Confluence
    confluence_base_url: str = ""
    confluence_email: str = ""
    confluence_api_token: SecretStr = SecretStr("")

    # HuggingFace token for model access
    hf_token: SecretStr = SecretStr("")

    # Embedding model
    embedding_model: str = "google/embeddinggemma-300m"
    embedding_dimensions: int = 768
    chunk_max_tokens: int = 500

    # Sources config
    sources_file: str = "sources.yml"

    # Ranking weights (see docforge.ranking.compute_boosted_score)
    tag_match_weight: float = 0.1
    org_tag_weight: float = 0.05

    # Default identity (used as CLI flag defaults when set via env/yml)
    default_user_name: str = ""
    default_team_name: str = ""
    default_area_name: str = ""

    # Auth (opt-in Entra ID for /search + /sources)
    auth: AuthSettings = AuthSettings()

    # query_log retention — app-level cleanup loop deletes rows older than this
    query_log_retention_days: int = 180

    def __init__(self, **kwargs) -> None:
        """Merge docforge.yml (if present) with explicit kwargs, then delegate.

        The merged dict is passed to pydantic-settings as init-kwargs, which
        have the highest source priority — so the effective precedence is
        kwargs > yml > env > .env > defaults, per the module docstring.
        """
        # Load from docforge.yml if present, then overlay with env vars
        yml_path = Path("docforge.yml")
        yml_values = {}
        if yml_path.exists():
            with open(yml_path) as f:
                yml = yaml.safe_load(f) or {}
            # Flatten nested embedding config
            if "embedding" in yml:
                emb = yml.pop("embedding")
                if "model" in emb:
                    yml_values["embedding_model"] = emb["model"]
                if "dimensions" in emb:
                    yml_values["embedding_dimensions"] = emb["dimensions"]
                if "chunk_max_tokens" in emb:
                    yml_values["chunk_max_tokens"] = emb["chunk_max_tokens"]
            yml_values.update(yml)
        # NOTE(review): these merged values become init-kwargs, which OUTRANK
        # env vars in pydantic-settings — i.e. yml beats env. A previous
        # comment here claimed env vars override yml; that was incorrect.
        merged = {**yml_values, **kwargs}
        super().__init__(**merged)
@@ -0,0 +1 @@
1
+ """Source crawlers — Confluence REST API and local git repo file walkers."""