docforge-cli 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. docforge/__init__.py +0 -0
  2. docforge/__main__.py +5 -0
  3. docforge/api.py +266 -0
  4. docforge/cli.py +296 -0
  5. docforge/config.py +99 -0
  6. docforge/crawlers/__init__.py +1 -0
  7. docforge/crawlers/confluence.py +109 -0
  8. docforge/crawlers/git.py +79 -0
  9. docforge/db.py +57 -0
  10. docforge/ingest.py +401 -0
  11. docforge/lint.py +92 -0
  12. docforge/mcp_server.py +188 -0
  13. docforge/processors/__init__.py +1 -0
  14. docforge/processors/chunker.py +141 -0
  15. docforge/processors/embedder.py +78 -0
  16. docforge/processors/parser.py +143 -0
  17. docforge/query_log.py +45 -0
  18. docforge/ranking.py +20 -0
  19. docforge/scripts/__init__.py +1 -0
  20. docforge/scripts/eval_search.py +226 -0
  21. docforge/scripts/latency_report.py +142 -0
  22. docforge/sources.py +46 -0
  23. docforge/sql/migrations/001_add_source_identifier.sql +3 -0
  24. docforge/sql/migrations/002_add_status_index.sql +1 -0
  25. docforge/sql/migrations/003_add_source_tags.sql +4 -0
  26. docforge/sql/migrations/004_add_query_log.sql +11 -0
  27. docforge/sql/migrations/005_add_query_log_user_oid.sql +2 -0
  28. docforge/sql/migrations/006_add_query_log_request_ms.sql +1 -0
  29. docforge/sql/schema.sql +29 -0
  30. docforge/templates/docforge.yml +11 -0
  31. docforge/templates/docker-compose.yml +14 -0
  32. docforge/templates/mcp_client.py +83 -0
  33. docforge/templates/sources.yml +21 -0
  34. docforge_cli-0.2.0.dist-info/METADATA +178 -0
  35. docforge_cli-0.2.0.dist-info/RECORD +39 -0
  36. docforge_cli-0.2.0.dist-info/WHEEL +5 -0
  37. docforge_cli-0.2.0.dist-info/entry_points.txt +2 -0
  38. docforge_cli-0.2.0.dist-info/licenses/LICENSE +21 -0
  39. docforge_cli-0.2.0.dist-info/top_level.txt +1 -0
docforge/ranking.py ADDED
@@ -0,0 +1,20 @@
1
+ """Ranking helpers — pure Python mirror of the boost formula in search SQL."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
def compute_boosted_score(
    similarity: float,
    source_tags: list[str],
    user_tags: list[str],
    tag_weight: float,
    org_weight: float,
) -> float:
    """Boost *similarity* by tag overlap with the user plus an org-tag bonus.

    Mirrors the boost formula embedded in the search SQL (mcp_server.py /
    api.py); kept as a pure function so the ranking math is unit-testable
    without a database.
    """
    shared = set(source_tags).intersection(user_tags)
    org_bonus = org_weight if "org" in source_tags else 0
    return similarity * (1 + tag_weight * len(shared) + org_bonus)
@@ -0,0 +1 @@
1
+ """Operator scripts for docforge (run via `python -m docforge.scripts.<name>`)."""
@@ -0,0 +1,226 @@
1
+ """Evaluate docforge retrieval quality against a ground-truth query set.
2
+
3
+ Usage:
4
+ python -m docforge.scripts.eval_search \\
5
+ --api-url https://<fqdn> \\
6
+ --ground-truth rag/eval/ground_truth.yml \\
7
+ --user tobias.ens --team ccl --area cloud \\
8
+ --k 5
9
+
10
+ Prints per-query detail + summary (recall@1, recall@k, MRR) to stdout. Exits 0
11
+ on successful run regardless of retrieval quality — this tool measures, it does
12
+ not gate.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import asyncio
19
+ import sys
20
+ from dataclasses import dataclass
21
+ from pathlib import Path
22
+
23
+ import httpx
24
+ import yaml
25
+
26
+
27
@dataclass(frozen=True)
class QueryResult:
    """Outcome of one evaluation query against the search API."""

    query: str  # the natural-language query sent to /search
    expected_substring: str  # ground-truth: a title substring that should appear
    returned_titles: list[str]  # source_title of each hit, in ranked order
    returned_scores: list[float]  # similarity of each hit, parallel to returned_titles
    match_rank: int | None  # 1-based; None if not in top-k
34
+
35
+
36
def score_query(returned_titles: list[str], expected_substring: str) -> int | None:
    """Return the 1-based rank of the first title containing
    *expected_substring*, case-insensitively; None when nothing matches.
    Pure function."""
    target = expected_substring.lower()
    return next(
        (pos for pos, name in enumerate(returned_titles, start=1) if target in name.lower()),
        None,
    )
44
+
45
+
46
def summarize(results: list[QueryResult], k: int) -> dict[str, float | int]:
    """Aggregate per-query outcomes into {queries, recall@1, recall@k, mrr}.
    Pure function; an empty input yields all-zero metrics."""
    count = len(results)
    key_k = f"recall@{k}"
    if not count:
        return {"queries": 0, "recall@1": 0.0, key_k: 0.0, "mrr": 0.0}
    # Ranks of the queries whose expected title appeared at all.
    matched = [r.match_rank for r in results if r.match_rank is not None]
    return {
        "queries": count,
        "recall@1": sum(1 for rank in matched if rank == 1) / count,
        key_k: sum(1 for rank in matched if rank <= k) / count,
        # Unmatched queries contribute 0 to MRR but still count in the denominator.
        "mrr": sum(1.0 / rank for rank in matched) / count,
    }
60
+
61
+
62
async def run_queries(
    api_url: str,
    ground_truth: list[dict],
    user_name: str,
    team_name: str,
    area_name: str | None,
    k: int,
    audience: str | None = None,
) -> list[QueryResult]:
    """POST each query to <api_url>/search via httpx; collect results. Sequential.

    When audience is provided, attach an Entra Bearer token obtained via
    DefaultAzureCredential. When not, send unauthenticated (auth.mode==none path).

    Args:
        api_url: Base URL of the search API (no trailing slash).
        ground_truth: Entries with 'q' and 'expected_title_contains' keys.
        user_name: Forwarded verbatim in the request body as user_name.
        team_name: Forwarded verbatim as team_name.
        area_name: Forwarded verbatim as area_name (may be None).
        k: Result limit per query; also the top-k window for match ranking.
        audience: Entra API audience (e.g. api://<app-id>), or None.

    A failed request is printed to stderr and recorded as an empty result
    list (i.e. scored as a miss) instead of aborting the whole run.
    """
    results: list[QueryResult] = []
    credential = None
    if audience:
        # Lazy import: azure-identity is only required on the authenticated path.
        from azure.identity.aio import DefaultAzureCredential

        credential = DefaultAzureCredential()
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            for entry in ground_truth:
                q: str = entry["q"]
                expected: str = entry["expected_title_contains"]
                headers: dict[str, str] = {}
                if credential is not None:
                    # Token requested per query — presumably cached inside the
                    # credential object; verify if this ever looks slow.
                    token = await credential.get_token(f"{audience}/.default")
                    headers["Authorization"] = f"Bearer {token.token}"
                try:
                    resp = await client.post(
                        f"{api_url}/search",
                        headers=headers,
                        json={
                            "query": q,
                            "user_name": user_name,
                            "team_name": team_name,
                            "area_name": area_name,
                            "limit": k,
                        },
                    )
                    resp.raise_for_status()
                    payload = resp.json()
                    hits = payload.get("results", [])
                except (httpx.HTTPError, ValueError) as e:
                    # ValueError covers resp.json() on a non-JSON body.
                    print(f"  Query failed ({q!r}): {e}", file=sys.stderr)
                    hits = []
                titles = [h.get("source_title", "") for h in hits]
                scores = [float(h.get("similarity", 0.0)) for h in hits]
                results.append(
                    QueryResult(
                        query=q,
                        expected_substring=expected,
                        returned_titles=titles,
                        returned_scores=scores,
                        match_rank=score_query(titles, expected),
                    )
                )
    finally:
        if credential is not None:
            # Close the aio credential so its HTTP session is released cleanly.
            await credential.close()
    return results
123
+
124
+
125
+ def format_report(results: list[QueryResult], summary: dict[str, float | int], k: int) -> str:
126
+ """Per-query detail + summary. Human-readable stdout."""
127
+ lines: list[str] = []
128
+ for r in results:
129
+ lines.append(f"Query: {r.query!r}")
130
+ lines.append(f" Expected: contains {r.expected_substring!r}")
131
+ if r.returned_titles:
132
+ lines.append(f" Top {len(r.returned_titles)}:")
133
+ for i, (title, score) in enumerate(
134
+ zip(r.returned_titles, r.returned_scores, strict=False), start=1
135
+ ):
136
+ marker = " <-- MATCH" if r.match_rank == i else ""
137
+ lines.append(f" {i}. [{score:.2f}] {title}{marker}")
138
+ else:
139
+ lines.append(" Top: (no results)")
140
+ if r.match_rank is not None and r.match_rank <= k:
141
+ lines.append(f" recall@{k}: HIT rank: {r.match_rank}")
142
+ else:
143
+ lines.append(f" recall@{k}: MISS")
144
+ lines.append("")
145
+
146
+ lines.append("Summary:")
147
+ lines.append(f" queries: {summary['queries']}")
148
+ recall1 = summary["recall@1"]
149
+ recall_k = summary[f"recall@{k}"]
150
+ total = summary["queries"] or 1
151
+ r1_pct = f"{recall1 * 100:.0f}%"
152
+ rk_pct = f"{recall_k * 100:.0f}%"
153
+ lines.append(f" recall@1: {int(recall1 * total)}/{total} ({r1_pct})")
154
+ lines.append(f" recall@{k}: {int(recall_k * total)}/{total} ({rk_pct})")
155
+ lines.append(f" mean reciprocal rank: {summary['mrr']:.3f}")
156
+
157
+ misses = [r for r in results if r.match_rank is None or r.match_rank > k]
158
+ if misses:
159
+ lines.append("")
160
+ lines.append(f" Missed (no match in top {k}):")
161
+ for r in misses:
162
+ lines.append(f" - {r.query!r} (expected {r.expected_substring!r})")
163
+
164
+ return "\n".join(lines)
165
+
166
+
167
def _load_ground_truth(path: Path) -> list[dict]:
    """Load, validate, and return the `queries` list from a ground-truth YAML file.

    Each entry must be a mapping carrying 'q' and 'expected_title_contains'.

    Raises:
        FileNotFoundError: If *path* is not an existing regular file.
        ValueError: If the file lacks a non-empty 'queries' list, or an entry
            is malformed.
    """
    if not path.is_file():
        raise FileNotFoundError(f"Ground truth file not found: {path}")
    data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    queries = data.get("queries")
    if not isinstance(queries, list) or not queries:
        raise ValueError(f"{path}: missing or empty 'queries' list")
    for i, q in enumerate(queries):
        # isinstance first: a non-mapping entry (string, null, nested list)
        # would otherwise surface as a confusing TypeError from `in`.
        if not isinstance(q, dict) or "q" not in q or "expected_title_contains" not in q:
            raise ValueError(f"{path}: entry {i} must have 'q' and 'expected_title_contains' keys")
    return queries
179
+
180
+
181
def main() -> int:
    """CLI entry point: load ground truth, run all queries, print the report.

    Exit codes: 0 on a completed run (regardless of retrieval quality — this
    tool measures, it does not gate), 1 when the ground-truth file is missing
    or malformed.
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "--api-url", required=True, help="Base URL of the search API (no trailing slash)"
    )
    parser.add_argument("--ground-truth", required=True, type=Path, help="Path to ground_truth.yml")
    parser.add_argument("--user", required=True, help="Your identity — forwarded as user_name")
    parser.add_argument("--team", required=True, help="Your team tag — forwarded as team_name")
    parser.add_argument("--area", default=None, help="Optional area tag — forwarded as area_name")
    parser.add_argument("--k", type=int, default=5, help="Top-k cutoff for recall@k")
    parser.add_argument(
        "--audience",
        default=None,
        help=(
            "Entra API audience (e.g., api://<app-id>). When set, attaches a "
            "Bearer token via DefaultAzureCredential. Omit for auth.mode=none."
        ),
    )
    args = parser.parse_args()

    try:
        ground_truth = _load_ground_truth(args.ground_truth)
    except (FileNotFoundError, ValueError) as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    # rstrip("/") so the request path never carries a double slash.
    results = asyncio.run(
        run_queries(
            api_url=args.api_url.rstrip("/"),
            ground_truth=ground_truth,
            user_name=args.user,
            team_name=args.team,
            area_name=args.area,
            k=args.k,
            audience=args.audience,
        )
    )
    summary = summarize(results, args.k)
    print(format_report(results, summary, args.k))
    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,142 @@
1
+ """Compute P50 / P95 / P99 latency over recent query_log entries.
2
+
3
+ Usage:
4
+ python -m docforge.scripts.latency_report --since '7 days' [--database-url ...]
5
+
6
+ Reads DATABASE_URL from the environment (or --database-url flag) so it can
7
+ run against prod with the admin connection string from Key Vault.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import asyncio
14
+ import os
15
+ import re
16
+ import sys
17
+ from dataclasses import dataclass
18
+
19
+ import asyncpg
20
+
21
# Only these interval shapes are accepted. The value is embedded as a SQL
# literal (asyncpg can't bind str to $1::interval), so the regex is the
# injection safety boundary — keep it strict.
# Accepts e.g. "7 days" or "24 hours": digits, whitespace, one unit
# (singular or plural), any case. Nothing else gets through.
_SINCE_PATTERN = re.compile(
    r"^\s*\d+\s+(seconds?|minutes?|hours?|days?|weeks?|months?)\s*$",
    re.IGNORECASE,
)
28
+
29
+
30
@dataclass(frozen=True)
class LatencySummary:
    """Percentile summary of query_log.request_ms over a time window."""

    n: int  # rows with a recorded request_ms inside the window
    p50_ms: float | None  # None when n == 0
    p95_ms: float | None  # None when n == 0
    p99_ms: float | None  # None when n == 0
    earliest_request_ms_at: str | None  # ISO timestamp of first post-C4.3 row
37
+
38
+
39
async def compute_summary(database_url: str, since: str) -> LatencySummary:
    """Query query_log.request_ms within the given interval. Returns
    percentiles + row count + the earliest-seen request_ms timestamp (the
    effective C4.3 cutover date for this DB).

    Args:
        database_url: asyncpg-compatible Postgres connection URL.
        since: Interval string such as '7 days'; must match _SINCE_PATTERN.

    Raises:
        ValueError: If `since` does not match _SINCE_PATTERN.

    `since` must match _SINCE_PATTERN (N + unit). It is embedded as a SQL
    literal because asyncpg rejects str for $1::interval; the regex
    validation is the injection boundary."""
    if not _SINCE_PATTERN.match(since):
        raise ValueError(
            f"Invalid --since {since!r}. Expected 'N unit' where unit is "
            "seconds/minutes/hours/days/weeks/months (e.g. '7 days')."
        )
    conn = await asyncpg.connect(database_url)
    try:
        # f-string interpolation is safe here only because `since` passed the
        # strict digits+unit regex above — do not relax that check.
        row = await conn.fetchrow(
            f"""
            SELECT
              percentile_cont(0.50) WITHIN GROUP (ORDER BY request_ms) AS p50,
              percentile_cont(0.95) WITHIN GROUP (ORDER BY request_ms) AS p95,
              percentile_cont(0.99) WITHIN GROUP (ORDER BY request_ms) AS p99,
              count(*) AS n
            FROM query_log
            WHERE request_ms IS NOT NULL
              AND created_at > now() - interval '{since.strip()}'
            """
        )
        # Deliberately unfiltered by the window: the earliest timed row marks
        # when request_ms logging went live on this database.
        earliest = await conn.fetchval(
            "SELECT min(created_at) FROM query_log WHERE request_ms IS NOT NULL"
        )
        # Percentiles are NULL (None) when the window contains no timed rows.
        return LatencySummary(
            n=int(row["n"]),
            p50_ms=float(row["p50"]) if row["p50"] is not None else None,
            p95_ms=float(row["p95"]) if row["p95"] is not None else None,
            p99_ms=float(row["p99"]) if row["p99"] is not None else None,
            earliest_request_ms_at=earliest.isoformat() if earliest is not None else None,
        )
    finally:
        await conn.close()
78
+
79
+
80
def format_summary(summary: LatencySummary, since: str) -> str:
    """Render a LatencySummary as a human-readable multi-line report."""
    report = [
        f"Window: last {since}",
        f"Queries with timing: {summary.n}",
    ]
    # Empty window: explain the likely cause instead of printing blank percentiles.
    if not summary.n:
        report.append(
            "No rows with request_ms in the window — has the C4.3 migration been applied "
            "and the /search handler redeployed?"
        )
        return "\n".join(report)
    percentiles = (("P50", summary.p50_ms), ("P95", summary.p95_ms), ("P99", summary.p99_ms))
    for label, value in percentiles:
        report.append(f"{label}: {value:.0f} ms")
    if summary.earliest_request_ms_at is not None:
        report.append(f"request_ms cutover at: {summary.earliest_request_ms_at}")
    report.append("")
    report.append("Note: the earliest ~1-2 rows after each revision deployment include")
    report.append("the 15-30 s embedding-model warm-up cost; this is kept in the data as")
    report.append("honest signal. P95 therefore reflects warm-up+steady-state.")
    return "\n".join(report)
106
+
107
+
108
def main() -> int:
    """CLI entry point: resolve the DB URL, compute percentiles, print report.

    Exit codes: 0 on success, 1 for a missing DB URL or a connection failure,
    2 for an invalid --since interval.
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "--since",
        default="7 days",
        help="Postgres interval string (e.g., '7 days', '24 hours'). Default: 7 days.",
    )
    parser.add_argument(
        "--database-url",
        default=None,
        help="Postgres URL. Falls back to DATABASE_URL env var.",
    )
    args = parser.parse_args()

    # Explicit flag wins over the environment, making one-off prod checks easy.
    db_url = args.database_url or os.environ.get("DATABASE_URL")
    if not db_url:
        print("Error: DATABASE_URL not set (and --database-url not provided)", file=sys.stderr)
        return 1

    try:
        summary = asyncio.run(compute_summary(db_url, args.since))
    except ValueError as e:
        # Invalid --since shape (rejected by _SINCE_PATTERN).
        print(f"Error: {e}", file=sys.stderr)
        return 2
    except (OSError, asyncpg.PostgresError) as e:
        print(f"Error connecting to the database: {e}", file=sys.stderr)
        return 1
    print(format_summary(summary, args.since))
    return 0


if __name__ == "__main__":
    sys.exit(main())
docforge/sources.py ADDED
@@ -0,0 +1,46 @@
1
+ """Source configuration — pydantic models + YAML loader.
2
+
3
+ Each entry in `sources.yml` is a ConfluenceSourceConfig or a
4
+ GitRepoSourceConfig, discriminated by the `type` field.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+ from typing import Annotated, Literal
11
+
12
+ import yaml
13
+ from pydantic import BaseModel, Field
14
+
15
+
16
class ConfluenceSourceConfig(BaseModel):
    """One Confluence page to index; selected when `type` == "confluence_page"."""

    type: Literal["confluence_page"]
    page_id: str  # Confluence page id as a string (e.g. "12345")
    space_key: str
    title: str  # display title shown in search results
    tags: list[str] = []  # pydantic copies mutable defaults per instance, so this is safe
22
+
23
+
24
class GitRepoSourceConfig(BaseModel):
    """One local git repository to index; selected when `type` == "git_repo"."""

    type: Literal["git_repo"]
    repo_path: str  # filesystem path to the repo checkout
    include_patterns: list[str] = ["README.md", "CLAUDE.md", "docs/**/*.md"]  # glob patterns
    title: str  # display title shown in search results
    tags: list[str] = []  # pydantic copies mutable defaults per instance, so this is safe
30
+
31
+
32
# Discriminated union: pydantic dispatches to the concrete model by the
# value of the `type` field, giving precise per-variant validation errors.
SourceConfig = Annotated[
    ConfluenceSourceConfig | GitRepoSourceConfig,
    Field(discriminator="type"),
]


class SourcesFile(BaseModel):
    """Top-level shape of sources.yml: a single `sources` list."""

    sources: list[SourceConfig]
40
+
41
+
42
def load_sources(path: str | Path) -> list[SourceConfig]:
    """Load source configurations from a YAML file.

    Args:
        path: Path to a sources.yml-style file with a top-level `sources` list.

    Returns:
        The validated list of source configs (may be empty).

    Raises:
        OSError: If the file cannot be opened.
        pydantic.ValidationError: If the YAML does not match SourcesFile.
    """
    # Explicit encoding: without it, open() uses the platform default
    # (e.g. cp1252 on Windows), which breaks non-ASCII titles/tags.
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    return SourcesFile.model_validate(data).sources
@@ -0,0 +1,3 @@
1
-- 001: add an optional stable identifier to sources.
ALTER TABLE sources ADD COLUMN IF NOT EXISTS source_identifier TEXT;
-- Partial unique index: uniqueness is enforced only for non-NULL values,
-- so pre-migration rows without an identifier are unaffected.
CREATE UNIQUE INDEX IF NOT EXISTS sources_source_identifier_unique
ON sources (source_identifier) WHERE source_identifier IS NOT NULL;
@@ -0,0 +1 @@
1
-- 002: index sources.status to support status-filtered queries.
CREATE INDEX IF NOT EXISTS sources_status_idx ON sources (status);
@@ -0,0 +1,4 @@
1
-- 003: add a tags array to sources (defaults to empty, never NULL).
ALTER TABLE sources
    ADD COLUMN IF NOT EXISTS tags TEXT[] NOT NULL DEFAULT '{}';

-- GIN index supports array membership/overlap queries on tags.
CREATE INDEX IF NOT EXISTS sources_tags_idx ON sources USING gin (tags);
@@ -0,0 +1,11 @@
1
-- 004: query_log records each /search request for usage and quality analysis.
CREATE TABLE IF NOT EXISTS query_log (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    user_name TEXT NOT NULL,
    team_name TEXT NOT NULL,
    area_name TEXT,          -- optional: not every caller supplies an area
    query TEXT NOT NULL,
    result_count INT NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Supports time-window scans (e.g. the latency report's `since` filter).
CREATE INDEX IF NOT EXISTS query_log_created_at_idx ON query_log (created_at);
@@ -0,0 +1,2 @@
1
-- 005: optional caller object id on query_log (NULL when auth is disabled),
-- plus an index for per-user lookups.
ALTER TABLE query_log ADD COLUMN IF NOT EXISTS user_oid TEXT;
CREATE INDEX IF NOT EXISTS query_log_user_oid_idx ON query_log (user_oid);
@@ -0,0 +1 @@
1
-- 006: per-request latency in milliseconds (NULL for rows logged before
-- timing was added; latency_report filters on request_ms IS NOT NULL).
ALTER TABLE query_log ADD COLUMN IF NOT EXISTS request_ms INT;
@@ -0,0 +1,29 @@
1
-- pgvector provides the `vector` column type and HNSW index used below.
CREATE EXTENSION IF NOT EXISTS vector;

-- Registered documentation sources (Confluence pages, git repos, ...).
CREATE TABLE IF NOT EXISTS sources (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    type TEXT NOT NULL,
    url TEXT NOT NULL,
    title TEXT NOT NULL,
    confluence_page_id TEXT,      -- NULL for non-Confluence sources
    confluence_space_key TEXT,
    source_identifier TEXT,
    last_crawled_at TIMESTAMPTZ,
    content_hash TEXT,            -- used to skip re-ingesting unchanged content; TODO confirm
    status TEXT DEFAULT 'pending',
    created_at TIMESTAMPTZ DEFAULT now(),
    CONSTRAINT sources_confluence_page_id_unique UNIQUE (confluence_page_id)
);

-- Embedded text chunks; rows are removed along with their parent source.
CREATE TABLE IF NOT EXISTS chunks (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    source_id UUID NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
    chunk_index INT NOT NULL,
    text TEXT NOT NULL,
    -- 768 dims matches embedding.dimensions in templates/docforge.yml.
    embedding vector(768),
    section_title TEXT,
    created_at TIMESTAMPTZ DEFAULT now()
);

-- HNSW approximate-nearest-neighbour index for cosine-distance search.
CREATE INDEX IF NOT EXISTS chunks_embedding_idx ON chunks
    USING hnsw (embedding vector_cosine_ops);
@@ -0,0 +1,11 @@
1
+ # docforge.yml — main configuration
2
+ # Secrets (API tokens, passwords) go in .env, not here.
3
+
4
+ database_url: postgresql://docforge:localdev@localhost:5432/docforge
5
+
6
+ embedding:
7
+ model: google/embeddinggemma-300m
8
+ dimensions: 768
9
+ chunk_max_tokens: 500
10
+
11
+ sources_file: sources.yml
@@ -0,0 +1,14 @@
1
# Local development database: Postgres 16 with the pgvector extension
# preinstalled (required by docforge/sql/schema.sql).
services:
  db:
    image: pgvector/pgvector:pg16
    environment:
      POSTGRES_DB: docforge
      POSTGRES_USER: docforge
      # Local-dev-only credential; matches database_url in templates/docforge.yml.
      POSTGRES_PASSWORD: localdev
    ports:
      - "5432:5432"
    volumes:
      # Named volume so data survives container recreation.
      - pgdata:/var/lib/postgresql/data

volumes:
  pgdata:
@@ -0,0 +1,83 @@
1
+ """Lightweight MCP client for docforge.
2
+
3
+ Calls a hosted search API over HTTP. No local database or model needed.
4
+
5
+ Usage:
6
+ pip install httpx fastmcp
7
+ claude mcp add -s user docforge -- python mcp_client.py
8
+
9
+ Environment:
10
+ DOCFORGE_API_URL: Base URL of the search API
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import os
16
+
17
+ import httpx
18
+ from fastmcp import FastMCP
19
+
20
# Base URL of the hosted search API; override via DOCFORGE_API_URL.
API_URL = os.environ.get("DOCFORGE_API_URL", "http://localhost:8000")

# NOTE(review): `instructions` is presumably surfaced to the connected MCP
# client so it knows when to invoke the tools below — confirm against fastmcp.
mcp = FastMCP(
    "docforge",
    instructions=(
        "Search across your team's indexed documentation including architecture, "
        "coding guidelines, and cross-team interfaces. "
        "Use the search_documentation tool when you need information about "
        "other teams, shared practices, or organizational knowledge."
    ),
)
31
+
32
+
33
@mcp.tool()
async def search_documentation(query: str, limit: int = 5) -> str:
    """Search across indexed documentation from Confluence pages and git repos.

    Args:
        query: Natural language search query.
        limit: Maximum number of results to return (default 5).

    Returns:
        Markdown-formatted results separated by `---` rules, or a
        "no results" message.
    """
    async with httpx.AsyncClient(timeout=30.0) as client:
        resp = await client.post(
            f"{API_URL}/search",
            json={"query": query, "limit": limit},
        )
        resp.raise_for_status()  # surface HTTP errors instead of parsing an error body
        data = resp.json()

    if not data["results"]:
        return "No documentation found matching your query."

    parts: list[str] = []
    for i, result in enumerate(data["results"], 1):
        # One markdown header per hit: rank, similarity, source, section, URL.
        header = f"**Result {i}** (relevance: {result['similarity']:.2f})"
        header += f" -- {result['source_title']}"
        if result.get("section_title"):
            header += f" > {result['section_title']}"
        header += f"\nSource: {result['source_url']}"
        parts.append(f"{header}\n\n{result['text']}")

    return "\n\n---\n\n".join(parts)
62
+
63
+
64
@mcp.tool()
async def list_sources() -> str:
    """List all documentation sources currently indexed.

    Returns:
        A markdown bullet list (title, chunk count, status per source),
        or a "no sources" message.
    """
    async with httpx.AsyncClient(timeout=10.0) as client:
        resp = await client.get(f"{API_URL}/sources")
        resp.raise_for_status()
        data = resp.json()

    if not data["sources"]:
        return "No sources indexed."

    lines = [f"**{data['count']} indexed sources:**\n"]
    for src in data["sources"]:
        lines.append(f"- **{src['title']}** ({src['chunk_count']} chunks, {src['status']})")

    return "\n".join(lines)
80
+
81
+
82
+ if __name__ == "__main__":
83
+ mcp.run()
@@ -0,0 +1,21 @@
1
+ # sources.yml — documentation sources to index
2
+ # Uncomment and edit the examples below.
3
+
4
+ sources: []
5
+
6
+ # Confluence pages (need CONFLUENCE_* vars in .env):
7
+ # - type: confluence_page
8
+ # page_id: "12345"
9
+ # space_key: MYSPACE
10
+ # title: "My Team's Documentation"
11
+ #
12
+ # - type: confluence_page
13
+ # page_id: "67890"
14
+ # space_key: MYSPACE
15
+ # title: "Architecture Guidelines"
16
+
17
+ # Local git repos (no auth needed):
18
+ # - type: git_repo
19
+ # repo_path: "/path/to/my-repo"
20
+ # include_patterns: ["README.md", "CLAUDE.md", "docs/**/*.md"]
21
+ # title: "My Repo"