docforge-cli 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docforge/__init__.py +0 -0
- docforge/__main__.py +5 -0
- docforge/api.py +266 -0
- docforge/cli.py +296 -0
- docforge/config.py +99 -0
- docforge/crawlers/__init__.py +1 -0
- docforge/crawlers/confluence.py +109 -0
- docforge/crawlers/git.py +79 -0
- docforge/db.py +57 -0
- docforge/ingest.py +401 -0
- docforge/lint.py +92 -0
- docforge/mcp_server.py +188 -0
- docforge/processors/__init__.py +1 -0
- docforge/processors/chunker.py +141 -0
- docforge/processors/embedder.py +78 -0
- docforge/processors/parser.py +143 -0
- docforge/query_log.py +45 -0
- docforge/ranking.py +20 -0
- docforge/scripts/__init__.py +1 -0
- docforge/scripts/eval_search.py +226 -0
- docforge/scripts/latency_report.py +142 -0
- docforge/sources.py +46 -0
- docforge/sql/migrations/001_add_source_identifier.sql +3 -0
- docforge/sql/migrations/002_add_status_index.sql +1 -0
- docforge/sql/migrations/003_add_source_tags.sql +4 -0
- docforge/sql/migrations/004_add_query_log.sql +11 -0
- docforge/sql/migrations/005_add_query_log_user_oid.sql +2 -0
- docforge/sql/migrations/006_add_query_log_request_ms.sql +1 -0
- docforge/sql/schema.sql +29 -0
- docforge/templates/docforge.yml +11 -0
- docforge/templates/docker-compose.yml +14 -0
- docforge/templates/mcp_client.py +83 -0
- docforge/templates/sources.yml +21 -0
- docforge_cli-0.2.0.dist-info/METADATA +178 -0
- docforge_cli-0.2.0.dist-info/RECORD +39 -0
- docforge_cli-0.2.0.dist-info/WHEEL +5 -0
- docforge_cli-0.2.0.dist-info/entry_points.txt +2 -0
- docforge_cli-0.2.0.dist-info/licenses/LICENSE +21 -0
- docforge_cli-0.2.0.dist-info/top_level.txt +1 -0
docforge/ranking.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Ranking helpers — pure Python mirror of the boost formula in search SQL."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def compute_boosted_score(
|
|
7
|
+
similarity: float,
|
|
8
|
+
source_tags: list[str],
|
|
9
|
+
user_tags: list[str],
|
|
10
|
+
tag_weight: float,
|
|
11
|
+
org_weight: float,
|
|
12
|
+
) -> float:
|
|
13
|
+
"""Apply tag-overlap + org-tag boost to a similarity score.
|
|
14
|
+
|
|
15
|
+
Formula mirrors the SQL used in mcp_server.py and api.py search queries.
|
|
16
|
+
Kept in a pure function so the ranking math is unit-testable without SQL.
|
|
17
|
+
"""
|
|
18
|
+
overlap = len(set(source_tags) & set(user_tags))
|
|
19
|
+
has_org = "org" in source_tags
|
|
20
|
+
return similarity * (1 + tag_weight * overlap + org_weight * (1 if has_org else 0))
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Operator scripts for docforge (run via `python -m docforge.scripts.<name>`)."""
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
"""Evaluate docforge retrieval quality against a ground-truth query set.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
python -m docforge.scripts.eval_search \\
|
|
5
|
+
--api-url https://<fqdn> \\
|
|
6
|
+
--ground-truth rag/eval/ground_truth.yml \\
|
|
7
|
+
--user tobias.ens --team ccl --area cloud \\
|
|
8
|
+
--k 5
|
|
9
|
+
|
|
10
|
+
Prints per-query detail + summary (recall@1, recall@k, MRR) to stdout. Exits 0
|
|
11
|
+
on successful run regardless of retrieval quality — this tool measures, it does
|
|
12
|
+
not gate.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import asyncio
|
|
19
|
+
import sys
|
|
20
|
+
from dataclasses import dataclass
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
import httpx
|
|
24
|
+
import yaml
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
class QueryResult:
    """Outcome of one evaluated query: what the API returned and where
    (if anywhere) the expected title landed in the ranking."""

    query: str  # natural-language query sent to /search
    expected_substring: str  # ground-truth substring expected in a returned title
    returned_titles: list[str]  # source_title of each hit, in rank order
    returned_scores: list[float]  # similarity score of each hit, in rank order
    match_rank: int | None  # 1-based; None if not in top-k
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def score_query(returned_titles: list[str], expected_substring: str) -> int | None:
    """Return the 1-based rank of the first title containing expected_substring
    (case-insensitive), or None when no title matches. Pure function."""
    target = expected_substring.lower()
    ranked_matches = (
        position
        for position, title in enumerate(returned_titles, start=1)
        if target in title.lower()
    )
    return next(ranked_matches, None)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def summarize(results: list[QueryResult], k: int) -> dict[str, float | int]:
    """Aggregate per-query outcomes into {queries, recall@1, recall@k, mrr}.

    Pure function; an empty result list yields all-zero metrics."""
    total = len(results)
    if not results:
        return {"queries": 0, "recall@1": 0.0, f"recall@{k}": 0.0, "mrr": 0.0}
    # Only queries that matched somewhere contribute a rank.
    matched_ranks = [r.match_rank for r in results if r.match_rank is not None]
    top1_hits = sum(1 for rank in matched_ranks if rank == 1)
    topk_hits = sum(1 for rank in matched_ranks if rank <= k)
    reciprocal_total = sum(1.0 / rank for rank in matched_ranks)
    return {
        "queries": total,
        "recall@1": top1_hits / total,
        f"recall@{k}": topk_hits / total,
        "mrr": reciprocal_total / total,
    }
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
async def run_queries(
    api_url: str,
    ground_truth: list[dict],
    user_name: str,
    team_name: str,
    area_name: str | None,
    k: int,
    audience: str | None = None,
) -> list[QueryResult]:
    """POST each query to <api_url>/search via httpx; collect results. Sequential.

    When audience is provided, attach an Entra Bearer token obtained via
    DefaultAzureCredential. When not, send unauthenticated (auth.mode==none path)."""
    results: list[QueryResult] = []
    credential = None
    if audience:
        # Imported lazily so the script runs without azure-identity installed
        # when --audience is not used.
        from azure.identity.aio import DefaultAzureCredential

        credential = DefaultAzureCredential()
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            for entry in ground_truth:
                q: str = entry["q"]
                expected: str = entry["expected_title_contains"]
                headers: dict[str, str] = {}
                if credential is not None:
                    # Request a token for the API's default scope on each query.
                    token = await credential.get_token(f"{audience}/.default")
                    headers["Authorization"] = f"Bearer {token.token}"
                try:
                    resp = await client.post(
                        f"{api_url}/search",
                        headers=headers,
                        json={
                            "query": q,
                            "user_name": user_name,
                            "team_name": team_name,
                            "area_name": area_name,
                            "limit": k,
                        },
                    )
                    resp.raise_for_status()
                    payload = resp.json()
                    hits = payload.get("results", [])
                except (httpx.HTTPError, ValueError) as e:
                    # A failed query counts as zero hits (a miss) rather than
                    # aborting the whole evaluation run.
                    print(f"  Query failed ({q!r}): {e}", file=sys.stderr)
                    hits = []
                titles = [h.get("source_title", "") for h in hits]
                scores = [float(h.get("similarity", 0.0)) for h in hits]
                results.append(
                    QueryResult(
                        query=q,
                        expected_substring=expected,
                        returned_titles=titles,
                        returned_scores=scores,
                        match_rank=score_query(titles, expected),
                    )
                )
    finally:
        # Close the async credential even on failure so no session leaks.
        if credential is not None:
            await credential.close()
    return results
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def format_report(results: list[QueryResult], summary: dict[str, float | int], k: int) -> str:
|
|
126
|
+
"""Per-query detail + summary. Human-readable stdout."""
|
|
127
|
+
lines: list[str] = []
|
|
128
|
+
for r in results:
|
|
129
|
+
lines.append(f"Query: {r.query!r}")
|
|
130
|
+
lines.append(f" Expected: contains {r.expected_substring!r}")
|
|
131
|
+
if r.returned_titles:
|
|
132
|
+
lines.append(f" Top {len(r.returned_titles)}:")
|
|
133
|
+
for i, (title, score) in enumerate(
|
|
134
|
+
zip(r.returned_titles, r.returned_scores, strict=False), start=1
|
|
135
|
+
):
|
|
136
|
+
marker = " <-- MATCH" if r.match_rank == i else ""
|
|
137
|
+
lines.append(f" {i}. [{score:.2f}] {title}{marker}")
|
|
138
|
+
else:
|
|
139
|
+
lines.append(" Top: (no results)")
|
|
140
|
+
if r.match_rank is not None and r.match_rank <= k:
|
|
141
|
+
lines.append(f" recall@{k}: HIT rank: {r.match_rank}")
|
|
142
|
+
else:
|
|
143
|
+
lines.append(f" recall@{k}: MISS")
|
|
144
|
+
lines.append("")
|
|
145
|
+
|
|
146
|
+
lines.append("Summary:")
|
|
147
|
+
lines.append(f" queries: {summary['queries']}")
|
|
148
|
+
recall1 = summary["recall@1"]
|
|
149
|
+
recall_k = summary[f"recall@{k}"]
|
|
150
|
+
total = summary["queries"] or 1
|
|
151
|
+
r1_pct = f"{recall1 * 100:.0f}%"
|
|
152
|
+
rk_pct = f"{recall_k * 100:.0f}%"
|
|
153
|
+
lines.append(f" recall@1: {int(recall1 * total)}/{total} ({r1_pct})")
|
|
154
|
+
lines.append(f" recall@{k}: {int(recall_k * total)}/{total} ({rk_pct})")
|
|
155
|
+
lines.append(f" mean reciprocal rank: {summary['mrr']:.3f}")
|
|
156
|
+
|
|
157
|
+
misses = [r for r in results if r.match_rank is None or r.match_rank > k]
|
|
158
|
+
if misses:
|
|
159
|
+
lines.append("")
|
|
160
|
+
lines.append(f" Missed (no match in top {k}):")
|
|
161
|
+
for r in misses:
|
|
162
|
+
lines.append(f" - {r.query!r} (expected {r.expected_substring!r})")
|
|
163
|
+
|
|
164
|
+
return "\n".join(lines)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _load_ground_truth(path: Path) -> list[dict]:
|
|
168
|
+
"""Load, validate, and return the `queries` list from a ground-truth YAML file."""
|
|
169
|
+
if not path.is_file():
|
|
170
|
+
raise FileNotFoundError(f"Ground truth file not found: {path}")
|
|
171
|
+
data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
|
|
172
|
+
queries = data.get("queries")
|
|
173
|
+
if not isinstance(queries, list) or not queries:
|
|
174
|
+
raise ValueError(f"{path}: missing or empty 'queries' list")
|
|
175
|
+
for i, q in enumerate(queries):
|
|
176
|
+
if "q" not in q or "expected_title_contains" not in q:
|
|
177
|
+
raise ValueError(f"{path}: entry {i} must have 'q' and 'expected_title_contains' keys")
|
|
178
|
+
return queries
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def main() -> int:
    """CLI entry point: parse args, load ground truth, run the eval, print report.

    Returns 0 on a completed run (regardless of retrieval quality — this tool
    measures, it does not gate), 1 on a bad or missing ground-truth file."""
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "--api-url", required=True, help="Base URL of the search API (no trailing slash)"
    )
    parser.add_argument("--ground-truth", required=True, type=Path, help="Path to ground_truth.yml")
    parser.add_argument("--user", required=True, help="Your identity — forwarded as user_name")
    parser.add_argument("--team", required=True, help="Your team tag — forwarded as team_name")
    parser.add_argument("--area", default=None, help="Optional area tag — forwarded as area_name")
    parser.add_argument("--k", type=int, default=5, help="Top-k cutoff for recall@k")
    parser.add_argument(
        "--audience",
        default=None,
        help=(
            "Entra API audience (e.g., api://<app-id>). When set, attaches a "
            "Bearer token via DefaultAzureCredential. Omit for auth.mode=none."
        ),
    )
    args = parser.parse_args()

    try:
        ground_truth = _load_ground_truth(args.ground_truth)
    except (FileNotFoundError, ValueError) as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    # rstrip("/") so run_queries' f"{api_url}/search" never double-slashes.
    results = asyncio.run(
        run_queries(
            api_url=args.api_url.rstrip("/"),
            ground_truth=ground_truth,
            user_name=args.user,
            team_name=args.team,
            area_name=args.area,
            k=args.k,
            audience=args.audience,
        )
    )
    summary = summarize(results, args.k)
    print(format_report(results, summary, args.k))
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Compute P50 / P95 / P99 latency over recent query_log entries.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
python -m docforge.scripts.latency_report --since '7 days' [--database-url ...]
|
|
5
|
+
|
|
6
|
+
Reads DATABASE_URL from the environment (or --database-url flag) so it can
|
|
7
|
+
run against prod with the admin connection string from Key Vault.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import asyncio
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
import sys
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
|
|
19
|
+
import asyncpg
|
|
20
|
+
|
|
21
|
+
# Only these interval shapes are accepted. The value is embedded as a SQL
# literal (asyncpg can't bind str to $1::interval), so the regex is the
# injection safety boundary — keep it strict.
# Matches exactly "<count> <unit>", e.g. "7 days", "24 hours" — nothing else.
_SINCE_PATTERN = re.compile(
    r"^\s*\d+\s+(seconds?|minutes?|hours?|days?|weeks?|months?)\s*$",
    re.IGNORECASE,
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(frozen=True)
class LatencySummary:
    """Percentile summary of query_log.request_ms over one time window."""

    n: int  # number of rows with a recorded request_ms in the window
    p50_ms: float | None  # None when the window has no timed rows
    p95_ms: float | None
    p99_ms: float | None
    earliest_request_ms_at: str | None  # ISO timestamp of first post-C4.3 row
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
async def compute_summary(database_url: str, since: str) -> LatencySummary:
    """Query query_log.request_ms within the given interval. Returns
    percentiles + row count + the earliest-seen request_ms timestamp (the
    effective C4.3 cutover date for this DB).

    `since` must match _SINCE_PATTERN (N + unit). It is embedded as a SQL
    literal because asyncpg rejects str for $1::interval; the regex
    validation is the injection boundary.

    Raises:
        ValueError: when `since` does not match _SINCE_PATTERN.
    """
    if not _SINCE_PATTERN.match(since):
        raise ValueError(
            f"Invalid --since {since!r}. Expected 'N unit' where unit is "
            "seconds/minutes/hours/days/weeks/months (e.g. '7 days')."
        )
    conn = await asyncpg.connect(database_url)
    try:
        # request_ms IS NOT NULL restricts to rows written after the C4.3
        # migration added the column; pre-migration rows have no timing.
        row = await conn.fetchrow(
            f"""
            SELECT
                percentile_cont(0.50) WITHIN GROUP (ORDER BY request_ms) AS p50,
                percentile_cont(0.95) WITHIN GROUP (ORDER BY request_ms) AS p95,
                percentile_cont(0.99) WITHIN GROUP (ORDER BY request_ms) AS p99,
                count(*) AS n
            FROM query_log
            WHERE request_ms IS NOT NULL
                AND created_at > now() - interval '{since.strip()}'
            """
        )
        # Deliberately unwindowed: the very first timed row marks the cutover.
        earliest = await conn.fetchval(
            "SELECT min(created_at) FROM query_log WHERE request_ms IS NOT NULL"
        )
        return LatencySummary(
            n=int(row["n"]),
            p50_ms=float(row["p50"]) if row["p50"] is not None else None,
            p95_ms=float(row["p95"]) if row["p95"] is not None else None,
            p99_ms=float(row["p99"]) if row["p99"] is not None else None,
            earliest_request_ms_at=earliest.isoformat() if earliest is not None else None,
        )
    finally:
        await conn.close()
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def format_summary(summary: LatencySummary, since: str) -> str:
    """Render the latency summary as human-readable text for stdout."""
    out: list[str] = [
        f"Window: last {since}",
        f"Queries with timing: {summary.n}",
    ]
    if summary.n == 0:
        out.append(
            "No rows with request_ms in the window — has the C4.3 migration been applied "
            "and the /search handler redeployed?"
        )
        return "\n".join(out)
    # n > 0 guarantees each percentile is non-None (they come from the same rows).
    for label, value in (
        ("P50", summary.p50_ms),
        ("P95", summary.p95_ms),
        ("P99", summary.p99_ms),
    ):
        out.append(f"{label}: {value:.0f} ms")
    if summary.earliest_request_ms_at is not None:
        out.append(f"request_ms cutover at: {summary.earliest_request_ms_at}")
        out.append("")
        out.append("Note: the earliest ~1-2 rows after each revision deployment include")
        out.append("the 15-30 s embedding-model warm-up cost; this is kept in the data as")
        out.append("honest signal. P95 therefore reflects warm-up+steady-state.")
    return "\n".join(out)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def main() -> int:
    """CLI entry point. Returns 0 on success, 1 on missing DB URL or
    connection failure, 2 on an invalid --since value."""
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "--since",
        default="7 days",
        help="Postgres interval string (e.g., '7 days', '24 hours'). Default: 7 days.",
    )
    parser.add_argument(
        "--database-url",
        default=None,
        help="Postgres URL. Falls back to DATABASE_URL env var.",
    )
    args = parser.parse_args()

    # Explicit flag wins over the env var so one-off runs can target another DB.
    db_url = args.database_url or os.environ.get("DATABASE_URL")
    if not db_url:
        print("Error: DATABASE_URL not set (and --database-url not provided)", file=sys.stderr)
        return 1

    try:
        summary = asyncio.run(compute_summary(db_url, args.since))
    except ValueError as e:
        # Invalid --since shape (see _SINCE_PATTERN) — distinct exit code.
        print(f"Error: {e}", file=sys.stderr)
        return 2
    except (OSError, asyncpg.PostgresError) as e:
        print(f"Error connecting to the database: {e}", file=sys.stderr)
        return 1
    print(format_summary(summary, args.since))
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
docforge/sources.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Source configuration — pydantic models + YAML loader.
|
|
2
|
+
|
|
3
|
+
Each entry in `sources.yml` is a ConfluenceSourceConfig or a
|
|
4
|
+
GitRepoSourceConfig, discriminated by the `type` field.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
import yaml
|
|
13
|
+
from pydantic import BaseModel, Field
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ConfluenceSourceConfig(BaseModel):
    """A single Confluence page to crawl and index."""

    type: Literal["confluence_page"]  # discriminator value (see SourceConfig)
    page_id: str  # Confluence page id
    space_key: str  # Confluence space the page lives in
    title: str  # display title used for the indexed source
    # Boost tags for ranking. A mutable default is safe on a pydantic model:
    # pydantic copies field defaults per instance (unlike plain Python defaults).
    tags: list[str] = []
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class GitRepoSourceConfig(BaseModel):
    """A local git repository whose docs are crawled and indexed."""

    type: Literal["git_repo"]  # discriminator value (see SourceConfig)
    repo_path: str  # local filesystem path to the checkout
    include_patterns: list[str] = ["README.md", "CLAUDE.md", "docs/**/*.md"]  # glob patterns
    title: str  # display title used for the indexed source
    # Boost tags for ranking; per-instance default copies are handled by pydantic.
    tags: list[str] = []
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Tagged union over all source kinds: pydantic selects the concrete model by
# inspecting each entry's `type` field, so validation errors name the right model.
SourceConfig = Annotated[
    ConfluenceSourceConfig | GitRepoSourceConfig,
    Field(discriminator="type"),
]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class SourcesFile(BaseModel):
    """Top-level shape of sources.yml: a `sources:` list of tagged entries."""

    sources: list[SourceConfig]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def load_sources(path: str | Path) -> list[SourceConfig]:
    """Load source configurations from a YAML file.

    Args:
        path: Path to a sources.yml-shaped file.

    Returns:
        The validated list of source configs.

    Raises:
        OSError: if the file cannot be opened.
        pydantic.ValidationError: if the YAML does not match SourcesFile.
    """
    # Read as UTF-8 explicitly (matching the rest of the codebase) rather than
    # relying on the locale's default encoding.
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    return SourcesFile.model_validate(data).sources
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
-- Speed up filtering sources by ingest status (e.g. status = 'pending').
CREATE INDEX IF NOT EXISTS sources_status_idx ON sources (status);
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
-- Per-request search log, used for usage analytics and latency reporting.
CREATE TABLE IF NOT EXISTS query_log (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    user_name TEXT NOT NULL,
    team_name TEXT NOT NULL,
    area_name TEXT,
    query TEXT NOT NULL,
    result_count INT NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Time-window reports (e.g. latency_report --since '7 days') filter on created_at.
CREATE INDEX IF NOT EXISTS query_log_created_at_idx ON query_log (created_at);
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
-- C4.3: per-request latency in milliseconds; NULL for rows logged before cutover.
ALTER TABLE query_log ADD COLUMN IF NOT EXISTS request_ms INT;
|
docforge/sql/schema.sql
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
-- pgvector provides the `vector` type + HNSW index used for similarity search.
CREATE EXTENSION IF NOT EXISTS vector;

-- One row per crawled document (Confluence page or git-repo doc).
CREATE TABLE IF NOT EXISTS sources (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    type TEXT NOT NULL,
    url TEXT NOT NULL,
    title TEXT NOT NULL,
    confluence_page_id TEXT,
    confluence_space_key TEXT,
    source_identifier TEXT,
    last_crawled_at TIMESTAMPTZ,
    content_hash TEXT,              -- presumably used to detect unchanged content — confirm in ingest.py
    status TEXT DEFAULT 'pending',
    created_at TIMESTAMPTZ DEFAULT now(),
    -- At most one row per Confluence page.
    CONSTRAINT sources_confluence_page_id_unique UNIQUE (confluence_page_id)
);

-- Embedded text chunks; removed automatically with their parent source.
CREATE TABLE IF NOT EXISTS chunks (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    source_id UUID NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
    chunk_index INT NOT NULL,
    text TEXT NOT NULL,
    embedding vector(768),          -- must match embedding.dimensions in docforge.yml
    section_title TEXT,
    created_at TIMESTAMPTZ DEFAULT now()
);

-- Approximate nearest-neighbour index for cosine-similarity search.
CREATE INDEX IF NOT EXISTS chunks_embedding_idx ON chunks
    USING hnsw (embedding vector_cosine_ops);
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# docforge.yml — main configuration
|
|
2
|
+
# Secrets (API tokens, passwords) go in .env, not here.
|
|
3
|
+
|
|
4
|
+
database_url: postgresql://docforge:localdev@localhost:5432/docforge
|
|
5
|
+
|
|
6
|
+
embedding:
|
|
7
|
+
model: google/embeddinggemma-300m
|
|
8
|
+
dimensions: 768
|
|
9
|
+
chunk_max_tokens: 500
|
|
10
|
+
|
|
11
|
+
sources_file: sources.yml
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Lightweight MCP client for docforge.
|
|
2
|
+
|
|
3
|
+
Calls a hosted search API over HTTP. No local database or model needed.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
pip install httpx fastmcp
|
|
7
|
+
claude mcp add -s user docforge -- python mcp_client.py
|
|
8
|
+
|
|
9
|
+
Environment:
|
|
10
|
+
DOCFORGE_API_URL: Base URL of the search API
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
|
|
17
|
+
import httpx
|
|
18
|
+
from fastmcp import FastMCP
|
|
19
|
+
|
|
20
|
+
# Base URL of the hosted search API; override via DOCFORGE_API_URL.
API_URL = os.environ.get("DOCFORGE_API_URL", "http://localhost:8000")

# MCP server instance; `instructions` is surfaced to the MCP client so it
# knows when to reach for the tools defined below.
mcp = FastMCP(
    "docforge",
    instructions=(
        "Search across your team's indexed documentation including architecture, "
        "coding guidelines, and cross-team interfaces. "
        "Use the search_documentation tool when you need information about "
        "other teams, shared practices, or organizational knowledge."
    ),
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@mcp.tool()
async def search_documentation(query: str, limit: int = 5) -> str:
    """Search across indexed documentation from Confluence pages and git repos.

    Args:
        query: Natural language search query.
        limit: Maximum number of results to return (default 5).

    Returns:
        Markdown: one section per hit (rank, score, source, optional section
        title, URL, then the chunk text), or a no-results message.
    """
    # Short-lived client per call keeps the tool stateless.
    async with httpx.AsyncClient(timeout=30.0) as client:
        resp = await client.post(
            f"{API_URL}/search",
            json={"query": query, "limit": limit},
        )
        resp.raise_for_status()
        data = resp.json()

    if not data["results"]:
        return "No documentation found matching your query."

    parts: list[str] = []
    for i, result in enumerate(data["results"], 1):
        header = f"**Result {i}** (relevance: {result['similarity']:.2f})"
        header += f" -- {result['source_title']}"
        if result.get("section_title"):
            header += f" > {result['section_title']}"
        header += f"\nSource: {result['source_url']}"
        parts.append(f"{header}\n\n{result['text']}")

    return "\n\n---\n\n".join(parts)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@mcp.tool()
async def list_sources() -> str:
    """List all documentation sources currently indexed.

    Returns:
        A markdown bullet list (title, chunk count, status per source), or a
        message when nothing is indexed yet.
    """
    async with httpx.AsyncClient(timeout=10.0) as client:
        resp = await client.get(f"{API_URL}/sources")
        resp.raise_for_status()
        data = resp.json()

    if not data["sources"]:
        return "No sources indexed."

    lines = [f"**{data['count']} indexed sources:**\n"]
    for src in data["sources"]:
        lines.append(f"- **{src['title']}** ({src['chunk_count']} chunks, {src['status']})")

    return "\n".join(lines)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
if __name__ == "__main__":
|
|
83
|
+
mcp.run()
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# sources.yml — documentation sources to index
|
|
2
|
+
# Uncomment and edit the examples below.
|
|
3
|
+
|
|
4
|
+
sources: []
|
|
5
|
+
|
|
6
|
+
# Confluence pages (need CONFLUENCE_* vars in .env):
|
|
7
|
+
# - type: confluence_page
|
|
8
|
+
# page_id: "12345"
|
|
9
|
+
# space_key: MYSPACE
|
|
10
|
+
# title: "My Team's Documentation"
|
|
11
|
+
#
|
|
12
|
+
# - type: confluence_page
|
|
13
|
+
# page_id: "67890"
|
|
14
|
+
# space_key: MYSPACE
|
|
15
|
+
# title: "Architecture Guidelines"
|
|
16
|
+
|
|
17
|
+
# Local git repos (no auth needed):
|
|
18
|
+
# - type: git_repo
|
|
19
|
+
# repo_path: "/path/to/my-repo"
|
|
20
|
+
# include_patterns: ["README.md", "CLAUDE.md", "docs/**/*.md"]
|
|
21
|
+
# title: "My Repo"
|