docforge_cli-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. docforge/__init__.py +0 -0
  2. docforge/__main__.py +5 -0
  3. docforge/api.py +266 -0
  4. docforge/cli.py +296 -0
  5. docforge/config.py +99 -0
  6. docforge/crawlers/__init__.py +1 -0
  7. docforge/crawlers/confluence.py +109 -0
  8. docforge/crawlers/git.py +79 -0
  9. docforge/db.py +57 -0
  10. docforge/ingest.py +401 -0
  11. docforge/lint.py +92 -0
  12. docforge/mcp_server.py +188 -0
  13. docforge/processors/__init__.py +1 -0
  14. docforge/processors/chunker.py +141 -0
  15. docforge/processors/embedder.py +78 -0
  16. docforge/processors/parser.py +143 -0
  17. docforge/query_log.py +45 -0
  18. docforge/ranking.py +20 -0
  19. docforge/scripts/__init__.py +1 -0
  20. docforge/scripts/eval_search.py +226 -0
  21. docforge/scripts/latency_report.py +142 -0
  22. docforge/sources.py +46 -0
  23. docforge/sql/migrations/001_add_source_identifier.sql +3 -0
  24. docforge/sql/migrations/002_add_status_index.sql +1 -0
  25. docforge/sql/migrations/003_add_source_tags.sql +4 -0
  26. docforge/sql/migrations/004_add_query_log.sql +11 -0
  27. docforge/sql/migrations/005_add_query_log_user_oid.sql +2 -0
  28. docforge/sql/migrations/006_add_query_log_request_ms.sql +1 -0
  29. docforge/sql/schema.sql +29 -0
  30. docforge/templates/docforge.yml +11 -0
  31. docforge/templates/docker-compose.yml +14 -0
  32. docforge/templates/mcp_client.py +83 -0
  33. docforge/templates/sources.yml +21 -0
  34. docforge_cli-0.2.0.dist-info/METADATA +178 -0
  35. docforge_cli-0.2.0.dist-info/RECORD +39 -0
  36. docforge_cli-0.2.0.dist-info/WHEEL +5 -0
  37. docforge_cli-0.2.0.dist-info/entry_points.txt +2 -0
  38. docforge_cli-0.2.0.dist-info/licenses/LICENSE +21 -0
  39. docforge_cli-0.2.0.dist-info/top_level.txt +1 -0
docforge/lint.py ADDED
@@ -0,0 +1,92 @@
+ """Lint a repo's README + CLAUDE.md + docs/ for Spec B banned-content patterns.
+
+ Pure logic — see `docforge.cli.lint_docs` for the user entry point. Banned-content
+ only in v1; structural/required-topics linting is deferred (see Spec C2 follow-ups).
+ """
+
+ from __future__ import annotations
+
+ import re
+ from dataclasses import dataclass, field
+ from pathlib import Path
+
+ from docforge.crawlers.git import crawl_repo
+
+
+ @dataclass(frozen=True)
+ class LintFinding:
+     file: str
+     line: int
+     rule: str
+     message: str
+
+
+ @dataclass(frozen=True)
+ class LintReport:
+     scanned: list[str] = field(default_factory=list)
+     findings: list[LintFinding] = field(default_factory=list)
+
+
+ BANNED_RULES: list[tuple[str, str, str]] = [
+     (
+         "todo-placeholder",
+         r"TODO.*(Explain|Contribute)",
+         "Placeholder TODO — delete and write real content",
+     ),
+     (
+         "readme-inspiration-link",
+         r"create-a-readme",
+         "Microsoft README inspirational link — delete",
+     ),
+     (
+         "readme-boilerplate",
+         r"(github\.com/aspnet/Home|Microsoft/vscode|Microsoft/ChakraCore)",
+         "Azure DevOps default boilerplate — delete whole block",
+     ),
+     (
+         "lastpass-reference",
+         r"LastPass",
+         "Credential-source reference — move to Teams channel, not indexed docs",
+     ),
+ ]
+
+ _COMPILED_BANNED_RULES = [(name, re.compile(pat), msg) for name, pat, msg in BANNED_RULES]
+
+
+ def lint_repo(repo_root: Path) -> LintReport:
+     """Walk the repo's doc surface and scan for banned-content patterns. Read-only."""
+     files = crawl_repo(str(repo_root))
+     scanned = [f.file_path.replace("\\", "/") for f in files]
+     findings: list[LintFinding] = []
+     for f in files:
+         rel = f.file_path.replace("\\", "/")
+         for lineno, line in enumerate(f.content.splitlines(), start=1):
+             for rule_name, pattern, message in _COMPILED_BANNED_RULES:
+                 if pattern.search(line):
+                     findings.append(LintFinding(rel, lineno, rule_name, message))
+     return LintReport(scanned=scanned, findings=findings)
+
+
+ def format_report(report: LintReport, repo_root: Path) -> str:
+     """Human-readable stdout, grouped by file with summary line."""
+     lines: list[str] = []
+     header = "FAIL" if report.findings else "PASS"
+     lines.append(f"{repo_root} — {header}")
+     lines.append(f" {len(report.scanned)} files scanned")
+     if not report.findings:
+         lines.append(" No banned content")
+         return "\n".join(lines)
+
+     lines.append("")
+     lines.append(" Banned content:")
+     by_file: dict[str, list[LintFinding]] = {}
+     for f in report.findings:
+         by_file.setdefault(f.file, []).append(f)
+     for file_rel, file_findings in by_file.items():
+         for f in file_findings:
+             location = f"{file_rel}:{f.line}"
+             lines.append(f" FAIL {location:<30} {f.rule:<22} {f.message}")
+
+     lines.append("")
+     lines.append(f" Summary: {len(report.findings)} banned-content hits")
+     return "\n".join(lines)
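For context, a minimal caller sketch (not part of this diff): the real entry point is `docforge.cli.lint_docs`, which is not shown here, and the non-zero-exit-on-findings convention below is an assumption.

import sys
from pathlib import Path

from docforge.lint import format_report, lint_repo


def run_lint(path: str) -> int:
    # Scan the repo's doc surface and print the grouped report.
    report = lint_repo(Path(path))
    print(format_report(report, Path(path)))
    # Assumed convention: exit non-zero when any banned-content finding exists.
    return 1 if report.findings else 0


if __name__ == "__main__":
    sys.exit(run_lint(sys.argv[1] if len(sys.argv) > 1 else "."))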
docforge/mcp_server.py ADDED
@@ -0,0 +1,188 @@
+ """MCP server exposing documentation search to AI coding assistants.
+
+ Run with: python -m docforge.mcp_server
+ """
+
+ from __future__ import annotations
+
+ import logging
+
+ import numpy as np
+ from fastmcp import FastMCP
+
+ from docforge.config import Settings
+ from docforge.db import get_pool
+ from docforge.processors.embedder import Embedder
+
+ logger = logging.getLogger(__name__)
+
+ mcp = FastMCP(
+     "knowledge-hub",
+     instructions=(
+         "Search across your team's indexed documentation including team responsibilities, "
+         "coding guidelines, architecture standards, and cross-team interfaces. "
+         "Use the search_documentation tool when you need information about other teams, "
+         "shared coding practices, or organizational knowledge."
+     ),
+ )
+
+ # Initialized lazily on first search
+ _embedder: Embedder | None = None
+ _settings: Settings | None = None
+
+
+ def _get_settings() -> Settings:
+     global _settings
+     if _settings is None:
+         _settings = Settings()
+     return _settings
+
+
+ def _get_embedder() -> Embedder:
+     global _embedder
+     if _embedder is None:
+         settings = _get_settings()
+         logger.info("Loading embedding model (this may take a few seconds)...")
+         _embedder = Embedder(
+             settings.embedding_model, hf_token=settings.hf_token.get_secret_value()
+         )
+     return _embedder
+
+
+ @mcp.tool()
+ async def search_documentation(
+     query: str,
+     user_name: str,
+     team_name: str,
+     area_name: str | None = None,
+     limit: int = 5,
+ ) -> str:
+     """Search across indexed documentation from Confluence pages and git repos.
+
+     Returns relevant documentation chunks with source attribution. Use this to find
+     information about team ownership, coding guidelines, architecture decisions,
+     and cross-team interfaces.
+
+     Args:
+         query: Natural language search query.
+         user_name: Your name (e.g., "tobias.ens"). Used for usage telemetry.
+         team_name: Your team tag (e.g., "ccl"). Boosts team-tagged docs.
+         area_name: Your area tag (e.g., "cloud"). Optional; boosts area-tagged docs.
+         limit: Maximum number of results to return (default 5).
+     """
+     settings = _get_settings()
+     embedder = _get_embedder()
+
+     query_vector = embedder.embed_query(query)
+     user_tags = [team_name] + ([area_name] if area_name else [])
+
+     pool = await get_pool(settings.database_url)
+     async with pool.acquire() as conn:
+         rows = await conn.fetch(
+             """
+             SELECT
+                 c.text,
+                 c.section_title,
+                 s.title AS source_title,
+                 s.url AS source_url,
+                 s.tags AS source_tags,
+                 1 - (c.embedding <=> $1::vector) AS similarity,
+                 (1 - (c.embedding <=> $1::vector)) *
+                 (1
+                     + $2::float * cardinality(
+                         ARRAY(SELECT unnest(s.tags) INTERSECT SELECT unnest($3::text[]))
+                     )
+                     + $4::float * (CASE WHEN 'org' = ANY(s.tags) THEN 1 ELSE 0 END)
+                 ) AS boosted_score
+             FROM chunks c
+             JOIN sources s ON c.source_id = s.id
+             WHERE s.status = 'active'
+             ORDER BY boosted_score DESC
+             LIMIT $5
+             """,
+             np.array(query_vector, dtype=np.float32),
+             settings.tag_match_weight,
+             user_tags,
+             settings.org_tag_weight,
+             limit,
+         )
+
+     from docforge.query_log import log_query
+
+     await log_query(pool, user_name, team_name, area_name, query, len(rows))
+
+     if not rows:
+         return (
+             "No documentation found matching your query. "
+             "The index may be empty -- run `python -m docforge ingest` to populate it."
+         )
+
+     parts: list[str] = []
+     for i, row in enumerate(rows, 1):
+         similarity = row["similarity"]
+         source = row["source_title"]
+         url = row["source_url"]
+         section = row["section_title"]
+         text = row["text"]
+         tags = list(row["source_tags"] or [])
+
+         header = f"**Result {i}** (relevance: {similarity:.2f}) — {source}"
+         if section:
+             header += f" > {section}"
+         header += f"\nSource: {url}"
+         if tags:
+             header += f"\nTags: {', '.join(tags)}"
+
+         parts.append(f"{header}\n\n{text}")
+
+     return "\n\n---\n\n".join(parts)
+
+
+ @mcp.tool()
+ async def list_sources() -> str:
+     """List all documentation sources currently indexed in the knowledge hub.
+
+     Returns the title, URL, status, and last crawl time for each source.
+     Use this to see what documentation is available for searching.
+     """
+     settings = _get_settings()
+     pool = await get_pool(settings.database_url)
+
+     async with pool.acquire() as conn:
+         rows = await conn.fetch(
+             """
+             SELECT title, url, status, last_crawled_at,
+                    (SELECT count(*) FROM chunks WHERE source_id = s.id) AS chunk_count
+             FROM sources s
+             ORDER BY title
+             """
+         )
+
+     if not rows:
+         return "No sources indexed yet. Run `python -m docforge ingest` to populate."
+
+     lines: list[str] = []
+     for row in rows:
+         last = row["last_crawled_at"]
+         crawled = last.strftime("%Y-%m-%d %H:%M") if last else "never"
+         lines.append(
+             f"- **{row['title']}** ({row['chunk_count']} chunks, {row['status']})\n"
+             f" Last crawled: {crawled}\n"
+             f" {row['url']}"
+         )
+
+     return f"**{len(rows)} indexed sources:**\n\n" + "\n\n".join(lines)
+
+
+ def main() -> None:
+     """Configure logging and start the FastMCP server on stdio transport."""
+     logging.basicConfig(
+         level=logging.INFO,
+         format="%(asctime)s %(levelname)-8s %(name)s: %(message)s",
+         datefmt="%H:%M:%S",
+     )
+     mcp.run()
+
+
+ if __name__ == "__main__":
+     main()
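The ORDER BY expression in search_documentation implements a simple multiplicative boost: cosine similarity is scaled by 1 plus a per-tag-overlap weight plus an org-wide bonus. A Python mirror of that arithmetic, for illustration only (the 0.15 and 0.10 weights are stand-ins, not the package's actual Settings defaults):

def boosted_score(
    similarity: float,
    source_tags: set[str],
    user_tags: set[str],
    tag_match_weight: float = 0.15,   # stand-in for settings.tag_match_weight
    org_tag_weight: float = 0.10,     # stand-in for settings.org_tag_weight
) -> float:
    # cardinality(ARRAY(... INTERSECT ...)) in the SQL above
    overlap = len(source_tags & user_tags)
    # CASE WHEN 'org' = ANY(s.tags) THEN 1 ELSE 0 END
    org_bonus = 1 if "org" in source_tags else 0
    return similarity * (1 + tag_match_weight * overlap + org_tag_weight * org_bonus)

# Example: similarity 0.80, one matching team tag, org-tagged source:
# 0.80 * (1 + 0.15 * 1 + 0.10 * 1) = 0.80 * 1.25 = 1.00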
docforge/processors/__init__.py ADDED
@@ -0,0 +1 @@
+ """Text processors — HTML parser, token-aware chunker, embedder."""
docforge/processors/chunker.py ADDED
@@ -0,0 +1,141 @@
+ """Token-aware chunker — splits sections into chunks under a token limit."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+
+ from docforge.processors.parser import Section
+
+
+ @dataclass
+ class Chunk:
+     text: str
+     section_title: str
+     chunk_index: int
+
+
+ def chunk_sections(
+     sections: list[Section],
+     max_tokens: int = 500,
+     tokenizer_fn: callable | None = None,
+ ) -> list[Chunk]:
+     """Split sections into chunks of roughly max_tokens size.
+
+     Splits on section boundaries first, then on paragraph boundaries
+     if a section exceeds max_tokens.
+
+     Args:
+         sections: Parsed sections from the document.
+         max_tokens: Maximum tokens per chunk.
+         tokenizer_fn: Function that counts tokens in a string.
+             If None, uses a simple word-count approximation
+             (1 token ~ 0.75 words).
+     """
+     if tokenizer_fn is None:
+         tokenizer_fn = _approximate_token_count
+
+     chunks: list[Chunk] = []
+     chunk_index = 0
+
+     for section in sections:
+         text = section.text.strip()
+         if not text:
+             continue
+
+         # Add section title as context prefix
+         if section.title:
+             text = f"{section.title}\n\n{text}"
+
+         if tokenizer_fn(text) <= max_tokens:
+             chunks.append(Chunk(text=text, section_title=section.title, chunk_index=chunk_index))
+             chunk_index += 1
+         else:
+             # Split on paragraph boundaries
+             sub_chunks = _split_by_paragraphs(text, max_tokens, tokenizer_fn)
+             for sub_text in sub_chunks:
+                 chunks.append(
+                     Chunk(text=sub_text, section_title=section.title, chunk_index=chunk_index)
+                 )
+                 chunk_index += 1
+
+     return chunks
+
+
+ def _split_by_paragraphs(text: str, max_tokens: int, tokenizer_fn: callable) -> list[str]:
+     """Split text into chunks by paragraph boundaries."""
+     paragraphs = text.split("\n")
+     result: list[str] = []
+     current_parts: list[str] = []
+     current_tokens = 0
+
+     for paragraph in paragraphs:
+         paragraph = paragraph.strip()
+         if not paragraph:
+             continue
+
+         para_tokens = tokenizer_fn(paragraph)
+
+         if para_tokens > max_tokens:
+             # Flush current buffer
+             if current_parts:
+                 result.append("\n".join(current_parts))
+                 current_parts = []
+                 current_tokens = 0
+
+             # Split long paragraph by sentences
+             for sentence_chunk in _split_long_text(paragraph, max_tokens, tokenizer_fn):
+                 result.append(sentence_chunk)
+             continue
+
+         if current_tokens + para_tokens > max_tokens and current_parts:
+             result.append("\n".join(current_parts))
+             current_parts = []
+             current_tokens = 0
+
+         current_parts.append(paragraph)
+         current_tokens += para_tokens
+
+     if current_parts:
+         result.append("\n".join(current_parts))
+
+     return result
+
+
+ def _split_long_text(text: str, max_tokens: int, tokenizer_fn: callable) -> list[str]:
+     """Split text that exceeds max_tokens by sentence boundaries; a single over-limit sentence is kept whole."""
+     # Try splitting by sentences (period followed by space)
+     sentences = text.replace(". ", ".\n").split("\n")
+
+     result: list[str] = []
+     current_parts: list[str] = []
+     current_tokens = 0
+
+     for sentence in sentences:
+         sentence = sentence.strip()
+         if not sentence:
+             continue
+
+         sent_tokens = tokenizer_fn(sentence)
+
+         if current_tokens + sent_tokens > max_tokens and current_parts:
+             result.append(" ".join(current_parts))
+             current_parts = []
+             current_tokens = 0
+
+         current_parts.append(sentence)
+         current_tokens += sent_tokens
+
+     if current_parts:
+         result.append(" ".join(current_parts))
+
+     return result
+
+
+ def _approximate_token_count(text: str) -> int:
+     """Approximate token count using word count.
+
+     Roughly 1 token ~ 0.75 words for English text.
+     This is used as fallback when no model tokenizer is available.
+     """
+     words = len(text.split())
+     return int(words / 0.75)
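A short usage sketch with made-up sections, relying on the word-count fallback described above (a 100-word section counts as roughly 133 tokens, so it stays in a single chunk):

from docforge.processors.chunker import chunk_sections
from docforge.processors.parser import Section

sections = [
    Section(title="Team Overview", text="We own the billing service. " * 20),
    Section(title="On-call", text="Rotation is weekly."),
]

chunks = chunk_sections(sections, max_tokens=500)
for c in chunks:
    print(c.chunk_index, c.section_title, len(c.text.split()), "words")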
docforge/processors/embedder.py ADDED
@@ -0,0 +1,78 @@
+ from __future__ import annotations
+
+ import logging
+ import os
+ from typing import Callable
+
+ logger = logging.getLogger(__name__)
+
+
+ class Embedder:
+     """Generates text embeddings using a sentence-transformers model.
+
+     Loads the model once at initialization and reuses it for all calls.
+     Default model is EmbeddingGemma-300M (768 dimensions).
+     Falls back to all-MiniLM-L6-v2 (384 dimensions) if the primary model fails to load.
+     """
+
+     def __init__(self, model_name: str = "google/embeddinggemma-300m", hf_token: str = "") -> None:
+         from sentence_transformers import SentenceTransformer
+
+         # Use provided token, fall back to environment variable
+         if not hf_token:
+             hf_token = os.environ.get("HF_TOKEN", "")
+
+         try:
+             logger.info("Loading embedding model: %s", model_name)
+             self._model = SentenceTransformer(model_name, token=hf_token or None)
+             self.model_name = model_name
+             self.dimensions = self._model.get_sentence_embedding_dimension()
+             logger.info("Model loaded: %s (%d dimensions)", self.model_name, self.dimensions)
+         except Exception:
+             fallback = "sentence-transformers/all-MiniLM-L6-v2"
+             logger.warning(
+                 "Failed to load %s, falling back to %s",
+                 model_name,
+                 fallback,
+                 exc_info=True,
+             )
+             try:
+                 self._model = SentenceTransformer(fallback)
+                 self.model_name = fallback
+                 self.dimensions = self._model.get_sentence_embedding_dimension()
+                 logger.info(
+                     "Fallback model loaded: %s (%d dimensions)",
+                     self.model_name,
+                     self.dimensions,
+                 )
+             except Exception:
+                 logger.error("Failed to load fallback model %s", fallback, exc_info=True)
+                 raise RuntimeError(
+                     f"No embedding model available. "
+                     f"Primary ({model_name}) and fallback ({fallback}) both failed."
+                 )
+
+     def embed(self, texts: list[str]) -> list[list[float]]:
+         """Generate embeddings for a list of texts.
+
+         Returns a list of float vectors, one per input text.
+         """
+         if not texts:
+             return []
+
+         embeddings = self._model.encode(texts, show_progress_bar=False, normalize_embeddings=True)
+         return embeddings.tolist()
+
+     def embed_query(self, query: str) -> list[float]:
+         """Generate embedding for a single search query."""
+         result = self.embed([query])
+         return result[0]
+
+     def get_tokenizer_fn(self) -> Callable[[str], int]:
+         """Return a token-counting function using this model's tokenizer."""
+         tokenizer = self._model.tokenizer
+
+         def count_tokens(text: str) -> int:
+             return len(tokenizer.encode(text, add_special_tokens=False))
+
+         return count_tokens
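One plausible way the three processors compose, using made-up HTML (the actual wiring lives in docforge/ingest.py, which is not shown in this section); the model's own tokenizer replaces the word-count fallback, and HF_TOKEN may be needed for gated models such as the default EmbeddingGemma:

from docforge.processors.chunker import chunk_sections
from docforge.processors.embedder import Embedder
from docforge.processors.parser import parse_confluence_html

html = "<h2>Deployment</h2><p>Services deploy via the shared pipeline.</p>"
sections = parse_confluence_html(html)

embedder = Embedder()                       # default model, or the MiniLM fallback
count_tokens = embedder.get_tokenizer_fn()  # token counts from the model's tokenizer

chunks = chunk_sections(sections, max_tokens=500, tokenizer_fn=count_tokens)
vectors = embedder.embed([c.text for c in chunks])
print(len(chunks), "chunks,", len(vectors[0]), "dimensions")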
docforge/processors/parser.py ADDED
@@ -0,0 +1,143 @@
+ """Confluence storage-format HTML parser — yields Section objects."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+
+ from bs4 import BeautifulSoup, Tag
+
+
+ @dataclass
+ class Section:
+     title: str
+     text: str
+     level: int = 0
+
+
+ def parse_confluence_html(html: str) -> list[Section]:
+     """Parse Confluence storage-format HTML into a list of text sections.
+
+     Handles Confluence-specific elements:
+     - Headings become section boundaries
+     - Tables are converted to readable text
+     - Custom macros (smartlinks, status, emoji) are handled
+     - Empty sections are dropped
+     """
+     soup = BeautifulSoup(html, "html.parser")
+
+     _clean_confluence_macros(soup)
+
+     sections: list[Section] = []
+     current_title = ""
+     current_level = 0
+     current_parts: list[str] = []
+
+     for element in soup.children:
+         if not isinstance(element, Tag):
+             text = element.get_text(strip=True)
+             if text:
+                 current_parts.append(text)
+             continue
+
+         if element.name in ("h1", "h2", "h3", "h4", "h5", "h6"):
+             # Flush previous section
+             if current_parts:
+                 combined = "\n".join(current_parts).strip()
+                 if combined:
+                     sections.append(
+                         Section(title=current_title, text=combined, level=current_level)
+                     )
+                 current_parts = []
+
+             current_title = element.get_text(strip=True)
+             current_level = int(element.name[1])
+
+         elif element.name == "table":
+             current_parts.append(_table_to_text(element))
+
+         else:
+             text = element.get_text(separator=" ", strip=True)
+             if text:
+                 current_parts.append(text)
+
+     # Flush last section
+     if current_parts:
+         combined = "\n".join(current_parts).strip()
+         if combined:
+             sections.append(Section(title=current_title, text=combined, level=current_level))
+
+     return sections
+
+
+ def _clean_confluence_macros(soup: BeautifulSoup) -> None:
+     """Process Confluence custom elements in-place."""
+     for custom in soup.find_all("custom"):
+         data_type = custom.get("data-type", "")
+
+         if data_type == "smartlink":
+             # Replace smart links with their URL
+             href = custom.get_text(strip=True)
+             if href.startswith("http"):
+                 custom.replace_with(href)
+             else:
+                 custom.replace_with(custom.get_text(strip=True))
+
+         elif data_type == "emoji":
+             # Strip emojis
+             custom.decompose()
+
+         elif data_type == "status":
+             # Convert status badges to text
+             status_text = custom.get_text(strip=True)
+             custom.replace_with(f"[{status_text}]")
+
+         else:
+             # Unknown custom element — keep as text
+             custom.replace_with(custom.get_text(strip=True))
+
+     # Also handle ac:structured-macro, ac:rich-text-body etc. (Confluence Server format)
+     for macro in soup.find_all("ac:structured-macro"):
+         body = macro.find("ac:rich-text-body")
+         if body:
+             macro.replace_with(body.get_text(separator=" ", strip=True))
+         else:
+             macro.decompose()
+
+
+ def _table_to_text(table: Tag) -> str:
+     """Convert an HTML table to readable plain text.
+
+     For tables with headers, produces "header: value" pairs per row.
+     For tables without headers, produces pipe-separated rows.
+     """
+     rows = table.find_all("tr")
+     if not rows:
+         return ""
+
+     # Extract headers from first row
+     headers: list[str] = []
+     first_row = rows[0]
+     header_cells = first_row.find_all(["th"])
+     if header_cells:
+         headers = [cell.get_text(separator=" ", strip=True) for cell in header_cells]
+         data_rows = rows[1:]
+     else:
+         data_rows = rows
+
+     lines: list[str] = []
+
+     for row in data_rows:
+         cells = row.find_all(["td", "th"])
+         values = [cell.get_text(separator=" ", strip=True) for cell in cells]
+
+         if headers and len(values) == len(headers):
+             # Format as "header: value" pairs, skip empty values
+             pairs = [f"{h}: {v}" for h, v in zip(headers, values) if v]
+             if pairs:
+                 lines.append(" | ".join(pairs))
+         elif values:
+             non_empty = [v for v in values if v]
+             if non_empty:
+                 lines.append(" | ".join(non_empty))
+
+     return "\n".join(lines)
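An illustrative input/output pair for the table handling above, with made-up page content:

from docforge.processors.parser import parse_confluence_html

html = (
    "<h2>Ownership</h2>"
    "<table><tr><th>Service</th><th>Team</th></tr>"
    "<tr><td>billing-api</td><td>ccl</td></tr></table>"
)
for section in parse_confluence_html(html):
    print(section.title, "->", section.text)
# Expected, per the rules above: Ownership -> Service: billing-api | Team: ccl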
docforge/query_log.py ADDED
@@ -0,0 +1,45 @@
+ """Async helper for inserting rows into query_log.
+
+ Failures are logged and swallowed — query logging must never break a search.
+ """
+
+ from __future__ import annotations
+
+ import logging
+
+ import asyncpg
+
+ logger = logging.getLogger(__name__)
+
+
+ async def log_query(
+     pool: asyncpg.Pool,
+     user_name: str,
+     team_name: str,
+     area_name: str | None,
+     query: str,
+     result_count: int,
+     user_oid: str | None = None,
+     request_ms: int | None = None,
+ ) -> None:
+     """Record a search request. user_oid is the Entra object ID (post-auth)
+     or None (pre-auth rows). request_ms is the handler's wall-clock time in
+     milliseconds (post-C4.3) or None (pre-C4.3 rows). Never raises."""
+     try:
+         async with pool.acquire() as conn:
+             await conn.execute(
+                 """
+                 INSERT INTO query_log
+                     (user_name, team_name, area_name, query, result_count, user_oid, request_ms)
+                 VALUES ($1, $2, $3, $4, $5, $6, $7)
+                 """,
+                 user_name,
+                 team_name,
+                 area_name,
+                 query,
+                 result_count,
+                 user_oid,
+                 request_ms,
+             )
+     except Exception as e:
+         logger.warning("query_log insert failed: %s", e)
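A hypothetical caller showing the two optional columns in use; the shipped search_documentation handler passes only the first six arguments, so its rows have NULL user_oid and request_ms:

import time

from docforge.db import get_pool
from docforge.query_log import log_query


async def timed_search(settings, user_name: str, team_name: str, query: str) -> None:
    # settings is assumed to be a docforge.config.Settings instance
    pool = await get_pool(settings.database_url)
    start = time.monotonic()
    rows = []  # ... run the actual vector search here ...
    elapsed_ms = int((time.monotonic() - start) * 1000)
    await log_query(
        pool, user_name, team_name, None, query, len(rows),
        user_oid=None, request_ms=elapsed_ms,
    )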