docforge-cli 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docforge/__init__.py +0 -0
- docforge/__main__.py +5 -0
- docforge/api.py +266 -0
- docforge/cli.py +296 -0
- docforge/config.py +99 -0
- docforge/crawlers/__init__.py +1 -0
- docforge/crawlers/confluence.py +109 -0
- docforge/crawlers/git.py +79 -0
- docforge/db.py +57 -0
- docforge/ingest.py +401 -0
- docforge/lint.py +92 -0
- docforge/mcp_server.py +188 -0
- docforge/processors/__init__.py +1 -0
- docforge/processors/chunker.py +141 -0
- docforge/processors/embedder.py +78 -0
- docforge/processors/parser.py +143 -0
- docforge/query_log.py +45 -0
- docforge/ranking.py +20 -0
- docforge/scripts/__init__.py +1 -0
- docforge/scripts/eval_search.py +226 -0
- docforge/scripts/latency_report.py +142 -0
- docforge/sources.py +46 -0
- docforge/sql/migrations/001_add_source_identifier.sql +3 -0
- docforge/sql/migrations/002_add_status_index.sql +1 -0
- docforge/sql/migrations/003_add_source_tags.sql +4 -0
- docforge/sql/migrations/004_add_query_log.sql +11 -0
- docforge/sql/migrations/005_add_query_log_user_oid.sql +2 -0
- docforge/sql/migrations/006_add_query_log_request_ms.sql +1 -0
- docforge/sql/schema.sql +29 -0
- docforge/templates/docforge.yml +11 -0
- docforge/templates/docker-compose.yml +14 -0
- docforge/templates/mcp_client.py +83 -0
- docforge/templates/sources.yml +21 -0
- docforge_cli-0.2.0.dist-info/METADATA +178 -0
- docforge_cli-0.2.0.dist-info/RECORD +39 -0
- docforge_cli-0.2.0.dist-info/WHEEL +5 -0
- docforge_cli-0.2.0.dist-info/entry_points.txt +2 -0
- docforge_cli-0.2.0.dist-info/licenses/LICENSE +21 -0
- docforge_cli-0.2.0.dist-info/top_level.txt +1 -0
docforge/lint.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Lint a repo's README + CLAUDE.md + docs/ for Spec B banned-content patterns.
|
|
2
|
+
|
|
3
|
+
Pure logic — see `docforge.cli.lint_docs` for the user entry point. Banned-content
|
|
4
|
+
only in v1; structural/required-topics linting is deferred (see Spec C2 follow-ups).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from docforge.crawlers.git import crawl_repo
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True)
class LintFinding:
    """One banned-content hit at a specific file and line."""

    file: str  # repo-relative path, forward-slash normalized
    line: int  # 1-based line number within the file
    rule: str  # rule name from BANNED_RULES (e.g. "todo-placeholder")
    message: str  # human-readable remediation hint for the report
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass(frozen=True)
class LintReport:
    """Result of linting one repo: every file scanned plus all hits found."""

    scanned: list[str] = field(default_factory=list)  # all doc files examined
    findings: list[LintFinding] = field(default_factory=list)  # empty means PASS
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# (rule-name, regex pattern, remediation message) triples. Patterns are
# matched per line with re.search, case-sensitive.
BANNED_RULES: list[tuple[str, str, str]] = [
    (
        "todo-placeholder",
        r"TODO.*(Explain|Contribute)",
        "Placeholder TODO — delete and write real content",
    ),
    (
        "readme-inspiration-link",
        r"create-a-readme",
        "Microsoft README inspirational link — delete",
    ),
    (
        "readme-boilerplate",
        r"(github\.com/aspnet/Home|Microsoft/vscode|Microsoft/ChakraCore)",
        "Azure DevOps default boilerplate — delete whole block",
    ),
    (
        "lastpass-reference",
        r"LastPass",
        "Credential-source reference — move to Teams channel, not indexed docs",
    ),
]

# Compiled once at import time so lint_repo's per-line loop avoids re.compile.
_COMPILED_BANNED_RULES = [(name, re.compile(pat), msg) for name, pat, msg in BANNED_RULES]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def lint_repo(repo_root: Path) -> LintReport:
    """Scan the repo's documentation surface for banned-content patterns.

    Read-only: files are crawled and matched, never modified.
    """
    doc_files = crawl_repo(str(repo_root))
    normalized_paths = [doc.file_path.replace("\\", "/") for doc in doc_files]

    hits: list[LintFinding] = []
    for doc in doc_files:
        rel_path = doc.file_path.replace("\\", "/")
        for line_no, line_text in enumerate(doc.content.splitlines(), start=1):
            hits.extend(
                LintFinding(rel_path, line_no, rule_name, message)
                for rule_name, regex, message in _COMPILED_BANNED_RULES
                if regex.search(line_text)
            )

    return LintReport(scanned=normalized_paths, findings=hits)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def format_report(report: LintReport, repo_root: Path) -> str:
    """Render a lint report for stdout: header, per-file findings, summary."""
    status = "FAIL" if report.findings else "PASS"
    out: list[str] = [
        f"{repo_root} — {status}",
        f" {len(report.scanned)} files scanned",
    ]

    # Clean repo: short-circuit with the pass message.
    if not report.findings:
        out.append(" No banned content")
        return "\n".join(out)

    out.append("")
    out.append(" Banned content:")

    # Group findings by file so hits for one file appear together.
    grouped: dict[str, list[LintFinding]] = {}
    for finding in report.findings:
        grouped.setdefault(finding.file, []).append(finding)

    for path, file_hits in grouped.items():
        for hit in file_hits:
            where = f"{path}:{hit.line}"
            out.append(f" FAIL {where:<30} {hit.rule:<22} {hit.message}")

    out.append("")
    out.append(f" Summary: {len(report.findings)} banned-content hits")
    return "\n".join(out)
|
docforge/mcp_server.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""MCP server exposing documentation search to AI coding assistants.
|
|
2
|
+
|
|
3
|
+
Run with: python -m docforge.mcp_server
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
from fastmcp import FastMCP
|
|
12
|
+
|
|
13
|
+
from docforge.config import Settings
|
|
14
|
+
from docforge.db import get_pool
|
|
15
|
+
from docforge.processors.embedder import Embedder
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
# Server identity and the instructions shown to any connected AI assistant.
mcp = FastMCP(
    "knowledge-hub",
    instructions=(
        "Search across your team's indexed documentation including team responsibilities, "
        "coding guidelines, architecture standards, and cross-team interfaces. "
        "Use the search_documentation tool when you need information about other teams, "
        "shared coding practices, or organizational knowledge."
    ),
)

# Initialized lazily on first search
_embedder: Embedder | None = None
_settings: Settings | None = None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _get_settings() -> Settings:
    """Return the process-wide Settings instance, creating it on first use."""
    global _settings
    if _settings is not None:
        return _settings
    _settings = Settings()
    return _settings
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _get_embedder() -> Embedder:
    """Return the shared Embedder, constructing it lazily (model load is slow)."""
    global _embedder
    if _embedder is not None:
        return _embedder
    settings = _get_settings()
    logger.info("Loading embedding model (this may take a few seconds)...")
    _embedder = Embedder(
        settings.embedding_model, hf_token=settings.hf_token.get_secret_value()
    )
    return _embedder
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@mcp.tool()
async def search_documentation(
    query: str,
    user_name: str,
    team_name: str,
    area_name: str | None = None,
    limit: int = 5,
) -> str:
    """Search across indexed documentation from Confluence pages and git repos.

    Returns relevant documentation chunks with source attribution. Use this to find
    information about team ownership, coding guidelines, architecture decisions,
    and cross-team interfaces.

    Args:
        query: Natural language search query.
        user_name: Your name (e.g., "tobias.ens"). Used for usage telemetry.
        team_name: Your team tag (e.g., "ccl"). Boosts team-tagged docs.
        area_name: Your area tag (e.g., "cloud"). Optional; boosts area-tagged docs.
        limit: Maximum number of results to return (default 5).
    """
    settings = _get_settings()
    embedder = _get_embedder()  # lazy: loads the model on first call

    query_vector = embedder.embed_query(query)
    # Tags used for boosting: caller's team, plus area when supplied.
    user_tags = [team_name] + ([area_name] if area_name else [])

    pool = await get_pool(settings.database_url)
    async with pool.acquire() as conn:
        # pgvector `<=>` is cosine distance, so 1 - distance is similarity.
        # boosted_score multiplies similarity by (1 + tag_match_weight per
        # overlapping tag + org_tag_weight when the source carries 'org').
        rows = await conn.fetch(
            """
            SELECT
                c.text,
                c.section_title,
                s.title AS source_title,
                s.url AS source_url,
                s.tags AS source_tags,
                1 - (c.embedding <=> $1::vector) AS similarity,
                (1 - (c.embedding <=> $1::vector)) *
                (1
                 + $2::float * cardinality(
                     ARRAY(SELECT unnest(s.tags) INTERSECT SELECT unnest($3::text[]))
                   )
                 + $4::float * (CASE WHEN 'org' = ANY(s.tags) THEN 1 ELSE 0 END)
                ) AS boosted_score
            FROM chunks c
            JOIN sources s ON c.source_id = s.id
            WHERE s.status = 'active'
            ORDER BY boosted_score DESC
            LIMIT $5
            """,
            np.array(query_vector, dtype=np.float32),
            settings.tag_match_weight,
            user_tags,
            settings.org_tag_weight,
            limit,
        )

    # Local import — presumably to avoid an import cycle; confirm before hoisting.
    from docforge.query_log import log_query

    # Best-effort telemetry; log_query swallows its own failures.
    await log_query(pool, user_name, team_name, area_name, query, len(rows))

    if not rows:
        return (
            "No documentation found matching your query. "
            "The index may be empty -- run `python -m docforge ingest` to populate it."
        )

    # Render each hit as a markdown block with attribution, joined by rules.
    parts: list[str] = []
    for i, row in enumerate(rows, 1):
        similarity = row["similarity"]
        source = row["source_title"]
        url = row["source_url"]
        section = row["section_title"]
        text = row["text"]
        tags = list(row["source_tags"] or [])  # tags column may be NULL

        header = f"**Result {i}** (relevance: {similarity:.2f}) — {source}"
        if section:
            header += f" > {section}"
        header += f"\nSource: {url}"
        if tags:
            header += f"\nTags: {', '.join(tags)}"

        parts.append(f"{header}\n\n{text}")

    return "\n\n---\n\n".join(parts)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@mcp.tool()
async def list_sources() -> str:
    """List all documentation sources currently indexed in the knowledge hub.

    Returns the title, URL, status, and last crawl time for each source.
    Use this to see what documentation is available for searching.
    """
    settings = _get_settings()
    pool = await get_pool(settings.database_url)

    async with pool.acquire() as conn:
        # Correlated subquery counts chunks per source; sorted for stable output.
        rows = await conn.fetch(
            """
            SELECT title, url, status, last_crawled_at,
                   (SELECT count(*) FROM chunks WHERE source_id = s.id) AS chunk_count
            FROM sources s
            ORDER BY title
            """
        )

    if not rows:
        return "No sources indexed yet. Run `python -m docforge ingest` to populate."

    lines: list[str] = []
    for row in rows:
        last = row["last_crawled_at"]  # NULL until the first successful crawl
        crawled = last.strftime("%Y-%m-%d %H:%M") if last else "never"
        lines.append(
            f"- **{row['title']}** ({row['chunk_count']} chunks, {row['status']})\n"
            f"  Last crawled: {crawled}\n"
            f"  {row['url']}"
        )

    return f"**{len(rows)} indexed sources:**\n\n" + "\n\n".join(lines)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def main() -> None:
    """Configure logging and start the FastMCP server on stdio transport."""
    log_format = "%(asctime)s %(levelname)-8s %(name)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format, datefmt="%H:%M:%S")
    # Blocks until the client disconnects or the process is terminated.
    mcp.run()


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Text processors — HTML parser, token-aware chunker, embedder."""
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Token-aware chunker — splits sections into chunks under a token limit."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations

from dataclasses import dataclass
from typing import Callable

from docforge.processors.parser import Section
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class Chunk:
    """One embedding-sized piece of a document."""

    text: str  # chunk body, prefixed with the section title when one exists
    section_title: str  # title of the originating section ("" for untitled)
    chunk_index: int  # position of this chunk within the whole document
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def chunk_sections(
    sections: list[Section],
    max_tokens: int = 500,
    tokenizer_fn: Callable[[str], int] | None = None,
) -> list[Chunk]:
    """Split sections into chunks of roughly max_tokens size.

    Splits on section boundaries first, then on paragraph boundaries
    if a section exceeds max_tokens.

    Args:
        sections: Parsed sections from the document.
        max_tokens: Maximum tokens per chunk.
        tokenizer_fn: Function that counts tokens in a string.
            If None, uses a simple word-count approximation
            (1 token ~ 0.75 words).

    Returns:
        Chunks in document order with sequential chunk_index values.
    """
    if tokenizer_fn is None:
        tokenizer_fn = _approximate_token_count

    chunks: list[Chunk] = []
    chunk_index = 0

    for section in sections:
        text = section.text.strip()
        if not text:
            continue  # drop empty sections entirely

        # Add section title as context prefix so chunks stay self-describing.
        if section.title:
            text = f"{section.title}\n\n{text}"

        if tokenizer_fn(text) <= max_tokens:
            chunks.append(Chunk(text=text, section_title=section.title, chunk_index=chunk_index))
            chunk_index += 1
        else:
            # Oversized section: split on paragraph boundaries.
            for sub_text in _split_by_paragraphs(text, max_tokens, tokenizer_fn):
                chunks.append(
                    Chunk(text=sub_text, section_title=section.title, chunk_index=chunk_index)
                )
                chunk_index += 1

    return chunks
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _split_by_paragraphs(text: str, max_tokens: int, tokenizer_fn: callable) -> list[str]:
|
|
65
|
+
"""Split text into chunks by paragraph boundaries."""
|
|
66
|
+
paragraphs = text.split("\n")
|
|
67
|
+
result: list[str] = []
|
|
68
|
+
current_parts: list[str] = []
|
|
69
|
+
current_tokens = 0
|
|
70
|
+
|
|
71
|
+
for paragraph in paragraphs:
|
|
72
|
+
paragraph = paragraph.strip()
|
|
73
|
+
if not paragraph:
|
|
74
|
+
continue
|
|
75
|
+
|
|
76
|
+
para_tokens = tokenizer_fn(paragraph)
|
|
77
|
+
|
|
78
|
+
if para_tokens > max_tokens:
|
|
79
|
+
# Flush current buffer
|
|
80
|
+
if current_parts:
|
|
81
|
+
result.append("\n".join(current_parts))
|
|
82
|
+
current_parts = []
|
|
83
|
+
current_tokens = 0
|
|
84
|
+
|
|
85
|
+
# Split long paragraph by sentences
|
|
86
|
+
for sentence_chunk in _split_long_text(paragraph, max_tokens, tokenizer_fn):
|
|
87
|
+
result.append(sentence_chunk)
|
|
88
|
+
continue
|
|
89
|
+
|
|
90
|
+
if current_tokens + para_tokens > max_tokens and current_parts:
|
|
91
|
+
result.append("\n".join(current_parts))
|
|
92
|
+
current_parts = []
|
|
93
|
+
current_tokens = 0
|
|
94
|
+
|
|
95
|
+
current_parts.append(paragraph)
|
|
96
|
+
current_tokens += para_tokens
|
|
97
|
+
|
|
98
|
+
if current_parts:
|
|
99
|
+
result.append("\n".join(current_parts))
|
|
100
|
+
|
|
101
|
+
return result
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _split_long_text(text: str, max_tokens: int, tokenizer_fn: callable) -> list[str]:
|
|
105
|
+
"""Split text that exceeds max_tokens by sentence boundaries, falling back to words."""
|
|
106
|
+
# Try splitting by sentences (period followed by space)
|
|
107
|
+
sentences = text.replace(". ", ".\n").split("\n")
|
|
108
|
+
|
|
109
|
+
result: list[str] = []
|
|
110
|
+
current_parts: list[str] = []
|
|
111
|
+
current_tokens = 0
|
|
112
|
+
|
|
113
|
+
for sentence in sentences:
|
|
114
|
+
sentence = sentence.strip()
|
|
115
|
+
if not sentence:
|
|
116
|
+
continue
|
|
117
|
+
|
|
118
|
+
sent_tokens = tokenizer_fn(sentence)
|
|
119
|
+
|
|
120
|
+
if current_tokens + sent_tokens > max_tokens and current_parts:
|
|
121
|
+
result.append(" ".join(current_parts))
|
|
122
|
+
current_parts = []
|
|
123
|
+
current_tokens = 0
|
|
124
|
+
|
|
125
|
+
current_parts.append(sentence)
|
|
126
|
+
current_tokens += sent_tokens
|
|
127
|
+
|
|
128
|
+
if current_parts:
|
|
129
|
+
result.append(" ".join(current_parts))
|
|
130
|
+
|
|
131
|
+
return result
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _approximate_token_count(text: str) -> int:
|
|
135
|
+
"""Approximate token count using word count.
|
|
136
|
+
|
|
137
|
+
Roughly 1 token ~ 0.75 words for English text.
|
|
138
|
+
This is used as fallback when no model tokenizer is available.
|
|
139
|
+
"""
|
|
140
|
+
words = len(text.split())
|
|
141
|
+
return int(words / 0.75)
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
from typing import Callable
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Embedder:
    """Generates text embeddings using a sentence-transformers model.

    Loads the model once at initialization and reuses it for all calls.
    Default model is EmbeddingGemma-300M (768 dimensions).
    Falls back to all-MiniLM-L6-v2 (384 dimensions) if the primary model fails to load.
    """

    def __init__(self, model_name: str = "google/embeddinggemma-300m", hf_token: str = "") -> None:
        # Imported lazily so merely importing this module stays cheap when
        # embeddings are never used.
        from sentence_transformers import SentenceTransformer

        # Use provided token, fall back to environment variable
        if not hf_token:
            hf_token = os.environ.get("HF_TOKEN", "")

        try:
            logger.info("Loading embedding model: %s", model_name)
            self._model = SentenceTransformer(model_name, token=hf_token or None)
            self.model_name = model_name
            # NOTE(review): sentence-transformers' documented accessor is
            # get_sentence_embedding_dimension(); confirm get_embedding_dimension()
            # exists on the pinned version, otherwise this always raises and
            # silently forces the fallback model.
            self.dimensions = self._model.get_embedding_dimension()
            logger.info("Model loaded: %s (%d dimensions)", self.model_name, self.dimensions)
        except Exception:
            fallback = "sentence-transformers/all-MiniLM-L6-v2"
            logger.warning(
                "Failed to load %s, falling back to %s",
                model_name,
                fallback,
                exc_info=True,
            )
            try:
                self._model = SentenceTransformer(fallback)
                self.model_name = fallback
                self.dimensions = self._model.get_embedding_dimension()
                logger.info(
                    "Fallback model loaded: %s (%d dimensions)",
                    self.model_name,
                    self.dimensions,
                )
            except Exception:
                logger.error("Failed to load fallback model %s", fallback, exc_info=True)
                raise RuntimeError(
                    f"No embedding model available. "
                    f"Primary ({model_name}) and fallback ({fallback}) both failed."
                )

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for a list of texts.

        Returns a list of float vectors, one per input text.
        Vectors are L2-normalized (normalize_embeddings=True below).
        """
        if not texts:
            return []

        embeddings = self._model.encode(texts, show_progress_bar=False, normalize_embeddings=True)
        return embeddings.tolist()

    def embed_query(self, query: str) -> list[float]:
        """Generate embedding for a single search query."""
        result = self.embed([query])
        return result[0]

    def get_tokenizer_fn(self) -> Callable[[str], int]:
        """Return a token-counting function using this model's tokenizer."""
        tokenizer = self._model.tokenizer

        def count_tokens(text: str) -> int:
            # Special tokens excluded: we measure content length only.
            return len(tokenizer.encode(text, add_special_tokens=False))

        return count_tokens
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Confluence storage-format HTML parser — yields Section objects."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
from bs4 import BeautifulSoup, Tag
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class Section:
    """A contiguous run of document text under one heading."""

    title: str  # heading text ("" for content before the first heading)
    text: str  # plain-text body of the section
    level: int = 0  # heading level 1-6; 0 when untitled
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def parse_confluence_html(html: str) -> list[Section]:
    """Parse Confluence storage-format HTML into a list of text sections.

    Handles Confluence-specific elements:
    - Headings become section boundaries
    - Tables are converted to readable text
    - Custom macros (smartlinks, status, emoji) are handled
    - Empty sections are dropped
    """
    soup = BeautifulSoup(html, "html.parser")
    _clean_confluence_macros(soup)

    sections: list[Section] = []
    title = ""
    level = 0
    buffer: list[str] = []

    def flush() -> None:
        # Emit accumulated text as a Section; sections with no body are dropped.
        body = "\n".join(buffer).strip()
        if body:
            sections.append(Section(title=title, text=body, level=level))
        buffer.clear()

    for node in soup.children:
        # Bare text nodes between tags still belong to the current section.
        if not isinstance(node, Tag):
            stray = node.get_text(strip=True)
            if stray:
                buffer.append(stray)
            continue

        if node.name in ("h1", "h2", "h3", "h4", "h5", "h6"):
            flush()  # heading closes the previous section
            title = node.get_text(strip=True)
            level = int(node.name[1])
        elif node.name == "table":
            buffer.append(_table_to_text(node))
        else:
            body_text = node.get_text(separator=" ", strip=True)
            if body_text:
                buffer.append(body_text)

    flush()  # emit whatever trails the final heading
    return sections
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _clean_confluence_macros(soup: BeautifulSoup) -> None:
    """Normalize Confluence custom elements in-place before section parsing."""
    for node in soup.find_all("custom"):
        kind = node.get("data-type", "")
        plain = node.get_text(strip=True)

        if kind == "emoji":
            # Emojis carry no searchable content — drop them.
            node.decompose()
        elif kind == "status":
            # Status badges become bracketed text, e.g. [DONE].
            node.replace_with(f"[{plain}]")
        else:
            # Smartlinks and unknown kinds both collapse to their text content
            # (a smartlink's text is its URL).
            node.replace_with(plain)

    # Also handle ac:structured-macro, ac:rich-text-body etc. (Confluence Server format)
    for macro in soup.find_all("ac:structured-macro"):
        body = macro.find("ac:rich-text-body")
        if body:
            macro.replace_with(body.get_text(separator=" ", strip=True))
        else:
            macro.decompose()
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _table_to_text(table: Tag) -> str:
    """Convert an HTML table to readable plain text.

    For tables with headers, produces "header: value" pairs per row.
    For tables without headers, produces pipe-separated rows.
    """
    rows = table.find_all("tr")
    if not rows:
        return ""

    # Header detection: <th> cells in the first row.
    header_cells = rows[0].find_all(["th"])
    headers = [cell.get_text(separator=" ", strip=True) for cell in header_cells]
    body_rows = rows[1:] if header_cells else rows

    rendered: list[str] = []
    for row in body_rows:
        cells = row.find_all(["td", "th"])
        values = [cell.get_text(separator=" ", strip=True) for cell in cells]

        if headers and len(values) == len(headers):
            # Pair each value with its column header, dropping empty values.
            pairs = [f"{h}: {v}" for h, v in zip(headers, values) if v]
            if pairs:
                rendered.append(" | ".join(pairs))
        elif values:
            # Headerless or ragged row: just join the non-empty cells.
            non_empty = [v for v in values if v]
            if non_empty:
                rendered.append(" | ".join(non_empty))

    return "\n".join(rendered)
|
docforge/query_log.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Async helper for inserting rows into query_log.
|
|
2
|
+
|
|
3
|
+
Failures are logged and swallowed — query logging must never break a search.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
|
|
10
|
+
import asyncpg
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
async def log_query(
    pool: asyncpg.Pool,
    user_name: str,
    team_name: str,
    area_name: str | None,
    query: str,
    result_count: int,
    user_oid: str | None = None,
    request_ms: int | None = None,
) -> None:
    """Record a search request. user_oid is the Entra object ID (post-auth)
    or None (pre-auth rows). request_ms is the handler's wall-clock time in
    milliseconds (post-C4.3) or None (pre-C4.3 rows). Never raises."""
    try:
        async with pool.acquire() as conn:
            await conn.execute(
                """
                INSERT INTO query_log
                    (user_name, team_name, area_name, query, result_count, user_oid, request_ms)
                VALUES ($1, $2, $3, $4, $5, $6, $7)
                """,
                user_name,
                team_name,
                area_name,
                query,
                result_count,
                user_oid,
                request_ms,
            )
    except Exception as e:
        # Deliberate broad catch: telemetry must never break a search, so any
        # insert failure is downgraded to a warning and swallowed.
        logger.warning("query_log insert failed: %s", e)
|