iil-docs-agent 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs_agent/__init__.py +7 -0
- docs_agent/analyzer/__init__.py +1 -0
- docs_agent/analyzer/ast_scanner.py +231 -0
- docs_agent/analyzer/diataxis_classifier.py +206 -0
- docs_agent/analyzer/llm_classifier.py +149 -0
- docs_agent/cli.py +454 -0
- docs_agent/generator/__init__.py +1 -0
- docs_agent/generator/code_inserter.py +230 -0
- docs_agent/generator/docstring_gen.py +216 -0
- docs_agent/llm_client.py +181 -0
- docs_agent/models.py +99 -0
- docs_agent/prompts.py +77 -0
- iil_docs_agent-0.2.0.dist-info/METADATA +214 -0
- iil_docs_agent-0.2.0.dist-info/RECORD +16 -0
- iil_docs_agent-0.2.0.dist-info/WHEEL +4 -0
- iil_docs_agent-0.2.0.dist-info/entry_points.txt +2 -0
docs_agent/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Analyzer components — AST scanner and DIATAXIS classifier."""
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""AST-based docstring coverage scanner.
|
|
2
|
+
|
|
3
|
+
Scans Python modules via the `ast` module and reports undocumented items:
|
|
4
|
+
classes, functions, methods. No LLM required — pure static analysis.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import ast
|
|
10
|
+
import logging
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from docs_agent.models import CodeItem, ItemKind, ModuleCoverage, RepoCoverage
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
SKIP_DIRS: set[str] = {
|
|
18
|
+
"__pycache__",
|
|
19
|
+
".git",
|
|
20
|
+
".venv",
|
|
21
|
+
"venv",
|
|
22
|
+
"node_modules",
|
|
23
|
+
"migrations",
|
|
24
|
+
"_build",
|
|
25
|
+
"_archive",
|
|
26
|
+
"static",
|
|
27
|
+
"staticfiles",
|
|
28
|
+
".tox",
|
|
29
|
+
".mypy_cache",
|
|
30
|
+
".pytest_cache",
|
|
31
|
+
"dist",
|
|
32
|
+
"build",
|
|
33
|
+
"eggs",
|
|
34
|
+
"*.egg-info",
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
SKIP_FILES: set[str] = {
|
|
38
|
+
"__init__.py",
|
|
39
|
+
"conftest.py",
|
|
40
|
+
"manage.py",
|
|
41
|
+
"wsgi.py",
|
|
42
|
+
"asgi.py",
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _should_skip_dir(dirname: str) -> bool:
    """Return True if a directory should be excluded from the scan.

    A directory is skipped when it is listed in SKIP_DIRS, is hidden
    (dot-prefixed), or is a setuptools ``*.egg-info`` metadata directory.

    Args:
        dirname: A single path component (directory name), not a full path.

    Returns:
        True if the directory should not be scanned.
    """
    # Fix: SKIP_DIRS contains the glob "*.egg-info", which exact set
    # membership can never match; handle egg-info directories explicitly.
    return (
        dirname in SKIP_DIRS
        or dirname.startswith(".")
        or dirname.endswith(".egg-info")
    )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _should_skip_file(filename: str) -> bool:
    """Return True if a file should be excluded from coverage scanning.

    Skips files listed in SKIP_FILES as well as pytest-style test
    modules (``test_*``).
    """
    if filename.startswith("test_"):
        return True
    return filename in SKIP_FILES
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _has_docstring(node: ast.AST) -> bool:
    """Return True if a node's body opens with a string-literal docstring."""
    body = getattr(node, "body", None)
    if not body:
        return False
    head = body[0]
    return (
        isinstance(head, ast.Expr)
        and isinstance(head.value, ast.Constant)
        and isinstance(head.value.value, str)
    )
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _is_private(name: str) -> bool:
    """Return True if *name* is a private (underscore-prefixed) identifier.

    Covers single-underscore names (``_helper``) and name-mangled
    double-underscore names (``__secret``), but not dunders (``__init__``).

    Args:
        name: An identifier, e.g. a function or method name.

    Returns:
        True if the name is conventionally private.
    """
    if not name.startswith("_"):
        return False
    # Fix: the old check (`not name.startswith("__")`) treated name-mangled
    # identifiers like `__secret` as public; only true dunders are non-private.
    return not (name.startswith("__") and name.endswith("__"))
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _is_dunder(name: str) -> bool:
    """Return True if *name* follows the dunder convention (``__x__``)."""
    return name.endswith("__") and name.startswith("__")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# Dunder methods whose contract is defined by the data model and therefore
# rarely needs a docstring; scan_module excludes these from the documentable
# item count (dunders NOT listed here still count).
SKIP_DUNDERS: set[str] = {
    "__init__",
    "__str__",
    "__repr__",
    "__eq__",
    "__hash__",
    "__lt__",
    "__le__",
    "__gt__",
    "__ge__",
    "__len__",
    "__bool__",
    "__contains__",
    "__iter__",
    "__next__",
    "__enter__",
    "__exit__",
    "__call__",
    "__getattr__",
    "__setattr__",
    "__delattr__",
    "__getitem__",
    "__setitem__",
    "__delitem__",
}
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def scan_module(file_path: Path) -> ModuleCoverage:
    """Scan a single Python module for docstring coverage.

    Args:
        file_path: Path to the Python file.

    Returns:
        ModuleCoverage with all documentable items and their status.
        On read or parse failure an empty coverage is returned and a
        warning is logged.
    """
    coverage = ModuleCoverage(file_path=file_path)

    try:
        source = file_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        logger.warning("Cannot read %s: %s", file_path, exc)
        return coverage

    try:
        tree = ast.parse(source, filename=str(file_path))
    except SyntaxError as exc:
        logger.warning("Syntax error in %s: %s", file_path, exc)
        return coverage

    def _record(name: str, kind: ItemKind, line: int, has_doc: bool) -> None:
        # Append one documentable item and update the aggregate counters.
        coverage.items.append(
            CodeItem(
                name=name,
                kind=kind,
                line=line,
                has_docstring=has_doc,
                file_path=file_path,
            )
        )
        coverage.total_items += 1
        coverage.documented_items += int(has_doc)

    # The module-level docstring counts as one documentable item.
    _record(file_path.stem, ItemKind.MODULE, 1, _has_docstring(tree))

    # Fix: precompute the identities of direct class-body children once.
    # The old code re-walked the whole tree for every function to decide
    # method vs. function, which was O(n^2) in module size.
    method_ids: set[int] = {
        id(child)
        for cls in ast.walk(tree)
        if isinstance(cls, ast.ClassDef)
        for child in cls.body
    }

    for node in ast.walk(tree):
        if isinstance(node, ast.ClassDef):
            _record(node.name, ItemKind.CLASS, node.lineno, _has_docstring(node))
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            name = node.name
            # Well-known dunders and private helpers are not documentable.
            if _is_dunder(name) and name in SKIP_DUNDERS:
                continue
            if _is_private(name):
                continue
            kind = ItemKind.METHOD if id(node) in method_ids else ItemKind.FUNCTION
            _record(name, kind, node.lineno, _has_docstring(node))

    return coverage
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def scan_repo(
    repo_path: Path,
    *,
    apps_only: bool = False,
) -> RepoCoverage:
    """Scan an entire repository for docstring coverage.

    Args:
        repo_path: Root path of the repository.
        apps_only: If True, only scan the `apps/` (or `src/apps/`) subtree.

    Returns:
        RepoCoverage with per-module and aggregate statistics.
    """
    root = repo_path.resolve()
    result = RepoCoverage(repo_path=root)

    # Narrow the search to an apps directory when requested and present;
    # `apps/` wins over `src/apps/` when both exist.
    search_root = root
    if apps_only:
        for candidate in (root / "apps", root / "src" / "apps"):
            if candidate.is_dir():
                search_root = candidate
                break

    for py_file in sorted(search_root.rglob("*.py")):
        rel_parts = py_file.relative_to(root).parts
        # Drop files under excluded directories and excluded file names.
        if any(_should_skip_dir(part) for part in rel_parts):
            continue
        if _should_skip_file(py_file.name):
            continue

        module_cov = scan_module(py_file)
        # Modules with nothing documentable are omitted from the report.
        if module_cov.total_items > 0:
            result.modules.append(module_cov)

    return result
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
"""DIATAXIS heuristic classifier for documentation files.
|
|
2
|
+
|
|
3
|
+
Classifies Markdown/RST files into DIATAXIS quadrants using
|
|
4
|
+
trigger-word pattern matching. No LLM required for the heuristic pass.
|
|
5
|
+
|
|
6
|
+
LLM refinement (confidence < 0.7) is deferred to Phase 4.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import re
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from docs_agent.models import DiaxisClassification, DiaxisQuadrant
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
TRIGGER_PATTERNS: dict[DiaxisQuadrant, list[str]] = {
|
|
20
|
+
DiaxisQuadrant.TUTORIAL: [
|
|
21
|
+
r"step\s+\d",
|
|
22
|
+
r"getting\s+started",
|
|
23
|
+
r"\blearn\b",
|
|
24
|
+
r"we\s+will",
|
|
25
|
+
r"in\s+this\s+tutorial",
|
|
26
|
+
r"let[\u2018\u2019']s\s+",
|
|
27
|
+
r"follow\s+along",
|
|
28
|
+
r"by\s+the\s+end\b",
|
|
29
|
+
r"prerequisites",
|
|
30
|
+
],
|
|
31
|
+
DiaxisQuadrant.GUIDE: [
|
|
32
|
+
r"how\s+to\b",
|
|
33
|
+
r"\bconfigure\b",
|
|
34
|
+
r"\bdeploy\b",
|
|
35
|
+
r"\bfix\b",
|
|
36
|
+
r"troubleshoot",
|
|
37
|
+
r"set\s+up\b",
|
|
38
|
+
r"install\b",
|
|
39
|
+
r"upgrade\b",
|
|
40
|
+
r"migrat",
|
|
41
|
+
r"recipe",
|
|
42
|
+
],
|
|
43
|
+
DiaxisQuadrant.REFERENCE: [
|
|
44
|
+
r"\bAPI\b",
|
|
45
|
+
r"\bparameter[s]?\b",
|
|
46
|
+
r"\breturn[s]?\b",
|
|
47
|
+
r"type\s*:",
|
|
48
|
+
r"automodule",
|
|
49
|
+
r"endpoint[s]?\b",
|
|
50
|
+
r"\bschema\b",
|
|
51
|
+
r"class\s+\w+",
|
|
52
|
+
r"field[s]?\b",
|
|
53
|
+
r"\bargs\b",
|
|
54
|
+
r"\bkwargs\b",
|
|
55
|
+
],
|
|
56
|
+
DiaxisQuadrant.EXPLANATION: [
|
|
57
|
+
r"\bwhy\b",
|
|
58
|
+
r"architecture",
|
|
59
|
+
r"\bdesign\b",
|
|
60
|
+
r"rationale",
|
|
61
|
+
r"background",
|
|
62
|
+
r"concept[s]?\b",
|
|
63
|
+
r"overview",
|
|
64
|
+
r"philosophy",
|
|
65
|
+
r"trade-?off",
|
|
66
|
+
r"decision",
|
|
67
|
+
],
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
DOC_EXTENSIONS: set[str] = {".md", ".rst", ".txt"}
|
|
71
|
+
|
|
72
|
+
SKIP_DIRS: set[str] = {
|
|
73
|
+
"_archive",
|
|
74
|
+
"_build",
|
|
75
|
+
"source",
|
|
76
|
+
"node_modules",
|
|
77
|
+
".git",
|
|
78
|
+
".venv",
|
|
79
|
+
"venv",
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _count_triggers(
    text: str,
    patterns: list[str],
) -> tuple[int, list[str]]:
    """Count trigger pattern matches in text.

    Args:
        text: Document text to scan.
        patterns: Regex patterns, tried case-insensitively.

    Returns:
        Tuple of (total match count, patterns that matched at least once).
    """
    hits_per_pattern = [
        (pattern, len(re.findall(pattern, text, re.IGNORECASE)))
        for pattern in patterns
    ]
    matched = [pattern for pattern, hits in hits_per_pattern if hits > 0]
    total = sum(hits for _, hits in hits_per_pattern)
    return total, matched
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def classify_file(file_path: Path) -> DiaxisClassification:
    """Classify a single document file into a DIATAXIS quadrant.

    Args:
        file_path: Path to the document file.

    Returns:
        DiaxisClassification with quadrant, confidence, and triggers.
        Unreadable files and files with no trigger hits classify as
        UNKNOWN with confidence 0.0.
    """
    try:
        text = file_path.read_text(encoding="utf-8", errors="replace")
    except OSError as exc:
        logger.warning("Cannot read %s: %s", file_path, exc)
        return DiaxisClassification(
            file_path=file_path,
            quadrant=DiaxisQuadrant.UNKNOWN,
            confidence=0.0,
        )

    scores: dict[DiaxisQuadrant, int] = {}
    trigger_hits: dict[DiaxisQuadrant, list[str]] = {}
    for quadrant, patterns in TRIGGER_PATTERNS.items():
        scores[quadrant], trigger_hits[quadrant] = _count_triggers(text, patterns)

    total_hits = sum(scores.values())
    if total_hits == 0:
        return DiaxisClassification(
            file_path=file_path,
            quadrant=DiaxisQuadrant.UNKNOWN,
            confidence=0.0,
        )

    # Ties resolve to the earliest quadrant in TRIGGER_PATTERNS order.
    winner = max(scores, key=scores.get)
    confidence = scores[winner] / total_hits

    # A path segment hinting at the winning quadrant boosts confidence;
    # only the first matching hint is considered, match or not.
    path_lower = str(file_path).lower()
    path_hints: dict[str, DiaxisQuadrant] = {
        "tutorial": DiaxisQuadrant.TUTORIAL,
        "getting-started": DiaxisQuadrant.TUTORIAL,
        "guide": DiaxisQuadrant.GUIDE,
        "howto": DiaxisQuadrant.GUIDE,
        "how-to": DiaxisQuadrant.GUIDE,
        "reference": DiaxisQuadrant.REFERENCE,
        "api": DiaxisQuadrant.REFERENCE,
        "explanation": DiaxisQuadrant.EXPLANATION,
        "adr": DiaxisQuadrant.EXPLANATION,
        "architecture": DiaxisQuadrant.EXPLANATION,
    }
    for hint, hint_quadrant in path_hints.items():
        if hint in path_lower:
            if hint_quadrant == winner:
                confidence = min(confidence + 0.15, 1.0)
            break

    return DiaxisClassification(
        file_path=file_path,
        quadrant=winner,
        confidence=round(confidence, 3),
        triggers=trigger_hits[winner],
    )
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def classify_repo(
    repo_path: Path,
) -> list[DiaxisClassification]:
    """Classify all documentation files under ``docs/`` in a repository.

    Args:
        repo_path: Root path of the repository.

    Returns:
        List of DiaxisClassification results; empty when there is no
        docs/ directory.
    """
    root = repo_path.resolve()
    results: list[DiaxisClassification] = []

    docs_dir = root / "docs"
    if not docs_dir.is_dir():
        logger.info("No docs/ directory in %s", root)
        return results

    for doc_file in sorted(docs_dir.rglob("*")):
        # Only regular files with a documentation extension qualify.
        if not doc_file.is_file() or doc_file.suffix not in DOC_EXTENSIONS:
            continue
        # Drop anything under an excluded directory.
        rel_parts = doc_file.relative_to(root).parts
        if any(part in SKIP_DIRS for part in rel_parts):
            continue

        results.append(classify_file(doc_file))

    return results
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""LLM-based DIATAXIS classifier for low-confidence documents.
|
|
2
|
+
|
|
3
|
+
Re-classifies documents where the heuristic classifier has
|
|
4
|
+
confidence < threshold, using the llm_mcp HTTP gateway or
|
|
5
|
+
direct OpenAI API.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
from docs_agent.llm_client import LLMConfig, generate
|
|
16
|
+
from docs_agent.models import DiaxisClassification, DiaxisQuadrant
|
|
17
|
+
from docs_agent.prompts import PROMPT_DIATAXIS_CLASSIFY, SYSTEM_DIATAXIS
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
DEFAULT_THRESHOLD = 0.7
|
|
22
|
+
PREVIEW_CHARS = 500
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
async def reclassify_low_confidence(
    classifications: list[DiaxisClassification],
    *,
    threshold: float = DEFAULT_THRESHOLD,
    config: Optional[LLMConfig] = None,
) -> list[DiaxisClassification]:
    """Re-classify documents with confidence below threshold via LLM.

    Args:
        classifications: Heuristic classification results.
        threshold: Confidence threshold below which the LLM is consulted.
        config: LLM configuration.

    Returns:
        Updated list where low-confidence entries are replaced by the
        LLM result when available; entries at or above the threshold
        (and entries whose LLM call fails) keep the heuristic result.
    """
    refined_list: list[DiaxisClassification] = []

    for heuristic in classifications:
        if heuristic.confidence < threshold:
            llm_result = await _classify_with_llm(
                heuristic.file_path, config=config
            )
            # Fall back to the heuristic result on LLM failure.
            refined_list.append(heuristic if llm_result is None else llm_result)
        else:
            refined_list.append(heuristic)

    return refined_list
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
async def _classify_with_llm(
    file_path: Path,
    *,
    config: Optional[LLMConfig] = None,
) -> Optional[DiaxisClassification]:
    """Classify a single document via the LLM.

    Args:
        file_path: Path to the document.
        config: LLM configuration.

    Returns:
        DiaxisClassification, or None if the file cannot be read, the
        LLM call fails, or the response cannot be parsed.
    """
    try:
        text = file_path.read_text(encoding="utf-8", errors="replace")
    except OSError as exc:
        logger.warning("Cannot read %s: %s", file_path, exc)
        return None

    # Derive a human-readable title from the file name; send only a short
    # preview to keep the prompt size bounded.
    prompt = PROMPT_DIATAXIS_CLASSIFY.format(
        title=file_path.stem.replace("-", " ").replace("_", " "),
        preview=text[:PREVIEW_CHARS],
    )

    response = await generate(
        prompt,
        system_prompt=SYSTEM_DIATAXIS,
        config=config,
    )

    if response.success:
        return _parse_classification_response(response.content, file_path)

    logger.warning(
        "LLM classification failed for %s: %s",
        file_path, response.error,
    )
    return None
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _parse_classification_response(
    content: str | dict | list | None,
    file_path: Path,
) -> Optional[DiaxisClassification]:
    """Parse an LLM classification response into a DiaxisClassification.

    Args:
        content: LLM response content — a JSON string or an already
            decoded object. Expected shape:
            ``{"quadrant": str, "confidence": float, "reasoning": str}``.
        file_path: Path to the classified document.

    Returns:
        DiaxisClassification, or None when the content is not a JSON
        object or the quadrant is missing/unknown.
    """
    if isinstance(content, str):
        try:
            content = json.loads(content)
        except json.JSONDecodeError:
            logger.warning("Failed to parse JSON from LLM response")
            return None

    if not isinstance(content, dict):
        return None

    # Fix: the quadrant value is untrusted LLM output and may not be a
    # string (null, number) — the old code crashed on `.lower()` here.
    raw_quadrant = content.get("quadrant", "")
    quadrant_str = (
        raw_quadrant.lower().strip() if isinstance(raw_quadrant, str) else ""
    )

    quadrant_map = {
        "tutorial": DiaxisQuadrant.TUTORIAL,
        "guide": DiaxisQuadrant.GUIDE,
        "reference": DiaxisQuadrant.REFERENCE,
        "explanation": DiaxisQuadrant.EXPLANATION,
    }

    quadrant = quadrant_map.get(quadrant_str)
    if quadrant is None:
        logger.warning(
            "Unknown quadrant from LLM: %s", quadrant_str
        )
        return None

    # Fix: float() raised TypeError/ValueError on junk confidence values
    # (e.g. "high", null); fall back to 0.5 and clamp into [0, 1].
    try:
        confidence = float(content.get("confidence", 0.5))
    except (TypeError, ValueError):
        confidence = 0.5
    confidence = min(max(confidence, 0.0), 1.0)

    return DiaxisClassification(
        file_path=file_path,
        quadrant=quadrant,
        confidence=round(confidence, 3),
        triggers=[f"llm: {content.get('reasoning', '')}"],
    )
|