code-finder 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_context/__init__.py +33 -0
- claude_context/agentic_integration.py +309 -0
- claude_context/ast_chunker.py +646 -0
- claude_context/config.py +239 -0
- claude_context/context_manager.py +627 -0
- claude_context/embeddings.py +307 -0
- claude_context/embeddings_interface.py +226 -0
- claude_context/enhanced_ast_chunker.py +1129 -0
- claude_context/explorer.py +951 -0
- claude_context/explorer_with_context.py +1008 -0
- claude_context/indexer.py +893 -0
- claude_context/markdown_chunker.py +421 -0
- claude_context/mode_handler.py +1774 -0
- claude_context/query_metrics.py +164 -0
- claude_context/question_generator.py +800 -0
- claude_context/readme_extractor.py +485 -0
- claude_context/repository_adapter.py +399 -0
- claude_context/search.py +493 -0
- claude_context/skills/__init__.py +11 -0
- claude_context/skills/_cli_common.py +74 -0
- claude_context/skills/_index_manager.py +98 -0
- claude_context/skills/api_surface.py +219 -0
- claude_context/skills/evidence_retrieval.py +151 -0
- claude_context/skills/grounded_review.py +212 -0
- claude_context/synthesis/__init__.py +8 -0
- claude_context/synthesis/editor_agent.py +391 -0
- claude_context/synthesis/llm_synthesizer.py +153 -0
- claude_context/synthesis/logic_explainer.py +235 -0
- claude_context/synthesis/multi_review_pipeline.py +717 -0
- claude_context/synthesis/prompt_builder.py +439 -0
- claude_context/synthesis/providers.py +115 -0
- claude_context/synthesis/validators.py +458 -0
- code_finder-0.1.0.dist-info/METADATA +823 -0
- code_finder-0.1.0.dist-info/RECORD +37 -0
- code_finder-0.1.0.dist-info/WHEEL +5 -0
- code_finder-0.1.0.dist-info/entry_points.txt +4 -0
- code_finder-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
"""
|
|
2
|
+
API Surface Extraction skill.
|
|
3
|
+
|
|
4
|
+
Extracts the public API surface from source code files or directories
|
|
5
|
+
using AST parsing. Deterministic — no LLM, no indexing, no embeddings.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
python -m claude_context.skills.api_surface --target src/auth/
|
|
9
|
+
python -m claude_context.skills.api_surface --target src/auth/handler.py --include-private
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import logging
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any, Dict, List, Optional
|
|
16
|
+
|
|
17
|
+
from ..enhanced_ast_chunker import EnhancedASTChunker, EnhancedCodeChunk
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
# Types that represent API surface entities
|
|
22
|
+
API_ENTITY_TYPES = {"function", "class", "method", "interface", "type"}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def extract_api_surface(
    target_path: str,
    languages: Optional[List[str]] = None,
    include_private: bool = False,
    include_docstrings: bool = True,
    max_chunk_size: int = 1500,
) -> Dict[str, Any]:
    """Extract the public API surface from source files.

    Args:
        target_path: Path to a file or directory.
        languages: Optional language filter (e.g., ["python", "typescript"]).
            Matching is case-insensitive and ignores surrounding whitespace
            in the filter values. ``None`` or an empty list disables it.
        include_private: Keep entities whose name starts with "_".
        include_docstrings: Include docstrings in the output.
        max_chunk_size: Max chunk size handed to the AST chunker.

    Returns:
        Dict with the resolved target path, the number of files processed,
        the number of files that contributed entities, the total entity
        count, and the structured API surface keyed by file path.

    Raises:
        FileNotFoundError: If ``target_path`` does not exist.
    """
    target = Path(target_path).resolve()
    if not target.exists():
        raise FileNotFoundError(f"Target path does not exist: {target}")

    chunker = EnhancedASTChunker(max_chunk_size=max_chunk_size, context_mode="full")

    # Collect files to process
    if target.is_file():
        files = [target]
    else:
        files = sorted(_discover_source_files(target))

    # Normalise the language filter once instead of rebuilding it for
    # every entity inside the loop; a set also gives O(1) membership.
    wanted_languages = (
        {lang.strip().lower() for lang in languages} if languages else None
    )

    api_surface: Dict[str, Any] = {}
    total_entities = 0

    for file_path in files:
        chunks = chunker.chunk_file(file_path)
        if not chunks:
            continue

        # Filter to API entities
        entities = [c for c in chunks if c.chunk_type in API_ENTITY_TYPES]

        # Filter by language
        if wanted_languages is not None:
            entities = [e for e in entities if e.language.lower() in wanted_languages]

        # Filter private
        if not include_private:
            entities = [e for e in entities if not (e.name and e.name.startswith("_"))]

        if not entities:
            continue

        # ``entities`` is non-empty here, so the first element is always
        # available to supply the per-file language label.
        api_surface[str(file_path)] = {
            "language": entities[0].language,
            "entities": _structure_entities(entities, include_docstrings),
        }
        total_entities += len(entities)

    return {
        "target_path": str(target),
        "files_processed": len(files),
        "files_with_api": len(api_surface),
        "total_entities": total_entities,
        "api_surface": api_surface,
    }
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _discover_source_files(directory: Path) -> List[Path]:
    """Discover source files in a directory, skipping common non-source dirs.

    Args:
        directory: Root directory to walk recursively.

    Returns:
        Paths of files below ``directory`` whose suffix is a recognised
        source extension and which are not located inside a skipped
        directory (VCS metadata, caches, virtualenvs, vendored modules).
    """
    skip_dirs = {".git", "__pycache__", "node_modules", ".venv", "venv", ".vibe2doc"}
    extensions = {
        ".py", ".js", ".ts", ".tsx", ".jsx", ".go", ".rs",
        ".java", ".kt", ".cpp", ".c", ".h", ".hpp",
    }
    files = []
    for path in directory.rglob("*"):
        # Only look at components *below* the requested directory. Testing
        # the full path would discard every file whenever the target itself
        # lives under a directory with a skipped name (e.g. ~/venv/project).
        if any(part in skip_dirs for part in path.relative_to(directory).parts):
            continue
        if path.is_file() and path.suffix in extensions:
            files.append(path)
    return files
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _structure_entities(
    entities: List[EnhancedCodeChunk],
    include_docstrings: bool,
) -> List[Dict[str, Any]]:
    """Structure entities, nesting methods under their parent classes.

    Args:
        entities: API entities of a single file, in chunker order.
        include_docstrings: Forwarded to the per-entity conversion.

    Returns:
        Class entries first (each with a "methods" list), followed by the
        remaining top-level entries.
    """
    # Every class entry is kept for output, in encounter order. A separate
    # name -> entry map is used only to find the parent of a scoped entity;
    # keeping the two apart means a second class with the same name no
    # longer evicts the first one (and its methods) from the result.
    class_entries: List[Dict[str, Any]] = []
    latest_class_by_name: Dict[str, Dict[str, Any]] = {}
    top_level: List[Dict[str, Any]] = []

    for entity in entities:
        entry = _entity_to_dict(entity, include_docstrings)

        if entity.chunk_type == "class":
            entry["methods"] = []
            class_entries.append(entry)
            latest_class_by_name[entity.name] = entry
            continue

        # The chunker may report chunk_type="function" for methods, so the
        # presence of a scope (not chunk_type) is the nesting signal.
        parent = latest_class_by_name.get(entity.scope[0]) if entity.scope else None
        if parent is not None:
            parent["methods"].append(entry)
        else:
            top_level.append(entry)

    # Classes first, then top-level functions
    return class_entries + top_level
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _entity_to_dict(entity: EnhancedCodeChunk, include_docstrings: bool) -> Dict[str, Any]:
    """Convert a single entity to output dict.

    Args:
        entity: Chunk to convert; its chunk_type, name, signature,
            line_range, parameters, return_type, docstring and scope
            attributes are read.
        include_docstrings: When False the "docstring" key is never emitted.

    Returns:
        Dict that always has "type", "name", "signature" and "line_range",
        plus "parameters", "return_type", "docstring" and "scope" when the
        corresponding attribute is truthy.
    """
    d: Dict[str, Any] = {
        "type": entity.chunk_type,
        "name": entity.name,
        "signature": entity.signature,
        "line_range": list(entity.line_range),
    }

    if entity.parameters:
        # The receiver parameter "self" is omitted from the reported list.
        d["parameters"] = [
            {
                "name": p.name,
                "type": p.type_annotation,
                "default": p.default_value,
            }
            for p in entity.parameters
            if p.name != "self"
        ]

    if entity.return_type:
        d["return_type"] = entity.return_type

    if include_docstrings and entity.docstring:
        d["docstring"] = entity.docstring

    if entity.scope:
        # Copy, as is done for line_range, so the output does not alias the
        # chunk's own sequence and later edits to one cannot leak into the other.
        d["scope"] = list(entity.scope)

    return d
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def main():
    """CLI entry point.

    Parses the command line, runs ``extract_api_surface`` and hands the
    result to ``output_result``; any exception is reported through
    ``error_result`` instead of propagating.
    """
    from ._cli_common import setup_logging, add_common_args, output_result, error_result
    setup_logging()

    parser = argparse.ArgumentParser(
        description="Extract API surface from source code using AST parsing"
    )
    parser.add_argument(
        "--target", required=True,
        help="File or directory to analyze"
    )
    parser.add_argument(
        "--languages", type=str, default=None,
        help="Comma-separated language filter (e.g., python,typescript)"
    )
    parser.add_argument(
        "--include-private", action="store_true",
        help="Include _prefixed private names"
    )
    parser.add_argument(
        "--no-docstrings", action="store_true",
        help="Exclude docstrings from output"
    )
    add_common_args(parser)

    args = parser.parse_args()

    try:
        langs = None
        if args.languages:
            # Trim each token and drop blanks so "python, typescript" or a
            # trailing comma does not produce values that can never match.
            langs = [tok.strip() for tok in args.languages.split(",") if tok.strip()] or None
        result = extract_api_surface(
            target_path=args.target,
            languages=langs,
            include_private=args.include_private,
            include_docstrings=not args.no_docstrings,
        )
        output_result(result, args, filename="api_surface.json")
    except Exception as e:
        # Top-level CLI boundary: report the failure via the shared helper.
        error_result(str(e))
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
if __name__ == "__main__":
|
|
219
|
+
main()
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Code Evidence Retrieval skill.
|
|
3
|
+
|
|
4
|
+
Given a natural language query and a repo path, returns ranked code
|
|
5
|
+
snippets with file paths, signatures, and context scores.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
python -m claude_context.skills.evidence_retrieval \
|
|
9
|
+
--repo /path/to/repo --query "how does auth work?" --limit 10
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import logging
|
|
14
|
+
from typing import Any, Dict, List, Optional
|
|
15
|
+
|
|
16
|
+
from ._index_manager import ensure_index
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def retrieve_evidence(
    repo_path: str,
    query: str,
    limit: int = 10,
    filter_types: Optional[List[str]] = None,
    filter_languages: Optional[List[str]] = None,
    filter_paths: Optional[List[str]] = None,
    db_path: Optional[str] = None,
    reindex: bool = False,
) -> Dict[str, Any]:
    """Search a codebase for evidence matching a natural language query.

    Args:
        repo_path: Path to the repository.
        query: Natural language search query.
        limit: Maximum number of results requested from the searcher.
        filter_types: Restrict to these chunk types (function, class, ...).
        filter_languages: Restrict to these languages (python, ...).
        filter_paths: Restrict to files under these directory prefixes,
            interpreted relative to ``repo_path``.
        db_path: Override Milvus DB path.
        reindex: Force re-indexing.

    Returns:
        Dict echoing the query and repo path, with the result count, the
        index info returned by ``ensure_index`` and one ranked entry per hit.
    """
    searcher, index_info = ensure_index(repo_path, db_path=db_path, reindex=reindex)

    # Prefixes are anchored at the repository root rather than the current
    # working directory; per the original note, the index stores absolute
    # file paths, so the filter values must be absolute as well.
    absolute_prefixes = None
    if filter_paths:
        from pathlib import Path
        root = Path(repo_path).resolve()
        absolute_prefixes = []
        for prefix in filter_paths:
            absolute_prefixes.append(str((root / prefix).resolve()))

    hits = searcher.search(
        query=query,
        limit=limit,
        filter_chunk_types=filter_types,
        filter_languages=filter_languages,
        filter_paths=absolute_prefixes,
    )

    ranked: List[Dict[str, Any]] = []
    for position, hit in enumerate(hits, start=1):
        score_block = {
            "vector": round(hit.vector_score, 4),
            "bm25": round(hit.bm25_score, 4),
            "combined": round(hit.combined_score, 4),
        }
        ranked.append({
            "rank": position,
            "file_path": hit.file_path,
            "file_name": hit.file_name,
            "start_line": hit.start_line,
            "end_line": hit.end_line,
            "language": hit.language,
            "chunk_type": hit.chunk_type,
            "chunk_name": hit.chunk_name,
            "parent_context": hit.parent_context,
            "signature": hit.signature,
            "docstring": hit.docstring,
            "return_type": hit.return_type,
            "content": hit.content,
            "scores": score_block,
        })

    return {
        "query": query,
        "repo_path": repo_path,
        "result_count": len(hits),
        "index_info": index_info,
        "results": ranked,
    }
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def main():
    """CLI entry point.

    Parses the command line, runs ``retrieve_evidence`` and hands the
    result to ``output_result``; any exception is reported through
    ``error_result`` instead of propagating.
    """
    from ._cli_common import setup_logging, add_common_args, output_result, error_result
    setup_logging()

    parser = argparse.ArgumentParser(
        description="Search codebase for code evidence matching a query"
    )
    parser.add_argument(
        "--repo", required=True,
        help="Path to the repository to search"
    )
    parser.add_argument(
        "--query", required=True,
        help="Natural language search query"
    )
    parser.add_argument(
        "--limit", type=int, default=10,
        help="Maximum number of results (default: 10)"
    )
    parser.add_argument(
        "--filter-types", type=str, default=None,
        help="Comma-separated chunk type filter (e.g., function,method)"
    )
    parser.add_argument(
        "--filter-languages", type=str, default=None,
        help="Comma-separated language filter (e.g., python,typescript)"
    )
    parser.add_argument(
        "--filter-paths", type=str, default=None,
        help="Comma-separated directory prefixes to scope results (e.g., src/auth,src/config)"
    )
    add_common_args(parser)

    args = parser.parse_args()

    def _split_csv(raw):
        """Split a comma-separated option; trim tokens, drop blanks, None if empty."""
        if not raw:
            return None
        # Without trimming, "function, method" yields " method", which can
        # never match a stored value; a trailing comma would add "".
        tokens = [tok.strip() for tok in raw.split(",") if tok.strip()]
        return tokens or None

    try:
        # NOTE(review): args.db_path / args.reindex are presumably registered
        # by add_common_args — confirm against _cli_common.
        result = retrieve_evidence(
            repo_path=args.repo,
            query=args.query,
            limit=args.limit,
            filter_types=_split_csv(args.filter_types),
            filter_languages=_split_csv(args.filter_languages),
            filter_paths=_split_csv(args.filter_paths),
            db_path=args.db_path,
            reindex=args.reindex,
        )
        output_result(result, args, filename="evidence.json")
    except Exception as e:
        # Top-level CLI boundary: report the failure via the shared helper.
        error_result(str(e))
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
if __name__ == "__main__":
|
|
151
|
+
main()
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Code-Grounded Review skill.
|
|
3
|
+
|
|
4
|
+
Given a draft document and a repo path, extracts factual claims from
|
|
5
|
+
the document and verifies each against the source code using hybrid
|
|
6
|
+
search. Returns per-claim verdicts with supporting evidence.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python -m claude_context.skills.grounded_review \
|
|
10
|
+
--repo /path/to/repo --draft docs/getting-started.md
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import logging
|
|
15
|
+
import re
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
18
|
+
|
|
19
|
+
from ..search import SearchResult
|
|
20
|
+
from ._index_manager import ensure_index
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def grounded_review(
    repo_path: str,
    draft_path: str,
    max_evidence_per_claim: int = 5,
    db_path: Optional[str] = None,
    reindex: bool = False,
) -> Dict[str, Any]:
    """Review a draft document by checking claims against source code.

    Args:
        repo_path: Path to the repository.
        draft_path: Path to the markdown draft document.
        max_evidence_per_claim: Max evidence snippets searched for and
            reported per claim.
        db_path: Override Milvus DB path.
        reindex: Force re-indexing.

    Returns:
        Dict with the draft/repo paths, the claim count, a per-verdict
        summary, the per-claim results and the index info. When the draft
        yields no claims the index is not touched and "index_info" is None.

    Raises:
        FileNotFoundError: If ``draft_path`` does not exist.
    """
    draft = Path(draft_path)
    if not draft.exists():
        raise FileNotFoundError(f"Draft not found: {draft_path}")

    draft_text = draft.read_text(encoding="utf-8", errors="ignore")
    claims = _extract_claims(draft_text)

    summary = {"supported": 0, "partially_supported": 0, "unsupported": 0, "no_evidence_found": 0}

    if not claims:
        # Nothing to verify: skip indexing entirely, but keep the same keys
        # as the normal return so consumers can read "index_info" safely.
        return {
            "draft_path": str(draft_path),
            "repo_path": repo_path,
            "total_claims": 0,
            "summary": summary,
            "claims": [],
            "index_info": None,
        }

    searcher, index_info = ensure_index(repo_path, db_path=db_path, reindex=reindex)

    claim_results = []

    for i, claim_text in enumerate(claims):
        results = searcher.search(query=claim_text, limit=max_evidence_per_claim)
        verdict, confidence = _judge_claim(claim_text, results)
        summary[verdict] += 1

        claim_results.append({
            "claim_id": i + 1,
            "text": claim_text,
            "verdict": verdict,
            "confidence": round(confidence, 4),
            "evidence": [
                {
                    "file_path": r.file_path,
                    "start_line": r.start_line,
                    "end_line": r.end_line,
                    "chunk_type": r.chunk_type,
                    "chunk_name": r.chunk_name,
                    "signature": r.signature,
                    "relevance_score": round(r.combined_score, 4),
                    "content_snippet": r.content[:300],
                }
                # Honour the caller's cap; a fixed top-3 slice made
                # max_evidence_per_claim ineffective above 3.
                for r in results[:max_evidence_per_claim]
            ],
        })

    return {
        "draft_path": str(draft_path),
        "repo_path": repo_path,
        "total_claims": len(claims),
        "summary": summary,
        "claims": claim_results,
        "index_info": index_info,
    }
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _extract_claims(draft_text: str) -> List[str]:
    """Extract verifiable factual claims from a markdown draft.

    Looks for sentences that reference code identifiers: backtick-quoted
    names, file paths, function-like names (snake_case, camelCase), or
    import statements.

    Args:
        draft_text: Raw markdown text of the draft.

    Returns:
        Whitespace-normalised sentences, in document order, that are at
        least 20 characters before normalisation, at most 500 characters
        after it, and match at least one code-reference pattern.
    """
    # Remove code blocks (they're examples, not claims)
    text = re.sub(r"```[\s\S]*?```", "", draft_text)

    # Remove headings
    text = re.sub(r"^#+\s+.*$", "", text, flags=re.MULTILINE)

    # Split into sentences. A blank line ends a markdown paragraph, so it
    # is a boundary too; otherwise a paragraph or list item without final
    # punctuation would be glued onto the first sentence that follows it.
    sentences = re.split(r"(?<=[.!?])\s+|\n\s*\n", text)

    # Patterns that indicate a factual claim about code
    code_patterns = [
        r"`[^`]+`",                    # backtick-quoted identifiers
        r"\b\w+\.\w+\b",               # dotted names (module.func)
        r"\b[a-z]+_[a-z_]+\b",         # snake_case identifiers
        r"\b[a-z]+[A-Z][a-zA-Z]+\b",   # camelCase identifiers
        r"[/\\]\w+\.\w+",              # file paths
        r"\bimport\b",                 # import references
        r"\bclass\b",                  # class references
        r"\bfunction\b|\bmethod\b|\bdef\b",  # function references
    ]
    # Compile once; the same pattern is applied to every sentence.
    combined_pattern = re.compile("|".join(code_patterns))

    claims = []
    for sentence in sentences:
        sentence = sentence.strip()
        if len(sentence) < 20:
            continue
        if combined_pattern.search(sentence):
            # Clean up whitespace
            claim = " ".join(sentence.split())
            if len(claim) <= 500:  # Skip overly long passages
                claims.append(claim)

    return claims
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _judge_claim(claim: str, evidence: List[SearchResult]) -> Tuple[str, float]:
    """Derive a verdict for a claim from its search evidence.

    Deterministic heuristic (no LLM): combines the combined score of the
    first evidence item with the fraction of the claim's word tokens that
    also occur in that item's content.

    Args:
        claim: Claim text.
        evidence: Search results for the claim; only the first is examined.

    Returns:
        ``(verdict, confidence)`` where verdict is one of
        "no_evidence_found", "supported", "partially_supported" or
        "unsupported", and confidence is the first result's combined score
        (0.0 when there is no evidence).
    """
    if not evidence:
        return "no_evidence_found", 0.0

    best = evidence[0]
    score = best.combined_score

    def _words(text: str) -> set:
        # Lower-cased word tokens, de-duplicated.
        return set(re.findall(r"\b\w+\b", text.lower()))

    claim_words = _words(claim)
    evidence_words = _words(best.content)
    # Share of claim tokens found in the evidence; 0.0 for a token-less claim.
    shared_ratio = (
        len(claim_words & evidence_words) / len(claim_words) if claim_words else 0.0
    )

    # Order matters: the checks are evaluated top to bottom.
    if score < 0.3 and shared_ratio < 0.2:
        verdict = "no_evidence_found"
    elif score > 0.7 and shared_ratio > 0.4:
        verdict = "supported"
    elif score > 0.5 or shared_ratio > 0.3:
        verdict = "partially_supported"
    else:
        verdict = "unsupported"

    return verdict, score
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def main():
    """Command-line front end for the grounded review skill.

    Builds the argument parser, runs ``grounded_review`` with the parsed
    options and passes the outcome to ``output_result``; any exception is
    routed to ``error_result``.
    """
    from ._cli_common import add_common_args, error_result, output_result, setup_logging

    setup_logging()

    cli = argparse.ArgumentParser(
        description="Review a draft document by checking claims against source code"
    )
    cli.add_argument("--repo", help="Path to the repository", required=True)
    cli.add_argument("--draft", help="Path to the markdown draft document", required=True)
    cli.add_argument(
        "--max-evidence",
        default=5,
        type=int,
        help="Max evidence snippets per claim (default: 5)",
    )
    # NOTE(review): db_path / reindex read below are presumably registered
    # by add_common_args — confirm against _cli_common.
    add_common_args(cli)

    options = cli.parse_args()

    try:
        review = grounded_review(
            repo_path=options.repo,
            draft_path=options.draft,
            max_evidence_per_claim=options.max_evidence,
            db_path=options.db_path,
            reindex=options.reindex,
        )
        output_result(review, options, filename="review.json")
    except Exception as exc:
        # Top-level CLI boundary: surface the message through the shared helper.
        error_result(str(exc))
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
if __name__ == "__main__":
|
|
212
|
+
main()
|