code-finder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. claude_context/__init__.py +33 -0
  2. claude_context/agentic_integration.py +309 -0
  3. claude_context/ast_chunker.py +646 -0
  4. claude_context/config.py +239 -0
  5. claude_context/context_manager.py +627 -0
  6. claude_context/embeddings.py +307 -0
  7. claude_context/embeddings_interface.py +226 -0
  8. claude_context/enhanced_ast_chunker.py +1129 -0
  9. claude_context/explorer.py +951 -0
  10. claude_context/explorer_with_context.py +1008 -0
  11. claude_context/indexer.py +893 -0
  12. claude_context/markdown_chunker.py +421 -0
  13. claude_context/mode_handler.py +1774 -0
  14. claude_context/query_metrics.py +164 -0
  15. claude_context/question_generator.py +800 -0
  16. claude_context/readme_extractor.py +485 -0
  17. claude_context/repository_adapter.py +399 -0
  18. claude_context/search.py +493 -0
  19. claude_context/skills/__init__.py +11 -0
  20. claude_context/skills/_cli_common.py +74 -0
  21. claude_context/skills/_index_manager.py +98 -0
  22. claude_context/skills/api_surface.py +219 -0
  23. claude_context/skills/evidence_retrieval.py +151 -0
  24. claude_context/skills/grounded_review.py +212 -0
  25. claude_context/synthesis/__init__.py +8 -0
  26. claude_context/synthesis/editor_agent.py +391 -0
  27. claude_context/synthesis/llm_synthesizer.py +153 -0
  28. claude_context/synthesis/logic_explainer.py +235 -0
  29. claude_context/synthesis/multi_review_pipeline.py +717 -0
  30. claude_context/synthesis/prompt_builder.py +439 -0
  31. claude_context/synthesis/providers.py +115 -0
  32. claude_context/synthesis/validators.py +458 -0
  33. code_finder-0.1.0.dist-info/METADATA +823 -0
  34. code_finder-0.1.0.dist-info/RECORD +37 -0
  35. code_finder-0.1.0.dist-info/WHEEL +5 -0
  36. code_finder-0.1.0.dist-info/entry_points.txt +4 -0
  37. code_finder-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,219 @@
1
+ """
2
+ API Surface Extraction skill.
3
+
4
+ Extracts the public API surface from source code files or directories
5
+ using AST parsing. Deterministic — no LLM, no indexing, no embeddings.
6
+
7
+ Usage:
8
+ python -m claude_context.skills.api_surface --target src/auth/
9
+ python -m claude_context.skills.api_surface --target src/auth/handler.py --include-private
10
+ """
11
+
12
+ import argparse
13
+ import logging
14
+ from pathlib import Path
15
+ from typing import Any, Dict, List, Optional
16
+
17
+ from ..enhanced_ast_chunker import EnhancedASTChunker, EnhancedCodeChunk
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Types that represent API surface entities
22
+ API_ENTITY_TYPES = {"function", "class", "method", "interface", "type"}
23
+
24
+
25
def extract_api_surface(
    target_path: str,
    languages: Optional[List[str]] = None,
    include_private: bool = False,
    include_docstrings: bool = True,
    max_chunk_size: int = 1500,
) -> Dict[str, Any]:
    """Extract the public API surface from source files.

    Args:
        target_path: Path to a file or directory.
        languages: Optional language filter (e.g., ["python", "typescript"]).
            Matching is case-insensitive and ignores whitespace around each
            requested language name.
        include_private: Include names that start with an underscore.
        include_docstrings: Include docstrings in the output.
        max_chunk_size: Max chunk size handed to the AST chunker.

    Returns:
        Dict with the keys ``target_path``, ``files_processed``,
        ``files_with_api``, ``total_entities`` and ``api_surface``; the
        latter maps each file path to ``{"language": ..., "entities": ...}``.

    Raises:
        FileNotFoundError: If ``target_path`` does not exist.
    """
    target = Path(target_path).resolve()
    if not target.exists():
        raise FileNotFoundError(f"Target path does not exist: {target}")

    chunker = EnhancedASTChunker(max_chunk_size=max_chunk_size, context_mode="full")

    # Collect files to process (sorted so directory output is deterministic).
    if target.is_file():
        files = [target]
    else:
        files = sorted(_discover_source_files(target))

    # Normalise the language filter once, instead of rebuilding a lowered
    # list for every entity of every file.
    wanted_languages = (
        {lang.strip().lower() for lang in languages} if languages else None
    )

    api_surface: Dict[str, Any] = {}
    total_entities = 0

    for file_path in files:
        chunks = chunker.chunk_file(file_path)
        if not chunks:
            continue

        # Keep only chunks that represent API entities.
        entities = [c for c in chunks if c.chunk_type in API_ENTITY_TYPES]

        if wanted_languages:
            entities = [e for e in entities if e.language.lower() in wanted_languages]

        if not include_private:
            entities = [e for e in entities if not (e.name and e.name.startswith("_"))]

        if not entities:
            continue

        # The file's language is taken from its first surviving entity.
        api_surface[str(file_path)] = {
            "language": entities[0].language,
            "entities": _structure_entities(entities, include_docstrings),
        }
        total_entities += len(entities)

    return {
        "target_path": str(target),
        "files_processed": len(files),
        "files_with_api": len(api_surface),
        "total_entities": total_entities,
        "api_surface": api_surface,
    }
95
+
96
+
97
+ def _discover_source_files(directory: Path) -> List[Path]:
98
+ """Discover source files in a directory, skipping common non-source dirs."""
99
+ skip_dirs = {".git", "__pycache__", "node_modules", ".venv", "venv", ".vibe2doc"}
100
+ extensions = {
101
+ ".py", ".js", ".ts", ".tsx", ".jsx", ".go", ".rs",
102
+ ".java", ".kt", ".cpp", ".c", ".h", ".hpp",
103
+ }
104
+ files = []
105
+ for path in directory.rglob("*"):
106
+ if any(part in skip_dirs for part in path.parts):
107
+ continue
108
+ if path.is_file() and path.suffix in extensions:
109
+ files.append(path)
110
+ return files
111
+
112
+
113
+ def _structure_entities(
114
+ entities: List[EnhancedCodeChunk],
115
+ include_docstrings: bool,
116
+ ) -> List[Dict[str, Any]]:
117
+ """Structure entities, nesting methods under their parent classes."""
118
+ # Separate classes and top-level items
119
+ classes: Dict[str, Dict[str, Any]] = {}
120
+ top_level: List[Dict[str, Any]] = []
121
+
122
+ for entity in entities:
123
+ entry = _entity_to_dict(entity, include_docstrings)
124
+
125
+ if entity.chunk_type == "class":
126
+ entry["methods"] = []
127
+ classes[entity.name] = entry
128
+ elif entity.scope:
129
+ # Has a parent scope — nest under parent class if we have it.
130
+ # The chunker may report chunk_type="function" for methods,
131
+ # so we use scope presence as the signal, not chunk_type.
132
+ parent_name = entity.scope[0]
133
+ if parent_name in classes:
134
+ classes[parent_name]["methods"].append(entry)
135
+ else:
136
+ top_level.append(entry)
137
+ else:
138
+ top_level.append(entry)
139
+
140
+ # Classes first, then top-level functions
141
+ result = list(classes.values()) + top_level
142
+ return result
143
+
144
+
145
+ def _entity_to_dict(entity: EnhancedCodeChunk, include_docstrings: bool) -> Dict[str, Any]:
146
+ """Convert a single entity to output dict."""
147
+ d: Dict[str, Any] = {
148
+ "type": entity.chunk_type,
149
+ "name": entity.name,
150
+ "signature": entity.signature,
151
+ "line_range": list(entity.line_range),
152
+ }
153
+
154
+ if entity.parameters:
155
+ d["parameters"] = [
156
+ {
157
+ "name": p.name,
158
+ "type": p.type_annotation,
159
+ "default": p.default_value,
160
+ }
161
+ for p in entity.parameters
162
+ if p.name != "self"
163
+ ]
164
+
165
+ if entity.return_type:
166
+ d["return_type"] = entity.return_type
167
+
168
+ if include_docstrings and entity.docstring:
169
+ d["docstring"] = entity.docstring
170
+
171
+ if entity.scope:
172
+ d["scope"] = entity.scope
173
+
174
+ return d
175
+
176
+
177
def main():
    """CLI entry point for API surface extraction.

    Parses ``--target``, ``--languages``, ``--include-private`` and
    ``--no-docstrings`` (plus the shared arguments added by
    ``add_common_args``), runs ``extract_api_surface`` and hands the result
    to ``output_result``. Any exception raised while doing so is reported
    through ``error_result`` as its string form.
    """
    from ._cli_common import setup_logging, add_common_args, output_result, error_result
    setup_logging()

    parser = argparse.ArgumentParser(
        description="Extract API surface from source code using AST parsing"
    )
    parser.add_argument(
        "--target", required=True,
        help="File or directory to analyze"
    )
    parser.add_argument(
        "--languages", type=str, default=None,
        help="Comma-separated language filter (e.g., python,typescript)"
    )
    parser.add_argument(
        "--include-private", action="store_true",
        help="Include _prefixed private names"
    )
    parser.add_argument(
        "--no-docstrings", action="store_true",
        help="Exclude docstrings from output"
    )
    add_common_args(parser)

    args = parser.parse_args()

    try:
        langs = None
        if args.languages:
            # Tolerate "python, typescript" and stray commas: trim each item
            # and drop empties so the filter is not fed names that can
            # never match. An all-empty list means "no filter".
            langs = [
                name.strip() for name in args.languages.split(",") if name.strip()
            ] or None
        result = extract_api_surface(
            target_path=args.target,
            languages=langs,
            include_private=args.include_private,
            include_docstrings=not args.no_docstrings,
        )
        output_result(result, args, filename="api_surface.json")
    except Exception as e:
        error_result(str(e))
216
+
217
+
218
+ if __name__ == "__main__":
219
+ main()
@@ -0,0 +1,151 @@
1
+ """
2
+ Code Evidence Retrieval skill.
3
+
4
+ Given a natural language query and a repo path, returns ranked code
5
+ snippets with file paths, signatures, and context scores.
6
+
7
+ Usage:
8
+ python -m claude_context.skills.evidence_retrieval \
9
+ --repo /path/to/repo --query "how does auth work?" --limit 10
10
+ """
11
+
12
+ import argparse
13
+ import logging
14
+ from typing import Any, Dict, List, Optional
15
+
16
+ from ._index_manager import ensure_index
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
def retrieve_evidence(
    repo_path: str,
    query: str,
    limit: int = 10,
    filter_types: Optional[List[str]] = None,
    filter_languages: Optional[List[str]] = None,
    filter_paths: Optional[List[str]] = None,
    db_path: Optional[str] = None,
    reindex: bool = False,
) -> Dict[str, Any]:
    """Search a codebase for evidence matching a natural language query.

    Args:
        repo_path: Path to the repository.
        query: Natural language search query.
        limit: Maximum number of results.
        filter_types: Filter by chunk type (function, class, method, etc.).
        filter_languages: Filter by language (python, typescript, etc.).
        filter_paths: Directory prefixes used to scope results; each one is
            resolved against ``repo_path`` before being passed to the search.
        db_path: Override Milvus DB path.
        reindex: Force re-indexing.

    Returns:
        Dict with ``query``, ``repo_path``, ``result_count``, ``index_info``
        and ``results``; each result carries a 1-based ``rank``, location
        and metadata fields, the chunk ``content`` and scores rounded to
        four decimals.
    """
    searcher, index_info = ensure_index(repo_path, db_path=db_path, reindex=reindex)

    # Anchor the prefixes at the repository root (not the cwd) so they line
    # up with the absolute file_path values stored in the index.
    if filter_paths:
        from pathlib import Path
        base = Path(repo_path).resolve()
        path_filter = [str((base / prefix).resolve()) for prefix in filter_paths]
    else:
        path_filter = None

    hits = searcher.search(
        query=query,
        limit=limit,
        filter_chunk_types=filter_types,
        filter_languages=filter_languages,
        filter_paths=path_filter,
    )

    hit_count = len(hits)
    ranked = []
    for position, hit in enumerate(hits, start=1):
        ranked.append(
            {
                "rank": position,
                "file_path": hit.file_path,
                "file_name": hit.file_name,
                "start_line": hit.start_line,
                "end_line": hit.end_line,
                "language": hit.language,
                "chunk_type": hit.chunk_type,
                "chunk_name": hit.chunk_name,
                "parent_context": hit.parent_context,
                "signature": hit.signature,
                "docstring": hit.docstring,
                "return_type": hit.return_type,
                "content": hit.content,
                "scores": {
                    "vector": round(hit.vector_score, 4),
                    "bm25": round(hit.bm25_score, 4),
                    "combined": round(hit.combined_score, 4),
                },
            }
        )

    return {
        "query": query,
        "repo_path": repo_path,
        "result_count": hit_count,
        "index_info": index_info,
        "results": ranked,
    }
93
+
94
+
95
def main():
    """CLI entry point for code evidence retrieval.

    Parses ``--repo``, ``--query``, ``--limit`` and the three comma-separated
    filter options (plus the shared arguments added by ``add_common_args``),
    runs ``retrieve_evidence`` and hands the result to ``output_result``.
    Any exception raised while doing so is reported through ``error_result``
    as its string form.
    """
    from ._cli_common import setup_logging, add_common_args, output_result, error_result
    setup_logging()

    def _split_csv(raw: Optional[str]) -> Optional[List[str]]:
        """Split a comma-separated option into trimmed, non-empty items."""
        if not raw:
            return None
        # Trimming matters: "src/auth, src/config" would otherwise yield the
        # prefix " src/config", which can never match a real path.
        items = [item.strip() for item in raw.split(",")]
        return [item for item in items if item] or None

    parser = argparse.ArgumentParser(
        description="Search codebase for code evidence matching a query"
    )
    parser.add_argument(
        "--repo", required=True,
        help="Path to the repository to search"
    )
    parser.add_argument(
        "--query", required=True,
        help="Natural language search query"
    )
    parser.add_argument(
        "--limit", type=int, default=10,
        help="Maximum number of results (default: 10)"
    )
    parser.add_argument(
        "--filter-types", type=str, default=None,
        help="Comma-separated chunk type filter (e.g., function,method)"
    )
    parser.add_argument(
        "--filter-languages", type=str, default=None,
        help="Comma-separated language filter (e.g., python,typescript)"
    )
    parser.add_argument(
        "--filter-paths", type=str, default=None,
        help="Comma-separated directory prefixes to scope results (e.g., src/auth,src/config)"
    )
    add_common_args(parser)

    args = parser.parse_args()

    try:
        result = retrieve_evidence(
            repo_path=args.repo,
            query=args.query,
            limit=args.limit,
            filter_types=_split_csv(args.filter_types),
            filter_languages=_split_csv(args.filter_languages),
            filter_paths=_split_csv(args.filter_paths),
            db_path=args.db_path,
            reindex=args.reindex,
        )
        output_result(result, args, filename="evidence.json")
    except Exception as e:
        error_result(str(e))
148
+
149
+
150
+ if __name__ == "__main__":
151
+ main()
@@ -0,0 +1,212 @@
1
+ """
2
+ Code-Grounded Review skill.
3
+
4
+ Given a draft document and a repo path, extracts factual claims from
5
+ the document and verifies each against the source code using hybrid
6
+ search. Returns per-claim verdicts with supporting evidence.
7
+
8
+ Usage:
9
+ python -m claude_context.skills.grounded_review \
10
+ --repo /path/to/repo --draft docs/getting-started.md
11
+ """
12
+
13
+ import argparse
14
+ import logging
15
+ import re
16
+ from pathlib import Path
17
+ from typing import Any, Dict, List, Optional, Tuple
18
+
19
+ from ..search import SearchResult
20
+ from ._index_manager import ensure_index
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
def grounded_review(
    repo_path: str,
    draft_path: str,
    max_evidence_per_claim: int = 5,
    db_path: Optional[str] = None,
    reindex: bool = False,
) -> Dict[str, Any]:
    """Review a draft document by checking claims against source code.

    Args:
        repo_path: Path to the repository.
        draft_path: Path to the markdown draft document.
        max_evidence_per_claim: Max evidence snippets searched for and
            reported per claim.
        db_path: Override Milvus DB path.
        reindex: Force re-indexing.

    Returns:
        Dict with ``draft_path``, ``repo_path``, ``total_claims``, a
        ``summary`` count per verdict, the per-claim ``claims`` list and
        ``index_info``. When the draft yields no claims the index is never
        opened and ``index_info`` is ``None``.

    Raises:
        FileNotFoundError: If ``draft_path`` does not exist.
    """
    draft = Path(draft_path)
    if not draft.exists():
        raise FileNotFoundError(f"Draft not found: {draft_path}")

    draft_text = draft.read_text(encoding="utf-8", errors="ignore")
    claims = _extract_claims(draft_text)

    summary = {"supported": 0, "partially_supported": 0, "unsupported": 0, "no_evidence_found": 0}

    if not claims:
        # Return the same keys as the normal path so consumers can rely on
        # one schema; nothing was indexed, hence index_info is None.
        return {
            "draft_path": str(draft_path),
            "repo_path": repo_path,
            "total_claims": 0,
            "summary": summary,
            "claims": [],
            "index_info": None,
        }

    searcher, index_info = ensure_index(repo_path, db_path=db_path, reindex=reindex)

    claim_results = []

    for i, claim_text in enumerate(claims):
        results = searcher.search(query=claim_text, limit=max_evidence_per_claim)
        verdict, confidence = _judge_claim(claim_text, results)
        summary[verdict] += 1

        claim_results.append({
            "claim_id": i + 1,
            "text": claim_text,
            "verdict": verdict,
            "confidence": round(confidence, 4),
            "evidence": [
                {
                    "file_path": r.file_path,
                    "start_line": r.start_line,
                    "end_line": r.end_line,
                    "chunk_type": r.chunk_type,
                    "chunk_name": r.chunk_name,
                    "signature": r.signature,
                    "relevance_score": round(r.combined_score, 4),
                    "content_snippet": r.content[:300],
                }
                # Honour the caller's limit; a fixed cut-off of three made
                # any larger max_evidence_per_claim silently ineffective.
                for r in results[:max_evidence_per_claim]
            ],
        })

    return {
        "draft_path": str(draft_path),
        "repo_path": repo_path,
        "total_claims": len(claims),
        "summary": summary,
        "claims": claim_results,
        "index_info": index_info,
    }
98
+
99
+
100
+ def _extract_claims(draft_text: str) -> List[str]:
101
+ """Extract verifiable factual claims from a markdown draft.
102
+
103
+ Looks for sentences that reference code identifiers: backtick-quoted
104
+ names, file paths, function-like names (snake_case, camelCase), or
105
+ import statements.
106
+ """
107
+ # Remove code blocks (they're examples, not claims)
108
+ text = re.sub(r"```[\s\S]*?```", "", draft_text)
109
+
110
+ # Remove headings
111
+ text = re.sub(r"^#+\s+.*$", "", text, flags=re.MULTILINE)
112
+
113
+ # Split into sentences
114
+ sentences = re.split(r"(?<=[.!?])\s+", text)
115
+
116
+ claims = []
117
+ # Patterns that indicate a factual claim about code
118
+ code_patterns = [
119
+ r"`[^`]+`", # backtick-quoted identifiers
120
+ r"\b\w+\.\w+\b", # dotted names (module.func)
121
+ r"\b[a-z]+_[a-z_]+\b", # snake_case identifiers
122
+ r"\b[a-z]+[A-Z][a-zA-Z]+\b", # camelCase identifiers
123
+ r"[/\\]\w+\.\w+", # file paths
124
+ r"\bimport\b", # import references
125
+ r"\bclass\b", # class references
126
+ r"\bfunction\b|\bmethod\b|\bdef\b", # function references
127
+ ]
128
+ combined_pattern = "|".join(code_patterns)
129
+
130
+ for sentence in sentences:
131
+ sentence = sentence.strip()
132
+ if len(sentence) < 20:
133
+ continue
134
+ if re.search(combined_pattern, sentence):
135
+ # Clean up whitespace
136
+ claim = " ".join(sentence.split())
137
+ if len(claim) <= 500: # Skip overly long passages
138
+ claims.append(claim)
139
+
140
+ return claims
141
+
142
+
143
+ def _judge_claim(claim: str, evidence: List[SearchResult]) -> Tuple[str, float]:
144
+ """Produce a verdict for a claim based on search evidence.
145
+
146
+ Uses score thresholds and keyword overlap. Deterministic — no LLM.
147
+ Phase 2 can add LLM-based judgment using the evidence as context.
148
+ """
149
+ if not evidence:
150
+ return "no_evidence_found", 0.0
151
+
152
+ top_score = evidence[0].combined_score
153
+
154
+ # Calculate keyword overlap between claim and top evidence
155
+ claim_tokens = set(re.findall(r"\b\w+\b", claim.lower()))
156
+ evidence_tokens = set(re.findall(r"\b\w+\b", evidence[0].content.lower()))
157
+ if claim_tokens:
158
+ overlap = len(claim_tokens & evidence_tokens) / len(claim_tokens)
159
+ else:
160
+ overlap = 0.0
161
+
162
+ if top_score < 0.3 and overlap < 0.2:
163
+ return "no_evidence_found", top_score
164
+
165
+ if top_score > 0.7 and overlap > 0.4:
166
+ return "supported", top_score
167
+
168
+ if top_score > 0.5 or overlap > 0.3:
169
+ return "partially_supported", top_score
170
+
171
+ return "unsupported", top_score
172
+
173
+
174
def main():
    """Command-line entry point for the code-grounded review.

    Reads ``--repo``, ``--draft`` and ``--max-evidence`` (plus the shared
    arguments added by ``add_common_args``), runs ``grounded_review`` and
    passes the result to ``output_result``; any exception raised while doing
    so is reported through ``error_result`` as its string form.
    """
    from ._cli_common import setup_logging, add_common_args, output_result, error_result
    setup_logging()

    cli = argparse.ArgumentParser(
        description="Review a draft document by checking claims against source code"
    )
    cli.add_argument("--repo", required=True, help="Path to the repository")
    cli.add_argument("--draft", required=True, help="Path to the markdown draft document")
    cli.add_argument(
        "--max-evidence",
        type=int,
        default=5,
        help="Max evidence snippets per claim (default: 5)",
    )
    add_common_args(cli)

    options = cli.parse_args()

    try:
        review = grounded_review(
            repo_path=options.repo,
            draft_path=options.draft,
            max_evidence_per_claim=options.max_evidence,
            db_path=options.db_path,
            reindex=options.reindex,
        )
        output_result(review, options, filename="review.json")
    except Exception as exc:
        error_result(str(exc))
209
+
210
+
211
+ if __name__ == "__main__":
212
+ main()
@@ -0,0 +1,8 @@
1
+ from .llm_synthesizer import LLMSynthesizer
2
+
3
+ __all__ = [
4
+ "LLMSynthesizer",
5
+ ]
6
+
7
+
8
+