code-finder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. claude_context/__init__.py +33 -0
  2. claude_context/agentic_integration.py +309 -0
  3. claude_context/ast_chunker.py +646 -0
  4. claude_context/config.py +239 -0
  5. claude_context/context_manager.py +627 -0
  6. claude_context/embeddings.py +307 -0
  7. claude_context/embeddings_interface.py +226 -0
  8. claude_context/enhanced_ast_chunker.py +1129 -0
  9. claude_context/explorer.py +951 -0
  10. claude_context/explorer_with_context.py +1008 -0
  11. claude_context/indexer.py +893 -0
  12. claude_context/markdown_chunker.py +421 -0
  13. claude_context/mode_handler.py +1774 -0
  14. claude_context/query_metrics.py +164 -0
  15. claude_context/question_generator.py +800 -0
  16. claude_context/readme_extractor.py +485 -0
  17. claude_context/repository_adapter.py +399 -0
  18. claude_context/search.py +493 -0
  19. claude_context/skills/__init__.py +11 -0
  20. claude_context/skills/_cli_common.py +74 -0
  21. claude_context/skills/_index_manager.py +98 -0
  22. claude_context/skills/api_surface.py +219 -0
  23. claude_context/skills/evidence_retrieval.py +151 -0
  24. claude_context/skills/grounded_review.py +212 -0
  25. claude_context/synthesis/__init__.py +8 -0
  26. claude_context/synthesis/editor_agent.py +391 -0
  27. claude_context/synthesis/llm_synthesizer.py +153 -0
  28. claude_context/synthesis/logic_explainer.py +235 -0
  29. claude_context/synthesis/multi_review_pipeline.py +717 -0
  30. claude_context/synthesis/prompt_builder.py +439 -0
  31. claude_context/synthesis/providers.py +115 -0
  32. claude_context/synthesis/validators.py +458 -0
  33. code_finder-0.1.0.dist-info/METADATA +823 -0
  34. code_finder-0.1.0.dist-info/RECORD +37 -0
  35. code_finder-0.1.0.dist-info/WHEEL +5 -0
  36. code_finder-0.1.0.dist-info/entry_points.txt +4 -0
  37. code_finder-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,458 @@
1
+ import re
2
+ import logging
3
+ from typing import List, Dict, Tuple, Any, Optional, Set
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+
8
def check_essentials_present(text: str, essentials: Dict[str, Any]) -> Tuple[bool, str]:
    """
    Validate that essential content (installation, quickstart) appears in output.

    This is CRITICAL - fail if essentials are in the evidence but missing from
    the output. It prevents the LLM from writing about obscure functions while
    skipping install commands.

    MANDATORY EVIDENCE-FIRST APPROACH:
    - The installation command must appear verbatim in the output
    - The output must contain a code block, and at least half of the
      non-comment quickstart lines must appear in it verbatim
    - A missing mention of authentication only produces a WARNING

    Args:
        text: Generated section text. ``None`` is treated as empty text.
        essentials: Mapping that may hold ``"installation"`` (with a
            ``"command"`` entry), ``"quickstart"`` (with a ``"code"`` entry)
            and ``"authentication"``.

    Returns:
        ``(ok, message)``. ``ok`` is False only when at least one CRITICAL
        violation was found. ``message`` joins all issues with ``"; "``; it may
        be non-empty while ``ok`` is True when only warnings were produced.
    """
    if not essentials:
        return True, ""

    # Normalise once so a missing/None output is reported as a violation
    # instead of raising TypeError on the substring checks below.
    text = text or ""
    lowered = text.lower()

    violations: List[str] = []

    # CRITICAL: installation command must appear exactly as given
    installation = essentials.get("installation")
    if installation:
        cmd = installation.get("command", "")
        if cmd and cmd not in text:
            violations.append(f"CRITICAL: Missing installation command from README: {cmd}")
            # Paraphrasing anti-pattern: prose about installing, but no command
            if "can be installed" in lowered:
                violations.append(
                    f"CRITICAL: Installation paraphrased instead of showing exact command: {cmd}"
                )

    # CRITICAL: quickstart code must be shown
    quickstart = essentials.get("quickstart")
    if quickstart:
        qs_code = quickstart.get("code", "")
        if "```" not in text:
            violations.append(
                "CRITICAL: Missing quickstart code block (found in README but not in output)"
            )
        elif qs_code:
            # Compare only meaningful lines: blanks and '#' comments are ignored
            stripped = (line.strip() for line in qs_code.split('\n'))
            code_lines = [line for line in stripped if line and not line.startswith('#')]
            if code_lines:
                present_count = sum(1 for line in code_lines if line in text)
                # Require at least 50% of the quickstart lines
                if present_count < len(code_lines) * 0.5:
                    violations.append(
                        f"CRITICAL: Quickstart example incomplete (only {present_count}/{len(code_lines)} lines present)"
                    )

    # Authentication is advisory only (WARNING - never blocks).
    # "auth" is a substring of "authentication", so one test covers both.
    warnings: List[str] = []
    if essentials.get("authentication") and "auth" not in lowered:
        warnings.append("WARNING: Authentication mentioned in README but not in output")

    if violations:
        # Include warnings for context, but still fail on the critical items
        return False, "; ".join(violations + warnings)

    # Warnings alone still pass; they are returned so the caller can log them
    return True, "; ".join(warnings)
70
+
71
+
72
def check_max_words(text: str, max_words: int) -> Tuple[bool, str]:
    """
    Check that the text does not exceed a word budget.

    Words are whitespace-separated tokens; ``None`` or empty text counts as
    zero words.

    Returns:
        ``(True, "")`` when within the limit, otherwise ``(False, reason)``.
    """
    # str.split() with no separator already ignores leading/trailing whitespace
    word_total = len((text or "").split())
    if word_total > max_words:
        return False, f"Too long: {word_total} words > {max_words}"
    return True, ""
77
+
78
+
79
def check_citations(text: str, min_citations: int, citation_style: str) -> Tuple[bool, str]:
    """
    Check that the text contains enough bracketed citations.

    When ``citation_style`` names an upper-case prefix (e.g. ``"[CITE:source]"``
    gives ``CITE``), only citations with that prefix are counted, such as
    ``[CITE:readme]``. Otherwise a generic pattern is used that accepts any
    ``[WORD:something]`` or ``[file.ext:something]`` form, for example
    ``[SOURCE:name]``, ``[API:endpoint]`` or ``[file.py:10-20]``.
    Matching is case-insensitive in both modes.

    Args:
        text: Text to scan. ``None`` or empty text simply contains no citations.
        min_citations: Minimum number of citations required.
        citation_style: Example citation used to derive the expected prefix.

    Returns:
        ``(True, "")`` when enough citations were found, otherwise
        ``(False, reason)``.
    """
    # Empty text is not an automatic failure: with a minimum of zero there is
    # nothing to find, so it goes through the same counting path as any text.
    text = text or ""

    # Try to extract the prefix from citation_style (e.g. "CITE" from "[CITE:source]")
    style_prefix = None
    if citation_style and ":" in citation_style:
        match = re.search(r'\[([A-Z]+):', citation_style)
        if match:
            style_prefix = match.group(1)

    if style_prefix:
        # Specific style like [SOURCE:...], [CITE:...], [API:...]
        pattern = rf"\[{style_prefix}:[^\]]+\]"
    else:
        # Generic: any [WORD:...] or [name.ext:...] pattern
        pattern = r"\[([A-Z]+|[a-z_]+\.[a-z]+):[^\]]+\]"

    found = re.findall(pattern, text, re.IGNORECASE)

    if len(found) >= int(min_citations):
        return True, ""
    return False, f"Missing citations: found {len(found)}, require >= {min_citations}"
114
+
115
+
116
def check_required_elements(text: str, required_elements: List[str]) -> Tuple[bool, str]:
    """
    Check that every required element occurs somewhere in the text.

    Each element is matched as a literal, case-insensitive substring.
    ``None`` is accepted for either argument and treated as empty.

    Returns:
        ``(True, "")`` when nothing is missing, otherwise ``(False, reason)``
        listing the missing elements in their original order.
    """
    haystack = text or ""
    absent = [
        element
        for element in (required_elements or [])
        # re.escape keeps the element literal; a match object is always truthy
        if not re.search(re.escape(element), haystack, re.IGNORECASE)
    ]
    if absent:
        return False, f"Missing required elements: {', '.join(absent)}"
    return True, ""
124
+
125
+
126
def validate_section_output(
    text: str,
    rules: Dict,
    max_words: int,
    essentials: Optional[Dict[str, Any]] = None,
    section_name: Optional[str] = None,
    structured_evidence: Optional[Dict[str, Any]] = None
) -> List[str]:
    """
    Validate section output against rules.

    EVIDENCE-FIRST VALIDATION (MANDATORY):
    - CRITICAL violations (missing essentials) are listed first
    - Other violations (citations, required elements, word count) follow
    - Anti-hallucination validation for API sections is WARNING level

    Priority:
    1. Essential content (install, quickstart) - CRITICAL
    2. Citations and required elements - HIGH
    3. API reference validation (anti-hallucination) - MEDIUM
    4. Word count - MEDIUM (up to 10% over is reported as a note)

    Args:
        text: Generated section text.
        rules: Validation rules; reads ``require_citations`` (default True),
            ``min_citations`` (default 1), ``citation_style`` (default
            ``"[CITE:source]"``) and ``required_elements``. ``None`` is
            treated as an empty rule set.
        max_words: Word budget for the section.
        essentials: Essential README content; only checked for sections whose
            name looks like getting-started/installation/quickstart/overview/
            introduction.
        section_name: Name of the section being validated.
        structured_evidence: Evidence mapping; reads ``usage["entities"]`` for
            API sections and ``rationale`` for the rationale-citation check.

    Returns:
        List of violation messages, critical ones first. An empty list means
        the section passed every check.
    """
    rules = rules or {}
    violations: List[str] = []
    critical_violations: List[str] = []

    # PRIORITY 1: Check essentials FIRST (CRITICAL - must pass)
    # Only on sections where they belong; reference sections such as
    # "Sources and References" or "Troubleshooting" are skipped.
    # Dashes and underscores are normalised to spaces, so only the
    # space-separated spellings are needed here.
    essentials_sections = (
        'getting started', 'installation', 'quickstart',
        'quick start', 'overview', 'introduction',
    )
    section_name_lower = (section_name or "").lower().replace("-", " ").replace("_", " ")

    should_check_essentials = bool(essentials) and any(
        es in section_name_lower for es in essentials_sections
    )

    if should_check_essentials:
        ok, msg = check_essentials_present(text, essentials)
        if not ok:
            # Split only where a new "CRITICAL: "/"WARNING: " item starts.
            # A plain split on "; " would cut a message in two whenever the
            # quoted install command itself contains "; ".
            for part in re.split(r"; (?=(?:CRITICAL|WARNING): )", msg):
                if "CRITICAL:" in part:
                    critical_violations.append(part)
                else:
                    violations.append(part)
        elif msg:
            # Passing result that still carries warnings: log, do not fail
            logger.info("%s essentials check: %s", section_name, msg)

    # PRIORITY 2: Citations and required elements (HIGH - should pass)
    if rules.get("require_citations", True):
        ok, msg = check_citations(
            text,
            rules.get("min_citations", 1),
            rules.get("citation_style", "[CITE:source]"),
        )
        if not ok:
            violations.append(msg)

    if rules.get("required_elements"):
        ok, msg = check_required_elements(text, rules["required_elements"])
        if not ok:
            violations.append(msg)

    # PRIORITY 2.5: Anti-hallucination validation for API docs
    if section_name and "api" in section_name.lower() and structured_evidence:
        usage = structured_evidence.get("usage", {})
        extracted_entities = usage.get("entities", {}) if usage and "entities" in usage else {}

        # Only validate if we have entities to check against
        if extracted_entities:
            _passed, confidence, warnings = validate_api_references(
                text, extracted_entities, section_name
            )

            # Reported as ordinary violations, never as critical ones
            for warning in warnings:
                violations.append(f"API-VALIDATION: {warning}")

            # Log confidence for monitoring
            if confidence < 1.0:
                logger.info(f"{section_name} API validation: {confidence:.0%} confidence, {len(warnings)} warnings")

    # Rationale evidence must be cited when any of its parts is populated
    rationale_data = structured_evidence.get("rationale") if structured_evidence else None
    if rationale_data and any(rationale_data.get(key) for key in ("logic", "decisions", "qa")):
        if "[CITE:rationale" not in (text or ""):
            violations.append("Missing rationale citation: include at least one [CITE:rationale] entry")

    # PRIORITY 3: Word count (MEDIUM - can be flexible)
    ok, msg = check_max_words(text, max_words)
    if not ok:
        # Up to 10% overage is reported as a softer note
        words = len((text or "").strip().split())
        if words <= max_words * 1.1:
            violations.append(f"Note: {words} words (slightly over {max_words} limit, acceptable)")
        else:
            violations.append(msg)

    # Critical violations first so callers can fail fast on them
    return critical_violations + violations
234
+
235
+
236
def feedback_instructions(violations: List[str]) -> str:
    """
    Turn a list of violations into a revision prompt.

    Returns an empty string when there is nothing to fix; otherwise a short
    instruction block with one ``- `` bullet per violation.
    """
    if not violations:
        return ""

    lines = ["Fix the following issues without adding new claims:"]
    lines.extend(f"- {item}" for item in violations)
    lines.append("Only make minimal edits to satisfy the rules.")
    return "\n".join(lines)
245
+
246
+
247
+ # Anti-hallucination validation - prevents docs from referencing non-existent APIs
248
+
249
+ # Whitelist of safe terms that shouldn't be validated as entities
250
SAFE_API_TERMS = {
    # Generic types
    'string', 'str', 'int', 'integer', 'float', 'double', 'bool', 'boolean',
    'list', 'dict', 'dictionary', 'tuple', 'set', 'array', 'vector',
    'object', 'function', 'class', 'method', 'property', 'attribute',
    'type', 'none', 'null', 'undefined', 'any', 'optional',

    # Common patterns
    'api', 'endpoint', 'route', 'handler', 'service', 'client', 'server',
    'request', 'response', 'error', 'exception', 'warning',
    'config', 'configuration', 'settings', 'options', 'parameters', 'params',
    'args', 'kwargs', 'context', 'state', 'data', 'result', 'output',

    # External frameworks - PyTorch
    'tensor', 'model', 'module', 'optimizer', 'scheduler', 'loss',
    'dataset', 'dataloader', 'batch', 'epoch', 'checkpoint',
    'cuda', 'device', 'gpu', 'cpu', 'dtype', 'shape',

    # External frameworks - NumPy/Pandas and the scientific stack
    'numpy', 'ndarray', 'pandas', 'dataframe', 'series', 'index',
    'sklearn', 'scipy', 'matplotlib', 'seaborn',

    # External frameworks - Web and data formats
    'flask', 'django', 'fastapi', 'requests', 'http', 'https',
    'json', 'xml', 'yaml', 'csv', 'html', 'url', 'uri',

    # External frameworks - TensorFlow
    'tensorflow', 'keras', 'layer', 'sequential', 'functional',

    # Documentation/general terms
    'example', 'usage', 'overview', 'architecture', 'installation',
    'quickstart', 'tutorial', 'guide', 'reference', 'documentation',
    'readme', 'license', 'changelog', 'version', 'release',

    # Common verbs/actions, not entities ('set' is already listed under types)
    'create', 'read', 'update', 'delete', 'get', 'add', 'remove',
    'initialize', 'configure', 'start', 'stop', 'run', 'execute',
    'load', 'save', 'export', 'import', 'parse', 'format', 'validate',
}
289
+
290
+
291
def extract_api_references(content: str) -> Set[str]:
    """
    Collect names in documentation text that look like API references.

    Three heuristics are combined:
    - Call sites: any identifier directly followed by ``(``
    - Class-like names: capitalized identifiers, but only inside inline
      backtick spans or fenced code blocks
    - Member accesses: any identifier directly preceded by ``.``

    Returns:
        Set of candidate entity names (possibly empty).
    """
    # Identifier followed by optional whitespace and an opening parenthesis
    call_names = re.findall(r'\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(', content)

    # Identifier immediately after a dot
    member_names = re.findall(r'\.([a-zA-Z_][a-zA-Z0-9_]*)\b', content)

    # Capitalized identifiers are only trusted inside code contexts
    inline_spans = re.findall(r'`([^`]+)`', content)
    fenced_bodies = re.findall(r'```[a-z]*\n(.*?)```', content, re.DOTALL)
    class_names = [
        name
        for snippet in inline_spans + fenced_bodies
        for name in re.findall(r'\b([A-Z][a-zA-Z0-9_]*)\b', snippet)
    ]

    return set(call_names) | set(class_names) | set(member_names)
324
+
325
+
326
def matches_entity(ref: str, extracted_entities: Dict, fuzzy: bool = True) -> bool:
    """
    Check if a reference matches a real extracted entity.

    Args:
        ref: The reference to check (e.g., "SmoothQuant")
        extracted_entities: Mapping whose values are dicts holding an
            ``'entities'`` list; each entity is a dict with a ``'name'``.
        fuzzy: When True, also accept case-insensitive partial matches in
            either direction ("SmoothQuant" matches "SmoothQuantModifier").

    Returns:
        True if the reference matches a named entity, False otherwise.
        An empty reference never matches, and entities without a name are
        ignored.
    """
    if not ref or not extracted_entities:
        return False

    # Gather usable names once. Empty names must be dropped: the empty string
    # is a substring of everything, so a single nameless entity would make
    # every reference "match" in the fuzzy pass below.
    entity_names = []
    for file_data in extracted_entities.values():
        for entity in file_data.get('entities', []) or []:
            entity_name = entity.get('name', '')
            if entity_name:
                entity_names.append(entity_name)

    # Exact match takes precedence and is the only check when fuzzy is off
    if ref in entity_names:
        return True
    if not fuzzy:
        return False

    # Case-insensitive containment in either direction; this also covers
    # case-insensitive equality.
    ref_lower = ref.lower()
    for entity_name in entity_names:
        entity_lower = entity_name.lower()
        if ref_lower in entity_lower or entity_lower in ref_lower:
            return True

    return False
370
+
371
+
372
def validate_api_references(
    content: str,
    extracted_entities: Dict[str, Any],
    doc_type: str
) -> Tuple[bool, float, List[str]]:
    """
    Validate API references in documentation to prevent hallucinations.

    This is a CONSERVATIVE check with:
    - Whitelisted safe terms (generic types, external libraries)
    - Fuzzy matching for partial names
    - Confidence scoring instead of binary pass/fail
    - Warnings instead of hard rejections

    Args:
        content: Documentation content to validate
        extracted_entities: Dict of extracted code entities
        doc_type: Label used in log messages (e.g., "API", "Overview")

    Returns:
        ``(passed, confidence, warnings)``. ``passed`` is always True because
        this check never blocks. ``confidence`` is the share of checked
        references that matched an entity (1.0 when nothing was checked).
        ``warnings`` is empty at confidence >= 0.8 and otherwise holds at
        most five "Unverified API reference" messages in sorted order.
    """
    if not content or not extracted_entities:
        # No entities extracted = can't validate, but that's ok
        return True, 1.0, []

    potential_refs = extract_api_references(content)
    if not potential_refs:
        # No references found = no risk of hallucination
        return True, 1.0, []

    verified_count = 0
    unverified_refs: List[str] = []

    # Sorted iteration keeps the reported (and truncated) warnings stable
    # from run to run; plain set iteration order depends on string hashing.
    for ref in sorted(potential_refs):
        # Skip whitelisted safe terms
        if ref.lower() in SAFE_API_TERMS:
            continue

        # Skip very short refs (likely not entities)
        if len(ref) <= 2:
            continue

        # Skip pure numbers
        if ref.isdigit():
            continue

        if matches_entity(ref, extracted_entities, fuzzy=True):
            verified_count += 1
        else:
            unverified_refs.append(ref)

    total_refs = verified_count + len(unverified_refs)
    # Every ref whitelisted/skipped means nothing to doubt: full confidence
    confidence = verified_count / total_refs if total_refs else 1.0

    if confidence >= 0.8:
        # High confidence - pass with no warnings
        return True, confidence, []

    # Limit the number of warnings handed back to the caller
    warnings = [f"Unverified API reference: '{ref}'" for ref in unverified_refs[:5]]

    if confidence >= 0.6:
        # Medium confidence - pass with warnings
        logger.warning(f"{doc_type} validation: {confidence:.0%} confidence ({len(unverified_refs)} unverified refs)")
    else:
        # Low confidence - still pass: too many false positives to block on
        logger.warning(f"{doc_type} validation: LOW confidence {confidence:.0%} ({len(unverified_refs)}/{total_refs} unverified)")

    return True, confidence, warnings
456
+
457
+
458
+