code-finder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. claude_context/__init__.py +33 -0
  2. claude_context/agentic_integration.py +309 -0
  3. claude_context/ast_chunker.py +646 -0
  4. claude_context/config.py +239 -0
  5. claude_context/context_manager.py +627 -0
  6. claude_context/embeddings.py +307 -0
  7. claude_context/embeddings_interface.py +226 -0
  8. claude_context/enhanced_ast_chunker.py +1129 -0
  9. claude_context/explorer.py +951 -0
  10. claude_context/explorer_with_context.py +1008 -0
  11. claude_context/indexer.py +893 -0
  12. claude_context/markdown_chunker.py +421 -0
  13. claude_context/mode_handler.py +1774 -0
  14. claude_context/query_metrics.py +164 -0
  15. claude_context/question_generator.py +800 -0
  16. claude_context/readme_extractor.py +485 -0
  17. claude_context/repository_adapter.py +399 -0
  18. claude_context/search.py +493 -0
  19. claude_context/skills/__init__.py +11 -0
  20. claude_context/skills/_cli_common.py +74 -0
  21. claude_context/skills/_index_manager.py +98 -0
  22. claude_context/skills/api_surface.py +219 -0
  23. claude_context/skills/evidence_retrieval.py +151 -0
  24. claude_context/skills/grounded_review.py +212 -0
  25. claude_context/synthesis/__init__.py +8 -0
  26. claude_context/synthesis/editor_agent.py +391 -0
  27. claude_context/synthesis/llm_synthesizer.py +153 -0
  28. claude_context/synthesis/logic_explainer.py +235 -0
  29. claude_context/synthesis/multi_review_pipeline.py +717 -0
  30. claude_context/synthesis/prompt_builder.py +439 -0
  31. claude_context/synthesis/providers.py +115 -0
  32. claude_context/synthesis/validators.py +458 -0
  33. code_finder-0.1.0.dist-info/METADATA +823 -0
  34. code_finder-0.1.0.dist-info/RECORD +37 -0
  35. code_finder-0.1.0.dist-info/WHEEL +5 -0
  36. code_finder-0.1.0.dist-info/entry_points.txt +4 -0
  37. code_finder-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,391 @@
1
+ """Editor agent that critiques synthesized documentation using repository evidence.
2
+
3
+ The editor runs after the first-pass synthesis. It reuses the same hybrid search
4
+ infrastructure as the extraction pipeline so that every critique can reference
5
+ real code, README content, or rationale records before suggesting revisions.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import logging
12
+ import re
13
+ from dataclasses import dataclass
14
+ from typing import Any, Dict, Iterable, List, Optional, Sequence
15
+
16
+ from .providers import TextGenerator, create_generator
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ CITATION_PATTERN = re.compile(r"\[CITE:([^\]]+)\]")
21
+
22
+
23
@dataclass
class RetrievedContext:
    """One search hit from the repository, handed to the editor as evidence."""

    # Path reported by the search result for the file the snippet came from.
    file_path: str
    # Line span reported by the search result (0 when the result carries none).
    start_line: int
    end_line: int
    # Text content of the hit.
    snippet: str
31
+
32
+
33
@dataclass
class EditorReview:
    """Structured output from the editor pass for a single section."""

    # Name of the reviewed section.
    section: str
    # Problems reported by the editor (or fallback/diagnostic messages).
    issues: List[str]
    # Text to use for the section after review (the draft when no revision applies).
    revised_text: str
    # Citation entries associated with the text.
    citations: List[str]
    # Search queries that were issued to gather evidence.
    queries: List[str]
    # Snippets returned by those queries.
    retrieved_context: List[RetrievedContext]
    # Unparsed generator output (or the error text when generation failed).
    raw_response: str
    # Short human-readable digest of ``retrieved_context``.
    evidence_summary: str

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation of the review.

        Every field is included; ``retrieved_context`` entries are expanded
        into dicts with ``file_path``, ``start_line``, ``end_line`` and
        ``snippet`` keys.
        """
        context_rows = [
            {
                "file_path": ctx.file_path,
                "start_line": ctx.start_line,
                "end_line": ctx.end_line,
                "snippet": ctx.snippet,
            }
            for ctx in self.retrieved_context
        ]
        return {
            "section": self.section,
            "issues": self.issues,
            # Previously omitted, which dropped the editor's actual output
            # from the serialized review.
            "revised_text": self.revised_text,
            "citations": self.citations,
            "queries": self.queries,
            "retrieved_context": context_rows,
            "raw_response": self.raw_response,
            "evidence_summary": self.evidence_summary,
        }
64
+
65
+
66
class EditorAgent:
    """Critiques synthesized documentation with grounded repository evidence.

    For every section the agent derives search queries from the draft and the
    structured evidence, retrieves snippets through ``searcher``, asks the
    text generator for a JSON critique, and falls back to the original draft
    whenever generation fails, the response is unusable, or the revision is
    rejected by ``_requires_revision``.
    """

    def __init__(
        self,
        searcher: Any,
        generator: Optional[TextGenerator] = None,
        *,
        max_queries: int = 6,
        search_limit: int = 3,
        temperature: float = 0.15,
        max_tokens: int = 900,
    ) -> None:
        """Store collaborators and tuning knobs.

        Args:
            searcher: Object exposing ``search(term, limit=...)``.
            generator: Text generator; ``create_generator()`` is used when
                none is supplied.
            max_queries: Upper bound on queries issued per section.
            search_limit: ``limit`` passed to each ``searcher.search`` call.
            temperature: Sampling temperature forwarded to the generator.
            max_tokens: Token budget forwarded to the generator.
        """
        self.searcher = searcher
        self.generator = generator or create_generator()
        self.max_queries = max_queries
        self.search_limit = search_limit
        self.temperature = temperature
        self.max_tokens = max_tokens

    def review_sections(
        self,
        sections: Dict[str, str],
        *,
        structured_evidence: Dict[str, Any],
        repository_path: str,
    ) -> List[EditorReview]:
        """Review each ``name -> draft`` entry and return one review per section.

        Reviews are returned in the iteration order of ``sections``.
        """
        return [
            self._review_single_section(
                section_name=section_name,
                draft_text=draft_text,
                repository_path=repository_path,
                structured_evidence=structured_evidence,
            )
            for section_name, draft_text in sections.items()
        ]

    # ------------------------------------------------------------------
    # Prompt helpers
    # ------------------------------------------------------------------
    def _system_prompt(self) -> str:
        """Return the fixed system prompt for the editor role."""
        return (
            "You are a technical editor. Review documentation against the provided "
            "evidence and source snippets. Identify factual errors, missing rationale, "
            "and opportunities to clarify the WHY behind decisions. Respond in JSON."
        )

    def _build_user_prompt(
        self,
        *,
        section_name: str,
        draft_text: str,
        repository_path: str,
        rationale: Dict[str, Any],
        retrieved: List[RetrievedContext],
        evidence_summary: str,
    ) -> str:
        """Assemble the user prompt: instructions, draft, and all evidence blocks."""
        rationale_block = json.dumps(self._condense_rationale(rationale), indent=2)

        context_lines = []
        for ctx in retrieved:
            snippet = ctx.snippet.strip()
            # Cap each snippet so a few large hits cannot dominate the prompt.
            if len(snippet) > 600:
                snippet = snippet[:600] + "\n..."
            context_lines.append(
                f"File: {ctx.file_path}:{ctx.start_line}-{ctx.end_line}\n{snippet}"
            )
        context_block = "\n\n".join(context_lines) or "(no extra context retrieved)"

        instructions = (
            "Review the draft against the evidence. Every retrieved snippet must be considered. "
            "If any snippet disagrees with the draft (dataset name, parameter, behavior, rationale), "
            "rewrite the affected text to match the snippet and cite it. If everything matches, "
            "explicitly state that no change was required. Return JSON with keys: issues (list), "
            "revised_text (string), citations (list of cite markers)."
        )

        return "\n\n".join(
            [
                f"Repository: {repository_path}",
                f"Section under review: {section_name}",
                instructions,
                "\nDraft section:\n" + draft_text.strip(),
                "\nRationale evidence:\n" + rationale_block,
                "\nRetrieved source context:\n" + context_block,
                "\nEvidence summary:\n" + evidence_summary,
            ]
        )

    @staticmethod
    def _load_json_object(response: str) -> Optional[Dict[str, Any]]:
        """Parse ``response`` as a JSON object, tolerating a markdown code fence.

        Returns ``None`` when the text is not valid JSON or the decoded value
        is not an object.
        """
        text = response.strip()
        if text.startswith("```"):
            # Drop the opening fence line (``` or ```json) and a closing fence.
            _, _, text = text.partition("\n")
            text = text.rstrip()
            if text.endswith("```"):
                text = text[:-3]
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            return None
        return data if isinstance(data, dict) else None

    @staticmethod
    def _as_string_list(value: Any) -> List[str]:
        """Coerce ``value`` to a list of non-empty, stripped strings.

        A bare string becomes a one-element list; non-iterables become empty.
        """
        if isinstance(value, str):
            value = [value]
        elif not isinstance(value, Iterable):
            value = []
        return [text for text in (str(item).strip() for item in value) if text]

    def _parse_response(self, response: str, draft_text: str) -> Dict[str, Any]:
        """Turn the generator output into ``issues``/``revised_text``/``citations``.

        When the response is not a JSON object the draft is returned unchanged
        and the non-empty response lines are reported as issues. A missing,
        blank, or non-string ``revised_text`` also falls back to the draft so
        downstream string handling cannot fail or blank out a section.
        """
        data = self._load_json_object(response)
        if data is None:
            logger.debug("Editor response not JSON; returning draft unchanged")
            return {
                "issues": [line.strip() for line in response.splitlines() if line.strip()],
                "revised_text": draft_text,
                "citations": self._extract_citations(draft_text),
            }

        revised = data.get("revised_text")
        if not isinstance(revised, str) or not revised.strip():
            revised = draft_text

        citations = self._as_string_list(data.get("citations"))
        if not citations:
            # No explicit list: recover markers from the text itself.
            citations = self._extract_citations(revised) or self._extract_citations(draft_text)

        return {
            "issues": self._as_string_list(data.get("issues", [])),
            "revised_text": revised,
            "citations": citations,
        }

    # ------------------------------------------------------------------
    # Evidence helpers
    # ------------------------------------------------------------------
    def _derive_queries(
        self,
        section_name: str,
        draft_text: str,
        structured_evidence: Dict[str, Any],
    ) -> List[str]:
        """Build the ordered, de-duplicated list of search queries for a section.

        Sources, in priority order: citation markers in the draft (the part
        before the first ``:`` of each entry), ``file_path`` of the first three
        rationale logic records, and the installation/quickstart source from
        the essentials. The section name (without ``.md``) is used only when
        nothing else was found. At most ``max_queries`` entries are returned.
        """
        queries: List[str] = []

        for match in CITATION_PATTERN.findall(draft_text):
            for fragment in match.split(","):
                token = fragment.strip().split(":")[0]
                if token:  # skip empty fragments and entries such as ":12"
                    queries.append(token)

        # ``or {}`` / ``or []`` guard against keys that are present but None.
        rationale = structured_evidence.get("rationale") or {}
        for logic in (rationale.get("logic") or [])[:3]:
            file_path = logic.get("file_path")
            if file_path:
                queries.append(str(file_path))

        essentials = structured_evidence.get("essentials") or {}
        readme_source = (essentials.get("installation") or {}).get("source") or (
            essentials.get("quickstart") or {}
        ).get("source")
        if readme_source:
            queries.append(str(readme_source))

        if not queries:
            queries.append(section_name.replace(".md", ""))

        # dict.fromkeys keeps first-seen order while dropping duplicates.
        return list(dict.fromkeys(queries))[: self.max_queries]

    def _retrieve_context(self, queries: Sequence[str]) -> List[RetrievedContext]:
        """Run every query through the searcher and collect the hits.

        A failing search is logged at debug level and skipped so one bad query
        does not abort the review. A searcher that returns ``None`` is treated
        as returning no hits.
        """
        contexts: List[RetrievedContext] = []
        for term in queries:
            try:
                results = self.searcher.search(term, limit=self.search_limit)
            except Exception as exc:  # pragma: no cover - defensive
                logger.debug("Search failed for %s: %s", term, exc)
                continue

            for result in results or []:
                contexts.append(
                    RetrievedContext(
                        # Fall back to the query when the hit has no usable path.
                        file_path=str(getattr(result, "file_path", None) or term),
                        start_line=int(getattr(result, "start_line", 0) or 0),
                        end_line=int(getattr(result, "end_line", 0) or 0),
                        snippet=(getattr(result, "content", None) or "").strip(),
                    )
                )
        return contexts

    def _summarize_context(self, contexts: List[RetrievedContext]) -> str:
        """Return a one-line-per-hit digest of the first ten contexts."""
        if not contexts:
            return "(no context retrieved)"
        lines: List[str] = []
        for ctx in contexts[:10]:
            snippet = ctx.snippet.strip().replace("\n", " ")
            if len(snippet) > 120:
                snippet = snippet[:120] + "…"
            lines.append(f"- {ctx.file_path}:{ctx.start_line}-{ctx.end_line} → {snippet}")
        return "\n".join(lines)

    def _requires_revision(
        self, draft_text: str, revised_text: str, contexts: List[RetrievedContext]
    ) -> bool:
        """Decide whether the revised text should replace the draft.

        Returns ``False`` when the two texts are equal after stripping, or
        when the revision dropped a retrieved snippet that the draft contained
        verbatim (case-insensitive); otherwise ``True``.
        """
        if draft_text.strip() == revised_text.strip():
            return False

        draft_lower = draft_text.lower()
        revised_lower = revised_text.lower()
        for ctx in contexts:
            snippet = ctx.snippet.lower().strip()
            if snippet and snippet in draft_lower and snippet not in revised_lower:
                return False
        return True

    def _extract_citations(self, text: str) -> List[str]:
        """Return every non-empty, comma-separated entry inside ``[CITE:...]`` markers."""
        return [
            entry
            for match in CITATION_PATTERN.findall(text)
            for entry in (fragment.strip() for fragment in match.split(","))
            if entry
        ]

    def _condense_rationale(self, rationale: Dict[str, Any]) -> Dict[str, Any]:
        """Reduce the rationale record to a small, prompt-friendly summary.

        Keeps at most five ``logic`` items and three each of ``decisions`` and
        ``qa``, copying only selected keys, plus ``errors`` when non-empty.
        Sections with no items are omitted. ``None`` is treated as empty.
        """
        rationale = rationale or {}
        summary: Dict[str, Any] = {}

        logic_items = [
            {
                "file": item.get("file_path"),
                "summary": item.get("summary"),
                "why": item.get("rationale"),
                "trade_offs": item.get("trade_offs"),
                "side_effects": item.get("side_effects"),
            }
            for item in (rationale.get("logic") or [])[:5]
        ]
        if logic_items:
            summary["logic"] = logic_items

        decisions = [
            {
                "source": item.get("source"),
                "summary": item.get("summary"),
                "type": item.get("type"),
            }
            for item in (rationale.get("decisions") or [])[:3]
        ]
        if decisions:
            summary["decisions"] = decisions

        qa_items = [
            {
                "question": qa.get("question"),
                "confidence": qa.get("confidence"),
                "notes": qa.get("rationale_points"),
            }
            for qa in (rationale.get("qa") or [])[:3]
        ]
        if qa_items:
            summary["qa"] = qa_items

        errors = rationale.get("errors")
        if errors:
            summary["errors"] = errors

        return summary

    def _review_single_section(
        self,
        *,
        section_name: str,
        draft_text: str,
        repository_path: str,
        structured_evidence: Dict[str, Any],
    ) -> EditorReview:
        """Gather evidence, query the generator, and build the section's review.

        A generator failure is logged and converted into a review that keeps
        the draft and records the error, so one section cannot abort the run.
        """
        queries = self._derive_queries(section_name, draft_text, structured_evidence)
        retrieved = self._retrieve_context(queries)
        evidence_summary = self._summarize_context(retrieved)
        # _summarize_context never returns an empty string, so no fallback is needed.
        logger.debug("Editor evidence for %s:\n%s", section_name, evidence_summary)

        user_prompt = self._build_user_prompt(
            section_name=section_name,
            draft_text=draft_text,
            repository_path=repository_path,
            rationale=structured_evidence.get("rationale") or {},
            retrieved=retrieved,
            evidence_summary=evidence_summary,
        )

        try:
            response = self.generator.generate(
                system_prompt=self._system_prompt(),
                user_prompt=user_prompt,
                temperature=self.temperature,
                max_tokens=self.max_tokens,
            )
        except Exception as exc:  # pragma: no cover - defensive
            logger.warning("Editor agent failed for %s: %s", section_name, exc)
            return EditorReview(
                section=section_name,
                issues=[f"Editor agent error: {exc}"],
                revised_text=draft_text,
                citations=self._extract_citations(draft_text),
                queries=queries,
                retrieved_context=retrieved,
                raw_response=str(exc),
                evidence_summary=evidence_summary,
            )

        parsed = self._parse_response(response, draft_text)
        revised_text = parsed["revised_text"]
        if not self._requires_revision(draft_text, revised_text, retrieved):
            issues = parsed["issues"] or []
            issues.append("No substantive changes detected; draft retained")
            parsed["issues"] = issues
            revised_text = draft_text

        return EditorReview(
            section=section_name,
            issues=parsed["issues"],
            revised_text=revised_text,
            citations=parsed["citations"],
            queries=queries,
            retrieved_context=retrieved,
            raw_response=response,
            evidence_summary=evidence_summary,
        )
391
+
@@ -0,0 +1,153 @@
1
+ import logging
2
+ from typing import List, Dict, Optional
3
+
4
+ from .providers import create_generator, TextGenerator
5
+ from .prompt_builder import build_section_prompt
6
+ from .validators import validate_section_output, feedback_instructions
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class LLMSynthesizer:
    """
    Minimal synthesis orchestrator:
    - Build per-section prompts with evidence and rules
    - Generate with a provider (fail-fast if not configured)
    - Validate and retry once with targeted feedback

    Token/Word Relationship:
    - Templates specify max_words (e.g., 2000 words for Configuration section)
    - Average: 1 word ≈ 1.3 tokens
    - max_tokens calculated dynamically: max_words * 1.3 * 1.2 (20% buffer)
    - The configured max_tokens (default 3000) acts as the minimum budget
    """

    # System prompt used when ``user_focused`` is set and none is supplied.
    _USER_FOCUSED_SYSTEM_PROMPT = (
        "You are writing user-focused documentation following README best practices. "
        "Your goal: Help first-time users succeed in 10 minutes. "
        "CRITICAL RULES: "
        "1. When evidence includes code blocks (```), copy them EXACTLY - do not paraphrase or modify. "
        "2. When evidence shows installation commands, preserve them verbatim. "
        "3. Prioritize practical information (install, run, configure) over theory. "
        "4. Use imperative language: 'Install X' not 'X can be installed'. "
        "5. Keep paragraphs to 2-3 sentences maximum. "
        "6. Cite evidence using the requested citation style. "
        "7. Only use [INFERENCE] when truly speculating - if README states a fact, cite it."
    )

    # System prompt used otherwise when none is supplied.
    _DEFAULT_SYSTEM_PROMPT = (
        "You are a precise technical writer and developer with lots of practical experience. Generate grounded documentation. "
        "Cite evidence for all factual claims using the requested citation style and mark any speculation with [INFERENCE]. "
        "Do not invent file names or APIs that are not in the evidence."
    )

    def __init__(self,
                 provider: Optional[str] = None,
                 model: Optional[str] = None,
                 temperature: float = 0.2,
                 max_tokens: int = 3000,  # Covers current max_words limits (up to 2000 words)
                 user_focused: bool = False):
        """Create the generator and store generation settings.

        Args:
            provider: Provider name forwarded to ``create_generator``.
            model: Model name forwarded to ``create_generator``.
            temperature: Sampling temperature for every generation call.
            max_tokens: Minimum token budget per section.
            user_focused: Select the user-focused default system prompt.
        """
        self.generator: TextGenerator = create_generator(provider=provider, model=model)
        self.temperature = float(temperature)
        self.max_tokens = int(max_tokens)
        self.user_focused = user_focused

    @staticmethod
    def _resolve_rules(sec: Dict) -> Dict:
        """Return the section's rules with defaults filled in, as a new dict.

        Working on a copy keeps the caller's template spec untouched, so the
        same spec can be reused without defaults leaking into it.
        """
        rules = dict(sec.get("rules") or {})
        rules.setdefault("require_citations", True)
        rules.setdefault("min_citations", 1)  # Reduced from 2 to be more flexible
        rules.setdefault("citation_style", "[CITE:source]")
        rules.setdefault("mark_inference", True)
        rules.setdefault("required_elements", [])
        return rules

    def synthesize(self,
                   template_spec: Dict,
                   code_evidence: List[Dict],
                   context_evidence: List[Dict],
                   system_prompt: Optional[str] = None,
                   structured_evidence: Optional[Dict] = None) -> Dict[str, str]:
        """Generate every section of ``template_spec`` and return ``name -> text``.

        Each section is generated once, validated, and regenerated a single
        time with feedback when validation reports violations.

        Raises:
            ValueError: If ``template_spec`` has no sections or a section has
                no ``name``.
            RuntimeError: If a section still has violations after the retry.
        """
        if not template_spec or "sections" not in template_spec or not template_spec["sections"]:
            raise ValueError("template_spec.sections is required and must be non-empty")

        # An explicit system prompt always wins over the built-in defaults.
        if self.user_focused:
            sys_prompt = system_prompt or self._USER_FOCUSED_SYSTEM_PROMPT
        else:
            sys_prompt = system_prompt or self._DEFAULT_SYSTEM_PROMPT

        # Essentials for validation (Evidence-First approach); same for all sections.
        essentials = structured_evidence.get("essentials", {}) if structured_evidence else None

        outputs: Dict[str, str] = {}

        for sec in template_spec["sections"]:
            name = sec.get("name")
            if not name:
                raise ValueError("Each section requires a 'name'")
            instr = sec.get("instructions", f"Write the {name} section.")
            max_words = int(sec.get("max_words", 400))

            # max_words * 1.3 tokens/word * 1.2 (20% buffer for formatting/citations);
            # never go below the configured budget so output is not truncated.
            calculated_max_tokens = int(max_words * 1.3 * 1.2)
            section_max_tokens = max(calculated_max_tokens, self.max_tokens)
            logger.info(
                "Section '%s': max_words=%s → max_tokens=%s (calculated: %s)",
                name, max_words, section_max_tokens, calculated_max_tokens,
            )

            rules = self._resolve_rules(sec)

            user_prompt = build_section_prompt(
                section_name=name,
                instructions=instr,
                code_evidence=code_evidence,
                context_evidence=context_evidence,
                rules=rules,
                max_words=max_words,
                structured_evidence=structured_evidence,
            )

            def attempt(prompt: str):
                """Generate once with ``prompt`` and validate the result."""
                text = self.generator.generate(
                    system_prompt=sys_prompt,
                    user_prompt=prompt,
                    temperature=self.temperature,
                    max_tokens=section_max_tokens,
                )
                found = validate_section_output(
                    text,
                    rules,
                    max_words,
                    essentials=essentials,
                    section_name=name,
                    structured_evidence=structured_evidence,
                )
                return text, found

            draft, violations = attempt(user_prompt)
            if violations:
                # Single retry with targeted feedback appended to the prompt.
                draft, violations = attempt(user_prompt + "\n\n" + feedback_instructions(violations))
                if violations:
                    logger.error("Synthesis failed for section '%s': %s", name, violations)
                    raise RuntimeError(f"Synthesis failed for section '{name}': {violations}")
            outputs[name] = draft

        return outputs
152
+
153
+