PyPI - code-finder - Versions diffs - 0.1.0__py3-none-any.whl - Mend

code-finder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

claude_context/__init__.py +33 -0
claude_context/agentic_integration.py +309 -0
claude_context/ast_chunker.py +646 -0
claude_context/config.py +239 -0
claude_context/context_manager.py +627 -0
claude_context/embeddings.py +307 -0
claude_context/embeddings_interface.py +226 -0
claude_context/enhanced_ast_chunker.py +1129 -0
claude_context/explorer.py +951 -0
claude_context/explorer_with_context.py +1008 -0
claude_context/indexer.py +893 -0
claude_context/markdown_chunker.py +421 -0
claude_context/mode_handler.py +1774 -0
claude_context/query_metrics.py +164 -0
claude_context/question_generator.py +800 -0
claude_context/readme_extractor.py +485 -0
claude_context/repository_adapter.py +399 -0
claude_context/search.py +493 -0
claude_context/skills/__init__.py +11 -0
claude_context/skills/_cli_common.py +74 -0
claude_context/skills/_index_manager.py +98 -0
claude_context/skills/api_surface.py +219 -0
claude_context/skills/evidence_retrieval.py +151 -0
claude_context/skills/grounded_review.py +212 -0
claude_context/synthesis/__init__.py +8 -0
claude_context/synthesis/editor_agent.py +391 -0
claude_context/synthesis/llm_synthesizer.py +153 -0
claude_context/synthesis/logic_explainer.py +235 -0
claude_context/synthesis/multi_review_pipeline.py +717 -0
claude_context/synthesis/prompt_builder.py +439 -0
claude_context/synthesis/providers.py +115 -0
claude_context/synthesis/validators.py +458 -0
code_finder-0.1.0.dist-info/METADATA +823 -0
code_finder-0.1.0.dist-info/RECORD +37 -0
code_finder-0.1.0.dist-info/WHEEL +5 -0
code_finder-0.1.0.dist-info/entry_points.txt +4 -0
code_finder-0.1.0.dist-info/top_level.txt +1 -0

claude_context/synthesis/editor_agent.py ADDED Viewed

@@ -0,0 +1,391 @@
+"""Editor agent that critiques synthesized documentation using repository evidence.
+The editor runs after the first-pass synthesis. It reuses the same hybrid search
+infrastructure as the extraction pipeline so that every critique can reference
+real code, README content, or rationale records before suggesting revisions.
+"""
+from __future__ import annotations
+import json
+import logging
+import re
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional, Sequence
+from .providers import TextGenerator, create_generator
+logger = logging.getLogger(__name__)
+CITATION_PATTERN = re.compile(r"\[CITE:([^\]]+)\]")
+@dataclass
+class RetrievedContext:
+    """Snippet pulled from the repository for editor review.
+
+    Instances are created by ``EditorAgent._retrieve_context`` from search hits
+    and are rendered into the editor prompt and the evidence summary.
+    """
+    file_path: str  # path reported by the search hit; the query term when the hit has none
+    start_line: int  # first line of the snippet; 0 when the hit carries no line info
+    end_line: int  # last line of the snippet; 0 when the hit carries no line info
+    snippet: str  # whitespace-stripped content of the search hit
+@dataclass
+class EditorReview:
+    """Outcome of one editor pass over a single documentation section."""
+    section: str
+    issues: List[str]
+    revised_text: str
+    citations: List[str]
+    queries: List[str]
+    retrieved_context: List[RetrievedContext]
+    raw_response: str
+    evidence_summary: str
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the review as plain dicts/lists (e.g. for JSON output).
+
+        NOTE(review): ``revised_text`` is not included in the payload; confirm
+        that this omission is intentional.
+        """
+        serialized_context: List[Dict[str, Any]] = []
+        for item in self.retrieved_context:
+            entry: Dict[str, Any] = {}
+            entry["file_path"] = item.file_path
+            entry["start_line"] = item.start_line
+            entry["end_line"] = item.end_line
+            entry["snippet"] = item.snippet
+            serialized_context.append(entry)
+        payload: Dict[str, Any] = {
+            "section": self.section,
+            "issues": self.issues,
+            "citations": self.citations,
+            "queries": self.queries,
+        }
+        payload["retrieved_context"] = serialized_context
+        payload["raw_response"] = self.raw_response
+        payload["evidence_summary"] = self.evidence_summary
+        return payload
+class EditorAgent:
+    """Critiques synthesized documentation with grounded repository evidence."""
+    def __init__(
+        self,
+        searcher: Any,
+        generator: Optional[TextGenerator] = None,
+        *,
+        max_queries: int = 6,
+        search_limit: int = 3,
+        temperature: float = 0.15,
+        max_tokens: int = 900,
+    ) -> None:
+        """Store the search backend, the text generator, and the tuning knobs.
+
+        Args:
+            searcher: Object whose ``search(term, limit=...)`` method supplies
+                repository hits for each query.
+            generator: Text generator used for the critique. When omitted (or
+                falsy) one is obtained from ``create_generator()``.
+            max_queries: Upper bound on the queries derived per section.
+            search_limit: ``limit`` forwarded to every ``searcher.search`` call.
+            temperature: Sampling temperature forwarded to the generator.
+            max_tokens: Token budget forwarded to the generator.
+        """
+        self.searcher = searcher
+        # Only build a generator when the caller did not inject one.
+        self.generator = generator if generator else create_generator()
+        self.max_queries, self.search_limit = max_queries, search_limit
+        self.temperature, self.max_tokens = temperature, max_tokens
+    def review_sections(
+        self,
+        sections: Dict[str, str],
+        *,
+        structured_evidence: Dict[str, Any],
+        repository_path: str,
+    ) -> List[EditorReview]:
+        """Review every drafted section and return one ``EditorReview`` each.
+
+        Args:
+            sections: Mapping of section name to draft text.
+            structured_evidence: Evidence bundle handed unchanged to each
+                per-section review.
+            repository_path: Repository location quoted in the editor prompt.
+
+        Returns:
+            Reviews in the iteration order of ``sections``.
+        """
+        return [
+            self._review_single_section(
+                section_name=name,
+                draft_text=text,
+                repository_path=repository_path,
+                structured_evidence=structured_evidence,
+            )
+            for name, text in sections.items()
+        ]
+    # ------------------------------------------------------------------
+    # Prompt helpers
+    # ------------------------------------------------------------------
+    def _system_prompt(self) -> str:
+        """Return the fixed system prompt that frames the model as a technical editor."""
+        sentences = [
+            "You are a technical editor.",
+            "Review documentation against the provided evidence and source snippets.",
+            "Identify factual errors, missing rationale, and opportunities to clarify the WHY behind decisions.",
+            "Respond in JSON.",
+        ]
+        return " ".join(sentences)
+    def _build_user_prompt(
+        self,
+        *,
+        section_name: str,
+        draft_text: str,
+        repository_path: str,
+        rationale: Dict[str, Any],
+        retrieved: List[RetrievedContext],
+        evidence_summary: str,
+    ) -> str:
+        """Assemble the user prompt for one section review.
+
+        The prompt lists, in order: the repository, the section name, the
+        review instructions, the stripped draft, the condensed rationale as
+        indented JSON, the retrieved snippets (each capped at 600 characters),
+        and the evidence summary. Parts are separated by blank lines.
+        """
+        def _render(ctx: RetrievedContext) -> str:
+            # Cap each snippet so one large hit cannot dominate the prompt.
+            body = ctx.snippet.strip()
+            if len(body) > 600:
+                body = body[:600] + "\n..."
+            return f"File: {ctx.file_path}:{ctx.start_line}-{ctx.end_line}\n{body}"
+        rationale_block = json.dumps(self._condense_rationale(rationale), indent=2)
+        rendered = [_render(ctx) for ctx in retrieved]
+        context_block = "\n\n".join(rendered) or "(no extra context retrieved)"
+        instructions = (
+            "Review the draft against the evidence. Every retrieved snippet must be considered. "
+            "If any snippet disagrees with the draft (dataset name, parameter, behavior, rationale), "
+            "rewrite the affected text to match the snippet and cite it. If everything matches, "
+            "explicitly state that no change was required. Return JSON with keys: issues (list), "
+            "revised_text (string), citations (list of cite markers)."
+        )
+        parts = [
+            f"Repository: {repository_path}",
+            f"Section under review: {section_name}",
+            instructions,
+        ]
+        labelled_blocks = (
+            ("\nDraft section:\n", draft_text.strip()),
+            ("\nRationale evidence:\n", rationale_block),
+            ("\nRetrieved source context:\n", context_block),
+            ("\nEvidence summary:\n", evidence_summary),
+        )
+        parts.extend(heading + body for heading, body in labelled_blocks)
+        return "\n\n".join(parts)
+    def _parse_response(self, response: str, draft_text: str) -> Dict[str, Any]:
+        """Normalize the generator's reply into issues, revised text, and citations.
+
+        The reply should be a JSON object with ``issues``, ``revised_text`` and
+        ``citations`` keys. A Markdown code fence around the JSON is unwrapped
+        first. Anything that still does not decode to a JSON object is treated
+        as free-form feedback: its non-empty lines become the issues and the
+        draft is returned unchanged.
+
+        Args:
+            response: Raw text returned by the generator.
+            draft_text: Draft under review; used whenever the reply carries no
+                usable revision.
+
+        Returns:
+            Dict with ``issues`` (list of str), ``revised_text`` (str) and
+            ``citations`` (list of str).
+        """
+        raw = "" if response is None else str(response)
+        payload = raw.strip()
+        # Chat models often wrap JSON in ```json ... ``` fences; unwrap them so
+        # an otherwise valid reply is not discarded as "not JSON".
+        fenced = re.match(r"^```[A-Za-z0-9_-]*\s*(.*?)\s*```$", payload, re.DOTALL)
+        if fenced:
+            payload = fenced.group(1)
+        try:
+            data = json.loads(payload)
+        except json.JSONDecodeError:
+            data = None
+        # Valid JSON that is not an object (list, string, number, null) has no
+        # keys to read, so it takes the same fallback as unparseable text.
+        if not isinstance(data, dict):
+            logger.debug("Editor response not JSON; returning draft unchanged")
+            return {
+                "issues": [line.strip() for line in raw.splitlines() if line.strip()],
+                "revised_text": draft_text,
+                "citations": self._extract_citations(draft_text),
+            }
+        issues = data.get("issues", [])
+        if isinstance(issues, str):
+            issues = [issues]
+        elif not isinstance(issues, Iterable):
+            issues = []
+        revised = data.get("revised_text")
+        # Only a non-blank string may replace the draft; null, numbers, lists or
+        # whitespace would break later string handling or blank out the section.
+        if not isinstance(revised, str) or not revised.strip():
+            revised = draft_text
+        citations = data.get("citations")
+        if isinstance(citations, str):
+            citations = [citations]
+        elif not isinstance(citations, Iterable):
+            citations = []
+        citations = [str(c).strip() for c in citations if str(c).strip()]
+        if not citations:
+            # Fall back to cite markers embedded in the text itself.
+            citations = self._extract_citations(revised) or self._extract_citations(draft_text)
+        return {
+            "issues": [str(issue).strip() for issue in issues if str(issue).strip()],
+            "revised_text": revised,
+            "citations": citations,
+        }
+    # ------------------------------------------------------------------
+    # Evidence helpers
+    # ------------------------------------------------------------------
+    def _derive_queries(
+        self,
+        section_name: str,
+        draft_text: str,
+        structured_evidence: Dict[str, Any],
+    ) -> List[str]:
+        """Collect the search terms used to ground the review of one section.
+
+        Sources, in priority order:
+          1. targets of ``[CITE:...]`` markers in the draft (text before the
+             first ``:`` of each comma-separated entry);
+          2. ``file_path`` of the first three rationale logic records;
+          3. the README source recorded for installation, else quickstart;
+          4. the section name without ``.md`` when nothing else was found.
+
+        Returns:
+            De-duplicated queries in first-seen order, capped at ``max_queries``.
+        """
+        candidates: List[str] = [
+            piece.strip().split(":")[0]
+            for marker in CITATION_PATTERN.findall(draft_text)
+            for piece in marker.split(",")
+            if piece.strip()
+        ]
+        logic_records = structured_evidence.get("rationale", {}).get("logic", [])[:3]
+        for record in logic_records:
+            path = record.get("file_path")
+            if path:
+                candidates.append(str(path))
+        essentials = structured_evidence.get("essentials", {})
+        readme_source = essentials.get("installation", {}).get("source")
+        if not readme_source:
+            readme_source = essentials.get("quickstart", {}).get("source")
+        if readme_source:
+            candidates.append(str(readme_source))
+        if not candidates:
+            candidates.append(section_name.replace(".md", ""))
+        # dict.fromkeys keeps the first occurrence of each query, in order.
+        unique = list(dict.fromkeys(candidates))
+        return unique[: self.max_queries]
+    def _retrieve_context(self, queries: Sequence[str]) -> List[RetrievedContext]:
+        """Run every query through the searcher and wrap the hits.
+
+        A query whose search raises is logged at debug level and skipped, so a
+        single failing lookup does not abort the review. Hits are read by
+        attribute (``content``, ``file_path``, ``start_line``, ``end_line``)
+        with defaults for anything missing.
+        """
+        collected: List[RetrievedContext] = []
+        for query in queries:
+            try:
+                hits = self.searcher.search(query, limit=self.search_limit)
+            except Exception as exc:  # pragma: no cover - defensive
+                logger.debug("Search failed for %s: %s", query, exc)
+            else:
+                collected.extend(
+                    RetrievedContext(
+                        file_path=getattr(hit, "file_path", query),
+                        start_line=int(getattr(hit, "start_line", 0) or 0),
+                        end_line=int(getattr(hit, "end_line", 0) or 0),
+                        snippet=(getattr(hit, "content", None) or "").strip(),
+                    )
+                    for hit in hits
+                )
+        return collected
+    def _summarize_context(self, contexts: List[RetrievedContext]) -> str:
+        """Render up to ten retrieved snippets as a one-line-each bullet list.
+
+        Each snippet is flattened to a single line and cut to 120 characters
+        (followed by an ellipsis). A placeholder is returned when there is
+        nothing to summarize.
+        """
+        if not contexts:
+            return "(no context retrieved)"
+        def _bullet(ctx: RetrievedContext) -> str:
+            flat = ctx.snippet.strip().replace("\n", " ")
+            preview = flat if len(flat) <= 120 else flat[:120] + "…"
+            return f"- {ctx.file_path}:{ctx.start_line}-{ctx.end_line} → {preview}"
+        return "\n".join(_bullet(ctx) for ctx in contexts[:10])
+    def _requires_revision(
+        self, draft_text: str, revised_text: str, contexts: List[RetrievedContext]
+    ) -> bool:
+        """Decide whether the revised text should replace the draft.
+
+        Returns ``False`` when both texts are equal after stripping, or when
+        any non-empty retrieved snippet appears (case-insensitively) in the
+        draft but no longer in the revision. Returns ``True`` otherwise.
+        """
+        if draft_text.strip() == revised_text.strip():
+            return False
+        old_text, new_text = draft_text.lower(), revised_text.lower()
+        needles = (ctx.snippet.lower().strip() for ctx in contexts)
+        # A revision that drops text matching retrieved evidence is rejected.
+        lost_evidence = any(
+            needle and needle in old_text and needle not in new_text
+            for needle in needles
+        )
+        return not lost_evidence
+    def _extract_citations(self, text: str) -> List[str]:
+        """Return every entry found inside ``[CITE:...]`` markers in ``text``.
+
+        A marker may hold several comma-separated entries; each is stripped,
+        blanks are dropped, and duplicates are kept in document order.
+        """
+        return [
+            piece.strip()
+            for marker in CITATION_PATTERN.findall(text)
+            for piece in marker.split(",")
+            if piece.strip()
+        ]
+    def _condense_rationale(self, rationale: Dict[str, Any]) -> Dict[str, Any]:
+        """Trim the rationale bundle to the fields quoted in the editor prompt.
+
+        Keeps at most five ``logic`` records and three each of ``decisions``
+        and ``qa``, renaming a few fields on the way. ``errors`` is passed
+        through as is. Empty groups are left out of the result.
+        """
+        condensed: Dict[str, Any] = {}
+        logic = [
+            {
+                "file": record.get("file_path"),
+                "summary": record.get("summary"),
+                "why": record.get("rationale"),
+                "trade_offs": record.get("trade_offs"),
+                "side_effects": record.get("side_effects"),
+            }
+            for record in rationale.get("logic", [])[:5]
+        ]
+        if logic:
+            condensed["logic"] = logic
+        decisions = [
+            {
+                "source": record.get("source"),
+                "summary": record.get("summary"),
+                "type": record.get("type"),
+            }
+            for record in rationale.get("decisions", [])[:3]
+        ]
+        if decisions:
+            condensed["decisions"] = decisions
+        qa_items = [
+            {
+                "question": record.get("question"),
+                "confidence": record.get("confidence"),
+                "notes": record.get("rationale_points"),
+            }
+            for record in rationale.get("qa", [])[:3]
+        ]
+        if qa_items:
+            condensed["qa"] = qa_items
+        errors = rationale.get("errors")
+        if errors:
+            condensed["errors"] = errors
+        return condensed
+    def _review_single_section(
+        self,
+        *,
+        section_name: str,
+        draft_text: str,
+        repository_path: str,
+        structured_evidence: Dict[str, Any],
+    ) -> EditorReview:
+        """Run the full editor pass for one section.
+
+        Steps: derive search queries, retrieve and summarize repository
+        context, prompt the generator, parse its reply, and keep the revision
+        only if ``_requires_revision`` accepts it. A generator failure is
+        logged and reported as an issue on a review that retains the draft.
+        """
+        search_terms = self._derive_queries(section_name, draft_text, structured_evidence)
+        contexts = self._retrieve_context(search_terms)
+        summary = self._summarize_context(contexts)
+        logger.debug("Editor evidence for %s:\n%s", section_name, summary or "(none)")
+        prompt = self._build_user_prompt(
+            section_name=section_name,
+            draft_text=draft_text,
+            repository_path=repository_path,
+            rationale=structured_evidence.get("rationale", {}),
+            retrieved=contexts,
+            evidence_summary=summary,
+        )
+        # Fields that are identical on the success and the failure review.
+        shared = {
+            "section": section_name,
+            "queries": search_terms,
+            "retrieved_context": contexts,
+            "evidence_summary": summary,
+        }
+        try:
+            reply = self.generator.generate(
+                system_prompt=self._system_prompt(),
+                user_prompt=prompt,
+                temperature=self.temperature,
+                max_tokens=self.max_tokens,
+            )
+        except Exception as exc:  # pragma: no cover - defensive
+            logger.warning("Editor agent failed for %s: %s", section_name, exc)
+            return EditorReview(
+                issues=[f"Editor agent error: {exc}"],
+                revised_text=draft_text,
+                citations=self._extract_citations(draft_text),
+                raw_response=str(exc),
+                **shared,
+            )
+        outcome = self._parse_response(reply, draft_text)
+        final_text = outcome["revised_text"]
+        if not self._requires_revision(draft_text, final_text, contexts):
+            # Revision rejected or unchanged: record that and keep the draft.
+            outcome["issues"] = outcome["issues"] or []
+            outcome["issues"].append("No substantive changes detected; draft retained")
+            final_text = draft_text
+        return EditorReview(
+            issues=outcome["issues"],
+            revised_text=final_text,
+            citations=outcome["citations"],
+            raw_response=reply,
+            **shared,
+        )

claude_context/synthesis/llm_synthesizer.py ADDED Viewed

@@ -0,0 +1,153 @@
+import logging
+from typing import List, Dict, Optional
+from .providers import create_generator, TextGenerator
+from .prompt_builder import build_section_prompt
+from .validators import validate_section_output, feedback_instructions
+logger = logging.getLogger(__name__)
+class LLMSynthesizer:
+    """
+    Minimal synthesis orchestrator:
+    - Build per-section prompts with evidence and rules
+    - Generate with a provider (fail-fast if not configured)
+    - Validate and retry once with targeted feedback
+    Token/Word Relationship:
+    - Templates specify max_words (e.g., 2000 words for Configuration section)
+    - Average: 1 word ≈ 1.3 tokens
+    - max_tokens calculated dynamically: max_words * 1.3 * 1.2 (20% buffer)
+    - Default max_tokens=3000 used as minimum fallback
+    """
+    def __init__(
+        self,
+        provider: Optional[str] = None,
+        model: Optional[str] = None,
+        temperature: float = 0.2,
+        max_tokens: int = 3000,
+        user_focused: bool = False,
+    ):
+        """Create the generator and record the sampling settings.
+
+        Args:
+            provider: Passed to ``create_generator`` to select the backend.
+            model: Passed to ``create_generator`` to select the model.
+            temperature: Sampling temperature; coerced with ``float``.
+            max_tokens: Floor for the per-section token budget; ``synthesize``
+                uses the larger of this and the value computed from
+                ``max_words``. Coerced with ``int``.
+            user_focused: Selects the user-focused default system prompt in
+                ``synthesize``.
+        """
+        generator: TextGenerator = create_generator(provider=provider, model=model)
+        self.generator = generator
+        self.temperature, self.max_tokens = float(temperature), int(max_tokens)
+        self.user_focused = user_focused
+    def synthesize(
+        self,
+        template_spec: Dict,
+        code_evidence: List[Dict],
+        context_evidence: List[Dict],
+        system_prompt: Optional[str] = None,
+        structured_evidence: Optional[Dict] = None,
+    ) -> Dict[str, str]:
+        """Generate every section of ``template_spec`` and return name -> text.
+
+        Each section is generated once, validated, and, if the validator
+        reports violations, regenerated once with the violations appended to
+        the prompt as feedback.
+
+        Args:
+            template_spec: Dict with a non-empty ``sections`` list. Each section
+                needs a ``name`` and may set ``instructions``, ``max_words``
+                (default 400) and ``rules``. The spec is not modified.
+            code_evidence: Code evidence passed to the prompt builder.
+            context_evidence: Context evidence passed to the prompt builder.
+            system_prompt: Overrides the built-in system prompt when truthy.
+            structured_evidence: Optional evidence bundle passed to the prompt
+                builder and validator; its ``essentials`` entry is also given
+                to the validator.
+
+        Returns:
+            Mapping of section name to the accepted text.
+
+        Raises:
+            ValueError: ``template_spec`` has no sections, or a section has no
+                name.
+            RuntimeError: A section still has violations after the retry.
+        """
+        if not template_spec or not template_spec.get("sections"):
+            raise ValueError("template_spec.sections is required and must be non-empty")
+        if system_prompt:
+            sys_prompt = system_prompt
+        elif self.user_focused:
+            sys_prompt = (
+                "You are writing user-focused documentation following README best practices. "
+                "Your goal: Help first-time users succeed in 10 minutes. "
+                "CRITICAL RULES: "
+                "1. When evidence includes code blocks (```), copy them EXACTLY - do not paraphrase or modify. "
+                "2. When evidence shows installation commands, preserve them verbatim. "
+                "3. Prioritize practical information (install, run, configure) over theory. "
+                "4. Use imperative language: 'Install X' not 'X can be installed'. "
+                "5. Keep paragraphs to 2-3 sentences maximum. "
+                "6. Cite evidence using the requested citation style. "
+                "7. Only use [INFERENCE] when truly speculating - if README states a fact, cite it."
+            )
+        else:
+            sys_prompt = (
+                "You are a precise technical writer and developer with lots of practical experience. Generate grounded documentation. "
+                "Cite evidence for all factual claims using the requested citation style and mark any speculation with [INFERENCE]. "
+                "Do not invent file names or APIs that are not in the evidence."
+            )
+        # Essentials do not vary per section, so look them up once.
+        essentials = structured_evidence.get("essentials", {}) if structured_evidence else None
+        outputs: Dict[str, str] = {}
+        for sec in template_spec["sections"]:
+            name = sec.get("name")
+            if not name:
+                raise ValueError("Each section requires a 'name'")
+            instr = sec.get("instructions", f"Write the {name} section.")
+            max_words = int(sec.get("max_words", 400))
+            # Budget: max_words * 1.3 tokens/word * 1.2 (20% headroom for
+            # formatting and citations), never below the configured floor.
+            calculated_max_tokens = int(max_words * 1.3 * 1.2)
+            section_max_tokens = max(calculated_max_tokens, self.max_tokens)
+            logger.info(
+                "Section '%s': max_words=%s → max_tokens=%s (calculated: %s)",
+                name,
+                max_words,
+                section_max_tokens,
+                calculated_max_tokens,
+            )
+            # Work on a copy so the defaults never leak into the caller's
+            # template; a missing or null ``rules`` entry means "all defaults".
+            rules = dict(sec.get("rules") or {})
+            rules.setdefault("require_citations", True)
+            rules.setdefault("min_citations", 1)
+            rules.setdefault("citation_style", "[CITE:source]")
+            rules.setdefault("mark_inference", True)
+            rules.setdefault("required_elements", [])
+            user_prompt = build_section_prompt(
+                section_name=name,
+                instructions=instr,
+                code_evidence=code_evidence,
+                context_evidence=context_evidence,
+                rules=rules,
+                max_words=max_words,
+                structured_evidence=structured_evidence,
+            )
+            # Arguments shared by the first attempt and the retry.
+            generate_kwargs = {
+                "system_prompt": sys_prompt,
+                "temperature": self.temperature,
+                "max_tokens": section_max_tokens,
+            }
+            validate_kwargs = {
+                "essentials": essentials,
+                "section_name": name,
+                "structured_evidence": structured_evidence,
+            }
+            draft = self.generator.generate(user_prompt=user_prompt, **generate_kwargs)
+            violations = validate_section_output(draft, rules, max_words, **validate_kwargs)
+            if violations:
+                # One retry, with the violations appended as feedback.
+                retry_prompt = user_prompt + "\n\n" + feedback_instructions(violations)
+                draft = self.generator.generate(user_prompt=retry_prompt, **generate_kwargs)
+                violations = validate_section_output(draft, rules, max_words, **validate_kwargs)
+                if violations:
+                    logger.error("Synthesis failed for section '%s': %s", name, violations)
+                    raise RuntimeError(f"Synthesis failed for section '{name}': {violations}")
+            outputs[name] = draft
+        return outputs