PyPI - deepdoc - Versions diffs - 2.2.0__tar.gz → 2.2.1__tar.gz - Mend

deepdoc 2.2.0__tar.gz → 2.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (155) hide show

{deepdoc-2.2.0 → deepdoc-2.2.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deepdoc
-Version: 2.2.0
+Version: 2.2.1
 Summary: Auto-generate beautiful docs from any codebase
 Author: Pranav Kumar
 License: MIT
@@ -43,6 +43,18 @@ Dynamic: license-file
 [![Python versions](https://img.shields.io/pypi/pyversions/deepdoc)](https://pypi.org/project/deepdoc/)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](./LICENSE)
+## Repository Layout
+| Directory | What it is | Where to start |
+|---|---|---|
+| [`deepdoc/`](./deepdoc/) | The Python package — CLI, pipeline, planner, generator, chatbot, and site builder. This is the core product. | [`deepdoc/README.md`](./deepdoc/README.md) |
+| [`web/`](./web/) | Marketing and changelog site built with Astro 5 + Tailwind. Deployed to the public DeepDoc website. | [`web/README.md`](./web/README.md) |
+| [`vscode-extension/`](./vscode-extension/) | VS Code extension — explains selected code snippets in Fast or Deep mode and inserts AI-generated comments inline. | [`vscode-extension/README.md`](./vscode-extension/README.md) |
+| [`tests/`](./tests/) | pytest test suite for the Python package. | Run `python3 -m pytest -q` from repo root. |
+| [`scripts/`](./scripts/) | One-off release and maintenance scripts. | — |
+---
 Auto-generate deep engineering documentation from real codebases using AI.
 DeepDoc scans your repo, builds a bucket-based documentation plan, generates rich MDX pages with Mermaid diagrams, and builds a local-first Fumadocs site with Orama search.

{deepdoc-2.2.0 → deepdoc-2.2.1}/README.md RENAMED Viewed

@@ -4,6 +4,18 @@
 [![Python versions](https://img.shields.io/pypi/pyversions/deepdoc)](https://pypi.org/project/deepdoc/)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](./LICENSE)
+## Repository Layout
+| Directory | What it is | Where to start |
+|---|---|---|
+| [`deepdoc/`](./deepdoc/) | The Python package — CLI, pipeline, planner, generator, chatbot, and site builder. This is the core product. | [`deepdoc/README.md`](./deepdoc/README.md) |
+| [`web/`](./web/) | Marketing and changelog site built with Astro 5 + Tailwind. Deployed to the public DeepDoc website. | [`web/README.md`](./web/README.md) |
+| [`vscode-extension/`](./vscode-extension/) | VS Code extension — explains selected code snippets in Fast or Deep mode and inserts AI-generated comments inline. | [`vscode-extension/README.md`](./vscode-extension/README.md) |
+| [`tests/`](./tests/) | pytest test suite for the Python package. | Run `python3 -m pytest -q` from repo root. |
+| [`scripts/`](./scripts/) | One-off release and maintenance scripts. | — |
+---
 Auto-generate deep engineering documentation from real codebases using AI.
 DeepDoc scans your repo, builds a bucket-based documentation plan, generates rich MDX pages with Mermaid diagrams, and builds a local-first Fumadocs site with Orama search.

{deepdoc-2.2.0 → deepdoc-2.2.1}/deepdoc/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """DeepDoc — Auto-generate beautiful docs from any codebase."""
-__version__ = "2.1.0"
+__version__ = "2.2.1"

{deepdoc-2.2.0 → deepdoc-2.2.1}/deepdoc/cli.py RENAMED Viewed

@@ -200,8 +200,10 @@ def init(name, description, provider, model, output_dir, with_chatbot):
         "openai": ("gpt-4o", "OPENAI_API_KEY"),
         "ollama": ("ollama/llama3.2", None),
         "azure": ("azure/gpt-4o", "AZURE_API_KEY"),
+        "google": ("gemini/gemini-1.5-pro", "GEMINI_API_KEY"),
+        "gemini": ("gemini/gemini-1.5-pro", "GEMINI_API_KEY"),
     }
-    default_model, default_key_env = provider_defaults.get(provider, ("", ""))
+    default_model, default_key_env = provider_defaults.get(provider, ("", "DEEPDOC_LLM_API_KEY"))
     resolved_model = model or default_model
     cfg = dict(DEFAULT_CONFIG)
@@ -261,14 +263,15 @@ def init(name, description, provider, model, output_dir, with_chatbot):
         )
         next_steps.append("  4. Generate docs:     [bold]deepdoc generate[/bold]")
         next_steps.append("  5. Preview locally:   [bold]deepdoc serve[/bold]")
-    elif cfg["llm"]["api_key_env"]:
-        next_steps.append(
-            f"  2. Set your API key:  [bold]export {cfg['llm']['api_key_env']}=...[/bold]"
-        )
+    elif provider == "ollama":
+        next_steps.append("  2. Make sure Ollama is running locally")
         next_steps.append("  3. Generate docs:     [bold]deepdoc generate[/bold]")
         next_steps.append("  4. Preview locally:   [bold]deepdoc serve[/bold]")
     else:
-        next_steps.append("  2. Make sure Ollama is running locally")
+        key_env = cfg["llm"].get("api_key_env") or "DEEPDOC_LLM_API_KEY"
+        next_steps.append(
+            f"  2. Set your API key:  [bold]export {key_env}=...[/bold]"
+        )
         next_steps.append("  3. Generate docs:     [bold]deepdoc generate[/bold]")
         next_steps.append("  4. Preview locally:   [bold]deepdoc serve[/bold]")
     if with_chatbot:

{deepdoc-2.2.0 → deepdoc-2.2.1}/deepdoc/generator/evidence.py RENAMED Viewed

@@ -40,6 +40,11 @@ from ..openapi import parse_openapi_spec, spec_to_context_string
 console = Console()
+_EP_TITLE_RE = re.compile(
+    r"^(GET|POST|PUT|PATCH|DELETE|HEAD|OPTIONS|CONNECT|TRACE)\s+(/\S*)",
+    re.IGNORECASE,
+)
 # ═════════════════════════════════════════════════════════════════════════════
 # 3.1  Evidence Assembly
 # ═════════════════════════════════════════════════════════════════════════════
@@ -67,6 +72,7 @@ class AssembledEvidence:
     compressed_cards_context: str = ""
     files_included_raw: int = 0
     files_compressed: int = 0
+    compressed_file_paths: set[str] = field(default_factory=set)
     coverage_files_total: int = 0
     helper_context: str = ""  # resolved helper/utility function bodies
     flow_context: str = ""  # call graph + flow evidence (entrypoints, chains, side effects)
@@ -141,7 +147,7 @@ class EvidenceAssembler:
             source_ctx,
             compressed_cards_ctx,
             files_included_raw,
-            files_compressed,
+            compressed_file_paths,
             coverage_total,
         ) = self._build_source_context(bucket)
         endpoints_detail = self._build_endpoints_detail(bucket)
@@ -204,7 +210,8 @@ class EvidenceAssembler:
             flow_context=flow_ctx,
             total_evidence_chars=total,
             files_included_raw=files_included_raw,
-            files_compressed=files_compressed,
+            files_compressed=len(compressed_file_paths),
+            compressed_file_paths=compressed_file_paths,
             coverage_files_total=coverage_total,
             evidence_file_paths=evidence_files,
             config_env_context=config_env_ctx,
@@ -316,11 +323,12 @@ class EvidenceAssembler:
             included += 1
         cards_context = self._format_compressed_cards(compressed_cards)
+        compressed_paths = {card.file_path for card in compressed_cards}
         return (
             "\n".join(parts),
             cards_context,
             included,
-            len(compressed_cards),
+            compressed_paths,
             len(ranked_files),
         )
@@ -780,7 +788,7 @@ class EvidenceAssembler:
         """Extract actual env var names from source files for grounded config docs."""
         env_vars: dict[str, list[str]] = {}  # var_name -> [file_paths]
-        for src_file in bucket.owned_files:
+        for src_file in list(bucket.owned_files) + list(bucket.artifact_refs or []):
             src_path = self.repo_root / src_file
             if not src_path.exists():
                 continue
@@ -1052,10 +1060,11 @@ class EvidenceAssembler:
         # ── endpoint_ref: match specific endpoint, pull deep evidence ─────
         if hints.get("is_endpoint_ref"):
-            # The title is e.g. "GET /api/v1/orders" — extract method+path
-            title_parts = bucket.title.split(" ", 1)
-            ref_method = title_parts[0].upper() if len(title_parts) >= 1 else ""
-            ref_path = title_parts[1] if len(title_parts) >= 2 else ""
+            # Extract METHOD /path from title via regex — more robust than split(" ", 1)
+            # since titles may not always follow the "GET /path" convention.
+            _m = _EP_TITLE_RE.match(bucket.title)
+            ref_method = _m.group(1).upper() if _m else ""
+            ref_path = _m.group(2) if _m else ""
             # Find matching bundle via handler symbol or method+path
             matched_bundle = None

{deepdoc-2.2.0 → deepdoc-2.2.1}/deepdoc/generator/generation.py RENAMED Viewed

@@ -40,7 +40,7 @@ from ..parser import parse_file, supported_extensions
 from ..parser.base import ParsedFile, Symbol
 from ..planner import DocBucket, DocPlan, RepoScan, tracked_bucket_files
 from ..prompts_v2 import SYSTEM_V2, get_prompt_for_bucket
-from ..scanner import _classify_file_role
+from ..scanner import _build_import_lookup, _classify_file_role, _normalize_import
 from ..openapi import parse_openapi_spec, spec_to_context_string
 console = Console()
@@ -354,6 +354,7 @@ class BucketGenerationEngine:
         self._repo_file_paths = set(self.scan.file_summaries.keys())
         self.coverage_report: dict[str, Any] = {}
         self.local_dev_warnings: list[str] = []
+        self._import_lookup = _build_import_lookup(set(self.scan.file_summaries.keys()))
         self._openapi_context = self._precompute_openapi_context()
         self._doc_pages = self._planned_doc_pages()
         (
@@ -363,17 +364,25 @@ class BucketGenerationEngine:
         ) = build_internal_doc_link_maps(self._doc_pages)
     def _precompute_openapi_context(self) -> str:
-        """Parse the first available OpenAPI spec once per run."""
+        """Parse all available OpenAPI specs, accumulating up to 6 000 chars."""
         if not self.scan.has_openapi:
             return ""
+        spec_count = len(self.scan.openapi_paths)
+        per_spec_limit = max(2000, min(4000, 6000 // max(1, spec_count)))
+        parts: list[str] = []
+        total = 0
         for spec_path in self.scan.openapi_paths:
             spec = parse_openapi_spec(self.repo_root / spec_path)
             if spec:
-                return (
+                chunk = (
                     f"\n## OpenAPI Spec ({spec_path}):\n"
-                    f"{spec_to_context_string(spec)[:4000]}"
+                    f"{spec_to_context_string(spec)[:per_spec_limit]}"
                 )
-        return ""
+                parts.append(chunk)
+                total += len(chunk)
+                if total >= 6000:
+                    break
+        return "\n".join(parts)
     def generate_all(self, force: bool = False) -> list[GenerationResult]:
         """Generate all pages. Returns results for each bucket.
@@ -469,6 +478,9 @@ class BucketGenerationEngine:
                                     f"~{word_count} words{diagrams} · "
                                     f"{result.elapsed_seconds:.1f}s)[/dim]{warnings}"
                                 )
+                                # Save manifest incrementally so a cancelled run
+                                # can resume from completed pages on next generate.
+                                self._checkpoint_manifest(_manifest, result)
                         except Exception as e:
                             failed_count += 1
                             results.append(
@@ -1364,24 +1376,52 @@ Re-run `deepdoc generate` to retry.
             pages.append((bucket.title, url))
         return pages
-    def _build_sitemap_for(self, current_slug: str) -> str:
-        """Build formatted sitemap excluding current page."""
-        by_section: dict[str, list[DocBucket]] = defaultdict(list)
-        for b in self.plan.buckets:
-            if b.slug != current_slug:
-                by_section[b.section or "Other"].append(b)
+    def _bucket_url(self, b: DocBucket) -> str:
+        """Return the site URL for a bucket, respecting endpoint_ref /api/* routing."""
+        hints = b.generation_hints or {}
+        if hints.get("is_introduction_page"):
+            return "/"
+        if self.scan.has_openapi and (
+            hints.get("is_endpoint_ref")
+            or hints.get("prompt_style") == "endpoint_ref"
+            or b.bucket_type == "endpoint_ref"
+        ):
+            return f"/api/{b.slug}"
+        return f"/{b.slug}"
+    def _build_sitemap_for(self, current_slug: str) -> str:
+        """Build formatted sitemap ordered by nav_structure, excluding current page."""
+        slug_to_bucket = {b.slug: b for b in self.plan.buckets if b.slug != current_slug}
         lines: list[str] = []
-        for section, buckets in by_section.items():
-            lines.append(f"**{section}**")
-            for b in buckets:
-                page_path = f"/{b.slug}"
+        seen: set[str] = set()
+        for section, slugs in self.plan.nav_structure.items():
+            section_lines: list[str] = []
+            for slug in slugs:
+                b = slug_to_bucket.get(slug)
+                if not b:
+                    continue
+                seen.add(slug)
+                page_path = self._bucket_url(b)
                 key_files = ", ".join(f"`{f}`" for f in b.owned_files[:4])
                 if len(b.owned_files) > 4:
                     key_files += f" +{len(b.owned_files) - 4} more"
-                lines.append(f"- [{b.title}]({page_path}) — {b.description}")
+                section_lines.append(f"- [{b.title}]({page_path}) — {b.description}")
                 if key_files:
-                    lines.append(f"  *Covers: {key_files}*")
+                    section_lines.append(f"  *Covers: {key_files}*")
+            if section_lines:
+                lines.append(f"**{section}**")
+                lines.extend(section_lines)
+        # Buckets not referenced by nav_structure — group by section
+        orphans_by_section: dict[str, list] = defaultdict(list)
+        for slug, b in slug_to_bucket.items():
+            if slug not in seen:
+                orphans_by_section[b.section or "Other"].append(b)
+        for section, orphan_buckets in orphans_by_section.items():
+            lines.append(f"**{section}**")
+            for b in orphan_buckets:
+                lines.append(f"- [{b.title}]({self._bucket_url(b)}) — {b.description}")
         return "\n".join(lines) if lines else "(no other pages)"
@@ -1396,7 +1436,9 @@ Re-run `deepdoc generate` to retry.
             if dep_slug in slug_to_bucket and dep_slug != bucket.slug:
                 related[dep_slug] = slug_to_bucket[dep_slug]
-        # Import-based: find buckets whose files are imported by this bucket's files
+        # Import-based: find buckets whose files are imported by this bucket's files.
+        # Uses the pre-built import lookup (O(imports) per file) instead of scanning
+        # all repo files for each import string.
         file_to_buckets: dict[str, list[DocBucket]] = defaultdict(list)
         for b in self.plan.buckets:
             for f in b.owned_files:
@@ -1407,18 +1449,17 @@ Re-run `deepdoc generate` to retry.
             if not parsed or not parsed.imports:
                 continue
             for imp in parsed.imports:
-                # Simple suffix match against known files
-                for known_file in self.scan.file_summaries:
-                    stem = (
-                        known_file.rsplit(".", 1)[0]
-                        .replace("/", ".")
-                        .replace("\\", ".")
-                    )
-                    if stem and stem in imp.replace("/", "."):
-                        for linked_bucket in file_to_buckets.get(known_file, []):
+                for hint in _normalize_import(imp):
+                    key = hint.replace(".", "/").lower().strip("/")
+                    if not key:
+                        continue
+                    matched_files = self._import_lookup.get(key, set())
+                    if len(matched_files) > 5:
+                        continue  # ambiguous — too many matches to be useful
+                    for matched_file in matched_files:
+                        for linked_bucket in file_to_buckets.get(matched_file, []):
                             if linked_bucket.slug != bucket.slug:
                                 related[linked_bucket.slug] = linked_bucket
-                        break
         # Strong overlap-based links for database/runtime/interface pages
         for candidate in self.plan.buckets:
@@ -1443,7 +1484,7 @@ Re-run `deepdoc generate` to retry.
             "**Dependency Links** (pages this module imports from — MUST link to these):"
         ]
         for b in related.values():
-            lines.append(f"- [{b.title}](/{b.slug}) — {b.description}")
+            lines.append(f"- [{b.title}]({self._bucket_url(b)}) — {b.description}")
         return "\n".join(lines)
@@ -1466,6 +1507,19 @@ Re-run `deepdoc generate` to retry.
                 return True
         return False
+    def _checkpoint_manifest(self, manifest: Any, result: "GenerationResult") -> None:
+        """Write the manifest for one completed page so a cancelled run can resume."""
+        from ..manifest import file_hash as compute_hash
+        try:
+            for src_file in tracked_bucket_files(result.bucket):
+                src_path = self.repo_root / src_file
+                if src_path.exists():
+                    content = src_path.read_text(encoding="utf-8", errors="replace")
+                    manifest.update(src_file, compute_hash(content), result.bucket.slug)
+            manifest.save()
+        except Exception:
+            pass
     def update_manifest(self, results: list[GenerationResult]):
         """Update the manifest with new file hashes for all successfully generated pages."""
         from ..manifest import Manifest, file_hash as compute_hash

{deepdoc-2.2.0 → deepdoc-2.2.1}/deepdoc/generator/mdx_compile_gate.py RENAMED Viewed

@@ -24,6 +24,7 @@ from typing import Callable
 from ..llm import LLMClient
 from ..planner import DocBucket
 from ..prompts_v2 import SYSTEM_V2
+from .post_processors import escape_mdx_route_params, escape_mdx_text_hazards
 from .mdx_validator import (
     MdxCompileError,
     ValidationOutcome,
@@ -115,13 +116,19 @@ def apply_mdx_compile_gate(
             )
             continue
+        # Re-run hazard escaping so LLM fix attempts cannot reintroduce bare
+        # {expr} or route params that weren't present before the fix call.
+        fixed = escape_mdx_text_hazards(fixed)
+        fixed = escape_mdx_route_params(fixed)
         current = fixed
         next_outcome = validate(current)
         if next_outcome.ok:
             return GateOutcome(content=current, retries=retries)
         last_error = next_outcome.error
-    fallback = _strip_jsx_to_markdown(current)
+    # Escape hazards one more time before JSX stripping — the retry loop may
+    # have left bare {expr} in content that the strip pass won't handle.
+    fallback = _strip_jsx_to_markdown(escape_mdx_text_hazards(escape_mdx_route_params(current)))
     fallback_outcome = validate(fallback)
     return GateOutcome(
         content=fallback,

{deepdoc-2.2.0 → deepdoc-2.2.1}/deepdoc/generator/validation.py RENAMED Viewed

@@ -102,7 +102,7 @@ class PageValidator:
         self._check_sections(content, bucket, result)
         # 2. Check that owned files are referenced
-        self._check_file_refs(content, bucket, result)
+        self._check_file_refs(content, bucket, result, evidence)
         # 3. Check for hallucinated file paths
         self._check_hallucinated_paths(content, bucket, result)
@@ -114,7 +114,7 @@ class PageValidator:
         self._check_hallucinated_symbols(content, bucket, evidence, result)
         # 6. Check route/path claims for API and operations-heavy pages
-        self._check_route_claims(content, bucket, result)
+        self._check_route_claims(content, bucket, result, evidence)
         # 7. Count mermaid diagrams
         result.mermaid_block_count = len(re.findall(r"```mermaid", content))
@@ -259,21 +259,41 @@ class PageValidator:
         ]
     def _check_file_refs(
-        self, content: str, bucket: DocBucket, result: ValidationResult
+        self,
+        content: str,
+        bucket: DocBucket,
+        result: ValidationResult,
+        evidence: AssembledEvidence | None = None,
     ):
-        """Check that at least some of the bucket's owned files are referenced."""
+        """Check that at least some of the bucket's owned files are referenced.
+        Only files the LLM actually received full source for are checked — files
+        that were compressed to evidence cards are excluded from the coverage
+        threshold because the LLM cannot be expected to cite paths it never saw
+        in full.
+        """
         if not bucket.owned_files:
             return
+        # Scope the check to files the LLM actually received full source for.
+        compressed_paths: set[str] = (
+            evidence.compressed_file_paths
+            if evidence is not None and evidence.compressed_file_paths
+            else set()
+        )
+        checkable_files = [f for f in bucket.owned_files if f not in compressed_paths]
+        if not checkable_files:
+            # All files were compressed — the LLM had no full source to cite from.
+            return
         content_lower = content.lower()
         referenced = 0
-        for f in bucket.owned_files:
-            # Check if file path appears in the content (case-insensitive)
+        for f in checkable_files:
             if f.lower() in content_lower:
                 referenced += 1
-        coverage = referenced / len(bucket.owned_files) if bucket.owned_files else 1.0
-        unreferenced = [f for f in bucket.owned_files if f.lower() not in content_lower]
+        coverage = referenced / len(checkable_files)
+        unreferenced = [f for f in checkable_files if f.lower() not in content_lower]
         hints = bucket.generation_hints or {}
         is_intro = hints.get("is_introduction_page") or bucket.section == "Start Here"
@@ -289,12 +309,17 @@ class PageValidator:
         if coverage < threshold and len(unreferenced) > 2:
             result.missing_file_refs = unreferenced[:5]
+            compressed_note = (
+                f"; {len(compressed_paths)} compressed files excluded"
+                if compressed_paths
+                else ""
+            )
             result.warnings.append(
-                f"Low file coverage: {referenced}/{len(bucket.owned_files)} files referenced "
-                f"({coverage:.0%}; expected at least {threshold:.0%})"
+                f"Low file coverage: {referenced}/{len(checkable_files)} full-source files referenced "
+                f"({coverage:.0%}; expected at least {threshold:.0%}{compressed_note})"
             )
             if is_intro:
-                if coverage < 0.15 and len(bucket.owned_files) >= 10:
+                if coverage < 0.15 and len(checkable_files) >= 10:
                     result.is_valid = False
             else:
                 result.is_valid = False
@@ -438,6 +463,14 @@ class PageValidator:
             key = impact.get("key", "") if isinstance(impact, dict) else getattr(impact, "key", "")
             if key:
                 symbols.add(str(key))
+        # For integration pages, also treat any symbol-like token that appears in
+        # the integration context evidence as known-good. These are external SDK
+        # symbols (e.g. S3Client, GupshupMessage) that are real but not in the
+        # repo's parsed files — they should not be flagged as hallucinations.
+        if evidence is not None and evidence.integration_context:
+            for token in re.findall(r"\b([A-Za-z_][A-Za-z0-9_]{2,})\b", evidence.integration_context):
+                if self._looks_like_symbol_reference(token):
+                    symbols.add(token)
         return symbols
     @staticmethod
@@ -494,7 +527,11 @@ class PageValidator:
         return lower in bucket_text
     def _check_route_claims(
-        self, content: str, bucket: DocBucket, result: ValidationResult
+        self,
+        content: str,
+        bucket: DocBucket,
+        result: ValidationResult,
+        evidence: "AssembledEvidence | None" = None,
     ) -> None:
         if not self.known_route_paths:
             return
@@ -512,6 +549,19 @@ class PageValidator:
         ):
             return
+        # Build the set of valid routes for this page: internal routes + any routes
+        # that appear verbatim in the integration context evidence. The second set
+        # covers external service API paths (e.g. WhatsApp /messages, AWS /putObject)
+        # that the LLM received in evidence and is correct to reference.
+        valid_routes = set(self.known_route_paths)
+        if evidence is not None and evidence.integration_context:
+            for token in re.findall(
+                r"(\/[A-Za-z0-9{}_<>\-.:/~]+)", evidence.integration_context
+            ):
+                normalized = self._normalize_route_path(token)
+                if normalized and not self._is_markup_path_noise(normalized):
+                    valid_routes.add(normalized)
         candidate_tokens: list[str] = []
         for inline in re.findall(r"`([^`]+)`", content):
             candidate_tokens.extend(re.findall(r"(\/[A-Za-z0-9{}_<>\-.:/~]+)", inline))
@@ -534,7 +584,7 @@ class PageValidator:
             candidates.add(route)
         unmatched = sorted(
-            route for route in candidates if route not in self.known_route_paths
+            route for route in candidates if route not in valid_routes
         )
         if unmatched:
             result.unmatched_routes = unmatched[:10]
@@ -557,9 +607,17 @@ class PageValidator:
             return
         content_lower = content.lower()
-        if "call flow" not in content_lower:
+        _flow_terms = (
+            "call flow", "execution flow", "request flow",
+            "flow diagram", "sequence diagram",
+        )
+        _effect_terms = (
+            "side effect", "downstream effect",
+            "triggers", "emits", "dispatches", "publishes",
+        )
+        if not any(t in content_lower for t in _flow_terms):
             result.missing_flow_edges.append("call_flow")
-        if "side effects" not in content_lower:
+        if not any(t in content_lower for t in _effect_terms):
             result.missing_flow_entrypoints.append("side_effects")
         if result.missing_flow_edges or result.missing_flow_entrypoints:
@@ -903,7 +961,16 @@ class PageValidator:
             return
         content_lower = content.lower()
-        missing = [name for name in expected if name.lower() not in content_lower]
+        # Use token-based partial matching: an integration name is "covered" if any
+        # of its meaningful tokens appear in the content. This handles paraphrasing
+        # like "Amazon Web Services" vs "AWS" or "Gupshup WhatsApp" vs "WhatsApp".
+        def _is_covered(name: str) -> bool:
+            if name.lower() in content_lower:
+                return True
+            tokens = self._integration_name_tokens(name)
+            return bool(tokens) and any(token in content_lower for token in tokens)
+        missing = [name for name in expected if not _is_covered(name)]
         if not missing:
             return
@@ -911,6 +978,8 @@ class PageValidator:
         result.warnings.append(
             f"Integration context missing named references: {', '.join(result.missing_integrations[:4])}"
         )
+        # Only mark invalid when ALL expected integrations are fully absent and we
+        # have concrete evidence the LLM had them in context.
         if bucket.bucket_type == "integration" and len(missing) == len(expected):
             result.is_valid = False
         elif (

{deepdoc-2.2.0 → deepdoc-2.2.1}/deepdoc/pipeline_v2.py RENAMED Viewed

@@ -283,7 +283,7 @@ class PipelineV2:
             )
         )
         phase_start = time.perf_counter()
-        plan = bucket_plan_docs(scan, self.cfg, self.llm)
+        plan = bucket_plan_docs(scan, self.cfg, self.llm, repo_root=self.repo_root)
         phase_timings["plan"] = time.perf_counter() - phase_start
         stats["pages_planned"] = len(plan.pages)
@@ -347,6 +347,34 @@ class PipelineV2:
             stats["playground"] = 0
             phase_timings["openapi"] = 0.0
+        # ── Persist state ──────────────────────────────────────────────
+        phase_start = time.perf_counter()
+        save_all(plan, scan, gen_results, self.repo_root, self.output_dir)
+        stats["llm_usage"] = dict(getattr(self.llm, "usage", {}) or {})
+        self._save_quality_report(stats)
+        phase_timings["persist"] = time.perf_counter() - phase_start
+        # ── Record changelog after save_all so entries reference persisted pages ──
+        try:
+            import git as _git
+            _repo_cl = _git.Repo(self.repo_root)
+            _head_cl = _repo_cl.head.commit
+            _changelog_exists = bool(load_changelog(self.repo_root))
+            _record_changelog(
+                self.repo_root,
+                self.output_dir,
+                commit=_head_cl.hexsha,
+                commit_message=_head_cl.message.strip().splitlines()[0],
+                commit_date=_head_cl.committed_datetime.strftime("%Y-%m-%d"),
+                strategy="full_generate",
+                pages_updated=[b.slug for b in plan.buckets],
+                files_changed=[],
+                is_initial=not _changelog_exists,
+            )
+        except Exception:
+            pass  # Not a git repo or detached HEAD — skip silently
         # ── Phase 5: Build site ────────────────────────────────────────
         console.print(
             Panel("[bold]Phase 5/5: Building site[/bold]", border_style="blue")
@@ -361,13 +389,6 @@ class PipelineV2:
         phase_timings["build_site"] = time.perf_counter() - phase_start
         stats["site"] = 1
-        # ── Persist state ──────────────────────────────────────────────
-        phase_start = time.perf_counter()
-        save_all(plan, scan, gen_results, self.repo_root, self.output_dir)
-        stats["llm_usage"] = dict(getattr(self.llm, "usage", {}) or {})
-        self._save_quality_report(stats)
-        phase_timings["persist"] = time.perf_counter() - phase_start
         if chatbot_enabled(self.cfg):
             try:
                 from .chatbot.indexer import ChatbotIndexer
@@ -463,19 +484,6 @@ class PipelineV2:
                     "replanned": True,
                 },
             )
-            changelog_exists = bool(load_changelog(self.repo_root))
-            _commit_obj = _repo.head.commit
-            _record_changelog(
-                self.repo_root,
-                self.output_dir,
-                commit=head_sha,
-                commit_message=_commit_obj.message.strip().splitlines()[0],
-                commit_date=_commit_obj.committed_datetime.strftime("%Y-%m-%d"),
-                strategy="full_generate",
-                pages_updated=[b.slug for b in plan.buckets],
-                files_changed=[],
-                is_initial=not changelog_exists,
-            )
         except Exception:
             pass  # Not a git repo or detached HEAD — skip silently

deepdoc 2.2.0__tar.gz → 2.2.1__tar.gz

deepdoc 2.2.0__tar.gz → 2.2.1__tar.gz