PyPI - docpull - Versions diffs - 2.3.0__tar.gz → 2.4.0__tar.gz - Mend

docpull 2.3.0__tar.gz → 2.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87) hide show

{docpull-2.3.0/src/docpull.egg-info → docpull-2.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docpull
-Version: 2.3.0
+Version: 2.4.0
 Summary: Pull documentation from the web and convert to clean markdown
 Author-email: Zachary Roth <support@raintree.technology>
 Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -278,11 +278,16 @@ NDJSON (one record per page or chunk):
 ## Security
 - HTTPS-only, mandatory robots.txt compliance
-- SSRF protection: blocks private/internal network IPs, DNS rebinding
+- SSRF protection: blocks private/internal network IPs, DNS rebinding via
+  connect-time address pinning
 - XXE protection via `defusedxml` on sitemaps
 - Path traversal and CRLF header injection guards
 - Auth headers stripped on cross-origin redirects
+When running with `--proxy`, DNS pinning is delegated to the proxy. Pass
+`--require-pinned-dns` to refuse this configuration and keep the connector-
+level SSRF guarantees in effect.
 ## Options
 Run `docpull --help` for the full list. Highlights:
@@ -310,6 +315,26 @@ Cache:
   --cache-ttl DAYS
 ```
+## Performance
+End-to-end numbers from `tests/benchmarks/test_10k_pages.py` against a
+synthetic 10,000-page localhost site (RAG profile, `max_concurrent=50`,
+HTTP keep-alive, 5% injected duplicate content):
+| Metric | Value |
+|---|---|
+| Total wall time | ~27 s |
+| Discovery (sitemap parse) | ~80 ms |
+| Fetch + convert + save | ~27 s |
+| Per-page latency p50 / p95 / p99 | ~2.6 / 4.6 / 5.3 ms |
+| Peak RSS delta from baseline | ~28 MB |
+| Cache manifest size on disk | ~3.4 MB |
+| Duplicates detected (5% injected) | 499 / 500 |
+Reproduce with `make benchmark` (requires `aiohttp`; runs the gated
+benchmark in `tests/benchmarks/` and prints a JSON line you can pipe
+into trend tooling).
 ## Troubleshooting
 ```bash

{docpull-2.3.0 → docpull-2.4.0}/README.md RENAMED Viewed

@@ -196,11 +196,16 @@ NDJSON (one record per page or chunk):
 ## Security
 - HTTPS-only, mandatory robots.txt compliance
-- SSRF protection: blocks private/internal network IPs, DNS rebinding
+- SSRF protection: blocks private/internal network IPs, DNS rebinding via
+  connect-time address pinning
 - XXE protection via `defusedxml` on sitemaps
 - Path traversal and CRLF header injection guards
 - Auth headers stripped on cross-origin redirects
+When running with `--proxy`, DNS pinning is delegated to the proxy. Pass
+`--require-pinned-dns` to refuse this configuration and keep the connector-
+level SSRF guarantees in effect.
 ## Options
 Run `docpull --help` for the full list. Highlights:
@@ -228,6 +233,26 @@ Cache:
   --cache-ttl DAYS
 ```
+## Performance
+End-to-end numbers from `tests/benchmarks/test_10k_pages.py` against a
+synthetic 10,000-page localhost site (RAG profile, `max_concurrent=50`,
+HTTP keep-alive, 5% injected duplicate content):
+| Metric | Value |
+|---|---|
+| Total wall time | ~27 s |
+| Discovery (sitemap parse) | ~80 ms |
+| Fetch + convert + save | ~27 s |
+| Per-page latency p50 / p95 / p99 | ~2.6 / 4.6 / 5.3 ms |
+| Peak RSS delta from baseline | ~28 MB |
+| Cache manifest size on disk | ~3.4 MB |
+| Duplicates detected (5% injected) | 499 / 500 |
+Reproduce with `make benchmark` (requires `aiohttp`; runs the gated
+benchmark in `tests/benchmarks/` and prints a JSON line you can pipe
+into trend tooling).
 ## Troubleshooting
 ```bash

{docpull-2.3.0 → docpull-2.4.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "docpull"
-version = "2.3.0"
+version = "2.4.0"
 dynamic = []
 description = "Pull documentation from the web and convert to clean markdown"
 readme = {file = "README.md", content-type = "text/markdown"}

{docpull-2.3.0 → docpull-2.4.0}/src/docpull/__init__.py RENAMED Viewed

@@ -14,7 +14,7 @@ Usage:
             print(event)
 """
-__version__ = "2.3.0"
+__version__ = "2.4.0"
 from .cache import CacheManager, StreamingDeduplicator
 from .conversion.chunking import Chunk, TokenCounter, chunk_markdown

{docpull-2.3.0 → docpull-2.4.0}/src/docpull/cli.py RENAMED Viewed

@@ -102,6 +102,23 @@ Examples:
         help="Fetch the given URL only (no discovery/crawl). Fast path for agents.",
     )
+    parser.add_argument(
+        "--skill",
+        type=str,
+        metavar="NAME",
+        help=(
+            "Generate a Claude Code skill directory. Output goes to "
+            "<output-dir>/<NAME>/ with hierarchical naming and a "
+            "SKILL.md manifest derived from the first page's metadata."
+        ),
+    )
+    parser.add_argument(
+        "--skill-description",
+        type=str,
+        metavar="TEXT",
+        help="Override the auto-derived `description` in SKILL.md.",
+    )
     # Output
     parser.add_argument(
         "--output-dir",
@@ -117,6 +134,16 @@ Examples:
         default=None,
         help="Output format (default: markdown; 'ndjson' streams one record per line)",
     )
+    parser.add_argument(
+        "--naming-strategy",
+        choices=["full", "hierarchical", "flat", "short"],
+        default=None,
+        help=(
+            "URL-to-filename strategy. 'full' flattens with underscores; "
+            "'hierarchical' preserves the URL path as nested directories. "
+            "Mirror profile defaults to hierarchical."
+        ),
+    )
     parser.add_argument(
         "--stream",
         action="store_true",
@@ -167,6 +194,15 @@ Examples:
         action="store_true",
         help="Automatically adjust rate limits based on server responses",
     )
+    crawl_group.add_argument(
+        "--no-streaming-discovery",
+        action="store_true",
+        help=(
+            "Fall back to discover-all-then-fetch instead of piping URLs "
+            "through a worker pool as discovery yields them. Backstop for "
+            "queue-backpressure regressions."
+        ),
+    )
     # Content filtering
     filter_group = parser.add_argument_group("content filtering")
@@ -175,12 +211,6 @@ Examples:
         action="store_true",
         help="Enable real-time deduplication",
     )
-    filter_group.add_argument(
-        "--language",
-        type=str,
-        metavar="CODE",
-        help="Include only pages in this language",
-    )
     filter_group.add_argument(
         "--extractor",
         choices=["default", "trafilatura"],
@@ -244,6 +274,15 @@ Examples:
         default=None,
         help="Maximum retry attempts",
     )
+    network_group.add_argument(
+        "--require-pinned-dns",
+        action="store_true",
+        help=(
+            "Refuse configurations that delegate DNS to a proxy. With this "
+            "flag, --proxy is rejected so the SSRF posture cannot silently "
+            "weaken in agent-driven crawls."
+        ),
+    )
     # Authentication settings
     auth_group = parser.add_argument_group("authentication")
@@ -358,8 +397,20 @@ def run_fetcher(args: argparse.Namespace) -> int:
     # Output settings
     output_kwargs: dict = {}
-    if args.output_dir:
+    if args.skill:
+        # Skill mode: nest under <output-dir>/<skill>/, force hierarchical
+        # naming, and stamp the manifest fields. Default --output-dir to
+        # `.claude/skills` for the common drop-in use case.
+        base = args.output_dir or Path(".claude/skills")
+        output_kwargs["directory"] = base / args.skill
+        output_kwargs["naming_strategy"] = "hierarchical"
+        output_kwargs["skill_name"] = args.skill
+        if args.skill_description:
+            output_kwargs["skill_description"] = args.skill_description
+    elif args.output_dir:
         output_kwargs["directory"] = args.output_dir
+    if args.naming_strategy and "naming_strategy" not in output_kwargs:
+        output_kwargs["naming_strategy"] = args.naming_strategy
     if args.stream:
         output_kwargs["format"] = "ndjson"
         output_kwargs["ndjson_filename"] = "-"
@@ -386,6 +437,8 @@ def run_fetcher(args: argparse.Namespace) -> int:
         crawl_kwargs["rate_limit"] = args.rate_limit
     if args.adaptive_rate_limit:
         crawl_kwargs["adaptive_rate_limit"] = True
+    if args.no_streaming_discovery:
+        crawl_kwargs["streaming_discovery"] = False
     if args.include_paths:
         crawl_kwargs["include_paths"] = args.include_paths
     if args.exclude_paths:
@@ -397,8 +450,6 @@ def run_fetcher(args: argparse.Namespace) -> int:
     filter_kwargs: dict = {}
     if args.streaming_dedup:
         filter_kwargs["streaming_dedup"] = True
-    if args.language:
-        filter_kwargs["language"] = args.language
     if args.extractor:
         filter_kwargs["extractor"] = args.extractor
     if args.no_special_cases:
@@ -422,6 +473,8 @@ def run_fetcher(args: argparse.Namespace) -> int:
         return 1
     if args.max_retries is not None:
         network_kwargs["max_retries"] = args.max_retries
+    if args.require_pinned_dns:
+        network_kwargs["require_pinned_dns"] = True
     if network_kwargs:
         config_kwargs["network"] = network_kwargs

{docpull-2.3.0 → docpull-2.4.0}/src/docpull/concurrency/manager.py RENAMED Viewed

@@ -1,8 +1,10 @@
 """Thread pool manager for CPU-bound operations."""
 import asyncio
+from collections.abc import Callable
 from concurrent.futures import ThreadPoolExecutor
-from typing import Any, Callable, Optional, TypeVar
+from types import TracebackType
+from typing import Any, TypeVar
 T = TypeVar("T")
@@ -32,7 +34,7 @@ class ConcurrencyManager:
                         Consider CPU core count for optimal value.
         """
         self.max_workers = max_workers
-        self._executor: Optional[ThreadPoolExecutor] = None
+        self._executor: ThreadPoolExecutor | None = None
     @property
     def executor(self) -> ThreadPoolExecutor:
@@ -98,7 +100,12 @@ class ConcurrencyManager:
         """Enter async context."""
         return self
-    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
         """Exit async context and shutdown executor."""
         self.shutdown(wait=True)
@@ -106,6 +113,11 @@ class ConcurrencyManager:
         """Enter sync context."""
         return self
-    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
         """Exit sync context and shutdown executor."""
         self.shutdown(wait=True)

{docpull-2.3.0 → docpull-2.4.0}/src/docpull/conversion/extractor.py RENAMED Viewed

@@ -26,7 +26,7 @@ CONTENT_SELECTORS = [
     "#documentation",
 ]
-# Elements to remove (navigation, ads, etc.)
+# Elements to remove (navigation, ads, cookie banners, etc.)
 REMOVE_SELECTORS = [
     "nav",
     "header",
@@ -54,6 +54,33 @@ REMOVE_SELECTORS = [
     "noscript",
     "iframe",
     "svg",
+    # Cookie / consent / GDPR walls. Most are structural — class names
+    # come from a small set of vendor SDKs (OneTrust, Osano, CookieConsent,
+    # CookieLaw, Cookiebot, Iubenda) plus generic `.cookie-*` / `.gdpr-*`
+    # patterns. The aria-label fallbacks catch dialogs whose className
+    # doesn't match the conventions but whose accessibility label does.
+    ".cookie-banner",
+    ".cookie-consent",
+    ".cookie-notice",
+    ".cookielaw-banner",
+    ".cookiebot",
+    ".gdpr",
+    ".gdpr-banner",
+    ".consent-banner",
+    ".consent-popup",
+    ".cc-window",
+    ".cc-banner",
+    ".osano-cm-window",
+    ".osano-cm-dialog",
+    "#onetrust-banner-sdk",
+    "#onetrust-consent-sdk",
+    "#onetrust-pc-sdk",
+    ".ot-sdk-container",
+    ".iubenda-cs-container",
+    ".termly-styl-banner",
+    '[aria-label*="cookie" i]',
+    '[aria-label*="consent" i]',
+    '[aria-label*="gdpr" i]',
 ]
 # Elements to preserve but simplify
@@ -236,8 +263,94 @@ class MainContentExtractor:
         # Clean up
         self._remove_unwanted(content)
+        # Normalize fence languages BEFORE we strip attributes — many
+        # syntax-highlight conventions encode language in `class` (Prism:
+        # `language-python`, highlight.js: `lang-py`, Shiki: `language-bash`).
+        # html2text's `mark_code` won't pick these up by default, so we lift
+        # the language onto an html2text-friendly `class="lang-X"` form on
+        # both the <pre> and inner <code>.
+        _normalize_code_fence_language(content)
         self._clean_attributes(content)
         self._resolve_links(content, url)
         result = str(content)
         return self._clean_whitespace(result)
+# Map syntax-highlight library conventions to a canonical short language tag.
+# Order matters: longest/most-specific prefix first so `highlight-source-rust`
+# resolves to `rust`, not `source-rust`. We deliberately skip `none`, `text`,
+# and `plaintext` — they represent "no language."
+_LANG_CLASS_PATTERNS: list[re.Pattern[str]] = [
+    re.compile(r"(?:^|\s)highlight-source-([\w+#-]+)", re.IGNORECASE),
+    re.compile(r"(?:^|\s)hljs-language-([\w+#-]+)", re.IGNORECASE),
+    re.compile(r"(?:^|\s)(?:language|lang|highlight)-([\w+#-]+)", re.IGNORECASE),
+]
+# Sentinel injected as the first text node inside a <code> tag. html2text
+# preserves the body of <pre><code> verbatim (it just indents by 4 spaces
+# and wraps in [code]/[/code]), so this sentinel survives through to the
+# Markdown stage where HtmlToMarkdown._clean_output recovers the language
+# and rewrites the block as a fenced GFM code block.
+DOCPULL_FENCE_SENTINEL_PREFIX = "__DOCPULL_FENCE_LANG_"
+DOCPULL_FENCE_SENTINEL_SUFFIX = "__"
+def _classes_of(tag: Tag) -> list[str]:
+    """Return a tag's CSS classes as a flat list of strings.
+    BeautifulSoup hands back ``str``, ``AttributeValueList``, or ``None``
+    depending on parser version. Normalize to ``list[str]`` for the rest
+    of the language-detection code.
+    """
+    raw = tag.get("class")
+    if raw is None:
+        return []
+    if isinstance(raw, str):
+        return [raw]
+    return [str(c) for c in raw]
+def _detect_lang(class_string: str) -> str | None:
+    """Return the canonical language tag for a code block, or None."""
+    for pattern in _LANG_CLASS_PATTERNS:
+        match = pattern.search(class_string)
+        if not match:
+            continue
+        lang = match.group(1).lower()
+        if lang in {"none", "plaintext", "text"}:
+            return None
+        return lang
+    return None
+def _normalize_code_fence_language(content: BeautifulSoup) -> None:
+    """Inject a sentinel that lets the Markdown stage emit fenced blocks.
+    Modern syntax-highlight libraries encode the language as a CSS class
+    (Prism: ``language-python``; highlight.js: ``lang-py`` /
+    ``hljs-language-bash``; GitHub: ``highlight-source-rust``). html2text
+    cannot read these and emits a generic ``[code]...[/code]`` block.
+    We walk every ``<pre>`` and prepend a sentinel ``__DOCPULL_FENCE_LANG_X__``
+    as a NavigableString to the inner ``<code>`` (or to the ``<pre>`` itself
+    if no inner ``<code>`` exists). Post-conversion, the Markdown layer
+    pulls that sentinel back out of the rendered text and rewrites the
+    block as a GFM fenced code block with the language tag.
+    """
+    for pre in content.find_all("pre"):
+        if not isinstance(pre, Tag):
+            continue
+        pre_classes = _classes_of(pre)
+        code = pre.find("code") if pre else None
+        code_classes: list[str] = []
+        if isinstance(code, Tag):
+            code_classes = _classes_of(code)
+        lang = _detect_lang(" ".join(pre_classes + code_classes))
+        if lang is None:
+            continue
+        sentinel = f"{DOCPULL_FENCE_SENTINEL_PREFIX}{lang}{DOCPULL_FENCE_SENTINEL_SUFFIX}\n"
+        target = code if isinstance(code, Tag) else pre
+        target.insert(0, sentinel)

{docpull-2.3.0 → docpull-2.4.0}/src/docpull/conversion/markdown.py RENAMED Viewed

@@ -4,11 +4,17 @@ from __future__ import annotations
 import logging
 import re
+import textwrap
 from typing import Any
 from urllib.parse import urljoin
 import html2text
+from .extractor import (
+    DOCPULL_FENCE_SENTINEL_PREFIX,
+    DOCPULL_FENCE_SENTINEL_SUFFIX,
+)
 logger = logging.getLogger(__name__)
@@ -17,6 +23,43 @@ def _normalize_scheme(url: str) -> str:
     return re.sub(r"^(https?:)/(?!/)", r"\1//", url)
+# html2text wraps <pre><code> in [code]/[/code] markers and indents the body
+# by 4 spaces. The opening marker may carry trailing whitespace
+# (`[code] \n`); tolerate it so we don't miss real code blocks.
+_HTML2TEXT_CODE_BLOCK_RE = re.compile(
+    r"\[code\][ \t]*\n(.*?)\n[ \t]*\[/code\]",
+    re.DOTALL,
+)
+_FENCE_SENTINEL_RE = re.compile(
+    rf"^[ \t]*{re.escape(DOCPULL_FENCE_SENTINEL_PREFIX)}"
+    rf"([\w+#-]+){re.escape(DOCPULL_FENCE_SENTINEL_SUFFIX)}[ \t]*\n",
+    re.MULTILINE,
+)
+def _rewrite_html2text_code_blocks(markdown: str) -> str:
+    """Replace ``[code]...[/code]`` markers with GFM fenced blocks.
+    html2text indents the body of a ``[code]`` block by 4 spaces; we dedent
+    that consistently. If the body's first line is a docpull language
+    sentinel (injected by the extractor), the fence is opened with that
+    language; otherwise the fence is bare.
+    """
+    def replace(match: re.Match[str]) -> str:
+        body = match.group(1)
+        body = textwrap.dedent(body)
+        lang = ""
+        sentinel_match = _FENCE_SENTINEL_RE.match(body)
+        if sentinel_match:
+            lang = sentinel_match.group(1)
+            body = body[sentinel_match.end() :]
+        body = body.rstrip("\n")
+        return f"```{lang}\n{body}\n```"
+    return _HTML2TEXT_CODE_BLOCK_RE.sub(replace, markdown)
 class HtmlToMarkdown:
     """
     Converts HTML content to clean Markdown.
@@ -77,13 +120,16 @@ class HtmlToMarkdown:
     def _clean_output(self, markdown: str) -> str:
         """Clean up the converted Markdown."""
+        # Convert html2text's [code]/[/code] markers into GFM fences,
+        # recovering the language tag from the docpull sentinel injected
+        # by MainContentExtractor when the source HTML carried a Prism /
+        # highlight.js / Shiki language class. Must run BEFORE blank-line
+        # collapsing so the rewritten fences sit on their own lines.
+        markdown = _rewrite_html2text_code_blocks(markdown)
         # Remove excessive blank lines
         markdown = re.sub(r"\n{3,}", "\n\n", markdown)
-        # Fix code block formatting
-        # Ensure code blocks have language hint
-        markdown = re.sub(r"```\n", "```\n", markdown)
         # Unmangle html2text's protect_links output:
         #   [text](prefix/<https:/real.url>)  ->  [text](https://real.url)
         # The angle-bracketed inner URL is the true absolute URL (the prefix is

docpull 2.3.0__tar.gz → 2.4.0__tar.gz

docpull 2.3.0__tar.gz → 2.4.0__tar.gz