PyPI - docpull - Versions diffs - 3.0.0__tar.gz → 3.0.1__tar.gz - Mend

docpull-3.0.0.tar.gz → docpull-3.0.1.tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (87) hide show

{docpull-3.0.0/src/docpull.egg-info → docpull-3.0.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docpull
-Version: 3.0.0
+Version: 3.0.1
 Summary: Pull documentation from the web and convert to clean markdown
 Author-email: Zachary Roth <support@raintree.technology>
 Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -43,9 +43,12 @@ Requires-Dist: html2text>=2020.1.16
 Requires-Dist: defusedxml>=0.7.1
 Requires-Dist: extruct>=0.15.0
 Requires-Dist: aiohttp>=3.9.0
+Requires-Dist: idna>=3.15
+Requires-Dist: regex>=2024.11.6
 Requires-Dist: rich>=13.0.0
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: pydantic>=2.0
+Requires-Dist: urllib3>=2.7.0
 Provides-Extra: proxy
 Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
 Provides-Extra: normalize
@@ -56,6 +59,8 @@ Provides-Extra: tokens
 Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
 Provides-Extra: mcp
 Requires-Dist: mcp>=1.0.0; extra == "mcp"
+Requires-Dist: python-multipart>=0.0.27; extra == "mcp"
+Requires-Dist: starlette>=1.0.1; extra == "mcp"
 Provides-Extra: llm
 Requires-Dist: tiktoken>=0.7.0; extra == "llm"
 Provides-Extra: all
@@ -64,6 +69,8 @@ Requires-Dist: url-normalize>=1.4.0; extra == "all"
 Requires-Dist: trafilatura>=1.12.0; extra == "all"
 Requires-Dist: tiktoken>=0.7.0; extra == "all"
 Requires-Dist: mcp>=1.0.0; extra == "all"
+Requires-Dist: python-multipart>=0.0.27; extra == "all"
+Requires-Dist: starlette>=1.0.1; extra == "all"
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0.0; extra == "dev"
 Requires-Dist: pytest-cov>=4.0.0; extra == "dev"

{docpull-3.0.0 → docpull-3.0.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "docpull"
-version = "3.0.0"
+version = "3.0.1"
 dynamic = []
 description = "Pull documentation from the web and convert to clean markdown"
 readme = {file = "README.md", content-type = "text/markdown"}
@@ -67,9 +67,12 @@ dependencies = [
     "defusedxml>=0.7.1",
     "extruct>=0.15.0",
     "aiohttp>=3.9.0",
+    "idna>=3.15",
+    "regex>=2024.11.6",
     "rich>=13.0.0",
     "pyyaml>=6.0",
     "pydantic>=2.0",
+    "urllib3>=2.7.0",
 ]
 [project.optional-dependencies]
@@ -87,6 +90,8 @@ tokens = [
 ]
 mcp = [
     "mcp>=1.0.0",
+    "python-multipart>=0.0.27",
+    "starlette>=1.0.1",
 ]
 llm = [
     "tiktoken>=0.7.0",
@@ -97,6 +102,8 @@ all = [
     "trafilatura>=1.12.0",
     "tiktoken>=0.7.0",
     "mcp>=1.0.0",
+    "python-multipart>=0.0.27",
+    "starlette>=1.0.1",
 ]
 dev = [
     "pytest>=7.0.0",
@@ -170,10 +177,22 @@ module = "docpull.models.*"
 disallow_any_unimported = false
 warn_return_any = false
-[[tool.mypy.overrides]]
-module = "tests.*"
-disallow_untyped_defs = false
-disallow_any_unimported = false
+[tool.bandit]
+# Policy: every entry in `skips` MUST have a one-line justification
+# above it explaining what bandit found, why it's a false positive
+# *for this codebase*, and (if narrow) why a `# nosec BXXX  # reason`
+# annotation at the call site would have been worse. Bandit skips
+# silence findings repo-wide, so the bar to add one is higher than
+# silencing a single line. If a new skip is unavoidable, add it here
+# in PR review, not as a drive-by.
+#
+# B101 (assert_used) — flags every `assert x is not None` we use for
+# type narrowing. Bandit's concern is that assertions vanish under
+# `python -O`. docpull is a CLI / SDK, never invoked with -O, and the
+# narrowing asserts are not load-bearing safety checks. Skipping the
+# rule globally keeps the existing idiom without 8+ inline `# nosec`
+# annotations in fetcher.py / pipeline/steps/convert.py.
+skips = ["B101"]
 [tool.pytest.ini_options]
 minversion = "7.0"

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/cache/manager.py RENAMED Viewed

@@ -5,10 +5,12 @@ from __future__ import annotations
 import hashlib
 import json
 import logging
-from datetime import datetime, timedelta
+from datetime import timedelta
 from pathlib import Path
 from typing import TypedDict
+from ..time_utils import parse_persisted_datetime, utc_now, utc_now_iso
 logger = logging.getLogger(__name__)
 # Default TTL for cache entries (30 days)
@@ -257,7 +259,7 @@ class CacheManager:
         self.manifest[url] = {
             "checksum": self.compute_checksum(content),
             "file_path": str(file_path),
-            "fetched_at": datetime.now().isoformat(),
+            "fetched_at": utc_now_iso(),
             "size": len(content),
         }
@@ -314,7 +316,7 @@ class CacheManager:
         Note:
             Changes are batched. Call flush() to persist to disk.
         """
-        self._state.last_run = datetime.now().isoformat()
+        self._state.last_run = utc_now_iso()
         self._state_dirty = True
     def clear_state(self) -> None:
@@ -354,18 +356,18 @@ class CacheManager:
         if ttl is None:
             return 0
-        cutoff = datetime.now() - timedelta(days=ttl)
+        cutoff = utc_now() - timedelta(days=ttl)
         to_remove = []
         for url, entry in self.manifest.items():
             fetched_at = entry.get("fetched_at")
             if fetched_at:
                 try:
-                    entry_time = datetime.fromisoformat(fetched_at)
+                    entry_time = parse_persisted_datetime(fetched_at)
                     if entry_time < cutoff:
                         to_remove.append(url)
-                except ValueError:
-                    pass  # Invalid date format, skip
+                except ValueError as err:
+                    logger.warning("Invalid cache timestamp for %s: %s", url, err)
         for url in to_remove:
             del self.manifest[url]
@@ -413,7 +415,7 @@ class CacheManager:
         """
         data: DiscoveredUrlsState = {
             "start_url": start_url,
-            "discovered_at": datetime.now().isoformat(),
+            "discovered_at": utc_now_iso(),
             "urls": urls,
         }
         try:

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/cli.py RENAMED Viewed

@@ -13,12 +13,10 @@ if "--doctor" in sys.argv:
     output_dir = None
     if "--output-dir" in sys.argv or "-o" in sys.argv:
-        try:
-            flag_idx = sys.argv.index("--output-dir") if "--output-dir" in sys.argv else sys.argv.index("-o")
-            if flag_idx + 1 < len(sys.argv):
-                output_dir = Path(sys.argv[flag_idx + 1])
-        except (ValueError, IndexError):
-            pass
+        flag = "--output-dir" if "--output-dir" in sys.argv else "-o"
+        flag_idx = sys.argv.index(flag)
+        if flag_idx + 1 < len(sys.argv):
+            output_dir = Path(sys.argv[flag_idx + 1])
     sys.exit(run_doctor(output_dir=output_dir))
 # Verify core dependencies

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/special_cases.py RENAMED Viewed

@@ -320,7 +320,8 @@ class OpenApiExtractor:
             return None
         try:
             data = json.loads(text)
-        except json.JSONDecodeError:
+        except json.JSONDecodeError as err:
+            logger.debug("OpenAPI extractor skipped %s: JSON parse failed: %s", url, err)
             return None
         if not isinstance(data, dict):
             return None
@@ -569,8 +570,8 @@ def looks_like_spa(html: bytes, min_body_ratio: float = 0.05) -> bool:
     """Heuristic: does this HTML appear to be a JS-only SPA?
     True when the non-script body text is very small relative to the overall
-    page size and the page contains script tags. Not perfect, but good enough
-    to warn an agent before it consumes empty Markdown.
+    page size and the page contains script tags. This is a conservative signal
+    for warning an agent before it consumes empty Markdown.
     """
     if len(html) < 500:
         return False
@@ -578,7 +579,8 @@ def looks_like_spa(html: bytes, min_body_ratio: float = 0.05) -> bool:
         return False
     try:
         soup = _soup(html)
-    except Exception:  # noqa: BLE001
+    except Exception as err:  # noqa: BLE001
+        logger.debug("SPA heuristic skipped malformed HTML: %s", err)
         return False
     # Remove scripts/styles before measuring.
     for tag in soup(["script", "style", "noscript"]):

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/link_extractors/enhanced.py RENAMED Viewed

@@ -295,7 +295,8 @@ class EnhancedLinkExtractor:
         try:
             absolute_url = urljoin(base_url, href)
-        except Exception:
+        except Exception as err:
+            logger.debug("Could not resolve href %r against %s: %s", href, base_url, err)
             return None
         # Validate it's a proper URL

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/link_extractors/static.py RENAMED Viewed

@@ -148,7 +148,8 @@ class StaticLinkExtractor:
         """
         try:
             absolute_url = urljoin(base_url, href)
-        except Exception:
+        except Exception as err:
+            logger.debug("Could not resolve href %r against %s: %s", href, base_url, err)
             return None
         # Remove fragment

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/mcp/sources.py RENAMED Viewed

@@ -7,9 +7,12 @@ import os
 import re
 from dataclasses import dataclass
 from pathlib import Path
+from urllib.parse import urlparse
 import yaml
+from ..security.url_validator import UrlValidator
 logger = logging.getLogger(__name__)
@@ -54,6 +57,9 @@ BUILTIN_SOURCES: dict[str, SourceConfig] = {
 _URL_SCHEME_RE = re.compile(r"^[a-z][a-z0-9+.-]*://", re.IGNORECASE)
 _LIBRARY_NAME_RE = re.compile(r"^[a-zA-Z0-9_.-]+$")
+MAX_LIBRARY_NAME_LENGTH = 128
+MAX_USER_SOURCE_PAGES = 100_000
+_USER_SOURCE_URL_VALIDATOR = UrlValidator(allowed_schemes={"https"})
 def is_safe_library_name(name: str) -> bool:
@@ -61,11 +67,44 @@ def is_safe_library_name(name: str) -> bool:
     Allows alnum + ``_ . -``; rejects separators, ``..``, leading dot.
     """
-    if not name or name.startswith(".") or name == ".." or len(name) > 128:
+    if not name or name.startswith(".") or name == ".." or len(name) > MAX_LIBRARY_NAME_LENGTH:
         return False
     return bool(_LIBRARY_NAME_RE.fullmatch(name))
+def _is_https_url(url: str) -> bool:
+    parsed = urlparse(url)
+    return parsed.scheme.lower() == "https" and parsed.hostname is not None
+def _is_allowed_source_url(url: str) -> tuple[bool, str | None]:
+    if not _is_https_url(url):
+        return (False, "url must be an HTTPS URL")
+    validation = _USER_SOURCE_URL_VALIDATOR.validate(url)
+    if not validation.is_valid:
+        return (False, validation.rejection_reason or "url rejected by validator")
+    return (True, None)
+def _coerce_max_pages(value: object, source_name: str) -> int | None:
+    if value is None:
+        return None
+    if isinstance(value, bool):
+        raise ValueError(f"source '{source_name}' max_pages must be an integer")
+    if isinstance(value, int):
+        parsed = value
+    elif isinstance(value, str):
+        try:
+            parsed = int(value)
+        except ValueError as err:
+            raise ValueError(f"source '{source_name}' max_pages must be an integer") from err
+    else:
+        raise ValueError(f"source '{source_name}' max_pages must be an integer")
+    if parsed < 1 or parsed > MAX_USER_SOURCE_PAGES:
+        raise ValueError(f"source '{source_name}' max_pages must be between 1 and {MAX_USER_SOURCE_PAGES}")
+    return parsed
 def default_config_dir() -> Path:
     env = os.environ.get("XDG_CONFIG_HOME")
     base = Path(env) if env else Path.home() / ".config"
@@ -98,13 +137,31 @@ def load_user_sources(path: Path | None = None) -> dict[str, SourceConfig]:
     entries = raw.get("sources") or {}
     result: dict[str, SourceConfig] = {}
     for name, cfg in entries.items():
-        if not isinstance(cfg, dict) or not isinstance(cfg.get("url"), str):
+        source_name = str(name)
+        if not is_safe_library_name(source_name):
+            logger.warning("Ignoring unsafe source name in %s: %r", path, source_name)
+            continue
+        if not isinstance(cfg, dict):
+            logger.warning("Ignoring source %s in %s: entry must be a mapping", source_name, path)
+            continue
+        url = cfg.get("url")
+        if not isinstance(url, str):
+            logger.warning("Ignoring source %s in %s: url must be an HTTPS URL", source_name, path)
+            continue
+        url_allowed, url_reason = _is_allowed_source_url(url)
+        if not url_allowed:
+            logger.warning("Ignoring source %s in %s: %s", source_name, path, url_reason)
+            continue
+        try:
+            max_pages = _coerce_max_pages(cfg.get("maxPages") or cfg.get("max_pages"), source_name)
+        except ValueError as err:
+            logger.warning("Ignoring source %s in %s: %s", source_name, path, err)
             continue
-        result[str(name)] = SourceConfig(
-            url=cfg["url"],
+        result[source_name] = SourceConfig(
+            url=url,
             description=str(cfg.get("description", "")),
             category=str(cfg.get("category", "user")),
-            max_pages=cfg.get("maxPages") or cfg.get("max_pages"),
+            max_pages=max_pages,
         )
     return result
@@ -122,7 +179,7 @@ def resolve_source(name: str) -> SourceConfig | None:
     be routed through configured aliases so that policy (max_pages, category)
     lives in one place.
     """
-    if _URL_SCHEME_RE.match(name):
+    if _URL_SCHEME_RE.match(name) or not is_safe_library_name(name):
         return None
     return all_sources().get(name)

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/mcp/tools.py RENAMED Viewed

@@ -14,20 +14,20 @@ from __future__ import annotations
 import json
 import logging
 import os
-import re
 import shutil
 import time
 from collections.abc import Awaitable, Callable
 from dataclasses import dataclass
-from datetime import datetime
 from pathlib import Path
 from typing import Any
+import regex
 import yaml
 from ..core.fetcher import Fetcher
 from ..models.config import CrawlConfig, DocpullConfig, OutputConfig, ProfileName
 from ..security.url_validator import UrlValidator
+from ..time_utils import utc_now_iso
 from .sources import (
     _URL_SCHEME_RE,
     BUILTIN_SOURCES,
@@ -44,6 +44,7 @@ logger = logging.getLogger(__name__)
 CACHE_TTL_SECONDS = 7 * 24 * 60 * 60  # 7 days
 MAX_GREP_PATTERN_LEN = 1000
 GREP_TIMEOUT_SECONDS = 10.0
+GREP_LINE_TIMEOUT_SECONDS = 0.05
 MAX_READ_DOC_BYTES = 1_000_000
 _FETCH_URL_VALIDATOR = UrlValidator(allowed_schemes={"https"})
@@ -99,7 +100,7 @@ def _write_meta(meta_path: Path, source: str, url: str, pages: int) -> None:
                 "source": source,
                 "url": url,
                 "fetched_at_epoch": time.time(),
-                "fetched_at": datetime.now().isoformat(),
+                "fetched_at": utc_now_iso(),
                 "page_count": pages,
             },
             indent=2,
@@ -118,7 +119,7 @@ def _write_partial_meta(meta_path: Path, source: str, url: str, pages: int) -> N
                 "source": source,
                 "url": url,
                 "fetched_at_epoch": time.time(),
-                "fetched_at": datetime.now().isoformat(),
+                "fetched_at": utc_now_iso(),
                 "page_count": pages,
                 "partial": True,
             },
@@ -182,6 +183,11 @@ async def ensure_docs(
                 "and call ensure_docs with that name.",
                 is_error=True,
             )
+        if not is_safe_library_name(source):
+            return ToolResult(
+                f"Invalid source name '{source}'. Use names from list_sources.",
+                is_error=True,
+            )
         available = ", ".join(sorted(all_sources().keys()))
         return ToolResult(
             f"Unknown source '{source}'. Available: {available}",
@@ -409,9 +415,7 @@ def grep_docs(
     Hardened against (a) path traversal via ``library`` (rejected by
     ``is_safe_library_name``) and (b) catastrophic regex via a pattern
-    length cap and a wall-clock budget. Python's ``re`` has no built-in
-    timeout, so the budget is checked between files; a single pathological
-    pattern+line combination can still wedge for one file's worth of work.
+    length cap, a total wall-clock budget, and a per-line regex timeout.
     """
     docs_dir = docs_dir or default_docs_dir()
     if not docs_dir.exists():
@@ -430,9 +434,9 @@ def grep_docs(
     context = max(0, min(context, 3))
     try:
-        flags = 0 if case_sensitive else re.IGNORECASE
-        regex = re.compile(pattern, flags)
-    except re.error as err:
+        flags = 0 if case_sensitive else regex.IGNORECASE
+        compiled = regex.compile(pattern, flags)
+    except regex.error as err:
         return ToolResult(f"Invalid pattern: {err}", is_error=True)
     roots = (
@@ -459,7 +463,12 @@ def grep_docs(
                 continue
             matches: list[tuple[int, list[str], str, list[str]]] = []
             for idx, line in enumerate(lines):
-                if regex.search(line):
+                try:
+                    matched = compiled.search(line, timeout=GREP_LINE_TIMEOUT_SECONDS) is not None
+                except TimeoutError:
+                    timed_out = True
+                    break
+                if matched:
                     before = [lines[i].rstrip() for i in range(max(0, idx - context), idx)] if context else []
                     after = (
                         [lines[i].rstrip() for i in range(idx + 1, min(len(lines), idx + 1 + context))]
@@ -476,6 +485,8 @@ def grep_docs(
                     )
                 )
                 total += len(matches)
+            if timed_out:
+                break
         if timed_out:
             break

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/models/config.py RENAMED Viewed

@@ -71,8 +71,10 @@ class ByteSize(int):
             # Try parsing as plain number
             try:
                 return int(v)
-            except ValueError:
-                pass
+            except ValueError as err:
+                raise ValueError(
+                    f"Invalid byte size: {v}. Use format like '200kb', '1mb', or integer bytes."
+                ) from err
         raise ValueError(f"Invalid byte size: {v}. Use format like '200kb', '1mb', or integer bytes.")

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/save_json.py RENAMED Viewed

@@ -7,11 +7,11 @@ import json
 import logging
 import os
 import tempfile
-from datetime import datetime
 from pathlib import Path
 from typing import TextIO
 from ...models.events import EventType, FetchEvent
+from ...time_utils import utc_now_iso
 from ..base import EventEmitter, PageContext
 logger = logging.getLogger(__name__)
@@ -102,7 +102,7 @@ class JsonSaveStep:
             "title": ctx.title,
             "content": ctx.markdown,
             "metadata": ctx.metadata,
-            "fetched_at": datetime.now().isoformat(),
+            "fetched_at": utc_now_iso(),
         }
         f = self._ensure_temp_file()
@@ -142,7 +142,7 @@ class JsonSaveStep:
             # No documents written - create empty structure
             self._base_dir.mkdir(parents=True, exist_ok=True)
             output = {
-                "generated_at": datetime.now().isoformat(),
+                "generated_at": utc_now_iso(),
                 "document_count": 0,
                 "documents": [],
             }
@@ -154,7 +154,7 @@ class JsonSaveStep:
         try:
             # Close the documents array and add metadata
             self._temp_file.write("\n  ],\n")
-            self._temp_file.write(f'  "generated_at": "{datetime.now().isoformat()}",\n')
+            self._temp_file.write(f'  "generated_at": "{utc_now_iso()}",\n')
             self._temp_file.write(f'  "document_count": {self._document_count}\n')
             self._temp_file.write("}\n")
             self._temp_file.close()

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/save_ndjson.py RENAMED Viewed

@@ -12,11 +12,11 @@ import hashlib
 import json
 import logging
 import sys
-from datetime import datetime
 from pathlib import Path
 from typing import IO
 from ...models.events import EventType, FetchEvent
+from ...time_utils import utc_now_iso
 from ..base import EventEmitter, PageContext
 logger = logging.getLogger(__name__)
@@ -77,7 +77,7 @@ class NdjsonSaveStep:
             "title": ctx.title,
             "source_type": ctx.source_type,
             "metadata": ctx.metadata,
-            "fetched_at": datetime.now().isoformat(),
+            "fetched_at": utc_now_iso(),
         }
         async with self._lock:

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/save_sqlite.py RENAMED Viewed

@@ -5,10 +5,10 @@ from __future__ import annotations
 import json
 import logging
 import sqlite3
-from datetime import datetime
 from pathlib import Path
 from ...models.events import EventType, FetchEvent
+from ...time_utils import utc_now_iso
 from ..base import EventEmitter, PageContext
 logger = logging.getLogger(__name__)
@@ -109,7 +109,7 @@ class SqliteSaveStep:
                     ctx.title,
                     ctx.markdown,
                     json.dumps(ctx.metadata, ensure_ascii=False),
-                    datetime.now().isoformat(),
+                    utc_now_iso(),
                 ),
             )
             # Only count if a row was actually inserted (not ignored)

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/security/robots.py RENAMED Viewed

@@ -348,7 +348,8 @@ class RobotsChecker:
             delay = parser.crawl_delay(self.user_agent)
             if delay is not None:
                 return float(delay)
-        except (TypeError, ValueError):
+        except (TypeError, ValueError) as err:
+            self.logger.debug("Ignoring invalid Crawl-delay for %s: %s", url, err)
             return None
         return None
@@ -372,7 +373,8 @@ class RobotsChecker:
         try:
             sitemaps = parser.site_maps()
             return list(sitemaps) if sitemaps else []
-        except Exception:
+        except Exception as err:
+            self.logger.debug("Could not read Sitemap entries for %s: %s", url, err)
             return []
     def clear_cache(self) -> None:

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/security/url_validator.py RENAMED Viewed

@@ -150,9 +150,7 @@ class UrlValidator:
             ipaddress.ip_address(normalized)
             return [normalized]
         except ValueError:
-            pass
-        return self._resolver(normalized)
+            return self._resolver(normalized)
     def _resolve_hostname(self, hostname: str) -> list[str]:
         """Resolve hostname to a deduplicated list of IP addresses."""
@@ -212,11 +210,13 @@ class UrlValidator:
         This closes the gap where attacker-controlled DNS maps a public-looking
         hostname to a private or loopback address.
         """
+        is_hostname_ip = True
         try:
             ipaddress.ip_address(hostname)
-            return None
         except ValueError:
-            pass
+            is_hostname_ip = False
+        if is_hostname_ip:
+            return None
         try:
             addresses = self._resolver(hostname)

docpull-3.0.1/src/docpull/time_utils.py ADDED Viewed

@@ -0,0 +1,29 @@
+"""UTC time helpers for persisted docpull data."""
+from __future__ import annotations
+from datetime import datetime, timezone
+def utc_now() -> datetime:
+    """Return the current instant as a timezone-aware UTC datetime."""
+    return datetime.now(timezone.utc)
+def utc_now_iso() -> str:
+    """Return the current instant as an ISO-8601 UTC timestamp."""
+    return utc_now().isoformat()
+def parse_persisted_datetime(value: str) -> datetime:
+    """Parse a stored timestamp and normalize it to timezone-aware UTC.
+    Older cache files used naive local timestamps. Treat those legacy values
+    as UTC so comparisons stay deterministic after newer writes include an
+    explicit ``+00:00`` offset.
+    """
+    normalized = value[:-1] + "+00:00" if value.endswith("Z") else value
+    parsed = datetime.fromisoformat(normalized)
+    if parsed.tzinfo is None:
+        return parsed.replace(tzinfo=timezone.utc)
+    return parsed.astimezone(timezone.utc)

{docpull-3.0.0 → docpull-3.0.1/src/docpull.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docpull
-Version: 3.0.0
+Version: 3.0.1
 Summary: Pull documentation from the web and convert to clean markdown
 Author-email: Zachary Roth <support@raintree.technology>
 Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -43,9 +43,12 @@ Requires-Dist: html2text>=2020.1.16
 Requires-Dist: defusedxml>=0.7.1
 Requires-Dist: extruct>=0.15.0
 Requires-Dist: aiohttp>=3.9.0
+Requires-Dist: idna>=3.15
+Requires-Dist: regex>=2024.11.6
 Requires-Dist: rich>=13.0.0
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: pydantic>=2.0
+Requires-Dist: urllib3>=2.7.0
 Provides-Extra: proxy
 Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
 Provides-Extra: normalize
@@ -56,6 +59,8 @@ Provides-Extra: tokens
 Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
 Provides-Extra: mcp
 Requires-Dist: mcp>=1.0.0; extra == "mcp"
+Requires-Dist: python-multipart>=0.0.27; extra == "mcp"
+Requires-Dist: starlette>=1.0.1; extra == "mcp"
 Provides-Extra: llm
 Requires-Dist: tiktoken>=0.7.0; extra == "llm"
 Provides-Extra: all
@@ -64,6 +69,8 @@ Requires-Dist: url-normalize>=1.4.0; extra == "all"
 Requires-Dist: trafilatura>=1.12.0; extra == "all"
 Requires-Dist: tiktoken>=0.7.0; extra == "all"
 Requires-Dist: mcp>=1.0.0; extra == "all"
+Requires-Dist: python-multipart>=0.0.27; extra == "all"
+Requires-Dist: starlette>=1.0.1; extra == "all"
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0.0; extra == "dev"
 Requires-Dist: pytest-cov>=4.0.0; extra == "dev"

{docpull-3.0.0 → docpull-3.0.1}/src/docpull.egg-info/SOURCES.txt RENAMED Viewed

@@ -8,6 +8,7 @@ src/docpull/doctor.py
 src/docpull/logging_config.py
 src/docpull/metadata_extractor.py
 src/docpull/py.typed
+src/docpull/time_utils.py
 src/docpull.egg-info/PKG-INFO
 src/docpull.egg-info/SOURCES.txt
 src/docpull.egg-info/dependency_links.txt
@@ -80,4 +81,5 @@ tests/test_pipeline.py
 tests/test_real_site_regressions.py
 tests/test_save_ndjson.py
 tests/test_security_hardening.py
-tests/test_special_cases.py
+tests/test_special_cases.py
+tests/test_time_utils.py

{docpull-3.0.0 → docpull-3.0.1}/src/docpull.egg-info/requires.txt RENAMED Viewed

@@ -3,9 +3,12 @@ html2text>=2020.1.16
 defusedxml>=0.7.1
 extruct>=0.15.0
 aiohttp>=3.9.0
+idna>=3.15
+regex>=2024.11.6
 rich>=13.0.0
 pyyaml>=6.0
 pydantic>=2.0
+urllib3>=2.7.0
 [all]
 aiohttp-socks>=0.8.0
@@ -13,6 +16,8 @@ url-normalize>=1.4.0
 trafilatura>=1.12.0
 tiktoken>=0.7.0
 mcp>=1.0.0
+python-multipart>=0.0.27
+starlette>=1.0.1
 [dev]
 pytest>=7.0.0
@@ -33,6 +38,8 @@ tiktoken>=0.7.0
 [mcp]
 mcp>=1.0.0
+python-multipart>=0.0.27
+starlette>=1.0.1
 [normalize]
 url-normalize>=1.4.0

{docpull-3.0.0 → docpull-3.0.1}/tests/test_mcp_tools.py RENAMED Viewed

@@ -22,6 +22,7 @@ from docpull.mcp.tools import (
     read_doc,
     remove_source,
 )
+from docpull.security.url_validator import UrlValidationResult
 def test_builtin_sources_include_common_libraries():
@@ -329,6 +330,18 @@ def test_grep_docs_rejects_invalid_regex(tmp_path):
     assert "Invalid pattern" in result.text
+def test_grep_docs_times_out_pathological_regex(tmp_path, monkeypatch):
+    lib = tmp_path / "lib"
+    lib.mkdir()
+    (lib / "a.md").write_text("a" * 20_000 + "!")
+    monkeypatch.setattr("docpull.mcp.tools.GREP_LINE_TIMEOUT_SECONDS", 0.001)
+    result = grep_docs(r"(a+)+$", docs_dir=tmp_path)
+    assert result.is_error is False
+    assert result.data["timed_out"] is True
 # --- Robustness -------------------------------------------------------
@@ -341,6 +354,44 @@ def test_load_user_sources_logs_yaml_error(tmp_path, caplog):
     assert any("Failed to parse" in rec.message for rec in caplog.records)
+def test_load_user_sources_rejects_unsafe_manual_entries(tmp_path, caplog, monkeypatch):
+    class FakeValidator:
+        def validate(self, url: str) -> UrlValidationResult:
+            if "blocked.example" in url:
+                return UrlValidationResult.invalid("blocked test host")
+            return UrlValidationResult.valid()
+    monkeypatch.setattr("docpull.mcp.sources._USER_SOURCE_URL_VALIDATOR", FakeValidator())
+    path = tmp_path / "sources.yaml"
+    path.write_text(
+        """
+sources:
+  good:
+    url: https://example.com/docs
+    max_pages: "5"
+  ../bad:
+    url: https://example.com/docs
+  plain_http:
+    url: http://example.com/docs
+  blocked:
+    url: https://blocked.example/docs
+  too_many:
+    url: https://example.com/docs
+    max_pages: 100001
+"""
+    )
+    with caplog.at_level(logging.WARNING, logger="docpull.mcp.sources"):
+        sources = load_user_sources(path=path)
+    assert list(sources) == ["good"]
+    assert sources["good"].max_pages == 5
+    assert any("unsafe source name" in rec.message for rec in caplog.records)
+    assert any("url must be an HTTPS URL" in rec.message for rec in caplog.records)
+    assert any("blocked test host" in rec.message for rec in caplog.records)
+    assert any("max_pages must be between" in rec.message for rec in caplog.records)
 def test_partial_meta_treats_cache_as_stale(tmp_path):
     """A meta file marked partial=true should not be considered fresh."""
     import json

docpull-3.0.1/tests/test_time_utils.py ADDED Viewed

@@ -0,0 +1,23 @@
+from __future__ import annotations
+from datetime import timezone
+from docpull.time_utils import parse_persisted_datetime, utc_now_iso
+def test_utc_now_iso_is_timezone_explicit() -> None:
+    assert utc_now_iso().endswith("+00:00")
+def test_parse_persisted_datetime_normalizes_legacy_naive_values() -> None:
+    parsed = parse_persisted_datetime("2026-04-26T00:00:00")
+    assert parsed.tzinfo == timezone.utc
+    assert parsed.isoformat() == "2026-04-26T00:00:00+00:00"
+def test_parse_persisted_datetime_accepts_z_suffix() -> None:
+    parsed = parse_persisted_datetime("2026-04-26T00:00:00Z")
+    assert parsed.tzinfo == timezone.utc
+    assert parsed.isoformat() == "2026-04-26T00:00:00+00:00"

{docpull-3.0.0 → docpull-3.0.1}/LICENSE RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/README.md RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/setup.cfg RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/__init__.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/__main__.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/cache/__init__.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/cache/streaming_dedup.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/concurrency/__init__.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/concurrency/manager.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/__init__.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/chunking.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/extractor.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/markdown.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/protocols.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/trafilatura_extractor.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/core/__init__.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/core/fetcher.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/__init__.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/composite.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/crawler.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/filters.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/link_extractors/__init__.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/link_extractors/protocols.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/protocols.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/sitemap.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/doctor.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/http/__init__.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/http/client.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/http/protocols.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/http/rate_limiter.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/logging_config.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/mcp/__init__.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/mcp/server.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/metadata_extractor.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/models/__init__.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/models/events.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/models/profiles.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/__init__.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/base.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/__init__.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/chunk.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/convert.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/dedup.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/fetch.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/metadata.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/save.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/validate.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/py.typed RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull/security/__init__.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull.egg-info/entry_points.txt RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/src/docpull.egg-info/top_level.txt RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/tests/test_cache_conditional_get.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/tests/test_chunking.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/tests/test_cli.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/tests/test_conversion.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/tests/test_convert_step_new.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/tests/test_discovery.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/tests/test_integration.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/tests/test_link_extractors.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/tests/test_naming.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/tests/test_pipeline.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/tests/test_real_site_regressions.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/tests/test_save_ndjson.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/tests/test_security_hardening.py RENAMED Viewed

File without changes

{docpull-3.0.0 → docpull-3.0.1}/tests/test_special_cases.py RENAMED Viewed

File without changes

docpull 3.0.0__tar.gz → 3.0.1__tar.gz

docpull 3.0.0tar.gz → 3.0.1tar.gz