docpull 3.0.0__tar.gz → 3.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. {docpull-3.0.0/src/docpull.egg-info → docpull-3.0.1}/PKG-INFO +8 -1
  2. {docpull-3.0.0 → docpull-3.0.1}/pyproject.toml +24 -5
  3. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/cache/manager.py +10 -8
  4. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/cli.py +4 -6
  5. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/special_cases.py +6 -4
  6. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/link_extractors/enhanced.py +2 -1
  7. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/link_extractors/static.py +2 -1
  8. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/mcp/sources.py +63 -6
  9. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/mcp/tools.py +22 -11
  10. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/models/config.py +4 -2
  11. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/save_json.py +4 -4
  12. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/save_ndjson.py +2 -2
  13. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/save_sqlite.py +2 -2
  14. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/security/robots.py +4 -2
  15. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/security/url_validator.py +5 -5
  16. docpull-3.0.1/src/docpull/time_utils.py +29 -0
  17. {docpull-3.0.0 → docpull-3.0.1/src/docpull.egg-info}/PKG-INFO +8 -1
  18. {docpull-3.0.0 → docpull-3.0.1}/src/docpull.egg-info/SOURCES.txt +3 -1
  19. {docpull-3.0.0 → docpull-3.0.1}/src/docpull.egg-info/requires.txt +7 -0
  20. {docpull-3.0.0 → docpull-3.0.1}/tests/test_mcp_tools.py +51 -0
  21. docpull-3.0.1/tests/test_time_utils.py +23 -0
  22. {docpull-3.0.0 → docpull-3.0.1}/LICENSE +0 -0
  23. {docpull-3.0.0 → docpull-3.0.1}/README.md +0 -0
  24. {docpull-3.0.0 → docpull-3.0.1}/setup.cfg +0 -0
  25. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/__init__.py +0 -0
  26. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/__main__.py +0 -0
  27. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/cache/__init__.py +0 -0
  28. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/cache/streaming_dedup.py +0 -0
  29. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/concurrency/__init__.py +0 -0
  30. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/concurrency/manager.py +0 -0
  31. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/__init__.py +0 -0
  32. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/chunking.py +0 -0
  33. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/extractor.py +0 -0
  34. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/markdown.py +0 -0
  35. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/protocols.py +0 -0
  36. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/trafilatura_extractor.py +0 -0
  37. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/core/__init__.py +0 -0
  38. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/core/fetcher.py +0 -0
  39. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/__init__.py +0 -0
  40. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/composite.py +0 -0
  41. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/crawler.py +0 -0
  42. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/filters.py +0 -0
  43. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/link_extractors/__init__.py +0 -0
  44. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/link_extractors/protocols.py +0 -0
  45. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/protocols.py +0 -0
  46. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/sitemap.py +0 -0
  47. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/doctor.py +0 -0
  48. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/http/__init__.py +0 -0
  49. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/http/client.py +0 -0
  50. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/http/protocols.py +0 -0
  51. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/http/rate_limiter.py +0 -0
  52. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/logging_config.py +0 -0
  53. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/mcp/__init__.py +0 -0
  54. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/mcp/server.py +0 -0
  55. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/metadata_extractor.py +0 -0
  56. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/models/__init__.py +0 -0
  57. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/models/events.py +0 -0
  58. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/models/profiles.py +0 -0
  59. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/__init__.py +0 -0
  60. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/base.py +0 -0
  61. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/__init__.py +0 -0
  62. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/chunk.py +0 -0
  63. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/convert.py +0 -0
  64. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/dedup.py +0 -0
  65. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/fetch.py +0 -0
  66. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/metadata.py +0 -0
  67. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/save.py +0 -0
  68. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/validate.py +0 -0
  69. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/py.typed +0 -0
  70. {docpull-3.0.0 → docpull-3.0.1}/src/docpull/security/__init__.py +0 -0
  71. {docpull-3.0.0 → docpull-3.0.1}/src/docpull.egg-info/dependency_links.txt +0 -0
  72. {docpull-3.0.0 → docpull-3.0.1}/src/docpull.egg-info/entry_points.txt +0 -0
  73. {docpull-3.0.0 → docpull-3.0.1}/src/docpull.egg-info/top_level.txt +0 -0
  74. {docpull-3.0.0 → docpull-3.0.1}/tests/test_cache_conditional_get.py +0 -0
  75. {docpull-3.0.0 → docpull-3.0.1}/tests/test_chunking.py +0 -0
  76. {docpull-3.0.0 → docpull-3.0.1}/tests/test_cli.py +0 -0
  77. {docpull-3.0.0 → docpull-3.0.1}/tests/test_conversion.py +0 -0
  78. {docpull-3.0.0 → docpull-3.0.1}/tests/test_convert_step_new.py +0 -0
  79. {docpull-3.0.0 → docpull-3.0.1}/tests/test_discovery.py +0 -0
  80. {docpull-3.0.0 → docpull-3.0.1}/tests/test_integration.py +0 -0
  81. {docpull-3.0.0 → docpull-3.0.1}/tests/test_link_extractors.py +0 -0
  82. {docpull-3.0.0 → docpull-3.0.1}/tests/test_naming.py +0 -0
  83. {docpull-3.0.0 → docpull-3.0.1}/tests/test_pipeline.py +0 -0
  84. {docpull-3.0.0 → docpull-3.0.1}/tests/test_real_site_regressions.py +0 -0
  85. {docpull-3.0.0 → docpull-3.0.1}/tests/test_save_ndjson.py +0 -0
  86. {docpull-3.0.0 → docpull-3.0.1}/tests/test_security_hardening.py +0 -0
  87. {docpull-3.0.0 → docpull-3.0.1}/tests/test_special_cases.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 3.0.0
3
+ Version: 3.0.1
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -43,9 +43,12 @@ Requires-Dist: html2text>=2020.1.16
43
43
  Requires-Dist: defusedxml>=0.7.1
44
44
  Requires-Dist: extruct>=0.15.0
45
45
  Requires-Dist: aiohttp>=3.9.0
46
+ Requires-Dist: idna>=3.15
47
+ Requires-Dist: regex>=2024.11.6
46
48
  Requires-Dist: rich>=13.0.0
47
49
  Requires-Dist: pyyaml>=6.0
48
50
  Requires-Dist: pydantic>=2.0
51
+ Requires-Dist: urllib3>=2.7.0
49
52
  Provides-Extra: proxy
50
53
  Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
51
54
  Provides-Extra: normalize
@@ -56,6 +59,8 @@ Provides-Extra: tokens
56
59
  Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
57
60
  Provides-Extra: mcp
58
61
  Requires-Dist: mcp>=1.0.0; extra == "mcp"
62
+ Requires-Dist: python-multipart>=0.0.27; extra == "mcp"
63
+ Requires-Dist: starlette>=1.0.1; extra == "mcp"
59
64
  Provides-Extra: llm
60
65
  Requires-Dist: tiktoken>=0.7.0; extra == "llm"
61
66
  Provides-Extra: all
@@ -64,6 +69,8 @@ Requires-Dist: url-normalize>=1.4.0; extra == "all"
64
69
  Requires-Dist: trafilatura>=1.12.0; extra == "all"
65
70
  Requires-Dist: tiktoken>=0.7.0; extra == "all"
66
71
  Requires-Dist: mcp>=1.0.0; extra == "all"
72
+ Requires-Dist: python-multipart>=0.0.27; extra == "all"
73
+ Requires-Dist: starlette>=1.0.1; extra == "all"
67
74
  Provides-Extra: dev
68
75
  Requires-Dist: pytest>=7.0.0; extra == "dev"
69
76
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "3.0.0"
7
+ version = "3.0.1"
8
8
  dynamic = []
9
9
  description = "Pull documentation from the web and convert to clean markdown"
10
10
  readme = {file = "README.md", content-type = "text/markdown"}
@@ -67,9 +67,12 @@ dependencies = [
67
67
  "defusedxml>=0.7.1",
68
68
  "extruct>=0.15.0",
69
69
  "aiohttp>=3.9.0",
70
+ "idna>=3.15",
71
+ "regex>=2024.11.6",
70
72
  "rich>=13.0.0",
71
73
  "pyyaml>=6.0",
72
74
  "pydantic>=2.0",
75
+ "urllib3>=2.7.0",
73
76
  ]
74
77
 
75
78
  [project.optional-dependencies]
@@ -87,6 +90,8 @@ tokens = [
87
90
  ]
88
91
  mcp = [
89
92
  "mcp>=1.0.0",
93
+ "python-multipart>=0.0.27",
94
+ "starlette>=1.0.1",
90
95
  ]
91
96
  llm = [
92
97
  "tiktoken>=0.7.0",
@@ -97,6 +102,8 @@ all = [
97
102
  "trafilatura>=1.12.0",
98
103
  "tiktoken>=0.7.0",
99
104
  "mcp>=1.0.0",
105
+ "python-multipart>=0.0.27",
106
+ "starlette>=1.0.1",
100
107
  ]
101
108
  dev = [
102
109
  "pytest>=7.0.0",
@@ -170,10 +177,22 @@ module = "docpull.models.*"
170
177
  disallow_any_unimported = false
171
178
  warn_return_any = false
172
179
 
173
- [[tool.mypy.overrides]]
174
- module = "tests.*"
175
- disallow_untyped_defs = false
176
- disallow_any_unimported = false
180
+ [tool.bandit]
181
+ # Policy: every entry in `skips` MUST have a one-line justification
182
+ # above it explaining what bandit found, why it's a false positive
183
+ # *for this codebase*, and (if narrow) why a `# nosec BXXX # reason`
184
+ # annotation at the call site would have been worse. Bandit skips
185
+ # silence findings repo-wide, so the bar to add one is higher than
186
+ # silencing a single line. If a new skip is unavoidable, add it here
187
+ # in PR review, not as a drive-by.
188
+ #
189
+ # B101 (assert_used) — flags every `assert x is not None` we use for
190
+ # type narrowing. Bandit's concern is that assertions vanish under
191
+ # `python -O`. docpull is a CLI / SDK, never invoked with -O, and the
192
+ # narrowing asserts are not load-bearing safety checks. Skipping the
193
+ # rule globally keeps the existing idiom without 8+ inline `# nosec`
194
+ # annotations in fetcher.py / pipeline/steps/convert.py.
195
+ skips = ["B101"]
177
196
 
178
197
  [tool.pytest.ini_options]
179
198
  minversion = "7.0"
@@ -5,10 +5,12 @@ from __future__ import annotations
5
5
  import hashlib
6
6
  import json
7
7
  import logging
8
- from datetime import datetime, timedelta
8
+ from datetime import timedelta
9
9
  from pathlib import Path
10
10
  from typing import TypedDict
11
11
 
12
+ from ..time_utils import parse_persisted_datetime, utc_now, utc_now_iso
13
+
12
14
  logger = logging.getLogger(__name__)
13
15
 
14
16
  # Default TTL for cache entries (30 days)
@@ -257,7 +259,7 @@ class CacheManager:
257
259
  self.manifest[url] = {
258
260
  "checksum": self.compute_checksum(content),
259
261
  "file_path": str(file_path),
260
- "fetched_at": datetime.now().isoformat(),
262
+ "fetched_at": utc_now_iso(),
261
263
  "size": len(content),
262
264
  }
263
265
 
@@ -314,7 +316,7 @@ class CacheManager:
314
316
  Note:
315
317
  Changes are batched. Call flush() to persist to disk.
316
318
  """
317
- self._state.last_run = datetime.now().isoformat()
319
+ self._state.last_run = utc_now_iso()
318
320
  self._state_dirty = True
319
321
 
320
322
  def clear_state(self) -> None:
@@ -354,18 +356,18 @@ class CacheManager:
354
356
  if ttl is None:
355
357
  return 0
356
358
 
357
- cutoff = datetime.now() - timedelta(days=ttl)
359
+ cutoff = utc_now() - timedelta(days=ttl)
358
360
  to_remove = []
359
361
 
360
362
  for url, entry in self.manifest.items():
361
363
  fetched_at = entry.get("fetched_at")
362
364
  if fetched_at:
363
365
  try:
364
- entry_time = datetime.fromisoformat(fetched_at)
366
+ entry_time = parse_persisted_datetime(fetched_at)
365
367
  if entry_time < cutoff:
366
368
  to_remove.append(url)
367
- except ValueError:
368
- pass # Invalid date format, skip
369
+ except ValueError as err:
370
+ logger.warning("Invalid cache timestamp for %s: %s", url, err)
369
371
 
370
372
  for url in to_remove:
371
373
  del self.manifest[url]
@@ -413,7 +415,7 @@ class CacheManager:
413
415
  """
414
416
  data: DiscoveredUrlsState = {
415
417
  "start_url": start_url,
416
- "discovered_at": datetime.now().isoformat(),
418
+ "discovered_at": utc_now_iso(),
417
419
  "urls": urls,
418
420
  }
419
421
  try:
@@ -13,12 +13,10 @@ if "--doctor" in sys.argv:
13
13
 
14
14
  output_dir = None
15
15
  if "--output-dir" in sys.argv or "-o" in sys.argv:
16
- try:
17
- flag_idx = sys.argv.index("--output-dir") if "--output-dir" in sys.argv else sys.argv.index("-o")
18
- if flag_idx + 1 < len(sys.argv):
19
- output_dir = Path(sys.argv[flag_idx + 1])
20
- except (ValueError, IndexError):
21
- pass
16
+ flag = "--output-dir" if "--output-dir" in sys.argv else "-o"
17
+ flag_idx = sys.argv.index(flag)
18
+ if flag_idx + 1 < len(sys.argv):
19
+ output_dir = Path(sys.argv[flag_idx + 1])
22
20
  sys.exit(run_doctor(output_dir=output_dir))
23
21
 
24
22
  # Verify core dependencies
@@ -320,7 +320,8 @@ class OpenApiExtractor:
320
320
  return None
321
321
  try:
322
322
  data = json.loads(text)
323
- except json.JSONDecodeError:
323
+ except json.JSONDecodeError as err:
324
+ logger.debug("OpenAPI extractor skipped %s: JSON parse failed: %s", url, err)
324
325
  return None
325
326
  if not isinstance(data, dict):
326
327
  return None
@@ -569,8 +570,8 @@ def looks_like_spa(html: bytes, min_body_ratio: float = 0.05) -> bool:
569
570
  """Heuristic: does this HTML appear to be a JS-only SPA?
570
571
 
571
572
  True when the non-script body text is very small relative to the overall
572
- page size and the page contains script tags. Not perfect, but good enough
573
- to warn an agent before it consumes empty Markdown.
573
+ page size and the page contains script tags. This is a conservative signal
574
+ for warning an agent before it consumes empty Markdown.
574
575
  """
575
576
  if len(html) < 500:
576
577
  return False
@@ -578,7 +579,8 @@ def looks_like_spa(html: bytes, min_body_ratio: float = 0.05) -> bool:
578
579
  return False
579
580
  try:
580
581
  soup = _soup(html)
581
- except Exception: # noqa: BLE001
582
+ except Exception as err: # noqa: BLE001
583
+ logger.debug("SPA heuristic skipped malformed HTML: %s", err)
582
584
  return False
583
585
  # Remove scripts/styles before measuring.
584
586
  for tag in soup(["script", "style", "noscript"]):
@@ -295,7 +295,8 @@ class EnhancedLinkExtractor:
295
295
 
296
296
  try:
297
297
  absolute_url = urljoin(base_url, href)
298
- except Exception:
298
+ except Exception as err:
299
+ logger.debug("Could not resolve href %r against %s: %s", href, base_url, err)
299
300
  return None
300
301
 
301
302
  # Validate it's a proper URL
@@ -148,7 +148,8 @@ class StaticLinkExtractor:
148
148
  """
149
149
  try:
150
150
  absolute_url = urljoin(base_url, href)
151
- except Exception:
151
+ except Exception as err:
152
+ logger.debug("Could not resolve href %r against %s: %s", href, base_url, err)
152
153
  return None
153
154
 
154
155
  # Remove fragment
@@ -7,9 +7,12 @@ import os
7
7
  import re
8
8
  from dataclasses import dataclass
9
9
  from pathlib import Path
10
+ from urllib.parse import urlparse
10
11
 
11
12
  import yaml
12
13
 
14
+ from ..security.url_validator import UrlValidator
15
+
13
16
  logger = logging.getLogger(__name__)
14
17
 
15
18
 
@@ -54,6 +57,9 @@ BUILTIN_SOURCES: dict[str, SourceConfig] = {
54
57
 
55
58
  _URL_SCHEME_RE = re.compile(r"^[a-z][a-z0-9+.-]*://", re.IGNORECASE)
56
59
  _LIBRARY_NAME_RE = re.compile(r"^[a-zA-Z0-9_.-]+$")
60
+ MAX_LIBRARY_NAME_LENGTH = 128
61
+ MAX_USER_SOURCE_PAGES = 100_000
62
+ _USER_SOURCE_URL_VALIDATOR = UrlValidator(allowed_schemes={"https"})
57
63
 
58
64
 
59
65
  def is_safe_library_name(name: str) -> bool:
@@ -61,11 +67,44 @@ def is_safe_library_name(name: str) -> bool:
61
67
 
62
68
  Allows alnum + ``_ . -``; rejects separators, ``..``, leading dot.
63
69
  """
64
- if not name or name.startswith(".") or name == ".." or len(name) > 128:
70
+ if not name or name.startswith(".") or name == ".." or len(name) > MAX_LIBRARY_NAME_LENGTH:
65
71
  return False
66
72
  return bool(_LIBRARY_NAME_RE.fullmatch(name))
67
73
 
68
74
 
75
+ def _is_https_url(url: str) -> bool:
76
+ parsed = urlparse(url)
77
+ return parsed.scheme.lower() == "https" and parsed.hostname is not None
78
+
79
+
80
+ def _is_allowed_source_url(url: str) -> tuple[bool, str | None]:
81
+ if not _is_https_url(url):
82
+ return (False, "url must be an HTTPS URL")
83
+ validation = _USER_SOURCE_URL_VALIDATOR.validate(url)
84
+ if not validation.is_valid:
85
+ return (False, validation.rejection_reason or "url rejected by validator")
86
+ return (True, None)
87
+
88
+
89
+ def _coerce_max_pages(value: object, source_name: str) -> int | None:
90
+ if value is None:
91
+ return None
92
+ if isinstance(value, bool):
93
+ raise ValueError(f"source '{source_name}' max_pages must be an integer")
94
+ if isinstance(value, int):
95
+ parsed = value
96
+ elif isinstance(value, str):
97
+ try:
98
+ parsed = int(value)
99
+ except ValueError as err:
100
+ raise ValueError(f"source '{source_name}' max_pages must be an integer") from err
101
+ else:
102
+ raise ValueError(f"source '{source_name}' max_pages must be an integer")
103
+ if parsed < 1 or parsed > MAX_USER_SOURCE_PAGES:
104
+ raise ValueError(f"source '{source_name}' max_pages must be between 1 and {MAX_USER_SOURCE_PAGES}")
105
+ return parsed
106
+
107
+
69
108
  def default_config_dir() -> Path:
70
109
  env = os.environ.get("XDG_CONFIG_HOME")
71
110
  base = Path(env) if env else Path.home() / ".config"
@@ -98,13 +137,31 @@ def load_user_sources(path: Path | None = None) -> dict[str, SourceConfig]:
98
137
  entries = raw.get("sources") or {}
99
138
  result: dict[str, SourceConfig] = {}
100
139
  for name, cfg in entries.items():
101
- if not isinstance(cfg, dict) or not isinstance(cfg.get("url"), str):
140
+ source_name = str(name)
141
+ if not is_safe_library_name(source_name):
142
+ logger.warning("Ignoring unsafe source name in %s: %r", path, source_name)
143
+ continue
144
+ if not isinstance(cfg, dict):
145
+ logger.warning("Ignoring source %s in %s: entry must be a mapping", source_name, path)
146
+ continue
147
+ url = cfg.get("url")
148
+ if not isinstance(url, str):
149
+ logger.warning("Ignoring source %s in %s: url must be an HTTPS URL", source_name, path)
150
+ continue
151
+ url_allowed, url_reason = _is_allowed_source_url(url)
152
+ if not url_allowed:
153
+ logger.warning("Ignoring source %s in %s: %s", source_name, path, url_reason)
154
+ continue
155
+ try:
156
+ max_pages = _coerce_max_pages(cfg.get("maxPages") or cfg.get("max_pages"), source_name)
157
+ except ValueError as err:
158
+ logger.warning("Ignoring source %s in %s: %s", source_name, path, err)
102
159
  continue
103
- result[str(name)] = SourceConfig(
104
- url=cfg["url"],
160
+ result[source_name] = SourceConfig(
161
+ url=url,
105
162
  description=str(cfg.get("description", "")),
106
163
  category=str(cfg.get("category", "user")),
107
- max_pages=cfg.get("maxPages") or cfg.get("max_pages"),
164
+ max_pages=max_pages,
108
165
  )
109
166
  return result
110
167
 
@@ -122,7 +179,7 @@ def resolve_source(name: str) -> SourceConfig | None:
122
179
  be routed through configured aliases so that policy (max_pages, category)
123
180
  lives in one place.
124
181
  """
125
- if _URL_SCHEME_RE.match(name):
182
+ if _URL_SCHEME_RE.match(name) or not is_safe_library_name(name):
126
183
  return None
127
184
  return all_sources().get(name)
128
185
 
@@ -14,20 +14,20 @@ from __future__ import annotations
14
14
  import json
15
15
  import logging
16
16
  import os
17
- import re
18
17
  import shutil
19
18
  import time
20
19
  from collections.abc import Awaitable, Callable
21
20
  from dataclasses import dataclass
22
- from datetime import datetime
23
21
  from pathlib import Path
24
22
  from typing import Any
25
23
 
24
+ import regex
26
25
  import yaml
27
26
 
28
27
  from ..core.fetcher import Fetcher
29
28
  from ..models.config import CrawlConfig, DocpullConfig, OutputConfig, ProfileName
30
29
  from ..security.url_validator import UrlValidator
30
+ from ..time_utils import utc_now_iso
31
31
  from .sources import (
32
32
  _URL_SCHEME_RE,
33
33
  BUILTIN_SOURCES,
@@ -44,6 +44,7 @@ logger = logging.getLogger(__name__)
44
44
  CACHE_TTL_SECONDS = 7 * 24 * 60 * 60 # 7 days
45
45
  MAX_GREP_PATTERN_LEN = 1000
46
46
  GREP_TIMEOUT_SECONDS = 10.0
47
+ GREP_LINE_TIMEOUT_SECONDS = 0.05
47
48
  MAX_READ_DOC_BYTES = 1_000_000
48
49
 
49
50
  _FETCH_URL_VALIDATOR = UrlValidator(allowed_schemes={"https"})
@@ -99,7 +100,7 @@ def _write_meta(meta_path: Path, source: str, url: str, pages: int) -> None:
99
100
  "source": source,
100
101
  "url": url,
101
102
  "fetched_at_epoch": time.time(),
102
- "fetched_at": datetime.now().isoformat(),
103
+ "fetched_at": utc_now_iso(),
103
104
  "page_count": pages,
104
105
  },
105
106
  indent=2,
@@ -118,7 +119,7 @@ def _write_partial_meta(meta_path: Path, source: str, url: str, pages: int) -> N
118
119
  "source": source,
119
120
  "url": url,
120
121
  "fetched_at_epoch": time.time(),
121
- "fetched_at": datetime.now().isoformat(),
122
+ "fetched_at": utc_now_iso(),
122
123
  "page_count": pages,
123
124
  "partial": True,
124
125
  },
@@ -182,6 +183,11 @@ async def ensure_docs(
182
183
  "and call ensure_docs with that name.",
183
184
  is_error=True,
184
185
  )
186
+ if not is_safe_library_name(source):
187
+ return ToolResult(
188
+ f"Invalid source name '{source}'. Use names from list_sources.",
189
+ is_error=True,
190
+ )
185
191
  available = ", ".join(sorted(all_sources().keys()))
186
192
  return ToolResult(
187
193
  f"Unknown source '{source}'. Available: {available}",
@@ -409,9 +415,7 @@ def grep_docs(
409
415
 
410
416
  Hardened against (a) path traversal via ``library`` (rejected by
411
417
  ``is_safe_library_name``) and (b) catastrophic regex via a pattern
412
- length cap and a wall-clock budget. Python's ``re`` has no built-in
413
- timeout, so the budget is checked between files; a single pathological
414
- pattern+line combination can still wedge for one file's worth of work.
418
+ length cap, a total wall-clock budget, and a per-line regex timeout.
415
419
  """
416
420
  docs_dir = docs_dir or default_docs_dir()
417
421
  if not docs_dir.exists():
@@ -430,9 +434,9 @@ def grep_docs(
430
434
  context = max(0, min(context, 3))
431
435
 
432
436
  try:
433
- flags = 0 if case_sensitive else re.IGNORECASE
434
- regex = re.compile(pattern, flags)
435
- except re.error as err:
437
+ flags = 0 if case_sensitive else regex.IGNORECASE
438
+ compiled = regex.compile(pattern, flags)
439
+ except regex.error as err:
436
440
  return ToolResult(f"Invalid pattern: {err}", is_error=True)
437
441
 
438
442
  roots = (
@@ -459,7 +463,12 @@ def grep_docs(
459
463
  continue
460
464
  matches: list[tuple[int, list[str], str, list[str]]] = []
461
465
  for idx, line in enumerate(lines):
462
- if regex.search(line):
466
+ try:
467
+ matched = compiled.search(line, timeout=GREP_LINE_TIMEOUT_SECONDS) is not None
468
+ except TimeoutError:
469
+ timed_out = True
470
+ break
471
+ if matched:
463
472
  before = [lines[i].rstrip() for i in range(max(0, idx - context), idx)] if context else []
464
473
  after = (
465
474
  [lines[i].rstrip() for i in range(idx + 1, min(len(lines), idx + 1 + context))]
@@ -476,6 +485,8 @@ def grep_docs(
476
485
  )
477
486
  )
478
487
  total += len(matches)
488
+ if timed_out:
489
+ break
479
490
  if timed_out:
480
491
  break
481
492
 
@@ -71,8 +71,10 @@ class ByteSize(int):
71
71
  # Try parsing as plain number
72
72
  try:
73
73
  return int(v)
74
- except ValueError:
75
- pass
74
+ except ValueError as err:
75
+ raise ValueError(
76
+ f"Invalid byte size: {v}. Use format like '200kb', '1mb', or integer bytes."
77
+ ) from err
76
78
  raise ValueError(f"Invalid byte size: {v}. Use format like '200kb', '1mb', or integer bytes.")
77
79
 
78
80
 
@@ -7,11 +7,11 @@ import json
7
7
  import logging
8
8
  import os
9
9
  import tempfile
10
- from datetime import datetime
11
10
  from pathlib import Path
12
11
  from typing import TextIO
13
12
 
14
13
  from ...models.events import EventType, FetchEvent
14
+ from ...time_utils import utc_now_iso
15
15
  from ..base import EventEmitter, PageContext
16
16
 
17
17
  logger = logging.getLogger(__name__)
@@ -102,7 +102,7 @@ class JsonSaveStep:
102
102
  "title": ctx.title,
103
103
  "content": ctx.markdown,
104
104
  "metadata": ctx.metadata,
105
- "fetched_at": datetime.now().isoformat(),
105
+ "fetched_at": utc_now_iso(),
106
106
  }
107
107
 
108
108
  f = self._ensure_temp_file()
@@ -142,7 +142,7 @@ class JsonSaveStep:
142
142
  # No documents written - create empty structure
143
143
  self._base_dir.mkdir(parents=True, exist_ok=True)
144
144
  output = {
145
- "generated_at": datetime.now().isoformat(),
145
+ "generated_at": utc_now_iso(),
146
146
  "document_count": 0,
147
147
  "documents": [],
148
148
  }
@@ -154,7 +154,7 @@ class JsonSaveStep:
154
154
  try:
155
155
  # Close the documents array and add metadata
156
156
  self._temp_file.write("\n ],\n")
157
- self._temp_file.write(f' "generated_at": "{datetime.now().isoformat()}",\n')
157
+ self._temp_file.write(f' "generated_at": "{utc_now_iso()}",\n')
158
158
  self._temp_file.write(f' "document_count": {self._document_count}\n')
159
159
  self._temp_file.write("}\n")
160
160
  self._temp_file.close()
@@ -12,11 +12,11 @@ import hashlib
12
12
  import json
13
13
  import logging
14
14
  import sys
15
- from datetime import datetime
16
15
  from pathlib import Path
17
16
  from typing import IO
18
17
 
19
18
  from ...models.events import EventType, FetchEvent
19
+ from ...time_utils import utc_now_iso
20
20
  from ..base import EventEmitter, PageContext
21
21
 
22
22
  logger = logging.getLogger(__name__)
@@ -77,7 +77,7 @@ class NdjsonSaveStep:
77
77
  "title": ctx.title,
78
78
  "source_type": ctx.source_type,
79
79
  "metadata": ctx.metadata,
80
- "fetched_at": datetime.now().isoformat(),
80
+ "fetched_at": utc_now_iso(),
81
81
  }
82
82
 
83
83
  async with self._lock:
@@ -5,10 +5,10 @@ from __future__ import annotations
5
5
  import json
6
6
  import logging
7
7
  import sqlite3
8
- from datetime import datetime
9
8
  from pathlib import Path
10
9
 
11
10
  from ...models.events import EventType, FetchEvent
11
+ from ...time_utils import utc_now_iso
12
12
  from ..base import EventEmitter, PageContext
13
13
 
14
14
  logger = logging.getLogger(__name__)
@@ -109,7 +109,7 @@ class SqliteSaveStep:
109
109
  ctx.title,
110
110
  ctx.markdown,
111
111
  json.dumps(ctx.metadata, ensure_ascii=False),
112
- datetime.now().isoformat(),
112
+ utc_now_iso(),
113
113
  ),
114
114
  )
115
115
  # Only count if a row was actually inserted (not ignored)
@@ -348,7 +348,8 @@ class RobotsChecker:
348
348
  delay = parser.crawl_delay(self.user_agent)
349
349
  if delay is not None:
350
350
  return float(delay)
351
- except (TypeError, ValueError):
351
+ except (TypeError, ValueError) as err:
352
+ self.logger.debug("Ignoring invalid Crawl-delay for %s: %s", url, err)
352
353
  return None
353
354
 
354
355
  return None
@@ -372,7 +373,8 @@ class RobotsChecker:
372
373
  try:
373
374
  sitemaps = parser.site_maps()
374
375
  return list(sitemaps) if sitemaps else []
375
- except Exception:
376
+ except Exception as err:
377
+ self.logger.debug("Could not read Sitemap entries for %s: %s", url, err)
376
378
  return []
377
379
 
378
380
  def clear_cache(self) -> None:
@@ -150,9 +150,7 @@ class UrlValidator:
150
150
  ipaddress.ip_address(normalized)
151
151
  return [normalized]
152
152
  except ValueError:
153
- pass
154
-
155
- return self._resolver(normalized)
153
+ return self._resolver(normalized)
156
154
 
157
155
  def _resolve_hostname(self, hostname: str) -> list[str]:
158
156
  """Resolve hostname to a deduplicated list of IP addresses."""
@@ -212,11 +210,13 @@ class UrlValidator:
212
210
  This closes the gap where attacker-controlled DNS maps a public-looking
213
211
  hostname to a private or loopback address.
214
212
  """
213
+ is_hostname_ip = True
215
214
  try:
216
215
  ipaddress.ip_address(hostname)
217
- return None
218
216
  except ValueError:
219
- pass
217
+ is_hostname_ip = False
218
+ if is_hostname_ip:
219
+ return None
220
220
 
221
221
  try:
222
222
  addresses = self._resolver(hostname)
@@ -0,0 +1,29 @@
1
+ """UTC time helpers for persisted docpull data."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime, timezone
6
+
7
+
8
+ def utc_now() -> datetime:
9
+ """Return the current instant as a timezone-aware UTC datetime."""
10
+ return datetime.now(timezone.utc)
11
+
12
+
13
+ def utc_now_iso() -> str:
14
+ """Return the current instant as an ISO-8601 UTC timestamp."""
15
+ return utc_now().isoformat()
16
+
17
+
18
+ def parse_persisted_datetime(value: str) -> datetime:
19
+ """Parse a stored timestamp and normalize it to timezone-aware UTC.
20
+
21
+ Older cache files used naive local timestamps. Treat those legacy values
22
+ as UTC so comparisons stay deterministic after newer writes include an
23
+ explicit ``+00:00`` offset.
24
+ """
25
+ normalized = value[:-1] + "+00:00" if value.endswith("Z") else value
26
+ parsed = datetime.fromisoformat(normalized)
27
+ if parsed.tzinfo is None:
28
+ return parsed.replace(tzinfo=timezone.utc)
29
+ return parsed.astimezone(timezone.utc)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 3.0.0
3
+ Version: 3.0.1
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -43,9 +43,12 @@ Requires-Dist: html2text>=2020.1.16
43
43
  Requires-Dist: defusedxml>=0.7.1
44
44
  Requires-Dist: extruct>=0.15.0
45
45
  Requires-Dist: aiohttp>=3.9.0
46
+ Requires-Dist: idna>=3.15
47
+ Requires-Dist: regex>=2024.11.6
46
48
  Requires-Dist: rich>=13.0.0
47
49
  Requires-Dist: pyyaml>=6.0
48
50
  Requires-Dist: pydantic>=2.0
51
+ Requires-Dist: urllib3>=2.7.0
49
52
  Provides-Extra: proxy
50
53
  Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
51
54
  Provides-Extra: normalize
@@ -56,6 +59,8 @@ Provides-Extra: tokens
56
59
  Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
57
60
  Provides-Extra: mcp
58
61
  Requires-Dist: mcp>=1.0.0; extra == "mcp"
62
+ Requires-Dist: python-multipart>=0.0.27; extra == "mcp"
63
+ Requires-Dist: starlette>=1.0.1; extra == "mcp"
59
64
  Provides-Extra: llm
60
65
  Requires-Dist: tiktoken>=0.7.0; extra == "llm"
61
66
  Provides-Extra: all
@@ -64,6 +69,8 @@ Requires-Dist: url-normalize>=1.4.0; extra == "all"
64
69
  Requires-Dist: trafilatura>=1.12.0; extra == "all"
65
70
  Requires-Dist: tiktoken>=0.7.0; extra == "all"
66
71
  Requires-Dist: mcp>=1.0.0; extra == "all"
72
+ Requires-Dist: python-multipart>=0.0.27; extra == "all"
73
+ Requires-Dist: starlette>=1.0.1; extra == "all"
67
74
  Provides-Extra: dev
68
75
  Requires-Dist: pytest>=7.0.0; extra == "dev"
69
76
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
@@ -8,6 +8,7 @@ src/docpull/doctor.py
8
8
  src/docpull/logging_config.py
9
9
  src/docpull/metadata_extractor.py
10
10
  src/docpull/py.typed
11
+ src/docpull/time_utils.py
11
12
  src/docpull.egg-info/PKG-INFO
12
13
  src/docpull.egg-info/SOURCES.txt
13
14
  src/docpull.egg-info/dependency_links.txt
@@ -80,4 +81,5 @@ tests/test_pipeline.py
80
81
  tests/test_real_site_regressions.py
81
82
  tests/test_save_ndjson.py
82
83
  tests/test_security_hardening.py
83
- tests/test_special_cases.py
84
+ tests/test_special_cases.py
85
+ tests/test_time_utils.py
@@ -3,9 +3,12 @@ html2text>=2020.1.16
3
3
  defusedxml>=0.7.1
4
4
  extruct>=0.15.0
5
5
  aiohttp>=3.9.0
6
+ idna>=3.15
7
+ regex>=2024.11.6
6
8
  rich>=13.0.0
7
9
  pyyaml>=6.0
8
10
  pydantic>=2.0
11
+ urllib3>=2.7.0
9
12
 
10
13
  [all]
11
14
  aiohttp-socks>=0.8.0
@@ -13,6 +16,8 @@ url-normalize>=1.4.0
13
16
  trafilatura>=1.12.0
14
17
  tiktoken>=0.7.0
15
18
  mcp>=1.0.0
19
+ python-multipart>=0.0.27
20
+ starlette>=1.0.1
16
21
 
17
22
  [dev]
18
23
  pytest>=7.0.0
@@ -33,6 +38,8 @@ tiktoken>=0.7.0
33
38
 
34
39
  [mcp]
35
40
  mcp>=1.0.0
41
+ python-multipart>=0.0.27
42
+ starlette>=1.0.1
36
43
 
37
44
  [normalize]
38
45
  url-normalize>=1.4.0
@@ -22,6 +22,7 @@ from docpull.mcp.tools import (
22
22
  read_doc,
23
23
  remove_source,
24
24
  )
25
+ from docpull.security.url_validator import UrlValidationResult
25
26
 
26
27
 
27
28
  def test_builtin_sources_include_common_libraries():
@@ -329,6 +330,18 @@ def test_grep_docs_rejects_invalid_regex(tmp_path):
329
330
  assert "Invalid pattern" in result.text
330
331
 
331
332
 
333
+ def test_grep_docs_times_out_pathological_regex(tmp_path, monkeypatch):
334
+ lib = tmp_path / "lib"
335
+ lib.mkdir()
336
+ (lib / "a.md").write_text("a" * 20_000 + "!")
337
+ monkeypatch.setattr("docpull.mcp.tools.GREP_LINE_TIMEOUT_SECONDS", 0.001)
338
+
339
+ result = grep_docs(r"(a+)+$", docs_dir=tmp_path)
340
+
341
+ assert result.is_error is False
342
+ assert result.data["timed_out"] is True
343
+
344
+
332
345
  # --- Robustness -------------------------------------------------------
333
346
 
334
347
 
@@ -341,6 +354,44 @@ def test_load_user_sources_logs_yaml_error(tmp_path, caplog):
341
354
  assert any("Failed to parse" in rec.message for rec in caplog.records)
342
355
 
343
356
 
357
+ def test_load_user_sources_rejects_unsafe_manual_entries(tmp_path, caplog, monkeypatch):
358
+ class FakeValidator:
359
+ def validate(self, url: str) -> UrlValidationResult:
360
+ if "blocked.example" in url:
361
+ return UrlValidationResult.invalid("blocked test host")
362
+ return UrlValidationResult.valid()
363
+
364
+ monkeypatch.setattr("docpull.mcp.sources._USER_SOURCE_URL_VALIDATOR", FakeValidator())
365
+ path = tmp_path / "sources.yaml"
366
+ path.write_text(
367
+ """
368
+ sources:
369
+ good:
370
+ url: https://example.com/docs
371
+ max_pages: "5"
372
+ ../bad:
373
+ url: https://example.com/docs
374
+ plain_http:
375
+ url: http://example.com/docs
376
+ blocked:
377
+ url: https://blocked.example/docs
378
+ too_many:
379
+ url: https://example.com/docs
380
+ max_pages: 100001
381
+ """
382
+ )
383
+
384
+ with caplog.at_level(logging.WARNING, logger="docpull.mcp.sources"):
385
+ sources = load_user_sources(path=path)
386
+
387
+ assert list(sources) == ["good"]
388
+ assert sources["good"].max_pages == 5
389
+ assert any("unsafe source name" in rec.message for rec in caplog.records)
390
+ assert any("url must be an HTTPS URL" in rec.message for rec in caplog.records)
391
+ assert any("blocked test host" in rec.message for rec in caplog.records)
392
+ assert any("max_pages must be between" in rec.message for rec in caplog.records)
393
+
394
+
344
395
  def test_partial_meta_treats_cache_as_stale(tmp_path):
345
396
  """A meta file marked partial=true should not be considered fresh."""
346
397
  import json
@@ -0,0 +1,23 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import timezone
4
+
5
+ from docpull.time_utils import parse_persisted_datetime, utc_now_iso
6
+
7
+
8
+ def test_utc_now_iso_is_timezone_explicit() -> None:
9
+ assert utc_now_iso().endswith("+00:00")
10
+
11
+
12
+ def test_parse_persisted_datetime_normalizes_legacy_naive_values() -> None:
13
+ parsed = parse_persisted_datetime("2026-04-26T00:00:00")
14
+
15
+ assert parsed.tzinfo == timezone.utc
16
+ assert parsed.isoformat() == "2026-04-26T00:00:00+00:00"
17
+
18
+
19
+ def test_parse_persisted_datetime_accepts_z_suffix() -> None:
20
+ parsed = parse_persisted_datetime("2026-04-26T00:00:00Z")
21
+
22
+ assert parsed.tzinfo == timezone.utc
23
+ assert parsed.isoformat() == "2026-04-26T00:00:00+00:00"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes