docpull 2.3.0__tar.gz → 2.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. {docpull-2.3.0/src/docpull.egg-info → docpull-2.4.0}/PKG-INFO +27 -2
  2. {docpull-2.3.0 → docpull-2.4.0}/README.md +26 -1
  3. {docpull-2.3.0 → docpull-2.4.0}/pyproject.toml +1 -1
  4. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/__init__.py +1 -1
  5. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/cli.py +62 -9
  6. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/concurrency/manager.py +16 -4
  7. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/conversion/extractor.py +114 -1
  8. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/conversion/markdown.py +50 -4
  9. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/conversion/special_cases.py +237 -27
  10. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/core/fetcher.py +424 -151
  11. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/composite.py +2 -3
  12. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/protocols.py +2 -2
  13. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/http/client.py +17 -5
  14. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/logging_config.py +2 -3
  15. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/mcp/server.py +21 -3
  16. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/mcp/tools.py +147 -25
  17. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/models/config.py +100 -10
  18. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/models/events.py +12 -12
  19. docpull-2.4.0/src/docpull/models/profiles.py +145 -0
  20. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/base.py +2 -1
  21. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/chunk.py +1 -2
  22. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/convert.py +90 -7
  23. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/dedup.py +11 -6
  24. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/fetch.py +87 -7
  25. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/metadata.py +3 -4
  26. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/save.py +67 -3
  27. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/save_ndjson.py +2 -2
  28. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/validate.py +18 -17
  29. {docpull-2.3.0 → docpull-2.4.0/src/docpull.egg-info}/PKG-INFO +27 -2
  30. {docpull-2.3.0 → docpull-2.4.0}/src/docpull.egg-info/SOURCES.txt +2 -0
  31. docpull-2.4.0/tests/test_cache_conditional_get.py +187 -0
  32. {docpull-2.3.0 → docpull-2.4.0}/tests/test_fixes_v2_3_0.py +1 -1
  33. {docpull-2.3.0 → docpull-2.4.0}/tests/test_mcp_tools.py +90 -2
  34. docpull-2.4.0/tests/test_naming.py +120 -0
  35. {docpull-2.3.0 → docpull-2.4.0}/tests/test_security_hardening.py +36 -0
  36. docpull-2.4.0/tests/test_special_cases.py +311 -0
  37. {docpull-2.3.0 → docpull-2.4.0}/tests/test_v2_conversion.py +237 -0
  38. {docpull-2.3.0 → docpull-2.4.0}/tests/test_v2_integration.py +35 -0
  39. {docpull-2.3.0 → docpull-2.4.0}/tests/test_v2_pipeline.py +131 -0
  40. docpull-2.3.0/src/docpull/models/profiles.py +0 -124
  41. docpull-2.3.0/tests/test_special_cases.py +0 -150
  42. {docpull-2.3.0 → docpull-2.4.0}/LICENSE +0 -0
  43. {docpull-2.3.0 → docpull-2.4.0}/setup.cfg +0 -0
  44. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/__main__.py +0 -0
  45. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/cache/__init__.py +0 -0
  46. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/cache/manager.py +0 -0
  47. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/cache/streaming_dedup.py +0 -0
  48. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/concurrency/__init__.py +0 -0
  49. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/conversion/__init__.py +0 -0
  50. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/conversion/chunking.py +0 -0
  51. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/conversion/protocols.py +0 -0
  52. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/conversion/trafilatura_extractor.py +0 -0
  53. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/core/__init__.py +0 -0
  54. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/__init__.py +0 -0
  55. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/crawler.py +0 -0
  56. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/filters.py +0 -0
  57. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/link_extractors/__init__.py +0 -0
  58. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/link_extractors/enhanced.py +0 -0
  59. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/link_extractors/protocols.py +0 -0
  60. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/link_extractors/static.py +0 -0
  61. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/sitemap.py +0 -0
  62. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/doctor.py +0 -0
  63. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/http/__init__.py +0 -0
  64. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/http/protocols.py +0 -0
  65. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/http/rate_limiter.py +0 -0
  66. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/mcp/__init__.py +0 -0
  67. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/mcp/sources.py +0 -0
  68. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/metadata_extractor.py +0 -0
  69. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/models/__init__.py +0 -0
  70. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/__init__.py +0 -0
  71. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/__init__.py +0 -0
  72. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/save_json.py +0 -0
  73. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/save_sqlite.py +0 -0
  74. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/py.typed +0 -0
  75. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/security/__init__.py +0 -0
  76. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/security/robots.py +0 -0
  77. {docpull-2.3.0 → docpull-2.4.0}/src/docpull/security/url_validator.py +0 -0
  78. {docpull-2.3.0 → docpull-2.4.0}/src/docpull.egg-info/dependency_links.txt +0 -0
  79. {docpull-2.3.0 → docpull-2.4.0}/src/docpull.egg-info/entry_points.txt +0 -0
  80. {docpull-2.3.0 → docpull-2.4.0}/src/docpull.egg-info/requires.txt +0 -0
  81. {docpull-2.3.0 → docpull-2.4.0}/src/docpull.egg-info/top_level.txt +0 -0
  82. {docpull-2.3.0 → docpull-2.4.0}/tests/test_chunking.py +0 -0
  83. {docpull-2.3.0 → docpull-2.4.0}/tests/test_cli.py +0 -0
  84. {docpull-2.3.0 → docpull-2.4.0}/tests/test_convert_step_new.py +0 -0
  85. {docpull-2.3.0 → docpull-2.4.0}/tests/test_link_extractors.py +0 -0
  86. {docpull-2.3.0 → docpull-2.4.0}/tests/test_save_ndjson.py +0 -0
  87. {docpull-2.3.0 → docpull-2.4.0}/tests/test_v2_discovery.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 2.3.0
3
+ Version: 2.4.0
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -278,11 +278,16 @@ NDJSON (one record per page or chunk):
278
278
  ## Security
279
279
 
280
280
  - HTTPS-only, mandatory robots.txt compliance
281
- - SSRF protection: blocks private/internal network IPs, DNS rebinding
281
+ - SSRF protection: blocks private/internal network IPs, DNS rebinding via
282
+ connect-time address pinning
282
283
  - XXE protection via `defusedxml` on sitemaps
283
284
  - Path traversal and CRLF header injection guards
284
285
  - Auth headers stripped on cross-origin redirects
285
286
 
287
+ When running with `--proxy`, DNS pinning is delegated to the proxy. Pass
288
+ `--require-pinned-dns` to refuse this configuration and keep the connector-
289
+ level SSRF guarantees in effect.
290
+
286
291
  ## Options
287
292
 
288
293
  Run `docpull --help` for the full list. Highlights:
@@ -310,6 +315,26 @@ Cache:
310
315
  --cache-ttl DAYS
311
316
  ```
312
317
 
318
+ ## Performance
319
+
320
+ End-to-end numbers from `tests/benchmarks/test_10k_pages.py` against a
321
+ synthetic 10,000-page localhost site (RAG profile, `max_concurrent=50`,
322
+ HTTP keep-alive, 5% injected duplicate content):
323
+
324
+ | Metric | Value |
325
+ |---|---|
326
+ | Total wall time | ~27 s |
327
+ | Discovery (sitemap parse) | ~80 ms |
328
+ | Fetch + convert + save | ~27 s |
329
+ | Per-page latency p50 / p95 / p99 | ~2.6 / 4.6 / 5.3 ms |
330
+ | Peak RSS delta from baseline | ~28 MB |
331
+ | Cache manifest size on disk | ~3.4 MB |
332
+ | Duplicates detected (5% injected) | 499 / 500 |
333
+
334
+ Reproduce with `make benchmark` (requires `aiohttp`; runs the gated
335
+ benchmark in `tests/benchmarks/` and prints a JSON line you can pipe
336
+ into trend tooling).
337
+
313
338
  ## Troubleshooting
314
339
 
315
340
  ```bash
@@ -196,11 +196,16 @@ NDJSON (one record per page or chunk):
196
196
  ## Security
197
197
 
198
198
  - HTTPS-only, mandatory robots.txt compliance
199
- - SSRF protection: blocks private/internal network IPs, DNS rebinding
199
+ - SSRF protection: blocks private/internal network IPs, DNS rebinding via
200
+ connect-time address pinning
200
201
  - XXE protection via `defusedxml` on sitemaps
201
202
  - Path traversal and CRLF header injection guards
202
203
  - Auth headers stripped on cross-origin redirects
203
204
 
205
+ When running with `--proxy`, DNS pinning is delegated to the proxy. Pass
206
+ `--require-pinned-dns` to refuse this configuration and keep the connector-
207
+ level SSRF guarantees in effect.
208
+
204
209
  ## Options
205
210
 
206
211
  Run `docpull --help` for the full list. Highlights:
@@ -228,6 +233,26 @@ Cache:
228
233
  --cache-ttl DAYS
229
234
  ```
230
235
 
236
+ ## Performance
237
+
238
+ End-to-end numbers from `tests/benchmarks/test_10k_pages.py` against a
239
+ synthetic 10,000-page localhost site (RAG profile, `max_concurrent=50`,
240
+ HTTP keep-alive, 5% injected duplicate content):
241
+
242
+ | Metric | Value |
243
+ |---|---|
244
+ | Total wall time | ~27 s |
245
+ | Discovery (sitemap parse) | ~80 ms |
246
+ | Fetch + convert + save | ~27 s |
247
+ | Per-page latency p50 / p95 / p99 | ~2.6 / 4.6 / 5.3 ms |
248
+ | Peak RSS delta from baseline | ~28 MB |
249
+ | Cache manifest size on disk | ~3.4 MB |
250
+ | Duplicates detected (5% injected) | 499 / 500 |
251
+
252
+ Reproduce with `make benchmark` (requires `aiohttp`; runs the gated
253
+ benchmark in `tests/benchmarks/` and prints a JSON line you can pipe
254
+ into trend tooling).
255
+
231
256
  ## Troubleshooting
232
257
 
233
258
  ```bash
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "2.3.0"
7
+ version = "2.4.0"
8
8
  dynamic = []
9
9
  description = "Pull documentation from the web and convert to clean markdown"
10
10
  readme = {file = "README.md", content-type = "text/markdown"}
@@ -14,7 +14,7 @@ Usage:
14
14
  print(event)
15
15
  """
16
16
 
17
- __version__ = "2.3.0"
17
+ __version__ = "2.4.0"
18
18
 
19
19
  from .cache import CacheManager, StreamingDeduplicator
20
20
  from .conversion.chunking import Chunk, TokenCounter, chunk_markdown
@@ -102,6 +102,23 @@ Examples:
102
102
  help="Fetch the given URL only (no discovery/crawl). Fast path for agents.",
103
103
  )
104
104
 
105
+ parser.add_argument(
106
+ "--skill",
107
+ type=str,
108
+ metavar="NAME",
109
+ help=(
110
+ "Generate a Claude Code skill directory. Output goes to "
111
+ "<output-dir>/<NAME>/ with hierarchical naming and a "
112
+ "SKILL.md manifest derived from the first page's metadata."
113
+ ),
114
+ )
115
+ parser.add_argument(
116
+ "--skill-description",
117
+ type=str,
118
+ metavar="TEXT",
119
+ help="Override the auto-derived `description` in SKILL.md.",
120
+ )
121
+
105
122
  # Output
106
123
  parser.add_argument(
107
124
  "--output-dir",
@@ -117,6 +134,16 @@ Examples:
117
134
  default=None,
118
135
  help="Output format (default: markdown; 'ndjson' streams one record per line)",
119
136
  )
137
+ parser.add_argument(
138
+ "--naming-strategy",
139
+ choices=["full", "hierarchical", "flat", "short"],
140
+ default=None,
141
+ help=(
142
+ "URL-to-filename strategy. 'full' flattens with underscores; "
143
+ "'hierarchical' preserves the URL path as nested directories. "
144
+ "Mirror profile defaults to hierarchical."
145
+ ),
146
+ )
120
147
  parser.add_argument(
121
148
  "--stream",
122
149
  action="store_true",
@@ -167,6 +194,15 @@ Examples:
167
194
  action="store_true",
168
195
  help="Automatically adjust rate limits based on server responses",
169
196
  )
197
+ crawl_group.add_argument(
198
+ "--no-streaming-discovery",
199
+ action="store_true",
200
+ help=(
201
+ "Fall back to discover-all-then-fetch instead of piping URLs "
202
+ "through a worker pool as discovery yields them. Backstop for "
203
+ "queue-backpressure regressions."
204
+ ),
205
+ )
170
206
 
171
207
  # Content filtering
172
208
  filter_group = parser.add_argument_group("content filtering")
@@ -175,12 +211,6 @@ Examples:
175
211
  action="store_true",
176
212
  help="Enable real-time deduplication",
177
213
  )
178
- filter_group.add_argument(
179
- "--language",
180
- type=str,
181
- metavar="CODE",
182
- help="Include only pages in this language",
183
- )
184
214
  filter_group.add_argument(
185
215
  "--extractor",
186
216
  choices=["default", "trafilatura"],
@@ -244,6 +274,15 @@ Examples:
244
274
  default=None,
245
275
  help="Maximum retry attempts",
246
276
  )
277
+ network_group.add_argument(
278
+ "--require-pinned-dns",
279
+ action="store_true",
280
+ help=(
281
+ "Refuse configurations that delegate DNS to a proxy. With this "
282
+ "flag, --proxy is rejected so the SSRF posture cannot silently "
283
+ "weaken in agent-driven crawls."
284
+ ),
285
+ )
247
286
 
248
287
  # Authentication settings
249
288
  auth_group = parser.add_argument_group("authentication")
@@ -358,8 +397,20 @@ def run_fetcher(args: argparse.Namespace) -> int:
358
397
 
359
398
  # Output settings
360
399
  output_kwargs: dict = {}
361
- if args.output_dir:
400
+ if args.skill:
401
+ # Skill mode: nest under <output-dir>/<skill>/, force hierarchical
402
+ # naming, and stamp the manifest fields. Default --output-dir to
403
+ # `.claude/skills` for the common drop-in use case.
404
+ base = args.output_dir or Path(".claude/skills")
405
+ output_kwargs["directory"] = base / args.skill
406
+ output_kwargs["naming_strategy"] = "hierarchical"
407
+ output_kwargs["skill_name"] = args.skill
408
+ if args.skill_description:
409
+ output_kwargs["skill_description"] = args.skill_description
410
+ elif args.output_dir:
362
411
  output_kwargs["directory"] = args.output_dir
412
+ if args.naming_strategy and "naming_strategy" not in output_kwargs:
413
+ output_kwargs["naming_strategy"] = args.naming_strategy
363
414
  if args.stream:
364
415
  output_kwargs["format"] = "ndjson"
365
416
  output_kwargs["ndjson_filename"] = "-"
@@ -386,6 +437,8 @@ def run_fetcher(args: argparse.Namespace) -> int:
386
437
  crawl_kwargs["rate_limit"] = args.rate_limit
387
438
  if args.adaptive_rate_limit:
388
439
  crawl_kwargs["adaptive_rate_limit"] = True
440
+ if args.no_streaming_discovery:
441
+ crawl_kwargs["streaming_discovery"] = False
389
442
  if args.include_paths:
390
443
  crawl_kwargs["include_paths"] = args.include_paths
391
444
  if args.exclude_paths:
@@ -397,8 +450,6 @@ def run_fetcher(args: argparse.Namespace) -> int:
397
450
  filter_kwargs: dict = {}
398
451
  if args.streaming_dedup:
399
452
  filter_kwargs["streaming_dedup"] = True
400
- if args.language:
401
- filter_kwargs["language"] = args.language
402
453
  if args.extractor:
403
454
  filter_kwargs["extractor"] = args.extractor
404
455
  if args.no_special_cases:
@@ -422,6 +473,8 @@ def run_fetcher(args: argparse.Namespace) -> int:
422
473
  return 1
423
474
  if args.max_retries is not None:
424
475
  network_kwargs["max_retries"] = args.max_retries
476
+ if args.require_pinned_dns:
477
+ network_kwargs["require_pinned_dns"] = True
425
478
  if network_kwargs:
426
479
  config_kwargs["network"] = network_kwargs
427
480
 
@@ -1,8 +1,10 @@
1
1
  """Thread pool manager for CPU-bound operations."""
2
2
 
3
3
  import asyncio
4
+ from collections.abc import Callable
4
5
  from concurrent.futures import ThreadPoolExecutor
5
- from typing import Any, Callable, Optional, TypeVar
6
+ from types import TracebackType
7
+ from typing import Any, TypeVar
6
8
 
7
9
  T = TypeVar("T")
8
10
 
@@ -32,7 +34,7 @@ class ConcurrencyManager:
32
34
  Consider CPU core count for optimal value.
33
35
  """
34
36
  self.max_workers = max_workers
35
- self._executor: Optional[ThreadPoolExecutor] = None
37
+ self._executor: ThreadPoolExecutor | None = None
36
38
 
37
39
  @property
38
40
  def executor(self) -> ThreadPoolExecutor:
@@ -98,7 +100,12 @@ class ConcurrencyManager:
98
100
  """Enter async context."""
99
101
  return self
100
102
 
101
- async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
103
+ async def __aexit__(
104
+ self,
105
+ exc_type: type[BaseException] | None,
106
+ exc_val: BaseException | None,
107
+ exc_tb: TracebackType | None,
108
+ ) -> None:
102
109
  """Exit async context and shutdown executor."""
103
110
  self.shutdown(wait=True)
104
111
 
@@ -106,6 +113,11 @@ class ConcurrencyManager:
106
113
  """Enter sync context."""
107
114
  return self
108
115
 
109
- def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
116
+ def __exit__(
117
+ self,
118
+ exc_type: type[BaseException] | None,
119
+ exc_val: BaseException | None,
120
+ exc_tb: TracebackType | None,
121
+ ) -> None:
110
122
  """Exit sync context and shutdown executor."""
111
123
  self.shutdown(wait=True)
@@ -26,7 +26,7 @@ CONTENT_SELECTORS = [
26
26
  "#documentation",
27
27
  ]
28
28
 
29
- # Elements to remove (navigation, ads, etc.)
29
+ # Elements to remove (navigation, ads, cookie banners, etc.)
30
30
  REMOVE_SELECTORS = [
31
31
  "nav",
32
32
  "header",
@@ -54,6 +54,33 @@ REMOVE_SELECTORS = [
54
54
  "noscript",
55
55
  "iframe",
56
56
  "svg",
57
+ # Cookie / consent / GDPR walls. Most are structural — class names
58
+ # come from a small set of vendor SDKs (OneTrust, Osano, CookieConsent,
59
+ # CookieLaw, Cookiebot, Iubenda) plus generic `.cookie-*` / `.gdpr-*`
60
+ # patterns. The aria-label fallbacks catch dialogs whose className
61
+ # doesn't match the conventions but whose accessibility label does.
62
+ ".cookie-banner",
63
+ ".cookie-consent",
64
+ ".cookie-notice",
65
+ ".cookielaw-banner",
66
+ ".cookiebot",
67
+ ".gdpr",
68
+ ".gdpr-banner",
69
+ ".consent-banner",
70
+ ".consent-popup",
71
+ ".cc-window",
72
+ ".cc-banner",
73
+ ".osano-cm-window",
74
+ ".osano-cm-dialog",
75
+ "#onetrust-banner-sdk",
76
+ "#onetrust-consent-sdk",
77
+ "#onetrust-pc-sdk",
78
+ ".ot-sdk-container",
79
+ ".iubenda-cs-container",
80
+ ".termly-styl-banner",
81
+ '[aria-label*="cookie" i]',
82
+ '[aria-label*="consent" i]',
83
+ '[aria-label*="gdpr" i]',
57
84
  ]
58
85
 
59
86
  # Elements to preserve but simplify
@@ -236,8 +263,94 @@ class MainContentExtractor:
236
263
 
237
264
  # Clean up
238
265
  self._remove_unwanted(content)
266
+ # Normalize fence languages BEFORE we strip attributes — many
267
+ # syntax-highlight conventions encode language in `class` (Prism:
268
+ # `language-python`, highlight.js: `lang-py`, Shiki: `language-bash`).
269
+ # html2text's `mark_code` won't pick these up by default, so we inject
270
+ # a language sentinel as the first text node of the inner <code> (or
271
+ # of the <pre> itself when there is no inner <code>).
272
+ _normalize_code_fence_language(content)
239
273
  self._clean_attributes(content)
240
274
  self._resolve_links(content, url)
241
275
 
242
276
  result = str(content)
243
277
  return self._clean_whitespace(result)
278
+
279
+
280
+ # Map syntax-highlight library conventions to a canonical short language tag.
281
+ # Order matters: longest/most-specific prefix first so `highlight-source-rust`
282
+ # resolves to `rust`, not `source-rust`. We deliberately skip `none`, `text`,
283
+ # and `plaintext` — they represent "no language."
284
+ _LANG_CLASS_PATTERNS: list[re.Pattern[str]] = [
285
+ re.compile(r"(?:^|\s)highlight-source-([\w+#-]+)", re.IGNORECASE),
286
+ re.compile(r"(?:^|\s)hljs-language-([\w+#-]+)", re.IGNORECASE),
287
+ re.compile(r"(?:^|\s)(?:language|lang|highlight)-([\w+#-]+)", re.IGNORECASE),
288
+ ]
289
+
290
+ # Sentinel injected as the first text node inside a <code> tag. html2text
291
+ # preserves the body of <pre><code> verbatim (it just indents by 4 spaces
292
+ # and wraps in [code]/[/code]), so this sentinel survives through to the
293
+ # Markdown stage where HtmlToMarkdown._clean_output recovers the language
294
+ # and rewrites the block as a fenced GFM code block.
295
+ DOCPULL_FENCE_SENTINEL_PREFIX = "__DOCPULL_FENCE_LANG_"
296
+ DOCPULL_FENCE_SENTINEL_SUFFIX = "__"
297
+
298
+
299
+ def _classes_of(tag: Tag) -> list[str]:
300
+ """Return a tag's CSS classes as a flat list of strings.
301
+
302
+ BeautifulSoup hands back ``str``, ``AttributeValueList``, or ``None``
303
+ depending on parser version. Normalize to ``list[str]`` for the rest
304
+ of the language-detection code.
305
+ """
306
+ raw = tag.get("class")
307
+ if raw is None:
308
+ return []
309
+ if isinstance(raw, str):
310
+ return [raw]
311
+ return [str(c) for c in raw]
312
+
313
+
314
+ def _detect_lang(class_string: str) -> str | None:
315
+ """Return the canonical language tag for a code block, or None."""
316
+ for pattern in _LANG_CLASS_PATTERNS:
317
+ match = pattern.search(class_string)
318
+ if not match:
319
+ continue
320
+ lang = match.group(1).lower()
321
+ if lang in {"none", "plaintext", "text"}:
322
+ return None
323
+ return lang
324
+ return None
325
+
326
+
327
+ def _normalize_code_fence_language(content: BeautifulSoup) -> None:
328
+ """Inject a sentinel that lets the Markdown stage emit fenced blocks.
329
+
330
+ Modern syntax-highlight libraries encode the language as a CSS class
331
+ (Prism: ``language-python``; highlight.js: ``lang-py`` /
332
+ ``hljs-language-bash``; GitHub: ``highlight-source-rust``). html2text
333
+ cannot read these and emits a generic ``[code]...[/code]`` block.
334
+
335
+ We walk every ``<pre>`` and prepend a sentinel ``__DOCPULL_FENCE_LANG_X__``
336
+ as a NavigableString to the inner ``<code>`` (or to the ``<pre>`` itself
337
+ if no inner ``<code>`` exists). Post-conversion, the Markdown layer
338
+ pulls that sentinel back out of the rendered text and rewrites the
339
+ block as a GFM fenced code block with the language tag.
340
+ """
341
+ for pre in content.find_all("pre"):
342
+ if not isinstance(pre, Tag):
343
+ continue
344
+ pre_classes = _classes_of(pre)
345
+ code = pre.find("code") if pre else None
346
+ code_classes: list[str] = []
347
+ if isinstance(code, Tag):
348
+ code_classes = _classes_of(code)
349
+
350
+ lang = _detect_lang(" ".join(pre_classes + code_classes))
351
+ if lang is None:
352
+ continue
353
+
354
+ sentinel = f"{DOCPULL_FENCE_SENTINEL_PREFIX}{lang}{DOCPULL_FENCE_SENTINEL_SUFFIX}\n"
355
+ target = code if isinstance(code, Tag) else pre
356
+ target.insert(0, sentinel)
@@ -4,11 +4,17 @@ from __future__ import annotations
4
4
 
5
5
  import logging
6
6
  import re
7
+ import textwrap
7
8
  from typing import Any
8
9
  from urllib.parse import urljoin
9
10
 
10
11
  import html2text
11
12
 
13
+ from .extractor import (
14
+ DOCPULL_FENCE_SENTINEL_PREFIX,
15
+ DOCPULL_FENCE_SENTINEL_SUFFIX,
16
+ )
17
+
12
18
  logger = logging.getLogger(__name__)
13
19
 
14
20
 
@@ -17,6 +23,43 @@ def _normalize_scheme(url: str) -> str:
17
23
  return re.sub(r"^(https?:)/(?!/)", r"\1//", url)
18
24
 
19
25
 
26
+ # html2text wraps <pre><code> in [code]/[/code] markers and indents the body
27
+ # by 4 spaces. The opening marker may carry trailing whitespace
28
+ # (`[code] \n`); tolerate it so we don't miss real code blocks.
29
+ _HTML2TEXT_CODE_BLOCK_RE = re.compile(
30
+ r"\[code\][ \t]*\n(.*?)\n[ \t]*\[/code\]",
31
+ re.DOTALL,
32
+ )
33
+ _FENCE_SENTINEL_RE = re.compile(
34
+ rf"^[ \t]*{re.escape(DOCPULL_FENCE_SENTINEL_PREFIX)}"
35
+ rf"([\w+#-]+){re.escape(DOCPULL_FENCE_SENTINEL_SUFFIX)}[ \t]*\n",
36
+ re.MULTILINE,
37
+ )
38
+
39
+
40
+ def _rewrite_html2text_code_blocks(markdown: str) -> str:
41
+ """Replace ``[code]...[/code]`` markers with GFM fenced blocks.
42
+
43
+ html2text indents the body of a ``[code]`` block by 4 spaces; we dedent
44
+ that consistently. If the body's first line is a docpull language
45
+ sentinel (injected by the extractor), the fence is opened with that
46
+ language; otherwise the fence is bare.
47
+ """
48
+
49
+ def replace(match: re.Match[str]) -> str:
50
+ body = match.group(1)
51
+ body = textwrap.dedent(body)
52
+ lang = ""
53
+ sentinel_match = _FENCE_SENTINEL_RE.match(body)
54
+ if sentinel_match:
55
+ lang = sentinel_match.group(1)
56
+ body = body[sentinel_match.end() :]
57
+ body = body.rstrip("\n")
58
+ return f"```{lang}\n{body}\n```"
59
+
60
+ return _HTML2TEXT_CODE_BLOCK_RE.sub(replace, markdown)
61
+
62
+
20
63
  class HtmlToMarkdown:
21
64
  """
22
65
  Converts HTML content to clean Markdown.
@@ -77,13 +120,16 @@ class HtmlToMarkdown:
77
120
 
78
121
  def _clean_output(self, markdown: str) -> str:
79
122
  """Clean up the converted Markdown."""
123
+ # Convert html2text's [code]/[/code] markers into GFM fences,
124
+ # recovering the language tag from the docpull sentinel injected
125
+ # by MainContentExtractor when the source HTML carried a Prism /
126
+ # highlight.js / Shiki language class. Must run BEFORE blank-line
127
+ # collapsing so the rewritten fences sit on their own lines.
128
+ markdown = _rewrite_html2text_code_blocks(markdown)
129
+
80
130
  # Remove excessive blank lines
81
131
  markdown = re.sub(r"\n{3,}", "\n\n", markdown)
82
132
 
83
- # Fix code block formatting
84
- # Ensure code blocks have language hint
85
- markdown = re.sub(r"```\n", "```\n", markdown)
86
-
87
133
  # Unmangle html2text's protect_links output:
88
134
  # [text](prefix/<https:/real.url>) -> [text](https://real.url)
89
135
  # The angle-bracketed inner URL is the true absolute URL (the prefix is