docpull 2.3.0__tar.gz → 2.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpull-2.3.0/src/docpull.egg-info → docpull-2.4.0}/PKG-INFO +27 -2
- {docpull-2.3.0 → docpull-2.4.0}/README.md +26 -1
- {docpull-2.3.0 → docpull-2.4.0}/pyproject.toml +1 -1
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/__init__.py +1 -1
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/cli.py +62 -9
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/concurrency/manager.py +16 -4
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/conversion/extractor.py +114 -1
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/conversion/markdown.py +50 -4
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/conversion/special_cases.py +237 -27
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/core/fetcher.py +424 -151
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/composite.py +2 -3
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/protocols.py +2 -2
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/http/client.py +17 -5
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/logging_config.py +2 -3
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/mcp/server.py +21 -3
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/mcp/tools.py +147 -25
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/models/config.py +100 -10
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/models/events.py +12 -12
- docpull-2.4.0/src/docpull/models/profiles.py +145 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/base.py +2 -1
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/chunk.py +1 -2
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/convert.py +90 -7
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/dedup.py +11 -6
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/fetch.py +87 -7
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/metadata.py +3 -4
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/save.py +67 -3
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/save_ndjson.py +2 -2
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/validate.py +18 -17
- {docpull-2.3.0 → docpull-2.4.0/src/docpull.egg-info}/PKG-INFO +27 -2
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull.egg-info/SOURCES.txt +2 -0
- docpull-2.4.0/tests/test_cache_conditional_get.py +187 -0
- {docpull-2.3.0 → docpull-2.4.0}/tests/test_fixes_v2_3_0.py +1 -1
- {docpull-2.3.0 → docpull-2.4.0}/tests/test_mcp_tools.py +90 -2
- docpull-2.4.0/tests/test_naming.py +120 -0
- {docpull-2.3.0 → docpull-2.4.0}/tests/test_security_hardening.py +36 -0
- docpull-2.4.0/tests/test_special_cases.py +311 -0
- {docpull-2.3.0 → docpull-2.4.0}/tests/test_v2_conversion.py +237 -0
- {docpull-2.3.0 → docpull-2.4.0}/tests/test_v2_integration.py +35 -0
- {docpull-2.3.0 → docpull-2.4.0}/tests/test_v2_pipeline.py +131 -0
- docpull-2.3.0/src/docpull/models/profiles.py +0 -124
- docpull-2.3.0/tests/test_special_cases.py +0 -150
- {docpull-2.3.0 → docpull-2.4.0}/LICENSE +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/setup.cfg +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/__main__.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/cache/__init__.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/cache/manager.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/cache/streaming_dedup.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/concurrency/__init__.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/conversion/__init__.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/conversion/chunking.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/conversion/protocols.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/conversion/trafilatura_extractor.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/core/__init__.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/__init__.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/crawler.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/filters.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/link_extractors/__init__.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/link_extractors/enhanced.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/link_extractors/protocols.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/link_extractors/static.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/discovery/sitemap.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/doctor.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/http/__init__.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/http/protocols.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/http/rate_limiter.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/mcp/__init__.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/mcp/sources.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/metadata_extractor.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/models/__init__.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/__init__.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/__init__.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/save_json.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/pipeline/steps/save_sqlite.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/py.typed +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/security/__init__.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/security/robots.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull/security/url_validator.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull.egg-info/dependency_links.txt +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull.egg-info/entry_points.txt +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull.egg-info/requires.txt +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/src/docpull.egg-info/top_level.txt +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/tests/test_chunking.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/tests/test_cli.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/tests/test_convert_step_new.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/tests/test_link_extractors.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/tests/test_save_ndjson.py +0 -0
- {docpull-2.3.0 → docpull-2.4.0}/tests/test_v2_discovery.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpull
|
|
3
|
-
Version: 2.3.0
|
|
3
|
+
Version: 2.4.0
|
|
4
4
|
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
5
|
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
6
|
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
@@ -278,11 +278,16 @@ NDJSON (one record per page or chunk):
|
|
|
278
278
|
## Security
|
|
279
279
|
|
|
280
280
|
- HTTPS-only, mandatory robots.txt compliance
|
|
281
|
-
- SSRF protection: blocks private/internal network IPs, DNS rebinding
|
|
281
|
+
- SSRF protection: blocks private/internal network IPs, DNS rebinding via
|
|
282
|
+
connect-time address pinning
|
|
282
283
|
- XXE protection via `defusedxml` on sitemaps
|
|
283
284
|
- Path traversal and CRLF header injection guards
|
|
284
285
|
- Auth headers stripped on cross-origin redirects
|
|
285
286
|
|
|
287
|
+
When running with `--proxy`, DNS pinning is delegated to the proxy. Pass
|
|
288
|
+
`--require-pinned-dns` to refuse this configuration and keep the connector-
|
|
289
|
+
level SSRF guarantees in effect.
|
|
290
|
+
|
|
286
291
|
## Options
|
|
287
292
|
|
|
288
293
|
Run `docpull --help` for the full list. Highlights:
|
|
@@ -310,6 +315,26 @@ Cache:
|
|
|
310
315
|
--cache-ttl DAYS
|
|
311
316
|
```
|
|
312
317
|
|
|
318
|
+
## Performance
|
|
319
|
+
|
|
320
|
+
End-to-end numbers from `tests/benchmarks/test_10k_pages.py` against a
|
|
321
|
+
synthetic 10,000-page localhost site (RAG profile, `max_concurrent=50`,
|
|
322
|
+
HTTP keep-alive, 5% injected duplicate content):
|
|
323
|
+
|
|
324
|
+
| Metric | Value |
|
|
325
|
+
|---|---|
|
|
326
|
+
| Total wall time | ~27 s |
|
|
327
|
+
| Discovery (sitemap parse) | ~80 ms |
|
|
328
|
+
| Fetch + convert + save | ~27 s |
|
|
329
|
+
| Per-page latency p50 / p95 / p99 | ~2.6 / 4.6 / 5.3 ms |
|
|
330
|
+
| Peak RSS delta from baseline | ~28 MB |
|
|
331
|
+
| Cache manifest size on disk | ~3.4 MB |
|
|
332
|
+
| Duplicates detected (5% injected) | 499 / 500 |
|
|
333
|
+
|
|
334
|
+
Reproduce with `make benchmark` (requires `aiohttp`; runs the gated
|
|
335
|
+
benchmark in `tests/benchmarks/` and prints a JSON line you can pipe
|
|
336
|
+
into trend tooling).
|
|
337
|
+
|
|
313
338
|
## Troubleshooting
|
|
314
339
|
|
|
315
340
|
```bash
|
|
@@ -196,11 +196,16 @@ NDJSON (one record per page or chunk):
|
|
|
196
196
|
## Security
|
|
197
197
|
|
|
198
198
|
- HTTPS-only, mandatory robots.txt compliance
|
|
199
|
-
- SSRF protection: blocks private/internal network IPs, DNS rebinding
|
|
199
|
+
- SSRF protection: blocks private/internal network IPs, DNS rebinding via
|
|
200
|
+
connect-time address pinning
|
|
200
201
|
- XXE protection via `defusedxml` on sitemaps
|
|
201
202
|
- Path traversal and CRLF header injection guards
|
|
202
203
|
- Auth headers stripped on cross-origin redirects
|
|
203
204
|
|
|
205
|
+
When running with `--proxy`, DNS pinning is delegated to the proxy. Pass
|
|
206
|
+
`--require-pinned-dns` to refuse this configuration and keep the connector-
|
|
207
|
+
level SSRF guarantees in effect.
|
|
208
|
+
|
|
204
209
|
## Options
|
|
205
210
|
|
|
206
211
|
Run `docpull --help` for the full list. Highlights:
|
|
@@ -228,6 +233,26 @@ Cache:
|
|
|
228
233
|
--cache-ttl DAYS
|
|
229
234
|
```
|
|
230
235
|
|
|
236
|
+
## Performance
|
|
237
|
+
|
|
238
|
+
End-to-end numbers from `tests/benchmarks/test_10k_pages.py` against a
|
|
239
|
+
synthetic 10,000-page localhost site (RAG profile, `max_concurrent=50`,
|
|
240
|
+
HTTP keep-alive, 5% injected duplicate content):
|
|
241
|
+
|
|
242
|
+
| Metric | Value |
|
|
243
|
+
|---|---|
|
|
244
|
+
| Total wall time | ~27 s |
|
|
245
|
+
| Discovery (sitemap parse) | ~80 ms |
|
|
246
|
+
| Fetch + convert + save | ~27 s |
|
|
247
|
+
| Per-page latency p50 / p95 / p99 | ~2.6 / 4.6 / 5.3 ms |
|
|
248
|
+
| Peak RSS delta from baseline | ~28 MB |
|
|
249
|
+
| Cache manifest size on disk | ~3.4 MB |
|
|
250
|
+
| Duplicates detected (5% injected) | 499 / 500 |
|
|
251
|
+
|
|
252
|
+
Reproduce with `make benchmark` (requires `aiohttp`; runs the gated
|
|
253
|
+
benchmark in `tests/benchmarks/` and prints a JSON line you can pipe
|
|
254
|
+
into trend tooling).
|
|
255
|
+
|
|
231
256
|
## Troubleshooting
|
|
232
257
|
|
|
233
258
|
```bash
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docpull"
|
|
7
|
-
version = "2.3.0"
|
|
7
|
+
version = "2.4.0"
|
|
8
8
|
dynamic = []
|
|
9
9
|
description = "Pull documentation from the web and convert to clean markdown"
|
|
10
10
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
@@ -102,6 +102,23 @@ Examples:
|
|
|
102
102
|
help="Fetch the given URL only (no discovery/crawl). Fast path for agents.",
|
|
103
103
|
)
|
|
104
104
|
|
|
105
|
+
parser.add_argument(
|
|
106
|
+
"--skill",
|
|
107
|
+
type=str,
|
|
108
|
+
metavar="NAME",
|
|
109
|
+
help=(
|
|
110
|
+
"Generate a Claude Code skill directory. Output goes to "
|
|
111
|
+
"<output-dir>/<NAME>/ with hierarchical naming and a "
|
|
112
|
+
"SKILL.md manifest derived from the first page's metadata."
|
|
113
|
+
),
|
|
114
|
+
)
|
|
115
|
+
parser.add_argument(
|
|
116
|
+
"--skill-description",
|
|
117
|
+
type=str,
|
|
118
|
+
metavar="TEXT",
|
|
119
|
+
help="Override the auto-derived `description` in SKILL.md.",
|
|
120
|
+
)
|
|
121
|
+
|
|
105
122
|
# Output
|
|
106
123
|
parser.add_argument(
|
|
107
124
|
"--output-dir",
|
|
@@ -117,6 +134,16 @@ Examples:
|
|
|
117
134
|
default=None,
|
|
118
135
|
help="Output format (default: markdown; 'ndjson' streams one record per line)",
|
|
119
136
|
)
|
|
137
|
+
parser.add_argument(
|
|
138
|
+
"--naming-strategy",
|
|
139
|
+
choices=["full", "hierarchical", "flat", "short"],
|
|
140
|
+
default=None,
|
|
141
|
+
help=(
|
|
142
|
+
"URL-to-filename strategy. 'full' flattens with underscores; "
|
|
143
|
+
"'hierarchical' preserves the URL path as nested directories. "
|
|
144
|
+
"Mirror profile defaults to hierarchical."
|
|
145
|
+
),
|
|
146
|
+
)
|
|
120
147
|
parser.add_argument(
|
|
121
148
|
"--stream",
|
|
122
149
|
action="store_true",
|
|
@@ -167,6 +194,15 @@ Examples:
|
|
|
167
194
|
action="store_true",
|
|
168
195
|
help="Automatically adjust rate limits based on server responses",
|
|
169
196
|
)
|
|
197
|
+
crawl_group.add_argument(
|
|
198
|
+
"--no-streaming-discovery",
|
|
199
|
+
action="store_true",
|
|
200
|
+
help=(
|
|
201
|
+
"Fall back to discover-all-then-fetch instead of piping URLs "
|
|
202
|
+
"through a worker pool as discovery yields them. Backstop for "
|
|
203
|
+
"queue-backpressure regressions."
|
|
204
|
+
),
|
|
205
|
+
)
|
|
170
206
|
|
|
171
207
|
# Content filtering
|
|
172
208
|
filter_group = parser.add_argument_group("content filtering")
|
|
@@ -175,12 +211,6 @@ Examples:
|
|
|
175
211
|
action="store_true",
|
|
176
212
|
help="Enable real-time deduplication",
|
|
177
213
|
)
|
|
178
|
-
filter_group.add_argument(
|
|
179
|
-
"--language",
|
|
180
|
-
type=str,
|
|
181
|
-
metavar="CODE",
|
|
182
|
-
help="Include only pages in this language",
|
|
183
|
-
)
|
|
184
214
|
filter_group.add_argument(
|
|
185
215
|
"--extractor",
|
|
186
216
|
choices=["default", "trafilatura"],
|
|
@@ -244,6 +274,15 @@ Examples:
|
|
|
244
274
|
default=None,
|
|
245
275
|
help="Maximum retry attempts",
|
|
246
276
|
)
|
|
277
|
+
network_group.add_argument(
|
|
278
|
+
"--require-pinned-dns",
|
|
279
|
+
action="store_true",
|
|
280
|
+
help=(
|
|
281
|
+
"Refuse configurations that delegate DNS to a proxy. With this "
|
|
282
|
+
"flag, --proxy is rejected so the SSRF posture cannot silently "
|
|
283
|
+
"weaken in agent-driven crawls."
|
|
284
|
+
),
|
|
285
|
+
)
|
|
247
286
|
|
|
248
287
|
# Authentication settings
|
|
249
288
|
auth_group = parser.add_argument_group("authentication")
|
|
@@ -358,8 +397,20 @@ def run_fetcher(args: argparse.Namespace) -> int:
|
|
|
358
397
|
|
|
359
398
|
# Output settings
|
|
360
399
|
output_kwargs: dict = {}
|
|
361
|
-
if args.output_dir:
|
|
400
|
+
if args.skill:
|
|
401
|
+
# Skill mode: nest under <output-dir>/<skill>/, force hierarchical
|
|
402
|
+
# naming, and stamp the manifest fields. Default --output-dir to
|
|
403
|
+
# `.claude/skills` for the common drop-in use case.
|
|
404
|
+
base = args.output_dir or Path(".claude/skills")
|
|
405
|
+
output_kwargs["directory"] = base / args.skill
|
|
406
|
+
output_kwargs["naming_strategy"] = "hierarchical"
|
|
407
|
+
output_kwargs["skill_name"] = args.skill
|
|
408
|
+
if args.skill_description:
|
|
409
|
+
output_kwargs["skill_description"] = args.skill_description
|
|
410
|
+
elif args.output_dir:
|
|
362
411
|
output_kwargs["directory"] = args.output_dir
|
|
412
|
+
if args.naming_strategy and "naming_strategy" not in output_kwargs:
|
|
413
|
+
output_kwargs["naming_strategy"] = args.naming_strategy
|
|
363
414
|
if args.stream:
|
|
364
415
|
output_kwargs["format"] = "ndjson"
|
|
365
416
|
output_kwargs["ndjson_filename"] = "-"
|
|
@@ -386,6 +437,8 @@ def run_fetcher(args: argparse.Namespace) -> int:
|
|
|
386
437
|
crawl_kwargs["rate_limit"] = args.rate_limit
|
|
387
438
|
if args.adaptive_rate_limit:
|
|
388
439
|
crawl_kwargs["adaptive_rate_limit"] = True
|
|
440
|
+
if args.no_streaming_discovery:
|
|
441
|
+
crawl_kwargs["streaming_discovery"] = False
|
|
389
442
|
if args.include_paths:
|
|
390
443
|
crawl_kwargs["include_paths"] = args.include_paths
|
|
391
444
|
if args.exclude_paths:
|
|
@@ -397,8 +450,6 @@ def run_fetcher(args: argparse.Namespace) -> int:
|
|
|
397
450
|
filter_kwargs: dict = {}
|
|
398
451
|
if args.streaming_dedup:
|
|
399
452
|
filter_kwargs["streaming_dedup"] = True
|
|
400
|
-
if args.language:
|
|
401
|
-
filter_kwargs["language"] = args.language
|
|
402
453
|
if args.extractor:
|
|
403
454
|
filter_kwargs["extractor"] = args.extractor
|
|
404
455
|
if args.no_special_cases:
|
|
@@ -422,6 +473,8 @@ def run_fetcher(args: argparse.Namespace) -> int:
|
|
|
422
473
|
return 1
|
|
423
474
|
if args.max_retries is not None:
|
|
424
475
|
network_kwargs["max_retries"] = args.max_retries
|
|
476
|
+
if args.require_pinned_dns:
|
|
477
|
+
network_kwargs["require_pinned_dns"] = True
|
|
425
478
|
if network_kwargs:
|
|
426
479
|
config_kwargs["network"] = network_kwargs
|
|
427
480
|
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
"""Thread pool manager for CPU-bound operations."""
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
|
+
from collections.abc import Callable
|
|
4
5
|
from concurrent.futures import ThreadPoolExecutor
|
|
5
|
-
from
|
|
6
|
+
from types import TracebackType
|
|
7
|
+
from typing import Any, TypeVar
|
|
6
8
|
|
|
7
9
|
T = TypeVar("T")
|
|
8
10
|
|
|
@@ -32,7 +34,7 @@ class ConcurrencyManager:
|
|
|
32
34
|
Consider CPU core count for optimal value.
|
|
33
35
|
"""
|
|
34
36
|
self.max_workers = max_workers
|
|
35
|
-
self._executor:
|
|
37
|
+
self._executor: ThreadPoolExecutor | None = None
|
|
36
38
|
|
|
37
39
|
@property
|
|
38
40
|
def executor(self) -> ThreadPoolExecutor:
|
|
@@ -98,7 +100,12 @@ class ConcurrencyManager:
|
|
|
98
100
|
"""Enter async context."""
|
|
99
101
|
return self
|
|
100
102
|
|
|
101
|
-
async def __aexit__(
|
|
103
|
+
async def __aexit__(
|
|
104
|
+
self,
|
|
105
|
+
exc_type: type[BaseException] | None,
|
|
106
|
+
exc_val: BaseException | None,
|
|
107
|
+
exc_tb: TracebackType | None,
|
|
108
|
+
) -> None:
|
|
102
109
|
"""Exit async context and shutdown executor."""
|
|
103
110
|
self.shutdown(wait=True)
|
|
104
111
|
|
|
@@ -106,6 +113,11 @@ class ConcurrencyManager:
|
|
|
106
113
|
"""Enter sync context."""
|
|
107
114
|
return self
|
|
108
115
|
|
|
109
|
-
def __exit__(
|
|
116
|
+
def __exit__(
|
|
117
|
+
self,
|
|
118
|
+
exc_type: type[BaseException] | None,
|
|
119
|
+
exc_val: BaseException | None,
|
|
120
|
+
exc_tb: TracebackType | None,
|
|
121
|
+
) -> None:
|
|
110
122
|
"""Exit sync context and shutdown executor."""
|
|
111
123
|
self.shutdown(wait=True)
|
|
@@ -26,7 +26,7 @@ CONTENT_SELECTORS = [
|
|
|
26
26
|
"#documentation",
|
|
27
27
|
]
|
|
28
28
|
|
|
29
|
-
# Elements to remove (navigation, ads, etc.)
|
|
29
|
+
# Elements to remove (navigation, ads, cookie banners, etc.)
|
|
30
30
|
REMOVE_SELECTORS = [
|
|
31
31
|
"nav",
|
|
32
32
|
"header",
|
|
@@ -54,6 +54,33 @@ REMOVE_SELECTORS = [
|
|
|
54
54
|
"noscript",
|
|
55
55
|
"iframe",
|
|
56
56
|
"svg",
|
|
57
|
+
# Cookie / consent / GDPR walls. Most are structural — class names
|
|
58
|
+
# come from a small set of vendor SDKs (OneTrust, Osano, CookieConsent,
|
|
59
|
+
# CookieLaw, Cookiebot, Iubenda) plus generic `.cookie-*` / `.gdpr-*`
|
|
60
|
+
# patterns. The aria-label fallbacks catch dialogs whose className
|
|
61
|
+
# doesn't match the conventions but whose accessibility label does.
|
|
62
|
+
".cookie-banner",
|
|
63
|
+
".cookie-consent",
|
|
64
|
+
".cookie-notice",
|
|
65
|
+
".cookielaw-banner",
|
|
66
|
+
".cookiebot",
|
|
67
|
+
".gdpr",
|
|
68
|
+
".gdpr-banner",
|
|
69
|
+
".consent-banner",
|
|
70
|
+
".consent-popup",
|
|
71
|
+
".cc-window",
|
|
72
|
+
".cc-banner",
|
|
73
|
+
".osano-cm-window",
|
|
74
|
+
".osano-cm-dialog",
|
|
75
|
+
"#onetrust-banner-sdk",
|
|
76
|
+
"#onetrust-consent-sdk",
|
|
77
|
+
"#onetrust-pc-sdk",
|
|
78
|
+
".ot-sdk-container",
|
|
79
|
+
".iubenda-cs-container",
|
|
80
|
+
".termly-styl-banner",
|
|
81
|
+
'[aria-label*="cookie" i]',
|
|
82
|
+
'[aria-label*="consent" i]',
|
|
83
|
+
'[aria-label*="gdpr" i]',
|
|
57
84
|
]
|
|
58
85
|
|
|
59
86
|
# Elements to preserve but simplify
|
|
@@ -236,8 +263,94 @@ class MainContentExtractor:
|
|
|
236
263
|
|
|
237
264
|
# Clean up
|
|
238
265
|
self._remove_unwanted(content)
|
|
266
|
+
# Normalize fence languages BEFORE we strip attributes — many
|
|
267
|
+
# syntax-highlight conventions encode language in `class` (Prism:
|
|
268
|
+
# `language-python`, highlight.js: `lang-py`, Shiki: `language-bash`).
|
|
269
|
+
# html2text's `mark_code` won't pick these up by default, so we lift
|
|
270
|
+
# the language onto an html2text-friendly `class="lang-X"` form on
|
|
271
|
+
# both the <pre> and inner <code>.
|
|
272
|
+
_normalize_code_fence_language(content)
|
|
239
273
|
self._clean_attributes(content)
|
|
240
274
|
self._resolve_links(content, url)
|
|
241
275
|
|
|
242
276
|
result = str(content)
|
|
243
277
|
return self._clean_whitespace(result)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
# Map syntax-highlight library conventions to a canonical short language tag.
|
|
281
|
+
# Order matters: longest/most-specific prefix first so `highlight-source-rust`
|
|
282
|
+
# resolves to `rust`, not `source-rust`. We deliberately skip `none`, `text`,
|
|
283
|
+
# and `plaintext` — they represent "no language."
|
|
284
|
+
_LANG_CLASS_PATTERNS: list[re.Pattern[str]] = [
|
|
285
|
+
re.compile(r"(?:^|\s)highlight-source-([\w+#-]+)", re.IGNORECASE),
|
|
286
|
+
re.compile(r"(?:^|\s)hljs-language-([\w+#-]+)", re.IGNORECASE),
|
|
287
|
+
re.compile(r"(?:^|\s)(?:language|lang|highlight)-([\w+#-]+)", re.IGNORECASE),
|
|
288
|
+
]
|
|
289
|
+
|
|
290
|
+
# Sentinel injected as the first text node inside a <code> tag. html2text
|
|
291
|
+
# preserves the body of <pre><code> verbatim (it just indents by 4 spaces
|
|
292
|
+
# and wraps in [code]/[/code]), so this sentinel survives through to the
|
|
293
|
+
# Markdown stage where HtmlToMarkdown._clean_output recovers the language
|
|
294
|
+
# and rewrites the block as a fenced GFM code block.
|
|
295
|
+
DOCPULL_FENCE_SENTINEL_PREFIX = "__DOCPULL_FENCE_LANG_"
|
|
296
|
+
DOCPULL_FENCE_SENTINEL_SUFFIX = "__"
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def _classes_of(tag: Tag) -> list[str]:
|
|
300
|
+
"""Return a tag's CSS classes as a flat list of strings.
|
|
301
|
+
|
|
302
|
+
BeautifulSoup hands back ``str``, ``AttributeValueList``, or ``None``
|
|
303
|
+
depending on parser version. Normalize to ``list[str]`` for the rest
|
|
304
|
+
of the language-detection code.
|
|
305
|
+
"""
|
|
306
|
+
raw = tag.get("class")
|
|
307
|
+
if raw is None:
|
|
308
|
+
return []
|
|
309
|
+
if isinstance(raw, str):
|
|
310
|
+
return [raw]
|
|
311
|
+
return [str(c) for c in raw]
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def _detect_lang(class_string: str) -> str | None:
|
|
315
|
+
"""Return the canonical language tag for a code block, or None."""
|
|
316
|
+
for pattern in _LANG_CLASS_PATTERNS:
|
|
317
|
+
match = pattern.search(class_string)
|
|
318
|
+
if not match:
|
|
319
|
+
continue
|
|
320
|
+
lang = match.group(1).lower()
|
|
321
|
+
if lang in {"none", "plaintext", "text"}:
|
|
322
|
+
return None
|
|
323
|
+
return lang
|
|
324
|
+
return None
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def _normalize_code_fence_language(content: BeautifulSoup) -> None:
|
|
328
|
+
"""Inject a sentinel that lets the Markdown stage emit fenced blocks.
|
|
329
|
+
|
|
330
|
+
Modern syntax-highlight libraries encode the language as a CSS class
|
|
331
|
+
(Prism: ``language-python``; highlight.js: ``lang-py`` /
|
|
332
|
+
``hljs-language-bash``; GitHub: ``highlight-source-rust``). html2text
|
|
333
|
+
cannot read these and emits a generic ``[code]...[/code]`` block.
|
|
334
|
+
|
|
335
|
+
We walk every ``<pre>`` and prepend a sentinel ``__DOCPULL_FENCE_LANG_X__``
|
|
336
|
+
as a NavigableString to the inner ``<code>`` (or to the ``<pre>`` itself
|
|
337
|
+
if no inner ``<code>`` exists). Post-conversion, the Markdown layer
|
|
338
|
+
pulls that sentinel back out of the rendered text and rewrites the
|
|
339
|
+
block as a GFM fenced code block with the language tag.
|
|
340
|
+
"""
|
|
341
|
+
for pre in content.find_all("pre"):
|
|
342
|
+
if not isinstance(pre, Tag):
|
|
343
|
+
continue
|
|
344
|
+
pre_classes = _classes_of(pre)
|
|
345
|
+
code = pre.find("code") if pre else None
|
|
346
|
+
code_classes: list[str] = []
|
|
347
|
+
if isinstance(code, Tag):
|
|
348
|
+
code_classes = _classes_of(code)
|
|
349
|
+
|
|
350
|
+
lang = _detect_lang(" ".join(pre_classes + code_classes))
|
|
351
|
+
if lang is None:
|
|
352
|
+
continue
|
|
353
|
+
|
|
354
|
+
sentinel = f"{DOCPULL_FENCE_SENTINEL_PREFIX}{lang}{DOCPULL_FENCE_SENTINEL_SUFFIX}\n"
|
|
355
|
+
target = code if isinstance(code, Tag) else pre
|
|
356
|
+
target.insert(0, sentinel)
|
|
@@ -4,11 +4,17 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
6
|
import re
|
|
7
|
+
import textwrap
|
|
7
8
|
from typing import Any
|
|
8
9
|
from urllib.parse import urljoin
|
|
9
10
|
|
|
10
11
|
import html2text
|
|
11
12
|
|
|
13
|
+
from .extractor import (
|
|
14
|
+
DOCPULL_FENCE_SENTINEL_PREFIX,
|
|
15
|
+
DOCPULL_FENCE_SENTINEL_SUFFIX,
|
|
16
|
+
)
|
|
17
|
+
|
|
12
18
|
logger = logging.getLogger(__name__)
|
|
13
19
|
|
|
14
20
|
|
|
@@ -17,6 +23,43 @@ def _normalize_scheme(url: str) -> str:
|
|
|
17
23
|
return re.sub(r"^(https?:)/(?!/)", r"\1//", url)
|
|
18
24
|
|
|
19
25
|
|
|
26
|
+
# html2text wraps <pre><code> in [code]/[/code] markers and indents the body
|
|
27
|
+
# by 4 spaces. The opening marker may carry trailing whitespace
|
|
28
|
+
# (`[code] \n`); tolerate it so we don't miss real code blocks.
|
|
29
|
+
_HTML2TEXT_CODE_BLOCK_RE = re.compile(
|
|
30
|
+
r"\[code\][ \t]*\n(.*?)\n[ \t]*\[/code\]",
|
|
31
|
+
re.DOTALL,
|
|
32
|
+
)
|
|
33
|
+
_FENCE_SENTINEL_RE = re.compile(
|
|
34
|
+
rf"^[ \t]*{re.escape(DOCPULL_FENCE_SENTINEL_PREFIX)}"
|
|
35
|
+
rf"([\w+#-]+){re.escape(DOCPULL_FENCE_SENTINEL_SUFFIX)}[ \t]*\n",
|
|
36
|
+
re.MULTILINE,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _rewrite_html2text_code_blocks(markdown: str) -> str:
|
|
41
|
+
"""Replace ``[code]...[/code]`` markers with GFM fenced blocks.
|
|
42
|
+
|
|
43
|
+
html2text indents the body of a ``[code]`` block by 4 spaces; we dedent
|
|
44
|
+
that consistently. If the body's first line is a docpull language
|
|
45
|
+
sentinel (injected by the extractor), the fence is opened with that
|
|
46
|
+
language; otherwise the fence is bare.
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
def replace(match: re.Match[str]) -> str:
|
|
50
|
+
body = match.group(1)
|
|
51
|
+
body = textwrap.dedent(body)
|
|
52
|
+
lang = ""
|
|
53
|
+
sentinel_match = _FENCE_SENTINEL_RE.match(body)
|
|
54
|
+
if sentinel_match:
|
|
55
|
+
lang = sentinel_match.group(1)
|
|
56
|
+
body = body[sentinel_match.end() :]
|
|
57
|
+
body = body.rstrip("\n")
|
|
58
|
+
return f"```{lang}\n{body}\n```"
|
|
59
|
+
|
|
60
|
+
return _HTML2TEXT_CODE_BLOCK_RE.sub(replace, markdown)
|
|
61
|
+
|
|
62
|
+
|
|
20
63
|
class HtmlToMarkdown:
|
|
21
64
|
"""
|
|
22
65
|
Converts HTML content to clean Markdown.
|
|
@@ -77,13 +120,16 @@ class HtmlToMarkdown:
|
|
|
77
120
|
|
|
78
121
|
def _clean_output(self, markdown: str) -> str:
|
|
79
122
|
"""Clean up the converted Markdown."""
|
|
123
|
+
# Convert html2text's [code]/[/code] markers into GFM fences,
|
|
124
|
+
# recovering the language tag from the docpull sentinel injected
|
|
125
|
+
# by MainContentExtractor when the source HTML carried a Prism /
|
|
126
|
+
# highlight.js / Shiki language class. Must run BEFORE blank-line
|
|
127
|
+
# collapsing so the rewritten fences sit on their own lines.
|
|
128
|
+
markdown = _rewrite_html2text_code_blocks(markdown)
|
|
129
|
+
|
|
80
130
|
# Remove excessive blank lines
|
|
81
131
|
markdown = re.sub(r"\n{3,}", "\n\n", markdown)
|
|
82
132
|
|
|
83
|
-
# Fix code block formatting
|
|
84
|
-
# Ensure code blocks have language hint
|
|
85
|
-
markdown = re.sub(r"```\n", "```\n", markdown)
|
|
86
|
-
|
|
87
133
|
# Unmangle html2text's protect_links output:
|
|
88
134
|
# [text](prefix/<https:/real.url>) -> [text](https://real.url)
|
|
89
135
|
# The angle-bracketed inner URL is the true absolute URL (the prefix is
|