domdown 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- domdown-0.1.1/AUTHORS.md +10 -0
- domdown-0.1.1/HISTORY.md +13 -0
- domdown-0.1.1/MANIFEST.in +10 -0
- domdown-0.1.1/PKG-INFO +316 -0
- domdown-0.1.1/README.md +254 -0
- domdown-0.1.1/domdown/__init__.py +16 -0
- domdown-0.1.1/domdown/_constants/__init__.py +33 -0
- domdown-0.1.1/domdown/_constants/selectors.py +305 -0
- domdown-0.1.1/domdown/_core/__init__.py +13 -0
- domdown-0.1.1/domdown/_core/context.py +22 -0
- domdown-0.1.1/domdown/_core/metadata.py +20 -0
- domdown-0.1.1/domdown/_core/options.py +23 -0
- domdown-0.1.1/domdown/_core/result.py +17 -0
- domdown-0.1.1/domdown/_document/__init__.py +7 -0
- domdown-0.1.1/domdown/_document/clean.py +549 -0
- domdown-0.1.1/domdown/_document/parse.py +9 -0
- domdown-0.1.1/domdown/_document/select.py +485 -0
- domdown-0.1.1/domdown/_frontmatter/__init__.py +6 -0
- domdown-0.1.1/domdown/_frontmatter/compose.py +10 -0
- domdown-0.1.1/domdown/_frontmatter/serialize.py +39 -0
- domdown-0.1.1/domdown/_metadata/__init__.py +5 -0
- domdown-0.1.1/domdown/_metadata/extract.py +97 -0
- domdown-0.1.1/domdown/_metadata/helpers.py +180 -0
- domdown-0.1.1/domdown/_metadata/selectors.py +79 -0
- domdown-0.1.1/domdown/_pipeline/__init__.py +5 -0
- domdown-0.1.1/domdown/_pipeline/runner.py +63 -0
- domdown-0.1.1/domdown/_text/__init__.py +16 -0
- domdown-0.1.1/domdown/_text/frontmatter.py +30 -0
- domdown-0.1.1/domdown/_text/normalize.py +20 -0
- domdown-0.1.1/domdown/_text/url.py +43 -0
- domdown-0.1.1/domdown/adapters/__init__.py +7 -0
- domdown-0.1.1/domdown/adapters/base.py +31 -0
- domdown-0.1.1/domdown/adapters/github.py +229 -0
- domdown-0.1.1/domdown/adapters/registry.py +46 -0
- domdown-0.1.1/domdown/api.py +15 -0
- domdown-0.1.1/domdown/markdown/__init__.py +17 -0
- domdown-0.1.1/domdown/markdown/block.py +349 -0
- domdown-0.1.1/domdown/markdown/code.py +54 -0
- domdown-0.1.1/domdown/markdown/images.py +64 -0
- domdown-0.1.1/domdown/markdown/inline.py +81 -0
- domdown-0.1.1/domdown/markdown/links.py +42 -0
- domdown-0.1.1/domdown/markdown/lists.py +40 -0
- domdown-0.1.1/domdown/markdown/postprocess.py +43 -0
- domdown-0.1.1/domdown/markdown/tables.py +52 -0
- domdown-0.1.1/domdown/stages/__init__.py +21 -0
- domdown-0.1.1/domdown/stages/base.py +18 -0
- domdown-0.1.1/domdown/stages/clean.py +57 -0
- domdown-0.1.1/domdown/stages/frontmatter.py +25 -0
- domdown-0.1.1/domdown/stages/markdown.py +23 -0
- domdown-0.1.1/domdown/stages/metadata.py +21 -0
- domdown-0.1.1/domdown/stages/parse.py +21 -0
- domdown-0.1.1/domdown/stages/postprocess.py +21 -0
- domdown-0.1.1/domdown/stages/preserve.py +67 -0
- domdown-0.1.1/domdown.egg-info/PKG-INFO +316 -0
- domdown-0.1.1/domdown.egg-info/SOURCES.txt +151 -0
- domdown-0.1.1/domdown.egg-info/dependency_links.txt +1 -0
- domdown-0.1.1/domdown.egg-info/entry_points.txt +2 -0
- domdown-0.1.1/domdown.egg-info/not-zip-safe +1 -0
- domdown-0.1.1/domdown.egg-info/requires.txt +17 -0
- domdown-0.1.1/domdown.egg-info/top_level.txt +1 -0
- domdown-0.1.1/pyproject.toml +22 -0
- domdown-0.1.1/requirements.txt +4 -0
- domdown-0.1.1/setup.cfg +23 -0
- domdown-0.1.1/setup.py +59 -0
- domdown-0.1.1/tests/__init__.py +1 -0
- domdown-0.1.1/tests/adapters/test_base.py +18 -0
- domdown-0.1.1/tests/adapters/test_github.py +56 -0
- domdown-0.1.1/tests/adapters/test_registry.py +73 -0
- domdown-0.1.1/tests/api/test_api.py +24 -0
- domdown-0.1.1/tests/api/test_package_layout.py +14 -0
- domdown-0.1.1/tests/core/test_core_models.py +51 -0
- domdown-0.1.1/tests/document/test_document_clean.py +1067 -0
- domdown-0.1.1/tests/document/test_document_parse.py +12 -0
- domdown-0.1.1/tests/document/test_document_select.py +459 -0
- domdown-0.1.1/tests/document/test_noise_markers.py +219 -0
- domdown-0.1.1/tests/document/test_partial_selectors.py +171 -0
- domdown-0.1.1/tests/fixtures/__init__.py +23 -0
- domdown-0.1.1/tests/fixtures/article_regressions.py +228 -0
- domdown-0.1.1/tests/fixtures/article_shell.py +41 -0
- domdown-0.1.1/tests/frontmatter/test_frontmatter.py +48 -0
- domdown-0.1.1/tests/markdown/test_markdown_block.py +518 -0
- domdown-0.1.1/tests/markdown/test_markdown_code.py +48 -0
- domdown-0.1.1/tests/markdown/test_markdown_images.py +31 -0
- domdown-0.1.1/tests/markdown/test_markdown_inline.py +61 -0
- domdown-0.1.1/tests/markdown/test_markdown_links.py +30 -0
- domdown-0.1.1/tests/markdown/test_markdown_lists.py +26 -0
- domdown-0.1.1/tests/markdown/test_markdown_postprocess.py +18 -0
- domdown-0.1.1/tests/markdown/test_markdown_tables.py +17 -0
- domdown-0.1.1/tests/metadata/test_metadata_extract.py +226 -0
- domdown-0.1.1/tests/metadata/test_metadata_helpers.py +57 -0
- domdown-0.1.1/tests/pipeline/test_article_shell.py +34 -0
- domdown-0.1.1/tests/pipeline/test_pipeline.py +27 -0
- domdown-0.1.1/tests/pipeline/test_stages.py +148 -0
- domdown-0.1.1/tests/real/README.md +19 -0
- domdown-0.1.1/tests/real/__init__.py +50 -0
- domdown-0.1.1/tests/real/html/0xmaz.me_posts_HookChain-A-Deep-Dive-into-Advanced-EDR-Bypass-Techniques.html +1 -0
- domdown-0.1.1/tests/real/html/1password.com_blog_as-ai-supercharges-phishing-scams-1password-introduces-built-in-protection.html +9 -0
- domdown-0.1.1/tests/real/html/404media.co_a-secure-chat-apps-encryption-is-so-bad-it-is-meaningless.html +905 -0
- domdown-0.1.1/tests/real/html/acronis_boto_cor_de_rosa_campaign_astaroth_whatsapp_brazil.html +195 -0
- domdown-0.1.1/tests/real/html/adnanthekhan_clinejection.html +313 -0
- domdown-0.1.1/tests/real/html/aikido.dev_blog_axios-npm-compromised-maintainer-hijacked-rat.html +2585 -0
- domdown-0.1.1/tests/real/html/arstechnica.com_security_2026_02_new-airsnitch-attack-breaks-wi-fi-encryption-in-homes-offices-and-enterprises.html +2305 -0
- domdown-0.1.1/tests/real/html/attack.mitre.org_techniques_T1102_001.html +1462 -0
- domdown-0.1.1/tests/real/html/blog.alyac.co.kr_5035.html +1725 -0
- domdown-0.1.1/tests/real/html/blog.ethiack.com_blog_bypassing-wafs-for-fun-and-js-injection-with-parameter-pollution.html +3843 -0
- domdown-0.1.1/tests/real/html/blog.gdatasoftware.com_2026_03_38385-acr-stealer-infrastructure.html +44 -0
- domdown-0.1.1/tests/real/html/blog.gdatasoftware.com_2026_03_38399-analysis-kissloader.html +115 -0
- domdown-0.1.1/tests/real/html/bushidotoken.net_2025_02_blackbasta-leaks-lessons-from-ascension.html +4853 -0
- domdown-0.1.1/tests/real/html/cisa.gov_news-events_alerts_2026_03_03_cisa-adds-two-known-exploited-vulnerabilities-catalog.html +1568 -0
- domdown-0.1.1/tests/real/html/dragos_threat_voltzite.html +3489 -0
- domdown-0.1.1/tests/real/html/elastic.co_security-labs_phantom-in-the-vault.html +319 -0
- domdown-0.1.1/tests/real/html/encryptionconsulting.com_enterprise-guide-to-pqc-migration.html +1871 -0
- domdown-0.1.1/tests/real/html/endorlabs.com_learn_how-ai-sast-traced-data-flows-to-uncover-six-openclaw-vulnerabilities.html +2614 -0
- domdown-0.1.1/tests/real/html/fortinet.com_corporate_about-us_newsroom_press-releases_2025_fortinet-threat-report-reveals-record-surge-in-automated-cyberattacks.html +8282 -0
- domdown-0.1.1/tests/real/html/github.com_BerriAI_litellm_issues_24518.html +1622 -0
- domdown-0.1.1/tests/real/html/github.com_PaloAltoNetworks_Unit42-timely-threat-intel_blob_main_2025-12-03-recent-surge-in-ClickFix-activity.txt.html +1484 -0
- domdown-0.1.1/tests/real/html/github.com_nodejs_node_releases_tag_v25.3.0.html +1862 -0
- domdown-0.1.1/tests/real/html/nisos.com_blog_dprk-remote-worker-fraud-interview.html +989 -0
- domdown-0.1.1/tests/real/html/safedep.io_malicious-ixpresso-core-npm-rat.html +11 -0
- domdown-0.1.1/tests/real/html/techzone.bitdefender.com_en_tech-explainers_what-is-dll-sideloading.html +88 -0
- domdown-0.1.1/tests/real/html/zimperium.com_blog_over-3000-android-malware-samples-using-multiple-techniques-to-bypass-detection.html +3340 -0
- domdown-0.1.1/tests/real/manifest.json +167 -0
- domdown-0.1.1/tests/real/raw/0xmaz.me_posts_HookChain-A-Deep-Dive-into-Advanced-EDR-Bypass-Techniques.md +157 -0
- domdown-0.1.1/tests/real/raw/1password.com_blog_as-ai-supercharges-phishing-scams-1password-introduces-built-in-protection.md +182 -0
- domdown-0.1.1/tests/real/raw/404media.co_a-secure-chat-apps-encryption-is-so-bad-it-is-meaningless.md +26 -0
- domdown-0.1.1/tests/real/raw/acronis_boto_cor_de_rosa_campaign_astaroth_whatsapp_brazil.md +158 -0
- domdown-0.1.1/tests/real/raw/adnanthekhan_clinejection.md +369 -0
- domdown-0.1.1/tests/real/raw/aikido.dev_blog_axios-npm-compromised-maintainer-hijacked-rat.md +147 -0
- domdown-0.1.1/tests/real/raw/arstechnica.com_security_2026_02_new-airsnitch-attack-breaks-wi-fi-encryption-in-homes-offices-and-enterprises.md +159 -0
- domdown-0.1.1/tests/real/raw/attack.mitre.org_techniques_T1102_001.md +134 -0
- domdown-0.1.1/tests/real/raw/blog.alyac.co.kr_5035.md +191 -0
- domdown-0.1.1/tests/real/raw/blog.ethiack.com_blog_bypassing-wafs-for-fun-and-js-injection-with-parameter-pollution.md +267 -0
- domdown-0.1.1/tests/real/raw/blog.gdatasoftware.com_2026_03_38385-acr-stealer-infrastructure.md +163 -0
- domdown-0.1.1/tests/real/raw/blog.gdatasoftware.com_2026_03_38399-analysis-kissloader.md +135 -0
- domdown-0.1.1/tests/real/raw/bushidotoken.net_2025_02_blackbasta-leaks-lessons-from-ascension.md +291 -0
- domdown-0.1.1/tests/real/raw/cisa.gov_news-events_alerts_2026_03_03_cisa-adds-two-known-exploited-vulnerabilities-catalog.md +19 -0
- domdown-0.1.1/tests/real/raw/dragos_threat_voltzite.md +48 -0
- domdown-0.1.1/tests/real/raw/elastic.co_security-labs_phantom-in-the-vault.md +476 -0
- domdown-0.1.1/tests/real/raw/encryptionconsulting.com_enterprise-guide-to-pqc-migration.md +222 -0
- domdown-0.1.1/tests/real/raw/endorlabs.com_learn_how-ai-sast-traced-data-flows-to-uncover-six-openclaw-vulnerabilities.md +677 -0
- domdown-0.1.1/tests/real/raw/fortinet.com_corporate_about-us_newsroom_press-releases_2025_fortinet-threat-report-reveals-record-surge-in-automated-cyberattacks.md +68 -0
- domdown-0.1.1/tests/real/raw/github.com_BerriAI_litellm_issues_24518.md +79 -0
- domdown-0.1.1/tests/real/raw/github.com_PaloAltoNetworks_Unit42-timely-threat-intel_blob_main_2025-12-03-recent-surge-in-ClickFix-activity.txt.md +76 -0
- domdown-0.1.1/tests/real/raw/github.com_nodejs_node_releases_tag_v25.3.0.md +61 -0
- domdown-0.1.1/tests/real/raw/nisos.com_blog_dprk-remote-worker-fraud-interview.md +82 -0
- domdown-0.1.1/tests/real/raw/safedep.io_malicious-ixpresso-core-npm-rat.md +203 -0
- domdown-0.1.1/tests/real/raw/techzone.bitdefender.com_en_tech-explainers_what-is-dll-sideloading.md +80 -0
- domdown-0.1.1/tests/real/raw/zimperium.com_blog_over-3000-android-malware-samples-using-multiple-techniques-to-bypass-detection.md +184 -0
- domdown-0.1.1/tests/real/test_real_examples.py +24 -0
- domdown-0.1.1/tests/text/test_text_frontmatter.py +23 -0
- domdown-0.1.1/tests/text/test_text_normalize.py +15 -0
- domdown-0.1.1/tests/text/test_text_url.py +28 -0
domdown-0.1.1/AUTHORS.md
ADDED
domdown-0.1.1/HISTORY.md
ADDED
domdown-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: domdown
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Extracts the main content from web pages and returns cleaned HTML, optional Markdown, and structured metadata.
|
|
5
|
+
Home-page: https://github.com/juanmcristobal/domdown
|
|
6
|
+
Author: Juan Manuel Cristóbal Moreno
|
|
7
|
+
Author-email: juanmcristobal@gmail.com
|
|
8
|
+
Keywords: domdown
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Natural Language :: English
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: AUTHORS.md
|
|
19
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
20
|
+
Requires-Dist: lxml>=5.0
|
|
21
|
+
Requires-Dist: soupsieve>=2.5
|
|
22
|
+
Requires-Dist: httpx>=0.27
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: black==24.4.0; extra == "dev"
|
|
25
|
+
Requires-Dist: isort==5.13.2; extra == "dev"
|
|
26
|
+
Requires-Dist: pip==24.0; extra == "dev"
|
|
27
|
+
Requires-Dist: bump2version==1.0.1; extra == "dev"
|
|
28
|
+
Requires-Dist: wheel==0.43.0; extra == "dev"
|
|
29
|
+
Requires-Dist: flake8==7.0.0; extra == "dev"
|
|
30
|
+
Requires-Dist: tox==4.14.2; extra == "dev"
|
|
31
|
+
Requires-Dist: coverage==7.4.4; extra == "dev"
|
|
32
|
+
Requires-Dist: pytest==8.1.1; extra == "dev"
|
|
33
|
+
Requires-Dist: build; extra == "dev"
|
|
34
|
+
Requires-Dist: twine==5.1.1; extra == "dev"
|
|
35
|
+
Dynamic: author
|
|
36
|
+
Dynamic: author-email
|
|
37
|
+
Dynamic: classifier
|
|
38
|
+
Dynamic: description
|
|
39
|
+
Dynamic: description-content-type
|
|
40
|
+
Dynamic: home-page
|
|
41
|
+
Dynamic: keywords
|
|
42
|
+
Dynamic: license-file
|
|
43
|
+
Dynamic: provides-extra
|
|
44
|
+
Dynamic: requires-dist
|
|
45
|
+
Dynamic: requires-python
|
|
46
|
+
Dynamic: summary
|
|
47
|
+
|
|
48
|
+
# domdown
|
|
49
|
+
|
|
50
|
+

|
|
51
|
+
|
|
52
|
+
`domdown` turns article-like web pages into clean, structured Markdown.
|
|
53
|
+
|
|
54
|
+
It is built for pages where the shape matters: long-form posts, research writeups, technical blogs, security reports, and other content-heavy pages that need to become readable Markdown without losing useful structure.
|
|
55
|
+
|
|
56
|
+
## What it does
|
|
57
|
+
|
|
58
|
+
`domdown` takes care of the full HTML-to-Markdown pipeline:
|
|
59
|
+
|
|
60
|
+
- Parses messy web HTML
|
|
61
|
+
- Selects the main article content
|
|
62
|
+
- Removes navigation, promo blocks, and other chrome
|
|
63
|
+
- Extracts metadata
|
|
64
|
+
- Preserves images, tables, code blocks, links, and lists
|
|
65
|
+
- Optionally emits YAML frontmatter
|
|
66
|
+
- Renders the final Markdown document
|
|
67
|
+
|
|
68
|
+
The result is Markdown that is ready to read, reuse, archive, or feed into another model.
|
|
69
|
+
|
|
70
|
+
## Why it exists
|
|
71
|
+
|
|
72
|
+
Most pages are not written like clean documents. They mix article content with menus, banners, share widgets, related links, and other page furniture.
|
|
73
|
+
|
|
74
|
+
`domdown` is designed for cases where you want the content to stay faithful to the original page while still producing a clean Markdown output that is easy to consume downstream.
|
|
75
|
+
|
|
76
|
+
## Example
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from domdown import DomdownOptions, html_to_markdown
|
|
80
|
+
|
|
81
|
+
html = """
|
|
82
|
+
<html>
|
|
83
|
+
<head>
|
|
84
|
+
<title>Credential theft campaign expands</title>
|
|
85
|
+
<meta name="description" content="A concise security article." />
|
|
86
|
+
<link rel="canonical" href="https://example.com/research/campaign" />
|
|
87
|
+
</head>
|
|
88
|
+
<body>
|
|
89
|
+
<nav>Home Pricing Docs</nav>
|
|
90
|
+
<article>
|
|
91
|
+
<h1>Credential theft campaign expands</h1>
|
|
92
|
+
<p>Researchers observed a new wave of phishing infrastructure.</p>
|
|
93
|
+
<figure>
|
|
94
|
+
<img src="/images/chart.png" alt="Campaign infrastructure chart" />
|
|
95
|
+
<figcaption>Campaign infrastructure by week.</figcaption>
|
|
96
|
+
</figure>
|
|
97
|
+
<ul>
|
|
98
|
+
<li>Windows targets increased.</li>
|
|
99
|
+
<li>Linux staging remained stable.</li>
|
|
100
|
+
</ul>
|
|
101
|
+
</article>
|
|
102
|
+
</body>
|
|
103
|
+
</html>
|
|
104
|
+
"""
|
|
105
|
+
|
|
106
|
+
markdown = html_to_markdown(
|
|
107
|
+
html,
|
|
108
|
+
DomdownOptions(base_url="https://example.com/research/campaign"),
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
print(markdown)
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Output:
|
|
115
|
+
|
|
116
|
+
```markdown
|
|
117
|
+
---
|
|
118
|
+
title: Credential theft campaign expands
|
|
119
|
+
source: "https://example.com/research/campaign"
|
|
120
|
+
description: A concise security article.
|
|
121
|
+
---
|
|
122
|
+
# Credential theft campaign expands
|
|
123
|
+
|
|
124
|
+
Researchers observed a new wave of phishing infrastructure.
|
|
125
|
+
|
|
126
|
+
![Campaign infrastructure chart](https://example.com/images/chart.png)
|
|
127
|
+
|
|
128
|
+
Campaign infrastructure by week.
|
|
129
|
+
|
|
130
|
+
- Windows targets increased.
|
|
131
|
+
- Linux staging remained stable.
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## What it preserves
|
|
135
|
+
|
|
136
|
+
`domdown` is optimized for article-style pages where useful structure should survive the conversion:
|
|
137
|
+
|
|
138
|
+
- Titles and headings
|
|
139
|
+
- Visible author and publication metadata
|
|
140
|
+
- Canonical URLs and source references
|
|
141
|
+
- Images and captions
|
|
142
|
+
- Tables and code blocks
|
|
143
|
+
- Inline links and emphasized text
|
|
144
|
+
- Lists, quotes, and other document structure
|
|
145
|
+
|
|
146
|
+
## Using domdown
|
|
147
|
+
|
|
148
|
+
### Client usage
|
|
149
|
+
|
|
150
|
+
Use `html_to_markdown()` when you only need the final Markdown document as a string.
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
from domdown import DomdownOptions, html_to_markdown
|
|
154
|
+
|
|
155
|
+
markdown = html_to_markdown(
|
|
156
|
+
html,
|
|
157
|
+
DomdownOptions(
|
|
158
|
+
base_url="https://example.com/post",
|
|
159
|
+
emit_frontmatter=False,
|
|
160
|
+
),
|
|
161
|
+
)
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
When `emit_frontmatter` is `True` (the default), the returned string includes YAML frontmatter followed by the Markdown body.
|
|
165
|
+
|
|
166
|
+
### API usage
|
|
167
|
+
|
|
168
|
+
Use `HtmlToMarkdownPipeline` when you want structured output.
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
from domdown import DomdownOptions, HtmlToMarkdownPipeline
|
|
172
|
+
|
|
173
|
+
pipeline = HtmlToMarkdownPipeline(
|
|
174
|
+
DomdownOptions(base_url="https://example.com/post")
|
|
175
|
+
)
|
|
176
|
+
result = pipeline.run(html)
|
|
177
|
+
|
|
178
|
+
print(result.document)
|
|
179
|
+
print(result.markdown)
|
|
180
|
+
print(result.cleaned_html)
|
|
181
|
+
print(result.frontmatter)
|
|
182
|
+
print(result.warnings)
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
`HtmlToMarkdownResult` exposes:
|
|
186
|
+
|
|
187
|
+
| Field | Type | Description |
|
|
188
|
+
| --- | --- | --- |
|
|
189
|
+
| `markdown` | `str` | Markdown rendered from the selected content. |
|
|
190
|
+
| `cleaned_html` | `str \| None` | HTML after parsing, selection, cleaning, and preservation. |
|
|
191
|
+
| `metadata` | `HtmlMetadata \| None` | Normalized metadata extracted from the source HTML. |
|
|
192
|
+
| `frontmatter` | `str \| None` | YAML frontmatter when enabled. |
|
|
193
|
+
| `document` | `str \| None` | Final document string, including frontmatter when enabled. |
|
|
194
|
+
| `warnings` | `tuple[str, ...]` | Non-fatal pipeline warnings. |
|
|
195
|
+
|
|
196
|
+
`HtmlMetadata` exposes:
|
|
197
|
+
|
|
198
|
+
| Field | Type |
|
|
199
|
+
| --- | --- |
|
|
200
|
+
| `title` | `str \| None` |
|
|
201
|
+
| `site_name` | `str \| None` |
|
|
202
|
+
| `source` | `str \| None` |
|
|
203
|
+
| `author` | `tuple[str, ...]` |
|
|
204
|
+
| `published` | `str \| None` |
|
|
205
|
+
| `created` | `str \| None` |
|
|
206
|
+
| `description` | `str \| None` |
|
|
207
|
+
| `tags` | `tuple[str, ...]` |
|
|
208
|
+
| `language` | `str \| None` |
|
|
209
|
+
| `canonical_url` | `str \| None` |
|
|
210
|
+
| `image` | `str \| None` |
|
|
211
|
+
|
|
212
|
+
## Options
|
|
213
|
+
|
|
214
|
+
`DomdownOptions` controls parsing, cleanup, metadata extraction, and output shaping.
|
|
215
|
+
|
|
216
|
+
| Option | Default | Behavior |
|
|
217
|
+
| --- | --- | --- |
|
|
218
|
+
| `base_url` | `None` | Source URL used for metadata and relative URL resolution. |
|
|
219
|
+
| `created` | `None` | Creation date to include in metadata/frontmatter. |
|
|
220
|
+
| `extract_metadata` | `True` | Enables metadata extraction. |
|
|
221
|
+
| `emit_frontmatter` | `True` | Prepends YAML frontmatter to `document`. |
|
|
222
|
+
| `prefer_article_body` | `True` | Prefers article-like containers during selection. |
|
|
223
|
+
| `author_priority` | `"visible"` | Chooses visible author text before metadata unless set otherwise. |
|
|
224
|
+
| `frontmatter_tags` | `()` | Extra tags to include in generated frontmatter. |
|
|
225
|
+
| `preserve_images` | `True` | Keeps images for Markdown rendering. |
|
|
226
|
+
| `preserve_tables` | `True` | Keeps tables for Markdown rendering. |
|
|
227
|
+
| `preserve_code_blocks` | `True` | Keeps code/preformatted blocks. |
|
|
228
|
+
| `strip_hidden` | `True` | Removes hidden or non-visible elements. |
|
|
229
|
+
| `remove_selectors` | `()` | CSS selectors to remove. |
|
|
230
|
+
| `keep_selectors` | `()` | CSS selectors to protect during cleaning. |
|
|
231
|
+
| `unwrap_selectors` | `()` | CSS selectors whose wrapper is removed while children remain. |
|
|
232
|
+
|
|
233
|
+
Example:
|
|
234
|
+
|
|
235
|
+
```python
|
|
236
|
+
from domdown import DomdownOptions
|
|
237
|
+
|
|
238
|
+
options = DomdownOptions(
|
|
239
|
+
base_url="https://example.com/article",
|
|
240
|
+
emit_frontmatter=True,
|
|
241
|
+
preserve_images=True,
|
|
242
|
+
remove_selectors=(".share-widget", ".newsletter-signup"),
|
|
243
|
+
)
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
## Real-world coverage
|
|
247
|
+
|
|
248
|
+
`domdown` includes curated real-world HTML/Markdown pairs under `tests/real/` to protect the pipeline against regressions on live site shapes.
|
|
249
|
+
|
|
250
|
+
- `html/` stores the captured HTML for each case.
|
|
251
|
+
- `raw/` stores the expected Markdown output for the same case.
|
|
252
|
+
- `manifest.json` declares the cases and their relative fixture paths.
|
|
253
|
+
|
|
254
|
+
To run the real-example suite:
|
|
255
|
+
|
|
256
|
+
```bash
|
|
257
|
+
pytest tests/real/test_real_examples.py -q
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
## Public API
|
|
261
|
+
|
|
262
|
+
`domdown` exports these names from `domdown.__init__`:
|
|
263
|
+
|
|
264
|
+
```python
|
|
265
|
+
from domdown import (
|
|
266
|
+
DomdownOptions,
|
|
267
|
+
HtmlMetadata,
|
|
268
|
+
HtmlToMarkdownPipeline,
|
|
269
|
+
HtmlToMarkdownResult,
|
|
270
|
+
html_to_markdown,
|
|
271
|
+
)
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
## Installation
|
|
275
|
+
|
|
276
|
+
Install from this repository:
|
|
277
|
+
|
|
278
|
+
```bash
|
|
279
|
+
pip install git+https://github.com/juanmcristobal/domdown.git
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
Install locally for development:
|
|
283
|
+
|
|
284
|
+
```bash
|
|
285
|
+
git clone https://github.com/juanmcristobal/domdown.git
|
|
286
|
+
cd domdown
|
|
287
|
+
pip install -e ".[dev]"
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
Runtime dependencies:
|
|
291
|
+
|
|
292
|
+
- `beautifulsoup4`
|
|
293
|
+
- `lxml`
|
|
294
|
+
- `soupsieve`
|
|
295
|
+
- `httpx`
|
|
296
|
+
|
|
297
|
+
## Support & Connect
|
|
298
|
+
|
|
299
|
+
* ⭐ **Star the repo** if you found it useful
|
|
300
|
+
* ☕ **Support me:** Say thanks by buying me a coffee! [https://buymeacoffee.com/juanmcristobal](https://buymeacoffee.com/juanmcristobal)
|
|
301
|
+
* 💼 **Open to work:** [https://www.linkedin.com/in/jmcristobal/](https://www.linkedin.com/in/jmcristobal/)
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
# History
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
## 0.1.0 (2026-05-21)
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
* First release.
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
## 0.1.1 (2026-05-31)
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
* Fix release workflow checkout for PyPI publish.
|
domdown-0.1.1/README.md
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
# domdown
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+
|
|
5
|
+
`domdown` turns article-like web pages into clean, structured Markdown.
|
|
6
|
+
|
|
7
|
+
It is built for pages where the shape matters: long-form posts, research writeups, technical blogs, security reports, and other content-heavy pages that need to become readable Markdown without losing useful structure.
|
|
8
|
+
|
|
9
|
+
## What it does
|
|
10
|
+
|
|
11
|
+
`domdown` takes care of the full HTML-to-Markdown pipeline:
|
|
12
|
+
|
|
13
|
+
- Parses messy web HTML
|
|
14
|
+
- Selects the main article content
|
|
15
|
+
- Removes navigation, promo blocks, and other chrome
|
|
16
|
+
- Extracts metadata
|
|
17
|
+
- Preserves images, tables, code blocks, links, and lists
|
|
18
|
+
- Optionally emits YAML frontmatter
|
|
19
|
+
- Renders the final Markdown document
|
|
20
|
+
|
|
21
|
+
The result is Markdown that is ready to read, reuse, archive, or feed into another model.
|
|
22
|
+
|
|
23
|
+
## Why it exists
|
|
24
|
+
|
|
25
|
+
Most pages are not written like clean documents. They mix article content with menus, banners, share widgets, related links, and other page furniture.
|
|
26
|
+
|
|
27
|
+
`domdown` is designed for cases where you want the content to stay faithful to the original page while still producing a clean Markdown output that is easy to consume downstream.
|
|
28
|
+
|
|
29
|
+
## Example
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from domdown import DomdownOptions, html_to_markdown
|
|
33
|
+
|
|
34
|
+
html = """
|
|
35
|
+
<html>
|
|
36
|
+
<head>
|
|
37
|
+
<title>Credential theft campaign expands</title>
|
|
38
|
+
<meta name="description" content="A concise security article." />
|
|
39
|
+
<link rel="canonical" href="https://example.com/research/campaign" />
|
|
40
|
+
</head>
|
|
41
|
+
<body>
|
|
42
|
+
<nav>Home Pricing Docs</nav>
|
|
43
|
+
<article>
|
|
44
|
+
<h1>Credential theft campaign expands</h1>
|
|
45
|
+
<p>Researchers observed a new wave of phishing infrastructure.</p>
|
|
46
|
+
<figure>
|
|
47
|
+
<img src="/images/chart.png" alt="Campaign infrastructure chart" />
|
|
48
|
+
<figcaption>Campaign infrastructure by week.</figcaption>
|
|
49
|
+
</figure>
|
|
50
|
+
<ul>
|
|
51
|
+
<li>Windows targets increased.</li>
|
|
52
|
+
<li>Linux staging remained stable.</li>
|
|
53
|
+
</ul>
|
|
54
|
+
</article>
|
|
55
|
+
</body>
|
|
56
|
+
</html>
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
markdown = html_to_markdown(
|
|
60
|
+
html,
|
|
61
|
+
DomdownOptions(base_url="https://example.com/research/campaign"),
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
print(markdown)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Output:
|
|
68
|
+
|
|
69
|
+
```markdown
|
|
70
|
+
---
|
|
71
|
+
title: Credential theft campaign expands
|
|
72
|
+
source: "https://example.com/research/campaign"
|
|
73
|
+
description: A concise security article.
|
|
74
|
+
---
|
|
75
|
+
# Credential theft campaign expands
|
|
76
|
+
|
|
77
|
+
Researchers observed a new wave of phishing infrastructure.
|
|
78
|
+
|
|
79
|
+
![Campaign infrastructure chart](https://example.com/images/chart.png)
|
|
80
|
+
|
|
81
|
+
Campaign infrastructure by week.
|
|
82
|
+
|
|
83
|
+
- Windows targets increased.
|
|
84
|
+
- Linux staging remained stable.
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## What it preserves
|
|
88
|
+
|
|
89
|
+
`domdown` is optimized for article-style pages where useful structure should survive the conversion:
|
|
90
|
+
|
|
91
|
+
- Titles and headings
|
|
92
|
+
- Visible author and publication metadata
|
|
93
|
+
- Canonical URLs and source references
|
|
94
|
+
- Images and captions
|
|
95
|
+
- Tables and code blocks
|
|
96
|
+
- Inline links and emphasized text
|
|
97
|
+
- Lists, quotes, and other document structure
|
|
98
|
+
|
|
99
|
+
## Using domdown
|
|
100
|
+
|
|
101
|
+
### Client usage
|
|
102
|
+
|
|
103
|
+
Use `html_to_markdown()` when you only need the final Markdown document as a string.
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from domdown import DomdownOptions, html_to_markdown
|
|
107
|
+
|
|
108
|
+
markdown = html_to_markdown(
|
|
109
|
+
html,
|
|
110
|
+
DomdownOptions(
|
|
111
|
+
base_url="https://example.com/post",
|
|
112
|
+
emit_frontmatter=False,
|
|
113
|
+
),
|
|
114
|
+
)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
When `emit_frontmatter` is `True` (the default), the returned string includes YAML frontmatter followed by the Markdown body.
|
|
118
|
+
|
|
119
|
+
### API usage
|
|
120
|
+
|
|
121
|
+
Use `HtmlToMarkdownPipeline` when you want structured output.
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from domdown import DomdownOptions, HtmlToMarkdownPipeline
|
|
125
|
+
|
|
126
|
+
pipeline = HtmlToMarkdownPipeline(
|
|
127
|
+
DomdownOptions(base_url="https://example.com/post")
|
|
128
|
+
)
|
|
129
|
+
result = pipeline.run(html)
|
|
130
|
+
|
|
131
|
+
print(result.document)
|
|
132
|
+
print(result.markdown)
|
|
133
|
+
print(result.cleaned_html)
|
|
134
|
+
print(result.frontmatter)
|
|
135
|
+
print(result.warnings)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
`HtmlToMarkdownResult` exposes:
|
|
139
|
+
|
|
140
|
+
| Field | Type | Description |
|
|
141
|
+
| --- | --- | --- |
|
|
142
|
+
| `markdown` | `str` | Markdown rendered from the selected content. |
|
|
143
|
+
| `cleaned_html` | `str \| None` | HTML after parsing, selection, cleaning, and preservation. |
|
|
144
|
+
| `metadata` | `HtmlMetadata \| None` | Normalized metadata extracted from the source HTML. |
|
|
145
|
+
| `frontmatter` | `str \| None` | YAML frontmatter when enabled. |
|
|
146
|
+
| `document` | `str \| None` | Final document string, including frontmatter when enabled. |
|
|
147
|
+
| `warnings` | `tuple[str, ...]` | Non-fatal pipeline warnings. |
|
|
148
|
+
|
|
149
|
+
`HtmlMetadata` exposes:
|
|
150
|
+
|
|
151
|
+
| Field | Type |
|
|
152
|
+
| --- | --- |
|
|
153
|
+
| `title` | `str \| None` |
|
|
154
|
+
| `site_name` | `str \| None` |
|
|
155
|
+
| `source` | `str \| None` |
|
|
156
|
+
| `author` | `tuple[str, ...]` |
|
|
157
|
+
| `published` | `str \| None` |
|
|
158
|
+
| `created` | `str \| None` |
|
|
159
|
+
| `description` | `str \| None` |
|
|
160
|
+
| `tags` | `tuple[str, ...]` |
|
|
161
|
+
| `language` | `str \| None` |
|
|
162
|
+
| `canonical_url` | `str \| None` |
|
|
163
|
+
| `image` | `str \| None` |
|
|
164
|
+
|
|
165
|
+
## Options
|
|
166
|
+
|
|
167
|
+
`DomdownOptions` controls parsing, cleanup, metadata extraction, and output shaping.
|
|
168
|
+
|
|
169
|
+
| Option | Default | Behavior |
|
|
170
|
+
| --- | --- | --- |
|
|
171
|
+
| `base_url` | `None` | Source URL used for metadata and relative URL resolution. |
|
|
172
|
+
| `created` | `None` | Creation date to include in metadata/frontmatter. |
|
|
173
|
+
| `extract_metadata` | `True` | Enables metadata extraction. |
|
|
174
|
+
| `emit_frontmatter` | `True` | Prepends YAML frontmatter to `document`. |
|
|
175
|
+
| `prefer_article_body` | `True` | Prefers article-like containers during selection. |
|
|
176
|
+
| `author_priority` | `"visible"` | Chooses visible author text before metadata unless set otherwise. |
|
|
177
|
+
| `frontmatter_tags` | `()` | Extra tags to include in generated frontmatter. |
|
|
178
|
+
| `preserve_images` | `True` | Keeps images for Markdown rendering. |
|
|
179
|
+
| `preserve_tables` | `True` | Keeps tables for Markdown rendering. |
|
|
180
|
+
| `preserve_code_blocks` | `True` | Keeps code/preformatted blocks. |
|
|
181
|
+
| `strip_hidden` | `True` | Removes hidden or non-visible elements. |
|
|
182
|
+
| `remove_selectors` | `()` | CSS selectors to remove. |
|
|
183
|
+
| `keep_selectors` | `()` | CSS selectors to protect during cleaning. |
|
|
184
|
+
| `unwrap_selectors` | `()` | CSS selectors whose wrapper is removed while children remain. |
|
|
185
|
+
|
|
186
|
+
Example:
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
from domdown import DomdownOptions
|
|
190
|
+
|
|
191
|
+
options = DomdownOptions(
|
|
192
|
+
base_url="https://example.com/article",
|
|
193
|
+
emit_frontmatter=True,
|
|
194
|
+
preserve_images=True,
|
|
195
|
+
remove_selectors=(".share-widget", ".newsletter-signup"),
|
|
196
|
+
)
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## Real-world coverage
|
|
200
|
+
|
|
201
|
+
`domdown` includes curated real-world HTML/Markdown pairs under `tests/real/` to protect the pipeline against regressions on live site shapes.
|
|
202
|
+
|
|
203
|
+
- `html/` stores the captured HTML for each case.
|
|
204
|
+
- `raw/` stores the expected Markdown output for the same case.
|
|
205
|
+
- `manifest.json` declares the cases and their relative fixture paths.
|
|
206
|
+
|
|
207
|
+
To run the real-example suite:
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
pytest tests/real/test_real_examples.py -q
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
## Public API
|
|
214
|
+
|
|
215
|
+
`domdown` exports these names from `domdown.__init__`:
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
from domdown import (
|
|
219
|
+
DomdownOptions,
|
|
220
|
+
HtmlMetadata,
|
|
221
|
+
HtmlToMarkdownPipeline,
|
|
222
|
+
HtmlToMarkdownResult,
|
|
223
|
+
html_to_markdown,
|
|
224
|
+
)
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
## Installation
|
|
228
|
+
|
|
229
|
+
Install from this repository:
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
pip install git+https://github.com/juanmcristobal/domdown.git
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
Install locally for development:
|
|
236
|
+
|
|
237
|
+
```bash
|
|
238
|
+
git clone https://github.com/juanmcristobal/domdown.git
|
|
239
|
+
cd domdown
|
|
240
|
+
pip install -e ".[dev]"
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
Runtime dependencies:
|
|
244
|
+
|
|
245
|
+
- `beautifulsoup4`
|
|
246
|
+
- `lxml`
|
|
247
|
+
- `soupsieve`
|
|
248
|
+
- `httpx`
|
|
249
|
+
|
|
250
|
+
## Support & Connect
|
|
251
|
+
|
|
252
|
+
* ⭐ **Star the repo** if you found it useful
|
|
253
|
+
* ☕ **Support me:** Say thanks by buying me a coffee! [https://buymeacoffee.com/juanmcristobal](https://buymeacoffee.com/juanmcristobal)
|
|
254
|
+
* 💼 **Open to work:** [https://www.linkedin.com/in/jmcristobal/](https://www.linkedin.com/in/jmcristobal/)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .api import html_to_markdown
|
|
4
|
+
from ._core import DomdownOptions, HtmlMetadata, HtmlToMarkdownResult
|
|
5
|
+
from ._pipeline import HtmlToMarkdownPipeline
|
|
6
|
+
|
|
7
|
+
__version__ = "0.1.1"  # must match the released package version declared in PKG-INFO
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"__version__",
|
|
11
|
+
"DomdownOptions",
|
|
12
|
+
"HtmlMetadata",
|
|
13
|
+
"HtmlToMarkdownResult",
|
|
14
|
+
"HtmlToMarkdownPipeline",
|
|
15
|
+
"html_to_markdown",
|
|
16
|
+
]
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .selectors import (
|
|
4
|
+
BOILERPLATE_PHRASES,
|
|
5
|
+
CONTENT_SELECTORS,
|
|
6
|
+
CONTENT_SELECTORS_EXACT,
|
|
7
|
+
CONTENT_SELECTORS_FALLBACK,
|
|
8
|
+
DEFAULT_REMOVE_SELECTORS,
|
|
9
|
+
JS_SHELL_PHRASES,
|
|
10
|
+
HEADER_MARKERS,
|
|
11
|
+
NOISE_MARKERS,
|
|
12
|
+
RELATED_PHRASES,
|
|
13
|
+
REFINABLE_CHILD_TAGS,
|
|
14
|
+
ROOT_SELECTORS,
|
|
15
|
+
SHARE_SELECTORS,
|
|
16
|
+
SKIP_TAGS,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"BOILERPLATE_PHRASES",
|
|
21
|
+
"CONTENT_SELECTORS",
|
|
22
|
+
"CONTENT_SELECTORS_EXACT",
|
|
23
|
+
"CONTENT_SELECTORS_FALLBACK",
|
|
24
|
+
"DEFAULT_REMOVE_SELECTORS",
|
|
25
|
+
"JS_SHELL_PHRASES",
|
|
26
|
+
"HEADER_MARKERS",
|
|
27
|
+
"NOISE_MARKERS",
|
|
28
|
+
"RELATED_PHRASES",
|
|
29
|
+
"REFINABLE_CHILD_TAGS",
|
|
30
|
+
"ROOT_SELECTORS",
|
|
31
|
+
"SHARE_SELECTORS",
|
|
32
|
+
"SKIP_TAGS",
|
|
33
|
+
]
|