domdown 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152)
  1. domdown-0.1.1/AUTHORS.md +10 -0
  2. domdown-0.1.1/HISTORY.md +13 -0
  3. domdown-0.1.1/MANIFEST.in +10 -0
  4. domdown-0.1.1/PKG-INFO +316 -0
  5. domdown-0.1.1/README.md +254 -0
  6. domdown-0.1.1/domdown/__init__.py +16 -0
  7. domdown-0.1.1/domdown/_constants/__init__.py +33 -0
  8. domdown-0.1.1/domdown/_constants/selectors.py +305 -0
  9. domdown-0.1.1/domdown/_core/__init__.py +13 -0
  10. domdown-0.1.1/domdown/_core/context.py +22 -0
  11. domdown-0.1.1/domdown/_core/metadata.py +20 -0
  12. domdown-0.1.1/domdown/_core/options.py +23 -0
  13. domdown-0.1.1/domdown/_core/result.py +17 -0
  14. domdown-0.1.1/domdown/_document/__init__.py +7 -0
  15. domdown-0.1.1/domdown/_document/clean.py +549 -0
  16. domdown-0.1.1/domdown/_document/parse.py +9 -0
  17. domdown-0.1.1/domdown/_document/select.py +485 -0
  18. domdown-0.1.1/domdown/_frontmatter/__init__.py +6 -0
  19. domdown-0.1.1/domdown/_frontmatter/compose.py +10 -0
  20. domdown-0.1.1/domdown/_frontmatter/serialize.py +39 -0
  21. domdown-0.1.1/domdown/_metadata/__init__.py +5 -0
  22. domdown-0.1.1/domdown/_metadata/extract.py +97 -0
  23. domdown-0.1.1/domdown/_metadata/helpers.py +180 -0
  24. domdown-0.1.1/domdown/_metadata/selectors.py +79 -0
  25. domdown-0.1.1/domdown/_pipeline/__init__.py +5 -0
  26. domdown-0.1.1/domdown/_pipeline/runner.py +63 -0
  27. domdown-0.1.1/domdown/_text/__init__.py +16 -0
  28. domdown-0.1.1/domdown/_text/frontmatter.py +30 -0
  29. domdown-0.1.1/domdown/_text/normalize.py +20 -0
  30. domdown-0.1.1/domdown/_text/url.py +43 -0
  31. domdown-0.1.1/domdown/adapters/__init__.py +7 -0
  32. domdown-0.1.1/domdown/adapters/base.py +31 -0
  33. domdown-0.1.1/domdown/adapters/github.py +229 -0
  34. domdown-0.1.1/domdown/adapters/registry.py +46 -0
  35. domdown-0.1.1/domdown/api.py +15 -0
  36. domdown-0.1.1/domdown/markdown/__init__.py +17 -0
  37. domdown-0.1.1/domdown/markdown/block.py +349 -0
  38. domdown-0.1.1/domdown/markdown/code.py +54 -0
  39. domdown-0.1.1/domdown/markdown/images.py +64 -0
  40. domdown-0.1.1/domdown/markdown/inline.py +81 -0
  41. domdown-0.1.1/domdown/markdown/links.py +42 -0
  42. domdown-0.1.1/domdown/markdown/lists.py +40 -0
  43. domdown-0.1.1/domdown/markdown/postprocess.py +43 -0
  44. domdown-0.1.1/domdown/markdown/tables.py +52 -0
  45. domdown-0.1.1/domdown/stages/__init__.py +21 -0
  46. domdown-0.1.1/domdown/stages/base.py +18 -0
  47. domdown-0.1.1/domdown/stages/clean.py +57 -0
  48. domdown-0.1.1/domdown/stages/frontmatter.py +25 -0
  49. domdown-0.1.1/domdown/stages/markdown.py +23 -0
  50. domdown-0.1.1/domdown/stages/metadata.py +21 -0
  51. domdown-0.1.1/domdown/stages/parse.py +21 -0
  52. domdown-0.1.1/domdown/stages/postprocess.py +21 -0
  53. domdown-0.1.1/domdown/stages/preserve.py +67 -0
  54. domdown-0.1.1/domdown.egg-info/PKG-INFO +316 -0
  55. domdown-0.1.1/domdown.egg-info/SOURCES.txt +151 -0
  56. domdown-0.1.1/domdown.egg-info/dependency_links.txt +1 -0
  57. domdown-0.1.1/domdown.egg-info/entry_points.txt +2 -0
  58. domdown-0.1.1/domdown.egg-info/not-zip-safe +1 -0
  59. domdown-0.1.1/domdown.egg-info/requires.txt +17 -0
  60. domdown-0.1.1/domdown.egg-info/top_level.txt +1 -0
  61. domdown-0.1.1/pyproject.toml +22 -0
  62. domdown-0.1.1/requirements.txt +4 -0
  63. domdown-0.1.1/setup.cfg +23 -0
  64. domdown-0.1.1/setup.py +59 -0
  65. domdown-0.1.1/tests/__init__.py +1 -0
  66. domdown-0.1.1/tests/adapters/test_base.py +18 -0
  67. domdown-0.1.1/tests/adapters/test_github.py +56 -0
  68. domdown-0.1.1/tests/adapters/test_registry.py +73 -0
  69. domdown-0.1.1/tests/api/test_api.py +24 -0
  70. domdown-0.1.1/tests/api/test_package_layout.py +14 -0
  71. domdown-0.1.1/tests/core/test_core_models.py +51 -0
  72. domdown-0.1.1/tests/document/test_document_clean.py +1067 -0
  73. domdown-0.1.1/tests/document/test_document_parse.py +12 -0
  74. domdown-0.1.1/tests/document/test_document_select.py +459 -0
  75. domdown-0.1.1/tests/document/test_noise_markers.py +219 -0
  76. domdown-0.1.1/tests/document/test_partial_selectors.py +171 -0
  77. domdown-0.1.1/tests/fixtures/__init__.py +23 -0
  78. domdown-0.1.1/tests/fixtures/article_regressions.py +228 -0
  79. domdown-0.1.1/tests/fixtures/article_shell.py +41 -0
  80. domdown-0.1.1/tests/frontmatter/test_frontmatter.py +48 -0
  81. domdown-0.1.1/tests/markdown/test_markdown_block.py +518 -0
  82. domdown-0.1.1/tests/markdown/test_markdown_code.py +48 -0
  83. domdown-0.1.1/tests/markdown/test_markdown_images.py +31 -0
  84. domdown-0.1.1/tests/markdown/test_markdown_inline.py +61 -0
  85. domdown-0.1.1/tests/markdown/test_markdown_links.py +30 -0
  86. domdown-0.1.1/tests/markdown/test_markdown_lists.py +26 -0
  87. domdown-0.1.1/tests/markdown/test_markdown_postprocess.py +18 -0
  88. domdown-0.1.1/tests/markdown/test_markdown_tables.py +17 -0
  89. domdown-0.1.1/tests/metadata/test_metadata_extract.py +226 -0
  90. domdown-0.1.1/tests/metadata/test_metadata_helpers.py +57 -0
  91. domdown-0.1.1/tests/pipeline/test_article_shell.py +34 -0
  92. domdown-0.1.1/tests/pipeline/test_pipeline.py +27 -0
  93. domdown-0.1.1/tests/pipeline/test_stages.py +148 -0
  94. domdown-0.1.1/tests/real/README.md +19 -0
  95. domdown-0.1.1/tests/real/__init__.py +50 -0
  96. domdown-0.1.1/tests/real/html/0xmaz.me_posts_HookChain-A-Deep-Dive-into-Advanced-EDR-Bypass-Techniques.html +1 -0
  97. domdown-0.1.1/tests/real/html/1password.com_blog_as-ai-supercharges-phishing-scams-1password-introduces-built-in-protection.html +9 -0
  98. domdown-0.1.1/tests/real/html/404media.co_a-secure-chat-apps-encryption-is-so-bad-it-is-meaningless.html +905 -0
  99. domdown-0.1.1/tests/real/html/acronis_boto_cor_de_rosa_campaign_astaroth_whatsapp_brazil.html +195 -0
  100. domdown-0.1.1/tests/real/html/adnanthekhan_clinejection.html +313 -0
  101. domdown-0.1.1/tests/real/html/aikido.dev_blog_axios-npm-compromised-maintainer-hijacked-rat.html +2585 -0
  102. domdown-0.1.1/tests/real/html/arstechnica.com_security_2026_02_new-airsnitch-attack-breaks-wi-fi-encryption-in-homes-offices-and-enterprises.html +2305 -0
  103. domdown-0.1.1/tests/real/html/attack.mitre.org_techniques_T1102_001.html +1462 -0
  104. domdown-0.1.1/tests/real/html/blog.alyac.co.kr_5035.html +1725 -0
  105. domdown-0.1.1/tests/real/html/blog.ethiack.com_blog_bypassing-wafs-for-fun-and-js-injection-with-parameter-pollution.html +3843 -0
  106. domdown-0.1.1/tests/real/html/blog.gdatasoftware.com_2026_03_38385-acr-stealer-infrastructure.html +44 -0
  107. domdown-0.1.1/tests/real/html/blog.gdatasoftware.com_2026_03_38399-analysis-kissloader.html +115 -0
  108. domdown-0.1.1/tests/real/html/bushidotoken.net_2025_02_blackbasta-leaks-lessons-from-ascension.html +4853 -0
  109. domdown-0.1.1/tests/real/html/cisa.gov_news-events_alerts_2026_03_03_cisa-adds-two-known-exploited-vulnerabilities-catalog.html +1568 -0
  110. domdown-0.1.1/tests/real/html/dragos_threat_voltzite.html +3489 -0
  111. domdown-0.1.1/tests/real/html/elastic.co_security-labs_phantom-in-the-vault.html +319 -0
  112. domdown-0.1.1/tests/real/html/encryptionconsulting.com_enterprise-guide-to-pqc-migration.html +1871 -0
  113. domdown-0.1.1/tests/real/html/endorlabs.com_learn_how-ai-sast-traced-data-flows-to-uncover-six-openclaw-vulnerabilities.html +2614 -0
  114. domdown-0.1.1/tests/real/html/fortinet.com_corporate_about-us_newsroom_press-releases_2025_fortinet-threat-report-reveals-record-surge-in-automated-cyberattacks.html +8282 -0
  115. domdown-0.1.1/tests/real/html/github.com_BerriAI_litellm_issues_24518.html +1622 -0
  116. domdown-0.1.1/tests/real/html/github.com_PaloAltoNetworks_Unit42-timely-threat-intel_blob_main_2025-12-03-recent-surge-in-ClickFix-activity.txt.html +1484 -0
  117. domdown-0.1.1/tests/real/html/github.com_nodejs_node_releases_tag_v25.3.0.html +1862 -0
  118. domdown-0.1.1/tests/real/html/nisos.com_blog_dprk-remote-worker-fraud-interview.html +989 -0
  119. domdown-0.1.1/tests/real/html/safedep.io_malicious-ixpresso-core-npm-rat.html +11 -0
  120. domdown-0.1.1/tests/real/html/techzone.bitdefender.com_en_tech-explainers_what-is-dll-sideloading.html +88 -0
  121. domdown-0.1.1/tests/real/html/zimperium.com_blog_over-3000-android-malware-samples-using-multiple-techniques-to-bypass-detection.html +3340 -0
  122. domdown-0.1.1/tests/real/manifest.json +167 -0
  123. domdown-0.1.1/tests/real/raw/0xmaz.me_posts_HookChain-A-Deep-Dive-into-Advanced-EDR-Bypass-Techniques.md +157 -0
  124. domdown-0.1.1/tests/real/raw/1password.com_blog_as-ai-supercharges-phishing-scams-1password-introduces-built-in-protection.md +182 -0
  125. domdown-0.1.1/tests/real/raw/404media.co_a-secure-chat-apps-encryption-is-so-bad-it-is-meaningless.md +26 -0
  126. domdown-0.1.1/tests/real/raw/acronis_boto_cor_de_rosa_campaign_astaroth_whatsapp_brazil.md +158 -0
  127. domdown-0.1.1/tests/real/raw/adnanthekhan_clinejection.md +369 -0
  128. domdown-0.1.1/tests/real/raw/aikido.dev_blog_axios-npm-compromised-maintainer-hijacked-rat.md +147 -0
  129. domdown-0.1.1/tests/real/raw/arstechnica.com_security_2026_02_new-airsnitch-attack-breaks-wi-fi-encryption-in-homes-offices-and-enterprises.md +159 -0
  130. domdown-0.1.1/tests/real/raw/attack.mitre.org_techniques_T1102_001.md +134 -0
  131. domdown-0.1.1/tests/real/raw/blog.alyac.co.kr_5035.md +191 -0
  132. domdown-0.1.1/tests/real/raw/blog.ethiack.com_blog_bypassing-wafs-for-fun-and-js-injection-with-parameter-pollution.md +267 -0
  133. domdown-0.1.1/tests/real/raw/blog.gdatasoftware.com_2026_03_38385-acr-stealer-infrastructure.md +163 -0
  134. domdown-0.1.1/tests/real/raw/blog.gdatasoftware.com_2026_03_38399-analysis-kissloader.md +135 -0
  135. domdown-0.1.1/tests/real/raw/bushidotoken.net_2025_02_blackbasta-leaks-lessons-from-ascension.md +291 -0
  136. domdown-0.1.1/tests/real/raw/cisa.gov_news-events_alerts_2026_03_03_cisa-adds-two-known-exploited-vulnerabilities-catalog.md +19 -0
  137. domdown-0.1.1/tests/real/raw/dragos_threat_voltzite.md +48 -0
  138. domdown-0.1.1/tests/real/raw/elastic.co_security-labs_phantom-in-the-vault.md +476 -0
  139. domdown-0.1.1/tests/real/raw/encryptionconsulting.com_enterprise-guide-to-pqc-migration.md +222 -0
  140. domdown-0.1.1/tests/real/raw/endorlabs.com_learn_how-ai-sast-traced-data-flows-to-uncover-six-openclaw-vulnerabilities.md +677 -0
  141. domdown-0.1.1/tests/real/raw/fortinet.com_corporate_about-us_newsroom_press-releases_2025_fortinet-threat-report-reveals-record-surge-in-automated-cyberattacks.md +68 -0
  142. domdown-0.1.1/tests/real/raw/github.com_BerriAI_litellm_issues_24518.md +79 -0
  143. domdown-0.1.1/tests/real/raw/github.com_PaloAltoNetworks_Unit42-timely-threat-intel_blob_main_2025-12-03-recent-surge-in-ClickFix-activity.txt.md +76 -0
  144. domdown-0.1.1/tests/real/raw/github.com_nodejs_node_releases_tag_v25.3.0.md +61 -0
  145. domdown-0.1.1/tests/real/raw/nisos.com_blog_dprk-remote-worker-fraud-interview.md +82 -0
  146. domdown-0.1.1/tests/real/raw/safedep.io_malicious-ixpresso-core-npm-rat.md +203 -0
  147. domdown-0.1.1/tests/real/raw/techzone.bitdefender.com_en_tech-explainers_what-is-dll-sideloading.md +80 -0
  148. domdown-0.1.1/tests/real/raw/zimperium.com_blog_over-3000-android-malware-samples-using-multiple-techniques-to-bypass-detection.md +184 -0
  149. domdown-0.1.1/tests/real/test_real_examples.py +24 -0
  150. domdown-0.1.1/tests/text/test_text_frontmatter.py +23 -0
  151. domdown-0.1.1/tests/text/test_text_normalize.py +15 -0
  152. domdown-0.1.1/tests/text/test_text_url.py +28 -0
@@ -0,0 +1,10 @@
1
+ # Credits
2
+
3
+
4
+ ## Development Lead
5
+
6
+ * Juan Manuel Cristóbal Moreno <juanmcristobal@gmail.com>
7
+
8
+ ## Contributors
9
+
10
+ None yet. Why not be the first?
@@ -0,0 +1,13 @@
1
+ # History
2
+
3
+
4
+ ## 0.1.0 (2026-05-21)
5
+
6
+
7
+ * First release.
8
+
9
+
10
+ ## 0.1.1 (2026-05-31)
11
+
12
+
13
+ * Fix release workflow checkout for PyPI publish.
@@ -0,0 +1,10 @@
1
+ include AUTHORS.md
2
+ include HISTORY.md
3
+ include README.md
4
+ include requirements.txt
5
+
6
+ recursive-include tests *
7
+ recursive-exclude * __pycache__
8
+ recursive-exclude * *.py[co]
9
+
10
+ recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
domdown-0.1.1/PKG-INFO ADDED
@@ -0,0 +1,316 @@
1
+ Metadata-Version: 2.4
2
+ Name: domdown
3
+ Version: 0.1.1
4
+ Summary: extracts the main content from web pages and returns cleaned HTML, optional markdown, and structured metadata.
5
+ Home-page: https://github.com/juanmcristobal/domdown
6
+ Author: Juan Manuel Cristóbal Moreno
7
+ Author-email: juanmcristobal@gmail.com
8
+ Keywords: domdown
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Natural Language :: English
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Requires-Python: >=3.10
17
+ Description-Content-Type: text/markdown
18
+ License-File: AUTHORS.md
19
+ Requires-Dist: beautifulsoup4>=4.12
20
+ Requires-Dist: lxml>=5.0
21
+ Requires-Dist: soupsieve>=2.5
22
+ Requires-Dist: httpx>=0.27
23
+ Provides-Extra: dev
24
+ Requires-Dist: black==24.4.0; extra == "dev"
25
+ Requires-Dist: isort==5.13.2; extra == "dev"
26
+ Requires-Dist: pip==24.0; extra == "dev"
27
+ Requires-Dist: bump2version==1.0.1; extra == "dev"
28
+ Requires-Dist: wheel==0.43.0; extra == "dev"
29
+ Requires-Dist: flake8==7.0.0; extra == "dev"
30
+ Requires-Dist: tox==4.14.2; extra == "dev"
31
+ Requires-Dist: coverage==7.4.4; extra == "dev"
32
+ Requires-Dist: pytest==8.1.1; extra == "dev"
33
+ Requires-Dist: build; extra == "dev"
34
+ Requires-Dist: twine==5.1.1; extra == "dev"
35
+ Dynamic: author
36
+ Dynamic: author-email
37
+ Dynamic: classifier
38
+ Dynamic: description
39
+ Dynamic: description-content-type
40
+ Dynamic: home-page
41
+ Dynamic: keywords
42
+ Dynamic: license-file
43
+ Dynamic: provides-extra
44
+ Dynamic: requires-dist
45
+ Dynamic: requires-python
46
+ Dynamic: summary
47
+
48
+ # domdown
49
+
50
+ ![domdown banner](assets/domdown-banner.jpg)
51
+
52
+ `domdown` turns article-like web pages into clean, structured Markdown.
53
+
54
+ It is built for pages where the shape matters: long-form posts, research writeups, technical blogs, security reports, and other content-heavy pages that need to become readable Markdown without losing useful structure.
55
+
56
+ ## What it does
57
+
58
+ `domdown` takes care of the full HTML-to-Markdown pipeline:
59
+
60
+ - Parses messy web HTML
61
+ - Selects the main article content
62
+ - Removes navigation, promo blocks, and other chrome
63
+ - Extracts metadata
64
+ - Preserves images, tables, code blocks, links, and lists
65
+ - Optionally emits YAML frontmatter
66
+ - Renders the final Markdown document
67
+
68
+ The result is Markdown that is ready to read, reuse, archive, or feed into another model.
69
+
70
+ ## Why it exists
71
+
72
+ Most pages are not written like clean documents. They mix article content with menus, banners, share widgets, related links, and other page furniture.
73
+
74
+ `domdown` is designed for cases where you want the content to stay faithful to the original page while still producing a clean Markdown output that is easy to consume downstream.
75
+
76
+ ## Example
77
+
78
+ ```python
79
+ from domdown import DomdownOptions, html_to_markdown
80
+
81
+ html = """
82
+ <html>
83
+ <head>
84
+ <title>Credential theft campaign expands</title>
85
+ <meta name="description" content="A concise security article." />
86
+ <link rel="canonical" href="https://example.com/research/campaign" />
87
+ </head>
88
+ <body>
89
+ <nav>Home Pricing Docs</nav>
90
+ <article>
91
+ <h1>Credential theft campaign expands</h1>
92
+ <p>Researchers observed a new wave of phishing infrastructure.</p>
93
+ <figure>
94
+ <img src="/images/chart.png" alt="Campaign infrastructure chart" />
95
+ <figcaption>Campaign infrastructure by week.</figcaption>
96
+ </figure>
97
+ <ul>
98
+ <li>Windows targets increased.</li>
99
+ <li>Linux staging remained stable.</li>
100
+ </ul>
101
+ </article>
102
+ </body>
103
+ </html>
104
+ """
105
+
106
+ markdown = html_to_markdown(
107
+ html,
108
+ DomdownOptions(base_url="https://example.com/research/campaign"),
109
+ )
110
+
111
+ print(markdown)
112
+ ```
113
+
114
+ Output:
115
+
116
+ ```markdown
117
+ ---
118
+ title: Credential theft campaign expands
119
+ source: "https://example.com/research/campaign"
120
+ description: A concise security article.
121
+ ---
122
+ # Credential theft campaign expands
123
+
124
+ Researchers observed a new wave of phishing infrastructure.
125
+
126
+ ![Campaign infrastructure chart](https://example.com/images/chart.png)
127
+
128
+ Campaign infrastructure by week.
129
+
130
+ - Windows targets increased.
131
+ - Linux staging remained stable.
132
+ ```
133
+
134
+ ## What it preserves
135
+
136
+ `domdown` is optimized for article-style pages where useful structure should survive the conversion:
137
+
138
+ - Titles and headings
139
+ - Visible author and publication metadata
140
+ - Canonical URLs and source references
141
+ - Images and captions
142
+ - Tables and code blocks
143
+ - Inline links and emphasized text
144
+ - Lists, quotes, and other document structure
145
+
146
+ ## Using domdown
147
+
148
+ ### Client usage
149
+
150
+ Use `html_to_markdown()` when you only need the final Markdown document as a string.
151
+
152
+ ```python
153
+ from domdown import DomdownOptions, html_to_markdown
154
+
155
+ markdown = html_to_markdown(
156
+ html,
157
+ DomdownOptions(
158
+ base_url="https://example.com/post",
159
+ emit_frontmatter=False,
160
+ ),
161
+ )
162
+ ```
163
+
164
+ When `emit_frontmatter` is `True` (the default), the returned string includes YAML frontmatter followed by the Markdown body.
165
+
166
+ ### API usage
167
+
168
+ Use `HtmlToMarkdownPipeline` when you want structured output.
169
+
170
+ ```python
171
+ from domdown import DomdownOptions, HtmlToMarkdownPipeline
172
+
173
+ pipeline = HtmlToMarkdownPipeline(
174
+ DomdownOptions(base_url="https://example.com/post")
175
+ )
176
+ result = pipeline.run(html)
177
+
178
+ print(result.document)
179
+ print(result.markdown)
180
+ print(result.cleaned_html)
181
+ print(result.frontmatter)
182
+ print(result.warnings)
183
+ ```
184
+
185
+ `HtmlToMarkdownResult` exposes:
186
+
187
+ | Field | Type | Description |
188
+ | --- | --- | --- |
189
+ | `markdown` | `str` | Markdown rendered from the selected content. |
190
+ | `cleaned_html` | `str \| None` | HTML after parsing, selection, cleaning, and preservation. |
191
+ | `metadata` | `HtmlMetadata \| None` | Normalized metadata extracted from the source HTML. |
192
+ | `frontmatter` | `str \| None` | YAML frontmatter when enabled. |
193
+ | `document` | `str \| None` | Final document string, including frontmatter when enabled. |
194
+ | `warnings` | `tuple[str, ...]` | Non-fatal pipeline warnings. |
195
+
196
+ `HtmlMetadata` exposes:
197
+
198
+ | Field | Type |
199
+ | --- | --- |
200
+ | `title` | `str \| None` |
201
+ | `site_name` | `str \| None` |
202
+ | `source` | `str \| None` |
203
+ | `author` | `tuple[str, ...]` |
204
+ | `published` | `str \| None` |
205
+ | `created` | `str \| None` |
206
+ | `description` | `str \| None` |
207
+ | `tags` | `tuple[str, ...]` |
208
+ | `language` | `str \| None` |
209
+ | `canonical_url` | `str \| None` |
210
+ | `image` | `str \| None` |
211
+
212
+ ## Options
213
+
214
+ `DomdownOptions` controls parsing, cleanup, metadata extraction, and output shaping.
215
+
216
+ | Option | Default | Behavior |
217
+ | --- | --- | --- |
218
+ | `base_url` | `None` | Source URL used for metadata and relative URL resolution. |
219
+ | `created` | `None` | Creation date to include in metadata/frontmatter. |
220
+ | `extract_metadata` | `True` | Enables metadata extraction. |
221
+ | `emit_frontmatter` | `True` | Prepends YAML frontmatter to `document`. |
222
+ | `prefer_article_body` | `True` | Prefers article-like containers during selection. |
223
+ | `author_priority` | `"visible"` | Chooses visible author text before metadata unless set otherwise. |
224
+ | `frontmatter_tags` | `()` | Extra tags to include in generated frontmatter. |
225
+ | `preserve_images` | `True` | Keeps images for Markdown rendering. |
226
+ | `preserve_tables` | `True` | Keeps tables for Markdown rendering. |
227
+ | `preserve_code_blocks` | `True` | Keeps code/preformatted blocks. |
228
+ | `strip_hidden` | `True` | Removes hidden or non-visible elements. |
229
+ | `remove_selectors` | `()` | CSS selectors to remove. |
230
+ | `keep_selectors` | `()` | CSS selectors to protect during cleaning. |
231
+ | `unwrap_selectors` | `()` | CSS selectors whose wrapper is removed while children remain. |
232
+
233
+ Example:
234
+
235
+ ```python
236
+ from domdown import DomdownOptions
237
+
238
+ options = DomdownOptions(
239
+ base_url="https://example.com/article",
240
+ emit_frontmatter=True,
241
+ preserve_images=True,
242
+ remove_selectors=(".share-widget", ".newsletter-signup"),
243
+ )
244
+ ```
245
+
246
+ ## Real-world coverage
247
+
248
+ `domdown` includes curated real-world HTML/Markdown pairs under `tests/real/` to protect the pipeline against regressions on live site shapes.
249
+
250
+ - `html/` stores the captured HTML for each case.
251
+ - `raw/` stores the expected Markdown output for the same case.
252
+ - `manifest.json` declares the cases and their relative fixture paths.
253
+
254
+ To run the real-example suite:
255
+
256
+ ```bash
257
+ pytest tests/real/test_real_examples.py -q
258
+ ```
259
+
260
+ ## Public API
261
+
262
+ `domdown` exports these names from `domdown.__init__`:
263
+
264
+ ```python
265
+ from domdown import (
266
+ DomdownOptions,
267
+ HtmlMetadata,
268
+ HtmlToMarkdownPipeline,
269
+ HtmlToMarkdownResult,
270
+ html_to_markdown,
271
+ )
272
+ ```
273
+
274
+ ## Installation
275
+
276
+ Install from this repository:
277
+
278
+ ```bash
279
+ pip install git+https://github.com/juanmcristobal/domdown.git
280
+ ```
281
+
282
+ Install locally for development:
283
+
284
+ ```bash
285
+ git clone https://github.com/juanmcristobal/domdown.git
286
+ cd domdown
287
+ pip install -e ".[dev]"
288
+ ```
289
+
290
+ Runtime dependencies:
291
+
292
+ - `beautifulsoup4`
293
+ - `lxml`
294
+ - `soupsieve`
295
+ - `httpx`
296
+
297
+ ## Support & Connect
298
+
299
+ * ⭐ **Star the repo** if you found it useful
300
+ * ☕ **Support me:** Say thanks by buying me a coffee! [https://buymeacoffee.com/juanmcristobal](https://buymeacoffee.com/juanmcristobal)
301
+ * 💼 **Open to work:** [https://www.linkedin.com/in/jmcristobal/](https://www.linkedin.com/in/jmcristobal/)
302
+
303
+
304
+ # History
305
+
306
+
307
+ ## 0.1.0 (2026-05-21)
308
+
309
+
310
+ * First release.
311
+
312
+
313
+ ## 0.1.1 (2026-05-31)
314
+
315
+
316
+ * Fix release workflow checkout for PyPI publish.
@@ -0,0 +1,254 @@
1
+ # domdown
2
+
3
+ ![domdown banner](assets/domdown-banner.jpg)
4
+
5
+ `domdown` turns article-like web pages into clean, structured Markdown.
6
+
7
+ It is built for pages where the shape matters: long-form posts, research writeups, technical blogs, security reports, and other content-heavy pages that need to become readable Markdown without losing useful structure.
8
+
9
+ ## What it does
10
+
11
+ `domdown` takes care of the full HTML-to-Markdown pipeline:
12
+
13
+ - Parses messy web HTML
14
+ - Selects the main article content
15
+ - Removes navigation, promo blocks, and other chrome
16
+ - Extracts metadata
17
+ - Preserves images, tables, code blocks, links, and lists
18
+ - Optionally emits YAML frontmatter
19
+ - Renders the final Markdown document
20
+
21
+ The result is Markdown that is ready to read, reuse, archive, or feed into another model.
22
+
23
+ ## Why it exists
24
+
25
+ Most pages are not written like clean documents. They mix article content with menus, banners, share widgets, related links, and other page furniture.
26
+
27
+ `domdown` is designed for cases where you want the content to stay faithful to the original page while still producing a clean Markdown output that is easy to consume downstream.
28
+
29
+ ## Example
30
+
31
+ ```python
32
+ from domdown import DomdownOptions, html_to_markdown
33
+
34
+ html = """
35
+ <html>
36
+ <head>
37
+ <title>Credential theft campaign expands</title>
38
+ <meta name="description" content="A concise security article." />
39
+ <link rel="canonical" href="https://example.com/research/campaign" />
40
+ </head>
41
+ <body>
42
+ <nav>Home Pricing Docs</nav>
43
+ <article>
44
+ <h1>Credential theft campaign expands</h1>
45
+ <p>Researchers observed a new wave of phishing infrastructure.</p>
46
+ <figure>
47
+ <img src="/images/chart.png" alt="Campaign infrastructure chart" />
48
+ <figcaption>Campaign infrastructure by week.</figcaption>
49
+ </figure>
50
+ <ul>
51
+ <li>Windows targets increased.</li>
52
+ <li>Linux staging remained stable.</li>
53
+ </ul>
54
+ </article>
55
+ </body>
56
+ </html>
57
+ """
58
+
59
+ markdown = html_to_markdown(
60
+ html,
61
+ DomdownOptions(base_url="https://example.com/research/campaign"),
62
+ )
63
+
64
+ print(markdown)
65
+ ```
66
+
67
+ Output:
68
+
69
+ ```markdown
70
+ ---
71
+ title: Credential theft campaign expands
72
+ source: "https://example.com/research/campaign"
73
+ description: A concise security article.
74
+ ---
75
+ # Credential theft campaign expands
76
+
77
+ Researchers observed a new wave of phishing infrastructure.
78
+
79
+ ![Campaign infrastructure chart](https://example.com/images/chart.png)
80
+
81
+ Campaign infrastructure by week.
82
+
83
+ - Windows targets increased.
84
+ - Linux staging remained stable.
85
+ ```
86
+
87
+ ## What it preserves
88
+
89
+ `domdown` is optimized for article-style pages where useful structure should survive the conversion:
90
+
91
+ - Titles and headings
92
+ - Visible author and publication metadata
93
+ - Canonical URLs and source references
94
+ - Images and captions
95
+ - Tables and code blocks
96
+ - Inline links and emphasized text
97
+ - Lists, quotes, and other document structure
98
+
99
+ ## Using domdown
100
+
101
+ ### Client usage
102
+
103
+ Use `html_to_markdown()` when you only need the final Markdown document as a string.
104
+
105
+ ```python
106
+ from domdown import DomdownOptions, html_to_markdown
107
+
108
+ markdown = html_to_markdown(
109
+ html,
110
+ DomdownOptions(
111
+ base_url="https://example.com/post",
112
+ emit_frontmatter=False,
113
+ ),
114
+ )
115
+ ```
116
+
117
+ When `emit_frontmatter` is `True` (the default), the returned string includes YAML frontmatter followed by the Markdown body.
118
+
119
+ ### API usage
120
+
121
+ Use `HtmlToMarkdownPipeline` when you want structured output.
122
+
123
+ ```python
124
+ from domdown import DomdownOptions, HtmlToMarkdownPipeline
125
+
126
+ pipeline = HtmlToMarkdownPipeline(
127
+ DomdownOptions(base_url="https://example.com/post")
128
+ )
129
+ result = pipeline.run(html)
130
+
131
+ print(result.document)
132
+ print(result.markdown)
133
+ print(result.cleaned_html)
134
+ print(result.frontmatter)
135
+ print(result.warnings)
136
+ ```
137
+
138
+ `HtmlToMarkdownResult` exposes:
139
+
140
+ | Field | Type | Description |
141
+ | --- | --- | --- |
142
+ | `markdown` | `str` | Markdown rendered from the selected content. |
143
+ | `cleaned_html` | `str \| None` | HTML after parsing, selection, cleaning, and preservation. |
144
+ | `metadata` | `HtmlMetadata \| None` | Normalized metadata extracted from the source HTML. |
145
+ | `frontmatter` | `str \| None` | YAML frontmatter when enabled. |
146
+ | `document` | `str \| None` | Final document string, including frontmatter when enabled. |
147
+ | `warnings` | `tuple[str, ...]` | Non-fatal pipeline warnings. |
148
+
149
+ `HtmlMetadata` exposes:
150
+
151
+ | Field | Type |
152
+ | --- | --- |
153
+ | `title` | `str \| None` |
154
+ | `site_name` | `str \| None` |
155
+ | `source` | `str \| None` |
156
+ | `author` | `tuple[str, ...]` |
157
+ | `published` | `str \| None` |
158
+ | `created` | `str \| None` |
159
+ | `description` | `str \| None` |
160
+ | `tags` | `tuple[str, ...]` |
161
+ | `language` | `str \| None` |
162
+ | `canonical_url` | `str \| None` |
163
+ | `image` | `str \| None` |
164
+
165
+ ## Options
166
+
167
+ `DomdownOptions` controls parsing, cleanup, metadata extraction, and output shaping.
168
+
169
+ | Option | Default | Behavior |
170
+ | --- | --- | --- |
171
+ | `base_url` | `None` | Source URL used for metadata and relative URL resolution. |
172
+ | `created` | `None` | Creation date to include in metadata/frontmatter. |
173
+ | `extract_metadata` | `True` | Enables metadata extraction. |
174
+ | `emit_frontmatter` | `True` | Prepends YAML frontmatter to `document`. |
175
+ | `prefer_article_body` | `True` | Prefers article-like containers during selection. |
176
+ | `author_priority` | `"visible"` | Chooses visible author text before metadata unless set otherwise. |
177
+ | `frontmatter_tags` | `()` | Extra tags to include in generated frontmatter. |
178
+ | `preserve_images` | `True` | Keeps images for Markdown rendering. |
179
+ | `preserve_tables` | `True` | Keeps tables for Markdown rendering. |
180
+ | `preserve_code_blocks` | `True` | Keeps code/preformatted blocks. |
181
+ | `strip_hidden` | `True` | Removes hidden or non-visible elements. |
182
+ | `remove_selectors` | `()` | CSS selectors to remove. |
183
+ | `keep_selectors` | `()` | CSS selectors to protect during cleaning. |
184
+ | `unwrap_selectors` | `()` | CSS selectors whose wrapper is removed while children remain. |
185
+
186
+ Example:
187
+
188
+ ```python
189
+ from domdown import DomdownOptions
190
+
191
+ options = DomdownOptions(
192
+ base_url="https://example.com/article",
193
+ emit_frontmatter=True,
194
+ preserve_images=True,
195
+ remove_selectors=(".share-widget", ".newsletter-signup"),
196
+ )
197
+ ```
198
+
199
+ ## Real-world coverage
200
+
201
+ `domdown` includes curated real-world HTML/Markdown pairs under `tests/real/` to protect the pipeline against regressions on live site shapes.
202
+
203
+ - `html/` stores the captured HTML for each case.
204
+ - `raw/` stores the expected Markdown output for the same case.
205
+ - `manifest.json` declares the cases and their relative fixture paths.
206
+
207
+ To run the real-example suite:
208
+
209
+ ```bash
210
+ pytest tests/real/test_real_examples.py -q
211
+ ```
212
+
213
+ ## Public API
214
+
215
+ `domdown` exports these names from `domdown.__init__`:
216
+
217
+ ```python
218
+ from domdown import (
219
+ DomdownOptions,
220
+ HtmlMetadata,
221
+ HtmlToMarkdownPipeline,
222
+ HtmlToMarkdownResult,
223
+ html_to_markdown,
224
+ )
225
+ ```
226
+
227
+ ## Installation
228
+
229
+ Install from this repository:
230
+
231
+ ```bash
232
+ pip install git+https://github.com/juanmcristobal/domdown.git
233
+ ```
234
+
235
+ Install locally for development:
236
+
237
+ ```bash
238
+ git clone https://github.com/juanmcristobal/domdown.git
239
+ cd domdown
240
+ pip install -e ".[dev]"
241
+ ```
242
+
243
+ Runtime dependencies:
244
+
245
+ - `beautifulsoup4`
246
+ - `lxml`
247
+ - `soupsieve`
248
+ - `httpx`
249
+
250
+ ## Support & Connect
251
+
252
+ * ⭐ **Star the repo** if you found it useful
253
+ * ☕ **Support me:** Say thanks by buying me a coffee! [https://buymeacoffee.com/juanmcristobal](https://buymeacoffee.com/juanmcristobal)
254
+ * 💼 **Open to work:** [https://www.linkedin.com/in/jmcristobal/](https://www.linkedin.com/in/jmcristobal/)
@@ -0,0 +1,16 @@
1
+ from __future__ import annotations
2
+
3
+ from .api import html_to_markdown
4
+ from ._core import DomdownOptions, HtmlMetadata, HtmlToMarkdownResult
5
+ from ._pipeline import HtmlToMarkdownPipeline
6
+
7
+ __version__ = "0.1.0"
8
+
9
+ __all__ = [
10
+ "__version__",
11
+ "DomdownOptions",
12
+ "HtmlMetadata",
13
+ "HtmlToMarkdownResult",
14
+ "HtmlToMarkdownPipeline",
15
+ "html_to_markdown",
16
+ ]
@@ -0,0 +1,33 @@
1
+ from __future__ import annotations
2
+
3
+ from .selectors import (
4
+ BOILERPLATE_PHRASES,
5
+ CONTENT_SELECTORS,
6
+ CONTENT_SELECTORS_EXACT,
7
+ CONTENT_SELECTORS_FALLBACK,
8
+ DEFAULT_REMOVE_SELECTORS,
9
+ JS_SHELL_PHRASES,
10
+ HEADER_MARKERS,
11
+ NOISE_MARKERS,
12
+ RELATED_PHRASES,
13
+ REFINABLE_CHILD_TAGS,
14
+ ROOT_SELECTORS,
15
+ SHARE_SELECTORS,
16
+ SKIP_TAGS,
17
+ )
18
+
19
+ __all__ = [
20
+ "BOILERPLATE_PHRASES",
21
+ "CONTENT_SELECTORS",
22
+ "CONTENT_SELECTORS_EXACT",
23
+ "CONTENT_SELECTORS_FALLBACK",
24
+ "DEFAULT_REMOVE_SELECTORS",
25
+ "JS_SHELL_PHRASES",
26
+ "HEADER_MARKERS",
27
+ "NOISE_MARKERS",
28
+ "RELATED_PHRASES",
29
+ "REFINABLE_CHILD_TAGS",
30
+ "ROOT_SELECTORS",
31
+ "SHARE_SELECTORS",
32
+ "SKIP_TAGS",
33
+ ]