html-to-markdown 1.1.0__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -5,6 +5,7 @@
5
5
  .coverage
6
6
  .env
7
7
  .idea/
8
+ .run/
8
9
  .mypy_cache/
9
10
  .pdm-build/
10
11
  .pdm-python
@@ -1,7 +1,7 @@
1
1
  The MIT License (MIT)
2
2
 
3
3
  Copyright 2012-2018 Matthew Tretter
4
- Copyright 2024 Na'aman Hirschfeld
4
+ Copyright 2024-2025 Na'aman Hirschfeld
5
5
 
6
6
  Permission is hereby granted, free of charge, to any person obtaining a copy
7
7
  of this software and associated documentation files (the "Software"), to deal
@@ -0,0 +1,220 @@
1
+ Metadata-Version: 2.4
2
+ Name: html-to-markdown
3
+ Version: 1.2.1
4
+ Summary: Convert HTML to markdown
5
+ Project-URL: homepage, https://github.com/Goldziher/html-to-markdown
6
+ Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
7
+ License: MIT
8
+ License-File: LICENSE
9
+ Keywords: converter,html,markdown,text-extraction,text-processing
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3 :: Only
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Text Processing
20
+ Classifier: Topic :: Text Processing :: Markup
21
+ Classifier: Topic :: Text Processing :: Markup :: HTML
22
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
23
+ Classifier: Topic :: Utilities
24
+ Classifier: Typing :: Typed
25
+ Requires-Python: >=3.9
26
+ Requires-Dist: beautifulsoup4>=4.12.3
27
+ Description-Content-Type: text/markdown
28
+
29
+ # html-to-markdown
30
+
31
+ A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
32
+ of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
33
+ Python 3.9+.
34
+
35
+ ## Features
36
+
37
+ - Full type safety with strict MyPy adherence
38
+ - Functional API design
39
+ - Extensive test coverage
40
+ - Configurable conversion options
41
+ - CLI tool for easy conversions
42
+ - Support for pre-configured BeautifulSoup instances
43
+ - Strict semver versioning
44
+
45
+ ## Installation
46
+
47
+ ```shell
48
+ pip install html-to-markdown
49
+ ```
50
+
51
+ ## Quick Start
52
+
53
+ Convert HTML to Markdown with a single function call:
54
+
55
+ ```python
56
+ from html_to_markdown import convert_to_markdown
57
+
58
+ html = """
59
+ <article>
60
+ <h1>Welcome</h1>
61
+ <p>This is a <strong>sample</strong> with a <a href="https://example.com">link</a>.</p>
62
+ <ul>
63
+ <li>Item 1</li>
64
+ <li>Item 2</li>
65
+ </ul>
66
+ </article>
67
+ """
68
+
69
+ markdown = convert_to_markdown(html)
70
+ print(markdown)
71
+ ```
72
+
73
+ Output:
74
+
75
+ ```markdown
76
+ # Welcome
77
+
78
+ This is a **sample** with a [link](https://example.com).
79
+
80
+ * Item 1
81
+ * Item 2
82
+ ```
83
+
84
+ ### Working with BeautifulSoup
85
+
86
+ If you need more control over HTML parsing, you can pass a pre-configured BeautifulSoup instance:
87
+
88
+ ```python
89
+ from bs4 import BeautifulSoup
90
+ from html_to_markdown import convert_to_markdown
91
+
92
+ # Configure BeautifulSoup with your preferred parser
93
+ soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installation
94
+ markdown = convert_to_markdown(soup)
95
+ ```
96
+
97
+ ## Advanced Usage
98
+
99
+ ### Customizing Conversion Options
100
+
101
+ The library offers extensive customization through various options:
102
+
103
+ ```python
104
+ from html_to_markdown import convert_to_markdown
105
+
106
+ html = "<div>Your content here...</div>"
107
+ markdown = convert_to_markdown(
108
+ html,
109
+ heading_style="atx", # Use # style headers
110
+ strong_em_symbol="*", # Use * for bold/italic
111
+ bullets="*+-", # Define bullet point characters
112
+ wrap=True, # Enable text wrapping
113
+ wrap_width=100, # Set wrap width
114
+ escape_asterisks=True, # Escape * characters
115
+ code_language="python", # Default code block language
116
+ )
117
+ ```
118
+
119
+ ### Configuration Options
120
+
121
+ | Option | Type | Default | Description |
122
+ | -------------------- | ---- | -------------- | ------------------------------------------------------ |
123
+ | `autolinks` | bool | `True` | Auto-convert URLs to Markdown links |
124
+ | `bullets` | str | `'*+-'` | Characters to use for bullet points |
125
+ | `code_language` | str | `''` | Default language for code blocks |
126
+ | `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`) |
127
+ | `escape_asterisks` | bool | `True` | Escape * characters |
128
+ | `escape_underscores` | bool | `True` | Escape _ characters |
129
+ | `wrap` | bool | `False` | Enable text wrapping |
130
+ | `wrap_width` | int | `80` | Text wrap width |
131
+
132
+ For a complete list of options, see the [Configuration](#configuration) section below.
133
+
134
+ ## CLI Usage
135
+
136
+ Convert HTML files directly from the command line:
137
+
138
+ ```shell
139
+ # Convert a file
140
+ html_to_markdown input.html > output.md
141
+
142
+ # Process stdin
143
+ cat input.html | html_to_markdown > output.md
144
+
145
+ # Use custom options
146
+ html_to_markdown --heading-style atx --wrap --wrap-width 100 input.html > output.md
147
+ ```
148
+
149
+ View all available options:
150
+
151
+ ```shell
152
+ html_to_markdown --help
153
+ ```
154
+
155
+ ## Migration from Markdownify
156
+
157
+ For existing projects using Markdownify, a compatibility layer is provided:
158
+
159
+ ```python
160
+ # Old code
161
+ from markdownify import markdownify as md
162
+
163
+ # New code - works the same way
164
+ from html_to_markdown import markdownify as md
165
+ ```
166
+
167
+ The `markdownify` function is an alias for `convert_to_markdown` and provides identical functionality.
168
+
169
+ ## Configuration
170
+
171
+ Full list of configuration options:
172
+
173
+ - `autolinks`: Convert valid URLs to Markdown links automatically
174
+ - `bullets`: Characters to use for bullet points in lists
175
+ - `code_language`: Default language for fenced code blocks
176
+ - `code_language_callback`: Function to determine code block language
177
+ - `convert`: List of HTML tags to convert (None = all supported tags)
178
+ - `default_title`: Use default titles for elements like links
179
+ - `escape_asterisks`: Escape * characters
180
+ - `escape_misc`: Escape miscellaneous Markdown characters
181
+ - `escape_underscores`: Escape _ characters
182
+ - `heading_style`: Header style (underlined/atx/atx_closed)
183
+ - `keep_inline_images_in`: Tags where inline images should be kept
184
+ - `newline_style`: Style for handling newlines (spaces/backslash)
185
+ - `strip`: Tags to remove from output
186
+ - `strong_em_symbol`: Symbol for strong/emphasized text (\* or \_)
187
+ - `sub_symbol`: Symbol for subscript text
188
+ - `sup_symbol`: Symbol for superscript text
189
+ - `wrap`: Enable text wrapping
190
+ - `wrap_width`: Width for text wrapping
191
+ - `convert_as_inline`: Treat content as inline elements
192
+
193
+ ## Contribution
194
+
195
+ This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before
196
+ submitting PRs to avoid disappointment.
197
+
198
+ ### Local Development
199
+
200
+ 1. Clone the repo
201
+
202
+ 1. Install the system dependencies
203
+
204
+ 1. Install the full dependencies with `uv sync`
205
+
206
+ 1. Install the pre-commit hooks with:
207
+
208
+ ```shell
209
+ pre-commit install && pre-commit install --hook-type commit-msg
210
+ ```
211
+
212
+ 1. Make your changes and submit a PR
213
+
214
+ ## License
215
+
216
+ This library uses the MIT license.
217
+
218
+ ## Acknowledgments
219
+
220
+ Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
@@ -0,0 +1,192 @@
1
+ # html-to-markdown
2
+
3
+ A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
4
+ of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
5
+ Python 3.9+.
6
+
7
+ ## Features
8
+
9
+ - Full type safety with strict MyPy adherence
10
+ - Functional API design
11
+ - Extensive test coverage
12
+ - Configurable conversion options
13
+ - CLI tool for easy conversions
14
+ - Support for pre-configured BeautifulSoup instances
15
+ - Strict semver versioning
16
+
17
+ ## Installation
18
+
19
+ ```shell
20
+ pip install html-to-markdown
21
+ ```
22
+
23
+ ## Quick Start
24
+
25
+ Convert HTML to Markdown with a single function call:
26
+
27
+ ```python
28
+ from html_to_markdown import convert_to_markdown
29
+
30
+ html = """
31
+ <article>
32
+ <h1>Welcome</h1>
33
+ <p>This is a <strong>sample</strong> with a <a href="https://example.com">link</a>.</p>
34
+ <ul>
35
+ <li>Item 1</li>
36
+ <li>Item 2</li>
37
+ </ul>
38
+ </article>
39
+ """
40
+
41
+ markdown = convert_to_markdown(html)
42
+ print(markdown)
43
+ ```
44
+
45
+ Output:
46
+
47
+ ```markdown
48
+ # Welcome
49
+
50
+ This is a **sample** with a [link](https://example.com).
51
+
52
+ * Item 1
53
+ * Item 2
54
+ ```
55
+
56
+ ### Working with BeautifulSoup
57
+
58
+ If you need more control over HTML parsing, you can pass a pre-configured BeautifulSoup instance:
59
+
60
+ ```python
61
+ from bs4 import BeautifulSoup
62
+ from html_to_markdown import convert_to_markdown
63
+
64
+ # Configure BeautifulSoup with your preferred parser
65
+ soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installation
66
+ markdown = convert_to_markdown(soup)
67
+ ```
68
+
69
+ ## Advanced Usage
70
+
71
+ ### Customizing Conversion Options
72
+
73
+ The library offers extensive customization through various options:
74
+
75
+ ```python
76
+ from html_to_markdown import convert_to_markdown
77
+
78
+ html = "<div>Your content here...</div>"
79
+ markdown = convert_to_markdown(
80
+ html,
81
+ heading_style="atx", # Use # style headers
82
+ strong_em_symbol="*", # Use * for bold/italic
83
+ bullets="*+-", # Define bullet point characters
84
+ wrap=True, # Enable text wrapping
85
+ wrap_width=100, # Set wrap width
86
+ escape_asterisks=True, # Escape * characters
87
+ code_language="python", # Default code block language
88
+ )
89
+ ```
90
+
91
+ ### Configuration Options
92
+
93
+ | Option | Type | Default | Description |
94
+ | -------------------- | ---- | -------------- | ------------------------------------------------------ |
95
+ | `autolinks` | bool | `True` | Auto-convert URLs to Markdown links |
96
+ | `bullets` | str | `'*+-'` | Characters to use for bullet points |
97
+ | `code_language` | str | `''` | Default language for code blocks |
98
+ | `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`) |
99
+ | `escape_asterisks` | bool | `True` | Escape * characters |
100
+ | `escape_underscores` | bool | `True` | Escape _ characters |
101
+ | `wrap` | bool | `False` | Enable text wrapping |
102
+ | `wrap_width` | int | `80` | Text wrap width |
103
+
104
+ For a complete list of options, see the [Configuration](#configuration) section below.
105
+
106
+ ## CLI Usage
107
+
108
+ Convert HTML files directly from the command line:
109
+
110
+ ```shell
111
+ # Convert a file
112
+ html_to_markdown input.html > output.md
113
+
114
+ # Process stdin
115
+ cat input.html | html_to_markdown > output.md
116
+
117
+ # Use custom options
118
+ html_to_markdown --heading-style atx --wrap --wrap-width 100 input.html > output.md
119
+ ```
120
+
121
+ View all available options:
122
+
123
+ ```shell
124
+ html_to_markdown --help
125
+ ```
126
+
127
+ ## Migration from Markdownify
128
+
129
+ For existing projects using Markdownify, a compatibility layer is provided:
130
+
131
+ ```python
132
+ # Old code
133
+ from markdownify import markdownify as md
134
+
135
+ # New code - works the same way
136
+ from html_to_markdown import markdownify as md
137
+ ```
138
+
139
+ The `markdownify` function is an alias for `convert_to_markdown` and provides identical functionality.
140
+
141
+ ## Configuration
142
+
143
+ Full list of configuration options:
144
+
145
+ - `autolinks`: Convert valid URLs to Markdown links automatically
146
+ - `bullets`: Characters to use for bullet points in lists
147
+ - `code_language`: Default language for fenced code blocks
148
+ - `code_language_callback`: Function to determine code block language
149
+ - `convert`: List of HTML tags to convert (None = all supported tags)
150
+ - `default_title`: Use default titles for elements like links
151
+ - `escape_asterisks`: Escape * characters
152
+ - `escape_misc`: Escape miscellaneous Markdown characters
153
+ - `escape_underscores`: Escape _ characters
154
+ - `heading_style`: Header style (underlined/atx/atx_closed)
155
+ - `keep_inline_images_in`: Tags where inline images should be kept
156
+ - `newline_style`: Style for handling newlines (spaces/backslash)
157
+ - `strip`: Tags to remove from output
158
+ - `strong_em_symbol`: Symbol for strong/emphasized text (\* or \_)
159
+ - `sub_symbol`: Symbol for subscript text
160
+ - `sup_symbol`: Symbol for superscript text
161
+ - `wrap`: Enable text wrapping
162
+ - `wrap_width`: Width for text wrapping
163
+ - `convert_as_inline`: Treat content as inline elements
164
+
165
+ ## Contribution
166
+
167
+ This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before
168
+ submitting PRs to avoid disappointment.
169
+
170
+ ### Local Development
171
+
172
+ 1. Clone the repo
173
+
174
+ 1. Install the system dependencies
175
+
176
+ 1. Install the full dependencies with `uv sync`
177
+
178
+ 1. Install the pre-commit hooks with:
179
+
180
+ ```shell
181
+ pre-commit install && pre-commit install --hook-type commit-msg
182
+ ```
183
+
184
+ 1. Make your changes and submit a PR
185
+
186
+ ## License
187
+
188
+ This library uses the MIT license.
189
+
190
+ ## Acknowledgments
191
+
192
+ Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
@@ -0,0 +1,5 @@
1
+ from html_to_markdown.processing import convert_to_markdown
2
+
3
+ markdownify = convert_to_markdown
4
+
5
+ __all__ = ["convert_to_markdown", "markdownify"]
@@ -0,0 +1,11 @@
1
+ import sys
2
+
3
+ if __name__ == "__main__":
4
+ from html_to_markdown.cli import main
5
+
6
+ try:
7
+ result = main(sys.argv[1:])
8
+ print(result) # noqa: T201
9
+ except ValueError as e:
10
+ print(str(e), file=sys.stderr) # noqa: T201
11
+ sys.exit(1)
@@ -0,0 +1,18 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from re import Pattern
5
+ from typing import Final
6
+
7
+ convert_heading_re: Final[Pattern[str]] = re.compile(r"convert_h(\d+)")
8
+ line_beginning_re: Final[Pattern[str]] = re.compile(r"^", re.MULTILINE)
9
+ whitespace_re: Final[Pattern[str]] = re.compile(r"[\t ]+")
10
+ html_heading_re: Final[Pattern[str]] = re.compile(r"h[1-6]")
11
+
12
+ ASTERISK: Final = "*"
13
+ ATX: Final = "atx"
14
+ ATX_CLOSED: Final = "atx_closed"
15
+ BACKSLASH: Final = "backslash"
16
+ UNDERLINED: Final = "underlined"
17
+ SPACES: Final = "spaces"
18
+ UNDERSCORE: Final = "_"
@@ -55,17 +55,19 @@ SupportedElements = Literal[
55
55
  "kbd",
56
56
  ]
57
57
 
58
- ConverterssMap = Mapping[SupportedElements, Callable[[str, Tag], str]]
58
+ ConvertersMap = Mapping[SupportedElements, Callable[[str, Tag], str]]
59
59
 
60
60
  T = TypeVar("T")
61
61
 
62
62
 
63
63
  def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
64
- """This abstracts all simple inline tags like b, em, del, ...
65
- Returns a function that wraps the chomped text in a pair of the string
66
- that is returned by markup_fn, with '/' inserted in the string used after
67
- the text if it looks like an HTML tag. markup_fn is necessary to allow for
68
- references to self.strong_em_symbol etc.
64
+ """Create an inline converter for a markup pattern or tag.
65
+
66
+ Args:
67
+ markup_prefix: The markup prefix to insert.
68
+
69
+ Returns:
70
+ A function that can be used to convert HTML to Markdown.
69
71
  """
70
72
 
71
73
  def implementation(*, tag: Tag, text: str) -> str:
@@ -83,7 +85,7 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
83
85
 
84
86
  return f"{prefix}{markup_prefix}{text}{markup_suffix}{suffix}"
85
87
 
86
- return cast(Callable[[Tag, str], str], implementation)
88
+ return cast("Callable[[Tag, str], str]", implementation)
87
89
 
88
90
 
89
91
  def _get_colspan(tag: Tag) -> int:
@@ -185,7 +187,7 @@ def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
185
187
  parent = tag.parent
186
188
  if parent is not None and parent.name == "ol":
187
189
  start = (
188
- int(cast(str, parent["start"]))
190
+ int(cast("str", parent["start"]))
189
191
  if isinstance(parent.get("start"), str) and str(parent.get("start")).isnumeric()
190
192
  else 1
191
193
  )
@@ -261,7 +263,6 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
261
263
  overline = ""
262
264
  underline = ""
263
265
  if is_headrow and not tag.previous_sibling:
264
- # first row and is headline: print headline underline
265
266
  full_colspan = 0
266
267
  for cell in cells:
267
268
  if "colspan" in cell.attrs and cell["colspan"].isdigit():
@@ -270,12 +271,8 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
270
271
  full_colspan += 1
271
272
  underline += "| " + " | ".join(["---"] * full_colspan) + " |" + "\n"
272
273
  elif not tag.previous_sibling and (
273
- parent_name == "table" or (parent_name == "tbody" and not cast(Tag, tag.parent).previous_sibling)
274
+ parent_name == "table" or (parent_name == "tbody" and not cast("Tag", tag.parent).previous_sibling)
274
275
  ):
275
- # first row, not headline, and:
276
- # - the parent is table or
277
- # - the parent is tbody at the beginning of a table.
278
- # print empty headline above this row
279
276
  overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n"
280
277
  overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n"
281
278
  return overline + "|" + text + "\n" + underline
@@ -295,7 +292,7 @@ def create_converters_map(
295
292
  sup_symbol: str,
296
293
  wrap: bool,
297
294
  wrap_width: int,
298
- ) -> ConverterssMap:
295
+ ) -> ConvertersMap:
299
296
  """Create a mapping of HTML elements to their corresponding conversion functions.
300
297
 
301
298
  Args:
@@ -332,7 +329,7 @@ def create_converters_map(
332
329
  return func(**kwargs)
333
330
  return func(text)
334
331
 
335
- return cast(Callable[[str, Tag], T], _inner)
332
+ return cast("Callable[[str, Tag], T]", _inner)
336
333
 
337
334
  return {
338
335
  "a": _wrapper(partial(_convert_a, autolinks=autolinks, default_title=default_title)),
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from itertools import chain
3
4
  from typing import TYPE_CHECKING, Any, Callable, Literal, cast
4
5
 
5
6
  from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
@@ -11,7 +12,7 @@ from html_to_markdown.constants import (
11
12
  html_heading_re,
12
13
  whitespace_re,
13
14
  )
14
- from html_to_markdown.converters import ConverterssMap, create_converters_map
15
+ from html_to_markdown.converters import ConvertersMap, create_converters_map
15
16
  from html_to_markdown.utils import escape
16
17
 
17
18
  if TYPE_CHECKING:
@@ -76,18 +77,23 @@ def _is_nested_tag(el: PageElement) -> bool:
76
77
 
77
78
  def _process_tag(
78
79
  tag: Tag,
79
- converters_map: ConverterssMap,
80
+ converters_map: ConvertersMap,
80
81
  *,
81
- convert: Iterable[str] | None,
82
+ convert: set[str] | None,
82
83
  convert_as_inline: bool = False,
83
84
  escape_asterisks: bool,
84
85
  escape_misc: bool,
85
86
  escape_underscores: bool,
86
- strip: Iterable[str] | None,
87
+ strip: set[str] | None,
87
88
  ) -> str:
89
+ should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
90
+ tag_name: SupportedTag | None = (
91
+ cast("SupportedTag", tag.name.lower()) if tag.name.lower() in converters_map else None
92
+ )
88
93
  text = ""
94
+
89
95
  is_heading = html_heading_re.match(tag.name) is not None
90
- is_cell = tag.name in {"td", "th"}
96
+ is_cell = tag_name in {"td", "th"}
91
97
  convert_children_as_inline = convert_as_inline or is_heading or is_cell
92
98
 
93
99
  if _is_nested_tag(tag):
@@ -121,9 +127,7 @@ def _process_tag(
121
127
  strip=strip,
122
128
  )
123
129
 
124
- tag_name: SupportedTag | None = cast(SupportedTag, tag.name.lower()) if tag.name.lower() in converters_map else None
125
-
126
- if tag_name and _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert):
130
+ if tag_name and should_convert_tag:
127
131
  return converters_map[tag_name]( # type: ignore[call-arg]
128
132
  tag=tag, text=text, convert_as_inline=convert_as_inline
129
133
  )
@@ -140,11 +144,9 @@ def _process_text(
140
144
  ) -> str:
141
145
  text = str(el) or ""
142
146
 
143
- # normalize whitespace if we're not inside a preformatted element
144
147
  if not el.find_parent("pre"):
145
148
  text = whitespace_re.sub(" ", text)
146
149
 
147
- # escape special characters if we're not inside a preformatted or code element
148
150
  if not el.find_parent(["pre", "code", "kbd", "samp"]):
149
151
  text = escape(
150
152
  text=text,
@@ -153,9 +155,6 @@ def _process_text(
153
155
  escape_underscores=escape_underscores,
154
156
  )
155
157
 
156
- # remove trailing whitespaces if any of the following condition is true:
157
- # - current text node is the last node in li
158
- # - current text node is followed by an embedded list
159
158
  if (
160
159
  el.parent
161
160
  and el.parent.name == "li"
@@ -166,7 +165,7 @@ def _process_text(
166
165
  return text
167
166
 
168
167
 
169
- def _should_convert_tag(*, tag_name: str, strip: Iterable[str] | None, convert: Iterable[str] | None) -> bool:
168
+ def _should_convert_tag(*, tag_name: str, strip: set[str] | None, convert: set[str] | None) -> bool:
170
169
  if strip is not None:
171
170
  return tag_name not in strip
172
171
  if convert is not None:
@@ -174,6 +173,14 @@ def _should_convert_tag(*, tag_name: str, strip: Iterable[str] | None, convert:
174
173
  return True
175
174
 
176
175
 
176
+ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
177
+ if value is None:
178
+ return None
179
+ if isinstance(value, str):
180
+ return set(",".split(value))
181
+ return {*chain(*[v.split(",") for v in value])}
182
+
183
+
177
184
  def convert_to_markdown(
178
185
  source: str | BeautifulSoup,
179
186
  *,
@@ -181,7 +188,7 @@ def convert_to_markdown(
181
188
  bullets: str = "*+-",
182
189
  code_language: str = "",
183
190
  code_language_callback: Callable[[Any], str] | None = None,
184
- convert: Iterable[str] | None = None,
191
+ convert: str | Iterable[str] | None = None,
185
192
  default_title: bool = False,
186
193
  escape_asterisks: bool = True,
187
194
  escape_misc: bool = True,
@@ -189,7 +196,7 @@ def convert_to_markdown(
189
196
  heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
190
197
  keep_inline_images_in: Iterable[str] | None = None,
191
198
  newline_style: Literal["spaces", "backslash"] = SPACES,
192
- strip: Iterable[str] | None = None,
199
+ strip: str | Iterable[str] | None = None,
193
200
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
194
201
  sub_symbol: str = "",
195
202
  sup_symbol: str = "",
@@ -221,13 +228,22 @@ def convert_to_markdown(
221
228
  wrap_width: The number of characters at which to wrap text. Defaults to 80.
222
229
  convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
223
230
 
231
+ Raises:
232
+ ValueError: If both 'strip' and 'convert' are specified, or when the input HTML is empty.
233
+
224
234
  Returns:
225
235
  str: A string of Markdown-formatted text converted from the given HTML.
226
236
  """
227
237
  if isinstance(source, str):
228
238
  from bs4 import BeautifulSoup
229
239
 
230
- source = BeautifulSoup(source, "html.parser")
240
+ if "".join(source.split("\n")):
241
+ source = BeautifulSoup(source, "html.parser")
242
+ else:
243
+ raise ValueError("The input HTML is empty.")
244
+
245
+ if strip is not None and convert is not None:
246
+ raise ValueError("Only one of 'strip' and 'convert' can be specified.")
231
247
 
232
248
  converters_map = create_converters_map(
233
249
  autolinks=autolinks,
@@ -248,10 +264,10 @@ def convert_to_markdown(
248
264
  return _process_tag(
249
265
  source,
250
266
  converters_map,
251
- convert=convert,
267
+ convert=_as_optional_set(convert),
252
268
  convert_as_inline=convert_as_inline,
253
269
  escape_asterisks=escape_asterisks,
254
270
  escape_misc=escape_misc,
255
271
  escape_underscores=escape_underscores,
256
- strip=strip,
272
+ strip=_as_optional_set(strip),
257
273
  )
@@ -0,0 +1,115 @@
1
+ [build-system]
2
+ build-backend = "hatchling.build"
3
+
4
+ requires = [ "hatchling" ]
5
+
6
+ [project]
7
+ name = "html-to-markdown"
8
+ version = "1.2.1"
9
+ description = "Convert HTML to markdown"
10
+ readme = "README.md"
11
+ keywords = [ "converter", "html", "markdown", "text-extraction", "text-processing" ]
12
+ license = { text = "MIT" }
13
+ authors = [ { name = "Na'aman Hirschfeld", email = "nhirschfeld@gmail.com" } ]
14
+ requires-python = ">=3.9"
15
+ classifiers = [
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Operating System :: OS Independent",
19
+ "Programming Language :: Python :: 3 :: Only",
20
+ "Programming Language :: Python :: 3.9",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
25
+ "Topic :: Text Processing",
26
+ "Topic :: Text Processing :: Markup",
27
+ "Topic :: Text Processing :: Markup :: HTML",
28
+ "Topic :: Text Processing :: Markup :: Markdown",
29
+ "Topic :: Utilities",
30
+ "Typing :: Typed",
31
+ ]
32
+
33
+ dependencies = [
34
+ "beautifulsoup4>=4.12.3",
35
+ ]
36
+
37
+ urls.homepage = "https://github.com/Goldziher/html-to-markdown"
38
+
39
+ [dependency-groups]
40
+ dev = [
41
+ "covdefaults>=2.3",
42
+ "mypy>=1.14.1",
43
+ "pre-commit>=4.1",
44
+ "pytest>=8.3.4",
45
+ "pytest-cov>=6",
46
+ "pytest-mock>=3.14",
47
+ "ruff>=0.9.3",
48
+ "types-beautifulsoup4>=4.12.0.20241020",
49
+ ]
50
+
51
+ [tool.hatch.build]
52
+ skip-excluded-dirs = true
53
+
54
+ [tool.hatch.build.targets.sdist]
55
+ only-include = [ "html_to_markdown" ]
56
+
57
+ [tool.hatch.build.targets.wheel]
58
+ only-include = [ "html_to_markdown" ]
59
+
60
+ [tool.ruff]
61
+ target-version = "py39"
62
+ line-length = 120
63
+ src = [ "html_to_markdown", "tests" ]
64
+
65
+ format.docstring-code-line-length = 120
66
+ format.docstring-code-format = true
67
+ lint.select = [ "ALL" ]
68
+ lint.ignore = [
69
+ "ANN401", # dynamically typed ANY
70
+ "COM812", # Conflicts with formatter
71
+ "D100", # pydocstyle - missing docstring in public module
72
+ "D104", # pydocstyle - missing docstring in public package
73
+ "D107", # pydocstyle - missing docstring in __init__
74
+ "D205", # pydocstyle - 1 blank line required between summary line and description
75
+ "E501", # pycodestyle line too long, handled by ruff format
76
+ "EM", # Exception messages,
77
+ "FBT", # Boolean Args
78
+ "FIX", # we allow todo and fixme comments
79
+ "ISC001", # Conflicts with formatter
80
+ "PLR0913", # Pylint - too many arguments.
81
+ "PLR2004", # Magic variables, we allow them
82
+ "TD", # we allow todo and fixme comments
83
+ "TRY", # Try except block, rules are too strict
84
+ ]
85
+ lint.per-file-ignores."tests/**/*.*" = [ "ARG", "D", "PD", "PT006", "PT013", "S" ]
86
+ lint.isort.known-first-party = [ "html_to_markdown", "tests" ]
87
+ lint.pydocstyle.convention = "google"
88
+
89
+ [tool.pytest.ini_options]
90
+ asyncio_mode = "auto"
91
+ asyncio_default_fixture_loop_scope = "function"
92
+
93
+ [tool.coverage.run]
94
+ omit = [ "tests/*" ]
95
+ plugins = [ "covdefaults" ]
96
+ source = [ "html_to_markdown" ]
97
+
98
+ [tool.coverage.report]
99
+ exclude_lines = [ 'if TYPE_CHECKING:' ]
100
+ fail_under = 100
101
+
102
+ [tool.mypy]
103
+ packages = [ "html_to_markdown", "tests" ]
104
+ python_version = "3.9"
105
+ implicit_reexport = false
106
+ show_error_codes = true
107
+ strict = true
108
+
109
+ [[tool.mypy.overrides]]
110
+ module = "tests.*"
111
+ disallow_any_generics = false
112
+ disallow_untyped_decorators = false
113
+
114
+ [tool.uv]
115
+ default-groups = [ "dev" ]
@@ -1,101 +0,0 @@
1
- Metadata-Version: 2.3
2
- Name: html-to-markdown
3
- Version: 1.1.0
4
- Summary: Convert HTML to markdown
5
- Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
- License: MIT
7
- License-File: LICENSE
8
- Keywords: beautifulsoup,converter,html,markdown,text-processing
9
- Classifier: Intended Audience :: Developers
10
- Classifier: License :: OSI Approved :: MIT License
11
- Classifier: Operating System :: OS Independent
12
- Classifier: Programming Language :: Python :: 3.9
13
- Classifier: Programming Language :: Python :: 3.10
14
- Classifier: Programming Language :: Python :: 3.11
15
- Classifier: Programming Language :: Python :: 3.12
16
- Classifier: Programming Language :: Python :: 3.13
17
- Classifier: Topic :: Text Processing
18
- Classifier: Topic :: Text Processing :: Markup
19
- Classifier: Topic :: Text Processing :: Markup :: HTML
20
- Classifier: Topic :: Text Processing :: Markup :: Markdown
21
- Classifier: Topic :: Utilities
22
- Classifier: Typing :: Typed
23
- Requires-Python: >=3.9
24
- Requires-Dist: beautifulsoup4>=4.12.3
25
- Description-Content-Type: text/markdown
26
-
27
- # html_to_markdown
28
-
29
- This library is a refactored and modernized fork of [markdownify](https://pypi.org/project/markdownify/), supporting
30
- Python 3.9 and above.
31
-
32
- ### Differences with the Markdownify
33
-
34
- - The refactored codebase uses a strict functional approach - no classes are involved.
35
- - There is full typing with strict MyPy strict adherence and a py.typed file included.
36
- - The `convert_to_markdown` function allows passing a pre-configured instance of `BeautifulSoup` instead of html.
37
- - This library releases follows standard semver. Its version v1.0.0 was branched from markdownify's v0.13.1, at which
38
- point versioning is no longer aligned.
39
-
40
- ## Installation
41
-
42
- ```shell
43
- pip install html_to_markdown
44
- ```
45
-
46
- ## Usage
47
-
48
- Convert an string HTML to Markdown:
49
-
50
- ```python
51
- from html_to_markdown import convert_to_markdown
52
-
53
- convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>') # > '**Yay** [GitHub](http://github.com)'
54
- ```
55
-
56
- Or pass a pre-configured instance of `BeautifulSoup`:
57
-
58
- ```python
59
- from bs4 import BeautifulSoup
60
- from html_to_markdown import convert_to_markdown
61
-
62
- soup = BeautifulSoup('<b>Yay</b> <a href="http://github.com">GitHub</a>', 'lxml') # lxml requires an extra dependency.
63
-
64
- convert_to_markdown(soup) # > '**Yay** [GitHub](http://github.com)'
65
- ```
66
-
67
- ### Options
68
-
69
- The `convert_to_markdown` function accepts the following kwargs:
70
-
71
- - autolinks (bool): Automatically convert valid URLs into Markdown links. Defaults to True.
72
- - bullets (str): A string of characters to use for bullet points in lists. Defaults to '*+-'.
73
- - code_language (str): Default language identifier for fenced code blocks. Defaults to an empty string.
74
- - code_language_callback (Callable[[Any], str] | None): Function to dynamically determine the language for code blocks.
75
- - convert (Iterable[str] | None): A list of tag names to convert to Markdown. If None, all supported tags are converted.
76
- - default_title (bool): Use the default title when converting certain elements (e.g., links). Defaults to False.
77
- - escape_asterisks (bool): Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
78
- - escape_misc (bool): Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
79
- - escape_underscores (bool): Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
80
- - heading_style (Literal["underlined", "atx", "atx_closed"]): The style to use for Markdown headings. Defaults to "
81
- underlined".
82
- - keep_inline_images_in (Iterable[str] | None): Tags in which inline images should be preserved. Defaults to None.
83
- - newline_style (Literal["spaces", "backslash"]): Style for handling newlines in text content. Defaults to "spaces".
84
- - strip (Iterable[str] | None): Tags to strip from the output. Defaults to None.
85
- - strong_em_symbol (Literal["*", "_"]): Symbol to use for strong/emphasized text. Defaults to "*".
86
- - sub_symbol (str): Custom symbol for subscript text. Defaults to an empty string.
87
- - sup_symbol (str): Custom symbol for superscript text. Defaults to an empty string.
88
- - wrap (bool): Wrap text to the specified width. Defaults to False.
89
- - wrap_width (int): The number of characters at which to wrap text. Defaults to 80.
90
- - convert_as_inline (bool): Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
91
-
92
- ## CLI
93
-
94
- For compatibility with the original markdownify, a CLI is provided. Use `html_to_markdown example.html > example.md` or
95
- pipe input from stdin:
96
-
97
- ```shell
98
- cat example.html | html_to_markdown > example.md
99
- ```
100
-
101
- Use `html_to_markdown -h` to see all available options. They are the same as listed above and take the same arguments.
@@ -1,75 +0,0 @@
1
- # html_to_markdown
2
-
3
- This library is a refactored and modernized fork of [markdownify](https://pypi.org/project/markdownify/), supporting
4
- Python 3.9 and above.
5
-
6
- ### Differences from Markdownify
7
-
8
- - The refactored codebase uses a strict functional approach - no classes are involved.
9
- - The codebase is fully typed, adheres to strict MyPy checking, and includes a py.typed file.
10
- - The `convert_to_markdown` function allows passing a pre-configured instance of `BeautifulSoup` instead of html.
11
- - This library's releases follow standard semver. Its version v1.0.0 was branched from markdownify's v0.13.1, at which
12
- point versioning is no longer aligned.
13
-
14
- ## Installation
15
-
16
- ```shell
17
- pip install html_to_markdown
18
- ```
19
-
20
- ## Usage
21
-
22
- Convert an HTML string to Markdown:
23
-
24
- ```python
25
- from html_to_markdown import convert_to_markdown
26
-
27
- convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>') # > '**Yay** [GitHub](http://github.com)'
28
- ```
29
-
30
- Or pass a pre-configured instance of `BeautifulSoup`:
31
-
32
- ```python
33
- from bs4 import BeautifulSoup
34
- from html_to_markdown import convert_to_markdown
35
-
36
- soup = BeautifulSoup('<b>Yay</b> <a href="http://github.com">GitHub</a>', 'lxml') # lxml requires an extra dependency.
37
-
38
- convert_to_markdown(soup) # > '**Yay** [GitHub](http://github.com)'
39
- ```
40
-
41
- ### Options
42
-
43
- The `convert_to_markdown` function accepts the following kwargs:
44
-
45
- - autolinks (bool): Automatically convert valid URLs into Markdown links. Defaults to True.
46
- - bullets (str): A string of characters to use for bullet points in lists. Defaults to '*+-'.
47
- - code_language (str): Default language identifier for fenced code blocks. Defaults to an empty string.
48
- - code_language_callback (Callable[[Any], str] | None): Function to dynamically determine the language for code blocks.
49
- - convert (Iterable[str] | None): A list of tag names to convert to Markdown. If None, all supported tags are converted.
50
- - default_title (bool): Use the default title when converting certain elements (e.g., links). Defaults to False.
51
- - escape_asterisks (bool): Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
52
- - escape_misc (bool): Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
53
- - escape_underscores (bool): Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
54
- - heading_style (Literal["underlined", "atx", "atx_closed"]): The style to use for Markdown headings. Defaults to "
55
- underlined".
56
- - keep_inline_images_in (Iterable[str] | None): Tags in which inline images should be preserved. Defaults to None.
57
- - newline_style (Literal["spaces", "backslash"]): Style for handling newlines in text content. Defaults to "spaces".
58
- - strip (Iterable[str] | None): Tags to strip from the output. Defaults to None.
59
- - strong_em_symbol (Literal["*", "_"]): Symbol to use for strong/emphasized text. Defaults to "*".
60
- - sub_symbol (str): Custom symbol for subscript text. Defaults to an empty string.
61
- - sup_symbol (str): Custom symbol for superscript text. Defaults to an empty string.
62
- - wrap (bool): Wrap text to the specified width. Defaults to False.
63
- - wrap_width (int): The number of characters at which to wrap text. Defaults to 80.
64
- - convert_as_inline (bool): Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
65
-
66
- ## CLI
67
-
68
- For compatibility with the original markdownify, a CLI is provided. Use `html_to_markdown example.html > example.md` or
69
- pipe input from stdin:
70
-
71
- ```shell
72
- cat example.html | html_to_markdown > example.md
73
- ```
74
-
75
- Use `html_to_markdown -h` to see all available options. They are the same as listed above and take the same arguments.
@@ -1,3 +0,0 @@
1
- from html_to_markdown.processing import convert_to_markdown
2
-
3
- __all__ = ["convert_to_markdown"]
@@ -1,7 +0,0 @@
1
- import sys
2
-
3
- from html_to_markdown.dli import cli
4
-
5
- if __name__ == "__main__":
6
- result = cli(sys.argv[1:])
7
- print(result) # noqa: T201
@@ -1,18 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import re
4
- from re import Pattern
5
- from typing import Final, Literal
6
-
7
- convert_heading_re: Final[Pattern[str]] = re.compile(r"convert_h(\d+)")
8
- line_beginning_re: Final[Pattern[str]] = re.compile(r"^", re.MULTILINE)
9
- whitespace_re: Final[Pattern[str]] = re.compile(r"[\t ]+")
10
- html_heading_re: Final[Pattern[str]] = re.compile(r"h[1-6]")
11
-
12
- ASTERISK: Final[Literal["*"]] = "*"
13
- ATX: Final[Literal["atx"]] = "atx"
14
- ATX_CLOSED: Final[Literal["atx_closed"]] = "atx_closed"
15
- BACKSLASH: Final[Literal["backslash"]] = "backslash"
16
- UNDERLINED: Final[Literal["underlined"]] = "underlined"
17
- SPACES: Final[Literal["spaces"]] = "spaces"
18
- UNDERSCORE: Final[Literal["_"]] = "_"
@@ -1,137 +0,0 @@
1
- [project]
2
- name = "html-to-markdown"
3
- version = "1.1.0"
4
- description = "Convert HTML to markdown"
5
- authors = [{ name = "Na'aman Hirschfeld", email = "nhirschfeld@gmail.com" }]
6
- requires-python = ">=3.9"
7
- readme = "README.md"
8
- license = { text = "MIT" }
9
- classifiers = [
10
- "Intended Audience :: Developers",
11
- "License :: OSI Approved :: MIT License",
12
- "Operating System :: OS Independent",
13
- "Programming Language :: Python :: 3.9",
14
- "Programming Language :: Python :: 3.10",
15
- "Programming Language :: Python :: 3.11",
16
- "Programming Language :: Python :: 3.12",
17
- "Programming Language :: Python :: 3.13",
18
- "Topic :: Text Processing :: Markup :: HTML",
19
- "Topic :: Text Processing :: Markup :: Markdown",
20
- "Topic :: Text Processing :: Markup",
21
- "Topic :: Text Processing",
22
- "Topic :: Utilities",
23
- "Typing :: Typed",
24
- ]
25
- keywords = ["markdown", "html", "beautifulsoup", "converter", "text-processing"]
26
-
27
- dependencies = [
28
- "beautifulsoup4>=4.12.3",
29
- ]
30
-
31
- [project.scripts]
32
- html_to_markdown = "html_to_markdown.__main__:cli"
33
-
34
- [build-system]
35
- requires = ["hatchling"]
36
- build-backend = "hatchling.build"
37
-
38
- [tool.hatch.build]
39
- skip-excluded-dirs = true
40
-
41
- [tool.hatch.build.targets.sdist]
42
- only-include = ["html_to_markdown"]
43
-
44
- [tool.hatch.build.targets.wheel]
45
- only-include = ["html_to_markdown"]
46
-
47
- [tool.pdm]
48
- distribution = true
49
-
50
- [tool.pdm.dev-dependencies]
51
- dev = [
52
- "covdefaults>=2.3.0",
53
- "pre-commit>=3.7.0",
54
- "pytest>=8.1.1",
55
- "ruff>=0.4.0",
56
- "pytest-mock>=3.14.0",
57
- "pytest-cov>=5.0.0",
58
- "types-beautifulsoup4>=4.12.0.20240907",
59
- "mypy>=1.11.2",
60
- ]
61
-
62
- [tool.pdm.scripts]
63
- setup = { composite = [
64
- "pre-commit install --install-hooks",
65
- "pre-commit install --hook-type commit-msg",
66
- "pdm install"
67
- ] }
68
- lint = "pre-commit run --all-files"
69
- test.cmd = "pytest"
70
- test.env = { "PYTHONPATH" = "." }
71
- coverage.cmd = "pytest --disable-warnings --cov --cov-report xml"
72
- coverage.env = { "PYTHONPATH" = "." }
73
-
74
- # linters configuration below
75
- [tool.ruff]
76
- line-length = 120
77
- target-version = "py39"
78
- lint.select = ["ALL"]
79
- lint.ignore = [
80
- "ANN401", # dynamically typed ANY
81
- "D100", # pydocstyle - missing docstring in public module
82
- "D104", # pydocstyle - missing docstring in public package
83
- "D107", # pydocstyle - missing docstring in __init__
84
- "D205", # pydocstyle - 1 blank line required between summary line and description
85
- "E501", # pycodestyle line too long, handled by ruff format
86
- "EM", # Exception messages,
87
- "FIX", # we allow todo and fixme comments
88
- "PLR2004", # Magic variables, we allow them
89
- "TD", # we allow todo and fixme comments
90
- "TRY", # Try except block, rules are too strict
91
- "COM812", # Conflicts with formatter
92
- "ISC001", # Conflicts with formatter
93
- "FBT", # Boolean Args
94
- ]
95
- src = ["html_to_markdown", "tests"]
96
-
97
- [tool.ruff.lint.per-file-ignores]
98
- "tests/**/*.*" = ["S", "D", "PT006", "PT013", "PD", "ARG"]
99
-
100
- [tool.ruff.format]
101
- docstring-code-format = true
102
- docstring-code-line-length = 120
103
-
104
- [tool.ruff.lint.pydocstyle]
105
- convention = "google"
106
-
107
- [tool.ruff.lint.isort]
108
- known-first-party = ["html_to_markdown", "tests"]
109
-
110
- [tool.ruff.lint.pylint]
111
- max-args = 25
112
- max-returns = 10
113
-
114
- [tool.mypy]
115
- packages = ["html_to_markdown", "tests"]
116
- python_version = "3.9"
117
- implicit_reexport = false
118
- show_error_codes = true
119
- strict = true
120
-
121
- [[tool.mypy.overrides]]
122
- module = "tests.*"
123
- disallow_any_generics = false
124
- disallow_untyped_decorators = false
125
-
126
- [tool.coverage.run]
127
- omit = ["tests/*"]
128
- plugins = ["covdefaults"]
129
- source = ["html_to_markdown"]
130
-
131
- [tool.coverage.report]
132
- exclude_lines = ['if TYPE_CHECKING:']
133
- fail_under = 100
134
-
135
- [tool.pytest.ini_options]
136
- asyncio_mode = "auto"
137
- asyncio_default_fixture_loop_scope = "function"