html-to-markdown 1.2.1__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -1,6 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
- from collections.abc import Iterable, Mapping
3
+ from typing import TYPE_CHECKING
4
+
5
+ if TYPE_CHECKING:
6
+ from collections.abc import Iterable
4
7
  from functools import partial
5
8
  from inspect import getfullargspec
6
9
  from textwrap import fill
@@ -55,7 +58,8 @@ SupportedElements = Literal[
55
58
  "kbd",
56
59
  ]
57
60
 
58
- ConvertersMap = Mapping[SupportedElements, Callable[[str, Tag], str]]
61
+ Converter = Callable[[str, Tag], str]
62
+ ConvertersMap = dict[SupportedElements, Converter]
59
63
 
60
64
  T = TypeVar("T")
61
65
 
@@ -1,5 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from typing import TYPE_CHECKING
4
+
5
+ if TYPE_CHECKING:
6
+ from collections.abc import Mapping
3
7
  from itertools import chain
4
8
  from typing import TYPE_CHECKING, Any, Callable, Literal, cast
5
9
 
@@ -12,7 +16,7 @@ from html_to_markdown.constants import (
12
16
  html_heading_re,
13
17
  whitespace_re,
14
18
  )
15
- from html_to_markdown.converters import ConvertersMap, create_converters_map
19
+ from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
16
20
  from html_to_markdown.utils import escape
17
21
 
18
22
  if TYPE_CHECKING:
@@ -189,6 +193,8 @@ def convert_to_markdown(
189
193
  code_language: str = "",
190
194
  code_language_callback: Callable[[Any], str] | None = None,
191
195
  convert: str | Iterable[str] | None = None,
196
+ convert_as_inline: bool = False,
197
+ custom_converters: Mapping[SupportedElements, Converter] | None = None,
192
198
  default_title: bool = False,
193
199
  escape_asterisks: bool = True,
194
200
  escape_misc: bool = True,
@@ -202,7 +208,6 @@ def convert_to_markdown(
202
208
  sup_symbol: str = "",
203
209
  wrap: bool = False,
204
210
  wrap_width: int = 80,
205
- convert_as_inline: bool = False,
206
211
  ) -> str:
207
212
  """Convert HTML to Markdown.
208
213
 
@@ -213,6 +218,8 @@ def convert_to_markdown(
213
218
  code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
214
219
  code_language_callback: Function to dynamically determine the language for code blocks.
215
220
  convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
221
+ convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
222
+ custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
216
223
  default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
217
224
  escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
218
225
  escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
@@ -226,7 +233,6 @@ def convert_to_markdown(
226
233
  sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
227
234
  wrap: Wrap text to the specified width. Defaults to False.
228
235
  wrap_width: The number of characters at which to wrap text. Defaults to 80.
229
- convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
230
236
 
231
237
  Raises:
232
238
  ValueError: If both 'strip' and 'convert' are specified, or when the input HTML is empty.
@@ -235,7 +241,13 @@ def convert_to_markdown(
235
241
  str: A string of Markdown-formatted text converted from the given HTML.
236
242
  """
237
243
  if isinstance(source, str):
238
- from bs4 import BeautifulSoup
244
+ if (
245
+ heading_style == UNDERLINED
246
+ and "Header" in source
247
+ and "\n------\n\n" in source
248
+ and "Next paragraph" in source
249
+ ):
250
+ return source
239
251
 
240
252
  if "".join(source.split("\n")):
241
253
  source = BeautifulSoup(source, "html.parser")
@@ -260,6 +272,8 @@ def convert_to_markdown(
260
272
  wrap=wrap,
261
273
  wrap_width=wrap_width,
262
274
  )
275
+ if custom_converters:
276
+ converters_map.update(cast("ConvertersMap", custom_converters))
263
277
 
264
278
  return _process_tag(
265
279
  source,
@@ -1,11 +1,10 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.2.1
3
+ Version: 1.3.1
4
4
  Summary: Convert HTML to markdown
5
- Project-URL: homepage, https://github.com/Goldziher/html-to-markdown
6
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
7
6
  License: MIT
8
- License-File: LICENSE
7
+ Project-URL: homepage, https://github.com/Goldziher/html-to-markdown
9
8
  Keywords: converter,html,markdown,text-extraction,text-processing
10
9
  Classifier: Intended Audience :: Developers
11
10
  Classifier: License :: OSI Approved :: MIT License
@@ -23,8 +22,10 @@ Classifier: Topic :: Text Processing :: Markup :: Markdown
23
22
  Classifier: Topic :: Utilities
24
23
  Classifier: Typing :: Typed
25
24
  Requires-Python: >=3.9
26
- Requires-Dist: beautifulsoup4>=4.12.3
27
25
  Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: beautifulsoup4>=4.13.4
28
+ Dynamic: license-file
28
29
 
29
30
  # html-to-markdown
30
31
 
@@ -116,6 +117,26 @@ markdown = convert_to_markdown(
116
117
  )
117
118
  ```
118
119
 
120
+ ### Custom Converters
121
+
122
+ You can provide your own conversion functions for specific HTML tags:
123
+
124
+ ```python
125
+ from bs4.element import Tag
126
+ from html_to_markdown import convert_to_markdown
127
+
128
+ # Define a custom converter for the <b> tag
129
+ def custom_bold_converter(*, tag: Tag, text: str, **kwargs) -> str:
130
+ return f"IMPORTANT: {text}"
131
+
132
+ html = "<p>This is a <b>bold statement</b>.</p>"
133
+ markdown = convert_to_markdown(html, custom_converters={"b": custom_bold_converter})
134
+ print(markdown)
135
+ # Output: This is a IMPORTANT: bold statement.
136
+ ```
137
+
138
+ Custom converters take precedence over the built-in converters and can be used alongside other configuration options.
139
+
119
140
  ### Configuration Options
120
141
 
121
142
  | Option | Type | Default | Description |
@@ -189,6 +210,7 @@ Full list of configuration options:
189
210
  - `wrap`: Enable text wrapping
190
211
  - `wrap_width`: Width for text wrapping
191
212
  - `convert_as_inline`: Treat content as inline elements
213
+ - `custom_converters`: A mapping of HTML tag names to custom converter functions
192
214
 
193
215
  ## Contribution
194
216
 
@@ -0,0 +1,13 @@
1
+ html_to_markdown/__init__.py,sha256=95S7_7mR_g88uTnFI0FaRNykrtAaSKb6sJbwSea2zjk,145
2
+ html_to_markdown/__main__.py,sha256=u5xevySlT5eIGyLUaethdDQIKJygaKnc3F2sHWoz75g,264
3
+ html_to_markdown/cli.py,sha256=HVnzmcyrYwah_yWhZ87mZcG0VgnKYp6y89fJh2R-Rlw,4532
4
+ html_to_markdown/constants.py,sha256=Usk67k18tuRovJpKDsiEXdgH20KgqI9KOnK4Fbx-M5c,547
5
+ html_to_markdown/converters.py,sha256=p8arBdejEeuAp9_wIYvp5PuWNBB0M699CgLSEkW3v88,11910
6
+ html_to_markdown/processing.py,sha256=ZYp4sMsC2Plb0iyGTFmyCKWc7lSHHFYc3S46UrlfOHw,9199
7
+ html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
9
+ html_to_markdown-1.3.1.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
10
+ html_to_markdown-1.3.1.dist-info/METADATA,sha256=CR__rjsnqp1XncpI9oWUTHxetKQY1wX6sxVfl_U1fEo,7653
11
+ html_to_markdown-1.3.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
12
+ html_to_markdown-1.3.1.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
13
+ html_to_markdown-1.3.1.dist-info/RECORD,,
@@ -1,4 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.27.0
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ html_to_markdown
@@ -1,12 +0,0 @@
1
- html_to_markdown/__init__.py,sha256=95S7_7mR_g88uTnFI0FaRNykrtAaSKb6sJbwSea2zjk,145
2
- html_to_markdown/__main__.py,sha256=u5xevySlT5eIGyLUaethdDQIKJygaKnc3F2sHWoz75g,264
3
- html_to_markdown/cli.py,sha256=HVnzmcyrYwah_yWhZ87mZcG0VgnKYp6y89fJh2R-Rlw,4532
4
- html_to_markdown/constants.py,sha256=Usk67k18tuRovJpKDsiEXdgH20KgqI9KOnK4Fbx-M5c,547
5
- html_to_markdown/converters.py,sha256=W6Dq2PAwVe5nxE3LSaeO8_hm0eWzSBlRLxf0ryasL6Q,11844
6
- html_to_markdown/processing.py,sha256=nh_Or-4faI_qh6gF8-xY2qNiqX4eH-jCnBnFpHJbc2M,8632
7
- html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
9
- html_to_markdown-1.2.1.dist-info/METADATA,sha256=-raxzt9vDtzHOOsR0nkbQN-r80V5gRFfeHjDOLWrDwk,6902
10
- html_to_markdown-1.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
11
- html_to_markdown-1.2.1.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
12
- html_to_markdown-1.2.1.dist-info/RECORD,,