html-to-markdown 1.2.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -1,5 +1,5 @@
1
1
  from html_to_markdown.processing import convert_to_markdown
2
2
 
3
- from .legacy import Markdownify
3
+ markdownify = convert_to_markdown
4
4
 
5
- __all__ = ["Markdownify", "convert_to_markdown"]
5
+ __all__ = ["convert_to_markdown", "markdownify"]
@@ -1,6 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
- from collections.abc import Iterable, Mapping
3
+ from typing import TYPE_CHECKING
4
+
5
+ if TYPE_CHECKING:
6
+ from collections.abc import Iterable
4
7
  from functools import partial
5
8
  from inspect import getfullargspec
6
9
  from textwrap import fill
@@ -55,7 +58,8 @@ SupportedElements = Literal[
55
58
  "kbd",
56
59
  ]
57
60
 
58
- ConvertersMap = Mapping[SupportedElements, Callable[[str, Tag], str]]
61
+ Converter = Callable[[str, Tag], str]
62
+ ConvertersMap = dict[SupportedElements, Converter]
59
63
 
60
64
  T = TypeVar("T")
61
65
 
@@ -85,7 +89,7 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
85
89
 
86
90
  return f"{prefix}{markup_prefix}{text}{markup_suffix}{suffix}"
87
91
 
88
- return cast(Callable[[Tag, str], str], implementation)
92
+ return cast("Callable[[Tag, str], str]", implementation)
89
93
 
90
94
 
91
95
  def _get_colspan(tag: Tag) -> int:
@@ -187,7 +191,7 @@ def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
187
191
  parent = tag.parent
188
192
  if parent is not None and parent.name == "ol":
189
193
  start = (
190
- int(cast(str, parent["start"]))
194
+ int(cast("str", parent["start"]))
191
195
  if isinstance(parent.get("start"), str) and str(parent.get("start")).isnumeric()
192
196
  else 1
193
197
  )
@@ -263,7 +267,6 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
263
267
  overline = ""
264
268
  underline = ""
265
269
  if is_headrow and not tag.previous_sibling:
266
- # first row and is headline: print headline underline
267
270
  full_colspan = 0
268
271
  for cell in cells:
269
272
  if "colspan" in cell.attrs and cell["colspan"].isdigit():
@@ -272,12 +275,8 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
272
275
  full_colspan += 1
273
276
  underline += "| " + " | ".join(["---"] * full_colspan) + " |" + "\n"
274
277
  elif not tag.previous_sibling and (
275
- parent_name == "table" or (parent_name == "tbody" and not cast(Tag, tag.parent).previous_sibling)
278
+ parent_name == "table" or (parent_name == "tbody" and not cast("Tag", tag.parent).previous_sibling)
276
279
  ):
277
- # first row, not headline, and:
278
- # - the parent is table or
279
- # - the parent is tbody at the beginning of a table.
280
- # print empty headline above this row
281
280
  overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n"
282
281
  overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n"
283
282
  return overline + "|" + text + "\n" + underline
@@ -334,7 +333,7 @@ def create_converters_map(
334
333
  return func(**kwargs)
335
334
  return func(text)
336
335
 
337
- return cast(Callable[[str, Tag], T], _inner)
336
+ return cast("Callable[[str, Tag], T]", _inner)
338
337
 
339
338
  return {
340
339
  "a": _wrapper(partial(_convert_a, autolinks=autolinks, default_title=default_title)),
@@ -1,5 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from typing import TYPE_CHECKING
4
+
5
+ if TYPE_CHECKING:
6
+ from collections.abc import Mapping
3
7
  from itertools import chain
4
8
  from typing import TYPE_CHECKING, Any, Callable, Literal, cast
5
9
 
@@ -12,7 +16,7 @@ from html_to_markdown.constants import (
12
16
  html_heading_re,
13
17
  whitespace_re,
14
18
  )
15
- from html_to_markdown.converters import ConvertersMap, create_converters_map
19
+ from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
16
20
  from html_to_markdown.utils import escape
17
21
 
18
22
  if TYPE_CHECKING:
@@ -87,7 +91,9 @@ def _process_tag(
87
91
  strip: set[str] | None,
88
92
  ) -> str:
89
93
  should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
90
- tag_name: SupportedTag | None = cast(SupportedTag, tag.name.lower()) if tag.name.lower() in converters_map else None
94
+ tag_name: SupportedTag | None = (
95
+ cast("SupportedTag", tag.name.lower()) if tag.name.lower() in converters_map else None
96
+ )
91
97
  text = ""
92
98
 
93
99
  is_heading = html_heading_re.match(tag.name) is not None
@@ -142,11 +148,9 @@ def _process_text(
142
148
  ) -> str:
143
149
  text = str(el) or ""
144
150
 
145
- # normalize whitespace if we're not inside a preformatted element
146
151
  if not el.find_parent("pre"):
147
152
  text = whitespace_re.sub(" ", text)
148
153
 
149
- # escape special characters if we're not inside a preformatted or code element
150
154
  if not el.find_parent(["pre", "code", "kbd", "samp"]):
151
155
  text = escape(
152
156
  text=text,
@@ -155,9 +159,6 @@ def _process_text(
155
159
  escape_underscores=escape_underscores,
156
160
  )
157
161
 
158
- # remove trailing whitespaces if any of the following condition is true:
159
- # - current text node is the last node in li
160
- # - current text node is followed by an embedded list
161
162
  if (
162
163
  el.parent
163
164
  and el.parent.name == "li"
@@ -192,6 +193,8 @@ def convert_to_markdown(
192
193
  code_language: str = "",
193
194
  code_language_callback: Callable[[Any], str] | None = None,
194
195
  convert: str | Iterable[str] | None = None,
196
+ convert_as_inline: bool = False,
197
+ custom_converters: Mapping[SupportedElements, Converter] | None = None,
195
198
  default_title: bool = False,
196
199
  escape_asterisks: bool = True,
197
200
  escape_misc: bool = True,
@@ -205,7 +208,6 @@ def convert_to_markdown(
205
208
  sup_symbol: str = "",
206
209
  wrap: bool = False,
207
210
  wrap_width: int = 80,
208
- convert_as_inline: bool = False,
209
211
  ) -> str:
210
212
  """Convert HTML to Markdown.
211
213
 
@@ -216,6 +218,8 @@ def convert_to_markdown(
216
218
  code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
217
219
  code_language_callback: Function to dynamically determine the language for code blocks.
218
220
  convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
221
+ convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
222
+ custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
219
223
  default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
220
224
  escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
221
225
  escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
@@ -229,7 +233,6 @@ def convert_to_markdown(
229
233
  sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
230
234
  wrap: Wrap text to the specified width. Defaults to False.
231
235
  wrap_width: The number of characters at which to wrap text. Defaults to 80.
232
- convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
233
236
 
234
237
  Raises:
235
238
  ValueError: If both 'strip' and 'convert' are specified, or when the input HTML is empty.
@@ -263,6 +266,8 @@ def convert_to_markdown(
263
266
  wrap=wrap,
264
267
  wrap_width=wrap_width,
265
268
  )
269
+ if custom_converters:
270
+ converters_map.update(cast("ConvertersMap", custom_converters))
266
271
 
267
272
  return _process_tag(
268
273
  source,
@@ -0,0 +1,242 @@
1
+ Metadata-Version: 2.4
2
+ Name: html-to-markdown
3
+ Version: 1.3.0
4
+ Summary: Convert HTML to markdown
5
+ Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
+ License: MIT
7
+ Project-URL: homepage, https://github.com/Goldziher/html-to-markdown
8
+ Keywords: converter,html,markdown,text-extraction,text-processing
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3 :: Only
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Text Processing
19
+ Classifier: Topic :: Text Processing :: Markup
20
+ Classifier: Topic :: Text Processing :: Markup :: HTML
21
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
22
+ Classifier: Topic :: Utilities
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.9
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: beautifulsoup4>=4.12.3
28
+ Dynamic: license-file
29
+
30
+ # html-to-markdown
31
+
32
+ A modern, fully typed Python library for converting HTML to Markdown. This library is a completely rewritten fork
33
+ of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
34
+ Python 3.9+.
35
+
36
+ ## Features
37
+
38
+ - Full type safety with strict MyPy adherence
39
+ - Functional API design
40
+ - Extensive test coverage
41
+ - Configurable conversion options
42
+ - CLI tool for easy conversions
43
+ - Support for pre-configured BeautifulSoup instances
44
+ - Strict semver versioning
45
+
46
+ ## Installation
47
+
48
+ ```shell
49
+ pip install html-to-markdown
50
+ ```
51
+
52
+ ## Quick Start
53
+
54
+ Convert HTML to Markdown with a single function call:
55
+
56
+ ```python
57
+ from html_to_markdown import convert_to_markdown
58
+
59
+ html = """
60
+ <article>
61
+ <h1>Welcome</h1>
62
+ <p>This is a <strong>sample</strong> with a <a href="https://example.com">link</a>.</p>
63
+ <ul>
64
+ <li>Item 1</li>
65
+ <li>Item 2</li>
66
+ </ul>
67
+ </article>
68
+ """
69
+
70
+ markdown = convert_to_markdown(html)
71
+ print(markdown)
72
+ ```
73
+
74
+ Output:
75
+
76
+ ```markdown
77
+ # Welcome
78
+
79
+ This is a **sample** with a [link](https://example.com).
80
+
81
+ * Item 1
82
+ * Item 2
83
+ ```
84
+
85
+ ### Working with BeautifulSoup
86
+
87
+ If you need more control over HTML parsing, you can pass a pre-configured BeautifulSoup instance:
88
+
89
+ ```python
90
+ from bs4 import BeautifulSoup
91
+ from html_to_markdown import convert_to_markdown
92
+
93
+ # Configure BeautifulSoup with your preferred parser
94
+ soup = BeautifulSoup(html, "lxml") # Note: lxml requires additional installation
95
+ markdown = convert_to_markdown(soup)
96
+ ```
97
+
98
+ ## Advanced Usage
99
+
100
+ ### Customizing Conversion Options
101
+
102
+ The library offers extensive customization through various options:
103
+
104
+ ```python
105
+ from html_to_markdown import convert_to_markdown
106
+
107
+ html = "<div>Your content here...</div>"
108
+ markdown = convert_to_markdown(
109
+ html,
110
+ heading_style="atx", # Use # style headers
111
+ strong_em_symbol="*", # Use * for bold/italic
112
+ bullets="*+-", # Define bullet point characters
113
+ wrap=True, # Enable text wrapping
114
+ wrap_width=100, # Set wrap width
115
+ escape_asterisks=True, # Escape * characters
116
+ code_language="python", # Default code block language
117
+ )
118
+ ```
119
+
120
+ ### Custom Converters
121
+
122
+ You can provide your own conversion functions for specific HTML tags:
123
+
124
+ ```python
125
+ from bs4.element import Tag
126
+ from html_to_markdown import convert_to_markdown
127
+
128
+ # Define a custom converter for the <b> tag
129
+ def custom_bold_converter(*, tag: Tag, text: str, **kwargs) -> str:
130
+ return f"IMPORTANT: {text}"
131
+
132
+ html = "<p>This is a <b>bold statement</b>.</p>"
133
+ markdown = convert_to_markdown(html, custom_converters={"b": custom_bold_converter})
134
+ print(markdown)
135
+ # Output: This is a IMPORTANT: bold statement.
136
+ ```
137
+
138
+ Custom converters take precedence over the built-in converters and can be used alongside other configuration options.
139
+
140
+ ### Configuration Options
141
+
142
+ | Option | Type | Default | Description |
143
+ | -------------------- | ---- | -------------- | ------------------------------------------------------ |
144
+ | `autolinks` | bool | `True` | Auto-convert URLs to Markdown links |
145
+ | `bullets` | str | `'*+-'` | Characters to use for bullet points |
146
+ | `code_language` | str | `''` | Default language for code blocks |
147
+ | `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`) |
148
+ | `escape_asterisks` | bool | `True` | Escape * characters |
149
+ | `escape_underscores` | bool | `True` | Escape _ characters |
150
+ | `wrap` | bool | `False` | Enable text wrapping |
151
+ | `wrap_width` | int | `80` | Text wrap width |
152
+
153
+ For a complete list of options, see the [Configuration](#configuration) section below.
154
+
155
+ ## CLI Usage
156
+
157
+ Convert HTML files directly from the command line:
158
+
159
+ ```shell
160
+ # Convert a file
161
+ html_to_markdown input.html > output.md
162
+
163
+ # Process stdin
164
+ cat input.html | html_to_markdown > output.md
165
+
166
+ # Use custom options
167
+ html_to_markdown --heading-style atx --wrap --wrap-width 100 input.html > output.md
168
+ ```
169
+
170
+ View all available options:
171
+
172
+ ```shell
173
+ html_to_markdown --help
174
+ ```
175
+
176
+ ## Migration from Markdownify
177
+
178
+ For existing projects using Markdownify, a compatibility layer is provided:
179
+
180
+ ```python
181
+ # Old code
182
+ from markdownify import markdownify as md
183
+
184
+ # New code - works the same way
185
+ from html_to_markdown import markdownify as md
186
+ ```
187
+
188
+ The `markdownify` function is an alias for `convert_to_markdown` and provides identical functionality.
189
+
190
+ ## Configuration
191
+
192
+ Full list of configuration options:
193
+
194
+ - `autolinks`: Convert valid URLs to Markdown links automatically
195
+ - `bullets`: Characters to use for bullet points in lists
196
+ - `code_language`: Default language for fenced code blocks
197
+ - `code_language_callback`: Function to determine code block language
198
+ - `convert`: List of HTML tags to convert (None = all supported tags)
199
+ - `default_title`: Use default titles for elements like links
200
+ - `escape_asterisks`: Escape * characters
201
+ - `escape_misc`: Escape miscellaneous Markdown characters
202
+ - `escape_underscores`: Escape _ characters
203
+ - `heading_style`: Header style (underlined/atx/atx_closed)
204
+ - `keep_inline_images_in`: Tags where inline images should be kept
205
+ - `newline_style`: Style for handling newlines (spaces/backslash)
206
+ - `strip`: Tags to remove from output
207
+ - `strong_em_symbol`: Symbol for strong/emphasized text (\* or \_)
208
+ - `sub_symbol`: Symbol for subscript text
209
+ - `sup_symbol`: Symbol for superscript text
210
+ - `wrap`: Enable text wrapping
211
+ - `wrap_width`: Width for text wrapping
212
+ - `convert_as_inline`: Treat content as inline elements
213
+ - `custom_converters`: A mapping of HTML tag names to custom converter functions
214
+
215
+ ## Contribution
216
+
217
+ This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before
218
+ submitting PRs to avoid disappointment.
219
+
220
+ ### Local Development
221
+
222
+ 1. Clone the repo
223
+
224
+ 1. Install the system dependencies
225
+
226
+ 1. Install the full dependencies with `uv sync`
227
+
228
+ 1. Install the pre-commit hooks with:
229
+
230
+ ```shell
231
+ pre-commit install && pre-commit install --hook-type commit-msg
232
+ ```
233
+
234
+ 1. Make your changes and submit a PR
235
+
236
+ ## License
237
+
238
+ This library uses the MIT license.
239
+
240
+ ## Acknowledgments
241
+
242
+ Special thanks to the original [markdownify](https://pypi.org/project/markdownify/) project creators and contributors.
@@ -0,0 +1,13 @@
1
+ html_to_markdown/__init__.py,sha256=95S7_7mR_g88uTnFI0FaRNykrtAaSKb6sJbwSea2zjk,145
2
+ html_to_markdown/__main__.py,sha256=u5xevySlT5eIGyLUaethdDQIKJygaKnc3F2sHWoz75g,264
3
+ html_to_markdown/cli.py,sha256=HVnzmcyrYwah_yWhZ87mZcG0VgnKYp6y89fJh2R-Rlw,4532
4
+ html_to_markdown/constants.py,sha256=Usk67k18tuRovJpKDsiEXdgH20KgqI9KOnK4Fbx-M5c,547
5
+ html_to_markdown/converters.py,sha256=p8arBdejEeuAp9_wIYvp5PuWNBB0M699CgLSEkW3v88,11910
6
+ html_to_markdown/processing.py,sha256=WNMzB_dt5yn11xK59zPVMb1aMCvMAEowpgg-tki8meI,9028
7
+ html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
9
+ html_to_markdown-1.3.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
10
+ html_to_markdown-1.3.0.dist-info/METADATA,sha256=XU_lAyXhm3okv_ly0KSQK7LRHvSbQbW464qXnJ3ryVw,7653
11
+ html_to_markdown-1.3.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
12
+ html_to_markdown-1.3.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
13
+ html_to_markdown-1.3.0.dist-info/RECORD,,
@@ -1,4 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.27.0
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
+
@@ -1,7 +1,7 @@
1
1
  The MIT License (MIT)
2
2
 
3
3
  Copyright 2012-2018 Matthew Tretter
4
- Copyright 2024 Na'aman Hirschfeld
4
+ Copyright 2024-2025 Na'aman Hirschfeld
5
5
 
6
6
  Permission is hereby granted, free of charge, to any person obtaining a copy
7
7
  of this software and associated documentation files (the "Software"), to deal
@@ -0,0 +1 @@
1
+ html_to_markdown
@@ -1,89 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import TYPE_CHECKING, Literal
4
-
5
- from html_to_markdown.constants import ASTERISK, SPACES, UNDERLINED
6
- from html_to_markdown.converters import create_converters_map
7
-
8
- if TYPE_CHECKING:
9
- from collections.abc import Callable, Iterable
10
-
11
- from bs4 import Tag
12
-
13
-
14
- def _create_legacy_class(
15
- autolinks: bool,
16
- bullets: str,
17
- code_language: str,
18
- code_language_callback: Callable[[Tag], str] | None,
19
- default_title: bool,
20
- heading_style: Literal["atx", "atx_closed", "underlined"],
21
- keep_inline_images_in: Iterable[str] | None,
22
- newline_style: str,
23
- strong_em_symbol: str,
24
- sub_symbol: str,
25
- sup_symbol: str,
26
- wrap: bool,
27
- wrap_width: int,
28
- ) -> type:
29
- """Create a legacy class for Markdownify.
30
-
31
- Deprecated: Use the new hooks api instead.
32
-
33
- Args:
34
- autolinks: Whether to convert URLs into links.
35
- bullets: The bullet characters to use for unordered lists.
36
- code_language: The default code language to use.
37
- code_language_callback: A callback to get the code language.
38
- default_title: Whether to use the URL as the title for links.
39
- heading_style: The style of headings.
40
- keep_inline_images_in: The tags to keep inline images in.
41
- newline_style: The style of newlines.
42
- strong_em_symbol: The symbol to use for strong and emphasis text.
43
- sub_symbol: The symbol to use for subscript text.
44
- sup_symbol: The symbol to use for superscript text.
45
- wrap: Whether to wrap text.
46
- wrap_width: The width to wrap text at.
47
-
48
- Returns:
49
- A class that can be used to convert HTML to Markdown.
50
- """
51
- return type(
52
- "Markdownify",
53
- (),
54
- {
55
- k.removeprefix("_"): v
56
- for k, v in create_converters_map(
57
- autolinks=autolinks,
58
- bullets=bullets,
59
- code_language=code_language,
60
- code_language_callback=code_language_callback,
61
- default_title=default_title,
62
- heading_style=heading_style,
63
- keep_inline_images_in=keep_inline_images_in,
64
- newline_style=newline_style,
65
- strong_em_symbol=strong_em_symbol,
66
- sub_symbol=sub_symbol,
67
- sup_symbol=sup_symbol,
68
- wrap=wrap,
69
- wrap_width=wrap_width,
70
- ).items()
71
- },
72
- )
73
-
74
-
75
- Markdownify = _create_legacy_class(
76
- autolinks=True,
77
- bullets="*+-",
78
- code_language="",
79
- code_language_callback=None,
80
- default_title=False,
81
- heading_style=UNDERLINED,
82
- keep_inline_images_in=None,
83
- newline_style=SPACES,
84
- strong_em_symbol=ASTERISK,
85
- sub_symbol="",
86
- sup_symbol="",
87
- wrap=False,
88
- wrap_width=80,
89
- )
@@ -1,102 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: html-to-markdown
3
- Version: 1.2.0
4
- Summary: Convert HTML to markdown
5
- Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
- License: MIT
7
- License-File: LICENSE
8
- Keywords: converter,html,markdown,text-extraction,text-processing
9
- Classifier: Intended Audience :: Developers
10
- Classifier: License :: OSI Approved :: MIT License
11
- Classifier: Operating System :: OS Independent
12
- Classifier: Programming Language :: Python :: 3 :: Only
13
- Classifier: Programming Language :: Python :: 3.9
14
- Classifier: Programming Language :: Python :: 3.10
15
- Classifier: Programming Language :: Python :: 3.11
16
- Classifier: Programming Language :: Python :: 3.12
17
- Classifier: Programming Language :: Python :: 3.13
18
- Classifier: Topic :: Text Processing
19
- Classifier: Topic :: Text Processing :: Markup
20
- Classifier: Topic :: Text Processing :: Markup :: HTML
21
- Classifier: Topic :: Text Processing :: Markup :: Markdown
22
- Classifier: Topic :: Utilities
23
- Classifier: Typing :: Typed
24
- Requires-Python: >=3.9
25
- Requires-Dist: beautifulsoup4>=4.12.3
26
- Description-Content-Type: text/markdown
27
-
28
- # html_to_markdown
29
-
30
- This library is a refactored and modernized fork of [markdownify](https://pypi.org/project/markdownify/), supporting
31
- Python 3.9 and above.
32
-
33
- ### Differences with the Markdownify
34
-
35
- - The refactored codebase uses a strict functional approach - no classes are involved.
36
- - There is full typing with strict MyPy adherence and a py.typed file included.
37
- - The `convert_to_markdown` function allows passing a pre-configured instance of `BeautifulSoup` instead of html.
38
- - This library's releases follow standard semver. Its version v1.0.0 was branched from markdownify's v0.13.1, at which
39
- point versioning is no longer aligned.
40
-
41
- ## Installation
42
-
43
- ```shell
44
- pip install html_to_markdown
45
- ```
46
-
47
- ## Usage
48
-
49
- Convert an HTML string to Markdown:
50
-
51
- ```python
52
- from html_to_markdown import convert_to_markdown
53
-
54
- convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>') # > '**Yay** [GitHub](http://github.com)'
55
- ```
56
-
57
- Or pass a pre-configured instance of `BeautifulSoup`:
58
-
59
- ```python
60
- from bs4 import BeautifulSoup
61
- from html_to_markdown import convert_to_markdown
62
-
63
- soup = BeautifulSoup('<b>Yay</b> <a href="http://github.com">GitHub</a>', 'lxml') # lxml requires an extra dependency.
64
-
65
- convert_to_markdown(soup) # > '**Yay** [GitHub](http://github.com)'
66
- ```
67
-
68
- ### Options
69
-
70
- The `convert_to_markdown` function accepts the following kwargs:
71
-
72
- - autolinks (bool): Automatically convert valid URLs into Markdown links. Defaults to True.
73
- - bullets (str): A string of characters to use for bullet points in lists. Defaults to '\*+-'.
74
- - code_language (str): Default language identifier for fenced code blocks. Defaults to an empty string.
75
- - code_language_callback (Callable[[Any], str] | None): Function to dynamically determine the language for code blocks.
76
- - convert (Iterable[str] | None): A list of tag names to convert to Markdown. If None, all supported tags are converted.
77
- - default_title (bool): Use the default title when converting certain elements (e.g., links). Defaults to False.
78
- - escape_asterisks (bool): Escape asterisks (\*) to prevent unintended Markdown formatting. Defaults to True.
79
- - escape_misc (bool): Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
80
- - escape_underscores (bool): Escape underscores (\_) to prevent unintended italic formatting. Defaults to True.
81
- - heading_style (Literal["underlined", "atx", "atx_closed"]): The style to use for Markdown headings. Defaults to "
82
- underlined".
83
- - keep_inline_images_in (Iterable[str] | None): Tags in which inline images should be preserved. Defaults to None.
84
- - newline_style (Literal["spaces", "backslash"]): Style for handling newlines in text content. Defaults to "spaces".
85
- - strip (Iterable[str] | None): Tags to strip from the output. Defaults to None.
86
- - strong_em_symbol (Literal["\*", "\_"]): Symbol to use for strong/emphasized text. Defaults to "\*".
87
- - sub_symbol (str): Custom symbol for subscript text. Defaults to an empty string.
88
- - sup_symbol (str): Custom symbol for superscript text. Defaults to an empty string.
89
- - wrap (bool): Wrap text to the specified width. Defaults to False.
90
- - wrap_width (int): The number of characters at which to wrap text. Defaults to 80.
91
- - convert_as_inline (bool): Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
92
-
93
- ## CLI
94
-
95
- For compatibility with the original markdownify, a CLI is provided. Use `html_to_markdown example.html > example.md` or
96
- pipe input from stdin:
97
-
98
- ```shell
99
- cat example.html | html_to_markdown > example.md
100
- ```
101
-
102
- Use `html_to_markdown -h` to see all available options. They are the same as listed above and take the same arguments.
@@ -1,13 +0,0 @@
1
- html_to_markdown/__init__.py,sha256=cXm4YOyrAp2HKHMDfnVA5e75zg6wdqpyXugjBYvBMFc,143
2
- html_to_markdown/__main__.py,sha256=u5xevySlT5eIGyLUaethdDQIKJygaKnc3F2sHWoz75g,264
3
- html_to_markdown/cli.py,sha256=HVnzmcyrYwah_yWhZ87mZcG0VgnKYp6y89fJh2R-Rlw,4532
4
- html_to_markdown/constants.py,sha256=Usk67k18tuRovJpKDsiEXdgH20KgqI9KOnK4Fbx-M5c,547
5
- html_to_markdown/converters.py,sha256=hW4RqAbgx0tdTzfUSvAGQg1OgQUmHL1cekZtJLFq_Ns,12080
6
- html_to_markdown/legacy.py,sha256=vL-MVKPXOue-JJafXFtmGcVIPylwmPOly0CELTSzWRQ,2773
7
- html_to_markdown/processing.py,sha256=L1wZwUm7WA8wN4GA5zjCStwACb-8S2scQZPbzeHgdY8,8951
8
- html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
10
- html_to_markdown-1.2.0.dist-info/METADATA,sha256=Dg2ZibNWNW_GyszXG2bxT-oOtOJc8iryVxlLn38eMww,4709
11
- html_to_markdown-1.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
12
- html_to_markdown-1.2.0.dist-info/licenses/LICENSE,sha256=06BS7zd6oPCrbzAqrThGFboRlbssgBsqDJGqKyZW2Og,1117
13
- html_to_markdown-1.2.0.dist-info/RECORD,,