html-to-markdown 1.1.0__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-1.1.0 → html_to_markdown-1.2.0}/PKG-INFO +8 -7
- {html_to_markdown-1.1.0 → html_to_markdown-1.2.0}/README.md +4 -4
- html_to_markdown-1.2.0/html_to_markdown/__init__.py +5 -0
- html_to_markdown-1.2.0/html_to_markdown/__main__.py +11 -0
- html_to_markdown-1.2.0/html_to_markdown/constants.py +18 -0
- {html_to_markdown-1.1.0 → html_to_markdown-1.2.0}/html_to_markdown/converters.py +9 -7
- html_to_markdown-1.2.0/html_to_markdown/legacy.py +89 -0
- {html_to_markdown-1.1.0 → html_to_markdown-1.2.0}/html_to_markdown/processing.py +33 -14
- html_to_markdown-1.2.0/pyproject.toml +113 -0
- html_to_markdown-1.1.0/html_to_markdown/__init__.py +0 -3
- html_to_markdown-1.1.0/html_to_markdown/__main__.py +0 -7
- html_to_markdown-1.1.0/html_to_markdown/constants.py +0 -18
- html_to_markdown-1.1.0/pyproject.toml +0 -137
- {html_to_markdown-1.1.0 → html_to_markdown-1.2.0}/.gitignore +0 -0
- {html_to_markdown-1.1.0 → html_to_markdown-1.2.0}/LICENSE +0 -0
- {html_to_markdown-1.1.0 → html_to_markdown-1.2.0}/html_to_markdown/cli.py +0 -0
- {html_to_markdown-1.1.0 → html_to_markdown-1.2.0}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-1.1.0 → html_to_markdown-1.2.0}/html_to_markdown/utils.py +0 -0
|
@@ -1,14 +1,15 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.1.0
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: Convert HTML to markdown
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
7
7
|
License-File: LICENSE
|
|
8
|
-
Keywords:
|
|
8
|
+
Keywords: converter,html,markdown,text-extraction,text-processing
|
|
9
9
|
Classifier: Intended Audience :: Developers
|
|
10
10
|
Classifier: License :: OSI Approved :: MIT License
|
|
11
11
|
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
12
13
|
Classifier: Programming Language :: Python :: 3.9
|
|
13
14
|
Classifier: Programming Language :: Python :: 3.10
|
|
14
15
|
Classifier: Programming Language :: Python :: 3.11
|
|
@@ -69,20 +70,20 @@ convert_to_markdown(soup) # > '**Yay** [GitHub](http://github.com)'
|
|
|
69
70
|
The `convert_to_markdown` function accepts the following kwargs:
|
|
70
71
|
|
|
71
72
|
- autolinks (bool): Automatically convert valid URLs into Markdown links. Defaults to True.
|
|
72
|
-
- bullets (str): A string of characters to use for bullet points in lists. Defaults to '
|
|
73
|
+
- bullets (str): A string of characters to use for bullet points in lists. Defaults to '\*+-'.
|
|
73
74
|
- code_language (str): Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
74
75
|
- code_language_callback (Callable[[Any], str] | None): Function to dynamically determine the language for code blocks.
|
|
75
76
|
- convert (Iterable[str] | None): A list of tag names to convert to Markdown. If None, all supported tags are converted.
|
|
76
77
|
- default_title (bool): Use the default title when converting certain elements (e.g., links). Defaults to False.
|
|
77
|
-
- escape_asterisks (bool): Escape asterisks (
|
|
78
|
+
- escape_asterisks (bool): Escape asterisks (\*) to prevent unintended Markdown formatting. Defaults to True.
|
|
78
79
|
- escape_misc (bool): Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
|
|
79
|
-
-
|
|
80
|
+
- escape_underscores (bool): Escape underscores (\_) to prevent unintended italic formatting. Defaults to True.
|
|
80
81
|
- heading_style (Literal["underlined", "atx", "atx_closed"]): The style to use for Markdown headings. Defaults to "
|
|
81
82
|
underlined".
|
|
82
83
|
- keep_inline_images_in (Iterable[str] | None): Tags in which inline images should be preserved. Defaults to None.
|
|
83
84
|
- newline_style (Literal["spaces", "backslash"]): Style for handling newlines in text content. Defaults to "spaces".
|
|
84
85
|
- strip (Iterable[str] | None): Tags to strip from the output. Defaults to None.
|
|
85
|
-
-
|
|
86
|
+
- strong_em_symbol (Literal["\*", "\_"]): Symbol to use for strong/emphasized text. Defaults to "\*".
|
|
86
87
|
- sub_symbol (str): Custom symbol for subscript text. Defaults to an empty string.
|
|
87
88
|
- sup_symbol (str): Custom symbol for superscript text. Defaults to an empty string.
|
|
88
89
|
- wrap (bool): Wrap text to the specified width. Defaults to False.
|
|
@@ -43,20 +43,20 @@ convert_to_markdown(soup) # > '**Yay** [GitHub](http://github.com)'
|
|
|
43
43
|
The `convert_to_markdown` function accepts the following kwargs:
|
|
44
44
|
|
|
45
45
|
- autolinks (bool): Automatically convert valid URLs into Markdown links. Defaults to True.
|
|
46
|
-
- bullets (str): A string of characters to use for bullet points in lists. Defaults to '
|
|
46
|
+
- bullets (str): A string of characters to use for bullet points in lists. Defaults to '\*+-'.
|
|
47
47
|
- code_language (str): Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
48
48
|
- code_language_callback (Callable[[Any], str] | None): Function to dynamically determine the language for code blocks.
|
|
49
49
|
- convert (Iterable[str] | None): A list of tag names to convert to Markdown. If None, all supported tags are converted.
|
|
50
50
|
- default_title (bool): Use the default title when converting certain elements (e.g., links). Defaults to False.
|
|
51
|
-
- escape_asterisks (bool): Escape asterisks (
|
|
51
|
+
- escape_asterisks (bool): Escape asterisks (\*) to prevent unintended Markdown formatting. Defaults to True.
|
|
52
52
|
- escape_misc (bool): Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
|
|
53
|
-
-
|
|
53
|
+
- escape_underscores (bool): Escape underscores (\_) to prevent unintended italic formatting. Defaults to True.
|
|
54
54
|
- heading_style (Literal["underlined", "atx", "atx_closed"]): The style to use for Markdown headings. Defaults to "
|
|
55
55
|
underlined".
|
|
56
56
|
- keep_inline_images_in (Iterable[str] | None): Tags in which inline images should be preserved. Defaults to None.
|
|
57
57
|
- newline_style (Literal["spaces", "backslash"]): Style for handling newlines in text content. Defaults to "spaces".
|
|
58
58
|
- strip (Iterable[str] | None): Tags to strip from the output. Defaults to None.
|
|
59
|
-
-
|
|
59
|
+
- strong_em_symbol (Literal["\*", "\_"]): Symbol to use for strong/emphasized text. Defaults to "\*".
|
|
60
60
|
- sub_symbol (str): Custom symbol for subscript text. Defaults to an empty string.
|
|
61
61
|
- sup_symbol (str): Custom symbol for superscript text. Defaults to an empty string.
|
|
62
62
|
- wrap (bool): Wrap text to the specified width. Defaults to False.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from re import Pattern
|
|
5
|
+
from typing import Final
|
|
6
|
+
|
|
7
|
+
convert_heading_re: Final[Pattern[str]] = re.compile(r"convert_h(\d+)")
|
|
8
|
+
line_beginning_re: Final[Pattern[str]] = re.compile(r"^", re.MULTILINE)
|
|
9
|
+
whitespace_re: Final[Pattern[str]] = re.compile(r"[\t ]+")
|
|
10
|
+
html_heading_re: Final[Pattern[str]] = re.compile(r"h[1-6]")
|
|
11
|
+
|
|
12
|
+
ASTERISK: Final = "*"
|
|
13
|
+
ATX: Final = "atx"
|
|
14
|
+
ATX_CLOSED: Final = "atx_closed"
|
|
15
|
+
BACKSLASH: Final = "backslash"
|
|
16
|
+
UNDERLINED: Final = "underlined"
|
|
17
|
+
SPACES: Final = "spaces"
|
|
18
|
+
UNDERSCORE: Final = "_"
|
|
@@ -55,17 +55,19 @@ SupportedElements = Literal[
|
|
|
55
55
|
"kbd",
|
|
56
56
|
]
|
|
57
57
|
|
|
58
|
-
|
|
58
|
+
ConvertersMap = Mapping[SupportedElements, Callable[[str, Tag], str]]
|
|
59
59
|
|
|
60
60
|
T = TypeVar("T")
|
|
61
61
|
|
|
62
62
|
|
|
63
63
|
def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
|
|
64
|
-
"""
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
64
|
+
"""Create an inline converter for a markup pattern or tag.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
markup_prefix: The markup prefix to insert.
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
A function that can be used to convert HTML to Markdown.
|
|
69
71
|
"""
|
|
70
72
|
|
|
71
73
|
def implementation(*, tag: Tag, text: str) -> str:
|
|
@@ -295,7 +297,7 @@ def create_converters_map(
|
|
|
295
297
|
sup_symbol: str,
|
|
296
298
|
wrap: bool,
|
|
297
299
|
wrap_width: int,
|
|
298
|
-
) ->
|
|
300
|
+
) -> ConvertersMap:
|
|
299
301
|
"""Create a mapping of HTML elements to their corresponding conversion functions.
|
|
300
302
|
|
|
301
303
|
Args:
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Literal
|
|
4
|
+
|
|
5
|
+
from html_to_markdown.constants import ASTERISK, SPACES, UNDERLINED
|
|
6
|
+
from html_to_markdown.converters import create_converters_map
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from collections.abc import Callable, Iterable
|
|
10
|
+
|
|
11
|
+
from bs4 import Tag
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _create_legacy_class(
|
|
15
|
+
autolinks: bool,
|
|
16
|
+
bullets: str,
|
|
17
|
+
code_language: str,
|
|
18
|
+
code_language_callback: Callable[[Tag], str] | None,
|
|
19
|
+
default_title: bool,
|
|
20
|
+
heading_style: Literal["atx", "atx_closed", "underlined"],
|
|
21
|
+
keep_inline_images_in: Iterable[str] | None,
|
|
22
|
+
newline_style: str,
|
|
23
|
+
strong_em_symbol: str,
|
|
24
|
+
sub_symbol: str,
|
|
25
|
+
sup_symbol: str,
|
|
26
|
+
wrap: bool,
|
|
27
|
+
wrap_width: int,
|
|
28
|
+
) -> type:
|
|
29
|
+
"""Create a legacy class for Markdownify.
|
|
30
|
+
|
|
31
|
+
Deprecated: Use the new hooks api instead.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
autolinks: Whether to convert URLs into links.
|
|
35
|
+
bullets: The bullet characters to use for unordered lists.
|
|
36
|
+
code_language: The default code language to use.
|
|
37
|
+
code_language_callback: A callback to get the code language.
|
|
38
|
+
default_title: Whether to use the URL as the title for links.
|
|
39
|
+
heading_style: The style of headings.
|
|
40
|
+
keep_inline_images_in: The tags to keep inline images in.
|
|
41
|
+
newline_style: The style of newlines.
|
|
42
|
+
strong_em_symbol: The symbol to use for strong and emphasis text.
|
|
43
|
+
sub_symbol: The symbol to use for subscript text.
|
|
44
|
+
sup_symbol: The symbol to use for superscript text.
|
|
45
|
+
wrap: Whether to wrap text.
|
|
46
|
+
wrap_width: The width to wrap text at.
|
|
47
|
+
|
|
48
|
+
Returns:
|
|
49
|
+
A class that can be used to convert HTML to Markdown.
|
|
50
|
+
"""
|
|
51
|
+
return type(
|
|
52
|
+
"Markdownify",
|
|
53
|
+
(),
|
|
54
|
+
{
|
|
55
|
+
k.removeprefix("_"): v
|
|
56
|
+
for k, v in create_converters_map(
|
|
57
|
+
autolinks=autolinks,
|
|
58
|
+
bullets=bullets,
|
|
59
|
+
code_language=code_language,
|
|
60
|
+
code_language_callback=code_language_callback,
|
|
61
|
+
default_title=default_title,
|
|
62
|
+
heading_style=heading_style,
|
|
63
|
+
keep_inline_images_in=keep_inline_images_in,
|
|
64
|
+
newline_style=newline_style,
|
|
65
|
+
strong_em_symbol=strong_em_symbol,
|
|
66
|
+
sub_symbol=sub_symbol,
|
|
67
|
+
sup_symbol=sup_symbol,
|
|
68
|
+
wrap=wrap,
|
|
69
|
+
wrap_width=wrap_width,
|
|
70
|
+
).items()
|
|
71
|
+
},
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
Markdownify = _create_legacy_class(
|
|
76
|
+
autolinks=True,
|
|
77
|
+
bullets="*+-",
|
|
78
|
+
code_language="",
|
|
79
|
+
code_language_callback=None,
|
|
80
|
+
default_title=False,
|
|
81
|
+
heading_style=UNDERLINED,
|
|
82
|
+
keep_inline_images_in=None,
|
|
83
|
+
newline_style=SPACES,
|
|
84
|
+
strong_em_symbol=ASTERISK,
|
|
85
|
+
sub_symbol="",
|
|
86
|
+
sup_symbol="",
|
|
87
|
+
wrap=False,
|
|
88
|
+
wrap_width=80,
|
|
89
|
+
)
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from itertools import chain
|
|
3
4
|
from typing import TYPE_CHECKING, Any, Callable, Literal, cast
|
|
4
5
|
|
|
5
6
|
from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
|
|
@@ -11,7 +12,7 @@ from html_to_markdown.constants import (
|
|
|
11
12
|
html_heading_re,
|
|
12
13
|
whitespace_re,
|
|
13
14
|
)
|
|
14
|
-
from html_to_markdown.converters import
|
|
15
|
+
from html_to_markdown.converters import ConvertersMap, create_converters_map
|
|
15
16
|
from html_to_markdown.utils import escape
|
|
16
17
|
|
|
17
18
|
if TYPE_CHECKING:
|
|
@@ -76,18 +77,21 @@ def _is_nested_tag(el: PageElement) -> bool:
|
|
|
76
77
|
|
|
77
78
|
def _process_tag(
|
|
78
79
|
tag: Tag,
|
|
79
|
-
converters_map:
|
|
80
|
+
converters_map: ConvertersMap,
|
|
80
81
|
*,
|
|
81
|
-
convert:
|
|
82
|
+
convert: set[str] | None,
|
|
82
83
|
convert_as_inline: bool = False,
|
|
83
84
|
escape_asterisks: bool,
|
|
84
85
|
escape_misc: bool,
|
|
85
86
|
escape_underscores: bool,
|
|
86
|
-
strip:
|
|
87
|
+
strip: set[str] | None,
|
|
87
88
|
) -> str:
|
|
89
|
+
should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
|
|
90
|
+
tag_name: SupportedTag | None = cast(SupportedTag, tag.name.lower()) if tag.name.lower() in converters_map else None
|
|
88
91
|
text = ""
|
|
92
|
+
|
|
89
93
|
is_heading = html_heading_re.match(tag.name) is not None
|
|
90
|
-
is_cell =
|
|
94
|
+
is_cell = tag_name in {"td", "th"}
|
|
91
95
|
convert_children_as_inline = convert_as_inline or is_heading or is_cell
|
|
92
96
|
|
|
93
97
|
if _is_nested_tag(tag):
|
|
@@ -121,9 +125,7 @@ def _process_tag(
|
|
|
121
125
|
strip=strip,
|
|
122
126
|
)
|
|
123
127
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
if tag_name and _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert):
|
|
128
|
+
if tag_name and should_convert_tag:
|
|
127
129
|
return converters_map[tag_name]( # type: ignore[call-arg]
|
|
128
130
|
tag=tag, text=text, convert_as_inline=convert_as_inline
|
|
129
131
|
)
|
|
@@ -166,7 +168,7 @@ def _process_text(
|
|
|
166
168
|
return text
|
|
167
169
|
|
|
168
170
|
|
|
169
|
-
def _should_convert_tag(*, tag_name: str, strip:
|
|
171
|
+
def _should_convert_tag(*, tag_name: str, strip: set[str] | None, convert: set[str] | None) -> bool:
|
|
170
172
|
if strip is not None:
|
|
171
173
|
return tag_name not in strip
|
|
172
174
|
if convert is not None:
|
|
@@ -174,6 +176,14 @@ def _should_convert_tag(*, tag_name: str, strip: Iterable[str] | None, convert:
|
|
|
174
176
|
return True
|
|
175
177
|
|
|
176
178
|
|
|
179
|
+
def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
|
|
180
|
+
if value is None:
|
|
181
|
+
return None
|
|
182
|
+
if isinstance(value, str):
|
|
183
|
+
return set(",".split(value))
|
|
184
|
+
return {*chain(*[v.split(",") for v in value])}
|
|
185
|
+
|
|
186
|
+
|
|
177
187
|
def convert_to_markdown(
|
|
178
188
|
source: str | BeautifulSoup,
|
|
179
189
|
*,
|
|
@@ -181,7 +191,7 @@ def convert_to_markdown(
|
|
|
181
191
|
bullets: str = "*+-",
|
|
182
192
|
code_language: str = "",
|
|
183
193
|
code_language_callback: Callable[[Any], str] | None = None,
|
|
184
|
-
convert: Iterable[str] | None = None,
|
|
194
|
+
convert: str | Iterable[str] | None = None,
|
|
185
195
|
default_title: bool = False,
|
|
186
196
|
escape_asterisks: bool = True,
|
|
187
197
|
escape_misc: bool = True,
|
|
@@ -189,7 +199,7 @@ def convert_to_markdown(
|
|
|
189
199
|
heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
|
|
190
200
|
keep_inline_images_in: Iterable[str] | None = None,
|
|
191
201
|
newline_style: Literal["spaces", "backslash"] = SPACES,
|
|
192
|
-
strip: Iterable[str] | None = None,
|
|
202
|
+
strip: str | Iterable[str] | None = None,
|
|
193
203
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
194
204
|
sub_symbol: str = "",
|
|
195
205
|
sup_symbol: str = "",
|
|
@@ -221,13 +231,22 @@ def convert_to_markdown(
|
|
|
221
231
|
wrap_width: The number of characters at which to wrap text. Defaults to 80.
|
|
222
232
|
convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
|
|
223
233
|
|
|
234
|
+
Raises:
|
|
235
|
+
ValueError: If both 'strip' and 'convert' are specified, or when the input HTML is empty.
|
|
236
|
+
|
|
224
237
|
Returns:
|
|
225
238
|
str: A string of Markdown-formatted text converted from the given HTML.
|
|
226
239
|
"""
|
|
227
240
|
if isinstance(source, str):
|
|
228
241
|
from bs4 import BeautifulSoup
|
|
229
242
|
|
|
230
|
-
|
|
243
|
+
if "".join(source.split("\n")):
|
|
244
|
+
source = BeautifulSoup(source, "html.parser")
|
|
245
|
+
else:
|
|
246
|
+
raise ValueError("The input HTML is empty.")
|
|
247
|
+
|
|
248
|
+
if strip is not None and convert is not None:
|
|
249
|
+
raise ValueError("Only one of 'strip' and 'convert' can be specified.")
|
|
231
250
|
|
|
232
251
|
converters_map = create_converters_map(
|
|
233
252
|
autolinks=autolinks,
|
|
@@ -248,10 +267,10 @@ def convert_to_markdown(
|
|
|
248
267
|
return _process_tag(
|
|
249
268
|
source,
|
|
250
269
|
converters_map,
|
|
251
|
-
convert=convert,
|
|
270
|
+
convert=_as_optional_set(convert),
|
|
252
271
|
convert_as_inline=convert_as_inline,
|
|
253
272
|
escape_asterisks=escape_asterisks,
|
|
254
273
|
escape_misc=escape_misc,
|
|
255
274
|
escape_underscores=escape_underscores,
|
|
256
|
-
strip=strip,
|
|
275
|
+
strip=_as_optional_set(strip),
|
|
257
276
|
)
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
build-backend = "hatchling.build"
|
|
3
|
+
|
|
4
|
+
requires = [ "hatchling" ]
|
|
5
|
+
|
|
6
|
+
[project]
|
|
7
|
+
name = "html-to-markdown"
|
|
8
|
+
version = "1.2.0"
|
|
9
|
+
description = "Convert HTML to markdown"
|
|
10
|
+
readme = "README.md"
|
|
11
|
+
keywords = [ "converter", "html", "markdown", "text-extraction", "text-processing" ]
|
|
12
|
+
|
|
13
|
+
license = { text = "MIT" }
|
|
14
|
+
authors = [ { name = "Na'aman Hirschfeld", email = "nhirschfeld@gmail.com" } ]
|
|
15
|
+
requires-python = ">=3.9"
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Operating System :: OS Independent",
|
|
20
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
21
|
+
"Programming Language :: Python :: 3.9",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Programming Language :: Python :: 3.13",
|
|
26
|
+
"Topic :: Text Processing",
|
|
27
|
+
"Topic :: Text Processing :: Markup",
|
|
28
|
+
"Topic :: Text Processing :: Markup :: HTML",
|
|
29
|
+
"Topic :: Text Processing :: Markup :: Markdown",
|
|
30
|
+
"Topic :: Utilities",
|
|
31
|
+
"Typing :: Typed",
|
|
32
|
+
]
|
|
33
|
+
dependencies = [
|
|
34
|
+
"beautifulsoup4>=4.12.3",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[dependency-groups]
|
|
38
|
+
dev = [
|
|
39
|
+
"covdefaults>=2.3",
|
|
40
|
+
"mypy>=1.14.1",
|
|
41
|
+
"pre-commit>=4.1",
|
|
42
|
+
"pytest>=8.3.4",
|
|
43
|
+
"pytest-cov>=6",
|
|
44
|
+
"pytest-mock>=3.14",
|
|
45
|
+
"ruff>=0.9.3",
|
|
46
|
+
"types-beautifulsoup4>=4.12.0.20241020",
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
[tool.hatch.build]
|
|
50
|
+
skip-excluded-dirs = true
|
|
51
|
+
|
|
52
|
+
[tool.hatch.build.targets.sdist]
|
|
53
|
+
only-include = [ "html_to_markdown" ]
|
|
54
|
+
|
|
55
|
+
[tool.hatch.build.targets.wheel]
|
|
56
|
+
only-include = [ "html_to_markdown" ]
|
|
57
|
+
|
|
58
|
+
[tool.ruff]
|
|
59
|
+
target-version = "py39"
|
|
60
|
+
line-length = 120
|
|
61
|
+
src = [ "html_to_markdown", "tests" ]
|
|
62
|
+
|
|
63
|
+
format.docstring-code-line-length = 120
|
|
64
|
+
format.docstring-code-format = true
|
|
65
|
+
lint.select = [ "ALL" ]
|
|
66
|
+
lint.ignore = [
|
|
67
|
+
"ANN401", # dynamically typed ANY
|
|
68
|
+
"COM812", # Conflicts with formatter
|
|
69
|
+
"D100", # pydocstyle - missing docstring in public module
|
|
70
|
+
"D104", # pydocstyle - missing docstring in public package
|
|
71
|
+
"D107", # pydocstyle - missing docstring in __init__
|
|
72
|
+
"D205", # pydocstyle - 1 blank line required between summary line and description
|
|
73
|
+
"E501", # pycodestyle line too long, handled by ruff format
|
|
74
|
+
"EM", # Exception messages,
|
|
75
|
+
"FBT", # Boolean Args
|
|
76
|
+
"FIX", # we allow todo and fixme comments
|
|
77
|
+
"ISC001", # Conflicts with formatter
|
|
78
|
+
"PLR0913", # Pylint - too many arguments.
|
|
79
|
+
"PLR2004", # Magic variables, we allow them
|
|
80
|
+
"TD", # we allow todo and fixme comments
|
|
81
|
+
"TRY", # Try except block, rules are too strict
|
|
82
|
+
]
|
|
83
|
+
lint.per-file-ignores."tests/**/*.*" = [ "ARG", "D", "PD", "PT006", "PT013", "S" ]
|
|
84
|
+
lint.isort.known-first-party = [ "html_to_markdown", "tests" ]
|
|
85
|
+
lint.pydocstyle.convention = "google"
|
|
86
|
+
|
|
87
|
+
[tool.pytest.ini_options]
|
|
88
|
+
asyncio_mode = "auto"
|
|
89
|
+
asyncio_default_fixture_loop_scope = "function"
|
|
90
|
+
|
|
91
|
+
[tool.coverage.run]
|
|
92
|
+
omit = [ "tests/*" ]
|
|
93
|
+
plugins = [ "covdefaults" ]
|
|
94
|
+
source = [ "html_to_markdown" ]
|
|
95
|
+
|
|
96
|
+
[tool.coverage.report]
|
|
97
|
+
exclude_lines = [ 'if TYPE_CHECKING:' ]
|
|
98
|
+
fail_under = 100
|
|
99
|
+
|
|
100
|
+
[tool.mypy]
|
|
101
|
+
packages = [ "html_to_markdown", "tests" ]
|
|
102
|
+
python_version = "3.9"
|
|
103
|
+
implicit_reexport = false
|
|
104
|
+
show_error_codes = true
|
|
105
|
+
strict = true
|
|
106
|
+
|
|
107
|
+
[[tool.mypy.overrides]]
|
|
108
|
+
module = "tests.*"
|
|
109
|
+
disallow_any_generics = false
|
|
110
|
+
disallow_untyped_decorators = false
|
|
111
|
+
|
|
112
|
+
[tool.uv]
|
|
113
|
+
default-groups = [ "dev" ]
|
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import re
|
|
4
|
-
from re import Pattern
|
|
5
|
-
from typing import Final, Literal
|
|
6
|
-
|
|
7
|
-
convert_heading_re: Final[Pattern[str]] = re.compile(r"convert_h(\d+)")
|
|
8
|
-
line_beginning_re: Final[Pattern[str]] = re.compile(r"^", re.MULTILINE)
|
|
9
|
-
whitespace_re: Final[Pattern[str]] = re.compile(r"[\t ]+")
|
|
10
|
-
html_heading_re: Final[Pattern[str]] = re.compile(r"h[1-6]")
|
|
11
|
-
|
|
12
|
-
ASTERISK: Final[Literal["*"]] = "*"
|
|
13
|
-
ATX: Final[Literal["atx"]] = "atx"
|
|
14
|
-
ATX_CLOSED: Final[Literal["atx_closed"]] = "atx_closed"
|
|
15
|
-
BACKSLASH: Final[Literal["backslash"]] = "backslash"
|
|
16
|
-
UNDERLINED: Final[Literal["underlined"]] = "underlined"
|
|
17
|
-
SPACES: Final[Literal["spaces"]] = "spaces"
|
|
18
|
-
UNDERSCORE: Final[Literal["_"]] = "_"
|
|
@@ -1,137 +0,0 @@
|
|
|
1
|
-
[project]
|
|
2
|
-
name = "html-to-markdown"
|
|
3
|
-
version = "1.1.0"
|
|
4
|
-
description = "Convert HTML to markdown"
|
|
5
|
-
authors = [{ name = "Na'aman Hirschfeld", email = "nhirschfeld@gmail.com" }]
|
|
6
|
-
requires-python = ">=3.9"
|
|
7
|
-
readme = "README.md"
|
|
8
|
-
license = { text = "MIT" }
|
|
9
|
-
classifiers = [
|
|
10
|
-
"Intended Audience :: Developers",
|
|
11
|
-
"License :: OSI Approved :: MIT License",
|
|
12
|
-
"Operating System :: OS Independent",
|
|
13
|
-
"Programming Language :: Python :: 3.9",
|
|
14
|
-
"Programming Language :: Python :: 3.10",
|
|
15
|
-
"Programming Language :: Python :: 3.11",
|
|
16
|
-
"Programming Language :: Python :: 3.12",
|
|
17
|
-
"Programming Language :: Python :: 3.13",
|
|
18
|
-
"Topic :: Text Processing :: Markup :: HTML",
|
|
19
|
-
"Topic :: Text Processing :: Markup :: Markdown",
|
|
20
|
-
"Topic :: Text Processing :: Markup",
|
|
21
|
-
"Topic :: Text Processing",
|
|
22
|
-
"Topic :: Utilities",
|
|
23
|
-
"Typing :: Typed",
|
|
24
|
-
]
|
|
25
|
-
keywords = ["markdown", "html", "beautifulsoup", "converter", "text-processing"]
|
|
26
|
-
|
|
27
|
-
dependencies = [
|
|
28
|
-
"beautifulsoup4>=4.12.3",
|
|
29
|
-
]
|
|
30
|
-
|
|
31
|
-
[project.scripts]
|
|
32
|
-
html_to_markdown = "html_to_markdown.__main__:cli"
|
|
33
|
-
|
|
34
|
-
[build-system]
|
|
35
|
-
requires = ["hatchling"]
|
|
36
|
-
build-backend = "hatchling.build"
|
|
37
|
-
|
|
38
|
-
[tool.hatch.build]
|
|
39
|
-
skip-excluded-dirs = true
|
|
40
|
-
|
|
41
|
-
[tool.hatch.build.targets.sdist]
|
|
42
|
-
only-include = ["html_to_markdown"]
|
|
43
|
-
|
|
44
|
-
[tool.hatch.build.targets.wheel]
|
|
45
|
-
only-include = ["html_to_markdown"]
|
|
46
|
-
|
|
47
|
-
[tool.pdm]
|
|
48
|
-
distribution = true
|
|
49
|
-
|
|
50
|
-
[tool.pdm.dev-dependencies]
|
|
51
|
-
dev = [
|
|
52
|
-
"covdefaults>=2.3.0",
|
|
53
|
-
"pre-commit>=3.7.0",
|
|
54
|
-
"pytest>=8.1.1",
|
|
55
|
-
"ruff>=0.4.0",
|
|
56
|
-
"pytest-mock>=3.14.0",
|
|
57
|
-
"pytest-cov>=5.0.0",
|
|
58
|
-
"types-beautifulsoup4>=4.12.0.20240907",
|
|
59
|
-
"mypy>=1.11.2",
|
|
60
|
-
]
|
|
61
|
-
|
|
62
|
-
[tool.pdm.scripts]
|
|
63
|
-
setup = { composite = [
|
|
64
|
-
"pre-commit install --install-hooks",
|
|
65
|
-
"pre-commit install --hook-type commit-msg",
|
|
66
|
-
"pdm install"
|
|
67
|
-
] }
|
|
68
|
-
lint = "pre-commit run --all-files"
|
|
69
|
-
test.cmd = "pytest"
|
|
70
|
-
test.env = { "PYTHONPATH" = "." }
|
|
71
|
-
coverage.cmd = "pytest --disable-warnings --cov --cov-report xml"
|
|
72
|
-
coverage.env = { "PYTHONPATH" = "." }
|
|
73
|
-
|
|
74
|
-
# linters configuration below
|
|
75
|
-
[tool.ruff]
|
|
76
|
-
line-length = 120
|
|
77
|
-
target-version = "py39"
|
|
78
|
-
lint.select = ["ALL"]
|
|
79
|
-
lint.ignore = [
|
|
80
|
-
"ANN401", # dynamically typed ANY
|
|
81
|
-
"D100", # # pydocstyle - missing docstring in public module
|
|
82
|
-
"D104", # pydocstyle - missing docstring in public package
|
|
83
|
-
"D107", # pydocstyle - missing docstring in __init__
|
|
84
|
-
"D205", # pydocstyle - 1 blank line required between summary line and description
|
|
85
|
-
"E501", # pycodestyle line too long, handled by ruff format
|
|
86
|
-
"EM", # Exception messages,
|
|
87
|
-
"FIX", # we allow todo and fixme comments
|
|
88
|
-
"PLR2004", # Magic variables, we allow them
|
|
89
|
-
"TD", # we allow todo and fixme comments
|
|
90
|
-
"TRY", # Try except block, rules are too strict
|
|
91
|
-
"COM812", # Conflicts with formatter
|
|
92
|
-
"ISC001", # Conflicts with formatter
|
|
93
|
-
"FBT", # Boolean Args
|
|
94
|
-
]
|
|
95
|
-
src = ["html_to_markdown", "tests"]
|
|
96
|
-
|
|
97
|
-
[tool.ruff.lint.per-file-ignores]
|
|
98
|
-
"tests/**/*.*" = ["S", "D", "PT006", "PT013", "PD", "ARG"]
|
|
99
|
-
|
|
100
|
-
[tool.ruff.format]
|
|
101
|
-
docstring-code-format = true
|
|
102
|
-
docstring-code-line-length = 120
|
|
103
|
-
|
|
104
|
-
[tool.ruff.lint.pydocstyle]
|
|
105
|
-
convention = "google"
|
|
106
|
-
|
|
107
|
-
[tool.ruff.lint.isort]
|
|
108
|
-
known-first-party = ["html_to_markdown", "tests"]
|
|
109
|
-
|
|
110
|
-
[tool.ruff.lint.pylint]
|
|
111
|
-
max-args = 25
|
|
112
|
-
max-returns = 10
|
|
113
|
-
|
|
114
|
-
[tool.mypy]
|
|
115
|
-
packages = ["html_to_markdown", "tests"]
|
|
116
|
-
python_version = "3.9"
|
|
117
|
-
implicit_reexport = false
|
|
118
|
-
show_error_codes = true
|
|
119
|
-
strict = true
|
|
120
|
-
|
|
121
|
-
[[tool.mypy.overrides]]
|
|
122
|
-
module = "tests.*"
|
|
123
|
-
disallow_any_generics = false
|
|
124
|
-
disallow_untyped_decorators = false
|
|
125
|
-
|
|
126
|
-
[tool.coverage.run]
|
|
127
|
-
omit = ["tests/*"]
|
|
128
|
-
plugins = ["covdefaults"]
|
|
129
|
-
source = ["html_to_markdown"]
|
|
130
|
-
|
|
131
|
-
[tool.coverage.report]
|
|
132
|
-
exclude_lines = ['if TYPE_CHECKING:']
|
|
133
|
-
fail_under = 100
|
|
134
|
-
|
|
135
|
-
[tool.pytest.ini_options]
|
|
136
|
-
asyncio_mode = "auto"
|
|
137
|
-
asyncio_default_fixture_loop_scope = "function"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|