html-to-markdown 1.1.0__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -1,14 +1,15 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.1.0
3
+ Version: 1.2.0
4
4
  Summary: Convert HTML to markdown
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
7
7
  License-File: LICENSE
8
- Keywords: beautifulsoup,converter,html,markdown,text-processing
8
+ Keywords: converter,html,markdown,text-extraction,text-processing
9
9
  Classifier: Intended Audience :: Developers
10
10
  Classifier: License :: OSI Approved :: MIT License
11
11
  Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3 :: Only
12
13
  Classifier: Programming Language :: Python :: 3.9
13
14
  Classifier: Programming Language :: Python :: 3.10
14
15
  Classifier: Programming Language :: Python :: 3.11
@@ -69,20 +70,20 @@ convert_to_markdown(soup) # > '**Yay** [GitHub](http://github.com)'
69
70
  The `convert_to_markdown` function accepts the following kwargs:
70
71
 
71
72
  - autolinks (bool): Automatically convert valid URLs into Markdown links. Defaults to True.
72
- - bullets (str): A string of characters to use for bullet points in lists. Defaults to '*+-'.
73
+ - bullets (str): A string of characters to use for bullet points in lists. Defaults to '\*+-'.
73
74
  - code_language (str): Default language identifier for fenced code blocks. Defaults to an empty string.
74
75
  - code_language_callback (Callable[[Any], str] | None): Function to dynamically determine the language for code blocks.
75
76
  - convert (Iterable[str] | None): A list of tag names to convert to Markdown. If None, all supported tags are converted.
76
77
  - default_title (bool): Use the default title when converting certain elements (e.g., links). Defaults to False.
77
- - escape_asterisks (bool): Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
78
+ - escape_asterisks (bool): Escape asterisks (\*) to prevent unintended Markdown formatting. Defaults to True.
78
79
  - escape_misc (bool): Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
79
- - escape_underscores (bool): Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
80
+ - escape_underscores (bool): Escape underscores (\_) to prevent unintended italic formatting. Defaults to True.
80
81
  - heading_style (Literal["underlined", "atx", "atx_closed"]): The style to use for Markdown headings. Defaults to "
81
82
  underlined".
82
83
  - keep_inline_images_in (Iterable[str] | None): Tags in which inline images should be preserved. Defaults to None.
83
84
  - newline_style (Literal["spaces", "backslash"]): Style for handling newlines in text content. Defaults to "spaces".
84
85
  - strip (Iterable[str] | None): Tags to strip from the output. Defaults to None.
85
- - strong_em_symbol (Literal["*", "_"]): Symbol to use for strong/emphasized text. Defaults to "*".
86
+ - strong_em_symbol (Literal["\*", "\_"]): Symbol to use for strong/emphasized text. Defaults to "\*".
86
87
  - sub_symbol (str): Custom symbol for subscript text. Defaults to an empty string.
87
88
  - sup_symbol (str): Custom symbol for superscript text. Defaults to an empty string.
88
89
  - wrap (bool): Wrap text to the specified width. Defaults to False.
@@ -43,20 +43,20 @@ convert_to_markdown(soup) # > '**Yay** [GitHub](http://github.com)'
43
43
  The `convert_to_markdown` function accepts the following kwargs:
44
44
 
45
45
  - autolinks (bool): Automatically convert valid URLs into Markdown links. Defaults to True.
46
- - bullets (str): A string of characters to use for bullet points in lists. Defaults to '*+-'.
46
+ - bullets (str): A string of characters to use for bullet points in lists. Defaults to '\*+-'.
47
47
  - code_language (str): Default language identifier for fenced code blocks. Defaults to an empty string.
48
48
  - code_language_callback (Callable[[Any], str] | None): Function to dynamically determine the language for code blocks.
49
49
  - convert (Iterable[str] | None): A list of tag names to convert to Markdown. If None, all supported tags are converted.
50
50
  - default_title (bool): Use the default title when converting certain elements (e.g., links). Defaults to False.
51
- - escape_asterisks (bool): Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
51
+ - escape_asterisks (bool): Escape asterisks (\*) to prevent unintended Markdown formatting. Defaults to True.
52
52
  - escape_misc (bool): Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
53
- - escape_underscores (bool): Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
53
+ - escape_underscores (bool): Escape underscores (\_) to prevent unintended italic formatting. Defaults to True.
54
54
  - heading_style (Literal["underlined", "atx", "atx_closed"]): The style to use for Markdown headings. Defaults to "
55
55
  underlined".
56
56
  - keep_inline_images_in (Iterable[str] | None): Tags in which inline images should be preserved. Defaults to None.
57
57
  - newline_style (Literal["spaces", "backslash"]): Style for handling newlines in text content. Defaults to "spaces".
58
58
  - strip (Iterable[str] | None): Tags to strip from the output. Defaults to None.
59
- - strong_em_symbol (Literal["*", "_"]): Symbol to use for strong/emphasized text. Defaults to "*".
59
+ - strong_em_symbol (Literal["\*", "\_"]): Symbol to use for strong/emphasized text. Defaults to "\*".
60
60
  - sub_symbol (str): Custom symbol for subscript text. Defaults to an empty string.
61
61
  - sup_symbol (str): Custom symbol for superscript text. Defaults to an empty string.
62
62
  - wrap (bool): Wrap text to the specified width. Defaults to False.
@@ -0,0 +1,5 @@
1
+ from html_to_markdown.processing import convert_to_markdown
2
+
3
+ from .legacy import Markdownify
4
+
5
+ __all__ = ["Markdownify", "convert_to_markdown"]
@@ -0,0 +1,11 @@
1
+ import sys
2
+
3
+ if __name__ == "__main__":
4
+ from html_to_markdown.cli import main
5
+
6
+ try:
7
+ result = main(sys.argv[1:])
8
+ print(result) # noqa: T201
9
+ except ValueError as e:
10
+ print(str(e), file=sys.stderr) # noqa: T201
11
+ sys.exit(1)
@@ -0,0 +1,18 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from re import Pattern
5
+ from typing import Final
6
+
7
+ convert_heading_re: Final[Pattern[str]] = re.compile(r"convert_h(\d+)")
8
+ line_beginning_re: Final[Pattern[str]] = re.compile(r"^", re.MULTILINE)
9
+ whitespace_re: Final[Pattern[str]] = re.compile(r"[\t ]+")
10
+ html_heading_re: Final[Pattern[str]] = re.compile(r"h[1-6]")
11
+
12
+ ASTERISK: Final = "*"
13
+ ATX: Final = "atx"
14
+ ATX_CLOSED: Final = "atx_closed"
15
+ BACKSLASH: Final = "backslash"
16
+ UNDERLINED: Final = "underlined"
17
+ SPACES: Final = "spaces"
18
+ UNDERSCORE: Final = "_"
@@ -55,17 +55,19 @@ SupportedElements = Literal[
55
55
  "kbd",
56
56
  ]
57
57
 
58
- ConverterssMap = Mapping[SupportedElements, Callable[[str, Tag], str]]
58
+ ConvertersMap = Mapping[SupportedElements, Callable[[str, Tag], str]]
59
59
 
60
60
  T = TypeVar("T")
61
61
 
62
62
 
63
63
  def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
64
- """This abstracts all simple inline tags like b, em, del, ...
65
- Returns a function that wraps the chomped text in a pair of the string
66
- that is returned by markup_fn, with '/' inserted in the string used after
67
- the text if it looks like an HTML tag. markup_fn is necessary to allow for
68
- references to self.strong_em_symbol etc.
64
+ """Create an inline converter for a markup pattern or tag.
65
+
66
+ Args:
67
+ markup_prefix: The markup prefix to insert.
68
+
69
+ Returns:
70
+ A function that can be used to convert HTML to Markdown.
69
71
  """
70
72
 
71
73
  def implementation(*, tag: Tag, text: str) -> str:
@@ -295,7 +297,7 @@ def create_converters_map(
295
297
  sup_symbol: str,
296
298
  wrap: bool,
297
299
  wrap_width: int,
298
- ) -> ConverterssMap:
300
+ ) -> ConvertersMap:
299
301
  """Create a mapping of HTML elements to their corresponding conversion functions.
300
302
 
301
303
  Args:
@@ -0,0 +1,89 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Literal
4
+
5
+ from html_to_markdown.constants import ASTERISK, SPACES, UNDERLINED
6
+ from html_to_markdown.converters import create_converters_map
7
+
8
+ if TYPE_CHECKING:
9
+ from collections.abc import Callable, Iterable
10
+
11
+ from bs4 import Tag
12
+
13
+
14
+ def _create_legacy_class(
15
+ autolinks: bool,
16
+ bullets: str,
17
+ code_language: str,
18
+ code_language_callback: Callable[[Tag], str] | None,
19
+ default_title: bool,
20
+ heading_style: Literal["atx", "atx_closed", "underlined"],
21
+ keep_inline_images_in: Iterable[str] | None,
22
+ newline_style: str,
23
+ strong_em_symbol: str,
24
+ sub_symbol: str,
25
+ sup_symbol: str,
26
+ wrap: bool,
27
+ wrap_width: int,
28
+ ) -> type:
29
+ """Create a legacy class for Markdownify.
30
+
31
+ Deprecated: Use the new hooks api instead.
32
+
33
+ Args:
34
+ autolinks: Whether to convert URLs into links.
35
+ bullets: The bullet characters to use for unordered lists.
36
+ code_language: The default code language to use.
37
+ code_language_callback: A callback to get the code language.
38
+ default_title: Whether to use the URL as the title for links.
39
+ heading_style: The style of headings.
40
+ keep_inline_images_in: The tags to keep inline images in.
41
+ newline_style: The style of newlines.
42
+ strong_em_symbol: The symbol to use for strong and emphasis text.
43
+ sub_symbol: The symbol to use for subscript text.
44
+ sup_symbol: The symbol to use for superscript text.
45
+ wrap: Whether to wrap text.
46
+ wrap_width: The width to wrap text at.
47
+
48
+ Returns:
49
+ A class that can be used to convert HTML to Markdown.
50
+ """
51
+ return type(
52
+ "Markdownify",
53
+ (),
54
+ {
55
+ k.removeprefix("_"): v
56
+ for k, v in create_converters_map(
57
+ autolinks=autolinks,
58
+ bullets=bullets,
59
+ code_language=code_language,
60
+ code_language_callback=code_language_callback,
61
+ default_title=default_title,
62
+ heading_style=heading_style,
63
+ keep_inline_images_in=keep_inline_images_in,
64
+ newline_style=newline_style,
65
+ strong_em_symbol=strong_em_symbol,
66
+ sub_symbol=sub_symbol,
67
+ sup_symbol=sup_symbol,
68
+ wrap=wrap,
69
+ wrap_width=wrap_width,
70
+ ).items()
71
+ },
72
+ )
73
+
74
+
75
+ Markdownify = _create_legacy_class(
76
+ autolinks=True,
77
+ bullets="*+-",
78
+ code_language="",
79
+ code_language_callback=None,
80
+ default_title=False,
81
+ heading_style=UNDERLINED,
82
+ keep_inline_images_in=None,
83
+ newline_style=SPACES,
84
+ strong_em_symbol=ASTERISK,
85
+ sub_symbol="",
86
+ sup_symbol="",
87
+ wrap=False,
88
+ wrap_width=80,
89
+ )
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from itertools import chain
3
4
  from typing import TYPE_CHECKING, Any, Callable, Literal, cast
4
5
 
5
6
  from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
@@ -11,7 +12,7 @@ from html_to_markdown.constants import (
11
12
  html_heading_re,
12
13
  whitespace_re,
13
14
  )
14
- from html_to_markdown.converters import ConverterssMap, create_converters_map
15
+ from html_to_markdown.converters import ConvertersMap, create_converters_map
15
16
  from html_to_markdown.utils import escape
16
17
 
17
18
  if TYPE_CHECKING:
@@ -76,18 +77,21 @@ def _is_nested_tag(el: PageElement) -> bool:
76
77
 
77
78
  def _process_tag(
78
79
  tag: Tag,
79
- converters_map: ConverterssMap,
80
+ converters_map: ConvertersMap,
80
81
  *,
81
- convert: Iterable[str] | None,
82
+ convert: set[str] | None,
82
83
  convert_as_inline: bool = False,
83
84
  escape_asterisks: bool,
84
85
  escape_misc: bool,
85
86
  escape_underscores: bool,
86
- strip: Iterable[str] | None,
87
+ strip: set[str] | None,
87
88
  ) -> str:
89
+ should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
90
+ tag_name: SupportedTag | None = cast(SupportedTag, tag.name.lower()) if tag.name.lower() in converters_map else None
88
91
  text = ""
92
+
89
93
  is_heading = html_heading_re.match(tag.name) is not None
90
- is_cell = tag.name in {"td", "th"}
94
+ is_cell = tag_name in {"td", "th"}
91
95
  convert_children_as_inline = convert_as_inline or is_heading or is_cell
92
96
 
93
97
  if _is_nested_tag(tag):
@@ -121,9 +125,7 @@ def _process_tag(
121
125
  strip=strip,
122
126
  )
123
127
 
124
- tag_name: SupportedTag | None = cast(SupportedTag, tag.name.lower()) if tag.name.lower() in converters_map else None
125
-
126
- if tag_name and _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert):
128
+ if tag_name and should_convert_tag:
127
129
  return converters_map[tag_name]( # type: ignore[call-arg]
128
130
  tag=tag, text=text, convert_as_inline=convert_as_inline
129
131
  )
@@ -166,7 +168,7 @@ def _process_text(
166
168
  return text
167
169
 
168
170
 
169
- def _should_convert_tag(*, tag_name: str, strip: Iterable[str] | None, convert: Iterable[str] | None) -> bool:
171
+ def _should_convert_tag(*, tag_name: str, strip: set[str] | None, convert: set[str] | None) -> bool:
170
172
  if strip is not None:
171
173
  return tag_name not in strip
172
174
  if convert is not None:
@@ -174,6 +176,14 @@ def _should_convert_tag(*, tag_name: str, strip: Iterable[str] | None, convert:
174
176
  return True
175
177
 
176
178
 
179
+ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
180
+ if value is None:
181
+ return None
182
+ if isinstance(value, str):
183
+ return set(value.split(","))
184
+ return {*chain(*[v.split(",") for v in value])}
185
+
186
+
177
187
  def convert_to_markdown(
178
188
  source: str | BeautifulSoup,
179
189
  *,
@@ -181,7 +191,7 @@ def convert_to_markdown(
181
191
  bullets: str = "*+-",
182
192
  code_language: str = "",
183
193
  code_language_callback: Callable[[Any], str] | None = None,
184
- convert: Iterable[str] | None = None,
194
+ convert: str | Iterable[str] | None = None,
185
195
  default_title: bool = False,
186
196
  escape_asterisks: bool = True,
187
197
  escape_misc: bool = True,
@@ -189,7 +199,7 @@ def convert_to_markdown(
189
199
  heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
190
200
  keep_inline_images_in: Iterable[str] | None = None,
191
201
  newline_style: Literal["spaces", "backslash"] = SPACES,
192
- strip: Iterable[str] | None = None,
202
+ strip: str | Iterable[str] | None = None,
193
203
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
194
204
  sub_symbol: str = "",
195
205
  sup_symbol: str = "",
@@ -221,13 +231,22 @@ def convert_to_markdown(
221
231
  wrap_width: The number of characters at which to wrap text. Defaults to 80.
222
232
  convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
223
233
 
234
+ Raises:
235
+ ValueError: If both 'strip' and 'convert' are specified, or when the input HTML is empty.
236
+
224
237
  Returns:
225
238
  str: A string of Markdown-formatted text converted from the given HTML.
226
239
  """
227
240
  if isinstance(source, str):
228
241
  from bs4 import BeautifulSoup
229
242
 
230
- source = BeautifulSoup(source, "html.parser")
243
+ if "".join(source.split("\n")):
244
+ source = BeautifulSoup(source, "html.parser")
245
+ else:
246
+ raise ValueError("The input HTML is empty.")
247
+
248
+ if strip is not None and convert is not None:
249
+ raise ValueError("Only one of 'strip' and 'convert' can be specified.")
231
250
 
232
251
  converters_map = create_converters_map(
233
252
  autolinks=autolinks,
@@ -248,10 +267,10 @@ def convert_to_markdown(
248
267
  return _process_tag(
249
268
  source,
250
269
  converters_map,
251
- convert=convert,
270
+ convert=_as_optional_set(convert),
252
271
  convert_as_inline=convert_as_inline,
253
272
  escape_asterisks=escape_asterisks,
254
273
  escape_misc=escape_misc,
255
274
  escape_underscores=escape_underscores,
256
- strip=strip,
275
+ strip=_as_optional_set(strip),
257
276
  )
@@ -0,0 +1,113 @@
1
+ [build-system]
2
+ build-backend = "hatchling.build"
3
+
4
+ requires = [ "hatchling" ]
5
+
6
+ [project]
7
+ name = "html-to-markdown"
8
+ version = "1.2.0"
9
+ description = "Convert HTML to markdown"
10
+ readme = "README.md"
11
+ keywords = [ "converter", "html", "markdown", "text-extraction", "text-processing" ]
12
+
13
+ license = { text = "MIT" }
14
+ authors = [ { name = "Na'aman Hirschfeld", email = "nhirschfeld@gmail.com" } ]
15
+ requires-python = ">=3.9"
16
+ classifiers = [
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Operating System :: OS Independent",
20
+ "Programming Language :: Python :: 3 :: Only",
21
+ "Programming Language :: Python :: 3.9",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ "Programming Language :: Python :: 3.13",
26
+ "Topic :: Text Processing",
27
+ "Topic :: Text Processing :: Markup",
28
+ "Topic :: Text Processing :: Markup :: HTML",
29
+ "Topic :: Text Processing :: Markup :: Markdown",
30
+ "Topic :: Utilities",
31
+ "Typing :: Typed",
32
+ ]
33
+ dependencies = [
34
+ "beautifulsoup4>=4.12.3",
35
+ ]
36
+
37
+ [dependency-groups]
38
+ dev = [
39
+ "covdefaults>=2.3",
40
+ "mypy>=1.14.1",
41
+ "pre-commit>=4.1",
42
+ "pytest>=8.3.4",
43
+ "pytest-cov>=6",
44
+ "pytest-mock>=3.14",
45
+ "ruff>=0.9.3",
46
+ "types-beautifulsoup4>=4.12.0.20241020",
47
+ ]
48
+
49
+ [tool.hatch.build]
50
+ skip-excluded-dirs = true
51
+
52
+ [tool.hatch.build.targets.sdist]
53
+ only-include = [ "html_to_markdown" ]
54
+
55
+ [tool.hatch.build.targets.wheel]
56
+ only-include = [ "html_to_markdown" ]
57
+
58
+ [tool.ruff]
59
+ target-version = "py39"
60
+ line-length = 120
61
+ src = [ "html_to_markdown", "tests" ]
62
+
63
+ format.docstring-code-line-length = 120
64
+ format.docstring-code-format = true
65
+ lint.select = [ "ALL" ]
66
+ lint.ignore = [
67
+ "ANN401", # dynamically typed ANY
68
+ "COM812", # Conflicts with formatter
69
+ "D100", # pydocstyle - missing docstring in public module
70
+ "D104", # pydocstyle - missing docstring in public package
71
+ "D107", # pydocstyle - missing docstring in __init__
72
+ "D205", # pydocstyle - 1 blank line required between summary line and description
73
+ "E501", # pycodestyle line too long, handled by ruff format
74
+ "EM", # Exception messages,
75
+ "FBT", # Boolean Args
76
+ "FIX", # we allow todo and fixme comments
77
+ "ISC001", # Conflicts with formatter
78
+ "PLR0913", # Pylint - too many arguments.
79
+ "PLR2004", # Magic variables, we allow them
80
+ "TD", # we allow todo and fixme comments
81
+ "TRY", # Try except block, rules are too strict
82
+ ]
83
+ lint.per-file-ignores."tests/**/*.*" = [ "ARG", "D", "PD", "PT006", "PT013", "S" ]
84
+ lint.isort.known-first-party = [ "html_to_markdown", "tests" ]
85
+ lint.pydocstyle.convention = "google"
86
+
87
+ [tool.pytest.ini_options]
88
+ asyncio_mode = "auto"
89
+ asyncio_default_fixture_loop_scope = "function"
90
+
91
+ [tool.coverage.run]
92
+ omit = [ "tests/*" ]
93
+ plugins = [ "covdefaults" ]
94
+ source = [ "html_to_markdown" ]
95
+
96
+ [tool.coverage.report]
97
+ exclude_lines = [ 'if TYPE_CHECKING:' ]
98
+ fail_under = 100
99
+
100
+ [tool.mypy]
101
+ packages = [ "html_to_markdown", "tests" ]
102
+ python_version = "3.9"
103
+ implicit_reexport = false
104
+ show_error_codes = true
105
+ strict = true
106
+
107
+ [[tool.mypy.overrides]]
108
+ module = "tests.*"
109
+ disallow_any_generics = false
110
+ disallow_untyped_decorators = false
111
+
112
+ [tool.uv]
113
+ default-groups = [ "dev" ]
@@ -1,3 +0,0 @@
1
- from html_to_markdown.processing import convert_to_markdown
2
-
3
- __all__ = ["convert_to_markdown"]
@@ -1,7 +0,0 @@
1
- import sys
2
-
3
- from html_to_markdown.dli import cli
4
-
5
- if __name__ == "__main__":
6
- result = cli(sys.argv[1:])
7
- print(result) # noqa: T201
@@ -1,18 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import re
4
- from re import Pattern
5
- from typing import Final, Literal
6
-
7
- convert_heading_re: Final[Pattern[str]] = re.compile(r"convert_h(\d+)")
8
- line_beginning_re: Final[Pattern[str]] = re.compile(r"^", re.MULTILINE)
9
- whitespace_re: Final[Pattern[str]] = re.compile(r"[\t ]+")
10
- html_heading_re: Final[Pattern[str]] = re.compile(r"h[1-6]")
11
-
12
- ASTERISK: Final[Literal["*"]] = "*"
13
- ATX: Final[Literal["atx"]] = "atx"
14
- ATX_CLOSED: Final[Literal["atx_closed"]] = "atx_closed"
15
- BACKSLASH: Final[Literal["backslash"]] = "backslash"
16
- UNDERLINED: Final[Literal["underlined"]] = "underlined"
17
- SPACES: Final[Literal["spaces"]] = "spaces"
18
- UNDERSCORE: Final[Literal["_"]] = "_"
@@ -1,137 +0,0 @@
1
- [project]
2
- name = "html-to-markdown"
3
- version = "1.1.0"
4
- description = "Convert HTML to markdown"
5
- authors = [{ name = "Na'aman Hirschfeld", email = "nhirschfeld@gmail.com" }]
6
- requires-python = ">=3.9"
7
- readme = "README.md"
8
- license = { text = "MIT" }
9
- classifiers = [
10
- "Intended Audience :: Developers",
11
- "License :: OSI Approved :: MIT License",
12
- "Operating System :: OS Independent",
13
- "Programming Language :: Python :: 3.9",
14
- "Programming Language :: Python :: 3.10",
15
- "Programming Language :: Python :: 3.11",
16
- "Programming Language :: Python :: 3.12",
17
- "Programming Language :: Python :: 3.13",
18
- "Topic :: Text Processing :: Markup :: HTML",
19
- "Topic :: Text Processing :: Markup :: Markdown",
20
- "Topic :: Text Processing :: Markup",
21
- "Topic :: Text Processing",
22
- "Topic :: Utilities",
23
- "Typing :: Typed",
24
- ]
25
- keywords = ["markdown", "html", "beautifulsoup", "converter", "text-processing"]
26
-
27
- dependencies = [
28
- "beautifulsoup4>=4.12.3",
29
- ]
30
-
31
- [project.scripts]
32
- html_to_markdown = "html_to_markdown.__main__:cli"
33
-
34
- [build-system]
35
- requires = ["hatchling"]
36
- build-backend = "hatchling.build"
37
-
38
- [tool.hatch.build]
39
- skip-excluded-dirs = true
40
-
41
- [tool.hatch.build.targets.sdist]
42
- only-include = ["html_to_markdown"]
43
-
44
- [tool.hatch.build.targets.wheel]
45
- only-include = ["html_to_markdown"]
46
-
47
- [tool.pdm]
48
- distribution = true
49
-
50
- [tool.pdm.dev-dependencies]
51
- dev = [
52
- "covdefaults>=2.3.0",
53
- "pre-commit>=3.7.0",
54
- "pytest>=8.1.1",
55
- "ruff>=0.4.0",
56
- "pytest-mock>=3.14.0",
57
- "pytest-cov>=5.0.0",
58
- "types-beautifulsoup4>=4.12.0.20240907",
59
- "mypy>=1.11.2",
60
- ]
61
-
62
- [tool.pdm.scripts]
63
- setup = { composite = [
64
- "pre-commit install --install-hooks",
65
- "pre-commit install --hook-type commit-msg",
66
- "pdm install"
67
- ] }
68
- lint = "pre-commit run --all-files"
69
- test.cmd = "pytest"
70
- test.env = { "PYTHONPATH" = "." }
71
- coverage.cmd = "pytest --disable-warnings --cov --cov-report xml"
72
- coverage.env = { "PYTHONPATH" = "." }
73
-
74
- # linters configuration below
75
- [tool.ruff]
76
- line-length = 120
77
- target-version = "py39"
78
- lint.select = ["ALL"]
79
- lint.ignore = [
80
- "ANN401", # dynamically typed ANY
81
- "D100", # # pydocstyle - missing docstring in public module
82
- "D104", # pydocstyle - missing docstring in public package
83
- "D107", # pydocstyle - missing docstring in __init__
84
- "D205", # pydocstyle - 1 blank line required between summary line and description
85
- "E501", # pycodestyle line too long, handled by ruff format
86
- "EM", # Exception messages,
87
- "FIX", # we allow todo and fixme comments
88
- "PLR2004", # Magic variables, we allow them
89
- "TD", # we allow todo and fixme comments
90
- "TRY", # Try except block, rules are too strict
91
- "COM812", # Conflicts with formatter
92
- "ISC001", # Conflicts with formatter
93
- "FBT", # Boolean Args
94
- ]
95
- src = ["html_to_markdown", "tests"]
96
-
97
- [tool.ruff.lint.per-file-ignores]
98
- "tests/**/*.*" = ["S", "D", "PT006", "PT013", "PD", "ARG"]
99
-
100
- [tool.ruff.format]
101
- docstring-code-format = true
102
- docstring-code-line-length = 120
103
-
104
- [tool.ruff.lint.pydocstyle]
105
- convention = "google"
106
-
107
- [tool.ruff.lint.isort]
108
- known-first-party = ["html_to_markdown", "tests"]
109
-
110
- [tool.ruff.lint.pylint]
111
- max-args = 25
112
- max-returns = 10
113
-
114
- [tool.mypy]
115
- packages = ["html_to_markdown", "tests"]
116
- python_version = "3.9"
117
- implicit_reexport = false
118
- show_error_codes = true
119
- strict = true
120
-
121
- [[tool.mypy.overrides]]
122
- module = "tests.*"
123
- disallow_any_generics = false
124
- disallow_untyped_decorators = false
125
-
126
- [tool.coverage.run]
127
- omit = ["tests/*"]
128
- plugins = ["covdefaults"]
129
- source = ["html_to_markdown"]
130
-
131
- [tool.coverage.report]
132
- exclude_lines = ['if TYPE_CHECKING:']
133
- fail_under = 100
134
-
135
- [tool.pytest.ini_options]
136
- asyncio_mode = "auto"
137
- asyncio_default_fixture_loop_scope = "function"