html-to-markdown 1.3.1__tar.gz → 1.3.3__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

Files changed (19) hide show
  1. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/PKG-INFO +1 -1
  2. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/html_to_markdown/__main__.py +4 -1
  3. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/html_to_markdown/processing.py +33 -11
  4. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/html_to_markdown.egg-info/PKG-INFO +1 -1
  5. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/html_to_markdown.egg-info/SOURCES.txt +1 -0
  6. html_to_markdown-1.3.3/html_to_markdown.egg-info/entry_points.txt +2 -0
  7. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/pyproject.toml +2 -3
  8. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/LICENSE +0 -0
  9. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/README.md +0 -0
  10. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/html_to_markdown/__init__.py +0 -0
  11. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/html_to_markdown/cli.py +0 -0
  12. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/html_to_markdown/constants.py +0 -0
  13. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/html_to_markdown/converters.py +0 -0
  14. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/html_to_markdown/py.typed +0 -0
  15. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/html_to_markdown/utils.py +0 -0
  16. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/html_to_markdown.egg-info/dependency_links.txt +0 -0
  17. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/html_to_markdown.egg-info/requires.txt +0 -0
  18. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/html_to_markdown.egg-info/top_level.txt +0 -0
  19. {html_to_markdown-1.3.1 → html_to_markdown-1.3.3}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.3.1
3
+ Version: 1.3.3
4
4
  Summary: Convert HTML to markdown
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  import sys
2
2
 
3
- if __name__ == "__main__":
3
+ def cli():
4
4
  from html_to_markdown.cli import main
5
5
 
6
6
  try:
@@ -9,3 +9,6 @@ if __name__ == "__main__":
9
9
  except ValueError as e:
10
10
  print(str(e), file=sys.stderr) # noqa: T201
11
11
  sys.exit(1)
12
+
13
+ if __name__ == "__main__":
14
+ cli()
@@ -89,6 +89,7 @@ def _process_tag(
89
89
  escape_misc: bool,
90
90
  escape_underscores: bool,
91
91
  strip: set[str] | None,
92
+ context_before: str = "",
92
93
  ) -> str:
93
94
  should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
94
95
  tag_name: SupportedTag | None = (
@@ -129,12 +130,21 @@ def _process_tag(
129
130
  escape_misc=escape_misc,
130
131
  escape_underscores=escape_underscores,
131
132
  strip=strip,
133
+ context_before=(context_before + text)[-2:],
132
134
  )
133
135
 
134
136
  if tag_name and should_convert_tag:
135
- return converters_map[tag_name]( # type: ignore[call-arg]
137
+ rendered = converters_map[tag_name]( # type: ignore[call-arg]
136
138
  tag=tag, text=text, convert_as_inline=convert_as_inline
137
139
  )
140
+ # For headings, ensure two newlines before if not already present
141
+ # Edge case where the document starts with a \n and then a heading
142
+ if is_heading and context_before not in {"", "\n"}:
143
+ n_eol_to_add = 2 - (len(context_before) - len(context_before.rstrip("\n")))
144
+ if n_eol_to_add > 0:
145
+ prefix = "\n" * n_eol_to_add
146
+ return f"{prefix}{rendered}"
147
+ return rendered
138
148
 
139
149
  return text
140
150
 
@@ -275,13 +285,25 @@ def convert_to_markdown(
275
285
  if custom_converters:
276
286
  converters_map.update(cast("ConvertersMap", custom_converters))
277
287
 
278
- return _process_tag(
279
- source,
280
- converters_map,
281
- convert=_as_optional_set(convert),
282
- convert_as_inline=convert_as_inline,
283
- escape_asterisks=escape_asterisks,
284
- escape_misc=escape_misc,
285
- escape_underscores=escape_underscores,
286
- strip=_as_optional_set(strip),
287
- )
288
+ text = ""
289
+ for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), source.children):
290
+ if isinstance(el, NavigableString):
291
+ text += _process_text(
292
+ el=el,
293
+ escape_misc=escape_misc,
294
+ escape_asterisks=escape_asterisks,
295
+ escape_underscores=escape_underscores,
296
+ )
297
+ elif isinstance(el, Tag):
298
+ text += _process_tag(
299
+ el,
300
+ converters_map,
301
+ convert_as_inline=convert_as_inline,
302
+ convert=_as_optional_set(convert),
303
+ escape_asterisks=escape_asterisks,
304
+ escape_misc=escape_misc,
305
+ escape_underscores=escape_underscores,
306
+ strip=_as_optional_set(strip),
307
+ context_before=text[-2:],
308
+ )
309
+ return text
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.3.1
3
+ Version: 1.3.3
4
4
  Summary: Convert HTML to markdown
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -12,5 +12,6 @@ html_to_markdown/utils.py
12
12
  html_to_markdown.egg-info/PKG-INFO
13
13
  html_to_markdown.egg-info/SOURCES.txt
14
14
  html_to_markdown.egg-info/dependency_links.txt
15
+ html_to_markdown.egg-info/entry_points.txt
15
16
  html_to_markdown.egg-info/requires.txt
16
17
  html_to_markdown.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ html_to_markdown = html_to_markdown.__main__:cli
@@ -5,7 +5,7 @@ requires = [ "setuptools>=78.1" ]
5
5
 
6
6
  [project]
7
7
  name = "html-to-markdown"
8
- version = "1.3.1"
8
+ version = "1.3.3"
9
9
  description = "Convert HTML to markdown"
10
10
  readme = "README.md"
11
11
  keywords = [ "converter", "html", "markdown", "text-extraction", "text-processing" ]
@@ -29,6 +29,7 @@ classifiers = [
29
29
  "Topic :: Utilities",
30
30
  "Typing :: Typed",
31
31
  ]
32
+ scripts.html_to_markdown = "html_to_markdown.__main__:cli"
32
33
 
33
34
  dependencies = [
34
35
  "beautifulsoup4>=4.13.4",
@@ -58,8 +59,6 @@ html_to_markdown = [ "py.typed" ]
58
59
  [tool.hatch.build]
59
60
  skip-excluded-dirs = true
60
61
 
61
- scripts.html_to_markdown = "html_to_markdown.__main__:cli"
62
-
63
62
  [tool.ruff]
64
63
  target-version = "py39"
65
64
  line-length = 120