html-to-markdown 2.14.0__cp310-abi3-macosx_11_0_arm64.whl → 2.14.5__cp310-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- html_to_markdown/__init__.py +3 -7
- html_to_markdown/_html_to_markdown.abi3.so +0 -0
- html_to_markdown/_html_to_markdown.pyi +6 -6
- html_to_markdown/api.py +45 -46
- html_to_markdown/bin/html-to-markdown +0 -0
- html_to_markdown/v1_compat.py +0 -1
- {html_to_markdown-2.14.0.data → html_to_markdown-2.14.5.data}/scripts/html-to-markdown +0 -0
- {html_to_markdown-2.14.0.dist-info → html_to_markdown-2.14.5.dist-info}/METADATA +2 -2
- html_to_markdown-2.14.5.dist-info/RECORD +17 -0
- html_to_markdown-2.14.0.dist-info/RECORD +0 -17
- {html_to_markdown-2.14.0.dist-info → html_to_markdown-2.14.5.dist-info}/WHEEL +0 -0
- {html_to_markdown-2.14.0.dist-info → html_to_markdown-2.14.5.dist-info}/licenses/LICENSE +0 -0
html_to_markdown/__init__.py
CHANGED
|
@@ -15,16 +15,16 @@ V1 API (backward compatibility):
|
|
|
15
15
|
markdown = convert_to_markdown(html, heading_style="atx")
|
|
16
16
|
"""
|
|
17
17
|
|
|
18
|
-
import contextlib
|
|
19
|
-
|
|
20
18
|
from html_to_markdown.api import (
|
|
21
19
|
InlineImage,
|
|
22
20
|
InlineImageConfig,
|
|
23
21
|
InlineImageWarning,
|
|
22
|
+
MetadataConfig,
|
|
24
23
|
OptionsHandle,
|
|
25
24
|
convert,
|
|
26
25
|
convert_with_handle,
|
|
27
26
|
convert_with_inline_images,
|
|
27
|
+
convert_with_metadata,
|
|
28
28
|
create_options_handle,
|
|
29
29
|
)
|
|
30
30
|
from html_to_markdown.exceptions import (
|
|
@@ -37,10 +37,6 @@ from html_to_markdown.exceptions import (
|
|
|
37
37
|
from html_to_markdown.options import ConversionOptions, PreprocessingOptions
|
|
38
38
|
from html_to_markdown.v1_compat import convert_to_markdown, markdownify
|
|
39
39
|
|
|
40
|
-
# Optional metadata support
|
|
41
|
-
with contextlib.suppress(ImportError):
|
|
42
|
-
from html_to_markdown.api import MetadataConfig, convert_with_metadata
|
|
43
|
-
|
|
44
40
|
__all__ = [
|
|
45
41
|
"ConflictingOptionsError",
|
|
46
42
|
"ConversionOptions",
|
|
@@ -63,4 +59,4 @@ __all__ = [
|
|
|
63
59
|
"markdownify",
|
|
64
60
|
]
|
|
65
61
|
|
|
66
|
-
__version__ = "2.14.0"
|
|
62
|
+
__version__ = "2.14.5"
|
|
Binary file
|
|
@@ -141,13 +141,13 @@ class DocumentMetadata(TypedDict):
|
|
|
141
141
|
canonical_url: str | None
|
|
142
142
|
base_href: str | None
|
|
143
143
|
language: str | None
|
|
144
|
-
text_direction: str | None
|
|
144
|
+
text_direction: str | None
|
|
145
145
|
open_graph: dict[str, str]
|
|
146
146
|
twitter_card: dict[str, str]
|
|
147
147
|
meta_tags: dict[str, str]
|
|
148
148
|
|
|
149
149
|
class HeaderMetadata(TypedDict):
|
|
150
|
-
level: int
|
|
150
|
+
level: int
|
|
151
151
|
text: str
|
|
152
152
|
id: str | None
|
|
153
153
|
depth: int
|
|
@@ -157,7 +157,7 @@ class LinkMetadata(TypedDict):
|
|
|
157
157
|
href: str
|
|
158
158
|
text: str
|
|
159
159
|
title: str | None
|
|
160
|
-
link_type: str
|
|
160
|
+
link_type: str
|
|
161
161
|
rel: list[str]
|
|
162
162
|
attributes: dict[str, str]
|
|
163
163
|
|
|
@@ -165,12 +165,12 @@ class ImageMetadata(TypedDict):
|
|
|
165
165
|
src: str
|
|
166
166
|
alt: str | None
|
|
167
167
|
title: str | None
|
|
168
|
-
dimensions: tuple[int, int] | None
|
|
169
|
-
image_type: str
|
|
168
|
+
dimensions: tuple[int, int] | None
|
|
169
|
+
image_type: str
|
|
170
170
|
attributes: dict[str, str]
|
|
171
171
|
|
|
172
172
|
class StructuredData(TypedDict):
|
|
173
|
-
data_type: str
|
|
173
|
+
data_type: str
|
|
174
174
|
raw_json: str
|
|
175
175
|
schema_type: str | None
|
|
176
176
|
|
html_to_markdown/api.py
CHANGED
|
@@ -8,20 +8,16 @@ import html_to_markdown._html_to_markdown as _rust
|
|
|
8
8
|
from html_to_markdown._html_to_markdown import (
|
|
9
9
|
ConversionOptionsHandle as OptionsHandle,
|
|
10
10
|
)
|
|
11
|
-
from html_to_markdown._html_to_markdown import InlineImageConfig
|
|
11
|
+
from html_to_markdown._html_to_markdown import (
|
|
12
|
+
InlineImageConfig,
|
|
13
|
+
MetadataConfig,
|
|
14
|
+
)
|
|
12
15
|
from html_to_markdown.options import ConversionOptions, PreprocessingOptions
|
|
13
16
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
_HAS_METADATA = True
|
|
19
|
-
except ImportError:
|
|
20
|
-
MetadataConfig = None # type: ignore[misc,assignment]
|
|
21
|
-
if TYPE_CHECKING:
|
|
22
|
-
from html_to_markdown._html_to_markdown import ExtendedMetadata # pragma: no cover
|
|
23
|
-
else:
|
|
24
|
-
ExtendedMetadata = dict[str, object] # type: ignore[assignment]
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from html_to_markdown._html_to_markdown import ExtendedMetadata # pragma: no cover
|
|
19
|
+
else:
|
|
20
|
+
ExtendedMetadata = dict[str, object] # type: ignore[assignment]
|
|
25
21
|
|
|
26
22
|
|
|
27
23
|
class InlineImage(TypedDict):
|
|
@@ -146,40 +142,43 @@ def convert_with_handle(html: str, handle: OptionsHandle) -> str:
|
|
|
146
142
|
return _rust.convert_with_options_handle(html, handle)
|
|
147
143
|
|
|
148
144
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
145
|
+
def convert_with_metadata(
|
|
146
|
+
html: str,
|
|
147
|
+
options: ConversionOptions | None = None,
|
|
148
|
+
preprocessing: PreprocessingOptions | None = None,
|
|
149
|
+
metadata_config: MetadataConfig | None = None,
|
|
150
|
+
) -> tuple[str, ExtendedMetadata]:
|
|
151
|
+
"""Convert HTML and extract comprehensive metadata.
|
|
152
|
+
|
|
153
|
+
Args:
|
|
154
|
+
html: HTML string to convert
|
|
155
|
+
options: Optional conversion configuration
|
|
156
|
+
preprocessing: Optional preprocessing configuration
|
|
157
|
+
metadata_config: Optional metadata extraction configuration
|
|
158
|
+
|
|
159
|
+
Returns:
|
|
160
|
+
Tuple of (markdown, metadata_dict) where metadata_dict contains:
|
|
161
|
+
- document: Document-level metadata (title, description, lang, etc.)
|
|
162
|
+
- headers: List of header elements with hierarchy
|
|
163
|
+
- links: List of extracted hyperlinks with classification
|
|
164
|
+
- images: List of extracted images with metadata
|
|
165
|
+
- structured_data: List of JSON-LD, Microdata, or RDFa blocks
|
|
166
|
+
"""
|
|
167
|
+
if not hasattr(_rust, "convert_with_metadata"):
|
|
168
|
+
raise ImportError(
|
|
169
|
+
"convert_with_metadata is missing from the native extension; this indicates a broken/partial installation."
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
if options is None:
|
|
173
|
+
options = ConversionOptions()
|
|
174
|
+
if preprocessing is None:
|
|
175
|
+
preprocessing = PreprocessingOptions()
|
|
176
|
+
if metadata_config is None:
|
|
177
|
+
metadata_config = MetadataConfig()
|
|
178
|
+
|
|
179
|
+
rust_options = _to_rust_options(options, preprocessing)
|
|
180
|
+
markdown, metadata = _rust.convert_with_metadata(html, rust_options, metadata_config)
|
|
181
|
+
return markdown, metadata
|
|
183
182
|
|
|
184
183
|
|
|
185
184
|
__all__ = [
|
|
Binary file
|
html_to_markdown/v1_compat.py
CHANGED
|
@@ -136,7 +136,6 @@ def convert_to_markdown(
|
|
|
136
136
|
)
|
|
137
137
|
|
|
138
138
|
# ~keep: v1 used indented code blocks by default, but switched to backticks when a language was set
|
|
139
|
-
# This maintains v1 behavior for backward compatibility
|
|
140
139
|
code_block_style = "backticks" if code_language else "indented"
|
|
141
140
|
|
|
142
141
|
options = ConversionOptions(
|
|
Binary file
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 2.14.0
|
|
3
|
+
Version: 2.14.5
|
|
4
4
|
Classifier: Development Status :: 5 - Production/Stable
|
|
5
5
|
Classifier: Environment :: Console
|
|
6
6
|
Classifier: Intended Audience :: Developers
|
|
@@ -44,7 +44,7 @@ High-performance HTML to Markdown converter with a clean Python API (powered by
|
|
|
44
44
|
[](https://hex.pm/packages/html_to_markdown)
|
|
45
45
|
[](https://www.nuget.org/packages/Goldziher.HtmlToMarkdown/)
|
|
46
46
|
[](https://central.sonatype.com/artifact/io.github.goldziher/html-to-markdown)
|
|
47
|
-
[](https://pkg.go.dev/github.com/Goldziher/html-to-markdown/packages/go/htmltomarkdown)
|
|
47
|
+
[](https://pkg.go.dev/github.com/Goldziher/html-to-markdown/packages/go/v2/htmltomarkdown)
|
|
48
48
|
[](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
|
|
49
49
|
[](https://discord.gg/pXxagNK2zN)
|
|
50
50
|
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
html_to_markdown-2.14.5.data/scripts/html-to-markdown,sha256=pZtJBTQolbGl_VL3mBo8mLIppGODYhfcq6yOZVldWOg,6263872
|
|
2
|
+
html_to_markdown-2.14.5.dist-info/RECORD,,
|
|
3
|
+
html_to_markdown-2.14.5.dist-info/WHEEL,sha256=WvP__evn8XoyZeDO32cKBm5BQTOFbdB1WoQ-d3AzYdw,132
|
|
4
|
+
html_to_markdown-2.14.5.dist-info/METADATA,sha256=_m5URCaMwleUAZpC712kPn5LKdqwwZesMTzlbm0F1MQ,23252
|
|
5
|
+
html_to_markdown-2.14.5.dist-info/licenses/LICENSE,sha256=oQvPC-0UWvfg0WaeUBe11OJMtX60An-TW1ev_oaAA0k,1086
|
|
6
|
+
html_to_markdown/options.py,sha256=vImRfeHAeyAy0Lnt6cTPHGbj7mTdw8AEUgo19u7MAA0,5080
|
|
7
|
+
html_to_markdown/_html_to_markdown.pyi,sha256=IPD6CegtaanBsKTmK30v4nvWZ5HUlCajS6jkiOsoVj8,5875
|
|
8
|
+
html_to_markdown/_html_to_markdown.abi3.so,sha256=hpn9BCm1Z-79e5NsXTxDjhtloCGWp1I_2aEwRy9-KTI,3503168
|
|
9
|
+
html_to_markdown/__init__.py,sha256=7FQJWsnvd_XVwzVEM7rEKI1XKfqIB81oRnV44qUTJeM,1605
|
|
10
|
+
html_to_markdown/api.py,sha256=MsTij04ij6hFhhNxdc5RXf2yobaRmB0BO1P_fjS4VvY,6806
|
|
11
|
+
html_to_markdown/v1_compat.py,sha256=kn5GYvgn3dTW_Zksu9PzWVk-5CYhvXxsqAeyTdDYZSY,8001
|
|
12
|
+
html_to_markdown/cli.py,sha256=Rn-s3FZPea1jgCJtDzH_TFvOEiA_uZFVfgjhr6xyL_g,64
|
|
13
|
+
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
html_to_markdown/exceptions.py,sha256=aTASOzbywgfqOYjlw18ZkOWSxKff4EbUbmMua_73TGA,2370
|
|
15
|
+
html_to_markdown/cli_proxy.py,sha256=HPYKH5Mf5OUvkbEQISJvAkxrbjWKxE5GokA44HoQ6z8,3858
|
|
16
|
+
html_to_markdown/__main__.py,sha256=3Ic_EbOt2h6W88q084pkz5IKU6iY5z_woBygH6u9aw0,327
|
|
17
|
+
html_to_markdown/bin/html-to-markdown,sha256=pZtJBTQolbGl_VL3mBo8mLIppGODYhfcq6yOZVldWOg,6263872
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
html_to_markdown-2.14.0.dist-info/RECORD,,
|
|
2
|
-
html_to_markdown-2.14.0.dist-info/WHEEL,sha256=WvP__evn8XoyZeDO32cKBm5BQTOFbdB1WoQ-d3AzYdw,132
|
|
3
|
-
html_to_markdown-2.14.0.dist-info/METADATA,sha256=JG4gq6AeDUutrpmm9yBndbm3rGNfg5TpQH6SpOsuTw4,23246
|
|
4
|
-
html_to_markdown-2.14.0.dist-info/licenses/LICENSE,sha256=oQvPC-0UWvfg0WaeUBe11OJMtX60An-TW1ev_oaAA0k,1086
|
|
5
|
-
html_to_markdown-2.14.0.data/scripts/html-to-markdown,sha256=IuASvb2-OF95pz52ODWCERrDGRIBc4bWjzETXGcP_3w,6263856
|
|
6
|
-
html_to_markdown/options.py,sha256=vImRfeHAeyAy0Lnt6cTPHGbj7mTdw8AEUgo19u7MAA0,5080
|
|
7
|
-
html_to_markdown/_html_to_markdown.pyi,sha256=C4Y80U97JpQhjWZxpDb2Lu2ojbFwAZLtygchVdmhYs0,6093
|
|
8
|
-
html_to_markdown/_html_to_markdown.abi3.so,sha256=jIVAAeoTyJkoHiTwjskEbKnJxvWuDDQ2HMvzs2qZMzs,3486656
|
|
9
|
-
html_to_markdown/__init__.py,sha256=NuQiHxunqCAfbGRYXjgbp31YBXoLlYjXU_Fko4GC5oM,1720
|
|
10
|
-
html_to_markdown/api.py,sha256=zXXoFpdDbMIQXl65NT7BjjYu_1xwEM7VNGNUK2zQNfQ,6934
|
|
11
|
-
html_to_markdown/v1_compat.py,sha256=nZN8hVd3u4vacbfyLsPMIGqSmJENZgx1Ya0SpqVLi-g,8061
|
|
12
|
-
html_to_markdown/cli.py,sha256=Rn-s3FZPea1jgCJtDzH_TFvOEiA_uZFVfgjhr6xyL_g,64
|
|
13
|
-
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
-
html_to_markdown/exceptions.py,sha256=aTASOzbywgfqOYjlw18ZkOWSxKff4EbUbmMua_73TGA,2370
|
|
15
|
-
html_to_markdown/cli_proxy.py,sha256=HPYKH5Mf5OUvkbEQISJvAkxrbjWKxE5GokA44HoQ6z8,3858
|
|
16
|
-
html_to_markdown/__main__.py,sha256=3Ic_EbOt2h6W88q084pkz5IKU6iY5z_woBygH6u9aw0,327
|
|
17
|
-
html_to_markdown/bin/html-to-markdown,sha256=IuASvb2-OF95pz52ODWCERrDGRIBc4bWjzETXGcP_3w,6263856
|
|
File without changes
|
|
File without changes
|