html-to-markdown 2.6.3__cp310-abi3-macosx_11_0_arm64.whl → 2.14.0__cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,12 +15,17 @@ V1 API (backward compatibility):
15
15
  markdown = convert_to_markdown(html, heading_style="atx")
16
16
  """
17
17
 
18
+ import contextlib
19
+
18
20
  from html_to_markdown.api import (
19
21
  InlineImage,
20
22
  InlineImageConfig,
21
23
  InlineImageWarning,
24
+ OptionsHandle,
22
25
  convert,
26
+ convert_with_handle,
23
27
  convert_with_inline_images,
28
+ create_options_handle,
24
29
  )
25
30
  from html_to_markdown.exceptions import (
26
31
  ConflictingOptionsError,
@@ -32,6 +37,10 @@ from html_to_markdown.exceptions import (
32
37
  from html_to_markdown.options import ConversionOptions, PreprocessingOptions
33
38
  from html_to_markdown.v1_compat import convert_to_markdown, markdownify
34
39
 
40
+ # Optional metadata support
41
+ with contextlib.suppress(ImportError):
42
+ from html_to_markdown.api import MetadataConfig, convert_with_metadata
43
+
35
44
  __all__ = [
36
45
  "ConflictingOptionsError",
37
46
  "ConversionOptions",
@@ -41,12 +50,17 @@ __all__ = [
41
50
  "InlineImageConfig",
42
51
  "InlineImageWarning",
43
52
  "InvalidParserError",
53
+ "MetadataConfig",
44
54
  "MissingDependencyError",
55
+ "OptionsHandle",
45
56
  "PreprocessingOptions",
46
57
  "convert",
47
58
  "convert_to_markdown",
59
+ "convert_with_handle",
48
60
  "convert_with_inline_images",
61
+ "convert_with_metadata",
62
+ "create_options_handle",
49
63
  "markdownify",
50
64
  ]
51
65
 
52
- __version__ = "2.5.7"
66
+ __version__ = "2.14.0"
Binary file
@@ -0,0 +1,196 @@
1
+ from typing import Literal, TypedDict
2
+
3
+ class PreprocessingOptions:
4
+ enabled: bool
5
+ preset: Literal["minimal", "standard", "aggressive"]
6
+ remove_navigation: bool
7
+ remove_forms: bool
8
+
9
+ def __init__(
10
+ self,
11
+ *,
12
+ enabled: bool = False,
13
+ preset: Literal["minimal", "standard", "aggressive"] = "standard",
14
+ remove_navigation: bool = True,
15
+ remove_forms: bool = True,
16
+ ) -> None: ...
17
+
18
+ class ConversionOptions:
19
+ heading_style: Literal["underlined", "atx", "atx_closed"]
20
+ list_indent_type: Literal["spaces", "tabs"]
21
+ list_indent_width: int
22
+ bullets: str
23
+ strong_em_symbol: str
24
+ escape_asterisks: bool
25
+ escape_underscores: bool
26
+ escape_misc: bool
27
+ escape_ascii: bool
28
+ code_language: str
29
+ autolinks: bool
30
+ default_title: bool
31
+ br_in_tables: bool
32
+ hocr_spatial_tables: bool
33
+ highlight_style: Literal["double-equal", "html", "bold", "none"]
34
+ extract_metadata: bool
35
+ whitespace_mode: Literal["normalized", "strict"]
36
+ strip_newlines: bool
37
+ wrap: bool
38
+ wrap_width: int
39
+ convert_as_inline: bool
40
+ sub_symbol: str
41
+ sup_symbol: str
42
+ newline_style: Literal["spaces", "backslash"]
43
+ code_block_style: Literal["indented", "backticks", "tildes"]
44
+ keep_inline_images_in: list[str]
45
+ preprocessing: PreprocessingOptions
46
+ encoding: str
47
+ debug: bool
48
+ strip_tags: list[str]
49
+ preserve_tags: list[str]
50
+
51
+ def __init__(
52
+ self,
53
+ *,
54
+ heading_style: Literal["underlined", "atx", "atx_closed"] = "underlined",
55
+ list_indent_type: Literal["spaces", "tabs"] = "spaces",
56
+ list_indent_width: int = 4,
57
+ bullets: str = "*+-",
58
+ strong_em_symbol: str = "*",
59
+ escape_asterisks: bool = False,
60
+ escape_underscores: bool = False,
61
+ escape_misc: bool = False,
62
+ escape_ascii: bool = False,
63
+ code_language: str = "",
64
+ autolinks: bool = True,
65
+ default_title: bool = False,
66
+ br_in_tables: bool = False,
67
+ hocr_spatial_tables: bool = True,
68
+ highlight_style: Literal["double-equal", "html", "bold", "none"] = "double-equal",
69
+ extract_metadata: bool = True,
70
+ whitespace_mode: Literal["normalized", "strict"] = "normalized",
71
+ strip_newlines: bool = False,
72
+ wrap: bool = False,
73
+ wrap_width: int = 80,
74
+ convert_as_inline: bool = False,
75
+ sub_symbol: str = "",
76
+ sup_symbol: str = "",
77
+ newline_style: Literal["spaces", "backslash"] = "spaces",
78
+ code_block_style: Literal["indented", "backticks", "tildes"] = "indented",
79
+ keep_inline_images_in: list[str] = [],
80
+ preprocessing: PreprocessingOptions | None = None,
81
+ encoding: str = "utf-8",
82
+ debug: bool = False,
83
+ strip_tags: list[str] = [],
84
+ preserve_tags: list[str] = [],
85
+ ) -> None: ...
86
+
87
+ class InlineImageConfig:
88
+ max_decoded_size_bytes: int
89
+ filename_prefix: str | None
90
+ capture_svg: bool
91
+ infer_dimensions: bool
92
+
93
+ def __init__(
94
+ self,
95
+ max_decoded_size_bytes: int = ...,
96
+ filename_prefix: str | None = None,
97
+ capture_svg: bool = True,
98
+ infer_dimensions: bool = False,
99
+ ) -> None: ...
100
+
101
+ class ConversionOptionsHandle:
102
+ def __init__(self, options: ConversionOptions | None = None) -> None: ...
103
+
104
+ class InlineImage(TypedDict):
105
+ data: bytes
106
+ format: str
107
+ filename: str | None
108
+ description: str | None
109
+ dimensions: tuple[int, int] | None
110
+ source: Literal["img_data_uri", "svg_element"]
111
+ attributes: dict[str, str]
112
+
113
+ class InlineImageWarning(TypedDict):
114
+ index: int
115
+ message: str
116
+
117
+ class MetadataConfig:
118
+ extract_document: bool
119
+ extract_headers: bool
120
+ extract_links: bool
121
+ extract_images: bool
122
+ extract_structured_data: bool
123
+ max_structured_data_size: int
124
+
125
+ def __init__(
126
+ self,
127
+ *,
128
+ extract_document: bool = True,
129
+ extract_headers: bool = True,
130
+ extract_links: bool = True,
131
+ extract_images: bool = True,
132
+ extract_structured_data: bool = True,
133
+ max_structured_data_size: int = 1_000_000,
134
+ ) -> None: ...
135
+
136
+ class DocumentMetadata(TypedDict):
137
+ title: str | None
138
+ description: str | None
139
+ keywords: list[str]
140
+ author: str | None
141
+ canonical_url: str | None
142
+ base_href: str | None
143
+ language: str | None
144
+ text_direction: str | None # "ltr" | "rtl" | "auto" | None
145
+ open_graph: dict[str, str]
146
+ twitter_card: dict[str, str]
147
+ meta_tags: dict[str, str]
148
+
149
+ class HeaderMetadata(TypedDict):
150
+ level: int # 1-6
151
+ text: str
152
+ id: str | None
153
+ depth: int
154
+ html_offset: int
155
+
156
+ class LinkMetadata(TypedDict):
157
+ href: str
158
+ text: str
159
+ title: str | None
160
+ link_type: str # "anchor" | "internal" | "external" | "email" | "phone" | "other"
161
+ rel: list[str]
162
+ attributes: dict[str, str]
163
+
164
+ class ImageMetadata(TypedDict):
165
+ src: str
166
+ alt: str | None
167
+ title: str | None
168
+ dimensions: tuple[int, int] | None # (width, height)
169
+ image_type: str # "data_uri" | "inline_svg" | "external" | "relative"
170
+ attributes: dict[str, str]
171
+
172
+ class StructuredData(TypedDict):
173
+ data_type: str # "json_ld" | "microdata" | "rdfa"
174
+ raw_json: str
175
+ schema_type: str | None
176
+
177
+ class ExtendedMetadata(TypedDict):
178
+ document: DocumentMetadata
179
+ headers: list[HeaderMetadata]
180
+ links: list[LinkMetadata]
181
+ images: list[ImageMetadata]
182
+ structured_data: list[StructuredData]
183
+
184
+ def convert(html: str, options: ConversionOptions | None = None) -> str: ...
185
+ def convert_with_inline_images(
186
+ html: str,
187
+ options: ConversionOptions | None = None,
188
+ image_config: InlineImageConfig | None = None,
189
+ ) -> tuple[str, list[InlineImage], list[InlineImageWarning]]: ...
190
+ def convert_with_metadata(
191
+ html: str,
192
+ options: ConversionOptions | None = None,
193
+ metadata_config: MetadataConfig | None = None,
194
+ ) -> tuple[str, ExtendedMetadata]: ...
195
+ def create_options_handle(options: ConversionOptions | None = None) -> ConversionOptionsHandle: ...
196
+ def convert_with_options_handle(html: str, handle: ConversionOptionsHandle) -> str: ...
html_to_markdown/api.py CHANGED
@@ -1,20 +1,27 @@
1
- """New v2 functional API for HTML to Markdown conversion.
2
-
3
- This module provides the new functional API with dataclass-based options,
4
- using the Rust backend for conversion.
5
- """
1
+ """High-level Python API backed by the Rust core."""
6
2
 
7
3
  from __future__ import annotations
8
4
 
9
- from typing import TYPE_CHECKING, Literal, TypedDict, cast
5
+ from typing import TYPE_CHECKING, Literal, TypedDict
10
6
 
11
- import html_to_markdown._html_to_markdown as _rust # type: ignore[import-not-found]
7
+ import html_to_markdown._html_to_markdown as _rust
8
+ from html_to_markdown._html_to_markdown import (
9
+ ConversionOptionsHandle as OptionsHandle,
10
+ )
11
+ from html_to_markdown._html_to_markdown import InlineImageConfig
12
12
  from html_to_markdown.options import ConversionOptions, PreprocessingOptions
13
13
 
14
- if TYPE_CHECKING:
15
- from html_to_markdown._html_to_markdown import InlineImageConfig
16
- else:
17
- InlineImageConfig = _rust.InlineImageConfig # type: ignore[misc, assignment]
14
+ _HAS_METADATA = False
15
+ try:
16
+ from html_to_markdown._html_to_markdown import ExtendedMetadata, MetadataConfig
17
+
18
+ _HAS_METADATA = True
19
+ except ImportError:
20
+ MetadataConfig = None # type: ignore[misc,assignment]
21
+ if TYPE_CHECKING:
22
+ from html_to_markdown._html_to_markdown import ExtendedMetadata # pragma: no cover
23
+ else:
24
+ ExtendedMetadata = dict[str, object] # type: ignore[assignment]
18
25
 
19
26
 
20
27
  class InlineImage(TypedDict):
@@ -37,7 +44,6 @@ class InlineImageWarning(TypedDict):
37
44
 
38
45
 
39
46
  def _to_rust_preprocessing(options: PreprocessingOptions) -> _rust.PreprocessingOptions:
40
- """Convert high-level preprocessing options to the Rust bindings."""
41
47
  return _rust.PreprocessingOptions(
42
48
  enabled=options.enabled,
43
49
  preset=options.preset,
@@ -50,7 +56,6 @@ def _to_rust_options(
50
56
  options: ConversionOptions,
51
57
  preprocessing: PreprocessingOptions,
52
58
  ) -> _rust.ConversionOptions:
53
- """Convert high-level conversion options to the Rust bindings."""
54
59
  return _rust.ConversionOptions(
55
60
  heading_style=options.heading_style,
56
61
  list_indent_type=options.list_indent_type,
@@ -91,23 +96,17 @@ def convert(
91
96
  options: ConversionOptions | None = None,
92
97
  preprocessing: PreprocessingOptions | None = None,
93
98
  ) -> str:
94
- """Convert HTML to Markdown using the Rust backend.
99
+ """Convert HTML to Markdown using the Rust backend."""
100
+ if options is None and preprocessing is None:
101
+ return _rust.convert(html, None)
95
102
 
96
- Args:
97
- html: HTML string to convert.
98
- options: Conversion configuration options (defaults to ConversionOptions()).
99
- preprocessing: HTML preprocessing options (defaults to PreprocessingOptions()).
100
-
101
- Returns:
102
- Converted Markdown string.
103
- """
104
103
  if options is None:
105
104
  options = ConversionOptions()
106
105
  if preprocessing is None:
107
106
  preprocessing = PreprocessingOptions()
108
107
 
109
108
  rust_options = _to_rust_options(options, preprocessing)
110
- return cast("str", _rust.convert(html, rust_options))
109
+ return _rust.convert(html, rust_options)
111
110
 
112
111
 
113
112
  def convert_with_inline_images(
@@ -116,10 +115,7 @@ def convert_with_inline_images(
116
115
  preprocessing: PreprocessingOptions | None = None,
117
116
  image_config: InlineImageConfig | None = None,
118
117
  ) -> tuple[str, list[InlineImage], list[InlineImageWarning]]:
119
- """Convert HTML and extract inline images.
120
-
121
- Returns Markdown along with extracted inline images and any warnings.
122
- """
118
+ """Convert HTML and extract inline images."""
123
119
  if options is None:
124
120
  options = ConversionOptions()
125
121
  if preprocessing is None:
@@ -128,17 +124,73 @@ def convert_with_inline_images(
128
124
  image_config = InlineImageConfig()
129
125
 
130
126
  rust_options = _to_rust_options(options, preprocessing)
131
- markdown, images, warnings = cast(
132
- "tuple[str, list[InlineImage], list[InlineImageWarning]]",
133
- _rust.convert_with_inline_images(html, rust_options, image_config),
134
- )
127
+ markdown, images, warnings = _rust.convert_with_inline_images(html, rust_options, image_config)
135
128
  return markdown, list(images), list(warnings)
136
129
 
137
130
 
131
+ def create_options_handle(
132
+ options: ConversionOptions | None = None,
133
+ preprocessing: PreprocessingOptions | None = None,
134
+ ) -> OptionsHandle:
135
+ """Create a reusable ConversionOptions handle backed by Rust."""
136
+ if options is None:
137
+ options = ConversionOptions()
138
+ if preprocessing is None:
139
+ preprocessing = PreprocessingOptions()
140
+ rust_options = _to_rust_options(options, preprocessing)
141
+ return _rust.create_options_handle(rust_options)
142
+
143
+
144
+ def convert_with_handle(html: str, handle: OptionsHandle) -> str:
145
+ """Convert HTML using a pre-parsed ConversionOptions handle."""
146
+ return _rust.convert_with_options_handle(html, handle)
147
+
148
+
149
+ if _HAS_METADATA:
150
+
151
+ def convert_with_metadata(
152
+ html: str,
153
+ options: ConversionOptions | None = None,
154
+ preprocessing: PreprocessingOptions | None = None,
155
+ metadata_config: MetadataConfig | None = None,
156
+ ) -> tuple[str, ExtendedMetadata]:
157
+ """Convert HTML and extract comprehensive metadata.
158
+
159
+ Args:
160
+ html: HTML string to convert
161
+ options: Optional conversion configuration
162
+ preprocessing: Optional preprocessing configuration
163
+ metadata_config: Optional metadata extraction configuration
164
+
165
+ Returns:
166
+ Tuple of (markdown, metadata_dict) where metadata_dict contains:
167
+ - document: Document-level metadata (title, description, lang, etc.)
168
+ - headers: List of header elements with hierarchy
169
+ - links: List of extracted hyperlinks with classification
170
+ - images: List of extracted images with metadata
171
+ - structured_data: List of JSON-LD, Microdata, or RDFa blocks
172
+ """
173
+ if options is None:
174
+ options = ConversionOptions()
175
+ if preprocessing is None:
176
+ preprocessing = PreprocessingOptions()
177
+ if metadata_config is None:
178
+ metadata_config = MetadataConfig()
179
+
180
+ rust_options = _to_rust_options(options, preprocessing)
181
+ markdown, metadata = _rust.convert_with_metadata(html, rust_options, metadata_config)
182
+ return markdown, metadata
183
+
184
+
138
185
  __all__ = [
139
186
  "InlineImage",
140
187
  "InlineImageConfig",
141
188
  "InlineImageWarning",
189
+ "MetadataConfig",
190
+ "OptionsHandle",
142
191
  "convert",
192
+ "convert_with_handle",
143
193
  "convert_with_inline_images",
194
+ "convert_with_metadata",
195
+ "create_options_handle",
144
196
  ]
Binary file
@@ -11,6 +11,15 @@ import warnings
11
11
  from html_to_markdown import ConversionOptions, PreprocessingOptions
12
12
  from html_to_markdown import convert as convert_v2
13
13
 
14
+ DEPRECATION_MESSAGE = (
15
+ "The v1 compatibility layer is deprecated and will be removed in v3.0. "
16
+ "Use html_to_markdown.convert() with ConversionOptions instead."
17
+ )
18
+
19
+
20
+ def _warn_deprecated(api_name: str, *, stacklevel: int = 2) -> None:
21
+ warnings.warn(f"{api_name} is deprecated. {DEPRECATION_MESSAGE}", DeprecationWarning, stacklevel=stacklevel)
22
+
14
23
 
15
24
  def convert_to_markdown(
16
25
  html: str,
@@ -104,12 +113,7 @@ def convert_to_markdown(
104
113
  Use :func:`html_to_markdown.convert` with :class:`ConversionOptions` instead.
105
114
  The v1 API is provided for backward compatibility only.
106
115
  """
107
- warnings.warn(
108
- "convert_to_markdown() is deprecated and will be removed in v3.0. "
109
- "Use html_to_markdown.convert() with ConversionOptions instead.",
110
- DeprecationWarning,
111
- stacklevel=2,
112
- )
116
+ _warn_deprecated("convert_to_markdown()", stacklevel=2)
113
117
 
114
118
  if code_language_callback is not None:
115
119
  raise NotImplementedError(
@@ -181,12 +185,7 @@ def markdownify(*args: object, **kwargs: object) -> str:
181
185
  .. deprecated:: 2.0
182
186
  Use html_to_markdown.convert() instead.
183
187
  """
184
- warnings.warn(
185
- "markdownify() is deprecated and will be removed in v3.0. "
186
- "Use html_to_markdown.convert() with ConversionOptions instead.",
187
- DeprecationWarning,
188
- stacklevel=2,
189
- )
188
+ _warn_deprecated("markdownify()", stacklevel=2)
190
189
  return convert_to_markdown(*args, **kwargs) # type: ignore[arg-type]
191
190
 
192
191