html-to-markdown 3.6.10 → 3.6.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +13 -12
- data/ext/html_to_markdown_rb/Cargo.toml +1 -1
- data/ext/html_to_markdown_rb/native/Cargo.lock +3 -3
- data/ext/html_to_markdown_rb/native/Cargo.toml +9 -2
- data/ext/html_to_markdown_rb/src/lib.rs +1 -1
- data/lib/html_to_markdown/native.rb +1 -1
- data/lib/html_to_markdown/version.rb +2 -2
- data/lib/html_to_markdown.rb +1 -1
- data/lib/html_to_markdown_rb.so +0 -0
- data/sig/types.rbs +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 70d67eb7df250429349103b104af5bb9118d742d947020ffc9d0768fc632a086
|
|
4
|
+
data.tar.gz: ca87f0babd9da500f417d89727e143c9f0ebbaef5766184fd45737e39794a0a5
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: '090e40fd52c42ad51e9988d14c195482a857d2793d7cebe746c83e44e0376aa5954298aa1ba4abd7063f7fb69eedcd3cc1a9bb0b447324a1dd5ef5c143343006'
|
|
7
|
+
data.tar.gz: db8c7e520b5bde43566ff153d77ef56a401fb530ebbc329eb23dbad46b4e6bbfef892d50d8515158a4c10696eca408161722206e5842403812dd3a89aa6e9ecc
|
data/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
|
|
4
4
|
<a href="https://github.com/kreuzberg-dev/alef">
|
|
5
|
-
<img src="https://img.shields.io/badge/
|
|
5
|
+
<img src="https://img.shields.io/badge/built%20with-alef%20%D7%90-007ec6" alt="Built with alef">
|
|
6
6
|
</a>
|
|
7
7
|
<!-- Language Bindings -->
|
|
8
8
|
<a href="https://crates.io/crates/html-to-markdown-rs">
|
|
@@ -161,7 +161,7 @@ require 'html_to_markdown'
|
|
|
161
161
|
result = HtmlToMarkdown.convert(html)
|
|
162
162
|
markdown = result[:content] # Converted Markdown string
|
|
163
163
|
metadata = result[:metadata] # Metadata (when extract_metadata: true)
|
|
164
|
-
tables = result[:tables] # Structured table data
|
|
164
|
+
tables = result[:tables] # Structured table data
|
|
165
165
|
document = result[:document] # Document-level info
|
|
166
166
|
images = result[:images] # Extracted images
|
|
167
167
|
warnings = result[:warnings] # Any conversion warnings
|
|
@@ -171,14 +171,13 @@ warnings = result[:warnings] # Any conversion warnings
|
|
|
171
171
|
|
|
172
172
|
**`ConversionOptions`** – Key configuration fields:
|
|
173
173
|
|
|
174
|
-
- `heading_style`: Heading format (`"underlined"` | `"atx"` | `"atx_closed"`) — default: `"
|
|
174
|
+
- `heading_style`: Heading format (`"underlined"` | `"atx"` | `"atx_closed"`) — default: `"atx"`
|
|
175
175
|
- `list_indent_width`: Spaces per indent level — default: `2`
|
|
176
|
-
- `bullets`: Bullet characters cycle — default: `"
|
|
176
|
+
- `bullets`: Bullet characters cycle — default: `"-*+"`
|
|
177
177
|
- `wrap`: Enable text wrapping — default: `false`
|
|
178
178
|
- `wrap_width`: Wrap at column — default: `80`
|
|
179
179
|
- `code_language`: Default fenced code block language — default: none
|
|
180
|
-
- `extract_metadata`: Enable metadata extraction into `result.metadata` — default: `
|
|
181
|
-
- `extract_tables`: Enable structured table extraction into `result.tables` — default: `false`
|
|
180
|
+
- `extract_metadata`: Enable metadata extraction into `result.metadata` — default: `true`
|
|
182
181
|
- `output_format`: Output markup format (`"markdown"` | `"djot"` | `"plain"`) — default: `"markdown"`
|
|
183
182
|
|
|
184
183
|
## Djot Output Format
|
|
@@ -205,11 +204,13 @@ require 'html_to_markdown'
|
|
|
205
204
|
html = "<p>This is <strong>bold</strong> and <em>italic</em> text.</p>"
|
|
206
205
|
|
|
207
206
|
# Default Markdown output
|
|
208
|
-
|
|
207
|
+
markdown_result = HtmlToMarkdown.convert(html)
|
|
208
|
+
markdown = markdown_result[:content]
|
|
209
209
|
# Result: "This is **bold** and *italic* text."
|
|
210
210
|
|
|
211
211
|
# Djot output
|
|
212
|
-
|
|
212
|
+
djot_result = HtmlToMarkdown.convert(html, output_format: 'djot')
|
|
213
|
+
djot = djot_result[:content]
|
|
213
214
|
# Result: "This is *bold* and _italic_ text."
|
|
214
215
|
```
|
|
215
216
|
|
|
@@ -224,7 +225,8 @@ require 'html_to_markdown'
|
|
|
224
225
|
|
|
225
226
|
html = "<h1>Title</h1><p>This is <strong>bold</strong> and <em>italic</em> text.</p>"
|
|
226
227
|
|
|
227
|
-
|
|
228
|
+
result = HtmlToMarkdown.convert(html, output_format: 'plain')
|
|
229
|
+
plain = result[:content]
|
|
228
230
|
# Result: "Title\n\nThis is bold and italic text."
|
|
229
231
|
```
|
|
230
232
|
|
|
@@ -309,13 +311,13 @@ markdown = result[:content]
|
|
|
309
311
|
|
|
310
312
|
## Part of Kreuzberg.dev
|
|
311
313
|
|
|
312
|
-
- [Kreuzberg](https://github.com/kreuzberg-dev/kreuzberg) — document intelligence: text, tables, metadata from
|
|
314
|
+
- [Kreuzberg](https://github.com/kreuzberg-dev/kreuzberg) — document intelligence: text, tables, metadata from 91+ formats with optional OCR.
|
|
313
315
|
- [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
|
|
314
316
|
- [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
|
|
317
|
+
- [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) — fast, lossless HTML→Markdown engine.
|
|
315
318
|
- [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
|
|
316
319
|
- [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
|
|
317
320
|
- [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces every per-language binding across the 5 polyglot repos.
|
|
318
|
-
- [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
|
|
319
321
|
|
|
320
322
|
## Contributing
|
|
321
323
|
|
|
@@ -343,5 +345,4 @@ If you find this library useful, consider [sponsoring the project](https://githu
|
|
|
343
345
|
Have questions or run into issues? We're here to help:
|
|
344
346
|
|
|
345
347
|
- **GitHub Issues:** [github.com/kreuzberg-dev/html-to-markdown/issues](https://github.com/kreuzberg-dev/html-to-markdown/issues)
|
|
346
|
-
- **Issues:** [github.com/kreuzberg-dev/html-to-markdown/issues](https://github.com/kreuzberg-dev/html-to-markdown/issues)
|
|
347
348
|
- **Discord Community:** [discord.gg/xt9WY3GnKR](https://discord.gg/xt9WY3GnKR)
|
|
data/ext/html_to_markdown_rb/native/Cargo.lock
CHANGED
|
@@ -263,7 +263,7 @@ dependencies = [
|
|
|
263
263
|
|
|
264
264
|
[[package]]
|
|
265
265
|
name = "html-to-markdown-rb"
|
|
266
|
-
version = "3.6.10"
|
|
266
|
+
version = "3.6.11"
|
|
267
267
|
dependencies = [
|
|
268
268
|
"async-trait",
|
|
269
269
|
"html-to-markdown-rs",
|
|
@@ -276,9 +276,9 @@ dependencies = [
|
|
|
276
276
|
|
|
277
277
|
[[package]]
|
|
278
278
|
name = "html-to-markdown-rs"
|
|
279
|
-
version = "3.6.
|
|
279
|
+
version = "3.6.11"
|
|
280
280
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
281
|
-
checksum = "
|
|
281
|
+
checksum = "32027e930a32dd01839a07405eae5e482ff507ccc4ca6fd1a89091304bd7ba22"
|
|
282
282
|
dependencies = [
|
|
283
283
|
"ahash",
|
|
284
284
|
"astral-tl",
|
|
data/ext/html_to_markdown_rb/native/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "html-to-markdown-rb"
|
|
3
|
-
version = "3.6.10"
|
|
3
|
+
version = "3.6.11"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
license = "MIT"
|
|
6
6
|
description = "High-performance HTML to Markdown converter"
|
|
@@ -16,9 +16,16 @@ name = "html_to_markdown_rb"
|
|
|
16
16
|
path = "../src/lib.rs"
|
|
17
17
|
crate-type = ["cdylib"]
|
|
18
18
|
|
|
19
|
+
[features]
|
|
20
|
+
default = ["inline-images", "metadata", "testkit", "visitor"]
|
|
21
|
+
inline-images = ["html-to-markdown-rs/inline-images"]
|
|
22
|
+
metadata = ["html-to-markdown-rs/metadata"]
|
|
23
|
+
testkit = ["html-to-markdown-rs/testkit"]
|
|
24
|
+
visitor = ["html-to-markdown-rs/visitor"]
|
|
25
|
+
|
|
19
26
|
[dependencies]
|
|
20
27
|
async-trait = "0.1"
|
|
21
|
-
html-to-markdown-rs = { version = "3.6.
|
|
28
|
+
html-to-markdown-rs = { version = "3.6.11", features = ["serde", "metadata", "visitor", "inline-images", "testkit"] }
|
|
22
29
|
magnus = "0.8"
|
|
23
30
|
rb-sys = ">=0.9, <0.9.128"
|
|
24
31
|
serde = { version = "1", features = ["derive"] }
|
|
data/ext/html_to_markdown_rb/src/lib.rs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
// This file is auto-generated by alef. DO NOT EDIT.
|
|
2
|
-
// alef:hash:
|
|
2
|
+
// alef:hash:7d62cf0c304e24a167a710e943106d8203118ef6cb83aa812f3133c24bfa7c7f
|
|
3
3
|
// Re-generate with: alef generate
|
|
4
4
|
#![allow(dead_code, unused_imports, unused_variables)]
|
|
5
5
|
#![allow(
|
|
data/lib/html_to_markdown/native.rb
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:7d62cf0c304e24a167a710e943106d8203118ef6cb83aa812f3133c24bfa7c7f
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# frozen_string_literal: true
|
|
data/lib/html_to_markdown/version.rb
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:7d62cf0c304e24a167a710e943106d8203118ef6cb83aa812f3133c24bfa7c7f
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# frozen_string_literal: true
|
|
6
6
|
|
|
7
7
|
module HtmlToMarkdown
|
|
8
8
|
## The version string for this package.
|
|
9
|
-
VERSION = "3.6.10"
|
|
9
|
+
VERSION = "3.6.11"
|
|
10
10
|
end
|
data/lib/html_to_markdown.rb
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:7d62cf0c304e24a167a710e943106d8203118ef6cb83aa812f3133c24bfa7c7f
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# frozen_string_literal: true
|
data/lib/html_to_markdown_rb.so
CHANGED
|
Binary file
|
data/sig/types.rbs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:7d62cf0c304e24a167a710e943106d8203118ef6cb83aa812f3133c24bfa7c7f
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
|