html-to-markdown 2.9.2 → 2.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -1,24 +1,24 @@
1
- # frozen_string_literal: true
2
-
3
- require 'bundler/gem_tasks'
4
- require 'rb_sys/extensiontask'
5
- require 'rspec/core/rake_task'
6
-
7
- GEMSPEC = Gem::Specification.load(File.expand_path('html-to-markdown-rb.gemspec', __dir__))
8
-
9
- RbSys::ExtensionTask.new('html-to-markdown-rb', GEMSPEC) do |ext|
10
- ext.lib_dir = 'lib'
11
- ext.ext_dir = 'ext/html-to-markdown-rb'
12
- ext.cross_compile = true
13
- ext.cross_platform = %w[
14
- x86_64-linux
15
- x86_64-darwin
16
- arm64-darwin
17
- x64-mingw32
18
- ]
19
- end
20
-
21
- RSpec::Core::RakeTask.new(:spec)
22
-
23
- task spec: :compile
24
- task default: :spec
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rb_sys/extensiontask'
5
+ require 'rspec/core/rake_task'
6
+
7
+ GEMSPEC = Gem::Specification.load(File.expand_path('html-to-markdown-rb.gemspec', __dir__))
8
+
9
+ RbSys::ExtensionTask.new('html-to-markdown-rb', GEMSPEC) do |ext|
10
+ ext.lib_dir = 'lib'
11
+ ext.ext_dir = 'ext/html-to-markdown-rb'
12
+ ext.cross_compile = true
13
+ ext.cross_platform = %w[
14
+ x86_64-linux
15
+ x86_64-darwin
16
+ arm64-darwin
17
+ x64-mingw32
18
+ ]
19
+ end
20
+
21
+ RSpec::Core::RakeTask.new(:spec)
22
+
23
+ task spec: :compile
24
+ task default: :spec
data/Steepfile CHANGED
@@ -1,26 +1,26 @@
1
- # frozen_string_literal: true
2
-
3
- # Steepfile for type checking html-to-markdown Ruby gem
4
-
5
- target :lib do
6
- signature 'sig'
7
-
8
- check 'lib'
9
-
10
- # Configure libraries
11
- library 'pathname'
12
- library 'open3'
13
-
14
- # Ignore vendor directory
15
- ignore 'vendor'
16
-
17
- # Ignore spec directory
18
- ignore 'spec'
19
-
20
- # Ignore bin directory
21
- ignore 'bin'
22
-
23
- # Ignore internal implementation modules (not public API)
24
- ignore 'lib/html_to_markdown/cli.rb'
25
- ignore 'lib/html_to_markdown/cli_proxy.rb'
26
- end
1
+ # frozen_string_literal: true
2
+
3
+ # Steepfile for type checking html-to-markdown Ruby gem
4
+
5
+ target :lib do
6
+ signature 'sig'
7
+
8
+ check 'lib'
9
+
10
+ # Configure libraries
11
+ library 'pathname'
12
+ library 'open3'
13
+
14
+ # Ignore vendor directory
15
+ ignore 'vendor'
16
+
17
+ # Ignore spec directory
18
+ ignore 'spec'
19
+
20
+ # Ignore bin directory
21
+ ignore 'bin'
22
+
23
+ # Ignore internal implementation modules (not public API)
24
+ ignore 'lib/html_to_markdown/cli.rb'
25
+ ignore 'lib/html_to_markdown/cli_proxy.rb'
26
+ end
data/bin/benchmark.rb CHANGED
@@ -1,94 +1,94 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- require 'optparse'
5
- require 'time'
6
-
7
- $LOAD_PATH.unshift(File.expand_path('../lib', __dir__))
8
- require 'html_to_markdown'
9
-
10
- def json_escape(value)
11
- value.to_s.gsub(/["\\\n\r]/) do |char|
12
- case char
13
- when '"', '\\'
14
- "\\#{char}"
15
- when "\n"
16
- '\\n'
17
- when "\r"
18
- '\\r'
19
- end
20
- end
21
- end
22
-
23
- options = {
24
- iterations: 50,
25
- format: 'html'
26
- }
27
-
28
- OptionParser.new do |parser|
29
- parser.banner = 'ruby benchmark.rb --file path/to/fixture.html [--iterations 200]'
30
-
31
- parser.on('--file FILE', 'HTML fixture to convert repeatedly') do |file|
32
- options[:file] = file
33
- end
34
-
35
- parser.on('--iterations N', Integer, 'Number of conversion iterations (default: 50)') do |n|
36
- options[:iterations] = n.positive? ? n : 1
37
- end
38
-
39
- parser.on('--format FORMAT', 'Fixture format (html or hocr)') do |format|
40
- options[:format] = format.downcase
41
- end
42
- end.parse!
43
-
44
- fixture = options.fetch(:file) do
45
- warn 'Missing --file parameter'
46
- exit 1
47
- end
48
-
49
- unless File.exist?(fixture)
50
- warn "Fixture not found: #{fixture}"
51
- exit 1
52
- end
53
-
54
- unless %w[html hocr].include?(options[:format])
55
- warn "Unsupported format: #{options[:format]}"
56
- exit 1
57
- end
58
-
59
- html = File.binread(fixture)
60
- html.force_encoding(Encoding::UTF_8)
61
- html.freeze
62
- iterations = options[:iterations]
63
- options_handle = HtmlToMarkdown.options(
64
- options[:format] == 'hocr' ? { hocr_spatial_tables: false } : nil
65
- )
66
-
67
- def convert_document(html, options_handle)
68
- HtmlToMarkdown.convert_with_options(html, options_handle)
69
- end
70
-
71
- convert_document(html, options_handle)
72
-
73
- start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
74
- iterations.times { convert_document(html, options_handle) }
75
- elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
76
-
77
- payload_size_bytes = html.bytesize
78
- bytes_processed = payload_size_bytes * iterations
79
- ops_per_sec = iterations / elapsed
80
- mb_per_sec = (bytes_processed.to_f / (1024 * 1024)) / elapsed
81
-
82
- payload = %({
83
- "language":"ruby",
84
- "fixture":"#{json_escape(File.basename(fixture))}",
85
- "fixture_path":"#{json_escape(fixture)}",
86
- "iterations":#{iterations},
87
- "elapsed_seconds":#{format('%.8f', elapsed)},
88
- "ops_per_sec":#{format('%.4f', ops_per_sec)},
89
- "mb_per_sec":#{format('%.4f', mb_per_sec)},
90
- "bytes_processed":#{bytes_processed},
91
- "payload_size_bytes":#{payload_size_bytes}
92
- })
93
-
94
- puts payload.strip
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'optparse'
5
+ require 'time'
6
+
7
+ $LOAD_PATH.unshift(File.expand_path('../lib', __dir__))
8
+ require 'html_to_markdown'
9
+
10
+ def json_escape(value)
11
+ value.to_s.gsub(/["\\\n\r]/) do |char|
12
+ case char
13
+ when '"', '\\'
14
+ "\\#{char}"
15
+ when "\n"
16
+ '\\n'
17
+ when "\r"
18
+ '\\r'
19
+ end
20
+ end
21
+ end
22
+
23
+ options = {
24
+ iterations: 50,
25
+ format: 'html'
26
+ }
27
+
28
+ OptionParser.new do |parser|
29
+ parser.banner = 'ruby benchmark.rb --file path/to/fixture.html [--iterations 200]'
30
+
31
+ parser.on('--file FILE', 'HTML fixture to convert repeatedly') do |file|
32
+ options[:file] = file
33
+ end
34
+
35
+ parser.on('--iterations N', Integer, 'Number of conversion iterations (default: 50)') do |n|
36
+ options[:iterations] = n.positive? ? n : 1
37
+ end
38
+
39
+ parser.on('--format FORMAT', 'Fixture format (html or hocr)') do |format|
40
+ options[:format] = format.downcase
41
+ end
42
+ end.parse!
43
+
44
+ fixture = options.fetch(:file) do
45
+ warn 'Missing --file parameter'
46
+ exit 1
47
+ end
48
+
49
+ unless File.exist?(fixture)
50
+ warn "Fixture not found: #{fixture}"
51
+ exit 1
52
+ end
53
+
54
+ unless %w[html hocr].include?(options[:format])
55
+ warn "Unsupported format: #{options[:format]}"
56
+ exit 1
57
+ end
58
+
59
+ html = File.binread(fixture)
60
+ html.force_encoding(Encoding::UTF_8)
61
+ html.freeze
62
+ iterations = options[:iterations]
63
+ options_handle = HtmlToMarkdown.options(
64
+ options[:format] == 'hocr' ? { hocr_spatial_tables: false } : nil
65
+ )
66
+
67
+ def convert_document(html, options_handle)
68
+ HtmlToMarkdown.convert_with_options(html, options_handle)
69
+ end
70
+
71
+ convert_document(html, options_handle)
72
+
73
+ start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
74
+ iterations.times { convert_document(html, options_handle) }
75
+ elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
76
+
77
+ payload_size_bytes = html.bytesize
78
+ bytes_processed = payload_size_bytes * iterations
79
+ ops_per_sec = iterations / elapsed
80
+ mb_per_sec = (bytes_processed.to_f / (1024 * 1024)) / elapsed
81
+
82
+ payload = %({
83
+ "language":"ruby",
84
+ "fixture":"#{json_escape(File.basename(fixture))}",
85
+ "fixture_path":"#{json_escape(fixture)}",
86
+ "iterations":#{iterations},
87
+ "elapsed_seconds":#{format('%.8f', elapsed)},
88
+ "ops_per_sec":#{format('%.4f', ops_per_sec)},
89
+ "mb_per_sec":#{format('%.4f', mb_per_sec)},
90
+ "bytes_processed":#{bytes_processed},
91
+ "payload_size_bytes":#{payload_size_bytes}
92
+ })
93
+
94
+ puts payload.strip
data/exe/html-to-markdown CHANGED
@@ -1,6 +1,6 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- require 'html_to_markdown/cli'
5
-
6
- exit HtmlToMarkdown::CLI.run(ARGV)
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'html_to_markdown/cli'
5
+
6
+ exit HtmlToMarkdown::CLI.run(ARGV)
@@ -1,38 +1,38 @@
1
- # frozen_string_literal: true
2
-
3
- require 'mkmf'
4
- require 'rb_sys/mkmf'
5
- require 'rbconfig'
6
- require 'pathname'
7
-
8
- if RbConfig::CONFIG['host_os'] =~ /mswin|mingw/
9
- devkit = ENV.fetch('RI_DEVKIT', nil)
10
- prefix = ENV['MSYSTEM_PREFIX'] || '/ucrt64'
11
-
12
- if devkit
13
- sysroot = "#{devkit}#{prefix}".tr('\\\\', '/')
14
- extra_args = [
15
- '--target=x86_64-pc-windows-gnu',
16
- "--sysroot=#{sysroot}"
17
- ]
18
-
19
- existing = ENV['BINDGEN_EXTRA_CLANG_ARGS'].to_s.split(/\s+/)
20
- ENV['BINDGEN_EXTRA_CLANG_ARGS'] = (existing + extra_args).uniq.join(' ')
21
- end
22
- end
23
-
24
- default_profile = ENV.fetch('CARGO_PROFILE', 'release')
25
-
26
- create_rust_makefile('html_to_markdown_rb') do |config|
27
- config.profile = default_profile.to_sym
28
-
29
- native_dir = File.expand_path('native', __dir__)
30
- relative_native =
31
- begin
32
- Pathname.new(native_dir).relative_path_from(Pathname.new(__dir__)).to_s
33
- rescue ArgumentError
34
- native_dir
35
- end
36
-
37
- config.ext_dir = relative_native
38
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'mkmf'
4
+ require 'rb_sys/mkmf'
5
+ require 'rbconfig'
6
+ require 'pathname'
7
+
8
+ if RbConfig::CONFIG['host_os'] =~ /mswin|mingw/
9
+ devkit = ENV.fetch('RI_DEVKIT', nil)
10
+ prefix = ENV['MSYSTEM_PREFIX'] || '/ucrt64'
11
+
12
+ if devkit
13
+ sysroot = "#{devkit}#{prefix}".tr('\\\\', '/')
14
+ extra_args = [
15
+ '--target=x86_64-pc-windows-gnu',
16
+ "--sysroot=#{sysroot}"
17
+ ]
18
+
19
+ existing = ENV['BINDGEN_EXTRA_CLANG_ARGS'].to_s.split(/\s+/)
20
+ ENV['BINDGEN_EXTRA_CLANG_ARGS'] = (existing + extra_args).uniq.join(' ')
21
+ end
22
+ end
23
+
24
+ default_profile = ENV.fetch('CARGO_PROFILE', 'release')
25
+
26
+ create_rust_makefile('html_to_markdown_rb') do |config|
27
+ config.profile = default_profile.to_sym
28
+
29
+ native_dir = File.expand_path('native', __dir__)
30
+ relative_native =
31
+ begin
32
+ Pathname.new(native_dir).relative_path_from(Pathname.new(__dir__)).to_s
33
+ rescue ArgumentError
34
+ native_dir
35
+ end
36
+
37
+ config.ext_dir = relative_native
38
+ end
@@ -1,28 +1,28 @@
1
- [package]
2
- name = "html-to-markdown-rb"
3
- version = "2.9.2"
4
- edition = "2024"
5
- authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
6
- license = "MIT"
7
- repository = "https://github.com/Goldziher/html-to-markdown"
8
- homepage = "https://github.com/Goldziher/html-to-markdown"
9
- documentation = "https://docs.rs/html-to-markdown-rs"
10
- readme = "README.md"
11
- rust-version = "1.85"
12
- description = "Ruby bindings (Magnus) for html-to-markdown - high-performance HTML to Markdown converter"
13
- keywords = ["html", "markdown", "ruby", "magnus", "bindings"]
14
- categories = ["api-bindings"]
15
-
16
- [lib]
17
- name = "html_to_markdown_rb"
18
- crate-type = ["cdylib", "rlib"]
19
-
20
- [features]
21
- default = []
22
-
23
- [dependencies]
24
- html-to-markdown-rs = { version = "2.9.2", features = ["inline-images"] }
25
- magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
26
-
27
- [dev-dependencies]
28
- pretty_assertions = "1.4"
1
+ [package]
2
+ name = "html-to-markdown-rb"
3
+ version = "2.11.1"
4
+ edition = "2024"
5
+ authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
6
+ license = "MIT"
7
+ repository = "https://github.com/Goldziher/html-to-markdown"
8
+ homepage = "https://github.com/Goldziher/html-to-markdown"
9
+ documentation = "https://docs.rs/html-to-markdown-rs"
10
+ readme = "README.md"
11
+ rust-version = "1.85"
12
+ description = "Ruby bindings (Magnus) for html-to-markdown - high-performance HTML to Markdown converter"
13
+ keywords = ["html", "markdown", "ruby", "magnus", "bindings"]
14
+ categories = ["api-bindings"]
15
+
16
+ [lib]
17
+ name = "html_to_markdown_rb"
18
+ crate-type = ["cdylib", "rlib"]
19
+
20
+ [features]
21
+ default = []
22
+
23
+ [dependencies]
24
+ html-to-markdown-rs = { version = "2.11.1", features = ["inline-images"] }
25
+ magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
26
+
27
+ [dev-dependencies]
28
+ pretty_assertions = "1.4"