pdf2htmlex 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +1 -0
- data/CHANGELOG.md +0 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +34 -0
- data/LICENSE.txt +21 -0
- data/README.md +125 -0
- data/Rakefile +4 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/example/demo1.pdf +0 -0
- data/lib/pdf2htmlex.rb +156 -0
- data/lib/pdf2htmlex/config.rb +66 -0
- data/lib/pdf2htmlex/version.rb +5 -0
- data/pdf2htmlex.gemspec +34 -0
- metadata +78 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: b41b15ee0a9766fab2d08d3f77ccfe92c10b811c883347e57964b9b6e79c9b09
|
4
|
+
data.tar.gz: f846a1c3abbf90d5cc01d4f05960af3da32d9d2c8566a37fa52d96996c4f7ee9
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 38cde8a2d5ba49fdb2fec7ab39843a62bfe3c3f96eb7c85dc71fe962f7832fac276c66daa514b74301169d4eb44674f752940203f1992fe5fe05fb5b01f96a54
|
7
|
+
data.tar.gz: 7c23bf805cebc8c9241ca1030bdcd025eed22909b0cf94438bae40a850fafa2bd362fdc31f67a281e28724794d61e29940f63eef93fd0d711c0417143efca338
|
data/.gitignore
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--require spec_helper
|
data/CHANGELOG.md
ADDED
File without changes
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
pdf2htmlex (0.1.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
diff-lcs (1.4.4)
|
10
|
+
rake (12.3.3)
|
11
|
+
rspec (3.10.0)
|
12
|
+
rspec-core (~> 3.10.0)
|
13
|
+
rspec-expectations (~> 3.10.0)
|
14
|
+
rspec-mocks (~> 3.10.0)
|
15
|
+
rspec-core (3.10.1)
|
16
|
+
rspec-support (~> 3.10.0)
|
17
|
+
rspec-expectations (3.10.1)
|
18
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
19
|
+
rspec-support (~> 3.10.0)
|
20
|
+
rspec-mocks (3.10.2)
|
21
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
22
|
+
rspec-support (~> 3.10.0)
|
23
|
+
rspec-support (3.10.2)
|
24
|
+
|
25
|
+
PLATFORMS
|
26
|
+
ruby
|
27
|
+
|
28
|
+
DEPENDENCIES
|
29
|
+
pdf2htmlex!
|
30
|
+
rake (~> 12.0)
|
31
|
+
rspec (~> 3.2)
|
32
|
+
|
33
|
+
BUNDLED WITH
|
34
|
+
2.1.4
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2021 Marcos G. Zimmermann
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
# Ruby wrapper for pdf2htmlEX
|
2
|
+
|
3
|
+
`pdf2htmlEX` converts PDF to HTML while retaining text, format and style as much as possible by making use of HTML5, JavaScript and modern CSS features. Even difficult content like PDFs with embedded fonts, multicolumn documents, scientific papers with complicated figures and mathematical formulas will mostly be represented correctly. Fallback mode generates HTML pages which do not require any JavaScript to view them correctly at the expense of a larger file size.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
You will need `pdf2htmlEX` installed. If you are using Mac OS X, I recommend installing pdf2htmlEX with Homebrew by running the following:
|
8
|
+
|
9
|
+
$ brew install pdf2htmlex
|
10
|
+
|
11
|
+
If you are using Debian, you can install the pdf2htmlex package like so:
|
12
|
+
|
13
|
+
$ apt install pdf2htmlex
|
14
|
+
|
15
|
+
If you have a Docker environment set up, you can run it through Docker instead:
|
16
|
+
|
17
|
+
$ alias pdf2htmlex="docker run -ti --rm -v ~/pdf:/tmp/pdf iapain/pdf2htmlex pdf2htmlEX"
|
18
|
+
|
19
|
+
After that just add this line to your application's Gemfile:
|
20
|
+
|
21
|
+
```ruby
|
22
|
+
gem 'pdf2htmlex'
|
23
|
+
```
|
24
|
+
|
25
|
+
And then execute:
|
26
|
+
|
27
|
+
$ bundle install
|
28
|
+
|
29
|
+
Or install it yourself as:
|
30
|
+
|
31
|
+
$ gem install pdf2htmlex
|
32
|
+
|
33
|
+
## Usage
|
34
|
+
|
35
|
+
Configuration:
|
36
|
+
|
37
|
+
```ruby
|
38
|
+
require 'pdf2htmlex'
|
39
|
+
|
40
|
+
Pdf2htmlex.config.executable = '/usr/local/bin/pdf2htmlEX' # Default value: 'pdf2htmlex'
|
41
|
+
```
|
42
|
+
|
43
|
+
To convert a file, call `Pdf2htmlex.convert` with the path of the PDF as the first argument, an optional output filename as the second argument, and any pdf2htmlEX options as keyword arguments. Example:
|
44
|
+
|
45
|
+
```ruby
|
46
|
+
2.7.0 (main):0 > output = Pdf2htmlex.convert('./example/demo1.pdf')
|
47
|
+
=> #<Pathname:/var/folders/rn/1xx0_xsd089fldtvtnq1tk7m0000gn/T/d20210409-96258-1pmrw5z/demo1.html>
|
48
|
+
2.7.0 (main):0 > output = Pdf2htmlex.convert('./example/demo1.pdf', 'sample.html')
|
49
|
+
=> #<Pathname:/var/folders/rn/1xx0_xsd089fldtvtnq1tk7m0000gn/T/d20210409-96258-14l7p6u/sample.html>
|
50
|
+
2.7.0 (main):0 > output = Pdf2htmlex.convert('./example/demo1.pdf', 'demo.html', dest_dir: '/tmp')
|
51
|
+
=> #<Pathname:/tmp/demo.html>
|
52
|
+
```
|
53
|
+
|
54
|
+
All options:
|
55
|
+
```ruby
|
56
|
+
first_page: '--first-page', # first page to convert (default: 1)
|
57
|
+
last_page: '--last-page', # last page to convert (default: 2147483647)
|
58
|
+
zoom: '--zoom', # zoom ratio
|
59
|
+
fit_width: '--fit-width', # fit width to <fp> pixels
|
60
|
+
fit_height: '--fit-height', # fit height to <fp> pixels
|
61
|
+
use_cropbox: '--use-cropbox', # use CropBox instead of MediaBox (default: 1)
|
62
|
+
hdpi: '--hdpi', # horizontal resolution for graphics in DPI (default: 144)
|
63
|
+
vdpi: '--vdpi', # vertical resolution for graphics in DPI (default: 144)
|
64
|
+
embed: '--embed', # specify which elements should be embedded into output
|
65
|
+
embed_css: '--embed-css', # embed CSS files into output (default: 1)
|
66
|
+
embed_font: '--embed-font', # embed font files into output (default: 1)
|
67
|
+
embed_image: '--embed-image', # embed image files into output (default: 1)
|
68
|
+
embed_javascript: '--embed-javascript', # embed JavaScript files into output (default: 1)
|
69
|
+
embed_outline: '--embed-outline', # embed outlines into output (default: 1)
|
70
|
+
split_pages: '--split-pages', # split pages into separate files (default: 0)
|
71
|
+
dest_dir: '--dest-dir', # specify destination directory (default: ".")
|
72
|
+
css_filename: '--css-filename', # filename of the generated css file (default: "")
|
73
|
+
page_filename: '--page-filename', # filename template for split pages (default: "")
|
74
|
+
outline_filename: '--outline-filename', # filename of the generated outline file (default: "")
|
75
|
+
process_nontext: '--process-nontext', # render graphics in addition to text (default: 1)
|
76
|
+
process_outline: '--process-outline', # show outline in HTML (default: 1)
|
77
|
+
process_annotation: '--process-annotation', # show annotation in HTML (default: 0)
|
78
|
+
process_form: '--process-form', # include text fields and radio buttons (default: 0)
|
79
|
+
printing: '--printing', # enable printing support (default: 1)
|
80
|
+
fallback: '--fallback', # output in fallback mode (default: 0)
|
81
|
+
tmp_file_size_limit: '--tmp-file-size-limit', # Maximum size (in KB) used by temporary files, -1 for no limit. (default: -1)
|
82
|
+
embed_external_font: '--embed-external-font', # embed local match for external fonts (default: 1)
|
83
|
+
font_format: '--font-format', # suffix for embedded font files (ttf,otf,woff,svg) (default: "woff")
|
84
|
+
decompose_ligature: '--decompose-ligature', # decompose ligatures, such as ﬁ -> fi (default: 0)
|
85
|
+
auto_hint: '--auto-hint', # use fontforge autohint on fonts without hints (default: 0)
|
86
|
+
external_hint_tool: '--external-hint-tool', # external tool for hinting fonts (overrides --auto-hint) (default: "")
|
87
|
+
stretch_narrow_glyph: '--stretch-narrow-glyph', # stretch narrow glyphs instead of padding them (default: 0)
|
88
|
+
squeeze_wide_glyph: '--squeeze-wide-glyph', # shrink wide glyphs instead of truncating them (default: 1)
|
89
|
+
override_fstype: '--override-fstype', # clear the fstype bits in TTF/OTF fonts (default: 0)
|
90
|
+
process_type3: '--process-type3', # convert Type 3 fonts for web (experimental) (default: 0)
|
91
|
+
heps: '--heps', # horizontal threshold for merging text, in pixels (default: 1)
|
92
|
+
veps: '--veps', # vertical threshold for merging text, in pixels (default: 1)
|
93
|
+
space_threshold: '--space-threshold', # word break threshold (threshold * em) (default: 0.125)
|
94
|
+
font_size_multiplier: '--font-size-multiplier', # a value greater than 1 increases the rendering accuracy (default: 4)
|
95
|
+
space_as_offset: '--space-as-offset', # treat space characters as offsets (default: 0)
|
96
|
+
tounicode: '--tounicode', # how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore) (default: 0)
|
97
|
+
optimize_text: '--optimize-text', # try to reduce the number of HTML elements used for text (default: 0)
|
98
|
+
correct_text_visibility: '--correct-text-visibility', # try to detect texts covered by other graphics and properly arrange them (default: 0)
|
99
|
+
bg_format: '--bg-format', # specify background image format (default: "png")
|
100
|
+
svg_node_count_limit: '--svg-node-count-limit', # if node count in a svg background image exceeds this limit, fall back this page to bitmap background; negative value means no limit. (default: -1)
|
101
|
+
svg_embed_bitmap: '--svg-embed-bitmap', # 1: embed bitmaps in svg background; 0: dump bitmaps to external files if possible. (default: 1)
|
102
|
+
owner_password: '--owner-password', # owner password (for encrypted files)
|
103
|
+
user_password: '--user-password', # user password (for encrypted files)
|
104
|
+
no_drm: '--no-drm', # override document DRM settings (default: 0)
|
105
|
+
clean_tmp: '--clean-tmp', # remove temporary files after conversion (default: 1)
|
106
|
+
tmp_dir: '--tmp-dir', # specify the location of temporary directory. (default: "/var/folders/rn/1xx0_xsd089fldtvtnq1tk7m0000gn/T/")
|
107
|
+
data_dir: '--data-dir', # specify data directory (default: "/usr/local/Cellar/pdf2htmlex/0.14.6_24/share/pdf2htmlEX")
|
108
|
+
debug: '--debug', # print debugging information (default: 0)
|
109
|
+
proof: '--proof', # texts are drawn on both text layer and background for proof. (default: 0)
|
110
|
+
```
|
111
|
+
|
112
|
+
## Development
|
113
|
+
|
114
|
+
After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
115
|
+
|
116
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
117
|
+
|
118
|
+
## Contributing
|
119
|
+
|
120
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/marcosgz/pdf2htmlex.
|
121
|
+
|
122
|
+
|
123
|
+
## License
|
124
|
+
|
125
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'pdf2htmlex'
|
6
|
+
|
7
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
8
|
+
# with your gem easier. You can also use a different console, if you like.
|
9
|
+
|
10
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
11
|
+
# require "pry"
|
12
|
+
# Pry.start
|
13
|
+
|
14
|
+
require 'irb'
|
15
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/example/demo1.pdf
ADDED
Binary file
|
data/lib/pdf2htmlex.rb
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# @see https://www.rubydoc.info/stdlib/open3/Open3.popen3
|
4
|
+
require 'open3'
|
5
|
+
# @see https://www.rubydoc.info/stdlib/pathname/Pathname
|
6
|
+
require 'pathname'
|
7
|
+
# @see https://ruby-doc.com/stdlib/libdoc/tmpdir/rdoc/Dir.html
|
8
|
+
require 'tmpdir'
|
9
|
+
|
10
|
+
require_relative 'pdf2htmlex/version'
|
11
|
+
require_relative 'pdf2htmlex/config'
|
12
|
+
|
13
|
+
# Ruby wrapper for the pdf2htmlEX tool
|
14
|
+
#
|
15
|
+
# `pdf2htmlEX` converts PDF to HTML while retaining text, format and style as much as
|
16
|
+
# possible by making use of HTML5, JavaScript and modern CSS features. Even difficult
|
17
|
+
# content like PDFs with embedded fonts, multicolumn documents, scientific papers with
|
18
|
+
# complicated figures and mathematical formulas will mostly be represented correctly.
|
19
|
+
# Fallback mode generates HTML pages which do not require any JavaScript to view them
|
20
|
+
# correctly at the expense of a larger file size.
|
21
|
+
#
|
22
|
+
# @see https://github.com/coolwanglu/pdf2htmlEX
|
23
|
+
module Pdf2htmlex
  # Raised when the pdf2htmlEX process exits unsuccessfully.
  class Error < StandardError; end

  extend self

  # Convert input PDF file to HTML
  #
  # When +:dest_dir+ is not given, a fresh temporary directory is created and
  # used as the destination. The output name is derived from +html_filename+
  # (or from +input_pdf+ when it is nil) and a numeric suffix is added if a
  # file with that name already exists in the destination directory.
  #
  # The "default" values listed below are the defaults applied by the
  # pdf2htmlEX tool itself when an option is omitted; this wrapper only
  # forwards the options that are explicitly passed with a non-nil value.
  # Options that are not known to this wrapper are silently ignored.
  #
  # @param input_pdf [String, Pathname] The path of PDF file
  # @param html_filename [String, Pathname, NilClass] The output HTML filename
  # @param options [Hash] List of pdf2htmlex options
  # @option options [Integer] :first_page (default: 1) First page to convert
  # @option options [Integer] :last_page (default: 2147483647) Last page to convert
  # @option options [Float] :zoom Zoom ratio
  # @option options [Float] :fit_width Fit width to <fp> pixels
  # @option options [Float] :fit_height Fit height to <fp> pixels
  # @option options [Boolean, Integer] :use_cropbox (default: true) Use CropBox instead of MediaBox
  # @option options [Float] :hdpi (default: 144) Horizontal resolution for graphics in DPI
  # @option options [Float] :vdpi (default: 144) Vertical resolution for graphics in DPI
  # @option options [String] :embed Specify which elements should be embedded into output
  # @option options [Boolean, Integer] :embed_css (default: true) Embed CSS files into output
  # @option options [Boolean, Integer] :embed_font (default: true) Embed font files into output
  # @option options [Boolean, Integer] :embed_image (default: true) Embed image files into output
  # @option options [Boolean, Integer] :embed_javascript (default: true) Embed JavaScript files into output
  # @option options [Boolean, Integer] :embed_outline (default: true) Embed outlines into output
  # @option options [Boolean, Integer] :split_pages (default: false) Split pages into separate files
  # @option options [String] :dest_dir (default: a new temporary directory) Specify destination directory
  # @option options [String] :css_filename (default: "") Filename of the generated css file
  # @option options [String] :page_filename (default: "") Filename template for split pages
  # @option options [String] :outline_filename (default: "") Filename of the generated outline file
  # @option options [Boolean, Integer] :process_nontext (default: true) Render graphics in addition to text
  # @option options [Boolean, Integer] :process_outline (default: true) Show outline in HTML
  # @option options [Boolean, Integer] :process_annotation (default: false) Show annotation in HTML
  # @option options [Boolean, Integer] :process_form (default: false) Include text fields and radio buttons
  # @option options [Boolean, Integer] :printing (default: true) Enable printing support
  # @option options [Boolean, Integer] :fallback (default: false) Output in fallback mode
  # @option options [Integer] :tmp_file_size_limit (default: -1) Maximum size (in KB) used by temporary files, -1 for no limit.
  # @option options [Boolean, Integer] :embed_external_font (default: true) Embed local match for external fonts
  # @option options [String] :font_format (default: "woff") Suffix for embedded font files (ttf,otf,woff,svg)
  # @option options [Boolean, Integer] :decompose_ligature (default: false) Decompose ligatures, such as the "fi" ligature -> "f" + "i"
  # @option options [Boolean, Integer] :auto_hint (default: false) Use fontforge autohint on fonts without hints
  # @option options [String] :external_hint_tool (default: "") External tool for hinting fonts (overrides --auto-hint)
  # @option options [Boolean, Integer] :stretch_narrow_glyph (default: false) Stretch narrow glyphs instead of padding them
  # @option options [Boolean, Integer] :squeeze_wide_glyph (default: true) Shrink wide glyphs instead of truncating them
  # @option options [Boolean, Integer] :override_fstype (default: false) Clear the fstype bits in TTF/OTF fonts
  # @option options [Boolean, Integer] :process_type3 (default: false) Convert Type 3 fonts for web (experimental)
  # @option options [Float] :heps (default: 1) Horizontal threshold for merging text, in pixels
  # @option options [Float] :veps (default: 1) Vertical threshold for merging text, in pixels
  # @option options [Float] :space_threshold (default: 0.125) Word break threshold (threshold * em)
  # @option options [Integer] :font_size_multiplier (default: 4) A value greater than 1 increases the rendering accuracy
  # @option options [Boolean, Integer] :space_as_offset (default: false) Treat space characters as offsets
  # @option options [Integer] :tounicode (default: 0) How to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)
  # @option options [Boolean, Integer] :optimize_text (default: false) Try to reduce the number of HTML elements used for text
  # @option options [Boolean, Integer] :correct_text_visibility (default: false) Try to detect texts covered by other graphics and properly arrange them
  # @option options [String] :bg_format (default: "png") Specify background image format
  # @option options [Integer] :svg_node_count_limit (default: -1) If node count in a svg background image exceeds this limit, fall back this page to bitmap background; negative value means no limit
  # @option options [Boolean, Integer] :svg_embed_bitmap (default: true) True: embed bitmaps in svg background; False: dump bitmaps to external files if possible.
  # @option options [String] :owner_password Owner password (for encrypted files)
  # @option options [String] :user_password User password (for encrypted files)
  # @option options [Boolean, Integer] :no_drm (default: false) Override document DRM settings
  # @option options [Boolean, Integer] :clean_tmp (default: true) Remove temporary files after conversion
  # @option options [String] :tmp_dir (default: machine specific, e.g. "/var/folders/rn/1xx0_xsd089fldtvtnq1tk7m0000gn/T/") Specify the location of temporary directory
  # @option options [String] :data_dir (default: installation specific, e.g. "/usr/local/Cellar/pdf2htmlex/0.14.6_24/share/pdf2htmlEX") Specify data directory
  # @option options [Boolean, Integer] :debug (default: false) Print debugging information
  # @option options [Boolean, Integer] :proof (default: false) Texts are drawn on both text layer and background for proof.
  # @raise [Pdf2htmlex::Error] when pdf2htmlEX exits with a non-zero status; the
  #   message carries the tool's stderr output and the command line that was run
  # @return [Pathname] The generated HTML file
  def convert(input_pdf, html_filename = nil, **options)
    options[:dest_dir] ||= make_tempdir

    output = output_html(options[:dest_dir], html_filename || input_pdf)
    command = normalize_options(options)
    # Process arguments must be Strings, so a Pathname input is stringified here.
    command << input_pdf.to_s
    # Only the basename is passed: the directory is already given via --dest-dir.
    command << output.basename.to_s

    run_command(command)

    output
  end

  private

  # Create a new temporary directory to receive the generated files.
  #
  # @return [String] path of the created directory
  def make_tempdir
    Dir.mktmpdir
  end

  # Generate a unique HTML filename within destination directory
  #
  # A trailing ".pdf", ".htm" or ".html" extension (case-insensitive) is
  # replaced by ".html". If that file already exists, ".1.html", ".2.html", ...
  # are tried until an unused name is found.
  #
  # @param base_dir [String] The directory path
  # @param target_filename [String, Pathname] the filename (or path) the output name is derived from
  # @return [Pathname] path of a file that did not exist in +base_dir+ when checked
  def output_html(base_dir, target_filename)
    # \z (not $) so that only the very end of the name is treated as the extension.
    basename = File.basename(target_filename).sub(/\.(pdf|html?)\z/i, '')
    filename = File.join(base_dir, "#{basename}.html")
    n = 0
    while File.exist?(filename)
      n += 1
      filename = File.join(base_dir, "#{basename}.#{n}.html")
    end
    Pathname.new(filename)
  end

  # Translate the Ruby option hash into a flat list of command-line arguments.
  #
  # Options with a nil value and options missing from CMD_OPTIONS are skipped.
  #
  # @param opts [Hash{Symbol => Object}] options as received by {#convert}
  # @return [Array<String>] alternating flag / value strings
  def normalize_options(opts)
    opts.each_with_object([]) do |(key, value), args|
      next if value.nil?
      next unless CMD_OPTIONS.key?(key)

      args << CMD_OPTIONS[key] << cast_value(value)
    end
  end

  # Convert an option value into the String expected on the command line.
  #
  # Procs are called and their result is converted; true/false become "1"/"0";
  # anything else is converted with +to_s+ (numbers and Pathnames would
  # otherwise be rejected when the process is spawned).
  #
  # @param value [Object]
  # @return [String]
  def cast_value(value)
    case value
    when Proc then cast_value(value.call)
    when true then '1'
    when false then '0'
    else value.to_s
    end
  end

  # Run the configured executable with the given arguments.
  #
  # @param command [Array<String>] arguments passed to the executable
  # @param input [String, IO, nil] optional data written to the process' stdin
  # @raise [Pdf2htmlex::Error] when the process does not exit successfully
  # @return [String] the captured standard output of the process
  def run_command(command, input = nil)
    # Build a new array instead of unshifting so the caller's array is untouched.
    full_command = [config.executable, *command]

    output, error, status = Open3.capture3(*full_command, binmode: true, stdin_data: input || '')

    unless status.success?
      raise Error, "pdf2htmlEX failed with: #{error}\nCommand: #{full_command.join(' ')}"
    end

    output
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Pdf2htmlex
  # Maps each option name accepted by Pdf2htmlex.convert to the corresponding
  # pdf2htmlEX command-line flag. The trailing comments reproduce the tool's
  # own help text; the "default" values are the tool's defaults, not values
  # applied by this library. Frozen so the shared table cannot be modified
  # by accident at runtime.
  CMD_OPTIONS = {
    first_page: '--first-page', # first page to convert (default: 1)
    last_page: '--last-page', # last page to convert (default: 2147483647)
    zoom: '--zoom', # zoom ratio
    fit_width: '--fit-width', # fit width to <fp> pixels
    fit_height: '--fit-height', # fit height to <fp> pixels
    use_cropbox: '--use-cropbox', # use CropBox instead of MediaBox (default: 1)
    hdpi: '--hdpi', # horizontal resolution for graphics in DPI (default: 144)
    vdpi: '--vdpi', # vertical resolution for graphics in DPI (default: 144)
    embed: '--embed', # specify which elements should be embedded into output
    embed_css: '--embed-css', # embed CSS files into output (default: 1)
    embed_font: '--embed-font', # embed font files into output (default: 1)
    embed_image: '--embed-image', # embed image files into output (default: 1)
    embed_javascript: '--embed-javascript', # embed JavaScript files into output (default: 1)
    embed_outline: '--embed-outline', # embed outlines into output (default: 1)
    split_pages: '--split-pages', # split pages into separate files (default: 0)
    dest_dir: '--dest-dir', # specify destination directory (default: ".")
    css_filename: '--css-filename', # filename of the generated css file (default: "")
    page_filename: '--page-filename', # filename template for split pages (default: "")
    outline_filename: '--outline-filename', # filename of the generated outline file (default: "")
    process_nontext: '--process-nontext', # render graphics in addition to text (default: 1)
    process_outline: '--process-outline', # show outline in HTML (default: 1)
    process_annotation: '--process-annotation', # show annotation in HTML (default: 0)
    process_form: '--process-form', # include text fields and radio buttons (default: 0)
    printing: '--printing', # enable printing support (default: 1)
    fallback: '--fallback', # output in fallback mode (default: 0)
    tmp_file_size_limit: '--tmp-file-size-limit', # Maximum size (in KB) used by temporary files, -1 for no limit. (default: -1)
    embed_external_font: '--embed-external-font', # embed local match for external fonts (default: 1)
    font_format: '--font-format', # suffix for embedded font files (ttf,otf,woff,svg) (default: "woff")
    decompose_ligature: '--decompose-ligature', # decompose ligatures, such as the "fi" ligature -> "f" + "i" (default: 0)
    auto_hint: '--auto-hint', # use fontforge autohint on fonts without hints (default: 0)
    external_hint_tool: '--external-hint-tool', # external tool for hinting fonts (overrides --auto-hint) (default: "")
    stretch_narrow_glyph: '--stretch-narrow-glyph', # stretch narrow glyphs instead of padding them (default: 0)
    squeeze_wide_glyph: '--squeeze-wide-glyph', # shrink wide glyphs instead of truncating them (default: 1)
    override_fstype: '--override-fstype', # clear the fstype bits in TTF/OTF fonts (default: 0)
    process_type3: '--process-type3', # convert Type 3 fonts for web (experimental) (default: 0)
    heps: '--heps', # horizontal threshold for merging text, in pixels (default: 1)
    veps: '--veps', # vertical threshold for merging text, in pixels (default: 1)
    space_threshold: '--space-threshold', # word break threshold (threshold * em) (default: 0.125)
    font_size_multiplier: '--font-size-multiplier', # a value greater than 1 increases the rendering accuracy (default: 4)
    space_as_offset: '--space-as-offset', # treat space characters as offsets (default: 0)
    tounicode: '--tounicode', # how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore) (default: 0)
    optimize_text: '--optimize-text', # try to reduce the number of HTML elements used for text (default: 0)
    correct_text_visibility: '--correct-text-visibility', # try to detect texts covered by other graphics and properly arrange them (default: 0)
    bg_format: '--bg-format', # specify background image format (default: "png")
    svg_node_count_limit: '--svg-node-count-limit', # if node count in a svg background image exceeds this limit, fall back this page to bitmap background; negative value means no limit. (default: -1)
    svg_embed_bitmap: '--svg-embed-bitmap', # 1: embed bitmaps in svg background; 0: dump bitmaps to external files if possible. (default: 1)
    owner_password: '--owner-password', # owner password (for encrypted files)
    user_password: '--user-password', # user password (for encrypted files)
    no_drm: '--no-drm', # override document DRM settings (default: 0)
    clean_tmp: '--clean-tmp', # remove temporary files after conversion (default: 1)
    tmp_dir: '--tmp-dir', # specify the location of temporary directory. (default is machine specific, e.g. "/var/folders/rn/1xx0_xsd089fldtvtnq1tk7m0000gn/T/")
    data_dir: '--data-dir', # specify data directory (default is installation specific, e.g. "/usr/local/Cellar/pdf2htmlex/0.14.6_24/share/pdf2htmlEX")
    debug: '--debug', # print debugging information (default: 0)
    proof: '--proof' # texts are drawn on both text layer and background for proof. (default: 0)
  }.freeze

  # Library settings. +executable+ is the name or path of the pdf2htmlEX
  # binary that gets invoked for conversions.
  Config = Struct.new(:executable)

  # Lazily built, memoized configuration object.
  #
  # @return [Config] settings whose +executable+ defaults to 'pdf2htmlex'
  def config
    @config ||= Config.new('pdf2htmlex')
  end
end
|
data/pdf2htmlex.gemspec
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'lib/pdf2htmlex/version'
|
4
|
+
|
5
|
+
# Gem specification for pdf2htmlex: identity, metadata, packaged files and
# dependencies used when the gem is built and published.
Gem::Specification.new do |spec|
  spec.name = 'pdf2htmlex'
  spec.version = Pdf2htmlex::VERSION
  spec.authors = ['Marcos G. Zimmermann']
  spec.email = ['mgzmaster@gmail.com']

  spec.summary = 'Ruby wrapper for the pdf2htmlEX that convert PDF files to HTML'
  spec.description = 'pdf2htmlEX helps to convert PDF files into HTML. This simple library uses the pdf2htmlEX tool under the hood.'
  spec.homepage = 'https://github.com/marcosgz/pdf2htmlex'
  spec.license = 'MIT'
  spec.required_ruby_version = Gem::Requirement.new('>= 2.3.0')

  spec.metadata['allowed_push_host'] = 'https://rubygems.org'

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = 'https://github.com/marcosgz/pdf2htmlex'
  # Points at the CHANGELOG.md shipped in the repository root (the previous
  # value had the repository and path segments swapped).
  spec.metadata['changelog_uri'] = 'https://github.com/marcosgz/pdf2htmlex/blob/main/CHANGELOG.md'

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']
  # External (non-gem) requirement: the pdf2htmlEX binary must be installed.
  spec.requirements << 'pdf2htmlEX'

  spec.add_development_dependency 'rspec', '~> 3.2'
end
|
metadata
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: pdf2htmlex
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Marcos G. Zimmermann
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2021-04-09 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '3.2'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '3.2'
|
27
|
+
description: pdf2htmlEX helps to convert PDF files into HTML. This simple library
|
28
|
+
uses the pdf2htmlEX tool under the hood.
|
29
|
+
email:
|
30
|
+
- mgzmaster@gmail.com
|
31
|
+
executables: []
|
32
|
+
extensions: []
|
33
|
+
extra_rdoc_files: []
|
34
|
+
files:
|
35
|
+
- ".gitignore"
|
36
|
+
- ".rspec"
|
37
|
+
- CHANGELOG.md
|
38
|
+
- Gemfile
|
39
|
+
- Gemfile.lock
|
40
|
+
- LICENSE.txt
|
41
|
+
- README.md
|
42
|
+
- Rakefile
|
43
|
+
- bin/console
|
44
|
+
- bin/setup
|
45
|
+
- example/demo1.pdf
|
46
|
+
- lib/pdf2htmlex.rb
|
47
|
+
- lib/pdf2htmlex/config.rb
|
48
|
+
- lib/pdf2htmlex/version.rb
|
49
|
+
- pdf2htmlex.gemspec
|
50
|
+
homepage: https://github.com/marcosgz/pdf2htmlex
|
51
|
+
licenses:
|
52
|
+
- MIT
|
53
|
+
metadata:
|
54
|
+
allowed_push_host: https://rubygems.org
|
55
|
+
homepage_uri: https://github.com/marcosgz/pdf2htmlex
|
56
|
+
source_code_uri: https://github.com/marcosgz/pdf2htmlex
|
57
|
+
changelog_uri: https://github.com/marcosgz/blob/main/pdf2htmlex
|
58
|
+
post_install_message:
|
59
|
+
rdoc_options: []
|
60
|
+
require_paths:
|
61
|
+
- lib
|
62
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: 2.3.0
|
67
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
requirements:
|
73
|
+
- pdf2htmlEX
|
74
|
+
rubygems_version: 3.1.2
|
75
|
+
signing_key:
|
76
|
+
specification_version: 4
|
77
|
+
summary: Ruby wrapper for the pdf2htmlEX that convert PDF files to HTML
|
78
|
+
test_files: []
|