hexapdf 1.5.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +54 -0
  3. data/README.md +8 -7
  4. data/examples/022-outline.rb +5 -1
  5. data/examples/032-acro_form_list_and_fill.rb +47 -0
  6. data/examples/033-text_extraction.rb +34 -0
  7. data/lib/hexapdf/cli/debug_info.rb +98 -0
  8. data/lib/hexapdf/cli/images.rb +2 -2
  9. data/lib/hexapdf/cli/info.rb +2 -0
  10. data/lib/hexapdf/cli/inspect.rb +5 -1
  11. data/lib/hexapdf/cli.rb +2 -0
  12. data/lib/hexapdf/configuration.rb +8 -0
  13. data/lib/hexapdf/content/canvas.rb +1 -1
  14. data/lib/hexapdf/content/smart_text_extractor.rb +305 -0
  15. data/lib/hexapdf/content.rb +2 -0
  16. data/lib/hexapdf/digital_signature/signing/default_handler.rb +1 -15
  17. data/lib/hexapdf/digital_signature/signing/signed_data_creator.rb +21 -8
  18. data/lib/hexapdf/document.rb +7 -3
  19. data/lib/hexapdf/encryption/security_handler.rb +3 -1
  20. data/lib/hexapdf/filter/brotli_decode.rb +88 -0
  21. data/lib/hexapdf/filter.rb +1 -0
  22. data/lib/hexapdf/font/cmap.rb +10 -6
  23. data/lib/hexapdf/font/true_type/builder.rb +1 -1
  24. data/lib/hexapdf/font/true_type/font.rb +13 -0
  25. data/lib/hexapdf/font/true_type/subsetter.rb +7 -2
  26. data/lib/hexapdf/font/true_type/table/directory.rb +5 -0
  27. data/lib/hexapdf/font/true_type.rb +1 -0
  28. data/lib/hexapdf/layout/style.rb +6 -2
  29. data/lib/hexapdf/parser.rb +29 -4
  30. data/lib/hexapdf/revision.rb +6 -2
  31. data/lib/hexapdf/task/pdfa.rb +108 -1
  32. data/lib/hexapdf/type/acro_form/field.rb +4 -1
  33. data/lib/hexapdf/type/acro_form/form.rb +4 -0
  34. data/lib/hexapdf/type/acro_form/text_field.rb +4 -2
  35. data/lib/hexapdf/type/annotations/widget.rb +9 -0
  36. data/lib/hexapdf/type/document_security_store.rb +80 -0
  37. data/lib/hexapdf/type/page.rb +11 -0
  38. data/lib/hexapdf/type.rb +1 -0
  39. data/lib/hexapdf/version.rb +1 -1
  40. data/test/data/pdfa/mismatching_glyph_widths_cidfont_type2.pdf +0 -0
  41. data/test/hexapdf/content/test_smart_text_extractor.rb +129 -0
  42. data/test/hexapdf/digital_signature/common.rb +19 -5
  43. data/test/hexapdf/digital_signature/signing/test_signed_data_creator.rb +29 -4
  44. data/test/hexapdf/digital_signature/test_signatures.rb +3 -3
  45. data/test/hexapdf/encryption/test_security_handler.rb +7 -5
  46. data/test/hexapdf/filter/test_brotli_decode.rb +34 -0
  47. data/test/hexapdf/font/true_type/table/test_directory.rb +5 -3
  48. data/test/hexapdf/font/true_type/test_builder.rb +9 -0
  49. data/test/hexapdf/font/true_type/test_font.rb +17 -3
  50. data/test/hexapdf/font/true_type/test_subsetter.rb +4 -3
  51. data/test/hexapdf/task/test_pdfa.rb +72 -0
  52. data/test/hexapdf/test_document.rb +13 -0
  53. data/test/hexapdf/test_parser.rb +55 -3
  54. data/test/hexapdf/test_revision.rb +27 -6
  55. data/test/hexapdf/type/acro_form/test_field.rb +5 -0
  56. data/test/hexapdf/type/acro_form/test_form.rb +6 -0
  57. data/test/hexapdf/type/acro_form/test_text_field.rb +7 -1
  58. data/test/hexapdf/type/annotations/test_widget.rb +11 -0
  59. data/test/hexapdf/type/test_page.rb +8 -0
  60. data/test/test_helper.rb +6 -0
  61. metadata +41 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: de7c1790b3c958a91f071b5c20063eafea93fed12a034b89890242fec25c3026
4
- data.tar.gz: 0e201dc452930a2a81461be5bf9cd27d2749b92815498c06b37a1e2635a20d7d
3
+ metadata.gz: 04f2a87f1aaa95513275432d718996b7d598fc15e476f6999f6b6fe9f29cd0f8
4
+ data.tar.gz: 539d2b0e984db4ca4095bf0aad5208fbbdff5a08acc80d270a6b1c824f12c87e
5
5
  SHA512:
6
- metadata.gz: 1d1b13a5c28c83ca8ec4730cfc0af3016ceb14831c16587b000d8b69d0c7482d166bed21542f4929a8e4614fc732208c5670451a34339776b78a66dea8374949
7
- data.tar.gz: 309e3aa2a80ec92b4fd35e72e9ab0c114fe4022467be9fb6fb5805085d6616ea8301ab30345a7322b009aeeffa5d6b052abfb8f0bafbaa9ca8b2223af3a6b223
6
+ metadata.gz: c35f8b0267ef60c6392ae99d8c001e4d6b5e18ea1f5a62132d44bbf865d52cc8a9b08436e107c35a01d3a6edaeb7be9bcede20931a255416d4ea4d07778f8fc0
7
+ data.tar.gz: bfdedefe99c534d62b11f406b447902ea6824758153448ebfba35d0e456850134ba36a6cb2c97d668983e8a5b5b96bf0ab0a03c6136f2478a7717a0e7bb0933b
data/CHANGELOG.md CHANGED
@@ -1,3 +1,57 @@
1
+ ## 1.7.0 - 2026-04-13
2
+
3
+ ### Added
4
+
5
+ * Smart text extraction for retrieving layouted text from pages
6
+ * Support for digitally signing with ECDSA keys
7
+ * Support for digitally signing with DSA keys
8
+ * Support for BrotliDecode filter
9
+ * [HexaPDF::Type::DocumentSecurityStore] and
10
+ [HexaPDF::Type::ValidationRelatedInformation]
11
+
12
+ ### Changed
13
+
14
+ * **Breaking change**: [HexaPDF::Document#unwrap] to not unwrap streams
15
+ * Automatic detection of digital signature size to account for small deviations
16
+ * [HexaPDF::Type::AcroForm::Form#fill] to ignore password fields
17
+ * [HexaPDF::Type::AcroForm::TextField] validation to convert invalid Symbol
18
+ values to String
19
+ * [HexaPDF::Type::Annotations::Widget] validation to also validate a widget as a
20
+ field if necessary
21
+ * PDF/A task to include a fix for mismatching glyph widths for Type 2 CID fonts
22
+
23
+ ### Fixed
24
+
25
+ * Writing of PDF documents with an invalid value for the /Info dictionary
26
+ * Subsetting of TrueType fonts in case compound glyphs are themselves compound
27
+
28
+
29
+ ## 1.6.0 - 2026-02-10
30
+
31
+ ### Added
32
+
33
+ * CLI command `hexapdf debug-info` for creating debugging information,
34
+ especially for malformed files
35
+
36
+ ### Changed
37
+
38
+ * Optimized decoding character codes with a CMap to drastically lower memory
39
+ usage
40
+ * CLI command `hexapdf inspect rev` to show whether the cross-reference table
41
+ was reconstructed
42
+
43
+ ### Fixed
44
+
45
+ * Path generation for image extraction in CLI command `hexapdf images`
46
+ * Handling of certain invalid PDFs where the generation number of an object
47
+ identifier doesn't match its cross-reference section value
48
+ * AES 256bit encryption to include unnecessary field /Length in encryption
49
+ dictionary to work around buggy PDF libraries
50
+ * Parsing of invalid /Filter and /DecodeParms stream keys in case they resolve
51
+ to a recursive structure
52
+ * [HexaPDF::Type::AcroForm::Field#each_widget] to only yield widget objects
53
+
54
+
1
55
  ## 1.5.0 - 2025-12-08
2
56
 
3
57
  ### Added
data/README.md CHANGED
@@ -13,7 +13,7 @@ In short, it allows
13
13
  * **securing** PDF files by encrypting or signing them and
14
14
  * **optimizing** PDF files for smaller file size or other criteria.
15
15
 
16
- HexaPDF is available under two license, the AGPL and a commercial license, see the [License
16
+ HexaPDF is available under two licenses, the AGPL and a commercial license, see the [License
17
17
  section](#License) for details.
18
18
 
19
19
 
@@ -93,12 +93,13 @@ with example graphics and PDF files and tightly integrated into the rest of the
93
93
 
94
94
  ## Requirements and Installation
95
95
 
96
- Since HexaPDF is written in Ruby, a working Ruby installation is needed - see the
97
- [official installation documentation][rbinstall] for details. Note that you need Ruby version 2.6 or
98
- higher as prior versions are not supported!
96
+ Since HexaPDF is written in Ruby, a working Ruby installation is needed - see the [official
97
+ installation documentation][rbinstall] for details. Note that you need Ruby version 3.0 or higher as
98
+ prior versions are not supported!
99
99
 
100
- HexaPDF works on all Ruby implementations that are CRuby compatible, e.g. TruffleRuby, and on any
101
- platform supported by Ruby (Linux, macOS, Windows, ...).
100
+ HexaPDF works on all Ruby implementations that are CRuby compatible and on any platform supported by
101
+ Ruby (Linux, macOS, Windows, ...). Implementations like JRuby and TruffleRuby should work but
102
+ HexaPDF is not actively tested against them.
102
103
 
103
104
  Apart from Ruby itself the HexaPDF library has only one external dependency `geom2d` which is
104
105
  written and provided by the HexaPDF authors. The `hexapdf` application has an additional dependency
@@ -117,7 +118,7 @@ Prawn is a **library for generating content**.
117
118
 
118
119
  To be more specific, it is easily possible to read an existing PDF with HexaPDF and modify parts of
119
120
  it before writing it out again. The modifications can be to the PDF object structure like removing
120
- superfluous annotations or the the content itself.
121
+ superfluous annotations or the content itself.
121
122
 
122
123
  Prawn has no such functionality. There is basic support for using a PDF as a template using the
123
124
  `pdf-reader` and `prawn-template` gems but support is very limited. However, Prawn has a very
@@ -10,7 +10,11 @@
10
10
  require 'hexapdf'
11
11
 
12
12
  doc = HexaPDF::Document.new
13
- 6.times { doc.pages.add }
13
+ 6.times do |i|
14
+ doc.pages.add.canvas.
15
+ font("Helvetica", size: 150).
16
+ text("Page #{i + 1}", at: [10, 660])
17
+ end
14
18
 
15
19
  doc.outline.add_item("Main") do |main|
16
20
  main.add_item("Page 1", destination: 0)
@@ -0,0 +1,47 @@
1
+ # # PDF Forms - List and fill fields
2
+ #
3
+ # This example shows how to list the form fields of an interactive PDF form and
4
+ # how to fill out the form.
5
+ #
6
+ # The output file from the [PDF forms](acro_form.html) example can be used as
7
+ # input.
8
+ #
9
+ # One way to list and fill a PDF form is to use the [HexaPDF CLI with the 'form'
10
+ # command](/documentation/hexapdf.1.html#form). Here, however, we are doing it
11
+ # with the HexaPDF API.
12
+ #
13
+ # Usage:
14
+ # : `ruby acro_form_list_and_fill.rb [INPUT.PDF]`
15
+ #
16
+
17
+ require 'base64'
18
+ require 'hexapdf'
19
+
20
+ doc = HexaPDF::Document.open(ARGV[0] || 'acro_form.pdf')
21
+ exit unless doc.acro_form
22
+
23
+ puts "Listing all form fields:"
24
+ doc.acro_form.each_field do |field|
25
+ puts "#{field.full_field_name} (#{field.concrete_field_type})"
26
+ end
27
+
28
+ # We are using this to generate some values for existing text fields. In the
29
+ # real world one would be getting the values from the user.
30
+ puts "\nFilling in the text fields with random values:"
31
+ values = {}
32
+ doc.acro_form.each_field do |field|
33
+ next unless field.field_type == :Tx
34
+ value = Base64.encode64(field.full_field_name).strip
35
+ value = if field.key?(:MaxLen)
36
+ value[0, field[:MaxLen]]
37
+ else
38
+ "Value #{field.field_type} #{value}"
39
+ end
40
+ values[field.full_field_name] = value
41
+ puts "#{field.full_field_name}: #{value}"
42
+ end
43
+
44
+ # Now actually fill out the form with the values
45
+ doc.acro_form.fill(values)
46
+
47
+ doc.write('acro_form_list_and_fill.pdf', optimize: true)
@@ -0,0 +1,34 @@
1
+ # # Text Extraction
2
+ #
3
+ # This example shows how to extract layouted text from a page.
4
+ #
5
+ # It uses the provided input PDF or creates a small sample PDF as input. Then it
6
+ # extracts the text for each page and creates new pages with the extracted text
7
+ # in a fixed-width font.
8
+ #
9
+ # Usage:
10
+ # : `ruby text_extraction.rb [INPUT.PDF]`
11
+ #
12
+
13
+ require 'hexapdf'
14
+
15
+ # Use the input PDF or create a sample PDF.
16
+ if ARGV.length > 0
17
+ doc = HexaPDF::Document.open(ARGV[0])
18
+ else
19
+ composer = HexaPDF::Composer.new do |pdf|
20
+ pdf.lorem_ipsum(count: 3, padding: [0, 0, 20])
21
+ pdf.lorem_ipsum(padding: [0, 50, 20], text_indent: 40)
22
+ pdf.lorem_ipsum(count: 2)
23
+ end
24
+ doc = composer.document
25
+ end
26
+
27
+ # Extract the text of the existing pages and add new pages with the extracted text
28
+ doc.pages.count.times do |index|
29
+ text = doc.pages[index].extract_text
30
+ doc.pages.add.canvas.font('/usr/share/fonts/truetype/freefont/FreeMono.ttf', size: 6).
31
+ text(text, at: [10, 820])
32
+ end
33
+
34
+ doc.write('text_extraction.pdf', optimize: true)
@@ -0,0 +1,98 @@
1
+ # -*- encoding: utf-8; frozen_string_literal: true -*-
2
+ #
3
+ #--
4
+ # This file is part of HexaPDF.
5
+ #
6
+ # HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
7
+ # Copyright (C) 2014-2025 Thomas Leitner
8
+ #
9
+ # HexaPDF is free software: you can redistribute it and/or modify it
10
+ # under the terms of the GNU Affero General Public License version 3 as
11
+ # published by the Free Software Foundation with the addition of the
12
+ # following permission added to Section 15 as permitted in Section 7(a):
13
+ # FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
14
+ # THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
15
+ # INFRINGEMENT OF THIRD PARTY RIGHTS.
16
+ #
17
+ # HexaPDF is distributed in the hope that it will be useful, but WITHOUT
18
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
20
+ # License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Affero General Public License
23
+ # along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
24
+ #
25
+ # The interactive user interfaces in modified source and object code
26
+ # versions of HexaPDF must display Appropriate Legal Notices, as required
27
+ # under Section 5 of the GNU Affero General Public License version 3.
28
+ #
29
+ # In accordance with Section 7(b) of the GNU Affero General Public
30
+ # License, a covered work must retain the producer line in every PDF that
31
+ # is created or manipulated using HexaPDF.
32
+ #
33
+ # If the GNU Affero General Public License doesn't fit your need,
34
+ # commercial licenses are available at <https://gettalong.at/hexapdf/>.
35
+ #++
36
+
37
+ require 'hexapdf/cli/command'
38
+
39
+ module HexaPDF
40
+ module CLI
41
+
42
+ # Creates debugging information for adding to an issue.
43
+ class DebugInfo < Command
44
+
45
+ def initialize #:nodoc:
46
+ super('debug-info', takes_commands: false)
47
+ short_desc("Create debug information for a PDF file")
48
+ long_desc(<<~EOF)
49
+ Creates debug information for a possibly malformed PDF file that can be attached to an
50
+ issue.
51
+
52
+ Two files are created: anonymized-FILE where all strings are replaced with zeroes and
53
+ debug_info.txt with additional debug information.
54
+ EOF
55
+
56
+ options.on("--password PASSWORD", "-p", String,
57
+ "The password for decryption. Use - for reading from standard input.") do |pwd|
58
+ @password = (pwd == '-' ? read_password : pwd)
59
+ end
60
+
61
+ @password = nil
62
+ end
63
+
64
+ def execute(file) #:nodoc:
65
+ output_name = "anonymized-#{file}"
66
+ puts "Creating anonymized file '#{output_name}'"
67
+ data = File.binread(file)
68
+ data.gsub!(/(>>\s*stream\s*)(.*?)(\s*endstream)/m) {|m| "#{$1}#{'0' * $2.length}#{$3}" }
69
+ data.gsub!(/([^<]<)([0-9A-Fa-f#{Tokenizer::WHITESPACE}]*?)>/m) {|m| "#{$1}#{'0' * $2.length}>" }
70
+ data.gsub!(/\((.*?)\)/m) {|m| "(#{'0' * $1.length})" }
71
+ File.binwrite(output_name, data)
72
+
73
+ debug_info = +''
74
+ puts "Collecting debug information in debug_info.txt"
75
+ begin
76
+ output = capture_output { HexaPDF::CLI::Application.new.parse(['info', '--check', file]) }
77
+ debug_info << "Output:\n"<< output
78
+ rescue
79
+ debug_info << "Error collecting info: #{$!.message}\n"
80
+ end
81
+ File.write('debug_info.txt', debug_info)
82
+ end
83
+
84
+ private
85
+
86
+ def capture_output
87
+ stdout, stderr = $stdout, $stderr
88
+ $stdout = $stderr = StringIO.new
89
+ yield
90
+ $stdout.string
91
+ ensure
92
+ $stdout, $stderr = stdout, stderr
93
+ end
94
+
95
+ end
96
+
97
+ end
98
+ end
@@ -147,7 +147,7 @@ module HexaPDF
147
147
  # Extracts the images with the given indices.
148
148
  def extract_images(doc)
149
149
  FileUtils.mkdir_p(File.dirname("#{@prefix}filename"))
150
- prefix = File.directory?(@prefix) ? @prefix : "@{prefix}-"
150
+ prefix = File.directory?(@prefix) ? @prefix : "#{@prefix}-"
151
151
 
152
152
  done = Set.new
153
153
  count = total = 0
@@ -157,7 +157,7 @@ module HexaPDF
157
157
  info = image.info
158
158
  if info.writable
159
159
  count += 1
160
- path = "#{@prefix}#{index}.#{image.info.extension}"
160
+ path = "#{prefix}#{index}.#{image.info.extension}"
161
161
  maybe_raise_on_existing_file(path)
162
162
  if command_parser.verbosity_info?
163
163
  puts "Extracting image #{index} (#{image.width}x#{image.height}, " \
@@ -137,6 +137,8 @@ module HexaPDF
137
137
  end
138
138
  elsif doc.encrypted?
139
139
  output_line("Encrypted", "yes (no or wrong password given)")
140
+ else
141
+ output_line("Encrypted", "no")
140
142
  end
141
143
 
142
144
  if doc.revisions.parser.linearized?
@@ -293,6 +293,10 @@ module HexaPDF
293
293
  IO.copy_stream(@doc.revisions.parser.io, $stdout, length, 0)
294
294
  else
295
295
  puts "Document has #{@doc.revisions.count} revision#{@doc.revisions.count == 1 ? '' : 's'}"
296
+ if @doc.revisions.parser.reconstructed? && @doc.revisions.count == 1 &&
297
+ @doc.revisions.current == @doc.revisions.parser.reconstructed_revision
298
+ puts "Document cross-reference table has been reconstructed"
299
+ end
296
300
  revision_information do |rev, index, count, signature, end_offset|
297
301
  type = if rev.trailer[:XRefStm]
298
302
  "xref table + stream"
@@ -415,7 +419,7 @@ module HexaPDF
415
419
  sig = signatures[rev]
416
420
  if sig
417
421
  end_index = sig[:ByteRange][-2] + sig[:ByteRange][-1]
418
- else
422
+ elsif rev != @doc.revisions.parser.reconstructed_revision
419
423
  io.seek(startxrefs[index], IO::SEEK_SET)
420
424
  buffer = ''.b
421
425
  while io.pos < startxrefs[index + 1]
data/lib/hexapdf/cli.rb CHANGED
@@ -49,6 +49,7 @@ require 'hexapdf/cli/image2pdf'
49
49
  require 'hexapdf/cli/form'
50
50
  require 'hexapdf/cli/fonts'
51
51
  require 'hexapdf/cli/usage'
52
+ require 'hexapdf/cli/debug_info'
52
53
  require 'hexapdf/version'
53
54
  require 'hexapdf/document'
54
55
 
@@ -125,6 +126,7 @@ module HexaPDF
125
126
  add_command(HexaPDF::CLI::Form.new)
126
127
  add_command(HexaPDF::CLI::Fonts.new)
127
128
  add_command(HexaPDF::CLI::Usage.new)
129
+ add_command(HexaPDF::CLI::DebugInfo.new)
128
130
  add_command(CmdParse::HelpCommand.new)
129
131
  version_command = CmdParse::VersionCommand.new(add_switches: false)
130
132
  add_command(version_command)
@@ -559,6 +559,7 @@ module HexaPDF
559
559
  JPXDecode: 'HexaPDF::Filter::PassThrough',
560
560
  Crypt: 'HexaPDF::Filter::Crypt',
561
561
  Encryption: 'HexaPDF::Filter::Encryption',
562
+ BrotliDecode: 'HexaPDF::Filter::BrotliDecode',
562
563
  },
563
564
  'font.default' => 'Times',
564
565
  'font.fallback' => ['ZapfDingbats', 'Symbol'],
@@ -636,6 +637,11 @@ module HexaPDF
636
637
  #
637
638
  # See PDF2.0 s8.6
638
639
  #
640
+ # filter.brotli.compression::
641
+ # Specifies the compression level that should be used with the BrotliDecode filter. The level
642
+ # can range from 0 (no compression), 1 (best speed) to 11 (best compression). The default
643
+ # value is 8 which is a good compromise between speed and resulting size.
644
+ #
639
645
  # filter.flate.compression::
640
646
  # Specifies the compression level that should be used with the FlateDecode filter. The level
641
647
  # can range from 0 (no compression), 1 (best speed) to 9 (best compression, default).
@@ -754,6 +760,8 @@ module HexaPDF
754
760
  MCR: 'HexaPDF::Type::MarkedContentReference',
755
761
  OBJR: 'HexaPDF::Type::ObjectReference',
756
762
  Measure: 'HexaPDF::Type::Measure',
763
+ DSS: 'HexaPDF::Type::DocumentSecurityStore',
764
+ VRI: 'HexaPDF::Type::DocumentSecurityStore::ValidationRelatedInformation',
757
765
  },
758
766
  'object.subtype_map' => {
759
767
  nil => {
@@ -895,7 +895,7 @@ module HexaPDF
895
895
  #
896
896
  # * Any other string is treated as a color name. HexaPDF supports CSS Color Module Level 3
897
897
  # color names (see https://www.w3.org/TR/css-color-3/#svg-color) as well as HexaPDF design
898
- # colors.
898
+ # colors. See ColorSpace::COLOR_NAMES for the list of supported names.
899
899
  #
900
900
  # * Four numeric arguments specify a CMYK color (see ColorSpace::DeviceCMYK::Color).
901
901
  #