hexapdf 1.5.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +54 -0
- data/README.md +8 -7
- data/examples/022-outline.rb +5 -1
- data/examples/032-acro_form_list_and_fill.rb +47 -0
- data/examples/033-text_extraction.rb +34 -0
- data/lib/hexapdf/cli/debug_info.rb +98 -0
- data/lib/hexapdf/cli/images.rb +2 -2
- data/lib/hexapdf/cli/info.rb +2 -0
- data/lib/hexapdf/cli/inspect.rb +5 -1
- data/lib/hexapdf/cli.rb +2 -0
- data/lib/hexapdf/configuration.rb +8 -0
- data/lib/hexapdf/content/canvas.rb +1 -1
- data/lib/hexapdf/content/smart_text_extractor.rb +305 -0
- data/lib/hexapdf/content.rb +2 -0
- data/lib/hexapdf/digital_signature/signing/default_handler.rb +1 -15
- data/lib/hexapdf/digital_signature/signing/signed_data_creator.rb +21 -8
- data/lib/hexapdf/document.rb +7 -3
- data/lib/hexapdf/encryption/security_handler.rb +3 -1
- data/lib/hexapdf/filter/brotli_decode.rb +88 -0
- data/lib/hexapdf/filter.rb +1 -0
- data/lib/hexapdf/font/cmap.rb +10 -6
- data/lib/hexapdf/font/true_type/builder.rb +1 -1
- data/lib/hexapdf/font/true_type/font.rb +13 -0
- data/lib/hexapdf/font/true_type/subsetter.rb +7 -2
- data/lib/hexapdf/font/true_type/table/directory.rb +5 -0
- data/lib/hexapdf/font/true_type.rb +1 -0
- data/lib/hexapdf/layout/style.rb +6 -2
- data/lib/hexapdf/parser.rb +29 -4
- data/lib/hexapdf/revision.rb +6 -2
- data/lib/hexapdf/task/pdfa.rb +108 -1
- data/lib/hexapdf/type/acro_form/field.rb +4 -1
- data/lib/hexapdf/type/acro_form/form.rb +4 -0
- data/lib/hexapdf/type/acro_form/text_field.rb +4 -2
- data/lib/hexapdf/type/annotations/widget.rb +9 -0
- data/lib/hexapdf/type/document_security_store.rb +80 -0
- data/lib/hexapdf/type/page.rb +11 -0
- data/lib/hexapdf/type.rb +1 -0
- data/lib/hexapdf/version.rb +1 -1
- data/test/data/pdfa/mismatching_glyph_widths_cidfont_type2.pdf +0 -0
- data/test/hexapdf/content/test_smart_text_extractor.rb +129 -0
- data/test/hexapdf/digital_signature/common.rb +19 -5
- data/test/hexapdf/digital_signature/signing/test_signed_data_creator.rb +29 -4
- data/test/hexapdf/digital_signature/test_signatures.rb +3 -3
- data/test/hexapdf/encryption/test_security_handler.rb +7 -5
- data/test/hexapdf/filter/test_brotli_decode.rb +34 -0
- data/test/hexapdf/font/true_type/table/test_directory.rb +5 -3
- data/test/hexapdf/font/true_type/test_builder.rb +9 -0
- data/test/hexapdf/font/true_type/test_font.rb +17 -3
- data/test/hexapdf/font/true_type/test_subsetter.rb +4 -3
- data/test/hexapdf/task/test_pdfa.rb +72 -0
- data/test/hexapdf/test_document.rb +13 -0
- data/test/hexapdf/test_parser.rb +55 -3
- data/test/hexapdf/test_revision.rb +27 -6
- data/test/hexapdf/type/acro_form/test_field.rb +5 -0
- data/test/hexapdf/type/acro_form/test_form.rb +6 -0
- data/test/hexapdf/type/acro_form/test_text_field.rb +7 -1
- data/test/hexapdf/type/annotations/test_widget.rb +11 -0
- data/test/hexapdf/type/test_page.rb +8 -0
- data/test/test_helper.rb +6 -0
- metadata +41 -4
data/lib/hexapdf/layout/style.rb
CHANGED
|
@@ -211,6 +211,8 @@ module HexaPDF
|
|
|
211
211
|
attr_reader :width
|
|
212
212
|
|
|
213
213
|
# The colors of each edge. See Quad.
|
|
214
|
+
#
|
|
215
|
+
# See: HexaPDF::Content::ColorSpace.device_color_from_specification
|
|
214
216
|
attr_reader :color
|
|
215
217
|
|
|
216
218
|
# The styles of each edge. See Quad.
|
|
@@ -897,7 +899,7 @@ module HexaPDF
|
|
|
897
899
|
#
|
|
898
900
|
# The color used for filling (e.g. text), defaults to black.
|
|
899
901
|
#
|
|
900
|
-
# See: HexaPDF::Content::
|
|
902
|
+
# See: HexaPDF::Content::ColorSpace.device_color_from_specification
|
|
901
903
|
#
|
|
902
904
|
# Examples:
|
|
903
905
|
#
|
|
@@ -926,7 +928,7 @@ module HexaPDF
|
|
|
926
928
|
#
|
|
927
929
|
# The color used for stroking (e.g. text outlines), defaults to black.
|
|
928
930
|
#
|
|
929
|
-
# See: HexaPDF::Content::
|
|
931
|
+
# See: HexaPDF::Content::ColorSpace.device_color_from_specification
|
|
930
932
|
#
|
|
931
933
|
# Examples:
|
|
932
934
|
#
|
|
@@ -1175,6 +1177,8 @@ module HexaPDF
|
|
|
1175
1177
|
#
|
|
1176
1178
|
# The color used for backgrounds, defaults to +nil+ (i.e. no background).
|
|
1177
1179
|
#
|
|
1180
|
+
# See: HexaPDF::Content::ColorSpace.device_color_from_specification
|
|
1181
|
+
#
|
|
1178
1182
|
# Examples:
|
|
1179
1183
|
#
|
|
1180
1184
|
# #>pdf-composer100
|
data/lib/hexapdf/parser.rb
CHANGED
|
@@ -112,8 +112,18 @@ module HexaPDF
|
|
|
112
112
|
end
|
|
113
113
|
|
|
114
114
|
if xref_entry.oid != 0 && (oid != xref_entry.oid || gen != xref_entry.gen)
|
|
115
|
-
|
|
116
|
-
|
|
115
|
+
msg = "The oid,gen (#{oid},#{gen}) values of the indirect object don't match " \
|
|
116
|
+
"the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref"
|
|
117
|
+
# Some invalid PDFs contain entries where the generation number in the xref is different
|
|
118
|
+
# from the one found in the indirect object. If the file were reconstructed the generation
|
|
119
|
+
# number from the indirect object itself would be used.
|
|
120
|
+
# To gracefully handle such invalid PDFs they need to have a single revision.
|
|
121
|
+
# The other code part that handles this is in Revision#object.
|
|
122
|
+
if oid == xref_entry.oid && @document.revisions.count == 1
|
|
123
|
+
maybe_raise(msg, pos: xref_entry.pos)
|
|
124
|
+
else
|
|
125
|
+
raise_malformed(msg)
|
|
126
|
+
end
|
|
117
127
|
end
|
|
118
128
|
|
|
119
129
|
if obj.kind_of?(Reference)
|
|
@@ -209,9 +219,24 @@ module HexaPDF
|
|
|
209
219
|
tok = @tokenizer.next_token
|
|
210
220
|
|
|
211
221
|
object[:Length] = length
|
|
222
|
+
if object.key?(:Filter)
|
|
223
|
+
begin
|
|
224
|
+
object[:Filter] = @document.unwrap(object[:Filter])
|
|
225
|
+
rescue HexaPDF::Error
|
|
226
|
+
maybe_raise("Invalid /Filter entry for stream", pos: @tokenizer.pos)
|
|
227
|
+
object.delete(:Filter)
|
|
228
|
+
end
|
|
229
|
+
end
|
|
230
|
+
if object.key?(:DecodeParms)
|
|
231
|
+
begin
|
|
232
|
+
object[:DecodeParms] = @document.unwrap(object[:DecodeParms])
|
|
233
|
+
rescue HexaPDF::Error
|
|
234
|
+
maybe_raise("Invalid /DecodeParms entry for stream", pos: @tokenizer.pos)
|
|
235
|
+
object.delete(:DecodeParms)
|
|
236
|
+
end
|
|
237
|
+
end
|
|
212
238
|
stream = StreamData.new(@tokenizer.io, offset: pos, length: length,
|
|
213
|
-
filter:
|
|
214
|
-
decode_parms: @document.unwrap(object[:DecodeParms]))
|
|
239
|
+
filter: object[:Filter], decode_parms: object[:DecodeParms])
|
|
215
240
|
end
|
|
216
241
|
|
|
217
242
|
unless tok.kind_of?(Tokenizer::Token) && tok == 'endobj'
|
data/lib/hexapdf/revision.rb
CHANGED
|
@@ -128,6 +128,11 @@ module HexaPDF
|
|
|
128
128
|
@objects[oid, gen]
|
|
129
129
|
elsif (xref_entry = @xref_section[oid, gen])
|
|
130
130
|
load_object(xref_entry)
|
|
131
|
+
elsif (xref_entry = @xref_section[oid]) && (obj = load_object(xref_entry))&.gen == gen
|
|
132
|
+
# This branch handles invalid PDFs with a single revision containing xref entries where the
|
|
133
|
+
# gen doesn't match the gen of the indirect object. Also see the special handling in
|
|
134
|
+
# Parser#load_object.
|
|
135
|
+
obj
|
|
131
136
|
else
|
|
132
137
|
nil
|
|
133
138
|
end
|
|
@@ -219,8 +224,7 @@ module HexaPDF
|
|
|
219
224
|
seen = {}
|
|
220
225
|
@objects.each {|oid, _gen, data| seen[oid] = true; yield(data) }
|
|
221
226
|
@xref_section.each do |oid, _gen, data|
|
|
222
|
-
|
|
223
|
-
yield(@objects[oid] || load_object(data))
|
|
227
|
+
yield(@objects[oid] || load_object(data)) unless seen.key?(oid)
|
|
224
228
|
end
|
|
225
229
|
@all_objects_loaded = true
|
|
226
230
|
end
|
data/lib/hexapdf/task/pdfa.rb
CHANGED
|
@@ -40,6 +40,7 @@ require 'hexapdf/content/parser'
|
|
|
40
40
|
require 'hexapdf/content/operator'
|
|
41
41
|
require 'hexapdf/type/xref_stream'
|
|
42
42
|
require 'hexapdf/type/object_stream'
|
|
43
|
+
require 'hexapdf/font/true_type'
|
|
43
44
|
|
|
44
45
|
module HexaPDF
|
|
45
46
|
module Task
|
|
@@ -51,6 +52,13 @@ module HexaPDF
|
|
|
51
52
|
# * prevents the Standard 14 PDF fonts to be used.
|
|
52
53
|
# * adds an appropriate output intent if none is set.
|
|
53
54
|
# * adds the necessary PDF/A metadata properties.
|
|
55
|
+
#
|
|
56
|
+
# Additionally, it applies fixes to the document so that the structures and content of
|
|
57
|
+
# non-conforming PDFs are corrected. See ::call for more information on the available fixes.
|
|
58
|
+
#
|
|
59
|
+
# Note that you should use a PDF/A validation tool like veraPDF (https://verapdf.org/) to ensure
|
|
60
|
+
# that the resulting files conform to the PDF/A specification because not all documents can be
|
|
61
|
+
# fixed at the moment.
|
|
54
62
|
module PDFA
|
|
55
63
|
|
|
56
64
|
# Performs the necessary tasks to make the document PDF/A compatible.
|
|
@@ -58,7 +66,22 @@ module HexaPDF
|
|
|
58
66
|
# +level+::
|
|
59
67
|
# Specifies the PDF/A conformance level that should be used. Can be one of the following
|
|
60
68
|
# strings: 2b, 2u, 3b, 3u.
|
|
61
|
-
|
|
69
|
+
#
|
|
70
|
+
# +fixes+::
|
|
71
|
+
# Specifies the fixes that should be applied when converting a non-conforming PDF. If a
|
|
72
|
+
# document is created with HexaPDF but also includes parts of loaded documents, this
|
|
73
|
+
# argument has to be set to +:all+.
|
|
74
|
+
#
|
|
75
|
+
# Can be +:default+ (which is also the default value), +:all+ or an array with one or more
|
|
76
|
+
# fix names.
|
|
77
|
+
#
|
|
78
|
+
# +:default+:: Applies all fixes if the document was loaded from a file. Otherwise applies
|
|
79
|
+
# only those fixes necessary for files created with HexaPDF.
|
|
80
|
+
#
|
|
81
|
+
# +:all+:: Applies all available fixes.
|
|
82
|
+
#
|
|
83
|
+
# +:fix_glyph_widths+:: Corrects mismatching width information in fonts.
|
|
84
|
+
def self.call(doc, level: '3u', fixes: :default)
|
|
62
85
|
unless level.match?(/\A[23][bu]\z/)
|
|
63
86
|
raise ArgumentError, "The given PDF/A conformance level '#{level}' is not supported"
|
|
64
87
|
end
|
|
@@ -68,6 +91,15 @@ module HexaPDF
|
|
|
68
91
|
doc.metadata.property('pdfaid', 'part', part)
|
|
69
92
|
doc.metadata.property('pdfaid', 'conformance', conformance.upcase)
|
|
70
93
|
add_srgb_icc_output_intent(doc) unless doc.catalog.key?(:OutputIntents)
|
|
94
|
+
|
|
95
|
+
fixes = if fixes == :all || (fixes == :default && doc.revisions.parser)
|
|
96
|
+
ALL_FIXES
|
|
97
|
+
elsif fixes == :default
|
|
98
|
+
ALL_FIXES - FIXES_FOR_LOADED_DOCUMENTS
|
|
99
|
+
else
|
|
100
|
+
fixes
|
|
101
|
+
end
|
|
102
|
+
fixes.each {|fix| send(fix, doc) }
|
|
71
103
|
end
|
|
72
104
|
end
|
|
73
105
|
|
|
@@ -81,6 +113,81 @@ module HexaPDF
|
|
|
81
113
|
]
|
|
82
114
|
end
|
|
83
115
|
|
|
116
|
+
ALL_FIXES = [:fix_glyph_widths] # :nodoc:
|
|
117
|
+
|
|
118
|
+
FIXES_FOR_LOADED_DOCUMENTS = [:fix_glyph_widths] # :nodoc:
|
|
119
|
+
|
|
120
|
+
# Makes the glyph widths stored in the embedded fonts the same as the ones specified in the
|
|
121
|
+
# PDF font data structures.
|
|
122
|
+
#
|
|
123
|
+
# Note: Currently only handles Type 2 CIDFonts.
|
|
124
|
+
def self.fix_glyph_widths(doc) # :nodoc:
|
|
125
|
+
# Step 1: Collect all CIDs together with their respective fonts
|
|
126
|
+
processor = CIDCollector.new
|
|
127
|
+
doc.pages.each do |page|
|
|
128
|
+
page.process_contents(processor)
|
|
129
|
+
page.each_annotation do |annotation|
|
|
130
|
+
next unless (appearance = annotation.appearance)
|
|
131
|
+
appearance.process_contents(processor, original_resources: page.resources)
|
|
132
|
+
end
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
# Step 2: Process all found fonts
|
|
136
|
+
processor.map.each do |font_object, all_cids|
|
|
137
|
+
next if all_cids.empty?
|
|
138
|
+
font = HexaPDF::Font::TrueType::Font.new(StringIO.new(font_object.font_file.stream))
|
|
139
|
+
cid_to_gid = cid_to_gid_mapping(font_object)
|
|
140
|
+
|
|
141
|
+
# Process all found CIDs by comparing their width with the ones defined in the font and
|
|
142
|
+
# correcting the font if necessary.
|
|
143
|
+
raw_hmtx = font[:hmtx].raw_data
|
|
144
|
+
width_conversion_factor = 1000.0 / font[:head].units_per_em
|
|
145
|
+
all_cids.each do |cid|
|
|
146
|
+
cid_width = font_object.width(cid)
|
|
147
|
+
gid = cid_to_gid[cid]
|
|
148
|
+
gid_width = font[:hmtx][gid].advance_width * width_conversion_factor
|
|
149
|
+
next if (cid_width - gid_width).abs.round <= 1
|
|
150
|
+
raw_hmtx[4 * gid, 2] = [(cid_width / width_conversion_factor).round].pack('n')
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
font_object.font_file.stream = font.build('hmtx' => raw_hmtx)
|
|
154
|
+
end
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
# Processes the contents of a stream and collects the CIDs for each composite font.
|
|
158
|
+
class CIDCollector < HexaPDF::Content::Processor
|
|
159
|
+
|
|
160
|
+
# The mapping from the composite font's descendant font to the set of used CIDs.
|
|
161
|
+
attr_reader :map
|
|
162
|
+
|
|
163
|
+
def initialize(*) # :nodoc:
|
|
164
|
+
super
|
|
165
|
+
@map = Hash.new {|h, k| h[k] = Set.new }
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
def show_text(data) # :nodoc:
|
|
169
|
+
font = graphics_state.font
|
|
170
|
+
return unless font[:Subtype] == :Type0 && font.descendant_font[:Subtype] == :CIDFontType2
|
|
171
|
+
|
|
172
|
+
Array(data).each do |item|
|
|
173
|
+
next if item.kind_of?(Numeric)
|
|
174
|
+
@map[font.descendant_font].merge(font.decode(item))
|
|
175
|
+
end
|
|
176
|
+
end
|
|
177
|
+
alias show_text_with_positioning show_text
|
|
178
|
+
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
# Returns an object responding to #[] that maps CIDs to GIDs for Type 2 CIDFonts.
|
|
182
|
+
def self.cid_to_gid_mapping(font)
|
|
183
|
+
if font[:CIDToGIDMap] == :Identity
|
|
184
|
+
proc {|cid| cid }
|
|
185
|
+
else
|
|
186
|
+
font[:CIDToGIDMap].stream.unpack('n*')
|
|
187
|
+
end
|
|
188
|
+
end
|
|
189
|
+
private_class_method :cid_to_gid_mapping
|
|
190
|
+
|
|
84
191
|
end
|
|
85
192
|
|
|
86
193
|
end
|
|
@@ -291,7 +291,10 @@ module HexaPDF
|
|
|
291
291
|
if embedded_widget?
|
|
292
292
|
yield(document.wrap(self))
|
|
293
293
|
elsif terminal_field?
|
|
294
|
-
self[:Kids]&.each
|
|
294
|
+
self[:Kids]&.each do |kid|
|
|
295
|
+
kid = document.wrap(kid)
|
|
296
|
+
yield(kid) if kid.type == :Annot && kid[:Subtype] == :Widget
|
|
297
|
+
end
|
|
295
298
|
end
|
|
296
299
|
|
|
297
300
|
unless direct_only
|
|
@@ -412,6 +412,8 @@ module HexaPDF
|
|
|
412
412
|
#
|
|
413
413
|
# * For radio buttons the value needs to be a String or a Symbol representing the name of
|
|
414
414
|
# the radio button widget to select.
|
|
415
|
+
#
|
|
416
|
+
# * Values for password fields are ignored as they should not be stored in the PDF.
|
|
415
417
|
def fill(data)
|
|
416
418
|
data.each do |field_name, value|
|
|
417
419
|
field = field_by_name(field_name)
|
|
@@ -427,6 +429,8 @@ module HexaPDF
|
|
|
427
429
|
when /\A(?:n(o)?|f(alse)?)\z/ then false
|
|
428
430
|
else value
|
|
429
431
|
end
|
|
432
|
+
when :password_field
|
|
433
|
+
# Ignore the value
|
|
430
434
|
else
|
|
431
435
|
raise HexaPDF::Error, "AcroForm field type #{field.concrete_field_type} not yet supported"
|
|
432
436
|
end
|
|
@@ -344,8 +344,10 @@ module HexaPDF
|
|
|
344
344
|
super
|
|
345
345
|
|
|
346
346
|
if self[:V] && !(self[:V].kind_of?(String) || self[:V].kind_of?(HexaPDF::Stream))
|
|
347
|
-
|
|
348
|
-
|
|
347
|
+
correctable = self[:V].kind_of?(Symbol)
|
|
348
|
+
yield("Text field doesn't contain text but an object of type #{self[:V].class}", correctable)
|
|
349
|
+
return unless correctable
|
|
350
|
+
self[:V] = self[:V].to_s
|
|
349
351
|
end
|
|
350
352
|
if (max_len = self[:MaxLen]) && field_value && field_value.length > max_len
|
|
351
353
|
correctable = true
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# -*- encoding: utf-8; frozen_string_literal: true -*-
|
|
2
|
+
#
|
|
3
|
+
#--
|
|
4
|
+
# This file is part of HexaPDF.
|
|
5
|
+
#
|
|
6
|
+
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
|
|
7
|
+
# Copyright (C) 2014-2025 Thomas Leitner
|
|
8
|
+
#
|
|
9
|
+
# HexaPDF is free software: you can redistribute it and/or modify it
|
|
10
|
+
# under the terms of the GNU Affero General Public License version 3 as
|
|
11
|
+
# published by the Free Software Foundation with the addition of the
|
|
12
|
+
# following permission added to Section 15 as permitted in Section 7(a):
|
|
13
|
+
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
|
|
14
|
+
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
|
|
15
|
+
# INFRINGEMENT OF THIRD PARTY RIGHTS.
|
|
16
|
+
#
|
|
17
|
+
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
|
|
18
|
+
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
19
|
+
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
|
|
20
|
+
# License for more details.
|
|
21
|
+
#
|
|
22
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
23
|
+
# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
|
|
24
|
+
#
|
|
25
|
+
# The interactive user interfaces in modified source and object code
|
|
26
|
+
# versions of HexaPDF must display Appropriate Legal Notices, as required
|
|
27
|
+
# under Section 5 of the GNU Affero General Public License version 3.
|
|
28
|
+
#
|
|
29
|
+
# In accordance with Section 7(b) of the GNU Affero General Public
|
|
30
|
+
# License, a covered work must retain the producer line in every PDF that
|
|
31
|
+
# is created or manipulated using HexaPDF.
|
|
32
|
+
#
|
|
33
|
+
# If the GNU Affero General Public License doesn't fit your need,
|
|
34
|
+
# commercial licenses are available at <https://gettalong.at/hexapdf/>.
|
|
35
|
+
#++
|
|
36
|
+
|
|
37
|
+
require 'hexapdf/dictionary'
|
|
38
|
+
|
|
39
|
+
module HexaPDF
|
|
40
|
+
module Type
|
|
41
|
+
|
|
42
|
+
# The document security store (DSS) dictionary contains data needed for verifying digital
|
|
43
|
+
# signatures.
|
|
44
|
+
#
|
|
45
|
+
# See: PDF2.0 s12.8.4.3
|
|
46
|
+
class DocumentSecurityStore < Dictionary
|
|
47
|
+
|
|
48
|
+
# The validation-related information (VRI) dictionary contains validation information for one
|
|
49
|
+
# signature. It signifies that the signature has been validated using this information.
|
|
50
|
+
#
|
|
51
|
+
# See: PDF2.0 s12.8.4.4
|
|
52
|
+
class ValidationRelatedInformation < Dictionary
|
|
53
|
+
|
|
54
|
+
define_type :VRI
|
|
55
|
+
|
|
56
|
+
define_field :Type, type: Symbol, default: type
|
|
57
|
+
define_field :Cert, type: PDFArray
|
|
58
|
+
define_field :CRL, type: PDFArray
|
|
59
|
+
define_field :OCSP, type: PDFArray
|
|
60
|
+
define_field :TU, type: PDFDate
|
|
61
|
+
define_field :TS, type: Stream
|
|
62
|
+
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
define_type :DSS
|
|
66
|
+
|
|
67
|
+
define_field :Type, type: Symbol, default: type
|
|
68
|
+
define_field :VRI, type: Dictionary
|
|
69
|
+
define_field :Certs, type: PDFArray
|
|
70
|
+
define_field :OCSPs, type: PDFArray
|
|
71
|
+
define_field :CRLs, type: PDFArray
|
|
72
|
+
define_field :SW, type: Symbol, default: :A, allowed_values: [:A, :B, :S, :N]
|
|
73
|
+
define_field :S, type: Symbol, default: :P, allowed_values: [:A, :P]
|
|
74
|
+
define_field :A, type: PDFArray, default: [0.5, 0.5]
|
|
75
|
+
define_field :FB, type: Boolean, default: false, version: '1.5'
|
|
76
|
+
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
end
|
|
80
|
+
end
|
data/lib/hexapdf/type/page.rb
CHANGED
|
@@ -395,6 +395,17 @@ module HexaPDF
|
|
|
395
395
|
Content::Parser.parse(contents, processor)
|
|
396
396
|
end
|
|
397
397
|
|
|
398
|
+
# Extracts the layouted text from the page.
|
|
399
|
+
#
|
|
400
|
+
# See HexaPDF::Content::SmartTextExtractor.layout_text_runs for the available +options+.
|
|
401
|
+
def extract_text(**options)
|
|
402
|
+
processor = Content::SmartTextExtractor::TextRunProcessor.new
|
|
403
|
+
process_contents(processor)
|
|
404
|
+
box = box(:media)
|
|
405
|
+
Content::SmartTextExtractor.layout_text_runs(processor.text_runs, box.width, box.height,
|
|
406
|
+
**options)
|
|
407
|
+
end
|
|
408
|
+
|
|
398
409
|
# Returns the index of the page in the page tree.
|
|
399
410
|
def index
|
|
400
411
|
idx = 0
|
data/lib/hexapdf/type.rb
CHANGED
|
@@ -89,6 +89,7 @@ module HexaPDF
|
|
|
89
89
|
autoload(:MarkedContentReference, 'hexapdf/type/marked_content_reference')
|
|
90
90
|
autoload(:ObjectReference, 'hexapdf/type/object_reference')
|
|
91
91
|
autoload(:Measure, 'hexapdf/type/measure')
|
|
92
|
+
autoload(:DocumentSecurityStore, 'hexapdf/type/document_security_store')
|
|
92
93
|
|
|
93
94
|
end
|
|
94
95
|
|
data/lib/hexapdf/version.rb
CHANGED
|
Binary file
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
require 'test_helper'
|
|
4
|
+
require 'hexapdf/content/smart_text_extractor'
|
|
5
|
+
require 'hexapdf/document'
|
|
6
|
+
|
|
7
|
+
describe HexaPDF::Content::SmartTextExtractor::TextRunCollector::TextRun do
|
|
8
|
+
it "has various accessors" do
|
|
9
|
+
text_run = HexaPDF::Content::SmartTextExtractor::TextRunCollector::TextRun.new('s', 1, 2, 3, 5)
|
|
10
|
+
assert_equal('s', text_run.string)
|
|
11
|
+
assert_equal(2, text_run.width)
|
|
12
|
+
assert_equal(3, text_run.height)
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
describe HexaPDF::Content::SmartTextExtractor::TextRunProcessor do
|
|
17
|
+
it "turns glyphs into TextRun objects" do
|
|
18
|
+
processor = HexaPDF::Content::SmartTextExtractor::TextRunProcessor.new
|
|
19
|
+
doc = HexaPDF::Document.new
|
|
20
|
+
page = doc.pages.add
|
|
21
|
+
page.canvas.font('Helvetica', size: 10).
|
|
22
|
+
text('Te', at: [10, 500]).
|
|
23
|
+
text_matrix(0.866, -0.5, 0.5, 0.866, 0, 0).
|
|
24
|
+
text('Te')
|
|
25
|
+
page.process_contents(processor)
|
|
26
|
+
assert_equal([['T', 10, 497.75, 16.11, 509.31], ['e', 16.11, 497.75, 21.67, 509.31],
|
|
27
|
+
["T", -1.125, -5.0035, 9.94626, 8.06246],
|
|
28
|
+
["e", 4.16626, -7.7835, 14.761220000000002, 5.00746]],
|
|
29
|
+
processor.text_runs.map(&:to_a))
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
describe HexaPDF::Content::SmartTextExtractor do
|
|
34
|
+
def text_run(str, left, bottom, right, top)
|
|
35
|
+
HexaPDF::Content::SmartTextExtractor::TextRunCollector::TextRun.new(str, left, bottom, right, top)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def layout_runs(runs, width = 595, height = 842, **options)
|
|
39
|
+
runs = runs.map {|args| text_run(*args) }
|
|
40
|
+
HexaPDF::Content::SmartTextExtractor.layout_text_runs(runs, width, height, **options)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
it "works for a page with no text" do
|
|
44
|
+
assert_equal('', layout_runs([]))
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
it "works for a single run on the left side of the page" do
|
|
48
|
+
assert_equal('test', layout_runs([['test', 0, 100, 20, 110]]))
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
it "works for a single run not on the left side of the page" do
|
|
52
|
+
assert_equal('test', layout_runs([['test', 50, 100, 70, 110]]))
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
it "preserves the relative indent" do
|
|
56
|
+
assert_equal("Hello\n World", layout_runs([['Hello', 50, 100, 70, 110],
|
|
57
|
+
['World', 70, 80, 90, 100]]))
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
it "combines text runs if they have the same top/bottom and there is less than 1pt between them" do
|
|
61
|
+
x = +'Hello'
|
|
62
|
+
assert_equal('HelloWorld', layout_runs([[x, 50, 100, 60, 110],
|
|
63
|
+
['World', 60, 100, 70, 110]]))
|
|
64
|
+
assert_equal('HelloWorld', x)
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
it "preserves the space between two runs" do
|
|
68
|
+
assert_equal('Hello World', layout_runs([['Hello', 50, 100, 70, 110],
|
|
69
|
+
['World', 72, 100, 92, 110]]))
|
|
70
|
+
assert_equal('Hello World', layout_runs([['Hello', 50, 100, 70, 110],
|
|
71
|
+
['World', 80, 100, 100, 110]]))
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
it "inserts a space after very narrow text parts if necessary" do
|
|
75
|
+
assert_equal('Hello World!', layout_runs([['Hello', 50, 100, 60, 110],
|
|
76
|
+
['World!', 63, 100, 87, 110]]))
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
it "preserves the visual horizontal ordering of two runs" do
|
|
80
|
+
assert_equal('Hello World', layout_runs([['World', 72, 100, 92, 110],
|
|
81
|
+
['Hello', 50, 100, 70, 110]]))
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
it "preserves the visual vertical ordering of two runs" do
|
|
85
|
+
assert_equal("Hello\nWorld", layout_runs([['World', 50, 80, 70, 100],
|
|
86
|
+
['Hello', 50, 100, 70, 110]]))
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
it "inserts a single blank line between paragraphs" do
|
|
90
|
+
assert_equal("Hello\nWorld\n\nHere",
|
|
91
|
+
layout_runs([['Hello', 50, 100, 70, 110],
|
|
92
|
+
['World', 50, 90, 70, 100],
|
|
93
|
+
['Here', 50, 65, 66, 75]]))
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
it "inserts multiply lines for large gaps between paragraphs" do
|
|
97
|
+
assert_equal("Hello\nWorld\nHere\n\n\n\n\n\n\nFoot",
|
|
98
|
+
layout_runs([['Hello', 50, 100, 70, 110],
|
|
99
|
+
['World', 50, 90, 70, 100],
|
|
100
|
+
['Here', 50, 80, 70, 90],
|
|
101
|
+
['Foot', 50, 10, 66, 20]]))
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
it "ignores outliers when calculating the normal line spacing" do
|
|
105
|
+
assert_equal("Hello\nWorld\n\n\n\nHere",
|
|
106
|
+
layout_runs([['Hello', 50, 100, 70, 110],
|
|
107
|
+
['World', 50, 90, 70, 100],
|
|
108
|
+
['Here', 50, 50, 70, 60]]))
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
it "can use a different line_tolerance_factor" do
|
|
112
|
+
assert_equal("HelloWorld",
|
|
113
|
+
layout_runs([['Hello', 50, 100, 70, 110],
|
|
114
|
+
['World', 50, 90, 70, 100]], line_tolerance_factor: 1))
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
it "can use a different paragraph_distance_threshold" do
|
|
118
|
+
assert_equal("Hello\n\nWorld",
|
|
119
|
+
layout_runs([['Hello', 50, 100, 70, 110],
|
|
120
|
+
['World', 50, 90, 70, 100]], paragraph_distance_threshold: 1))
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
it "can use a different large_distance_threshold" do
|
|
124
|
+
assert_equal("Hello\nWorld\n\nHere",
|
|
125
|
+
layout_runs([['Hello', 50, 100, 70, 110],
|
|
126
|
+
['World', 50, 90, 70, 100],
|
|
127
|
+
['Here', 50, 50, 66, 60]], large_distance_threshold: 8))
|
|
128
|
+
end
|
|
129
|
+
end
|
|
@@ -13,7 +13,7 @@ module HexaPDF
|
|
|
13
13
|
@ca_certificate ||=
|
|
14
14
|
begin
|
|
15
15
|
cert = create_cert(name: '/C=AT/O=HexaPDF/CN=HexaPDF Test Root CA', serial: 0,
|
|
16
|
-
public_key: ca_key
|
|
16
|
+
public_key: ca_key)
|
|
17
17
|
add_extensions(cert, cert, ca_key, is_ca: true, key_usage: 'cRLSign,keyCertSign')
|
|
18
18
|
cert
|
|
19
19
|
end
|
|
@@ -27,7 +27,7 @@ module HexaPDF
|
|
|
27
27
|
@signer_certificate ||=
|
|
28
28
|
begin
|
|
29
29
|
cert = create_cert(name: '/CN=RSA signer/DC=gettalong', serial: 2,
|
|
30
|
-
public_key: signer_key
|
|
30
|
+
public_key: signer_key, issuer: ca_certificate)
|
|
31
31
|
add_extensions(cert, ca_certificate, ca_key, key_usage: 'digitalSignature')
|
|
32
32
|
cert
|
|
33
33
|
end
|
|
@@ -37,7 +37,7 @@ module HexaPDF
|
|
|
37
37
|
@non_repudiation_signer_certificate ||=
|
|
38
38
|
begin
|
|
39
39
|
cert = create_cert(name: '/CN=Non repudiation signer/DC=gettalong', serial: 2,
|
|
40
|
-
public_key: signer_key
|
|
40
|
+
public_key: signer_key, issuer: ca_certificate)
|
|
41
41
|
add_extensions(cert, ca_certificate, ca_key, key_usage: 'nonRepudiation')
|
|
42
42
|
cert
|
|
43
43
|
end
|
|
@@ -51,7 +51,21 @@ module HexaPDF
|
|
|
51
51
|
@dsa_signer_certificate ||=
|
|
52
52
|
begin
|
|
53
53
|
cert = create_cert(name: '/CN=DSA signer/DC=gettalong', serial: 3,
|
|
54
|
-
public_key: dsa_signer_key
|
|
54
|
+
public_key: dsa_signer_key, issuer: ca_certificate)
|
|
55
|
+
add_extensions(cert, ca_certificate, ca_key, key_usage: 'digitalSignature')
|
|
56
|
+
cert
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
def ecdsa_signer_key
|
|
61
|
+
@ecdsa_signer_key ||= OpenSSL::PKey::EC.generate('sect163k1')
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
def ecdsa_signer_certificate
|
|
65
|
+
@ecdsa_signer_certificate ||=
|
|
66
|
+
begin
|
|
67
|
+
cert = create_cert(name: '/CN=ECDSA signer/DC=gettalong', serial: 4,
|
|
68
|
+
public_key: ecdsa_signer_key, issuer: ca_certificate)
|
|
55
69
|
add_extensions(cert, ca_certificate, ca_key, key_usage: 'digitalSignature')
|
|
56
70
|
cert
|
|
57
71
|
end
|
|
@@ -61,7 +75,7 @@ module HexaPDF
|
|
|
61
75
|
@timestamp_certificate ||=
|
|
62
76
|
begin
|
|
63
77
|
cert = create_cert(name: '/CN=timestamp/DC=gettalong', serial: 3,
|
|
64
|
-
public_key: signer_key
|
|
78
|
+
public_key: signer_key, issuer: ca_certificate)
|
|
65
79
|
add_extensions(cert, ca_certificate, ca_key, key_usage: 'digitalSignature',
|
|
66
80
|
extended_key_usage: 'timeStamping')
|
|
67
81
|
cert
|
|
@@ -154,10 +154,35 @@ describe HexaPDF::DigitalSignature::Signing::SignedDataCreator do
|
|
|
154
154
|
assert_equal(CERTIFICATES.signer_key.sign('SHA256', to_sign), @structure.value[5].value)
|
|
155
155
|
end
|
|
156
156
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
157
|
+
describe "DSA key pair" do
|
|
158
|
+
before do
|
|
159
|
+
@signed_data.certificate = CERTIFICATES.dsa_signer_certificate
|
|
160
|
+
@signed_data.key = CERTIFICATES.dsa_signer_key
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
it "works with a DSA key pair" do
|
|
164
|
+
@structure = @signed_data.create("data").value[1].value[4].value[0]
|
|
165
|
+
assert_equal('2.16.840.1.101.3.4.3.2', @structure.value[4].value[0].value)
|
|
166
|
+
assert_nil(@structure.value[4].value[1].value)
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
it "fails if the digest algorithm is not SHA256" do
|
|
170
|
+
@signed_data.digest_algorithm = 'sha512'
|
|
171
|
+
assert_raises { @signed_data.create("data") }
|
|
172
|
+
end
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
describe "ECDSA key pair" do
|
|
176
|
+
before do
|
|
177
|
+
@signed_data.certificate = CERTIFICATES.ecdsa_signer_certificate
|
|
178
|
+
@signed_data.key = CERTIFICATES.ecdsa_signer_key
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
it "works with an ECDSA key pair" do
|
|
182
|
+
structure = @signed_data.create("data").value[1].value[4].value[0]
|
|
183
|
+
assert_equal('1.2.840.10045.4.3.2', structure.value[4].value[0].value)
|
|
184
|
+
assert_nil(structure.value[4].value[1].value)
|
|
185
|
+
end
|
|
161
186
|
end
|
|
162
187
|
|
|
163
188
|
it "can use a different digest algorithm" do
|