hexapdf 1.6.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/examples/032-acro_form_list_and_fill.rb +47 -0
- data/examples/033-text_extraction.rb +34 -0
- data/lib/hexapdf/cli/info.rb +2 -0
- data/lib/hexapdf/configuration.rb +8 -0
- data/lib/hexapdf/content/canvas.rb +1 -1
- data/lib/hexapdf/content/smart_text_extractor.rb +305 -0
- data/lib/hexapdf/content.rb +2 -0
- data/lib/hexapdf/digital_signature/signing/default_handler.rb +1 -15
- data/lib/hexapdf/digital_signature/signing/signed_data_creator.rb +21 -8
- data/lib/hexapdf/document.rb +7 -3
- data/lib/hexapdf/filter/brotli_decode.rb +88 -0
- data/lib/hexapdf/filter.rb +1 -0
- data/lib/hexapdf/font/true_type/builder.rb +1 -1
- data/lib/hexapdf/font/true_type/font.rb +13 -0
- data/lib/hexapdf/font/true_type/subsetter.rb +7 -2
- data/lib/hexapdf/font/true_type/table/directory.rb +5 -0
- data/lib/hexapdf/font/true_type.rb +1 -0
- data/lib/hexapdf/layout/style.rb +6 -2
- data/lib/hexapdf/task/pdfa.rb +108 -1
- data/lib/hexapdf/type/acro_form/form.rb +4 -0
- data/lib/hexapdf/type/acro_form/text_field.rb +4 -2
- data/lib/hexapdf/type/annotations/widget.rb +9 -0
- data/lib/hexapdf/type/document_security_store.rb +80 -0
- data/lib/hexapdf/type/page.rb +11 -0
- data/lib/hexapdf/type.rb +1 -0
- data/lib/hexapdf/version.rb +1 -1
- data/test/data/pdfa/mismatching_glyph_widths_cidfont_type2.pdf +0 -0
- data/test/hexapdf/content/test_smart_text_extractor.rb +129 -0
- data/test/hexapdf/digital_signature/common.rb +19 -5
- data/test/hexapdf/digital_signature/signing/test_signed_data_creator.rb +29 -4
- data/test/hexapdf/digital_signature/test_signatures.rb +3 -3
- data/test/hexapdf/filter/test_brotli_decode.rb +34 -0
- data/test/hexapdf/font/true_type/table/test_directory.rb +5 -3
- data/test/hexapdf/font/true_type/test_builder.rb +9 -0
- data/test/hexapdf/font/true_type/test_font.rb +17 -3
- data/test/hexapdf/font/true_type/test_subsetter.rb +4 -3
- data/test/hexapdf/task/test_pdfa.rb +72 -0
- data/test/hexapdf/test_document.rb +13 -0
- data/test/hexapdf/type/acro_form/test_form.rb +6 -0
- data/test/hexapdf/type/acro_form/test_text_field.rb +7 -1
- data/test/hexapdf/type/annotations/test_widget.rb +11 -0
- data/test/hexapdf/type/test_page.rb +8 -0
- metadata +25 -3
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# -*- encoding: utf-8; frozen_string_literal: true -*-
|
|
2
|
+
#
|
|
3
|
+
#--
|
|
4
|
+
# This file is part of HexaPDF.
|
|
5
|
+
#
|
|
6
|
+
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
|
|
7
|
+
# Copyright (C) 2014-2025 Thomas Leitner
|
|
8
|
+
#
|
|
9
|
+
# HexaPDF is free software: you can redistribute it and/or modify it
|
|
10
|
+
# under the terms of the GNU Affero General Public License version 3 as
|
|
11
|
+
# published by the Free Software Foundation with the addition of the
|
|
12
|
+
# following permission added to Section 15 as permitted in Section 7(a):
|
|
13
|
+
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
|
|
14
|
+
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
|
|
15
|
+
# INFRINGEMENT OF THIRD PARTY RIGHTS.
|
|
16
|
+
#
|
|
17
|
+
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
|
|
18
|
+
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
19
|
+
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
|
|
20
|
+
# License for more details.
|
|
21
|
+
#
|
|
22
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
23
|
+
# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
|
|
24
|
+
#
|
|
25
|
+
# The interactive user interfaces in modified source and object code
|
|
26
|
+
# versions of HexaPDF must display Appropriate Legal Notices, as required
|
|
27
|
+
# under Section 5 of the GNU Affero General Public License version 3.
|
|
28
|
+
#
|
|
29
|
+
# In accordance with Section 7(b) of the GNU Affero General Public
|
|
30
|
+
# License, a covered work must retain the producer line in every PDF that
|
|
31
|
+
# is created or manipulated using HexaPDF.
|
|
32
|
+
#
|
|
33
|
+
# If the GNU Affero General Public License doesn't fit your need,
|
|
34
|
+
# commercial licenses are available at <https://gettalong.at/hexapdf/>.
|
|
35
|
+
#++
|
|
36
|
+
|
|
37
|
+
require 'fiber'
|
|
38
|
+
require 'brotli'
|
|
39
|
+
require 'hexapdf/filter/predictor'
|
|
40
|
+
require 'hexapdf/configuration'
|
|
41
|
+
|
|
42
|
+
module HexaPDF
|
|
43
|
+
module Filter
|
|
44
|
+
|
|
45
|
+
# Implements the Brotli filter using the brotli library which must be installed manually.
|
|
46
|
+
#
|
|
47
|
+
# The BrotliDecode specification is not yet available as a standard but will be in the near
|
|
48
|
+
# future. Therefore it is recommended to hold off on using it for encoding streams until most of the
|
|
49
|
+
# PDF ecosystem has support for it.
|
|
50
|
+
#
|
|
51
|
+
# See: HexaPDF::Filter
|
|
52
|
+
module BrotliDecode
|
|
53
|
+
|
|
54
|
+
# See HexaPDF::Filter
|
|
55
|
+
#
|
|
56
|
+
# Note that the brotli gem currently doesn't support a streaming decoder. This means that the
|
|
57
|
+
# whole source must be read and decoded at once.
|
|
58
|
+
def self.decoder(source, options = nil)
|
|
59
|
+
fib = Fiber.new do
|
|
60
|
+
data = Filter.string_from_source(source)
|
|
61
|
+
data.empty? ? data: Brotli.inflate(data)
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
if options && options[:Predictor]
|
|
65
|
+
Predictor.decoder(fib, options)
|
|
66
|
+
else
|
|
67
|
+
fib
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
# See HexaPDF::Filter
|
|
72
|
+
#
|
|
73
|
+
# As with ::decoder a usable streaming encoder is not available.
|
|
74
|
+
def self.encoder(source, options = nil)
|
|
75
|
+
if options && options[:Predictor]
|
|
76
|
+
source = Predictor.encoder(source, options)
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
Fiber.new do
|
|
80
|
+
Brotli.deflate(Filter.string_from_source(source),
|
|
81
|
+
quality: HexaPDF::GlobalConfiguration['filter.brotli.compression'])
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
end
|
|
88
|
+
end
|
data/lib/hexapdf/filter.rb
CHANGED
|
@@ -134,6 +134,7 @@ module HexaPDF
|
|
|
134
134
|
autoload(:FlateDecode, 'hexapdf/filter/flate_decode')
|
|
135
135
|
autoload(:LZWDecode, 'hexapdf/filter/lzw_decode')
|
|
136
136
|
autoload(:RunLengthDecode, 'hexapdf/filter/run_length_decode')
|
|
137
|
+
autoload(:BrotliDecode, 'hexapdf/filter/brotli_decode')
|
|
137
138
|
|
|
138
139
|
autoload(:Predictor, 'hexapdf/filter/predictor')
|
|
139
140
|
|
|
@@ -48,7 +48,7 @@ module HexaPDF
|
|
|
48
48
|
entry_selector = tables.length.bit_length - 1
|
|
49
49
|
range_shift = tables.length * 16 - search_range
|
|
50
50
|
|
|
51
|
-
font_data = "\x0\x1\x0\x0".b +
|
|
51
|
+
font_data = (tables.key?('glyf') ? "\x0\x1\x0\x0" : "OTTO").b +
|
|
52
52
|
[tables.length, search_range, entry_selector, range_shift].pack('n4')
|
|
53
53
|
|
|
54
54
|
offset = font_data.length + tables.length * 16
|
|
@@ -35,6 +35,7 @@
|
|
|
35
35
|
#++
|
|
36
36
|
|
|
37
37
|
require 'hexapdf/font/true_type/table'
|
|
38
|
+
require 'hexapdf/font/true_type/builder'
|
|
38
39
|
require 'set'
|
|
39
40
|
|
|
40
41
|
module HexaPDF
|
|
@@ -84,6 +85,18 @@ module HexaPDF
|
|
|
84
85
|
@tables = {}
|
|
85
86
|
end
|
|
86
87
|
|
|
88
|
+
# Uses Builder to build a font file for this font.
|
|
89
|
+
#
|
|
90
|
+
# The +table_overrides+ argument can be used to supply mappings from table names (in string
|
|
91
|
+
# form) to raw table data that should override the respective font's tables.
|
|
92
|
+
def build(table_overrides = {})
|
|
93
|
+
tables = directory.table_names.each_with_object({}) do |name, hash|
|
|
94
|
+
hash[name] = self[name.to_sym].raw_data
|
|
95
|
+
end
|
|
96
|
+
tables.merge!(table_overrides)
|
|
97
|
+
Builder.build(tables)
|
|
98
|
+
end
|
|
99
|
+
|
|
87
100
|
# Returns the table instance for the given tag (a symbol), or +nil+ if no such table exists.
|
|
88
101
|
def [](tag)
|
|
89
102
|
return @tables[tag] if @tables.key?(tag)
|
|
@@ -176,9 +176,14 @@ module HexaPDF
|
|
|
176
176
|
# Adds the components of compound glyphs to the subset.
|
|
177
177
|
def add_glyph_components
|
|
178
178
|
glyf = @font[:glyf]
|
|
179
|
+
process_glyph_components = lambda do |gid|
|
|
180
|
+
glyf[gid].components&.each do |cgid|
|
|
181
|
+
use_glyph(cgid)
|
|
182
|
+
process_glyph_components.call(cgid) if glyf[cgid].compound?
|
|
183
|
+
end
|
|
184
|
+
end
|
|
179
185
|
@glyph_map.keys.each do |gid|
|
|
180
|
-
|
|
181
|
-
glyf[gid].components&.each {|cgid| use_glyph(cgid) }
|
|
186
|
+
process_glyph_components.call(gid) unless gid.kind_of?(Symbol)
|
|
182
187
|
end
|
|
183
188
|
end
|
|
184
189
|
|
data/lib/hexapdf/layout/style.rb
CHANGED
|
@@ -211,6 +211,8 @@ module HexaPDF
|
|
|
211
211
|
attr_reader :width
|
|
212
212
|
|
|
213
213
|
# The colors of each edge. See Quad.
|
|
214
|
+
#
|
|
215
|
+
# See: HexaPDF::Content::ColorSpace.device_color_from_specification
|
|
214
216
|
attr_reader :color
|
|
215
217
|
|
|
216
218
|
# The styles of each edge. See Quad.
|
|
@@ -897,7 +899,7 @@ module HexaPDF
|
|
|
897
899
|
#
|
|
898
900
|
# The color used for filling (e.g. text), defaults to black.
|
|
899
901
|
#
|
|
900
|
-
# See: HexaPDF::Content::
|
|
902
|
+
# See: HexaPDF::Content::ColorSpace.device_color_from_specification
|
|
901
903
|
#
|
|
902
904
|
# Examples:
|
|
903
905
|
#
|
|
@@ -926,7 +928,7 @@ module HexaPDF
|
|
|
926
928
|
#
|
|
927
929
|
# The color used for stroking (e.g. text outlines), defaults to black.
|
|
928
930
|
#
|
|
929
|
-
# See: HexaPDF::Content::
|
|
931
|
+
# See: HexaPDF::Content::ColorSpace.device_color_from_specification
|
|
930
932
|
#
|
|
931
933
|
# Examples:
|
|
932
934
|
#
|
|
@@ -1175,6 +1177,8 @@ module HexaPDF
|
|
|
1175
1177
|
#
|
|
1176
1178
|
# The color used for backgrounds, defaults to +nil+ (i.e. no background).
|
|
1177
1179
|
#
|
|
1180
|
+
# See: HexaPDF::Content::ColorSpace.device_color_from_specification
|
|
1181
|
+
#
|
|
1178
1182
|
# Examples:
|
|
1179
1183
|
#
|
|
1180
1184
|
# #>pdf-composer100
|
data/lib/hexapdf/task/pdfa.rb
CHANGED
|
@@ -40,6 +40,7 @@ require 'hexapdf/content/parser'
|
|
|
40
40
|
require 'hexapdf/content/operator'
|
|
41
41
|
require 'hexapdf/type/xref_stream'
|
|
42
42
|
require 'hexapdf/type/object_stream'
|
|
43
|
+
require 'hexapdf/font/true_type'
|
|
43
44
|
|
|
44
45
|
module HexaPDF
|
|
45
46
|
module Task
|
|
@@ -51,6 +52,13 @@ module HexaPDF
|
|
|
51
52
|
# * prevents the Standard 14 PDF fonts to be used.
|
|
52
53
|
# * adds an appropriate output intent if none is set.
|
|
53
54
|
# * adds the necessary PDF/A metadata properties.
|
|
55
|
+
#
|
|
56
|
+
# Additionally, it applies fixes to the document so that the structures and content of
|
|
57
|
+
# non-conforming PDFs are corrected. See ::call for more information on the available fixes.
|
|
58
|
+
#
|
|
59
|
+
# Note that you should use a PDF/A validation tool like veraPDF (https://verapdf.org/) to ensure
|
|
60
|
+
# that the resulting files conform to the PDF/A specification because not all documents can be
|
|
61
|
+
# fixed at the moment.
|
|
54
62
|
module PDFA
|
|
55
63
|
|
|
56
64
|
# Performs the necessary tasks to make the document PDF/A compatible.
|
|
@@ -58,7 +66,22 @@ module HexaPDF
|
|
|
58
66
|
# +level+::
|
|
59
67
|
# Specifies the PDF/A conformance level that should be used. Can be one of the following
|
|
60
68
|
# strings: 2b, 2u, 3b, 3u.
|
|
61
|
-
|
|
69
|
+
#
|
|
70
|
+
# +fixes+::
|
|
71
|
+
# Specifies the fixes that should be applied when converting a non-conforming PDF. If a
|
|
72
|
+
# document is created with HexaPDF but also includes parts of loaded documents, this
|
|
73
|
+
# argument has to be set to +:all+.
|
|
74
|
+
#
|
|
75
|
+
# Can be +:default+ (which is also the default value), +:all+ or an array with one or more
|
|
76
|
+
# fix names.
|
|
77
|
+
#
|
|
78
|
+
# +:default+:: Applies all fixes if the document was loaded from a file. Otherwise applies
|
|
79
|
+
# only those fixes necessary for files created with HexaPDF.
|
|
80
|
+
#
|
|
81
|
+
# +:all+:: Applies all available fixes.
|
|
82
|
+
#
|
|
83
|
+
# +:fix_glyph_widths+:: Corrects mismatching width information in fonts.
|
|
84
|
+
def self.call(doc, level: '3u', fixes: :default)
|
|
62
85
|
unless level.match?(/\A[23][bu]\z/)
|
|
63
86
|
raise ArgumentError, "The given PDF/A conformance level '#{level}' is not supported"
|
|
64
87
|
end
|
|
@@ -68,6 +91,15 @@ module HexaPDF
|
|
|
68
91
|
doc.metadata.property('pdfaid', 'part', part)
|
|
69
92
|
doc.metadata.property('pdfaid', 'conformance', conformance.upcase)
|
|
70
93
|
add_srgb_icc_output_intent(doc) unless doc.catalog.key?(:OutputIntents)
|
|
94
|
+
|
|
95
|
+
fixes = if fixes == :all || (fixes == :default && doc.revisions.parser)
|
|
96
|
+
ALL_FIXES
|
|
97
|
+
elsif fixes == :default
|
|
98
|
+
ALL_FIXES - FIXES_FOR_LOADED_DOCUMENTS
|
|
99
|
+
else
|
|
100
|
+
fixes
|
|
101
|
+
end
|
|
102
|
+
fixes.each {|fix| send(fix, doc) }
|
|
71
103
|
end
|
|
72
104
|
end
|
|
73
105
|
|
|
@@ -81,6 +113,81 @@ module HexaPDF
|
|
|
81
113
|
]
|
|
82
114
|
end
|
|
83
115
|
|
|
116
|
+
ALL_FIXES = [:fix_glyph_widths] # :nodoc:
|
|
117
|
+
|
|
118
|
+
FIXES_FOR_LOADED_DOCUMENTS = [:fix_glyph_widths] # :nodoc:
|
|
119
|
+
|
|
120
|
+
# Makes the glyph widths stored in the embedded fonts the same as the ones specified in the
|
|
121
|
+
# PDF font data structures.
|
|
122
|
+
#
|
|
123
|
+
# Note: Currently only handles Type 2 CIDFonts.
|
|
124
|
+
def self.fix_glyph_widths(doc) # :nodoc:
|
|
125
|
+
# Step 1: Collect all CIDs together with their respective fonts
|
|
126
|
+
processor = CIDCollector.new
|
|
127
|
+
doc.pages.each do |page|
|
|
128
|
+
page.process_contents(processor)
|
|
129
|
+
page.each_annotation do |annotation|
|
|
130
|
+
next unless (appearance = annotation.appearance)
|
|
131
|
+
appearance.process_contents(processor, original_resources: page.resources)
|
|
132
|
+
end
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
# Step 2: Process all found fonts
|
|
136
|
+
processor.map.each do |font_object, all_cids|
|
|
137
|
+
next if all_cids.empty?
|
|
138
|
+
font = HexaPDF::Font::TrueType::Font.new(StringIO.new(font_object.font_file.stream))
|
|
139
|
+
cid_to_gid = cid_to_gid_mapping(font_object)
|
|
140
|
+
|
|
141
|
+
# Process all found CIDs by comparing their width with the ones defined in the font and
|
|
142
|
+
# correcting the font if necessary.
|
|
143
|
+
raw_hmtx = font[:hmtx].raw_data
|
|
144
|
+
width_conversion_factor = 1000.0 / font[:head].units_per_em
|
|
145
|
+
all_cids.each do |cid|
|
|
146
|
+
cid_width = font_object.width(cid)
|
|
147
|
+
gid = cid_to_gid[cid]
|
|
148
|
+
gid_width = font[:hmtx][gid].advance_width * width_conversion_factor
|
|
149
|
+
next if (cid_width - gid_width).abs.round <= 1
|
|
150
|
+
raw_hmtx[4 * gid, 2] = [(cid_width / width_conversion_factor).round].pack('n')
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
font_object.font_file.stream = font.build('hmtx' => raw_hmtx)
|
|
154
|
+
end
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
# Processes the contents of a stream and collects the CIDs for each composite font.
|
|
158
|
+
class CIDCollector < HexaPDF::Content::Processor
|
|
159
|
+
|
|
160
|
+
# The mapping from the composite font's descendant font to the set of used CIDs.
|
|
161
|
+
attr_reader :map
|
|
162
|
+
|
|
163
|
+
def initialize(*) # :nodoc:
|
|
164
|
+
super
|
|
165
|
+
@map = Hash.new {|h, k| h[k] = Set.new }
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
def show_text(data) # :nodoc:
|
|
169
|
+
font = graphics_state.font
|
|
170
|
+
return unless font[:Subtype] == :Type0 && font.descendant_font[:Subtype] == :CIDFontType2
|
|
171
|
+
|
|
172
|
+
Array(data).each do |item|
|
|
173
|
+
next if item.kind_of?(Numeric)
|
|
174
|
+
@map[font.descendant_font].merge(font.decode(item))
|
|
175
|
+
end
|
|
176
|
+
end
|
|
177
|
+
alias show_text_with_positioning show_text
|
|
178
|
+
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
# Returns an object responding to #[] that maps CIDs to GIDs for Type 2 CIDFonts.
|
|
182
|
+
def self.cid_to_gid_mapping(font)
|
|
183
|
+
if font[:CIDToGIDMap] == :Identity
|
|
184
|
+
proc {|cid| cid }
|
|
185
|
+
else
|
|
186
|
+
font[:CIDToGIDMap].stream.unpack('n*')
|
|
187
|
+
end
|
|
188
|
+
end
|
|
189
|
+
private_class_method :cid_to_gid_mapping
|
|
190
|
+
|
|
84
191
|
end
|
|
85
192
|
|
|
86
193
|
end
|
|
@@ -412,6 +412,8 @@ module HexaPDF
|
|
|
412
412
|
#
|
|
413
413
|
# * For radio buttons the value needs to be a String or a Symbol representing the name of
|
|
414
414
|
# the radio button widget to select.
|
|
415
|
+
#
|
|
416
|
+
# * Values for password fields are ignored as they should not be stored in the PDF.
|
|
415
417
|
def fill(data)
|
|
416
418
|
data.each do |field_name, value|
|
|
417
419
|
field = field_by_name(field_name)
|
|
@@ -427,6 +429,8 @@ module HexaPDF
|
|
|
427
429
|
when /\A(?:n(o)?|f(alse)?)\z/ then false
|
|
428
430
|
else value
|
|
429
431
|
end
|
|
432
|
+
when :password_field
|
|
433
|
+
# Ignore the value
|
|
430
434
|
else
|
|
431
435
|
raise HexaPDF::Error, "AcroForm field type #{field.concrete_field_type} not yet supported"
|
|
432
436
|
end
|
|
@@ -344,8 +344,10 @@ module HexaPDF
|
|
|
344
344
|
super
|
|
345
345
|
|
|
346
346
|
if self[:V] && !(self[:V].kind_of?(String) || self[:V].kind_of?(HexaPDF::Stream))
|
|
347
|
-
|
|
348
|
-
|
|
347
|
+
correctable = self[:V].kind_of?(Symbol)
|
|
348
|
+
yield("Text field doesn't contain text but an object of type #{self[:V].class}", correctable)
|
|
349
|
+
return unless correctable
|
|
350
|
+
self[:V] = self[:V].to_s
|
|
349
351
|
end
|
|
350
352
|
if (max_len = self[:MaxLen]) && field_value && field_value.length > max_len
|
|
351
353
|
correctable = true
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# -*- encoding: utf-8; frozen_string_literal: true -*-
|
|
2
|
+
#
|
|
3
|
+
#--
|
|
4
|
+
# This file is part of HexaPDF.
|
|
5
|
+
#
|
|
6
|
+
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
|
|
7
|
+
# Copyright (C) 2014-2025 Thomas Leitner
|
|
8
|
+
#
|
|
9
|
+
# HexaPDF is free software: you can redistribute it and/or modify it
|
|
10
|
+
# under the terms of the GNU Affero General Public License version 3 as
|
|
11
|
+
# published by the Free Software Foundation with the addition of the
|
|
12
|
+
# following permission added to Section 15 as permitted in Section 7(a):
|
|
13
|
+
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
|
|
14
|
+
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
|
|
15
|
+
# INFRINGEMENT OF THIRD PARTY RIGHTS.
|
|
16
|
+
#
|
|
17
|
+
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
|
|
18
|
+
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
19
|
+
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
|
|
20
|
+
# License for more details.
|
|
21
|
+
#
|
|
22
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
23
|
+
# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
|
|
24
|
+
#
|
|
25
|
+
# The interactive user interfaces in modified source and object code
|
|
26
|
+
# versions of HexaPDF must display Appropriate Legal Notices, as required
|
|
27
|
+
# under Section 5 of the GNU Affero General Public License version 3.
|
|
28
|
+
#
|
|
29
|
+
# In accordance with Section 7(b) of the GNU Affero General Public
|
|
30
|
+
# License, a covered work must retain the producer line in every PDF that
|
|
31
|
+
# is created or manipulated using HexaPDF.
|
|
32
|
+
#
|
|
33
|
+
# If the GNU Affero General Public License doesn't fit your need,
|
|
34
|
+
# commercial licenses are available at <https://gettalong.at/hexapdf/>.
|
|
35
|
+
#++
|
|
36
|
+
|
|
37
|
+
require 'hexapdf/dictionary'
|
|
38
|
+
|
|
39
|
+
module HexaPDF
|
|
40
|
+
module Type
|
|
41
|
+
|
|
42
|
+
# The document security store (DSS) dictionary contains data needed for verifying digital
|
|
43
|
+
# signatures.
|
|
44
|
+
#
|
|
45
|
+
# See: PDF2.0 s12.8.4.3
|
|
46
|
+
class DocumentSecurityStore < Dictionary
|
|
47
|
+
|
|
48
|
+
# The validation-related information (VRI) dictionary contains validation information for one
|
|
49
|
+
# signature. It signifies that the signature has been validated using this information.
|
|
50
|
+
#
|
|
51
|
+
# See: PDF2.0 s12.8.4.4
|
|
52
|
+
class ValidationRelatedInformation < Dictionary
|
|
53
|
+
|
|
54
|
+
define_type :VRI
|
|
55
|
+
|
|
56
|
+
define_field :Type, type: Symbol, default: type
|
|
57
|
+
define_field :Cert, type: PDFArray
|
|
58
|
+
define_field :CRL, type: PDFArray
|
|
59
|
+
define_field :OCSP, type: PDFArray
|
|
60
|
+
define_field :TU, type: PDFDate
|
|
61
|
+
define_field :TS, type: Stream
|
|
62
|
+
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
define_type :DSS
|
|
66
|
+
|
|
67
|
+
define_field :Type, type: Symbol, default: type
|
|
68
|
+
define_field :VRI, type: Dictionary
|
|
69
|
+
define_field :Certs, type: PDFArray
|
|
70
|
+
define_field :OCSPs, type: PDFArray
|
|
71
|
+
define_field :CRLs, type: PDFArray
|
|
72
|
+
define_field :SW, type: Symbol, default: :A, allowed_values: [:A, :B, :S, :N]
|
|
73
|
+
define_field :S, type: Symbol, default: :P, allowed_values: [:A, :P]
|
|
74
|
+
define_field :A, type: PDFArray, default: [0.5, 0.5]
|
|
75
|
+
define_field :FB, type: Boolean, default: false, version: '1.5'
|
|
76
|
+
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
end
|
|
80
|
+
end
|
data/lib/hexapdf/type/page.rb
CHANGED
|
@@ -395,6 +395,17 @@ module HexaPDF
|
|
|
395
395
|
Content::Parser.parse(contents, processor)
|
|
396
396
|
end
|
|
397
397
|
|
|
398
|
+
# Extracts the layouted text from the page.
|
|
399
|
+
#
|
|
400
|
+
# See HexaPDF::Content::SmartTextExtractor.layout_text_runs for the available +options+.
|
|
401
|
+
def extract_text(**options)
|
|
402
|
+
processor = Content::SmartTextExtractor::TextRunProcessor.new
|
|
403
|
+
process_contents(processor)
|
|
404
|
+
box = box(:media)
|
|
405
|
+
Content::SmartTextExtractor.layout_text_runs(processor.text_runs, box.width, box.height,
|
|
406
|
+
**options)
|
|
407
|
+
end
|
|
408
|
+
|
|
398
409
|
# Returns the index of the page in the page tree.
|
|
399
410
|
def index
|
|
400
411
|
idx = 0
|
data/lib/hexapdf/type.rb
CHANGED
|
@@ -89,6 +89,7 @@ module HexaPDF
|
|
|
89
89
|
autoload(:MarkedContentReference, 'hexapdf/type/marked_content_reference')
|
|
90
90
|
autoload(:ObjectReference, 'hexapdf/type/object_reference')
|
|
91
91
|
autoload(:Measure, 'hexapdf/type/measure')
|
|
92
|
+
autoload(:DocumentSecurityStore, 'hexapdf/type/document_security_store')
|
|
92
93
|
|
|
93
94
|
end
|
|
94
95
|
|
data/lib/hexapdf/version.rb
CHANGED
|
Binary file
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
require 'test_helper'
|
|
4
|
+
require 'hexapdf/content/smart_text_extractor'
|
|
5
|
+
require 'hexapdf/document'
|
|
6
|
+
|
|
7
|
+
describe HexaPDF::Content::SmartTextExtractor::TextRunCollector::TextRun do
|
|
8
|
+
it "has various accessors" do
|
|
9
|
+
text_run = HexaPDF::Content::SmartTextExtractor::TextRunCollector::TextRun.new('s', 1, 2, 3, 5)
|
|
10
|
+
assert_equal('s', text_run.string)
|
|
11
|
+
assert_equal(2, text_run.width)
|
|
12
|
+
assert_equal(3, text_run.height)
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
describe HexaPDF::Content::SmartTextExtractor::TextRunProcessor do
|
|
17
|
+
it "turns glyphs into TextRun objects" do
|
|
18
|
+
processor = HexaPDF::Content::SmartTextExtractor::TextRunProcessor.new
|
|
19
|
+
doc = HexaPDF::Document.new
|
|
20
|
+
page = doc.pages.add
|
|
21
|
+
page.canvas.font('Helvetica', size: 10).
|
|
22
|
+
text('Te', at: [10, 500]).
|
|
23
|
+
text_matrix(0.866, -0.5, 0.5, 0.866, 0, 0).
|
|
24
|
+
text('Te')
|
|
25
|
+
page.process_contents(processor)
|
|
26
|
+
assert_equal([['T', 10, 497.75, 16.11, 509.31], ['e', 16.11, 497.75, 21.67, 509.31],
|
|
27
|
+
["T", -1.125, -5.0035, 9.94626, 8.06246],
|
|
28
|
+
["e", 4.16626, -7.7835, 14.761220000000002, 5.00746]],
|
|
29
|
+
processor.text_runs.map(&:to_a))
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
describe HexaPDF::Content::SmartTextExtractor do
|
|
34
|
+
def text_run(str, left, bottom, right, top)
|
|
35
|
+
HexaPDF::Content::SmartTextExtractor::TextRunCollector::TextRun.new(str, left, bottom, right, top)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def layout_runs(runs, width = 595, height = 842, **options)
|
|
39
|
+
runs = runs.map {|args| text_run(*args) }
|
|
40
|
+
HexaPDF::Content::SmartTextExtractor.layout_text_runs(runs, width, height, **options)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
it "works for a page with no text" do
|
|
44
|
+
assert_equal('', layout_runs([]))
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
it "works for a single run on the left side of the page" do
|
|
48
|
+
assert_equal('test', layout_runs([['test', 0, 100, 20, 110]]))
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
it "works for a single run not on the left side of the page" do
|
|
52
|
+
assert_equal('test', layout_runs([['test', 50, 100, 70, 110]]))
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
it "preserves the relative indent" do
|
|
56
|
+
assert_equal("Hello\n World", layout_runs([['Hello', 50, 100, 70, 110],
|
|
57
|
+
['World', 70, 80, 90, 100]]))
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
it "combines text runs if they have the same top/bottom and there is less than 1pt between them" do
|
|
61
|
+
x = +'Hello'
|
|
62
|
+
assert_equal('HelloWorld', layout_runs([[x, 50, 100, 60, 110],
|
|
63
|
+
['World', 60, 100, 70, 110]]))
|
|
64
|
+
assert_equal('HelloWorld', x)
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
it "preserves the space between two runs" do
|
|
68
|
+
assert_equal('Hello World', layout_runs([['Hello', 50, 100, 70, 110],
|
|
69
|
+
['World', 72, 100, 92, 110]]))
|
|
70
|
+
assert_equal('Hello World', layout_runs([['Hello', 50, 100, 70, 110],
|
|
71
|
+
['World', 80, 100, 100, 110]]))
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
it "inserts a space after very narrow text parts if necessary" do
|
|
75
|
+
assert_equal('Hello World!', layout_runs([['Hello', 50, 100, 60, 110],
|
|
76
|
+
['World!', 63, 100, 87, 110]]))
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
it "preserves the visual horizontal ordering of two runs" do
|
|
80
|
+
assert_equal('Hello World', layout_runs([['World', 72, 100, 92, 110],
|
|
81
|
+
['Hello', 50, 100, 70, 110]]))
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
it "preserves the visual vertical ordering of two runs" do
|
|
85
|
+
assert_equal("Hello\nWorld", layout_runs([['World', 50, 80, 70, 100],
|
|
86
|
+
['Hello', 50, 100, 70, 110]]))
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
it "inserts a single blank line between paragraphs" do
|
|
90
|
+
assert_equal("Hello\nWorld\n\nHere",
|
|
91
|
+
layout_runs([['Hello', 50, 100, 70, 110],
|
|
92
|
+
['World', 50, 90, 70, 100],
|
|
93
|
+
['Here', 50, 65, 66, 75]]))
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
it "inserts multiply lines for large gaps between paragraphs" do
|
|
97
|
+
assert_equal("Hello\nWorld\nHere\n\n\n\n\n\n\nFoot",
|
|
98
|
+
layout_runs([['Hello', 50, 100, 70, 110],
|
|
99
|
+
['World', 50, 90, 70, 100],
|
|
100
|
+
['Here', 50, 80, 70, 90],
|
|
101
|
+
['Foot', 50, 10, 66, 20]]))
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
it "ignores outliers when calculating the normal line spacing" do
|
|
105
|
+
assert_equal("Hello\nWorld\n\n\n\nHere",
|
|
106
|
+
layout_runs([['Hello', 50, 100, 70, 110],
|
|
107
|
+
['World', 50, 90, 70, 100],
|
|
108
|
+
['Here', 50, 50, 70, 60]]))
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
it "can use a different line_tolerance_factor" do
|
|
112
|
+
assert_equal("HelloWorld",
|
|
113
|
+
layout_runs([['Hello', 50, 100, 70, 110],
|
|
114
|
+
['World', 50, 90, 70, 100]], line_tolerance_factor: 1))
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
it "can use a different paragraph_distance_threshold" do
|
|
118
|
+
assert_equal("Hello\n\nWorld",
|
|
119
|
+
layout_runs([['Hello', 50, 100, 70, 110],
|
|
120
|
+
['World', 50, 90, 70, 100]], paragraph_distance_threshold: 1))
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
it "can use a different large_distance_threshold" do
|
|
124
|
+
assert_equal("Hello\nWorld\n\nHere",
|
|
125
|
+
layout_runs([['Hello', 50, 100, 70, 110],
|
|
126
|
+
['World', 50, 90, 70, 100],
|
|
127
|
+
['Here', 50, 50, 66, 60]], large_distance_threshold: 8))
|
|
128
|
+
end
|
|
129
|
+
end
|