hexapdf 1.6.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/examples/032-acro_form_list_and_fill.rb +47 -0
- data/examples/033-text_extraction.rb +34 -0
- data/lib/hexapdf/cli/info.rb +2 -0
- data/lib/hexapdf/configuration.rb +8 -0
- data/lib/hexapdf/content/canvas.rb +1 -1
- data/lib/hexapdf/content/smart_text_extractor.rb +305 -0
- data/lib/hexapdf/content.rb +2 -0
- data/lib/hexapdf/digital_signature/signing/default_handler.rb +1 -15
- data/lib/hexapdf/digital_signature/signing/signed_data_creator.rb +21 -8
- data/lib/hexapdf/document.rb +7 -3
- data/lib/hexapdf/filter/brotli_decode.rb +88 -0
- data/lib/hexapdf/filter.rb +1 -0
- data/lib/hexapdf/font/true_type/builder.rb +1 -1
- data/lib/hexapdf/font/true_type/font.rb +13 -0
- data/lib/hexapdf/font/true_type/subsetter.rb +7 -2
- data/lib/hexapdf/font/true_type/table/directory.rb +5 -0
- data/lib/hexapdf/font/true_type.rb +1 -0
- data/lib/hexapdf/layout/style.rb +6 -2
- data/lib/hexapdf/task/pdfa.rb +108 -1
- data/lib/hexapdf/type/acro_form/form.rb +4 -0
- data/lib/hexapdf/type/acro_form/text_field.rb +4 -2
- data/lib/hexapdf/type/annotations/widget.rb +9 -0
- data/lib/hexapdf/type/document_security_store.rb +80 -0
- data/lib/hexapdf/type/page.rb +11 -0
- data/lib/hexapdf/type.rb +1 -0
- data/lib/hexapdf/version.rb +1 -1
- data/test/data/pdfa/mismatching_glyph_widths_cidfont_type2.pdf +0 -0
- data/test/hexapdf/content/test_smart_text_extractor.rb +129 -0
- data/test/hexapdf/digital_signature/common.rb +19 -5
- data/test/hexapdf/digital_signature/signing/test_signed_data_creator.rb +29 -4
- data/test/hexapdf/digital_signature/test_signatures.rb +3 -3
- data/test/hexapdf/filter/test_brotli_decode.rb +34 -0
- data/test/hexapdf/font/true_type/table/test_directory.rb +5 -3
- data/test/hexapdf/font/true_type/test_builder.rb +9 -0
- data/test/hexapdf/font/true_type/test_font.rb +17 -3
- data/test/hexapdf/font/true_type/test_subsetter.rb +4 -3
- data/test/hexapdf/task/test_pdfa.rb +72 -0
- data/test/hexapdf/test_document.rb +13 -0
- data/test/hexapdf/type/acro_form/test_form.rb +6 -0
- data/test/hexapdf/type/acro_form/test_text_field.rb +7 -1
- data/test/hexapdf/type/annotations/test_widget.rb +11 -0
- data/test/hexapdf/type/test_page.rb +8 -0
- metadata +25 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 04f2a87f1aaa95513275432d718996b7d598fc15e476f6999f6b6fe9f29cd0f8
|
|
4
|
+
data.tar.gz: 539d2b0e984db4ca4095bf0aad5208fbbdff5a08acc80d270a6b1c824f12c87e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: c35f8b0267ef60c6392ae99d8c001e4d6b5e18ea1f5a62132d44bbf865d52cc8a9b08436e107c35a01d3a6edaeb7be9bcede20931a255416d4ea4d07778f8fc0
|
|
7
|
+
data.tar.gz: bfdedefe99c534d62b11f406b447902ea6824758153448ebfba35d0e456850134ba36a6cb2c97d668983e8a5b5b96bf0ab0a03c6136f2478a7717a0e7bb0933b
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,31 @@
|
|
|
1
|
+
## 1.7.0 - 2026-04-13
|
|
2
|
+
|
|
3
|
+
### Added
|
|
4
|
+
|
|
5
|
+
* Smart text extraction for retrieving layouted text from pages
|
|
6
|
+
* Support for digitally signing with ECDSA keys
|
|
7
|
+
* Support for digitally signing with DSA keys
|
|
8
|
+
* Support for BrotliDecode filter
|
|
9
|
+
* [HexaPDF::Type::DocumentSecurityStore] and
|
|
10
|
+
[HexaPDF::Type::ValidationRelatedInformation]
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
|
|
14
|
+
* **Breaking change**: [HexaPDF::Document#unwrap] to not unwrap streams
|
|
15
|
+
* Automatic detection of digital signature size to account for small deviations
|
|
16
|
+
* [HexaPDF::Type::AcroForm::Form#fill] to ignore password fields
|
|
17
|
+
* [HexaPDF::Type::AcroForm::TextField] validation to convert invalid Symbol
|
|
18
|
+
values to String
|
|
19
|
+
* [HexaPDF::Type::Annotations::Widget] validation to also validate a widget as a
|
|
20
|
+
field if necessary
|
|
21
|
+
* PDF/A task to include a fix for mismatching glyph widths for Type 2 CID fonts
|
|
22
|
+
|
|
23
|
+
### Fixed
|
|
24
|
+
|
|
25
|
+
* Writing of PDF documents with an invalid value for the /Info dictionary
|
|
26
|
+
* Subsetting of TrueType fonts in case compound glyphs are themselves compound
|
|
27
|
+
|
|
28
|
+
|
|
1
29
|
## 1.6.0 - 2026-02-10
|
|
2
30
|
|
|
3
31
|
### Added
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# # PDF Forms - List and fill fields
|
|
2
|
+
#
|
|
3
|
+
# This example shows how to list the form fields of an interactive PDF form and
|
|
4
|
+
# how to fill out the form.
|
|
5
|
+
#
|
|
6
|
+
# The output file from the [PDF forms](acro_form.html) example can be used as
|
|
7
|
+
# input.
|
|
8
|
+
#
|
|
9
|
+
# One way to list and fill a PDF form is to use the [HexaPDF CLI with the 'form'
|
|
10
|
+
# command](/documentation/hexapdf.1.html#form). Here, however, we are doing it
|
|
11
|
+
# with the HexaPDF API.
|
|
12
|
+
#
|
|
13
|
+
# Usage:
|
|
14
|
+
# : `ruby acro_form_list_and_fill.rb [INPUT.PDF]`
|
|
15
|
+
#
|
|
16
|
+
|
|
17
|
+
require 'base64'
|
|
18
|
+
require 'hexapdf'
|
|
19
|
+
|
|
20
|
+
doc = HexaPDF::Document.open(ARGV[0] || 'acro_form.pdf')
|
|
21
|
+
exit unless doc.acro_form
|
|
22
|
+
|
|
23
|
+
puts "Listing all form fields:"
|
|
24
|
+
doc.acro_form.each_field do |field|
|
|
25
|
+
puts "#{field.full_field_name} (#{field.concrete_field_type})"
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
# We are using this to generate some values for existing text fields. In the
|
|
29
|
+
# real world one would be getting the values from the user.
|
|
30
|
+
puts "\nFilling in the text fields with random values:"
|
|
31
|
+
values = {}
|
|
32
|
+
doc.acro_form.each_field do |field|
|
|
33
|
+
next unless field.field_type == :Tx
|
|
34
|
+
value = Base64.encode64(field.full_field_name).strip
|
|
35
|
+
value = if field.key?(:MaxLen)
|
|
36
|
+
value[0, field[:MaxLen]]
|
|
37
|
+
else
|
|
38
|
+
"Value #{field.field_type} #{value}"
|
|
39
|
+
end
|
|
40
|
+
values[field.full_field_name] = value
|
|
41
|
+
puts "#{field.full_field_name}: #{value}"
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
# Now actually fill out the form with the values
|
|
45
|
+
doc.acro_form.fill(values)
|
|
46
|
+
|
|
47
|
+
doc.write('acro_form_list_and_fill.pdf', optimize: true)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# # Text Extraction
|
|
2
|
+
#
|
|
3
|
+
# This example shows how to extract layouted text from a page.
|
|
4
|
+
#
|
|
5
|
+
# It uses the provided input PDF or creates a small sample PDF as input. Then it
|
|
6
|
+
# extracts the text for each page and creates new pages with the extracted text
|
|
7
|
+
# in a fixed-width font.
|
|
8
|
+
#
|
|
9
|
+
# Usage:
|
|
10
|
+
# : `ruby text_extraction.rb [INPUT.PDF]`
|
|
11
|
+
#
|
|
12
|
+
|
|
13
|
+
require 'hexapdf'
|
|
14
|
+
|
|
15
|
+
# Use the input PDF or create a sample PDF.
|
|
16
|
+
if ARGV.length > 0
|
|
17
|
+
doc = HexaPDF::Document.open(ARGV[0])
|
|
18
|
+
else
|
|
19
|
+
composer = HexaPDF::Composer.new do |pdf|
|
|
20
|
+
pdf.lorem_ipsum(count: 3, padding: [0, 0, 20])
|
|
21
|
+
pdf.lorem_ipsum(padding: [0, 50, 20], text_indent: 40)
|
|
22
|
+
pdf.lorem_ipsum(count: 2)
|
|
23
|
+
end
|
|
24
|
+
doc = composer.document
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
# Extract the existing pages and add new ones with the extracted text
|
|
28
|
+
doc.pages.count.times do |index|
|
|
29
|
+
text = doc.pages[index].extract_text
|
|
30
|
+
doc.pages.add.canvas.font('/usr/share/fonts/truetype/freefont/FreeMono.ttf', size: 6).
|
|
31
|
+
text(text, at: [10, 820])
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
doc.write('text_extraction.pdf', optimize: true)
|
data/lib/hexapdf/cli/info.rb
CHANGED
|
@@ -559,6 +559,7 @@ module HexaPDF
|
|
|
559
559
|
JPXDecode: 'HexaPDF::Filter::PassThrough',
|
|
560
560
|
Crypt: 'HexaPDF::Filter::Crypt',
|
|
561
561
|
Encryption: 'HexaPDF::Filter::Encryption',
|
|
562
|
+
BrotliDecode: 'HexaPDF::Filter::BrotliDecode',
|
|
562
563
|
},
|
|
563
564
|
'font.default' => 'Times',
|
|
564
565
|
'font.fallback' => ['ZapfDingbats', 'Symbol'],
|
|
@@ -636,6 +637,11 @@ module HexaPDF
|
|
|
636
637
|
#
|
|
637
638
|
# See PDF2.0 s8.6
|
|
638
639
|
#
|
|
640
|
+
# filter.brotli.compression::
|
|
641
|
+
# Specifies the compression level that should be used with the BrotliDecode filter. The level
|
|
642
|
+
# can range from 0 (no compression), 1 (best speed) to 11 (best compression). The default
|
|
643
|
+
# value is 8 which is a good compromise between speed and resulting size.
|
|
644
|
+
#
|
|
639
645
|
# filter.flate.compression::
|
|
640
646
|
# Specifies the compression level that should be used with the FlateDecode filter. The level
|
|
641
647
|
# can range from 0 (no compression), 1 (best speed) to 9 (best compression, default).
|
|
@@ -754,6 +760,8 @@ module HexaPDF
|
|
|
754
760
|
MCR: 'HexaPDF::Type::MarkedContentReference',
|
|
755
761
|
OBJR: 'HexaPDF::Type::ObjectReference',
|
|
756
762
|
Measure: 'HexaPDF::Type::Measure',
|
|
763
|
+
DSS: 'HexaPDF::Type::DocumentSecurityStore',
|
|
764
|
+
VRI: 'HexaPDF::Type::DocumentSecurityStore::ValidationRelatedInformation',
|
|
757
765
|
},
|
|
758
766
|
'object.subtype_map' => {
|
|
759
767
|
nil => {
|
|
@@ -895,7 +895,7 @@ module HexaPDF
|
|
|
895
895
|
#
|
|
896
896
|
# * Any other string is treated as a color name. HexaPDF supports CSS Color Module Level 3
|
|
897
897
|
# color names (see https://www.w3.org/TR/css-color-3/#svg-color) as well as HexaPDF design
|
|
898
|
-
# colors.
|
|
898
|
+
# colors. See ColorSpace::COLOR_NAMES for the list of supported names.
|
|
899
899
|
#
|
|
900
900
|
# * Four numeric arguments specify a CMYK color (see ColorSpace::DeviceCMYK::Color).
|
|
901
901
|
#
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
# -*- encoding: utf-8; frozen_string_literal: true -*-
|
|
2
|
+
#
|
|
3
|
+
#--
|
|
4
|
+
# This file is part of HexaPDF.
|
|
5
|
+
#
|
|
6
|
+
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
|
|
7
|
+
# Copyright (C) 2014-2025 Thomas Leitner
|
|
8
|
+
#
|
|
9
|
+
# HexaPDF is free software: you can redistribute it and/or modify it
|
|
10
|
+
# under the terms of the GNU Affero General Public License version 3 as
|
|
11
|
+
# published by the Free Software Foundation with the addition of the
|
|
12
|
+
# following permission added to Section 15 as permitted in Section 7(a):
|
|
13
|
+
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
|
|
14
|
+
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
|
|
15
|
+
# INFRINGEMENT OF THIRD PARTY RIGHTS.
|
|
16
|
+
#
|
|
17
|
+
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
|
|
18
|
+
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
19
|
+
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
|
|
20
|
+
# License for more details.
|
|
21
|
+
#
|
|
22
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
23
|
+
# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
|
|
24
|
+
#
|
|
25
|
+
# The interactive user interfaces in modified source and object code
|
|
26
|
+
# versions of HexaPDF must display Appropriate Legal Notices, as required
|
|
27
|
+
# under Section 5 of the GNU Affero General Public License version 3.
|
|
28
|
+
#
|
|
29
|
+
# In accordance with Section 7(b) of the GNU Affero General Public
|
|
30
|
+
# License, a covered work must retain the producer line in every PDF that
|
|
31
|
+
# is created or manipulated using HexaPDF.
|
|
32
|
+
#
|
|
33
|
+
# If the GNU Affero General Public License doesn't fit your need,
|
|
34
|
+
# commercial licenses are available at <https://gettalong.at/hexapdf/>.
|
|
35
|
+
#++
|
|
36
|
+
|
|
37
|
+
module HexaPDF
|
|
38
|
+
module Content
|
|
39
|
+
|
|
40
|
+
# This module converts the glyphs on a page to a single text string while preserving the layout.
|
|
41
|
+
#
|
|
42
|
+
# The general algorithm is:
|
|
43
|
+
#
|
|
44
|
+
# 1. Collect all individual glyphs with their user space coordinates in
|
|
45
|
+
# TextRunCollector::TextRun objects.
|
|
46
|
+
#
|
|
47
|
+
# 2. Sort text runs top to bottom and then left to right.
|
|
48
|
+
#
|
|
49
|
+
# 3. Group those text runs into lines based on a "baseline" while also combining neighboring
|
|
50
|
+
# text runs into larger runs.
|
|
51
|
+
#
|
|
52
|
+
# 4. Render each line into a string by taking into account the page size and the median glyph
|
|
53
|
+
# width for a text run to column mapping.
|
|
54
|
+
#
|
|
55
|
+
# 5. Add blank lines between text lines based on the page's normal line spacing.
|
|
56
|
+
module SmartTextExtractor
|
|
57
|
+
|
|
58
|
+
# This module provides the functionality for collecting the necessary TextRun instances for
|
|
59
|
+
# layouting the text.
|
|
60
|
+
#
|
|
61
|
+
# To use this module include it in a processor class. Then invoke the #collect_text_runs
|
|
62
|
+
# method in the #show_text and #show_text_with_positioning methods.
|
|
63
|
+
#
|
|
64
|
+
# Example:
|
|
65
|
+
#
|
|
66
|
+
# class CustomProcessor < HexaPDF::Content::Processor
|
|
67
|
+
# include TextRunCollector
|
|
68
|
+
#
|
|
69
|
+
# def show_text(str)
|
|
70
|
+
# collect_text_runs(decode_text_with_positioning(str))
|
|
71
|
+
# end
|
|
72
|
+
# alias show_text_with_positioning show_text
|
|
73
|
+
#
|
|
74
|
+
# end
|
|
75
|
+
#
|
|
76
|
+
# Once the processor has done its job, the collected text runs are available via the
|
|
77
|
+
# #text_runs method. Use them as input for SmartTextExtractor.layout_text_runs.
|
|
78
|
+
module TextRunCollector
|
|
79
|
+
|
|
80
|
+
# Represents a single run of continuous glyphs and their combined bounding box in user
|
|
81
|
+
# space.
|
|
82
|
+
TextRun = Struct.new(:string, :left, :bottom, :right, :top) do
|
|
83
|
+
# The "baseline" is approximated with the bottom of the bounding box.
|
|
84
|
+
#
|
|
85
|
+
# This works because HexaPDF uses a font's bounding box instead of the glyph's bounding
|
|
86
|
+
# box for each glyph. So while differently sized glyphs will have different "baseline"
|
|
87
|
+
# values, this is taken into account in the algorithm in the same way as subscript and
|
|
88
|
+
# superscript.
|
|
89
|
+
#
|
|
90
|
+
# Using this "fake" baseline works well enough and avoids additional calculations.
|
|
91
|
+
def baseline = bottom
|
|
92
|
+
|
|
93
|
+
# The height of the text run's bounding box.
|
|
94
|
+
def height = top - bottom
|
|
95
|
+
|
|
96
|
+
# The width of the text run's bounding box.
|
|
97
|
+
def width = right - left
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
# Array with all collected TextRun instances.
|
|
101
|
+
attr_reader :text_runs
|
|
102
|
+
|
|
103
|
+
def initialize # :nodoc:
|
|
104
|
+
super
|
|
105
|
+
@text_runs = []
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
private
|
|
109
|
+
|
|
110
|
+
# Collects all text runs from the glyphs in the +boxes+ array.
|
|
111
|
+
def collect_text_runs(boxes)
|
|
112
|
+
boxes.each do |box|
|
|
113
|
+
llx, lly, lrx, lry, urx, ury, ulx, uly = *box.points
|
|
114
|
+
x_min, x_max = [llx, lrx, ulx, urx].minmax
|
|
115
|
+
y_min, y_max = [lly, lry, uly, ury].minmax
|
|
116
|
+
@text_runs << TextRun.new(+box.string, x_min, y_min, x_max, y_max)
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
# This processor class is used when layouting the text through
|
|
122
|
+
# HexaPDF::Type::Page#extract_text.
|
|
123
|
+
class TextRunProcessor < HexaPDF::Content::Processor
|
|
124
|
+
|
|
125
|
+
include TextRunCollector
|
|
126
|
+
|
|
127
|
+
def show_text(str)
|
|
128
|
+
collect_text_runs(decode_text_with_positioning(str))
|
|
129
|
+
end
|
|
130
|
+
alias show_text_with_positioning show_text
|
|
131
|
+
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
# Converts an array of TextRun objects into a single string representation, preserving the
|
|
135
|
+
# visual layout.
|
|
136
|
+
#
|
|
137
|
+
# The +page_width+ and +page_height+ arguments specify the width and height of the page from
|
|
138
|
+
# which the text runs were extracted.
|
|
139
|
+
#
|
|
140
|
+
# The remaining keyword arguments can be used to fine-tune the algorithm for one's needs:
|
|
141
|
+
#
|
|
142
|
+
# +line_tolerance_factor+::
|
|
143
|
+
# The tolerance factor is applied to the median text run height to determine the range
|
|
144
|
+
# within which two text runs are considered to be on the same line. This ensures that
|
|
145
|
+
# small differences in the baseline due to, for example, subscript or superscript parts
|
|
146
|
+
# don't result in multiple lines.
|
|
147
|
+
#
|
|
148
|
+
# The factor should not be too large to avoid forcing separate visual lines into one line
|
|
149
|
+
# but also not too small to avoid subscript/superscript being on separate lines. The
|
|
150
|
+
# default seems to work quite well.
|
|
151
|
+
#
|
|
152
|
+
# +paragraph_distance_threshold+::
|
|
153
|
+
# If the number of normal line spacings between two adjacent baselines is at least this
|
|
154
|
+
# large (but smaller than +large_distance_threshold+), the gap is interpreted as a
|
|
155
|
+
# paragraph break and a single blank line is inserted.
|
|
156
|
+
#
|
|
157
|
+
# +large_distance_threshold+::
|
|
158
|
+
# Works like +paragraph_distance_threshold+ and indicates if a number of normal line
|
|
159
|
+
# spacings is too large for being a paragraph break. A proportional number of blank lines
|
|
160
|
+
# is inserted in this case.
|
|
161
|
+
#
|
|
162
|
+
# This is used to represent large parts with non-text content like images.
|
|
163
|
+
def self.layout_text_runs(text_runs, page_width, page_height,
|
|
164
|
+
line_tolerance_factor: 0.4, paragraph_distance_threshold: 1.35,
|
|
165
|
+
large_distance_threshold: 3.0)
|
|
166
|
+
return '' if text_runs.empty?
|
|
167
|
+
|
|
168
|
+
# Use the median height of all text runs as an approximation of the main font size used on
|
|
169
|
+
# the page. The line tolerance uses a hard floor for small fonts.
|
|
170
|
+
median_height = median(text_runs.map(&:height).sort)
|
|
171
|
+
line_tolerance = [median_height * line_tolerance_factor, 2].max
|
|
172
|
+
|
|
173
|
+
# Group the text runs into lines which are sorted top to bottom. Text runs are pre-sorted by
|
|
174
|
+
# baseline from top to bottom and left to right (the latter is done so that consecutive text
|
|
175
|
+
# runs can be combined).
|
|
176
|
+
sorted = text_runs.sort_by {|run| [-run.baseline, run.left] }
|
|
177
|
+
lines = group_into_lines(sorted, line_tolerance)
|
|
178
|
+
|
|
179
|
+
# Calculate the normal line spacing, excluding anything too small/big.
|
|
180
|
+
line_distances = lines.map {|l| l.baseline }.each_cons(2).map {|a, b| a - b }.
|
|
181
|
+
select {|d| d >= median_height * 0.5 && d <= median_height * 2 }.sort
|
|
182
|
+
normal_line_spacing = line_distances.empty? ? median_height * 1.2 : median(line_distances)
|
|
183
|
+
|
|
184
|
+
# Convert the lines into actual text strings. Blank lines are inserted between the lines
|
|
185
|
+
# based on the normal line spacing.
|
|
186
|
+
output_lines = []
|
|
187
|
+
left_margin = lines.map {|line| line.text_runs[0].left }.min
|
|
188
|
+
glyph_widths = lines.flat_map do |line|
|
|
189
|
+
line.text_runs.flat_map {|run| [run.width.to_f / run.string.length] * run.string.length }
|
|
190
|
+
end.sort
|
|
191
|
+
median_glyph_width = median(glyph_widths)
|
|
192
|
+
|
|
193
|
+
lines.each_with_index do |line, index|
|
|
194
|
+
output_lines << text_runs_to_string(line.text_runs, median_glyph_width, left_margin)
|
|
195
|
+
next if index == lines.length - 1
|
|
196
|
+
|
|
197
|
+
# Add blank lines as needed.
|
|
198
|
+
ratio = (line.baseline - lines[index + 1].baseline) / normal_line_spacing
|
|
199
|
+
if ratio >= large_distance_threshold
|
|
200
|
+
# Subtract 1 because the newline after the output line already counts as one
|
|
201
|
+
# newline. Also cap at a maximum of 40 to avoid huge gaps.
|
|
202
|
+
[ratio.round - 1, 40].min.times { output_lines << '' }
|
|
203
|
+
elsif ratio >= paragraph_distance_threshold
|
|
204
|
+
output_lines << ''
|
|
205
|
+
end
|
|
206
|
+
end
|
|
207
|
+
|
|
208
|
+
output_lines.join("\n")
|
|
209
|
+
end
|
|
210
|
+
|
|
211
|
+
# Holds an array of TextRun objects and their median baseline.
|
|
212
|
+
Line = Struct.new(:text_runs, :baseline)
|
|
213
|
+
|
|
214
|
+
# Groups a sorted list of TextRuns (sorted by baseline, then left) into lines.
|
|
215
|
+
#
|
|
216
|
+
# Since the text_runs are already sorted, a single run through +sorted_text_runs+ is
|
|
217
|
+
# sufficient. A new line is created if a text run's baseline differs by more than +tolerance+
|
|
218
|
+
# from the current line's (median) baseline.
|
|
219
|
+
#
|
|
220
|
+
# The result is a list of Line objects with their contents sorted left to right.
|
|
221
|
+
def self.group_into_lines(sorted_text_runs, tolerance)
|
|
222
|
+
lines = []
|
|
223
|
+
current_line = []
|
|
224
|
+
current_baseline = sorted_text_runs[0].baseline
|
|
225
|
+
current_baselines = [current_baseline]
|
|
226
|
+
|
|
227
|
+
sorted_text_runs.each do |text_run|
|
|
228
|
+
# Try to combine text_runs that share exactly the same height and are next to each
|
|
229
|
+
# other. This avoids potentially garbled output because if two text parts are above each
|
|
230
|
+
# other but end up on the same line, the text runs would be mixed up (think: centered
|
|
231
|
+
# table header where some cells contain two lines).
|
|
232
|
+
if (last = current_line[-1]) && last.bottom == text_run.bottom &&
|
|
233
|
+
last.top == text_run.top && text_run.left - last.right < 1
|
|
234
|
+
last.string << text_run.string
|
|
235
|
+
last.right = text_run.right
|
|
236
|
+
elsif (current_baseline - text_run.baseline).abs <= tolerance
|
|
237
|
+
current_line << text_run
|
|
238
|
+
current_baselines << text_run.baseline
|
|
239
|
+
current_baseline = median(current_baselines)
|
|
240
|
+
else
|
|
241
|
+
lines << Line.new(current_line.sort_by!(&:left), current_baseline)
|
|
242
|
+
current_line = [text_run]
|
|
243
|
+
current_baseline = text_run.baseline
|
|
244
|
+
current_baselines.clear
|
|
245
|
+
current_baselines << current_baseline
|
|
246
|
+
end
|
|
247
|
+
end
|
|
248
|
+
lines << Line.new(current_line.sort_by!(&:left), current_baseline)
|
|
249
|
+
end
|
|
250
|
+
private_class_method :group_into_lines
|
|
251
|
+
|
|
252
|
+
# Returns the median value of the given sorted array of numerics.
|
|
253
|
+
def self.median(sorted_array)
|
|
254
|
+
mid = sorted_array.length / 2
|
|
255
|
+
sorted_array.length.odd? ? sorted_array[mid] : (sorted_array[mid - 1] + sorted_array[mid]) / 2.0
|
|
256
|
+
end
|
|
257
|
+
private_class_method :median
|
|
258
|
+
|
|
259
|
+
# Renders an array of TextRun objects representing one line to a single string.
|
|
260
|
+
#
|
|
261
|
+
# +median_glyph_width+:: Is used to determine the column for each text run.
|
|
262
|
+
# +left_margin+:: Is removed from the left side to avoid unnecessary indentation.
|
|
263
|
+
def self.text_runs_to_string(text_runs, median_glyph_width, left_margin)
|
|
264
|
+
# Minimum gap to classify as a word boundary
|
|
265
|
+
space_threshold = median_glyph_width * 0.5
|
|
266
|
+
|
|
267
|
+
result = +''
|
|
268
|
+
# The column where the last text run ended. Can be different from result.size due to fitting
|
|
269
|
+
# proportional-width fonts to a fixed-column output.
|
|
270
|
+
cursor = 0
|
|
271
|
+
|
|
272
|
+
text_runs.each_with_index do |text_run, index|
|
|
273
|
+
target_col = ((text_run.left - left_margin) / median_glyph_width).round
|
|
274
|
+
advance = target_col - cursor
|
|
275
|
+
|
|
276
|
+
if advance > 0
|
|
277
|
+
result << ' ' * advance
|
|
278
|
+
cursor += advance
|
|
279
|
+
elsif index >= 1 && text_run.left - text_runs[index - 1].right > space_threshold &&
|
|
280
|
+
result[-1] != ' '
|
|
281
|
+
# Force space even if advance <= 0 when the actual spacing between text runs is large
|
|
282
|
+
# enough. This might happen because we are projecting proportional-width fonts to a
|
|
283
|
+
# fixed-column output.
|
|
284
|
+
cursor = target_col
|
|
285
|
+
result << ' '
|
|
286
|
+
end
|
|
287
|
+
|
|
288
|
+
result << text_run.string
|
|
289
|
+
|
|
290
|
+
# Move cursor to the text run's right edge but at least the text run's character count
|
|
291
|
+
# from the current position. This avoids gaps when there is too much difference between
|
|
292
|
+
# the on-page position and the approximated cursor. However, a one column difference is
|
|
293
|
+
# ignored to account for rounding errors.
|
|
294
|
+
cursor += text_run.string.size
|
|
295
|
+
text_run_right_edge_cursor = ((text_run.right - left_margin) / median_glyph_width).round
|
|
296
|
+
cursor = [text_run_right_edge_cursor, cursor].max if text_run_right_edge_cursor != cursor + 1
|
|
297
|
+
end
|
|
298
|
+
|
|
299
|
+
result.rstrip
|
|
300
|
+
end
|
|
301
|
+
private_class_method :text_runs_to_string
|
|
302
|
+
|
|
303
|
+
end
|
|
304
|
+
end
|
|
305
|
+
end
|
data/lib/hexapdf/content.rb
CHANGED
|
@@ -44,6 +44,7 @@ module HexaPDF
|
|
|
44
44
|
#
|
|
45
45
|
# * The Canvas class which provides an interface for drawing graphics and text.
|
|
46
46
|
# * The Parser and Processor classes for processing an existing content stream.
|
|
47
|
+
# * SmartTextExtractor for extracting layouted text from a page.
|
|
47
48
|
module Content
|
|
48
49
|
|
|
49
50
|
autoload(:Canvas, 'hexapdf/content/canvas')
|
|
@@ -52,6 +53,7 @@ module HexaPDF
|
|
|
52
53
|
autoload(:ColorSpace, 'hexapdf/content/color_space')
|
|
53
54
|
autoload(:Operator, 'hexapdf/content/operator')
|
|
54
55
|
autoload(:CanvasComposer, 'hexapdf/content/canvas_composer')
|
|
56
|
+
autoload(:SmartTextExtractor, 'hexapdf/content/smart_text_extractor')
|
|
55
57
|
|
|
56
58
|
end
|
|
57
59
|
|
|
@@ -52,9 +52,6 @@ module HexaPDF
|
|
|
52
52
|
# The signing handler is used by default by all methods that need a signing handler. Therefore
|
|
53
53
|
# it is usually only necessary to provide the actual attribute values.
|
|
54
54
|
#
|
|
55
|
-
# *Note*: Currently only RSA is supported, DSA and ECDSA are not. See the examples below for
|
|
56
|
-
# how to handle them using external signing.
|
|
57
|
-
#
|
|
58
55
|
#
|
|
59
56
|
# == CMS and PAdES Signatures
|
|
60
57
|
#
|
|
@@ -131,17 +128,6 @@ module HexaPDF
|
|
|
131
128
|
# document.sign("output.pdf", certificate: my_cert, certificate_chain: my_chain,
|
|
132
129
|
# external_signing: signing_proc)
|
|
133
130
|
#
|
|
134
|
-
# # Signing with DSA or ECDSA certificate/keys
|
|
135
|
-
# signing_proc = lambda do |io, byte_range|
|
|
136
|
-
# io.pos = byte_range[0]
|
|
137
|
-
# data = io.read(byte_range[1])
|
|
138
|
-
# io.pos = byte_range[2]
|
|
139
|
-
# data << io.read(byte_range[3])
|
|
140
|
-
# OpenSSL::PKCS7.sign(certificate, key, data, certificate_chain,
|
|
141
|
-
# OpenSSL::PKCS7::DETACHED | OpenSSL::PKCS7::BINARY).to_der
|
|
142
|
-
# end
|
|
143
|
-
# document.sign("output.pdf", signature_size: 10_000, external_signing: signing_proc)
|
|
144
|
-
#
|
|
145
131
|
#
|
|
146
132
|
# == Implementing a Signing Handler
|
|
147
133
|
#
|
|
@@ -277,7 +263,7 @@ module HexaPDF
|
|
|
277
263
|
# If a custom size is set using #signature_size=, it is used. Otherwise the size is determined
|
|
278
264
|
# by using #sign to sign an empty string.
|
|
279
265
|
def signature_size
|
|
280
|
-
@signature_size || sign(StringIO.new, [0, 0, 0, 0]).size
|
|
266
|
+
@signature_size || sign(StringIO.new, [0, 0, 0, 0]).size + 5
|
|
281
267
|
end
|
|
282
268
|
|
|
283
269
|
# Finalizes the signature field as well as the signature dictionary before writing.
|
|
@@ -121,7 +121,7 @@ module HexaPDF
|
|
|
121
121
|
private
|
|
122
122
|
|
|
123
123
|
# Creates the set of signed attributes for the signer information structure.
|
|
124
|
-
def create_signed_attrs(data, signing_time: true)
|
|
124
|
+
def create_signed_attrs(data, ess_cert_hash: 'sha256', signing_time: true)
|
|
125
125
|
signing_time = (self.signing_time || Time.now).utc if signing_time
|
|
126
126
|
set(
|
|
127
127
|
attribute('content-type', oid('id-data')),
|
|
@@ -132,12 +132,13 @@ module HexaPDF
|
|
|
132
132
|
),
|
|
133
133
|
attribute(
|
|
134
134
|
'id-aa-signingCertificateV2',
|
|
135
|
-
sequence( # SigningCertificateV2
|
|
135
|
+
sequence( # SigningCertificateV2, see RFC5035
|
|
136
136
|
sequence( # Seq of ESSCertIDv2
|
|
137
137
|
sequence( # ESSCertIDv2
|
|
138
|
-
#
|
|
139
|
-
|
|
140
|
-
|
|
138
|
+
(sequence( # AlgorithmIdentifier RFC3280 4.1.1.2
|
|
139
|
+
oid(ess_cert_hash) # algorithm
|
|
140
|
+
) unless ess_cert_hash == 'sha256'),
|
|
141
|
+
binary(OpenSSL::Digest.digest(ess_cert_hash, @certificate.to_der)), # certHash
|
|
141
142
|
sequence( # issuerSerial
|
|
142
143
|
sequence( # issuer
|
|
143
144
|
implicit(4, sequence(@certificate.issuer)) # choice 4 directoryName
|
|
@@ -184,13 +185,19 @@ module HexaPDF
|
|
|
184
185
|
# Creates a signer information structure containing the actual meat of the whole CMS object.
|
|
185
186
|
def create_signer_info(signature, signed_attrs, unsigned_attrs = nil)
|
|
186
187
|
certificate_pkey_algorithm = @certificate.public_key.oid
|
|
187
|
-
signature_algorithm =
|
|
188
|
+
signature_algorithm = case certificate_pkey_algorithm
|
|
189
|
+
when 'rsaEncryption'
|
|
188
190
|
sequence( # signatureAlgorithm
|
|
189
191
|
oid('rsaEncryption'), # algorithmID
|
|
190
192
|
null # params
|
|
191
193
|
)
|
|
192
|
-
|
|
193
|
-
|
|
194
|
+
when 'DSA'
|
|
195
|
+
unless @digest_algorithm == 'sha256'
|
|
196
|
+
raise HexaPDF::Error, "Only SHA256 supported with DSA"
|
|
197
|
+
end
|
|
198
|
+
sequence(oid('id-dsa-with-sha256'), null)
|
|
199
|
+
when 'id-ecPublicKey'
|
|
200
|
+
sequence(oid("ecdsa-with-#{@digest_algorithm.upcase}"), null)
|
|
194
201
|
end
|
|
195
202
|
|
|
196
203
|
sequence(
|
|
@@ -273,6 +280,12 @@ module HexaPDF
|
|
|
273
280
|
'sha384' => '2.16.840.1.101.3.4.2.2',
|
|
274
281
|
'sha512' => '2.16.840.1.101.3.4.2.3',
|
|
275
282
|
'rsaEncryption' => '1.2.840.113549.1.1.1',
|
|
283
|
+
'id-dsa-with-sha1' => '1.2.840.10040.4.3',
|
|
284
|
+
'id-dsa-with-sha256' => '2.16.840.1.101.3.4.3.2',
|
|
285
|
+
'ecdsa-with-SHA1' => '1.2.840.10045.4.1',
|
|
286
|
+
'ecdsa-with-SHA256' => '1.2.840.10045.4.3.2',
|
|
287
|
+
'ecdsa-with-SHA384' => '1.2.840.10045.4.3.3',
|
|
288
|
+
'ecdsa-with-SHA512' => '1.2.840.10045.4.3.4',
|
|
276
289
|
'id-aa-signingCertificate' => '1.2.840.113549.1.9.16.2.12',
|
|
277
290
|
'id-aa-timeStampToken' => '1.2.840.113549.1.9.16.2.14',
|
|
278
291
|
'id-aa-signingCertificateV2' => '1.2.840.113549.1.9.16.2.47',
|
data/lib/hexapdf/document.rb
CHANGED
|
@@ -394,11 +394,12 @@ module HexaPDF
|
|
|
394
394
|
# :call-seq:
|
|
395
395
|
# document.unwrap(obj) -> unwrapped_obj
|
|
396
396
|
#
|
|
397
|
-
# Recursively unwraps the object to get native Ruby objects (i.e. Hash, Array, Integer, ...
|
|
398
|
-
# instead of HexaPDF::Reference and HexaPDF::Object
|
|
397
|
+
# Recursively unwraps the object to get native Ruby objects (i.e. Hash, Array, Integer, ...)
|
|
398
|
+
# instead of HexaPDF::Reference and HexaPDF::Object. Only HexaPDF::Stream objects are retained
|
|
399
|
+
# as they are not representable by native Ruby objects.
|
|
399
400
|
def unwrap(object, seen = {})
|
|
400
401
|
object = deref(object)
|
|
401
|
-
object = object.data if object.kind_of?(HexaPDF::Object)
|
|
402
|
+
object = object.data if object.kind_of?(HexaPDF::Object) && !object.kind_of?(HexaPDF::Stream)
|
|
402
403
|
if seen.key?(object)
|
|
403
404
|
raise HexaPDF::Error, "Can't unwrap a recursive structure"
|
|
404
405
|
end
|
|
@@ -413,6 +414,8 @@ module HexaPDF
|
|
|
413
414
|
when HexaPDF::PDFData
|
|
414
415
|
seen[object] = true
|
|
415
416
|
unwrap(object.value, seen.dup)
|
|
417
|
+
when HexaPDF::Stream
|
|
418
|
+
object
|
|
416
419
|
else
|
|
417
420
|
object
|
|
418
421
|
end
|
|
@@ -790,6 +793,7 @@ module HexaPDF
|
|
|
790
793
|
if @metadata
|
|
791
794
|
metadata.modification_date(Time.now)
|
|
792
795
|
else
|
|
796
|
+
trailer.delete(:Info) unless trailer.info.kind_of?(HexaPDF::Dictionary)
|
|
793
797
|
trailer.info[:ModDate] = Time.now
|
|
794
798
|
end
|
|
795
799
|
end
|