pdf-reader 2.6.0 → 2.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +21 -1
- data/Rakefile +1 -1
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +1 -0
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +5 -3
- data/lib/pdf/reader/encoding.rb +2 -1
- data/lib/pdf/reader/error.rb +8 -0
- data/lib/pdf/reader/filter/ascii85.rb +2 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +7 -5
- data/lib/pdf/reader/filter/flate.rb +2 -0
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +44 -0
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +1 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +5 -2
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +73 -11
- data/lib/pdf/reader/page_layout.rb +28 -32
- data/lib/pdf/reader/page_state.rb +11 -10
- data/lib/pdf/reader/page_text_receiver.rb +53 -9
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +7 -1
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rectangle.rb +113 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +5 -0
- data/lib/pdf/reader/standard_security_handler.rb +1 -0
- data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +14 -6
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +1 -0
- data/lib/pdf/reader/zero_width_runs_filter.rb +2 -0
- data/lib/pdf/reader.rb +29 -6
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1763 -0
- metadata +13 -10
- data/lib/pdf/reader/orientation_detector.rb +0 -34
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6182ffd59631afba6a2c234547a428382b1ec2d7b414d89830b1143f1a0e1704
|
4
|
+
data.tar.gz: 6c0e6a7d32cf24912edc3aa96d72b7f70497d2fdd0e0913b86f871bbf9fa104f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 42dafbe0c36ce838da4c3120bf2187efde647e486971896d9a9c59c37dac3da0f2ccf3ecd98d8dd1d3acc5404bfcf26e64a327d7797648646afd6b40be02fec2
|
7
|
+
data.tar.gz: 40f0b0958024b558d6aca7eb2b3b6f042f034059c8fca52ce97fab7d55a39c313797605341331c65efd1099a1310ccbe386c354024dbd3cbc61c1d96c423842d
|
data/CHANGELOG
CHANGED
@@ -1,6 +1,26 @@
|
|
1
|
+
v2.8.0 (28th December 2021)
|
2
|
+
- Add PDF::Reader::Page#runs for extracting text from a page with positioning metadata (http://github.com/yob/pdf-reader/pull/411)
|
3
|
+
- Add options to PDF::Reader::Page#text to make some behaviour configurable (http://github.com/yob/pdf-reader/pull/411)
|
4
|
+
- including extracting the text for only part of the page
|
5
|
+
- Improve text positioning and extraction for Type3 fonts (http://github.com/yob/pdf-reader/pull/412)
|
6
|
+
- Skip extracting text that is positioned outside the page (http://github.com/yob/pdf-reader/pull/413)
|
7
|
+
- Fix occasional crash when reading some streams (http://github.com/yob/pdf-reader/pull/405)
|
8
|
+
|
9
|
+
v2.7.0 (13th December 2021)
|
10
|
+
- Include RBI type files in the gem
|
11
|
+
- Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
|
12
|
+
now be type checked by sorbet
|
13
|
+
- Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
|
14
|
+
- Improved text extraction on some rotated pages, and rotated text on normal pages
|
15
|
+
- Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
|
16
|
+
- Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
|
17
|
+
- Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
|
18
|
+
- Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
|
19
|
+
- Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
|
20
|
+
|
1
21
|
v2.6.0 (12th November 2021)
|
2
22
|
- Text extraction improvements
|
3
|
-
- Improved text layout on pages with a
|
23
|
+
- Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
|
4
24
|
- Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
|
5
25
|
- Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
|
6
26
|
- Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
|
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
|
|
14
14
|
Cane::RakeTask.new(:quality) do |cane|
|
15
15
|
cane.abc_max = 20
|
16
16
|
cane.style_measure = 100
|
17
|
-
cane.max_violations =
|
17
|
+
cane.max_violations = 28
|
18
18
|
|
19
19
|
cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
|
20
20
|
end
|
data/examples/rspec.rb
CHANGED
@@ -0,0 +1,16 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
|
7
|
+
# Filter out text/characters that are positioned outside a rectangle. Usually the page
|
8
|
+
# MediaBox or CropBox, but could be a user specified rectangle too
|
9
|
+
class BoundingRectangleRunsFilter
|
10
|
+
|
11
|
+
def self.runs_within_rect(runs, rect)
|
12
|
+
runs.select { |run| rect.contains?(run.origin) }
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
data/lib/pdf/reader/buffer.rb
CHANGED
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: false
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -32,6 +33,7 @@ class PDF::Reader
|
|
32
33
|
# extracting various useful information.
|
33
34
|
#
|
34
35
|
class CMap # :nodoc:
|
36
|
+
|
35
37
|
CMAP_KEYWORDS = {
|
36
38
|
"begincodespacerange" => 1,
|
37
39
|
"endcodespacerange" => 1,
|
@@ -53,7 +55,7 @@ class PDF::Reader
|
|
53
55
|
|
54
56
|
def process_data(data)
|
55
57
|
parser = build_parser(data)
|
56
|
-
mode =
|
58
|
+
mode = :none
|
57
59
|
instructions = []
|
58
60
|
|
59
61
|
while token = parser.parse_token(CMAP_KEYWORDS)
|
@@ -62,13 +64,13 @@ class PDF::Reader
|
|
62
64
|
elsif token == "endbfchar"
|
63
65
|
process_bfchar_instructions(instructions)
|
64
66
|
instructions = []
|
65
|
-
mode =
|
67
|
+
mode = :none
|
66
68
|
elsif token == "beginbfrange"
|
67
69
|
mode = :range
|
68
70
|
elsif token == "endbfrange"
|
69
71
|
process_bfrange_instructions(instructions)
|
70
72
|
instructions = []
|
71
|
-
mode =
|
73
|
+
mode = :none
|
72
74
|
elsif mode == :char || mode == :range
|
73
75
|
instructions << token
|
74
76
|
end
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -68,7 +69,7 @@ class PDF::Reader
|
|
68
69
|
#
|
69
70
|
# [25, :A, :B]
|
70
71
|
def differences=(diff)
|
71
|
-
|
72
|
+
PDF::Reader::Error.validate_type(diff, "diff", Array)
|
72
73
|
|
73
74
|
@differences = {}
|
74
75
|
byte = 0
|
data/lib/pdf/reader/error.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -46,6 +47,13 @@ class PDF::Reader
|
|
46
47
|
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
|
47
48
|
end
|
48
49
|
################################################################################
|
50
|
+
def self.validate_type(object, name, klass)
|
51
|
+
raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
|
52
|
+
end
|
53
|
+
################################################################################
|
54
|
+
def self.validate_not_nil(object, name)
|
55
|
+
raise ArgumentError, "#{object} must not be nil" if object.nil?
|
56
|
+
end
|
49
57
|
end
|
50
58
|
|
51
59
|
################################################################################
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: false
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'ascii85'
|
@@ -7,6 +8,7 @@ class PDF::Reader
|
|
7
8
|
module Filter # :nodoc:
|
8
9
|
# implementation of the Ascii85 filter
|
9
10
|
class Ascii85
|
11
|
+
|
10
12
|
def initialize(options = {})
|
11
13
|
@options = options
|
12
14
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
#
|
@@ -6,6 +7,7 @@ class PDF::Reader
|
|
6
7
|
module Filter # :nodoc:
|
7
8
|
# implementation of the AsciiHex stream filter
|
8
9
|
class AsciiHex
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options
|
11
13
|
end
|
@@ -16,9 +18,12 @@ class PDF::Reader
|
|
16
18
|
def filter(data)
|
17
19
|
data.chop! if data[-1,1] == ">"
|
18
20
|
data = data[1,data.size] if data[0,1] == "<"
|
21
|
+
|
22
|
+
return "" if data.nil?
|
23
|
+
|
19
24
|
data.gsub!(/[^A-Fa-f0-9]/,"")
|
20
25
|
data << "0" if data.size % 2 == 1
|
21
|
-
data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
|
26
|
+
data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
|
22
27
|
rescue Exception => e
|
23
28
|
# Oops, there was a problem decoding the stream
|
24
29
|
raise MalformedPDFError,
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
class PDF::Reader
|
@@ -6,6 +7,7 @@ class PDF::Reader
|
|
6
7
|
# some filter implementations support preprocessing of the data to
|
7
8
|
# improve compression
|
8
9
|
class Depredict
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options || {}
|
11
13
|
end
|
@@ -67,7 +69,7 @@ class PDF::Reader
|
|
67
69
|
scanline_length = (pixel_bytes * @options[:Columns]) + 1
|
68
70
|
row = 0
|
69
71
|
pixels = []
|
70
|
-
paeth, pa, pb, pc =
|
72
|
+
paeth, pa, pb, pc = 0, 0, 0, 0
|
71
73
|
until data.empty? do
|
72
74
|
row_data = data.slice! 0, scanline_length
|
73
75
|
filter = row_data.shift
|
@@ -94,17 +96,17 @@ class PDF::Reader
|
|
94
96
|
row_data[index] = (byte + ((left + upper)/2).floor) % 256
|
95
97
|
end
|
96
98
|
when 4 # Paeth
|
97
|
-
left = upper = upper_left =
|
99
|
+
left = upper = upper_left = 0
|
98
100
|
row_data.each_with_index do |byte, index|
|
99
101
|
col = index / pixel_bytes
|
100
102
|
|
101
|
-
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
103
|
+
left = index < pixel_bytes ? 0 : Integer(row_data[index - pixel_bytes])
|
102
104
|
if row.zero?
|
103
105
|
upper = upper_left = 0
|
104
106
|
else
|
105
|
-
upper = pixels[row-1][col][index % pixel_bytes]
|
107
|
+
upper = Integer(pixels[row-1][col][index % pixel_bytes])
|
106
108
|
upper_left = col.zero? ? 0 :
|
107
|
-
pixels[row-1][col-1][index % pixel_bytes]
|
109
|
+
Integer(pixels[row-1][col-1][index % pixel_bytes])
|
108
110
|
end
|
109
111
|
|
110
112
|
p = left + upper - upper_left
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
|
@@ -8,6 +9,7 @@ class PDF::Reader
|
|
8
9
|
module Filter # :nodoc:
|
9
10
|
# implementation of the Flate (zlib) stream filter
|
10
11
|
class Flate
|
12
|
+
|
11
13
|
ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
|
12
14
|
ZLIB_RAW_DEFLATE = -15 # Zlib::MAX_WBITS * -1
|
13
15
|
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
#
|
@@ -6,6 +7,7 @@ class PDF::Reader # :nodoc:
|
|
6
7
|
module Filter # :nodoc:
|
7
8
|
# implementation of the run length stream filter
|
8
9
|
class RunLength
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options
|
11
13
|
end
|
@@ -20,19 +22,23 @@ class PDF::Reader # :nodoc:
|
|
20
22
|
length = data.getbyte(pos)
|
21
23
|
pos += 1
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
25
|
+
unless length.nil?
|
26
|
+
case
|
27
|
+
# nothing
|
28
|
+
when length == 128
|
29
|
+
break
|
30
|
+
when length < 128
|
31
|
+
# When the length is < 128, we copy the following length+1 bytes
|
32
|
+
# literally.
|
33
|
+
out << data[pos, length + 1]
|
34
|
+
pos += length
|
35
|
+
else
|
36
|
+
# When the length is > 128, we copy the next byte (257 - length)
|
37
|
+
# times; i.e., "\xFA\x00" ([250, 0]) will expand to
|
38
|
+
# "\x00\x00\x00\x00\x00\x00\x00".
|
39
|
+
previous_byte = data[pos, 1] || ""
|
40
|
+
out << previous_byte * (257 - length)
|
41
|
+
end
|
36
42
|
end
|
37
43
|
|
38
44
|
pos += 1
|
data/lib/pdf/reader/filter.rb
CHANGED
data/lib/pdf/reader/font.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -42,6 +43,7 @@ class PDF::Reader
|
|
42
43
|
@tounicode = nil
|
43
44
|
|
44
45
|
extract_base_info(obj)
|
46
|
+
extract_type3_info(obj)
|
45
47
|
extract_descriptor(obj)
|
46
48
|
extract_descendants(obj)
|
47
49
|
@width_calc = build_width_calculator
|
@@ -72,8 +74,44 @@ class PDF::Reader
|
|
72
74
|
@cached_widths[code_point] ||= @width_calc.glyph_width(code_point)
|
73
75
|
end
|
74
76
|
|
77
|
+
# In most cases glyph width is converted into text space with a simple divide by 1000.
|
78
|
+
#
|
79
|
+
# However, Type3 fonts provide their own FontMatrix that's used for the transformation.
|
80
|
+
#
|
81
|
+
def glyph_width_in_text_space(code_point)
|
82
|
+
glyph_width_in_glyph_space = glyph_width(code_point)
|
83
|
+
|
84
|
+
if @subtype == :Type3
|
85
|
+
x1, y1 = font_matrix_transform(0,0)
|
86
|
+
x2, y2 = font_matrix_transform(glyph_width_in_glyph_space, 0)
|
87
|
+
(x2 - x1).abs.round(2)
|
88
|
+
else
|
89
|
+
glyph_width_in_glyph_space / 1000.0
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
75
93
|
private
|
76
94
|
|
95
|
+
# Only valid for Type3 fonts
|
96
|
+
def font_matrix_transform(x, y)
|
97
|
+
return x, y if @font_matrix.nil?
|
98
|
+
|
99
|
+
matrix = TransformationMatrix.new(
|
100
|
+
@font_matrix[0], @font_matrix[1],
|
101
|
+
@font_matrix[2], @font_matrix[3],
|
102
|
+
@font_matrix[4], @font_matrix[5],
|
103
|
+
)
|
104
|
+
|
105
|
+
if x == 0 && y == 0
|
106
|
+
[matrix.e, matrix.f]
|
107
|
+
else
|
108
|
+
[
|
109
|
+
(matrix.a * x) + (matrix.c * y) + (matrix.e),
|
110
|
+
(matrix.b * x) + (matrix.d * y) + (matrix.f)
|
111
|
+
]
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
77
115
|
def default_encoding(font_name)
|
78
116
|
case font_name.to_s
|
79
117
|
when "Symbol" then
|
@@ -137,6 +175,12 @@ class PDF::Reader
|
|
137
175
|
end
|
138
176
|
end
|
139
177
|
|
178
|
+
def extract_type3_info(obj)
|
179
|
+
if @subtype == :Type3
|
180
|
+
@font_matrix = @ohash.object(obj[:FontMatrix]) || [ 0.001, 0, 0, 0.001, 0, 0 ]
|
181
|
+
end
|
182
|
+
end
|
183
|
+
|
140
184
|
def extract_descriptor(obj)
|
141
185
|
if obj[:FontDescriptor]
|
142
186
|
# create a font descriptor object if we can, in other words, unless this is
|
data/lib/pdf/reader/lzw.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
module PDF
|
@@ -35,9 +36,9 @@ module PDF
|
|
35
36
|
|
36
37
|
def read
|
37
38
|
bits_left_in_chunk = @bits_in_chunk
|
38
|
-
chunk =
|
39
|
+
chunk = -1
|
39
40
|
while bits_left_in_chunk > 0 and @current_pos < @data.size
|
40
|
-
chunk = 0 if chunk
|
41
|
+
chunk = 0 if chunk < 0
|
41
42
|
codepoint = @data[@current_pos, 1].unpack("C*")[0]
|
42
43
|
current_byte = codepoint & (2**@bits_left_in_byte - 1) #clear consumed bits
|
43
44
|
dif = bits_left_in_chunk - @bits_left_in_byte
|
@@ -83,6 +84,7 @@ module PDF
|
|
83
84
|
#
|
84
85
|
def self.decode(data)
|
85
86
|
stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
|
87
|
+
string_table = StringTable.new
|
86
88
|
result = "".dup
|
87
89
|
until (code = stream.read) == CODE_EOD
|
88
90
|
if code == CODE_CLEAR_TABLE
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
class PDF::Reader
|
@@ -336,8 +337,10 @@ class PDF::Reader
|
|
336
337
|
obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
|
337
338
|
obj
|
338
339
|
when Hash then
|
339
|
-
arr = obj.map { |key,val| [key, decrypt(ref, val)] }
|
340
|
-
|
340
|
+
arr = obj.map { |key,val| [key, decrypt(ref, val)] }
|
341
|
+
arr.each_with_object({}) { |(k,v), accum|
|
342
|
+
accum[k] = v
|
343
|
+
}
|
341
344
|
when Array then
|
342
345
|
obj.collect { |item| decrypt(ref, item) }
|
343
346
|
when String
|
@@ -1,4 +1,6 @@
|
|
1
|
+
# typed: true
|
1
2
|
# coding: utf-8
|
3
|
+
# frozen_string_literal: true
|
2
4
|
|
3
5
|
class PDF::Reader
|
4
6
|
# remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
|
@@ -38,7 +40,8 @@ class PDF::Reader
|
|
38
40
|
|
39
41
|
def self.detect_intersection(sweep_line_status, event_point)
|
40
42
|
sweep_line_status.each do |open_text_run|
|
41
|
-
if
|
43
|
+
if open_text_run.text == event_point.run.text &&
|
44
|
+
event_point.x >= open_text_run.x &&
|
42
45
|
event_point.x <= open_text_run.endx &&
|
43
46
|
open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
|
44
47
|
return true
|
@@ -51,10 +54,14 @@ class PDF::Reader
|
|
51
54
|
# Utility class used to avoid modifying the underlying TextRun objects while we're
|
52
55
|
# looking for duplicates
|
53
56
|
class EventPoint
|
54
|
-
attr_reader :x, :run
|
55
57
|
|
56
|
-
|
57
|
-
|
58
|
+
attr_reader :x
|
59
|
+
|
60
|
+
attr_reader :run
|
61
|
+
|
62
|
+
def initialize(x, run)
|
63
|
+
@x = x
|
64
|
+
@run = run
|
58
65
|
end
|
59
66
|
|
60
67
|
def start?
|
data/lib/pdf/reader/page.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
module PDF
|
@@ -68,22 +69,56 @@ module PDF
|
|
68
69
|
@attributes
|
69
70
|
end
|
70
71
|
|
72
|
+
def height
|
73
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
74
|
+
rect.apply_rotation(rotate) if rotate > 0
|
75
|
+
rect.height
|
76
|
+
end
|
77
|
+
|
78
|
+
def width
|
79
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
80
|
+
rect.apply_rotation(rotate) if rotate > 0
|
81
|
+
rect.width
|
82
|
+
end
|
83
|
+
|
84
|
+
def origin
|
85
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
86
|
+
rect.apply_rotation(rotate) if rotate > 0
|
87
|
+
|
88
|
+
rect.bottom_left
|
89
|
+
end
|
90
|
+
|
71
91
|
# Convenience method to identify the page's orientation.
|
72
92
|
#
|
73
93
|
def orientation
|
74
|
-
|
94
|
+
if height > width
|
95
|
+
"portrait"
|
96
|
+
else
|
97
|
+
"landscape"
|
98
|
+
end
|
75
99
|
end
|
76
100
|
|
77
101
|
# returns the plain text content of this page encoded as UTF-8. Any
|
78
102
|
# characters that can't be translated will be returned as a ▯
|
79
103
|
#
|
80
|
-
def text
|
104
|
+
def text(opts = {})
|
81
105
|
receiver = PageTextReceiver.new
|
82
106
|
walk(receiver)
|
83
|
-
receiver.
|
107
|
+
runs = receiver.runs(opts)
|
108
|
+
|
109
|
+
# rectangles[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
|
110
|
+
mediabox = rectangles[:MediaBox] || Rectangle.new(0, 0, 0, 0)
|
111
|
+
|
112
|
+
PageLayout.new(runs, mediabox).to_s
|
84
113
|
end
|
85
114
|
alias :to_s :text
|
86
115
|
|
116
|
+
def runs(opts = {})
|
117
|
+
receiver = PageTextReceiver.new
|
118
|
+
walk(receiver)
|
119
|
+
receiver.runs(opts)
|
120
|
+
end
|
121
|
+
|
87
122
|
# processes the raw content stream for this page in sequential order and
|
88
123
|
# passes callbacks to the receiver objects.
|
89
124
|
#
|
@@ -139,23 +174,50 @@ module PDF
|
|
139
174
|
# returns the "boxes" that define the page object.
|
140
175
|
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
141
176
|
#
|
177
|
+
# DEPRECATED. Recommend using Page#rectangles instead
|
178
|
+
#
|
142
179
|
def boxes
|
143
|
-
|
144
|
-
|
180
|
+
# In ruby 2.4+ we could use Hash#transform_values
|
181
|
+
Hash[rectangles.map{ |k,rect| [k,rect.to_a] } ]
|
182
|
+
end
|
183
|
+
|
184
|
+
# returns the "boxes" that define the page object.
|
185
|
+
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
186
|
+
#
|
187
|
+
def rectangles
|
188
|
+
mediabox = objects.deref!(attributes[:MediaBox])
|
189
|
+
cropbox = objects.deref!(attributes[:Cropbox]) || mediabox
|
190
|
+
bleedbox = objects.deref!(attributes[:BleedBox]) || cropbox
|
191
|
+
trimbox = objects.deref!(attributes[:TrimBox]) || cropbox
|
192
|
+
artbox = objects.deref!(attributes[:ArtBox]) || cropbox
|
193
|
+
|
194
|
+
mediarect = Rectangle.new(*mediabox)
|
195
|
+
croprect = Rectangle.new(*cropbox)
|
196
|
+
bleedrect = Rectangle.new(*bleedbox)
|
197
|
+
trimrect = Rectangle.new(*trimbox)
|
198
|
+
artrect = Rectangle.new(*artbox)
|
199
|
+
|
200
|
+
if rotate > 0
|
201
|
+
mediarect.apply_rotation(rotate)
|
202
|
+
croprect.apply_rotation(rotate)
|
203
|
+
bleedrect.apply_rotation(rotate)
|
204
|
+
trimrect.apply_rotation(rotate)
|
205
|
+
artrect.apply_rotation(rotate)
|
206
|
+
end
|
145
207
|
|
146
208
|
{
|
147
|
-
MediaBox:
|
148
|
-
CropBox:
|
149
|
-
BleedBox:
|
150
|
-
TrimBox:
|
151
|
-
ArtBox:
|
209
|
+
MediaBox: mediarect,
|
210
|
+
CropBox: croprect,
|
211
|
+
BleedBox: bleedrect,
|
212
|
+
TrimBox: trimrect,
|
213
|
+
ArtBox: artrect,
|
152
214
|
}
|
153
215
|
end
|
154
216
|
|
155
217
|
private
|
156
218
|
|
157
219
|
def root
|
158
|
-
|
220
|
+
@root ||= objects.deref(@objects.trailer[:Root])
|
159
221
|
end
|
160
222
|
|
161
223
|
# Returns the resources that accompany this page. Includes
|