pdf-reader 2.6.0 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +13 -1
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/buffer.rb +1 -0
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +5 -3
- data/lib/pdf/reader/encoding.rb +2 -1
- data/lib/pdf/reader/error.rb +8 -0
- data/lib/pdf/reader/filter/ascii85.rb +2 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +7 -5
- data/lib/pdf/reader/filter/flate.rb +2 -0
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +1 -0
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +1 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +5 -2
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +60 -9
- data/lib/pdf/reader/page_layout.rb +24 -14
- data/lib/pdf/reader/page_state.rb +11 -10
- data/lib/pdf/reader/page_text_receiver.rb +13 -8
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +4 -1
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rectangle.rb +95 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +5 -0
- data/lib/pdf/reader/standard_security_handler.rb +1 -0
- data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +1 -0
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +1 -0
- data/lib/pdf/reader/zero_width_runs_filter.rb +2 -0
- data/lib/pdf/reader.rb +14 -4
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1744 -0
- metadata +12 -10
- data/lib/pdf/reader/orientation_detector.rb +0 -34
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5ee0d8c3c55f6a0aebb60a0a6dce92428e8371b96a6beb6d75bfe90602bffae7
|
4
|
+
data.tar.gz: '0911d108353bf577aa9fd7b49b97dda1cf9d54816bf8ff6c4225281eeda63229'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 917db2b1fb977b41e7b057ff3d215b8f249577254d9fe3df72f330b32ff49630874c58f480495ddcd137d9f31d014083438623cdf7260b0d7a87bbe3a5f3685a
|
7
|
+
data.tar.gz: cd9832f025264e54d586e81eff69727379e8646d741f53ae61e90a5b38945d852147853891d468bab683581bdd0beb68a9b7c7f5e54e064e9a3935262ea9d651
|
data/CHANGELOG
CHANGED
@@ -1,6 +1,18 @@
|
|
1
|
+
v2.7.0 (13th December 2021)
|
2
|
+
- Include RBI type files in the gem
|
3
|
+
- Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
|
4
|
+
now be type checked by sorbet
|
5
|
+
- Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
|
6
|
+
- Improved text extraction on some rotated pages, and rotated text on normal pages
|
7
|
+
- Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
|
8
|
+
- Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
|
9
|
+
- Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
|
10
|
+
- Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
|
11
|
+
- Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
|
12
|
+
|
1
13
|
v2.6.0 (12th November 2021)
|
2
14
|
- Text extraction improvements
|
3
|
-
- Improved text layout on pages with a
|
15
|
+
- Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
|
4
16
|
- Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
|
5
17
|
- Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
|
6
18
|
- Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
|
data/examples/rspec.rb
CHANGED
data/lib/pdf/reader/buffer.rb
CHANGED
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: false
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -32,6 +33,7 @@ class PDF::Reader
|
|
32
33
|
# extracting various useful information.
|
33
34
|
#
|
34
35
|
class CMap # :nodoc:
|
36
|
+
|
35
37
|
CMAP_KEYWORDS = {
|
36
38
|
"begincodespacerange" => 1,
|
37
39
|
"endcodespacerange" => 1,
|
@@ -53,7 +55,7 @@ class PDF::Reader
|
|
53
55
|
|
54
56
|
def process_data(data)
|
55
57
|
parser = build_parser(data)
|
56
|
-
mode =
|
58
|
+
mode = :none
|
57
59
|
instructions = []
|
58
60
|
|
59
61
|
while token = parser.parse_token(CMAP_KEYWORDS)
|
@@ -62,13 +64,13 @@ class PDF::Reader
|
|
62
64
|
elsif token == "endbfchar"
|
63
65
|
process_bfchar_instructions(instructions)
|
64
66
|
instructions = []
|
65
|
-
mode =
|
67
|
+
mode = :none
|
66
68
|
elsif token == "beginbfrange"
|
67
69
|
mode = :range
|
68
70
|
elsif token == "endbfrange"
|
69
71
|
process_bfrange_instructions(instructions)
|
70
72
|
instructions = []
|
71
|
-
mode =
|
73
|
+
mode = :none
|
72
74
|
elsif mode == :char || mode == :range
|
73
75
|
instructions << token
|
74
76
|
end
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -68,7 +69,7 @@ class PDF::Reader
|
|
68
69
|
#
|
69
70
|
# [25, :A, :B]
|
70
71
|
def differences=(diff)
|
71
|
-
|
72
|
+
PDF::Reader::Error.validate_type(diff, "diff", Array)
|
72
73
|
|
73
74
|
@differences = {}
|
74
75
|
byte = 0
|
data/lib/pdf/reader/error.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -46,6 +47,13 @@ class PDF::Reader
|
|
46
47
|
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
|
47
48
|
end
|
48
49
|
################################################################################
|
50
|
+
def self.validate_type(object, name, klass)
|
51
|
+
raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
|
52
|
+
end
|
53
|
+
################################################################################
|
54
|
+
def self.validate_not_nil(object, name)
|
55
|
+
raise ArgumentError, "#{object} must not be nil" if object.nil?
|
56
|
+
end
|
49
57
|
end
|
50
58
|
|
51
59
|
################################################################################
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: false
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'ascii85'
|
@@ -7,6 +8,7 @@ class PDF::Reader
|
|
7
8
|
module Filter # :nodoc:
|
8
9
|
# implementation of the Ascii85 filter
|
9
10
|
class Ascii85
|
11
|
+
|
10
12
|
def initialize(options = {})
|
11
13
|
@options = options
|
12
14
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
#
|
@@ -6,6 +7,7 @@ class PDF::Reader
|
|
6
7
|
module Filter # :nodoc:
|
7
8
|
# implementation of the AsciiHex stream filter
|
8
9
|
class AsciiHex
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options
|
11
13
|
end
|
@@ -16,9 +18,12 @@ class PDF::Reader
|
|
16
18
|
def filter(data)
|
17
19
|
data.chop! if data[-1,1] == ">"
|
18
20
|
data = data[1,data.size] if data[0,1] == "<"
|
21
|
+
|
22
|
+
return "" if data.nil?
|
23
|
+
|
19
24
|
data.gsub!(/[^A-Fa-f0-9]/,"")
|
20
25
|
data << "0" if data.size % 2 == 1
|
21
|
-
data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
|
26
|
+
data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
|
22
27
|
rescue Exception => e
|
23
28
|
# Oops, there was a problem decoding the stream
|
24
29
|
raise MalformedPDFError,
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
class PDF::Reader
|
@@ -6,6 +7,7 @@ class PDF::Reader
|
|
6
7
|
# some filter implementations support preprocessing of the data to
|
7
8
|
# improve compression
|
8
9
|
class Depredict
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options || {}
|
11
13
|
end
|
@@ -67,7 +69,7 @@ class PDF::Reader
|
|
67
69
|
scanline_length = (pixel_bytes * @options[:Columns]) + 1
|
68
70
|
row = 0
|
69
71
|
pixels = []
|
70
|
-
paeth, pa, pb, pc =
|
72
|
+
paeth, pa, pb, pc = 0, 0, 0, 0
|
71
73
|
until data.empty? do
|
72
74
|
row_data = data.slice! 0, scanline_length
|
73
75
|
filter = row_data.shift
|
@@ -94,17 +96,17 @@ class PDF::Reader
|
|
94
96
|
row_data[index] = (byte + ((left + upper)/2).floor) % 256
|
95
97
|
end
|
96
98
|
when 4 # Paeth
|
97
|
-
left = upper = upper_left =
|
99
|
+
left = upper = upper_left = 0
|
98
100
|
row_data.each_with_index do |byte, index|
|
99
101
|
col = index / pixel_bytes
|
100
102
|
|
101
|
-
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
103
|
+
left = index < pixel_bytes ? 0 : Integer(row_data[index - pixel_bytes])
|
102
104
|
if row.zero?
|
103
105
|
upper = upper_left = 0
|
104
106
|
else
|
105
|
-
upper = pixels[row-1][col][index % pixel_bytes]
|
107
|
+
upper = Integer(pixels[row-1][col][index % pixel_bytes])
|
106
108
|
upper_left = col.zero? ? 0 :
|
107
|
-
pixels[row-1][col-1][index % pixel_bytes]
|
109
|
+
Integer(pixels[row-1][col-1][index % pixel_bytes])
|
108
110
|
end
|
109
111
|
|
110
112
|
p = left + upper - upper_left
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
|
@@ -8,6 +9,7 @@ class PDF::Reader
|
|
8
9
|
module Filter # :nodoc:
|
9
10
|
# implementation of the Flate (zlib) stream filter
|
10
11
|
class Flate
|
12
|
+
|
11
13
|
ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
|
12
14
|
ZLIB_RAW_DEFLATE = -15 # Zlib::MAX_WBITS * -1
|
13
15
|
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
#
|
@@ -6,6 +7,7 @@ class PDF::Reader # :nodoc:
|
|
6
7
|
module Filter # :nodoc:
|
7
8
|
# implementation of the run length stream filter
|
8
9
|
class RunLength
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options
|
11
13
|
end
|
@@ -20,19 +22,23 @@ class PDF::Reader # :nodoc:
|
|
20
22
|
length = data.getbyte(pos)
|
21
23
|
pos += 1
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
25
|
+
unless length.nil?
|
26
|
+
case
|
27
|
+
# nothing
|
28
|
+
when length == 128
|
29
|
+
break
|
30
|
+
when length < 128
|
31
|
+
# When the length is < 128, we copy the following length+1 bytes
|
32
|
+
# literally.
|
33
|
+
out << data[pos, length + 1]
|
34
|
+
pos += length
|
35
|
+
else
|
36
|
+
# When the length is > 128, we copy the next byte (257 - length)
|
37
|
+
# times; i.e., "\xFA\x00" ([250, 0]) will expand to
|
38
|
+
# "\x00\x00\x00\x00\x00\x00\x00".
|
39
|
+
previous_byte = data[pos, 1] || ""
|
40
|
+
out << previous_byte * (257 - length)
|
41
|
+
end
|
36
42
|
end
|
37
43
|
|
38
44
|
pos += 1
|
data/lib/pdf/reader/filter.rb
CHANGED
data/lib/pdf/reader/font.rb
CHANGED
data/lib/pdf/reader/lzw.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
module PDF
|
@@ -35,9 +36,9 @@ module PDF
|
|
35
36
|
|
36
37
|
def read
|
37
38
|
bits_left_in_chunk = @bits_in_chunk
|
38
|
-
chunk =
|
39
|
+
chunk = -1
|
39
40
|
while bits_left_in_chunk > 0 and @current_pos < @data.size
|
40
|
-
chunk = 0 if chunk
|
41
|
+
chunk = 0 if chunk < 0
|
41
42
|
codepoint = @data[@current_pos, 1].unpack("C*")[0]
|
42
43
|
current_byte = codepoint & (2**@bits_left_in_byte - 1) #clear consumed bits
|
43
44
|
dif = bits_left_in_chunk - @bits_left_in_byte
|
@@ -83,6 +84,7 @@ module PDF
|
|
83
84
|
#
|
84
85
|
def self.decode(data)
|
85
86
|
stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
|
87
|
+
string_table = StringTable.new
|
86
88
|
result = "".dup
|
87
89
|
until (code = stream.read) == CODE_EOD
|
88
90
|
if code == CODE_CLEAR_TABLE
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
class PDF::Reader
|
@@ -336,8 +337,10 @@ class PDF::Reader
|
|
336
337
|
obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
|
337
338
|
obj
|
338
339
|
when Hash then
|
339
|
-
arr = obj.map { |key,val| [key, decrypt(ref, val)] }
|
340
|
-
|
340
|
+
arr = obj.map { |key,val| [key, decrypt(ref, val)] }
|
341
|
+
arr.each_with_object({}) { |(k,v), accum|
|
342
|
+
accum[k] = v
|
343
|
+
}
|
341
344
|
when Array then
|
342
345
|
obj.collect { |item| decrypt(ref, item) }
|
343
346
|
when String
|
@@ -1,4 +1,6 @@
|
|
1
|
+
# typed: true
|
1
2
|
# coding: utf-8
|
3
|
+
# frozen_string_literal: true
|
2
4
|
|
3
5
|
class PDF::Reader
|
4
6
|
# remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
|
@@ -38,7 +40,8 @@ class PDF::Reader
|
|
38
40
|
|
39
41
|
def self.detect_intersection(sweep_line_status, event_point)
|
40
42
|
sweep_line_status.each do |open_text_run|
|
41
|
-
if
|
43
|
+
if open_text_run.text == event_point.run.text &&
|
44
|
+
event_point.x >= open_text_run.x &&
|
42
45
|
event_point.x <= open_text_run.endx &&
|
43
46
|
open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
|
44
47
|
return true
|
@@ -51,10 +54,14 @@ class PDF::Reader
|
|
51
54
|
# Utility class used to avoid modifying the underlying TextRun objects while we're
|
52
55
|
# looking for duplicates
|
53
56
|
class EventPoint
|
54
|
-
attr_reader :x, :run
|
55
57
|
|
56
|
-
|
57
|
-
|
58
|
+
attr_reader :x
|
59
|
+
|
60
|
+
attr_reader :run
|
61
|
+
|
62
|
+
def initialize(x, run)
|
63
|
+
@x = x
|
64
|
+
@run = run
|
58
65
|
end
|
59
66
|
|
60
67
|
def start?
|
data/lib/pdf/reader/page.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
module PDF
|
@@ -68,10 +69,33 @@ module PDF
|
|
68
69
|
@attributes
|
69
70
|
end
|
70
71
|
|
72
|
+
def height
|
73
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
74
|
+
rect.apply_rotation(rotate) if rotate > 0
|
75
|
+
rect.height
|
76
|
+
end
|
77
|
+
|
78
|
+
def width
|
79
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
80
|
+
rect.apply_rotation(rotate) if rotate > 0
|
81
|
+
rect.width
|
82
|
+
end
|
83
|
+
|
84
|
+
def origin
|
85
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
86
|
+
rect.apply_rotation(rotate) if rotate > 0
|
87
|
+
|
88
|
+
rect.bottom_left
|
89
|
+
end
|
90
|
+
|
71
91
|
# Convenience method to identify the page's orientation.
|
72
92
|
#
|
73
93
|
def orientation
|
74
|
-
|
94
|
+
if height > width
|
95
|
+
"portrait"
|
96
|
+
else
|
97
|
+
"landscape"
|
98
|
+
end
|
75
99
|
end
|
76
100
|
|
77
101
|
# returns the plain text content of this page encoded as UTF-8. Any
|
@@ -139,23 +163,50 @@ module PDF
|
|
139
163
|
# returns the "boxes" that define the page object.
|
140
164
|
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
141
165
|
#
|
166
|
+
# DEPRECATED. Recommend using Page#rectangles instead
|
167
|
+
#
|
142
168
|
def boxes
|
143
|
-
|
144
|
-
|
169
|
+
# In ruby 2.4+ we could use Hash#transform_values
|
170
|
+
Hash[rectangles.map{ |k,rect| [k,rect.to_a] } ]
|
171
|
+
end
|
172
|
+
|
173
|
+
# returns the "boxes" that define the page object.
|
174
|
+
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
175
|
+
#
|
176
|
+
def rectangles
|
177
|
+
mediabox = objects.deref!(attributes[:MediaBox])
|
178
|
+
cropbox = objects.deref!(attributes[:Cropbox]) || mediabox
|
179
|
+
bleedbox = objects.deref!(attributes[:BleedBox]) || cropbox
|
180
|
+
trimbox = objects.deref!(attributes[:TrimBox]) || cropbox
|
181
|
+
artbox = objects.deref!(attributes[:ArtBox]) || cropbox
|
182
|
+
|
183
|
+
mediarect = Rectangle.new(*mediabox)
|
184
|
+
croprect = Rectangle.new(*cropbox)
|
185
|
+
bleedrect = Rectangle.new(*bleedbox)
|
186
|
+
trimrect = Rectangle.new(*trimbox)
|
187
|
+
artrect = Rectangle.new(*artbox)
|
188
|
+
|
189
|
+
if rotate > 0
|
190
|
+
mediarect.apply_rotation(rotate)
|
191
|
+
croprect.apply_rotation(rotate)
|
192
|
+
bleedrect.apply_rotation(rotate)
|
193
|
+
trimrect.apply_rotation(rotate)
|
194
|
+
artrect.apply_rotation(rotate)
|
195
|
+
end
|
145
196
|
|
146
197
|
{
|
147
|
-
MediaBox:
|
148
|
-
CropBox:
|
149
|
-
BleedBox:
|
150
|
-
TrimBox:
|
151
|
-
ArtBox:
|
198
|
+
MediaBox: mediarect,
|
199
|
+
CropBox: croprect,
|
200
|
+
BleedBox: bleedrect,
|
201
|
+
TrimBox: trimrect,
|
202
|
+
ArtBox: artrect,
|
152
203
|
}
|
153
204
|
end
|
154
205
|
|
155
206
|
private
|
156
207
|
|
157
208
|
def root
|
158
|
-
|
209
|
+
@root ||= objects.deref(@objects.trailer[:Root])
|
159
210
|
end
|
160
211
|
|
161
212
|
# Returns the resources that accompany this page. Includes
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'pdf/reader/overlapping_runs_filter'
|
@@ -16,16 +17,17 @@ class PDF::Reader
|
|
16
17
|
DEFAULT_FONT_SIZE = 12
|
17
18
|
|
18
19
|
def initialize(runs, mediabox)
|
19
|
-
|
20
|
+
# mediabox is a 4-element array for now, but it'd be nice to switch to a
|
21
|
+
# PDF::Reader::Rectangle at some point
|
22
|
+
PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
|
20
23
|
|
21
24
|
runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
|
22
25
|
runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
|
26
|
+
@mediabox = mediabox
|
23
27
|
@runs = merge_runs(runs)
|
24
28
|
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
|
25
29
|
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
|
26
30
|
@median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
|
27
|
-
@page_width = (mediabox[2] - mediabox[0]).abs
|
28
|
-
@page_height = (mediabox[3] - mediabox[1]).abs
|
29
31
|
@x_offset = @runs.map(&:x).sort.first || 0
|
30
32
|
lowest_y = @runs.map(&:y).sort.first || 0
|
31
33
|
@y_offset = lowest_y > 0 ? 0 : lowest_y
|
@@ -48,6 +50,16 @@ class PDF::Reader
|
|
48
50
|
|
49
51
|
private
|
50
52
|
|
53
|
+
def page_width
|
54
|
+
# TODO once @mediabox is a Rectangle, this can be just `@mediabox.width`
|
55
|
+
(@mediabox[2].to_f - @mediabox[0].to_f).abs
|
56
|
+
end
|
57
|
+
|
58
|
+
def page_height
|
59
|
+
# TODO once @mediabox is a Rectangle, this can be just `@mediabox.height`
|
60
|
+
(@mediabox[3].to_f - @mediabox[1].to_f).abs
|
61
|
+
end
|
62
|
+
|
51
63
|
# given an array of strings, return a new array with empty rows from the
|
52
64
|
# beginning and end removed.
|
53
65
|
#
|
@@ -66,19 +78,19 @@ class PDF::Reader
|
|
66
78
|
end
|
67
79
|
|
68
80
|
def row_count
|
69
|
-
@row_count ||= (
|
81
|
+
@row_count ||= (page_height / @mean_font_size).floor
|
70
82
|
end
|
71
83
|
|
72
84
|
def col_count
|
73
|
-
@col_count ||= ((
|
85
|
+
@col_count ||= ((page_width / @median_glyph_width) * 1.05).floor
|
74
86
|
end
|
75
87
|
|
76
88
|
def row_multiplier
|
77
|
-
@row_multiplier ||=
|
89
|
+
@row_multiplier ||= page_height.to_f / row_count.to_f
|
78
90
|
end
|
79
91
|
|
80
92
|
def col_multiplier
|
81
|
-
@col_multiplier ||=
|
93
|
+
@col_multiplier ||= page_width.to_f / col_count.to_f
|
82
94
|
end
|
83
95
|
|
84
96
|
def mean(collection)
|
@@ -108,17 +120,15 @@ class PDF::Reader
|
|
108
120
|
end
|
109
121
|
|
110
122
|
def group_chars_into_runs(chars)
|
111
|
-
|
112
|
-
while head = chars.shift
|
123
|
+
chars.each_with_object([]) do |char, runs|
|
113
124
|
if runs.empty?
|
114
|
-
runs <<
|
115
|
-
elsif runs.last.mergable?(
|
116
|
-
runs[-1] = runs.last +
|
125
|
+
runs << char
|
126
|
+
elsif runs.last.mergable?(char)
|
127
|
+
runs[-1] = runs.last + char
|
117
128
|
else
|
118
|
-
runs <<
|
129
|
+
runs << char
|
119
130
|
end
|
120
131
|
end
|
121
|
-
runs
|
122
132
|
end
|
123
133
|
|
124
134
|
def local_string_insert(haystack, needle, index)
|