pdf-reader 2.6.0 → 2.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +13 -1
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/buffer.rb +1 -0
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +5 -3
- data/lib/pdf/reader/encoding.rb +2 -1
- data/lib/pdf/reader/error.rb +8 -0
- data/lib/pdf/reader/filter/ascii85.rb +2 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +7 -5
- data/lib/pdf/reader/filter/flate.rb +2 -0
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +1 -0
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +1 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +5 -2
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +60 -9
- data/lib/pdf/reader/page_layout.rb +24 -14
- data/lib/pdf/reader/page_state.rb +11 -10
- data/lib/pdf/reader/page_text_receiver.rb +13 -8
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +4 -1
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rectangle.rb +95 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +5 -0
- data/lib/pdf/reader/standard_security_handler.rb +1 -0
- data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +1 -0
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +1 -0
- data/lib/pdf/reader/zero_width_runs_filter.rb +2 -0
- data/lib/pdf/reader.rb +14 -4
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1744 -0
- metadata +12 -10
- data/lib/pdf/reader/orientation_detector.rb +0 -34
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5ee0d8c3c55f6a0aebb60a0a6dce92428e8371b96a6beb6d75bfe90602bffae7
|
4
|
+
data.tar.gz: '0911d108353bf577aa9fd7b49b97dda1cf9d54816bf8ff6c4225281eeda63229'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 917db2b1fb977b41e7b057ff3d215b8f249577254d9fe3df72f330b32ff49630874c58f480495ddcd137d9f31d014083438623cdf7260b0d7a87bbe3a5f3685a
|
7
|
+
data.tar.gz: cd9832f025264e54d586e81eff69727379e8646d741f53ae61e90a5b38945d852147853891d468bab683581bdd0beb68a9b7c7f5e54e064e9a3935262ea9d651
|
data/CHANGELOG
CHANGED
@@ -1,6 +1,18 @@
|
|
1
|
+
v2.7.0 (13th December 2021)
|
2
|
+
- Include RBI type files in the gem
|
3
|
+
- Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
|
4
|
+
now be typed checked by sorbet
|
5
|
+
- Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
|
6
|
+
- Improved text extraction on some rotated pages, and rotated text on normal pages
|
7
|
+
- Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
|
8
|
+
- Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
|
9
|
+
- Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
|
10
|
+
- Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
|
11
|
+
- Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
|
12
|
+
|
1
13
|
v2.6.0 (12th November 2021)
|
2
14
|
- Text extraction improvements
|
3
|
-
- Improved text layout on pages with a
|
15
|
+
- Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
|
4
16
|
- Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
|
5
17
|
- Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
|
6
18
|
- Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
|
data/examples/rspec.rb
CHANGED
data/lib/pdf/reader/buffer.rb
CHANGED
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: false
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -32,6 +33,7 @@ class PDF::Reader
|
|
32
33
|
# extracting various useful information.
|
33
34
|
#
|
34
35
|
class CMap # :nodoc:
|
36
|
+
|
35
37
|
CMAP_KEYWORDS = {
|
36
38
|
"begincodespacerange" => 1,
|
37
39
|
"endcodespacerange" => 1,
|
@@ -53,7 +55,7 @@ class PDF::Reader
|
|
53
55
|
|
54
56
|
def process_data(data)
|
55
57
|
parser = build_parser(data)
|
56
|
-
mode =
|
58
|
+
mode = :none
|
57
59
|
instructions = []
|
58
60
|
|
59
61
|
while token = parser.parse_token(CMAP_KEYWORDS)
|
@@ -62,13 +64,13 @@ class PDF::Reader
|
|
62
64
|
elsif token == "endbfchar"
|
63
65
|
process_bfchar_instructions(instructions)
|
64
66
|
instructions = []
|
65
|
-
mode =
|
67
|
+
mode = :none
|
66
68
|
elsif token == "beginbfrange"
|
67
69
|
mode = :range
|
68
70
|
elsif token == "endbfrange"
|
69
71
|
process_bfrange_instructions(instructions)
|
70
72
|
instructions = []
|
71
|
-
mode =
|
73
|
+
mode = :none
|
72
74
|
elsif mode == :char || mode == :range
|
73
75
|
instructions << token
|
74
76
|
end
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -68,7 +69,7 @@ class PDF::Reader
|
|
68
69
|
#
|
69
70
|
# [25, :A, :B]
|
70
71
|
def differences=(diff)
|
71
|
-
|
72
|
+
PDF::Reader::Error.validate_type(diff, "diff", Array)
|
72
73
|
|
73
74
|
@differences = {}
|
74
75
|
byte = 0
|
data/lib/pdf/reader/error.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -46,6 +47,13 @@ class PDF::Reader
|
|
46
47
|
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
|
47
48
|
end
|
48
49
|
################################################################################
|
50
|
+
def self.validate_type(object, name, klass)
|
51
|
+
raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
|
52
|
+
end
|
53
|
+
################################################################################
|
54
|
+
def self.validate_not_nil(object, name)
|
55
|
+
raise ArgumentError, "#{object} must not be nil" if object.nil?
|
56
|
+
end
|
49
57
|
end
|
50
58
|
|
51
59
|
################################################################################
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: false
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'ascii85'
|
@@ -7,6 +8,7 @@ class PDF::Reader
|
|
7
8
|
module Filter # :nodoc:
|
8
9
|
# implementation of the Ascii85 filter
|
9
10
|
class Ascii85
|
11
|
+
|
10
12
|
def initialize(options = {})
|
11
13
|
@options = options
|
12
14
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
#
|
@@ -6,6 +7,7 @@ class PDF::Reader
|
|
6
7
|
module Filter # :nodoc:
|
7
8
|
# implementation of the AsciiHex stream filter
|
8
9
|
class AsciiHex
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options
|
11
13
|
end
|
@@ -16,9 +18,12 @@ class PDF::Reader
|
|
16
18
|
def filter(data)
|
17
19
|
data.chop! if data[-1,1] == ">"
|
18
20
|
data = data[1,data.size] if data[0,1] == "<"
|
21
|
+
|
22
|
+
return "" if data.nil?
|
23
|
+
|
19
24
|
data.gsub!(/[^A-Fa-f0-9]/,"")
|
20
25
|
data << "0" if data.size % 2 == 1
|
21
|
-
data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
|
26
|
+
data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
|
22
27
|
rescue Exception => e
|
23
28
|
# Oops, there was a problem decoding the stream
|
24
29
|
raise MalformedPDFError,
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
class PDF::Reader
|
@@ -6,6 +7,7 @@ class PDF::Reader
|
|
6
7
|
# some filter implementations support preprocessing of the data to
|
7
8
|
# improve compression
|
8
9
|
class Depredict
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options || {}
|
11
13
|
end
|
@@ -67,7 +69,7 @@ class PDF::Reader
|
|
67
69
|
scanline_length = (pixel_bytes * @options[:Columns]) + 1
|
68
70
|
row = 0
|
69
71
|
pixels = []
|
70
|
-
paeth, pa, pb, pc =
|
72
|
+
paeth, pa, pb, pc = 0, 0, 0, 0
|
71
73
|
until data.empty? do
|
72
74
|
row_data = data.slice! 0, scanline_length
|
73
75
|
filter = row_data.shift
|
@@ -94,17 +96,17 @@ class PDF::Reader
|
|
94
96
|
row_data[index] = (byte + ((left + upper)/2).floor) % 256
|
95
97
|
end
|
96
98
|
when 4 # Paeth
|
97
|
-
left = upper = upper_left =
|
99
|
+
left = upper = upper_left = 0
|
98
100
|
row_data.each_with_index do |byte, index|
|
99
101
|
col = index / pixel_bytes
|
100
102
|
|
101
|
-
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
103
|
+
left = index < pixel_bytes ? 0 : Integer(row_data[index - pixel_bytes])
|
102
104
|
if row.zero?
|
103
105
|
upper = upper_left = 0
|
104
106
|
else
|
105
|
-
upper = pixels[row-1][col][index % pixel_bytes]
|
107
|
+
upper = Integer(pixels[row-1][col][index % pixel_bytes])
|
106
108
|
upper_left = col.zero? ? 0 :
|
107
|
-
pixels[row-1][col-1][index % pixel_bytes]
|
109
|
+
Integer(pixels[row-1][col-1][index % pixel_bytes])
|
108
110
|
end
|
109
111
|
|
110
112
|
p = left + upper - upper_left
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
|
@@ -8,6 +9,7 @@ class PDF::Reader
|
|
8
9
|
module Filter # :nodoc:
|
9
10
|
# implementation of the Flate (zlib) stream filter
|
10
11
|
class Flate
|
12
|
+
|
11
13
|
ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
|
12
14
|
ZLIB_RAW_DEFLATE = -15 # Zlib::MAX_WBITS * -1
|
13
15
|
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
#
|
@@ -6,6 +7,7 @@ class PDF::Reader # :nodoc:
|
|
6
7
|
module Filter # :nodoc:
|
7
8
|
# implementation of the run length stream filter
|
8
9
|
class RunLength
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options
|
11
13
|
end
|
@@ -20,19 +22,23 @@ class PDF::Reader # :nodoc:
|
|
20
22
|
length = data.getbyte(pos)
|
21
23
|
pos += 1
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
25
|
+
unless length.nil?
|
26
|
+
case
|
27
|
+
# nothing
|
28
|
+
when length == 128
|
29
|
+
break
|
30
|
+
when length < 128
|
31
|
+
# When the length is < 128, we copy the following length+1 bytes
|
32
|
+
# literally.
|
33
|
+
out << data[pos, length + 1]
|
34
|
+
pos += length
|
35
|
+
else
|
36
|
+
# When the length is > 128, we copy the next byte (257 - length)
|
37
|
+
# times; i.e., "\xFA\x00" ([250, 0]) will expand to
|
38
|
+
# "\x00\x00\x00\x00\x00\x00\x00".
|
39
|
+
previous_byte = data[pos, 1] || ""
|
40
|
+
out << previous_byte * (257 - length)
|
41
|
+
end
|
36
42
|
end
|
37
43
|
|
38
44
|
pos += 1
|
data/lib/pdf/reader/filter.rb
CHANGED
data/lib/pdf/reader/font.rb
CHANGED
data/lib/pdf/reader/lzw.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
module PDF
|
@@ -35,9 +36,9 @@ module PDF
|
|
35
36
|
|
36
37
|
def read
|
37
38
|
bits_left_in_chunk = @bits_in_chunk
|
38
|
-
chunk =
|
39
|
+
chunk = -1
|
39
40
|
while bits_left_in_chunk > 0 and @current_pos < @data.size
|
40
|
-
chunk = 0 if chunk
|
41
|
+
chunk = 0 if chunk < 0
|
41
42
|
codepoint = @data[@current_pos, 1].unpack("C*")[0]
|
42
43
|
current_byte = codepoint & (2**@bits_left_in_byte - 1) #clear consumed bits
|
43
44
|
dif = bits_left_in_chunk - @bits_left_in_byte
|
@@ -83,6 +84,7 @@ module PDF
|
|
83
84
|
#
|
84
85
|
def self.decode(data)
|
85
86
|
stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
|
87
|
+
string_table = StringTable.new
|
86
88
|
result = "".dup
|
87
89
|
until (code = stream.read) == CODE_EOD
|
88
90
|
if code == CODE_CLEAR_TABLE
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
class PDF::Reader
|
@@ -336,8 +337,10 @@ class PDF::Reader
|
|
336
337
|
obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
|
337
338
|
obj
|
338
339
|
when Hash then
|
339
|
-
arr = obj.map { |key,val| [key, decrypt(ref, val)] }
|
340
|
-
|
340
|
+
arr = obj.map { |key,val| [key, decrypt(ref, val)] }
|
341
|
+
arr.each_with_object({}) { |(k,v), accum|
|
342
|
+
accum[k] = v
|
343
|
+
}
|
341
344
|
when Array then
|
342
345
|
obj.collect { |item| decrypt(ref, item) }
|
343
346
|
when String
|
@@ -1,4 +1,6 @@
|
|
1
|
+
# typed: true
|
1
2
|
# coding: utf-8
|
3
|
+
# frozen_string_literal: true
|
2
4
|
|
3
5
|
class PDF::Reader
|
4
6
|
# remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
|
@@ -38,7 +40,8 @@ class PDF::Reader
|
|
38
40
|
|
39
41
|
def self.detect_intersection(sweep_line_status, event_point)
|
40
42
|
sweep_line_status.each do |open_text_run|
|
41
|
-
if
|
43
|
+
if open_text_run.text == event_point.run.text &&
|
44
|
+
event_point.x >= open_text_run.x &&
|
42
45
|
event_point.x <= open_text_run.endx &&
|
43
46
|
open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
|
44
47
|
return true
|
@@ -51,10 +54,14 @@ class PDF::Reader
|
|
51
54
|
# Utility class used to avoid modifying the underlying TextRun objects while we're
|
52
55
|
# looking for duplicates
|
53
56
|
class EventPoint
|
54
|
-
attr_reader :x, :run
|
55
57
|
|
56
|
-
|
57
|
-
|
58
|
+
attr_reader :x
|
59
|
+
|
60
|
+
attr_reader :run
|
61
|
+
|
62
|
+
def initialize(x, run)
|
63
|
+
@x = x
|
64
|
+
@run = run
|
58
65
|
end
|
59
66
|
|
60
67
|
def start?
|
data/lib/pdf/reader/page.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
module PDF
|
@@ -68,10 +69,33 @@ module PDF
|
|
68
69
|
@attributes
|
69
70
|
end
|
70
71
|
|
72
|
+
def height
|
73
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
74
|
+
rect.apply_rotation(rotate) if rotate > 0
|
75
|
+
rect.height
|
76
|
+
end
|
77
|
+
|
78
|
+
def width
|
79
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
80
|
+
rect.apply_rotation(rotate) if rotate > 0
|
81
|
+
rect.width
|
82
|
+
end
|
83
|
+
|
84
|
+
def origin
|
85
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
86
|
+
rect.apply_rotation(rotate) if rotate > 0
|
87
|
+
|
88
|
+
rect.bottom_left
|
89
|
+
end
|
90
|
+
|
71
91
|
# Convenience method to identify the page's orientation.
|
72
92
|
#
|
73
93
|
def orientation
|
74
|
-
|
94
|
+
if height > width
|
95
|
+
"portrait"
|
96
|
+
else
|
97
|
+
"landscape"
|
98
|
+
end
|
75
99
|
end
|
76
100
|
|
77
101
|
# returns the plain text content of this page encoded as UTF-8. Any
|
@@ -139,23 +163,50 @@ module PDF
|
|
139
163
|
# returns the "boxes" that define the page object.
|
140
164
|
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
141
165
|
#
|
166
|
+
# DEPRECATED. Recommend using Page#rectangles instead
|
167
|
+
#
|
142
168
|
def boxes
|
143
|
-
|
144
|
-
|
169
|
+
# In ruby 2.4+ we could use Hash#transform_values
|
170
|
+
Hash[rectangles.map{ |k,rect| [k,rect.to_a] } ]
|
171
|
+
end
|
172
|
+
|
173
|
+
# returns the "boxes" that define the page object.
|
174
|
+
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
175
|
+
#
|
176
|
+
def rectangles
|
177
|
+
mediabox = objects.deref!(attributes[:MediaBox])
|
178
|
+
cropbox = objects.deref!(attributes[:Cropbox]) || mediabox
|
179
|
+
bleedbox = objects.deref!(attributes[:BleedBox]) || cropbox
|
180
|
+
trimbox = objects.deref!(attributes[:TrimBox]) || cropbox
|
181
|
+
artbox = objects.deref!(attributes[:ArtBox]) || cropbox
|
182
|
+
|
183
|
+
mediarect = Rectangle.new(*mediabox)
|
184
|
+
croprect = Rectangle.new(*cropbox)
|
185
|
+
bleedrect = Rectangle.new(*bleedbox)
|
186
|
+
trimrect = Rectangle.new(*trimbox)
|
187
|
+
artrect = Rectangle.new(*artbox)
|
188
|
+
|
189
|
+
if rotate > 0
|
190
|
+
mediarect.apply_rotation(rotate)
|
191
|
+
croprect.apply_rotation(rotate)
|
192
|
+
bleedrect.apply_rotation(rotate)
|
193
|
+
trimrect.apply_rotation(rotate)
|
194
|
+
artrect.apply_rotation(rotate)
|
195
|
+
end
|
145
196
|
|
146
197
|
{
|
147
|
-
MediaBox:
|
148
|
-
CropBox:
|
149
|
-
BleedBox:
|
150
|
-
TrimBox:
|
151
|
-
ArtBox:
|
198
|
+
MediaBox: mediarect,
|
199
|
+
CropBox: croprect,
|
200
|
+
BleedBox: bleedrect,
|
201
|
+
TrimBox: trimrect,
|
202
|
+
ArtBox: artrect,
|
152
203
|
}
|
153
204
|
end
|
154
205
|
|
155
206
|
private
|
156
207
|
|
157
208
|
def root
|
158
|
-
|
209
|
+
@root ||= objects.deref(@objects.trailer[:Root])
|
159
210
|
end
|
160
211
|
|
161
212
|
# Returns the resources that accompany this page. Includes
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'pdf/reader/overlapping_runs_filter'
|
@@ -16,16 +17,17 @@ class PDF::Reader
|
|
16
17
|
DEFAULT_FONT_SIZE = 12
|
17
18
|
|
18
19
|
def initialize(runs, mediabox)
|
19
|
-
|
20
|
+
# mediabox is a 4-element array for now, but it'd be nice to switch to a
|
21
|
+
# PDF::Reader::Rectangle at some point
|
22
|
+
PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
|
20
23
|
|
21
24
|
runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
|
22
25
|
runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
|
26
|
+
@mediabox = mediabox
|
23
27
|
@runs = merge_runs(runs)
|
24
28
|
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
|
25
29
|
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
|
26
30
|
@median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
|
27
|
-
@page_width = (mediabox[2] - mediabox[0]).abs
|
28
|
-
@page_height = (mediabox[3] - mediabox[1]).abs
|
29
31
|
@x_offset = @runs.map(&:x).sort.first || 0
|
30
32
|
lowest_y = @runs.map(&:y).sort.first || 0
|
31
33
|
@y_offset = lowest_y > 0 ? 0 : lowest_y
|
@@ -48,6 +50,16 @@ class PDF::Reader
|
|
48
50
|
|
49
51
|
private
|
50
52
|
|
53
|
+
def page_width
|
54
|
+
# TODO once @mediabox is a Rectangle, this can be just `@mediabox.width`
|
55
|
+
(@mediabox[2].to_f - @mediabox[0].to_f).abs
|
56
|
+
end
|
57
|
+
|
58
|
+
def page_height
|
59
|
+
# TODO once @mediabox is a Rectangle, this can be just `@mediabox.height`
|
60
|
+
(@mediabox[3].to_f - @mediabox[1].to_f).abs
|
61
|
+
end
|
62
|
+
|
51
63
|
# given an array of strings, return a new array with empty rows from the
|
52
64
|
# beginning and end removed.
|
53
65
|
#
|
@@ -66,19 +78,19 @@ class PDF::Reader
|
|
66
78
|
end
|
67
79
|
|
68
80
|
def row_count
|
69
|
-
@row_count ||= (
|
81
|
+
@row_count ||= (page_height / @mean_font_size).floor
|
70
82
|
end
|
71
83
|
|
72
84
|
def col_count
|
73
|
-
@col_count ||= ((
|
85
|
+
@col_count ||= ((page_width / @median_glyph_width) * 1.05).floor
|
74
86
|
end
|
75
87
|
|
76
88
|
def row_multiplier
|
77
|
-
@row_multiplier ||=
|
89
|
+
@row_multiplier ||= page_height.to_f / row_count.to_f
|
78
90
|
end
|
79
91
|
|
80
92
|
def col_multiplier
|
81
|
-
@col_multiplier ||=
|
93
|
+
@col_multiplier ||= page_width.to_f / col_count.to_f
|
82
94
|
end
|
83
95
|
|
84
96
|
def mean(collection)
|
@@ -108,17 +120,15 @@ class PDF::Reader
|
|
108
120
|
end
|
109
121
|
|
110
122
|
def group_chars_into_runs(chars)
|
111
|
-
|
112
|
-
while head = chars.shift
|
123
|
+
chars.each_with_object([]) do |char, runs|
|
113
124
|
if runs.empty?
|
114
|
-
runs <<
|
115
|
-
elsif runs.last.mergable?(
|
116
|
-
runs[-1] = runs.last +
|
125
|
+
runs << char
|
126
|
+
elsif runs.last.mergable?(char)
|
127
|
+
runs[-1] = runs.last + char
|
117
128
|
else
|
118
|
-
runs <<
|
129
|
+
runs << char
|
119
130
|
end
|
120
131
|
end
|
121
|
-
runs
|
122
132
|
end
|
123
133
|
|
124
134
|
def local_string_insert(haystack, needle, index)
|