pdf-reader 2.6.0 → 2.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +13 -1
  3. data/examples/rspec.rb +1 -0
  4. data/lib/pdf/reader/buffer.rb +1 -0
  5. data/lib/pdf/reader/cid_widths.rb +1 -0
  6. data/lib/pdf/reader/cmap.rb +5 -3
  7. data/lib/pdf/reader/encoding.rb +2 -1
  8. data/lib/pdf/reader/error.rb +8 -0
  9. data/lib/pdf/reader/filter/ascii85.rb +2 -0
  10. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  11. data/lib/pdf/reader/filter/depredict.rb +7 -5
  12. data/lib/pdf/reader/filter/flate.rb +2 -0
  13. data/lib/pdf/reader/filter/lzw.rb +2 -0
  14. data/lib/pdf/reader/filter/null.rb +1 -0
  15. data/lib/pdf/reader/filter/run_length.rb +19 -13
  16. data/lib/pdf/reader/filter.rb +1 -0
  17. data/lib/pdf/reader/font.rb +1 -0
  18. data/lib/pdf/reader/font_descriptor.rb +1 -0
  19. data/lib/pdf/reader/form_xobject.rb +1 -0
  20. data/lib/pdf/reader/glyph_hash.rb +1 -0
  21. data/lib/pdf/reader/lzw.rb +4 -2
  22. data/lib/pdf/reader/null_security_handler.rb +1 -0
  23. data/lib/pdf/reader/object_cache.rb +1 -0
  24. data/lib/pdf/reader/object_hash.rb +5 -2
  25. data/lib/pdf/reader/object_stream.rb +1 -0
  26. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  27. data/lib/pdf/reader/page.rb +60 -9
  28. data/lib/pdf/reader/page_layout.rb +24 -14
  29. data/lib/pdf/reader/page_state.rb +11 -10
  30. data/lib/pdf/reader/page_text_receiver.rb +13 -8
  31. data/lib/pdf/reader/pages_strategy.rb +1 -0
  32. data/lib/pdf/reader/parser.rb +4 -1
  33. data/lib/pdf/reader/point.rb +25 -0
  34. data/lib/pdf/reader/print_receiver.rb +1 -0
  35. data/lib/pdf/reader/rectangle.rb +95 -0
  36. data/lib/pdf/reader/reference.rb +1 -0
  37. data/lib/pdf/reader/register_receiver.rb +1 -0
  38. data/lib/pdf/reader/resource_methods.rb +5 -0
  39. data/lib/pdf/reader/standard_security_handler.rb +1 -0
  40. data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
  41. data/lib/pdf/reader/stream.rb +1 -0
  42. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  43. data/lib/pdf/reader/text_run.rb +1 -0
  44. data/lib/pdf/reader/token.rb +1 -0
  45. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  46. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  47. data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
  48. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  49. data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
  50. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  51. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  52. data/lib/pdf/reader/width_calculator.rb +1 -0
  53. data/lib/pdf/reader/xref.rb +1 -0
  54. data/lib/pdf/reader/zero_width_runs_filter.rb +2 -0
  55. data/lib/pdf/reader.rb +14 -4
  56. data/lib/pdf-reader.rb +1 -0
  57. data/rbi/pdf-reader.rbi +1744 -0
  58. metadata +12 -10
  59. data/lib/pdf/reader/orientation_detector.rb +0 -34
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ccc4d14f5820ca798f6eafa1c0978207759ec1668c6f6307acb7cd43bcd0626e
4
- data.tar.gz: 466bfe0a91f57463a56d9697ccd2529f981c6917e4ed578b4103f2bc87065522
3
+ metadata.gz: 5ee0d8c3c55f6a0aebb60a0a6dce92428e8371b96a6beb6d75bfe90602bffae7
4
+ data.tar.gz: '0911d108353bf577aa9fd7b49b97dda1cf9d54816bf8ff6c4225281eeda63229'
5
5
  SHA512:
6
- metadata.gz: 45d6c16b3d9ed029e6eb5a45cc64aa95e7ada2950e052053cbe0b6f5aae632f824a86f0505a5cee660abd1cd896177a0637a2f2f5a3f3633e829e8d46fb59817
7
- data.tar.gz: e3e566344bd5560387577597dea20b2f7da40aed2a7fa8b8d074c0742486db59d7e349f6c38c91c8dcd9b0a8cf2aa4c19a00d0ee097003449504b3f06f18ca3c
6
+ metadata.gz: 917db2b1fb977b41e7b057ff3d215b8f249577254d9fe3df72f330b32ff49630874c58f480495ddcd137d9f31d014083438623cdf7260b0d7a87bbe3a5f3685a
7
+ data.tar.gz: cd9832f025264e54d586e81eff69727379e8646d741f53ae61e90a5b38945d852147853891d468bab683581bdd0beb68a9b7c7f5e54e064e9a3935262ea9d651
data/CHANGELOG CHANGED
@@ -1,6 +1,18 @@
1
+ v2.7.0 (13th December 2021)
2
+ - Include RBI type files in the gem
3
+ - Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
4
+ now be type checked by sorbet
5
+ - Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
6
+ - Improved text extraction on some rotated pages, and rotated text on normal pages
7
+ - Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
8
+ - Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
9
+ - Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
10
+ - Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
11
+ - Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
12
+
1
13
  v2.6.0 (12th November 2021)
2
14
  - Text extraction improvements
3
- - Improved text layout on pages with a variery of font sizes (http://github.com/yob/pdf-reader/pull/355)
15
+ - Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
4
16
  - Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
5
17
  - Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
6
18
  - Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
data/examples/rspec.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # coding: utf-8
3
+ # typed: ignore
3
4
 
4
5
  # Basic RSpec of a generated PDF
5
6
  #
@@ -1,4 +1,5 @@
1
1
  # coding: ASCII-8BIT
2
+ # typed: false
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: false
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -32,6 +33,7 @@ class PDF::Reader
32
33
  # extracting various useful information.
33
34
  #
34
35
  class CMap # :nodoc:
36
+
35
37
  CMAP_KEYWORDS = {
36
38
  "begincodespacerange" => 1,
37
39
  "endcodespacerange" => 1,
@@ -53,7 +55,7 @@ class PDF::Reader
53
55
 
54
56
  def process_data(data)
55
57
  parser = build_parser(data)
56
- mode = nil
58
+ mode = :none
57
59
  instructions = []
58
60
 
59
61
  while token = parser.parse_token(CMAP_KEYWORDS)
@@ -62,13 +64,13 @@ class PDF::Reader
62
64
  elsif token == "endbfchar"
63
65
  process_bfchar_instructions(instructions)
64
66
  instructions = []
65
- mode = nil
67
+ mode = :none
66
68
  elsif token == "beginbfrange"
67
69
  mode = :range
68
70
  elsif token == "endbfrange"
69
71
  process_bfrange_instructions(instructions)
70
72
  instructions = []
71
- mode = nil
73
+ mode = :none
72
74
  elsif mode == :char || mode == :range
73
75
  instructions << token
74
76
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -68,7 +69,7 @@ class PDF::Reader
68
69
  #
69
70
  # [25, :A, :B]
70
71
  def differences=(diff)
71
- raise ArgumentError, "diff must be an array" unless diff.kind_of?(Array)
72
+ PDF::Reader::Error.validate_type(diff, "diff", Array)
72
73
 
73
74
  @differences = {}
74
75
  byte = 0
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -46,6 +47,13 @@ class PDF::Reader
46
47
  raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
47
48
  end
48
49
  ################################################################################
50
+ def self.validate_type(object, name, klass)
51
+ raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
52
+ end
53
+ ################################################################################
54
+ def self.validate_not_nil(object, name)
55
+ raise ArgumentError, "#{object} must not be nil" if object.nil?
56
+ end
49
57
  end
50
58
 
51
59
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: false
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'ascii85'
@@ -7,6 +8,7 @@ class PDF::Reader
7
8
  module Filter # :nodoc:
8
9
  # implementation of the Ascii85 filter
9
10
  class Ascii85
11
+
10
12
  def initialize(options = {})
11
13
  @options = options
12
14
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -6,6 +7,7 @@ class PDF::Reader
6
7
  module Filter # :nodoc:
7
8
  # implementation of the AsciiHex stream filter
8
9
  class AsciiHex
10
+
9
11
  def initialize(options = {})
10
12
  @options = options
11
13
  end
@@ -16,9 +18,12 @@ class PDF::Reader
16
18
  def filter(data)
17
19
  data.chop! if data[-1,1] == ">"
18
20
  data = data[1,data.size] if data[0,1] == "<"
21
+
22
+ return "" if data.nil?
23
+
19
24
  data.gsub!(/[^A-Fa-f0-9]/,"")
20
25
  data << "0" if data.size % 2 == 1
21
- data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
26
+ data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
22
27
  rescue Exception => e
23
28
  # Oops, there was a problem decoding the stream
24
29
  raise MalformedPDFError,
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -6,6 +7,7 @@ class PDF::Reader
6
7
  # some filter implementations support preprocessing of the data to
7
8
  # improve compression
8
9
  class Depredict
10
+
9
11
  def initialize(options = {})
10
12
  @options = options || {}
11
13
  end
@@ -67,7 +69,7 @@ class PDF::Reader
67
69
  scanline_length = (pixel_bytes * @options[:Columns]) + 1
68
70
  row = 0
69
71
  pixels = []
70
- paeth, pa, pb, pc = nil
72
+ paeth, pa, pb, pc = 0, 0, 0, 0
71
73
  until data.empty? do
72
74
  row_data = data.slice! 0, scanline_length
73
75
  filter = row_data.shift
@@ -94,17 +96,17 @@ class PDF::Reader
94
96
  row_data[index] = (byte + ((left + upper)/2).floor) % 256
95
97
  end
96
98
  when 4 # Paeth
97
- left = upper = upper_left = nil
99
+ left = upper = upper_left = 0
98
100
  row_data.each_with_index do |byte, index|
99
101
  col = index / pixel_bytes
100
102
 
101
- left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
103
+ left = index < pixel_bytes ? 0 : Integer(row_data[index - pixel_bytes])
102
104
  if row.zero?
103
105
  upper = upper_left = 0
104
106
  else
105
- upper = pixels[row-1][col][index % pixel_bytes]
107
+ upper = Integer(pixels[row-1][col][index % pixel_bytes])
106
108
  upper_left = col.zero? ? 0 :
107
- pixels[row-1][col-1][index % pixel_bytes]
109
+ Integer(pixels[row-1][col-1][index % pixel_bytes])
108
110
  end
109
111
 
110
112
  p = left + upper - upper_left
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
 
@@ -8,6 +9,7 @@ class PDF::Reader
8
9
  module Filter # :nodoc:
9
10
  # implementation of the Flate (zlib) stream filter
10
11
  class Flate
12
+
11
13
  ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
12
14
  ZLIB_RAW_DEFLATE = -15 # Zlib::MAX_WBITS * -1
13
15
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -6,6 +7,7 @@ class PDF::Reader
6
7
  module Filter # :nodoc:
7
8
  # implementation of the LZW stream filter
8
9
  class Lzw
10
+
9
11
  def initialize(options = {})
10
12
  @options = options
11
13
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -6,6 +7,7 @@ class PDF::Reader # :nodoc:
6
7
  module Filter # :nodoc:
7
8
  # implementation of the run length stream filter
8
9
  class RunLength
10
+
9
11
  def initialize(options = {})
10
12
  @options = options
11
13
  end
@@ -20,19 +22,23 @@ class PDF::Reader # :nodoc:
20
22
  length = data.getbyte(pos)
21
23
  pos += 1
22
24
 
23
- case
24
- when length == 128
25
- break
26
- when length < 128
27
- # When the length is < 128, we copy the following length+1 bytes
28
- # literally.
29
- out << data[pos, length + 1]
30
- pos += length
31
- else
32
- # When the length is > 128, we copy the next byte (257 - length)
33
- # times; i.e., "\xFA\x00" ([250, 0]) will expand to
34
- # "\x00\x00\x00\x00\x00\x00\x00".
35
- out << data[pos, 1] * (257 - length)
25
+ unless length.nil?
26
+ case
27
+ # nothing
28
+ when length == 128
29
+ break
30
+ when length < 128
31
+ # When the length is < 128, we copy the following length+1 bytes
32
+ # literally.
33
+ out << data[pos, length + 1]
34
+ pos += length
35
+ else
36
+ # When the length is > 128, we copy the next byte (257 - length)
37
+ # times; i.e., "\xFA\x00" ([250, 0]) will expand to
38
+ # "\x00\x00\x00\x00\x00\x00\x00".
39
+ previous_byte = data[pos, 1] || ""
40
+ out << previous_byte * (257 - length)
41
+ end
36
42
  end
37
43
 
38
44
  pos += 1
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'ttfunk'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'digest/md5'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  module PDF
@@ -35,9 +36,9 @@ module PDF
35
36
 
36
37
  def read
37
38
  bits_left_in_chunk = @bits_in_chunk
38
- chunk = nil
39
+ chunk = -1
39
40
  while bits_left_in_chunk > 0 and @current_pos < @data.size
40
- chunk = 0 if chunk.nil?
41
+ chunk = 0 if chunk < 0
41
42
  codepoint = @data[@current_pos, 1].unpack("C*")[0]
42
43
  current_byte = codepoint & (2**@bits_left_in_byte - 1) #clear consumed bits
43
44
  dif = bits_left_in_chunk - @bits_left_in_byte
@@ -83,6 +84,7 @@ module PDF
83
84
  #
84
85
  def self.decode(data)
85
86
  stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
87
+ string_table = StringTable.new
86
88
  result = "".dup
87
89
  until (code = stream.read) == CODE_EOD
88
90
  if code == CODE_CLEAR_TABLE
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'hashery/lru_hash'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -336,8 +337,10 @@ class PDF::Reader
336
337
  obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
337
338
  obj
338
339
  when Hash then
339
- arr = obj.map { |key,val| [key, decrypt(ref, val)] }.flatten(1)
340
- Hash[*arr]
340
+ arr = obj.map { |key,val| [key, decrypt(ref, val)] }
341
+ arr.each_with_object({}) { |(k,v), accum|
342
+ accum[k] = v
343
+ }
341
344
  when Array then
342
345
  obj.collect { |item| decrypt(ref, item) }
343
346
  when String
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,6 @@
1
+ # typed: true
1
2
  # coding: utf-8
3
+ # frozen_string_literal: true
2
4
 
3
5
  class PDF::Reader
4
6
  # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
@@ -38,7 +40,8 @@ class PDF::Reader
38
40
 
39
41
  def self.detect_intersection(sweep_line_status, event_point)
40
42
  sweep_line_status.each do |open_text_run|
41
- if event_point.x >= open_text_run.x &&
43
+ if open_text_run.text == event_point.run.text &&
44
+ event_point.x >= open_text_run.x &&
42
45
  event_point.x <= open_text_run.endx &&
43
46
  open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
44
47
  return true
@@ -51,10 +54,14 @@ class PDF::Reader
51
54
  # Utility class used to avoid modifying the underlying TextRun objects while we're
52
55
  # looking for duplicates
53
56
  class EventPoint
54
- attr_reader :x, :run
55
57
 
56
- def initialize x, run
57
- @x, @run = x, run
58
+ attr_reader :x
59
+
60
+ attr_reader :run
61
+
62
+ def initialize(x, run)
63
+ @x = x
64
+ @run = run
58
65
  end
59
66
 
60
67
  def start?
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  module PDF
@@ -68,10 +69,33 @@ module PDF
68
69
  @attributes
69
70
  end
70
71
 
72
+ def height
73
+ rect = Rectangle.new(*attributes[:MediaBox])
74
+ rect.apply_rotation(rotate) if rotate > 0
75
+ rect.height
76
+ end
77
+
78
+ def width
79
+ rect = Rectangle.new(*attributes[:MediaBox])
80
+ rect.apply_rotation(rotate) if rotate > 0
81
+ rect.width
82
+ end
83
+
84
+ def origin
85
+ rect = Rectangle.new(*attributes[:MediaBox])
86
+ rect.apply_rotation(rotate) if rotate > 0
87
+
88
+ rect.bottom_left
89
+ end
90
+
71
91
  # Convenience method to identify the page's orientation.
72
92
  #
73
93
  def orientation
74
- OrientationDetector.new(attributes).orientation
94
+ if height > width
95
+ "portrait"
96
+ else
97
+ "landscape"
98
+ end
75
99
  end
76
100
 
77
101
  # returns the plain text content of this page encoded as UTF-8. Any
@@ -139,23 +163,50 @@ module PDF
139
163
  # returns the "boxes" that define the page object.
140
164
  # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
141
165
  #
166
+ # DEPRECATED. Recommend using Page#rectangles instead
167
+ #
142
168
  def boxes
143
- mediabox = attributes[:MediaBox]
144
- cropbox = attributes[:Cropbox] || mediabox
169
+ # In ruby 2.4+ we could use Hash#transform_values
170
+ Hash[rectangles.map{ |k,rect| [k,rect.to_a] } ]
171
+ end
172
+
173
+ # returns the "boxes" that define the page object.
174
+ # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
175
+ #
176
+ def rectangles
177
+ mediabox = objects.deref!(attributes[:MediaBox])
178
+ cropbox = objects.deref!(attributes[:Cropbox]) || mediabox
179
+ bleedbox = objects.deref!(attributes[:BleedBox]) || cropbox
180
+ trimbox = objects.deref!(attributes[:TrimBox]) || cropbox
181
+ artbox = objects.deref!(attributes[:ArtBox]) || cropbox
182
+
183
+ mediarect = Rectangle.new(*mediabox)
184
+ croprect = Rectangle.new(*cropbox)
185
+ bleedrect = Rectangle.new(*bleedbox)
186
+ trimrect = Rectangle.new(*trimbox)
187
+ artrect = Rectangle.new(*artbox)
188
+
189
+ if rotate > 0
190
+ mediarect.apply_rotation(rotate)
191
+ croprect.apply_rotation(rotate)
192
+ bleedrect.apply_rotation(rotate)
193
+ trimrect.apply_rotation(rotate)
194
+ artrect.apply_rotation(rotate)
195
+ end
145
196
 
146
197
  {
147
- MediaBox: objects.deref!(mediabox),
148
- CropBox: objects.deref!(cropbox),
149
- BleedBox: objects.deref!(attributes[:BleedBox] || cropbox),
150
- TrimBox: objects.deref!(attributes[:TrimBox] || cropbox),
151
- ArtBox: objects.deref!(attributes[:ArtBox] || cropbox)
198
+ MediaBox: mediarect,
199
+ CropBox: croprect,
200
+ BleedBox: bleedrect,
201
+ TrimBox: trimrect,
202
+ ArtBox: artrect,
152
203
  }
153
204
  end
154
205
 
155
206
  private
156
207
 
157
208
  def root
158
- root ||= objects.deref(@objects.trailer[:Root])
209
+ @root ||= objects.deref(@objects.trailer[:Root])
159
210
  end
160
211
 
161
212
  # Returns the resources that accompany this page. Includes
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'pdf/reader/overlapping_runs_filter'
@@ -16,16 +17,17 @@ class PDF::Reader
16
17
  DEFAULT_FONT_SIZE = 12
17
18
 
18
19
  def initialize(runs, mediabox)
19
- raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
20
+ # mediabox is a 4-element array for now, but it'd be nice to switch to a
21
+ # PDF::Reader::Rectangle at some point
22
+ PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
20
23
 
21
24
  runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
22
25
  runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
26
+ @mediabox = mediabox
23
27
  @runs = merge_runs(runs)
24
28
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
25
29
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
26
30
  @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
27
- @page_width = (mediabox[2] - mediabox[0]).abs
28
- @page_height = (mediabox[3] - mediabox[1]).abs
29
31
  @x_offset = @runs.map(&:x).sort.first || 0
30
32
  lowest_y = @runs.map(&:y).sort.first || 0
31
33
  @y_offset = lowest_y > 0 ? 0 : lowest_y
@@ -48,6 +50,16 @@ class PDF::Reader
48
50
 
49
51
  private
50
52
 
53
+ def page_width
54
+ # TODO once @mediabox is a Rectangle, this can be just `@mediabox.width`
55
+ (@mediabox[2].to_f - @mediabox[0].to_f).abs
56
+ end
57
+
58
+ def page_height
59
+ # TODO once @mediabox is a Rectangle, this can be just `@mediabox.height`
60
+ (@mediabox[3].to_f - @mediabox[1].to_f).abs
61
+ end
62
+
51
63
  # given an array of strings, return a new array with empty rows from the
52
64
  # beginning and end removed.
53
65
  #
@@ -66,19 +78,19 @@ class PDF::Reader
66
78
  end
67
79
 
68
80
  def row_count
69
- @row_count ||= (@page_height / @mean_font_size).floor
81
+ @row_count ||= (page_height / @mean_font_size).floor
70
82
  end
71
83
 
72
84
  def col_count
73
- @col_count ||= ((@page_width / @median_glyph_width) * 1.05).floor
85
+ @col_count ||= ((page_width / @median_glyph_width) * 1.05).floor
74
86
  end
75
87
 
76
88
  def row_multiplier
77
- @row_multiplier ||= @page_height.to_f / row_count.to_f
89
+ @row_multiplier ||= page_height.to_f / row_count.to_f
78
90
  end
79
91
 
80
92
  def col_multiplier
81
- @col_multiplier ||= @page_width.to_f / col_count.to_f
93
+ @col_multiplier ||= page_width.to_f / col_count.to_f
82
94
  end
83
95
 
84
96
  def mean(collection)
@@ -108,17 +120,15 @@ class PDF::Reader
108
120
  end
109
121
 
110
122
  def group_chars_into_runs(chars)
111
- runs = []
112
- while head = chars.shift
123
+ chars.each_with_object([]) do |char, runs|
113
124
  if runs.empty?
114
- runs << head
115
- elsif runs.last.mergable?(head)
116
- runs[-1] = runs.last + head
125
+ runs << char
126
+ elsif runs.last.mergable?(char)
127
+ runs[-1] = runs.last + char
117
128
  else
118
- runs << head
129
+ runs << char
119
130
  end
120
131
  end
121
- runs
122
132
  end
123
133
 
124
134
  def local_string_insert(haystack, needle, index)