pdf-reader 2.6.0 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +13 -1
  3. data/examples/rspec.rb +1 -0
  4. data/lib/pdf/reader/buffer.rb +1 -0
  5. data/lib/pdf/reader/cid_widths.rb +1 -0
  6. data/lib/pdf/reader/cmap.rb +5 -3
  7. data/lib/pdf/reader/encoding.rb +2 -1
  8. data/lib/pdf/reader/error.rb +8 -0
  9. data/lib/pdf/reader/filter/ascii85.rb +2 -0
  10. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  11. data/lib/pdf/reader/filter/depredict.rb +7 -5
  12. data/lib/pdf/reader/filter/flate.rb +2 -0
  13. data/lib/pdf/reader/filter/lzw.rb +2 -0
  14. data/lib/pdf/reader/filter/null.rb +1 -0
  15. data/lib/pdf/reader/filter/run_length.rb +19 -13
  16. data/lib/pdf/reader/filter.rb +1 -0
  17. data/lib/pdf/reader/font.rb +1 -0
  18. data/lib/pdf/reader/font_descriptor.rb +1 -0
  19. data/lib/pdf/reader/form_xobject.rb +1 -0
  20. data/lib/pdf/reader/glyph_hash.rb +1 -0
  21. data/lib/pdf/reader/lzw.rb +4 -2
  22. data/lib/pdf/reader/null_security_handler.rb +1 -0
  23. data/lib/pdf/reader/object_cache.rb +1 -0
  24. data/lib/pdf/reader/object_hash.rb +5 -2
  25. data/lib/pdf/reader/object_stream.rb +1 -0
  26. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  27. data/lib/pdf/reader/page.rb +60 -9
  28. data/lib/pdf/reader/page_layout.rb +24 -14
  29. data/lib/pdf/reader/page_state.rb +11 -10
  30. data/lib/pdf/reader/page_text_receiver.rb +13 -8
  31. data/lib/pdf/reader/pages_strategy.rb +1 -0
  32. data/lib/pdf/reader/parser.rb +4 -1
  33. data/lib/pdf/reader/point.rb +25 -0
  34. data/lib/pdf/reader/print_receiver.rb +1 -0
  35. data/lib/pdf/reader/rectangle.rb +95 -0
  36. data/lib/pdf/reader/reference.rb +1 -0
  37. data/lib/pdf/reader/register_receiver.rb +1 -0
  38. data/lib/pdf/reader/resource_methods.rb +5 -0
  39. data/lib/pdf/reader/standard_security_handler.rb +1 -0
  40. data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
  41. data/lib/pdf/reader/stream.rb +1 -0
  42. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  43. data/lib/pdf/reader/text_run.rb +1 -0
  44. data/lib/pdf/reader/token.rb +1 -0
  45. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  46. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  47. data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
  48. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  49. data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
  50. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  51. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  52. data/lib/pdf/reader/width_calculator.rb +1 -0
  53. data/lib/pdf/reader/xref.rb +1 -0
  54. data/lib/pdf/reader/zero_width_runs_filter.rb +2 -0
  55. data/lib/pdf/reader.rb +14 -4
  56. data/lib/pdf-reader.rb +1 -0
  57. data/rbi/pdf-reader.rbi +1744 -0
  58. metadata +12 -10
  59. data/lib/pdf/reader/orientation_detector.rb +0 -34
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ccc4d14f5820ca798f6eafa1c0978207759ec1668c6f6307acb7cd43bcd0626e
4
- data.tar.gz: 466bfe0a91f57463a56d9697ccd2529f981c6917e4ed578b4103f2bc87065522
3
+ metadata.gz: 5ee0d8c3c55f6a0aebb60a0a6dce92428e8371b96a6beb6d75bfe90602bffae7
4
+ data.tar.gz: '0911d108353bf577aa9fd7b49b97dda1cf9d54816bf8ff6c4225281eeda63229'
5
5
  SHA512:
6
- metadata.gz: 45d6c16b3d9ed029e6eb5a45cc64aa95e7ada2950e052053cbe0b6f5aae632f824a86f0505a5cee660abd1cd896177a0637a2f2f5a3f3633e829e8d46fb59817
7
- data.tar.gz: e3e566344bd5560387577597dea20b2f7da40aed2a7fa8b8d074c0742486db59d7e349f6c38c91c8dcd9b0a8cf2aa4c19a00d0ee097003449504b3f06f18ca3c
6
+ metadata.gz: 917db2b1fb977b41e7b057ff3d215b8f249577254d9fe3df72f330b32ff49630874c58f480495ddcd137d9f31d014083438623cdf7260b0d7a87bbe3a5f3685a
7
+ data.tar.gz: cd9832f025264e54d586e81eff69727379e8646d741f53ae61e90a5b38945d852147853891d468bab683581bdd0beb68a9b7c7f5e54e064e9a3935262ea9d651
data/CHANGELOG CHANGED
@@ -1,6 +1,18 @@
1
+ v2.7.0 (13th December 2021)
2
+ - Include RBI type files in the gem
3
+ - Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
4
+ now be typed checked by sorbet
5
+ - Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
6
+ - Improved text extraction on some rotated pages, and rotated text on normal pages
7
+ - Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
8
+ - Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
9
+ - Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
10
+ - Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
11
+ - Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
12
+
1
13
  v2.6.0 (12th November 2021)
2
14
  - Text extraction improvements
3
- - Improved text layout on pages with a variery of font sizes (http://github.com/yob/pdf-reader/pull/355)
15
+ - Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
4
16
  - Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
5
17
  - Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
6
18
  - Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
data/examples/rspec.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # coding: utf-8
3
+ # typed: ignore
3
4
 
4
5
  # Basic RSpec of a generated PDF
5
6
  #
@@ -1,4 +1,5 @@
1
1
  # coding: ASCII-8BIT
2
+ # typed: false
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: false
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -32,6 +33,7 @@ class PDF::Reader
32
33
  # extracting various useful information.
33
34
  #
34
35
  class CMap # :nodoc:
36
+
35
37
  CMAP_KEYWORDS = {
36
38
  "begincodespacerange" => 1,
37
39
  "endcodespacerange" => 1,
@@ -53,7 +55,7 @@ class PDF::Reader
53
55
 
54
56
  def process_data(data)
55
57
  parser = build_parser(data)
56
- mode = nil
58
+ mode = :none
57
59
  instructions = []
58
60
 
59
61
  while token = parser.parse_token(CMAP_KEYWORDS)
@@ -62,13 +64,13 @@ class PDF::Reader
62
64
  elsif token == "endbfchar"
63
65
  process_bfchar_instructions(instructions)
64
66
  instructions = []
65
- mode = nil
67
+ mode = :none
66
68
  elsif token == "beginbfrange"
67
69
  mode = :range
68
70
  elsif token == "endbfrange"
69
71
  process_bfrange_instructions(instructions)
70
72
  instructions = []
71
- mode = nil
73
+ mode = :none
72
74
  elsif mode == :char || mode == :range
73
75
  instructions << token
74
76
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -68,7 +69,7 @@ class PDF::Reader
68
69
  #
69
70
  # [25, :A, :B]
70
71
  def differences=(diff)
71
- raise ArgumentError, "diff must be an array" unless diff.kind_of?(Array)
72
+ PDF::Reader::Error.validate_type(diff, "diff", Array)
72
73
 
73
74
  @differences = {}
74
75
  byte = 0
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -46,6 +47,13 @@ class PDF::Reader
46
47
  raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
47
48
  end
48
49
  ################################################################################
50
+ def self.validate_type(object, name, klass)
51
+ raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
52
+ end
53
+ ################################################################################
54
+ def self.validate_not_nil(object, name)
55
+ raise ArgumentError, "#{object} must not be nil" if object.nil?
56
+ end
49
57
  end
50
58
 
51
59
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: false
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'ascii85'
@@ -7,6 +8,7 @@ class PDF::Reader
7
8
  module Filter # :nodoc:
8
9
  # implementation of the Ascii85 filter
9
10
  class Ascii85
11
+
10
12
  def initialize(options = {})
11
13
  @options = options
12
14
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -6,6 +7,7 @@ class PDF::Reader
6
7
  module Filter # :nodoc:
7
8
  # implementation of the AsciiHex stream filter
8
9
  class AsciiHex
10
+
9
11
  def initialize(options = {})
10
12
  @options = options
11
13
  end
@@ -16,9 +18,12 @@ class PDF::Reader
16
18
  def filter(data)
17
19
  data.chop! if data[-1,1] == ">"
18
20
  data = data[1,data.size] if data[0,1] == "<"
21
+
22
+ return "" if data.nil?
23
+
19
24
  data.gsub!(/[^A-Fa-f0-9]/,"")
20
25
  data << "0" if data.size % 2 == 1
21
- data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
26
+ data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
22
27
  rescue Exception => e
23
28
  # Oops, there was a problem decoding the stream
24
29
  raise MalformedPDFError,
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -6,6 +7,7 @@ class PDF::Reader
6
7
  # some filter implementations support preprocessing of the data to
7
8
  # improve compression
8
9
  class Depredict
10
+
9
11
  def initialize(options = {})
10
12
  @options = options || {}
11
13
  end
@@ -67,7 +69,7 @@ class PDF::Reader
67
69
  scanline_length = (pixel_bytes * @options[:Columns]) + 1
68
70
  row = 0
69
71
  pixels = []
70
- paeth, pa, pb, pc = nil
72
+ paeth, pa, pb, pc = 0, 0, 0, 0
71
73
  until data.empty? do
72
74
  row_data = data.slice! 0, scanline_length
73
75
  filter = row_data.shift
@@ -94,17 +96,17 @@ class PDF::Reader
94
96
  row_data[index] = (byte + ((left + upper)/2).floor) % 256
95
97
  end
96
98
  when 4 # Paeth
97
- left = upper = upper_left = nil
99
+ left = upper = upper_left = 0
98
100
  row_data.each_with_index do |byte, index|
99
101
  col = index / pixel_bytes
100
102
 
101
- left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
103
+ left = index < pixel_bytes ? 0 : Integer(row_data[index - pixel_bytes])
102
104
  if row.zero?
103
105
  upper = upper_left = 0
104
106
  else
105
- upper = pixels[row-1][col][index % pixel_bytes]
107
+ upper = Integer(pixels[row-1][col][index % pixel_bytes])
106
108
  upper_left = col.zero? ? 0 :
107
- pixels[row-1][col-1][index % pixel_bytes]
109
+ Integer(pixels[row-1][col-1][index % pixel_bytes])
108
110
  end
109
111
 
110
112
  p = left + upper - upper_left
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
 
@@ -8,6 +9,7 @@ class PDF::Reader
8
9
  module Filter # :nodoc:
9
10
  # implementation of the Flate (zlib) stream filter
10
11
  class Flate
12
+
11
13
  ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
12
14
  ZLIB_RAW_DEFLATE = -15 # Zlib::MAX_WBITS * -1
13
15
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -6,6 +7,7 @@ class PDF::Reader
6
7
  module Filter # :nodoc:
7
8
  # implementation of the LZW stream filter
8
9
  class Lzw
10
+
9
11
  def initialize(options = {})
10
12
  @options = options
11
13
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -6,6 +7,7 @@ class PDF::Reader # :nodoc:
6
7
  module Filter # :nodoc:
7
8
  # implementation of the run length stream filter
8
9
  class RunLength
10
+
9
11
  def initialize(options = {})
10
12
  @options = options
11
13
  end
@@ -20,19 +22,23 @@ class PDF::Reader # :nodoc:
20
22
  length = data.getbyte(pos)
21
23
  pos += 1
22
24
 
23
- case
24
- when length == 128
25
- break
26
- when length < 128
27
- # When the length is < 128, we copy the following length+1 bytes
28
- # literally.
29
- out << data[pos, length + 1]
30
- pos += length
31
- else
32
- # When the length is > 128, we copy the next byte (257 - length)
33
- # times; i.e., "\xFA\x00" ([250, 0]) will expand to
34
- # "\x00\x00\x00\x00\x00\x00\x00".
35
- out << data[pos, 1] * (257 - length)
25
+ unless length.nil?
26
+ case
27
+ # nothing
28
+ when length == 128
29
+ break
30
+ when length < 128
31
+ # When the length is < 128, we copy the following length+1 bytes
32
+ # literally.
33
+ out << data[pos, length + 1]
34
+ pos += length
35
+ else
36
+ # When the length is > 128, we copy the next byte (257 - length)
37
+ # times; i.e., "\xFA\x00" ([250, 0]) will expand to
38
+ # "\x00\x00\x00\x00\x00\x00\x00".
39
+ previous_byte = data[pos, 1] || ""
40
+ out << previous_byte * (257 - length)
41
+ end
36
42
  end
37
43
 
38
44
  pos += 1
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'ttfunk'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'digest/md5'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  module PDF
@@ -35,9 +36,9 @@ module PDF
35
36
 
36
37
  def read
37
38
  bits_left_in_chunk = @bits_in_chunk
38
- chunk = nil
39
+ chunk = -1
39
40
  while bits_left_in_chunk > 0 and @current_pos < @data.size
40
- chunk = 0 if chunk.nil?
41
+ chunk = 0 if chunk < 0
41
42
  codepoint = @data[@current_pos, 1].unpack("C*")[0]
42
43
  current_byte = codepoint & (2**@bits_left_in_byte - 1) #clear consumed bits
43
44
  dif = bits_left_in_chunk - @bits_left_in_byte
@@ -83,6 +84,7 @@ module PDF
83
84
  #
84
85
  def self.decode(data)
85
86
  stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
87
+ string_table = StringTable.new
86
88
  result = "".dup
87
89
  until (code = stream.read) == CODE_EOD
88
90
  if code == CODE_CLEAR_TABLE
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'hashery/lru_hash'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -336,8 +337,10 @@ class PDF::Reader
336
337
  obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
337
338
  obj
338
339
  when Hash then
339
- arr = obj.map { |key,val| [key, decrypt(ref, val)] }.flatten(1)
340
- Hash[*arr]
340
+ arr = obj.map { |key,val| [key, decrypt(ref, val)] }
341
+ arr.each_with_object({}) { |(k,v), accum|
342
+ accum[k] = v
343
+ }
341
344
  when Array then
342
345
  obj.collect { |item| decrypt(ref, item) }
343
346
  when String
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,6 @@
1
+ # typed: true
1
2
  # coding: utf-8
3
+ # frozen_string_literal: true
2
4
 
3
5
  class PDF::Reader
4
6
  # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
@@ -38,7 +40,8 @@ class PDF::Reader
38
40
 
39
41
  def self.detect_intersection(sweep_line_status, event_point)
40
42
  sweep_line_status.each do |open_text_run|
41
- if event_point.x >= open_text_run.x &&
43
+ if open_text_run.text == event_point.run.text &&
44
+ event_point.x >= open_text_run.x &&
42
45
  event_point.x <= open_text_run.endx &&
43
46
  open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
44
47
  return true
@@ -51,10 +54,14 @@ class PDF::Reader
51
54
  # Utility class used to avoid modifying the underlying TextRun objects while we're
52
55
  # looking for duplicates
53
56
  class EventPoint
54
- attr_reader :x, :run
55
57
 
56
- def initialize x, run
57
- @x, @run = x, run
58
+ attr_reader :x
59
+
60
+ attr_reader :run
61
+
62
+ def initialize(x, run)
63
+ @x = x
64
+ @run = run
58
65
  end
59
66
 
60
67
  def start?
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  module PDF
@@ -68,10 +69,33 @@ module PDF
68
69
  @attributes
69
70
  end
70
71
 
72
+ def height
73
+ rect = Rectangle.new(*attributes[:MediaBox])
74
+ rect.apply_rotation(rotate) if rotate > 0
75
+ rect.height
76
+ end
77
+
78
+ def width
79
+ rect = Rectangle.new(*attributes[:MediaBox])
80
+ rect.apply_rotation(rotate) if rotate > 0
81
+ rect.width
82
+ end
83
+
84
+ def origin
85
+ rect = Rectangle.new(*attributes[:MediaBox])
86
+ rect.apply_rotation(rotate) if rotate > 0
87
+
88
+ rect.bottom_left
89
+ end
90
+
71
91
  # Convenience method to identify the page's orientation.
72
92
  #
73
93
  def orientation
74
- OrientationDetector.new(attributes).orientation
94
+ if height > width
95
+ "portrait"
96
+ else
97
+ "landscape"
98
+ end
75
99
  end
76
100
 
77
101
  # returns the plain text content of this page encoded as UTF-8. Any
@@ -139,23 +163,50 @@ module PDF
139
163
  # returns the "boxes" that define the page object.
140
164
  # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
141
165
  #
166
+ # DEPRECATED. Recommend using Page#rectangles instead
167
+ #
142
168
  def boxes
143
- mediabox = attributes[:MediaBox]
144
- cropbox = attributes[:Cropbox] || mediabox
169
+ # In ruby 2.4+ we could use Hash#transform_values
170
+ Hash[rectangles.map{ |k,rect| [k,rect.to_a] } ]
171
+ end
172
+
173
+ # returns the "boxes" that define the page object.
174
+ # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
175
+ #
176
+ def rectangles
177
+ mediabox = objects.deref!(attributes[:MediaBox])
178
+ cropbox = objects.deref!(attributes[:Cropbox]) || mediabox
179
+ bleedbox = objects.deref!(attributes[:BleedBox]) || cropbox
180
+ trimbox = objects.deref!(attributes[:TrimBox]) || cropbox
181
+ artbox = objects.deref!(attributes[:ArtBox]) || cropbox
182
+
183
+ mediarect = Rectangle.new(*mediabox)
184
+ croprect = Rectangle.new(*cropbox)
185
+ bleedrect = Rectangle.new(*bleedbox)
186
+ trimrect = Rectangle.new(*trimbox)
187
+ artrect = Rectangle.new(*artbox)
188
+
189
+ if rotate > 0
190
+ mediarect.apply_rotation(rotate)
191
+ croprect.apply_rotation(rotate)
192
+ bleedrect.apply_rotation(rotate)
193
+ trimrect.apply_rotation(rotate)
194
+ artrect.apply_rotation(rotate)
195
+ end
145
196
 
146
197
  {
147
- MediaBox: objects.deref!(mediabox),
148
- CropBox: objects.deref!(cropbox),
149
- BleedBox: objects.deref!(attributes[:BleedBox] || cropbox),
150
- TrimBox: objects.deref!(attributes[:TrimBox] || cropbox),
151
- ArtBox: objects.deref!(attributes[:ArtBox] || cropbox)
198
+ MediaBox: mediarect,
199
+ CropBox: croprect,
200
+ BleedBox: bleedrect,
201
+ TrimBox: trimrect,
202
+ ArtBox: artrect,
152
203
  }
153
204
  end
154
205
 
155
206
  private
156
207
 
157
208
  def root
158
- root ||= objects.deref(@objects.trailer[:Root])
209
+ @root ||= objects.deref(@objects.trailer[:Root])
159
210
  end
160
211
 
161
212
  # Returns the resources that accompany this page. Includes
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'pdf/reader/overlapping_runs_filter'
@@ -16,16 +17,17 @@ class PDF::Reader
16
17
  DEFAULT_FONT_SIZE = 12
17
18
 
18
19
  def initialize(runs, mediabox)
19
- raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
20
+ # mediabox is a 4-element array for now, but it'd be nice to switch to a
21
+ # PDF::Reader::Rectangle at some point
22
+ PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
20
23
 
21
24
  runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
22
25
  runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
26
+ @mediabox = mediabox
23
27
  @runs = merge_runs(runs)
24
28
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
25
29
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
26
30
  @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
27
- @page_width = (mediabox[2] - mediabox[0]).abs
28
- @page_height = (mediabox[3] - mediabox[1]).abs
29
31
  @x_offset = @runs.map(&:x).sort.first || 0
30
32
  lowest_y = @runs.map(&:y).sort.first || 0
31
33
  @y_offset = lowest_y > 0 ? 0 : lowest_y
@@ -48,6 +50,16 @@ class PDF::Reader
48
50
 
49
51
  private
50
52
 
53
+ def page_width
54
+ # TODO once @mediabox is a Rectangle, this can be just `@mediabox.width`
55
+ (@mediabox[2].to_f - @mediabox[0].to_f).abs
56
+ end
57
+
58
+ def page_height
59
+ # TODO once @mediabox is a Rectangle, this can be just `@mediabox.height`
60
+ (@mediabox[3].to_f - @mediabox[1].to_f).abs
61
+ end
62
+
51
63
  # given an array of strings, return a new array with empty rows from the
52
64
  # beginning and end removed.
53
65
  #
@@ -66,19 +78,19 @@ class PDF::Reader
66
78
  end
67
79
 
68
80
  def row_count
69
- @row_count ||= (@page_height / @mean_font_size).floor
81
+ @row_count ||= (page_height / @mean_font_size).floor
70
82
  end
71
83
 
72
84
  def col_count
73
- @col_count ||= ((@page_width / @median_glyph_width) * 1.05).floor
85
+ @col_count ||= ((page_width / @median_glyph_width) * 1.05).floor
74
86
  end
75
87
 
76
88
  def row_multiplier
77
- @row_multiplier ||= @page_height.to_f / row_count.to_f
89
+ @row_multiplier ||= page_height.to_f / row_count.to_f
78
90
  end
79
91
 
80
92
  def col_multiplier
81
- @col_multiplier ||= @page_width.to_f / col_count.to_f
93
+ @col_multiplier ||= page_width.to_f / col_count.to_f
82
94
  end
83
95
 
84
96
  def mean(collection)
@@ -108,17 +120,15 @@ class PDF::Reader
108
120
  end
109
121
 
110
122
  def group_chars_into_runs(chars)
111
- runs = []
112
- while head = chars.shift
123
+ chars.each_with_object([]) do |char, runs|
113
124
  if runs.empty?
114
- runs << head
115
- elsif runs.last.mergable?(head)
116
- runs[-1] = runs.last + head
125
+ runs << char
126
+ elsif runs.last.mergable?(char)
127
+ runs[-1] = runs.last + char
117
128
  else
118
- runs << head
129
+ runs << char
119
130
  end
120
131
  end
121
- runs
122
132
  end
123
133
 
124
134
  def local_string_insert(haystack, needle, index)