pdf-reader 2.4.1 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +40 -0
  3. data/README.md +16 -1
  4. data/Rakefile +1 -1
  5. data/examples/extract_fonts.rb +12 -7
  6. data/examples/rspec.rb +1 -0
  7. data/lib/pdf/reader/buffer.rb +63 -21
  8. data/lib/pdf/reader/cid_widths.rb +1 -0
  9. data/lib/pdf/reader/cmap.rb +5 -3
  10. data/lib/pdf/reader/encoding.rb +3 -2
  11. data/lib/pdf/reader/error.rb +11 -3
  12. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  13. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  14. data/lib/pdf/reader/filter/depredict.rb +10 -8
  15. data/lib/pdf/reader/filter/flate.rb +27 -14
  16. data/lib/pdf/reader/filter/lzw.rb +2 -0
  17. data/lib/pdf/reader/filter/null.rb +1 -0
  18. data/lib/pdf/reader/filter/run_length.rb +19 -13
  19. data/lib/pdf/reader/filter.rb +1 -0
  20. data/lib/pdf/reader/font.rb +1 -0
  21. data/lib/pdf/reader/font_descriptor.rb +1 -0
  22. data/lib/pdf/reader/form_xobject.rb +1 -0
  23. data/lib/pdf/reader/glyph_hash.rb +16 -9
  24. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  25. data/lib/pdf/reader/lzw.rb +4 -2
  26. data/lib/pdf/reader/null_security_handler.rb +1 -0
  27. data/lib/pdf/reader/object_cache.rb +1 -0
  28. data/lib/pdf/reader/object_hash.rb +8 -3
  29. data/lib/pdf/reader/object_stream.rb +1 -0
  30. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  31. data/lib/pdf/reader/page.rb +60 -9
  32. data/lib/pdf/reader/page_layout.rb +37 -23
  33. data/lib/pdf/reader/page_state.rb +18 -23
  34. data/lib/pdf/reader/page_text_receiver.rb +28 -5
  35. data/lib/pdf/reader/pages_strategy.rb +1 -0
  36. data/lib/pdf/reader/parser.rb +12 -7
  37. data/lib/pdf/reader/point.rb +25 -0
  38. data/lib/pdf/reader/print_receiver.rb +1 -0
  39. data/lib/pdf/reader/rectangle.rb +95 -0
  40. data/lib/pdf/reader/reference.rb +1 -0
  41. data/lib/pdf/reader/register_receiver.rb +1 -0
  42. data/lib/pdf/reader/resource_methods.rb +5 -0
  43. data/lib/pdf/reader/standard_security_handler.rb +1 -0
  44. data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
  45. data/lib/pdf/reader/stream.rb +1 -0
  46. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  47. data/lib/pdf/reader/text_run.rb +1 -0
  48. data/lib/pdf/reader/token.rb +1 -0
  49. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  50. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  51. data/lib/pdf/reader/width_calculator/built_in.rb +8 -15
  52. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  53. data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
  54. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  55. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  56. data/lib/pdf/reader/width_calculator.rb +1 -0
  57. data/lib/pdf/reader/xref.rb +7 -1
  58. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  59. data/lib/pdf/reader.rb +14 -4
  60. data/lib/pdf-reader.rb +1 -0
  61. data/rbi/pdf-reader.rbi +1744 -0
  62. metadata +17 -13
  63. data/lib/pdf/reader/orientation_detector.rb +0 -34
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'ttfunk'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'digest/md5'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -103,19 +104,25 @@ class PDF::Reader
103
104
 
104
105
  # returns a hash that maps glyph names to unicode codepoints. The mapping is based on
105
106
  # a text file supplied by Adobe at:
106
- # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
107
+ # https://github.com/adobe-type-tools/agl-aglfn
107
108
  def load_adobe_glyph_mapping
108
109
  keyed_by_name = {}
109
110
  keyed_by_codepoint = {}
110
111
 
111
- File.open(File.dirname(__FILE__) + "/glyphlist.txt", "r:BINARY") do |f|
112
- f.each do |l|
113
- _m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
114
- if name && code
115
- cp = "0x#{code}".hex
116
- keyed_by_name[name.to_sym] = cp
117
- keyed_by_codepoint[cp] ||= []
118
- keyed_by_codepoint[cp] << name.to_sym
112
+ paths = [
113
+ File.dirname(__FILE__) + "/glyphlist.txt",
114
+ File.dirname(__FILE__) + "/glyphlist-zapfdingbats.txt",
115
+ ]
116
+ paths.each do |path|
117
+ File.open(path, "r:BINARY") do |f|
118
+ f.each do |l|
119
+ _m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
120
+ if name && code
121
+ cp = "0x#{code}".hex
122
+ keyed_by_name[name.to_sym] = cp
123
+ keyed_by_codepoint[cp] ||= []
124
+ keyed_by_codepoint[cp] << name.to_sym
125
+ end
119
126
  end
120
127
  end
121
128
  end
@@ -0,0 +1,245 @@
1
+ # -----------------------------------------------------------
2
+ # Copyright 2002-2019 Adobe (http://www.adobe.com/).
3
+ #
4
+ # Redistribution and use in source and binary forms, with or
5
+ # without modification, are permitted provided that the
6
+ # following conditions are met:
7
+ #
8
+ # Redistributions of source code must retain the above
9
+ # copyright notice, this list of conditions and the following
10
+ # disclaimer.
11
+ #
12
+ # Redistributions in binary form must reproduce the above
13
+ # copyright notice, this list of conditions and the following
14
+ # disclaimer in the documentation and/or other materials
15
+ # provided with the distribution.
16
+ #
17
+ # Neither the name of Adobe nor the names of its contributors
18
+ # may be used to endorse or promote products derived from this
19
+ # software without specific prior written permission.
20
+ #
21
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
22
+ # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
23
+ # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
26
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
28
+ # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30
+ # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
32
+ # OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
+ # -----------------------------------------------------------
35
+ # Name: ITC Zapf Dingbats Glyph List
36
+ # Table version: 2.0
37
+ # Date: September 20, 2002
38
+ # URL: https://github.com/adobe-type-tools/agl-aglfn
39
+ #
40
+ # Format: two semicolon-delimited fields:
41
+ # (1) glyph name--upper/lowercase letters and digits
42
+ # (2) Unicode scalar value--four uppercase hexadecimal digits
43
+ #
44
+ a100;275E
45
+ a101;2761
46
+ a102;2762
47
+ a103;2763
48
+ a104;2764
49
+ a105;2710
50
+ a106;2765
51
+ a107;2766
52
+ a108;2767
53
+ a109;2660
54
+ a10;2721
55
+ a110;2665
56
+ a111;2666
57
+ a112;2663
58
+ a117;2709
59
+ a118;2708
60
+ a119;2707
61
+ a11;261B
62
+ a120;2460
63
+ a121;2461
64
+ a122;2462
65
+ a123;2463
66
+ a124;2464
67
+ a125;2465
68
+ a126;2466
69
+ a127;2467
70
+ a128;2468
71
+ a129;2469
72
+ a12;261E
73
+ a130;2776
74
+ a131;2777
75
+ a132;2778
76
+ a133;2779
77
+ a134;277A
78
+ a135;277B
79
+ a136;277C
80
+ a137;277D
81
+ a138;277E
82
+ a139;277F
83
+ a13;270C
84
+ a140;2780
85
+ a141;2781
86
+ a142;2782
87
+ a143;2783
88
+ a144;2784
89
+ a145;2785
90
+ a146;2786
91
+ a147;2787
92
+ a148;2788
93
+ a149;2789
94
+ a14;270D
95
+ a150;278A
96
+ a151;278B
97
+ a152;278C
98
+ a153;278D
99
+ a154;278E
100
+ a155;278F
101
+ a156;2790
102
+ a157;2791
103
+ a158;2792
104
+ a159;2793
105
+ a15;270E
106
+ a160;2794
107
+ a161;2192
108
+ a162;27A3
109
+ a163;2194
110
+ a164;2195
111
+ a165;2799
112
+ a166;279B
113
+ a167;279C
114
+ a168;279D
115
+ a169;279E
116
+ a16;270F
117
+ a170;279F
118
+ a171;27A0
119
+ a172;27A1
120
+ a173;27A2
121
+ a174;27A4
122
+ a175;27A5
123
+ a176;27A6
124
+ a177;27A7
125
+ a178;27A8
126
+ a179;27A9
127
+ a17;2711
128
+ a180;27AB
129
+ a181;27AD
130
+ a182;27AF
131
+ a183;27B2
132
+ a184;27B3
133
+ a185;27B5
134
+ a186;27B8
135
+ a187;27BA
136
+ a188;27BB
137
+ a189;27BC
138
+ a18;2712
139
+ a190;27BD
140
+ a191;27BE
141
+ a192;279A
142
+ a193;27AA
143
+ a194;27B6
144
+ a195;27B9
145
+ a196;2798
146
+ a197;27B4
147
+ a198;27B7
148
+ a199;27AC
149
+ a19;2713
150
+ a1;2701
151
+ a200;27AE
152
+ a201;27B1
153
+ a202;2703
154
+ a203;2750
155
+ a204;2752
156
+ a205;276E
157
+ a206;2770
158
+ a20;2714
159
+ a21;2715
160
+ a22;2716
161
+ a23;2717
162
+ a24;2718
163
+ a25;2719
164
+ a26;271A
165
+ a27;271B
166
+ a28;271C
167
+ a29;2722
168
+ a2;2702
169
+ a30;2723
170
+ a31;2724
171
+ a32;2725
172
+ a33;2726
173
+ a34;2727
174
+ a35;2605
175
+ a36;2729
176
+ a37;272A
177
+ a38;272B
178
+ a39;272C
179
+ a3;2704
180
+ a40;272D
181
+ a41;272E
182
+ a42;272F
183
+ a43;2730
184
+ a44;2731
185
+ a45;2732
186
+ a46;2733
187
+ a47;2734
188
+ a48;2735
189
+ a49;2736
190
+ a4;260E
191
+ a50;2737
192
+ a51;2738
193
+ a52;2739
194
+ a53;273A
195
+ a54;273B
196
+ a55;273C
197
+ a56;273D
198
+ a57;273E
199
+ a58;273F
200
+ a59;2740
201
+ a5;2706
202
+ a60;2741
203
+ a61;2742
204
+ a62;2743
205
+ a63;2744
206
+ a64;2745
207
+ a65;2746
208
+ a66;2747
209
+ a67;2748
210
+ a68;2749
211
+ a69;274A
212
+ a6;271D
213
+ a70;274B
214
+ a71;25CF
215
+ a72;274D
216
+ a73;25A0
217
+ a74;274F
218
+ a75;2751
219
+ a76;25B2
220
+ a77;25BC
221
+ a78;25C6
222
+ a79;2756
223
+ a7;271E
224
+ a81;25D7
225
+ a82;2758
226
+ a83;2759
227
+ a84;275A
228
+ a85;276F
229
+ a86;2771
230
+ a87;2772
231
+ a88;2773
232
+ a89;2768
233
+ a8;271F
234
+ a90;2769
235
+ a91;276C
236
+ a92;276D
237
+ a93;276A
238
+ a94;276B
239
+ a95;2774
240
+ a96;2775
241
+ a97;275B
242
+ a98;275C
243
+ a99;275D
244
+ a9;2720
245
+ # END
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  module PDF
@@ -35,9 +36,9 @@ module PDF
35
36
 
36
37
  def read
37
38
  bits_left_in_chunk = @bits_in_chunk
38
- chunk = nil
39
+ chunk = -1
39
40
  while bits_left_in_chunk > 0 and @current_pos < @data.size
40
- chunk = 0 if chunk.nil?
41
+ chunk = 0 if chunk < 0
41
42
  codepoint = @data[@current_pos, 1].unpack("C*")[0]
42
43
  current_byte = codepoint & (2**@bits_left_in_byte - 1) #clear consumed bits
43
44
  dif = bits_left_in_chunk - @bits_left_in_byte
@@ -83,6 +84,7 @@ module PDF
83
84
  #
84
85
  def self.decode(data)
85
86
  stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
87
+ string_table = StringTable.new
86
88
  result = "".dup
87
89
  until (code = stream.read) == CODE_EOD
88
90
  if code == CODE_CLEAR_TABLE
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'hashery/lru_hash'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -331,11 +332,15 @@ class PDF::Reader
331
332
  def decrypt(ref, obj)
332
333
  case obj
333
334
  when PDF::Reader::Stream then
334
- obj.data = sec_handler.decrypt(obj.data, ref)
335
+ # PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
336
+ # Therefore we shouldn't try to decrypt it.
337
+ obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
335
338
  obj
336
339
  when Hash then
337
- arr = obj.map { |key,val| [key, decrypt(ref, val)] }.flatten(1)
338
- Hash[*arr]
340
+ arr = obj.map { |key,val| [key, decrypt(ref, val)] }
341
+ arr.each_with_object({}) { |(k,v), accum|
342
+ accum[k] = v
343
+ }
339
344
  when Array then
340
345
  obj.collect { |item| decrypt(ref, item) }
341
346
  when String
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,6 @@
1
+ # typed: true
1
2
  # coding: utf-8
3
+ # frozen_string_literal: true
2
4
 
3
5
  class PDF::Reader
4
6
  # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
@@ -38,7 +40,8 @@ class PDF::Reader
38
40
 
39
41
  def self.detect_intersection(sweep_line_status, event_point)
40
42
  sweep_line_status.each do |open_text_run|
41
- if event_point.x >= open_text_run.x &&
43
+ if open_text_run.text == event_point.run.text &&
44
+ event_point.x >= open_text_run.x &&
42
45
  event_point.x <= open_text_run.endx &&
43
46
  open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
44
47
  return true
@@ -51,10 +54,14 @@ class PDF::Reader
51
54
  # Utility class used to avoid modifying the underlying TextRun objects while we're
52
55
  # looking for duplicates
53
56
  class EventPoint
54
- attr_reader :x, :run
55
57
 
56
- def initialize x, run
57
- @x, @run = x, run
58
+ attr_reader :x
59
+
60
+ attr_reader :run
61
+
62
+ def initialize(x, run)
63
+ @x = x
64
+ @run = run
58
65
  end
59
66
 
60
67
  def start?
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  module PDF
@@ -68,10 +69,33 @@ module PDF
68
69
  @attributes
69
70
  end
70
71
 
72
+ def height
73
+ rect = Rectangle.new(*attributes[:MediaBox])
74
+ rect.apply_rotation(rotate) if rotate > 0
75
+ rect.height
76
+ end
77
+
78
+ def width
79
+ rect = Rectangle.new(*attributes[:MediaBox])
80
+ rect.apply_rotation(rotate) if rotate > 0
81
+ rect.width
82
+ end
83
+
84
+ def origin
85
+ rect = Rectangle.new(*attributes[:MediaBox])
86
+ rect.apply_rotation(rotate) if rotate > 0
87
+
88
+ rect.bottom_left
89
+ end
90
+
71
91
  # Convenience method to identify the page's orientation.
72
92
  #
73
93
  def orientation
74
- OrientationDetector.new(attributes).orientation
94
+ if height > width
95
+ "portrait"
96
+ else
97
+ "landscape"
98
+ end
75
99
  end
76
100
 
77
101
  # returns the plain text content of this page encoded as UTF-8. Any
@@ -139,23 +163,50 @@ module PDF
139
163
  # returns the "boxes" that define the page object.
140
164
  # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
141
165
  #
166
+ # DEPRECATED. Recommend using Page#rectangles instead
167
+ #
142
168
  def boxes
143
- mediabox = attributes[:MediaBox]
144
- cropbox = attributes[:Cropbox] || mediabox
169
+ # In ruby 2.4+ we could use Hash#transform_values
170
+ Hash[rectangles.map{ |k,rect| [k,rect.to_a] } ]
171
+ end
172
+
173
+ # returns the "boxes" that define the page object.
174
+ # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
175
+ #
176
+ def rectangles
177
+ mediabox = objects.deref!(attributes[:MediaBox])
178
+ cropbox = objects.deref!(attributes[:Cropbox]) || mediabox
179
+ bleedbox = objects.deref!(attributes[:BleedBox]) || cropbox
180
+ trimbox = objects.deref!(attributes[:TrimBox]) || cropbox
181
+ artbox = objects.deref!(attributes[:ArtBox]) || cropbox
182
+
183
+ mediarect = Rectangle.new(*mediabox)
184
+ croprect = Rectangle.new(*cropbox)
185
+ bleedrect = Rectangle.new(*bleedbox)
186
+ trimrect = Rectangle.new(*trimbox)
187
+ artrect = Rectangle.new(*artbox)
188
+
189
+ if rotate > 0
190
+ mediarect.apply_rotation(rotate)
191
+ croprect.apply_rotation(rotate)
192
+ bleedrect.apply_rotation(rotate)
193
+ trimrect.apply_rotation(rotate)
194
+ artrect.apply_rotation(rotate)
195
+ end
145
196
 
146
197
  {
147
- MediaBox: objects.deref!(mediabox),
148
- CropBox: objects.deref!(cropbox),
149
- BleedBox: objects.deref!(attributes[:BleedBox] || cropbox),
150
- TrimBox: objects.deref!(attributes[:TrimBox] || cropbox),
151
- ArtBox: objects.deref!(attributes[:ArtBox] || cropbox)
198
+ MediaBox: mediarect,
199
+ CropBox: croprect,
200
+ BleedBox: bleedrect,
201
+ TrimBox: trimrect,
202
+ ArtBox: artrect,
152
203
  }
153
204
  end
154
205
 
155
206
  private
156
207
 
157
208
  def root
158
- root ||= objects.deref(@objects.trailer[:Root])
209
+ @root ||= objects.deref(@objects.trailer[:Root])
159
210
  end
160
211
 
161
212
  # Returns the resources that accompany this page. Includes
@@ -1,7 +1,9 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'pdf/reader/overlapping_runs_filter'
6
+ require 'pdf/reader/zero_width_runs_filter'
5
7
 
6
8
  class PDF::Reader
7
9
 
@@ -15,14 +17,17 @@ class PDF::Reader
15
17
  DEFAULT_FONT_SIZE = 12
16
18
 
17
19
  def initialize(runs, mediabox)
18
- raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
19
-
20
- @runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
20
+ # mediabox is a 4-element array for now, but it'd be nice to switch to a
21
+ # PDF::Reader::Rectangle at some point
22
+ PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
23
+
24
+ runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
25
+ runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
26
+ @mediabox = mediabox
27
+ @runs = merge_runs(runs)
21
28
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
22
29
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
23
- @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
24
- @page_width = (mediabox[2] - mediabox[0]).abs
25
- @page_height = (mediabox[3] - mediabox[1]).abs
30
+ @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
26
31
  @x_offset = @runs.map(&:x).sort.first || 0
27
32
  lowest_y = @runs.map(&:y).sort.first || 0
28
33
  @y_offset = lowest_y > 0 ? 0 : lowest_y
@@ -30,6 +35,7 @@ class PDF::Reader
30
35
 
31
36
  def to_s
32
37
  return "" if @runs.empty?
38
+ return "" if row_count == 0
33
39
 
34
40
  page = row_count.times.map { |i| " " * col_count }
35
41
  @runs.each do |run|
@@ -44,6 +50,16 @@ class PDF::Reader
44
50
 
45
51
  private
46
52
 
53
+ def page_width
54
+ # TODO once @mediabox is a Rectangle, this can be just `@mediabox.width`
55
+ (@mediabox[2].to_f - @mediabox[0].to_f).abs
56
+ end
57
+
58
+ def page_height
59
+ # TODO once @mediabox is a Rectangle, this can be just `@mediabox.height`
60
+ (@mediabox[3].to_f - @mediabox[1].to_f).abs
61
+ end
62
+
47
63
  # given an array of strings, return a new array with empty rows from the
48
64
  # beginning and end removed.
49
65
  #
@@ -62,19 +78,19 @@ class PDF::Reader
62
78
  end
63
79
 
64
80
  def row_count
65
- @row_count ||= (@page_height / @mean_font_size).floor
81
+ @row_count ||= (page_height / @mean_font_size).floor
66
82
  end
67
83
 
68
84
  def col_count
69
- @col_count ||= ((@page_width / @mean_glyph_width) * 1.05).floor
85
+ @col_count ||= ((page_width / @median_glyph_width) * 1.05).floor
70
86
  end
71
87
 
72
88
  def row_multiplier
73
- @row_multiplier ||= @page_height.to_f / row_count.to_f
89
+ @row_multiplier ||= page_height.to_f / row_count.to_f
74
90
  end
75
91
 
76
92
  def col_multiplier
77
- @col_multiplier ||= @page_width.to_f / col_count.to_f
93
+ @col_multiplier ||= page_width.to_f / col_count.to_f
78
94
  end
79
95
 
80
96
  def mean(collection)
@@ -85,12 +101,12 @@ class PDF::Reader
85
101
  end
86
102
  end
87
103
 
88
- def each_line(&block)
89
- @runs.sort.group_by { |run|
90
- run.y.to_i
91
- }.map { |y, collection|
92
- yield y, collection
93
- }
104
+ def median(collection)
105
+ if collection.size == 0
106
+ 0
107
+ else
108
+ collection.sort[(collection.size * 0.5).floor]
109
+ end
94
110
  end
95
111
 
96
112
  # take a collection of TextRun objects and merge any that are in close
@@ -104,17 +120,15 @@ class PDF::Reader
104
120
  end
105
121
 
106
122
  def group_chars_into_runs(chars)
107
- runs = []
108
- while head = chars.shift
123
+ chars.each_with_object([]) do |char, runs|
109
124
  if runs.empty?
110
- runs << head
111
- elsif runs.last.mergable?(head)
112
- runs[-1] = runs.last + head
125
+ runs << char
126
+ elsif runs.last.mergable?(char)
127
+ runs[-1] = runs.last + char
113
128
  else
114
- runs << head
129
+ runs << char
115
130
  end
116
131
  end
117
- runs
118
132
  end
119
133
 
120
134
  def local_string_insert(haystack, needle, index)
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'pdf/reader/transformation_matrix'
@@ -30,15 +31,7 @@ class PDF::Reader
30
31
  @xobject_stack = [page.xobjects]
31
32
  @cs_stack = [page.color_spaces]
32
33
  @stack = [DEFAULT_GRAPHICS_STATE.dup]
33
- if page.rotate == 0
34
- state[:ctm] = identity_matrix
35
- else
36
- rotate_cos = Math.cos(page.rotate * (Math::PI/180.0)).round(2)
37
- rotate_sin = Math.sin(page.rotate * (Math::PI/180.0)).round(2)
38
- state[:ctm] = TransformationMatrix.new(rotate_cos, rotate_sin,
39
- rotate_sin * -1, rotate_cos,
40
- 0, 0)
41
- end
34
+ state[:ctm] = identity_matrix
42
35
  end
43
36
 
44
37
  #####################################################
@@ -320,7 +313,7 @@ class PDF::Reader
320
313
  # may need to be added
321
314
  #
322
315
  def process_glyph_displacement(w0, tj, word_boundary)
323
- fs = font_size # font size
316
+ fs = state[:text_font_size]
324
317
  tc = state[:char_spacing]
325
318
  if word_boundary
326
319
  tw = state[:word_spacing]
@@ -330,22 +323,24 @@ class PDF::Reader
330
323
  th = state[:h_scaling]
331
324
  # optimise the common path to reduce Float allocations
332
325
  if th == 1 && tj == 0 && tc == 0 && tw == 0
333
- glyph_width = w0 * fs
334
- tx = glyph_width
326
+ tx = w0 * fs
327
+ elsif tj != 0
328
+ # don't apply spacing to TJ displacement
329
+ tx = (w0 - (tj/1000.0)) * fs * th
335
330
  else
336
- glyph_width = ((w0 - (tj/1000.0)) * fs) * th
337
- tx = glyph_width + ((tc + tw) * th)
331
+ # apply horizontal scaling to spacing values but not font size
332
+ tx = ((w0 * fs) + tc + tw) * th
338
333
  end
339
-
340
- # TODO: I'm pretty sure that tx shouldn't need to be divided by
341
- # ctm[0] here, but this gets my tests green and I'm out of
342
- # ideas for now
343
334
  # TODO: support ty > 0
344
- if ctm.a == 1 || ctm.a == 0
345
- @text_matrix.horizontal_displacement_multiply!(tx)
346
- else
347
- @text_matrix.horizontal_displacement_multiply!(tx/ctm.a)
348
- end
335
+ ty = 0
336
+ temp = TransformationMatrix.new(1, 0,
337
+ 0, 1,
338
+ tx, ty)
339
+ @text_matrix = temp.multiply!(
340
+ @text_matrix.a, @text_matrix.b,
341
+ @text_matrix.c, @text_matrix.d,
342
+ @text_matrix.e, @text_matrix.f
343
+ )
349
344
  @font_size = @text_rendering_matrix = nil # invalidate cached value
350
345
  end
351
346