pdf-reader 2.4.1 → 2.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +40 -0
  3. data/README.md +16 -1
  4. data/Rakefile +1 -1
  5. data/examples/extract_fonts.rb +12 -7
  6. data/examples/rspec.rb +1 -0
  7. data/lib/pdf/reader/buffer.rb +63 -21
  8. data/lib/pdf/reader/cid_widths.rb +1 -0
  9. data/lib/pdf/reader/cmap.rb +5 -3
  10. data/lib/pdf/reader/encoding.rb +3 -2
  11. data/lib/pdf/reader/error.rb +11 -3
  12. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  13. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  14. data/lib/pdf/reader/filter/depredict.rb +10 -8
  15. data/lib/pdf/reader/filter/flate.rb +27 -14
  16. data/lib/pdf/reader/filter/lzw.rb +2 -0
  17. data/lib/pdf/reader/filter/null.rb +1 -0
  18. data/lib/pdf/reader/filter/run_length.rb +19 -13
  19. data/lib/pdf/reader/filter.rb +1 -0
  20. data/lib/pdf/reader/font.rb +1 -0
  21. data/lib/pdf/reader/font_descriptor.rb +1 -0
  22. data/lib/pdf/reader/form_xobject.rb +1 -0
  23. data/lib/pdf/reader/glyph_hash.rb +16 -9
  24. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  25. data/lib/pdf/reader/lzw.rb +4 -2
  26. data/lib/pdf/reader/null_security_handler.rb +1 -0
  27. data/lib/pdf/reader/object_cache.rb +1 -0
  28. data/lib/pdf/reader/object_hash.rb +8 -3
  29. data/lib/pdf/reader/object_stream.rb +1 -0
  30. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  31. data/lib/pdf/reader/page.rb +60 -9
  32. data/lib/pdf/reader/page_layout.rb +37 -23
  33. data/lib/pdf/reader/page_state.rb +18 -23
  34. data/lib/pdf/reader/page_text_receiver.rb +28 -5
  35. data/lib/pdf/reader/pages_strategy.rb +1 -0
  36. data/lib/pdf/reader/parser.rb +12 -7
  37. data/lib/pdf/reader/point.rb +25 -0
  38. data/lib/pdf/reader/print_receiver.rb +1 -0
  39. data/lib/pdf/reader/rectangle.rb +95 -0
  40. data/lib/pdf/reader/reference.rb +1 -0
  41. data/lib/pdf/reader/register_receiver.rb +1 -0
  42. data/lib/pdf/reader/resource_methods.rb +5 -0
  43. data/lib/pdf/reader/standard_security_handler.rb +1 -0
  44. data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
  45. data/lib/pdf/reader/stream.rb +1 -0
  46. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  47. data/lib/pdf/reader/text_run.rb +1 -0
  48. data/lib/pdf/reader/token.rb +1 -0
  49. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  50. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  51. data/lib/pdf/reader/width_calculator/built_in.rb +8 -15
  52. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  53. data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
  54. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  55. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  56. data/lib/pdf/reader/width_calculator.rb +1 -0
  57. data/lib/pdf/reader/xref.rb +7 -1
  58. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  59. data/lib/pdf/reader.rb +14 -4
  60. data/lib/pdf-reader.rb +1 -0
  61. data/rbi/pdf-reader.rbi +1744 -0
  62. metadata +17 -13
  63. data/lib/pdf/reader/orientation_detector.rb +0 -34
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'ttfunk'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'digest/md5'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -103,19 +104,25 @@ class PDF::Reader
103
104
 
104
105
  # returns a hash that maps glyph names to unicode codepoints. The mapping is based on
105
106
  # a text file supplied by Adobe at:
106
- # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
107
+ # https://github.com/adobe-type-tools/agl-aglfn
107
108
  def load_adobe_glyph_mapping
108
109
  keyed_by_name = {}
109
110
  keyed_by_codepoint = {}
110
111
 
111
- File.open(File.dirname(__FILE__) + "/glyphlist.txt", "r:BINARY") do |f|
112
- f.each do |l|
113
- _m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
114
- if name && code
115
- cp = "0x#{code}".hex
116
- keyed_by_name[name.to_sym] = cp
117
- keyed_by_codepoint[cp] ||= []
118
- keyed_by_codepoint[cp] << name.to_sym
112
+ paths = [
113
+ File.dirname(__FILE__) + "/glyphlist.txt",
114
+ File.dirname(__FILE__) + "/glyphlist-zapfdingbats.txt",
115
+ ]
116
+ paths.each do |path|
117
+ File.open(path, "r:BINARY") do |f|
118
+ f.each do |l|
119
+ _m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
120
+ if name && code
121
+ cp = "0x#{code}".hex
122
+ keyed_by_name[name.to_sym] = cp
123
+ keyed_by_codepoint[cp] ||= []
124
+ keyed_by_codepoint[cp] << name.to_sym
125
+ end
119
126
  end
120
127
  end
121
128
  end
@@ -0,0 +1,245 @@
1
+ # -----------------------------------------------------------
2
+ # Copyright 2002-2019 Adobe (http://www.adobe.com/).
3
+ #
4
+ # Redistribution and use in source and binary forms, with or
5
+ # without modification, are permitted provided that the
6
+ # following conditions are met:
7
+ #
8
+ # Redistributions of source code must retain the above
9
+ # copyright notice, this list of conditions and the following
10
+ # disclaimer.
11
+ #
12
+ # Redistributions in binary form must reproduce the above
13
+ # copyright notice, this list of conditions and the following
14
+ # disclaimer in the documentation and/or other materials
15
+ # provided with the distribution.
16
+ #
17
+ # Neither the name of Adobe nor the names of its contributors
18
+ # may be used to endorse or promote products derived from this
19
+ # software without specific prior written permission.
20
+ #
21
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
22
+ # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
23
+ # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
26
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
28
+ # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30
+ # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
32
+ # OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
+ # -----------------------------------------------------------
35
+ # Name: ITC Zapf Dingbats Glyph List
36
+ # Table version: 2.0
37
+ # Date: September 20, 2002
38
+ # URL: https://github.com/adobe-type-tools/agl-aglfn
39
+ #
40
+ # Format: two semicolon-delimited fields:
41
+ # (1) glyph name--upper/lowercase letters and digits
42
+ # (2) Unicode scalar value--four uppercase hexadecimal digits
43
+ #
44
+ a100;275E
45
+ a101;2761
46
+ a102;2762
47
+ a103;2763
48
+ a104;2764
49
+ a105;2710
50
+ a106;2765
51
+ a107;2766
52
+ a108;2767
53
+ a109;2660
54
+ a10;2721
55
+ a110;2665
56
+ a111;2666
57
+ a112;2663
58
+ a117;2709
59
+ a118;2708
60
+ a119;2707
61
+ a11;261B
62
+ a120;2460
63
+ a121;2461
64
+ a122;2462
65
+ a123;2463
66
+ a124;2464
67
+ a125;2465
68
+ a126;2466
69
+ a127;2467
70
+ a128;2468
71
+ a129;2469
72
+ a12;261E
73
+ a130;2776
74
+ a131;2777
75
+ a132;2778
76
+ a133;2779
77
+ a134;277A
78
+ a135;277B
79
+ a136;277C
80
+ a137;277D
81
+ a138;277E
82
+ a139;277F
83
+ a13;270C
84
+ a140;2780
85
+ a141;2781
86
+ a142;2782
87
+ a143;2783
88
+ a144;2784
89
+ a145;2785
90
+ a146;2786
91
+ a147;2787
92
+ a148;2788
93
+ a149;2789
94
+ a14;270D
95
+ a150;278A
96
+ a151;278B
97
+ a152;278C
98
+ a153;278D
99
+ a154;278E
100
+ a155;278F
101
+ a156;2790
102
+ a157;2791
103
+ a158;2792
104
+ a159;2793
105
+ a15;270E
106
+ a160;2794
107
+ a161;2192
108
+ a162;27A3
109
+ a163;2194
110
+ a164;2195
111
+ a165;2799
112
+ a166;279B
113
+ a167;279C
114
+ a168;279D
115
+ a169;279E
116
+ a16;270F
117
+ a170;279F
118
+ a171;27A0
119
+ a172;27A1
120
+ a173;27A2
121
+ a174;27A4
122
+ a175;27A5
123
+ a176;27A6
124
+ a177;27A7
125
+ a178;27A8
126
+ a179;27A9
127
+ a17;2711
128
+ a180;27AB
129
+ a181;27AD
130
+ a182;27AF
131
+ a183;27B2
132
+ a184;27B3
133
+ a185;27B5
134
+ a186;27B8
135
+ a187;27BA
136
+ a188;27BB
137
+ a189;27BC
138
+ a18;2712
139
+ a190;27BD
140
+ a191;27BE
141
+ a192;279A
142
+ a193;27AA
143
+ a194;27B6
144
+ a195;27B9
145
+ a196;2798
146
+ a197;27B4
147
+ a198;27B7
148
+ a199;27AC
149
+ a19;2713
150
+ a1;2701
151
+ a200;27AE
152
+ a201;27B1
153
+ a202;2703
154
+ a203;2750
155
+ a204;2752
156
+ a205;276E
157
+ a206;2770
158
+ a20;2714
159
+ a21;2715
160
+ a22;2716
161
+ a23;2717
162
+ a24;2718
163
+ a25;2719
164
+ a26;271A
165
+ a27;271B
166
+ a28;271C
167
+ a29;2722
168
+ a2;2702
169
+ a30;2723
170
+ a31;2724
171
+ a32;2725
172
+ a33;2726
173
+ a34;2727
174
+ a35;2605
175
+ a36;2729
176
+ a37;272A
177
+ a38;272B
178
+ a39;272C
179
+ a3;2704
180
+ a40;272D
181
+ a41;272E
182
+ a42;272F
183
+ a43;2730
184
+ a44;2731
185
+ a45;2732
186
+ a46;2733
187
+ a47;2734
188
+ a48;2735
189
+ a49;2736
190
+ a4;260E
191
+ a50;2737
192
+ a51;2738
193
+ a52;2739
194
+ a53;273A
195
+ a54;273B
196
+ a55;273C
197
+ a56;273D
198
+ a57;273E
199
+ a58;273F
200
+ a59;2740
201
+ a5;2706
202
+ a60;2741
203
+ a61;2742
204
+ a62;2743
205
+ a63;2744
206
+ a64;2745
207
+ a65;2746
208
+ a66;2747
209
+ a67;2748
210
+ a68;2749
211
+ a69;274A
212
+ a6;271D
213
+ a70;274B
214
+ a71;25CF
215
+ a72;274D
216
+ a73;25A0
217
+ a74;274F
218
+ a75;2751
219
+ a76;25B2
220
+ a77;25BC
221
+ a78;25C6
222
+ a79;2756
223
+ a7;271E
224
+ a81;25D7
225
+ a82;2758
226
+ a83;2759
227
+ a84;275A
228
+ a85;276F
229
+ a86;2771
230
+ a87;2772
231
+ a88;2773
232
+ a89;2768
233
+ a8;271F
234
+ a90;2769
235
+ a91;276C
236
+ a92;276D
237
+ a93;276A
238
+ a94;276B
239
+ a95;2774
240
+ a96;2775
241
+ a97;275B
242
+ a98;275C
243
+ a99;275D
244
+ a9;2720
245
+ # END
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  module PDF
@@ -35,9 +36,9 @@ module PDF
35
36
 
36
37
  def read
37
38
  bits_left_in_chunk = @bits_in_chunk
38
- chunk = nil
39
+ chunk = -1
39
40
  while bits_left_in_chunk > 0 and @current_pos < @data.size
40
- chunk = 0 if chunk.nil?
41
+ chunk = 0 if chunk < 0
41
42
  codepoint = @data[@current_pos, 1].unpack("C*")[0]
42
43
  current_byte = codepoint & (2**@bits_left_in_byte - 1) #clear consumed bits
43
44
  dif = bits_left_in_chunk - @bits_left_in_byte
@@ -83,6 +84,7 @@ module PDF
83
84
  #
84
85
  def self.decode(data)
85
86
  stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
87
+ string_table = StringTable.new
86
88
  result = "".dup
87
89
  until (code = stream.read) == CODE_EOD
88
90
  if code == CODE_CLEAR_TABLE
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'hashery/lru_hash'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -331,11 +332,15 @@ class PDF::Reader
331
332
  def decrypt(ref, obj)
332
333
  case obj
333
334
  when PDF::Reader::Stream then
334
- obj.data = sec_handler.decrypt(obj.data, ref)
335
+ # PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
336
+ # Therefore we shouldn't try to decrypt it.
337
+ obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
335
338
  obj
336
339
  when Hash then
337
- arr = obj.map { |key,val| [key, decrypt(ref, val)] }.flatten(1)
338
- Hash[*arr]
340
+ arr = obj.map { |key,val| [key, decrypt(ref, val)] }
341
+ arr.each_with_object({}) { |(k,v), accum|
342
+ accum[k] = v
343
+ }
339
344
  when Array then
340
345
  obj.collect { |item| decrypt(ref, item) }
341
346
  when String
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,6 @@
1
+ # typed: true
1
2
  # coding: utf-8
3
+ # frozen_string_literal: true
2
4
 
3
5
  class PDF::Reader
4
6
  # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
@@ -38,7 +40,8 @@ class PDF::Reader
38
40
 
39
41
  def self.detect_intersection(sweep_line_status, event_point)
40
42
  sweep_line_status.each do |open_text_run|
41
- if event_point.x >= open_text_run.x &&
43
+ if open_text_run.text == event_point.run.text &&
44
+ event_point.x >= open_text_run.x &&
42
45
  event_point.x <= open_text_run.endx &&
43
46
  open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
44
47
  return true
@@ -51,10 +54,14 @@ class PDF::Reader
51
54
  # Utility class used to avoid modifying the underlying TextRun objects while we're
52
55
  # looking for duplicates
53
56
  class EventPoint
54
- attr_reader :x, :run
55
57
 
56
- def initialize x, run
57
- @x, @run = x, run
58
+ attr_reader :x
59
+
60
+ attr_reader :run
61
+
62
+ def initialize(x, run)
63
+ @x = x
64
+ @run = run
58
65
  end
59
66
 
60
67
  def start?
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  module PDF
@@ -68,10 +69,33 @@ module PDF
68
69
  @attributes
69
70
  end
70
71
 
72
+ def height
73
+ rect = Rectangle.new(*attributes[:MediaBox])
74
+ rect.apply_rotation(rotate) if rotate > 0
75
+ rect.height
76
+ end
77
+
78
+ def width
79
+ rect = Rectangle.new(*attributes[:MediaBox])
80
+ rect.apply_rotation(rotate) if rotate > 0
81
+ rect.width
82
+ end
83
+
84
+ def origin
85
+ rect = Rectangle.new(*attributes[:MediaBox])
86
+ rect.apply_rotation(rotate) if rotate > 0
87
+
88
+ rect.bottom_left
89
+ end
90
+
71
91
  # Convenience method to identify the page's orientation.
72
92
  #
73
93
  def orientation
74
- OrientationDetector.new(attributes).orientation
94
+ if height > width
95
+ "portrait"
96
+ else
97
+ "landscape"
98
+ end
75
99
  end
76
100
 
77
101
  # returns the plain text content of this page encoded as UTF-8. Any
@@ -139,23 +163,50 @@ module PDF
139
163
  # returns the "boxes" that define the page object.
140
164
  # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
141
165
  #
166
+ # DEPRECATED. Recommend using Page#rectangles instead
167
+ #
142
168
  def boxes
143
- mediabox = attributes[:MediaBox]
144
- cropbox = attributes[:Cropbox] || mediabox
169
+ # In ruby 2.4+ we could use Hash#transform_values
170
+ Hash[rectangles.map{ |k,rect| [k,rect.to_a] } ]
171
+ end
172
+
173
+ # returns the "boxes" that define the page object.
174
+ # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
175
+ #
176
+ def rectangles
177
+ mediabox = objects.deref!(attributes[:MediaBox])
178
+ cropbox = objects.deref!(attributes[:Cropbox]) || mediabox
179
+ bleedbox = objects.deref!(attributes[:BleedBox]) || cropbox
180
+ trimbox = objects.deref!(attributes[:TrimBox]) || cropbox
181
+ artbox = objects.deref!(attributes[:ArtBox]) || cropbox
182
+
183
+ mediarect = Rectangle.new(*mediabox)
184
+ croprect = Rectangle.new(*cropbox)
185
+ bleedrect = Rectangle.new(*bleedbox)
186
+ trimrect = Rectangle.new(*trimbox)
187
+ artrect = Rectangle.new(*artbox)
188
+
189
+ if rotate > 0
190
+ mediarect.apply_rotation(rotate)
191
+ croprect.apply_rotation(rotate)
192
+ bleedrect.apply_rotation(rotate)
193
+ trimrect.apply_rotation(rotate)
194
+ artrect.apply_rotation(rotate)
195
+ end
145
196
 
146
197
  {
147
- MediaBox: objects.deref!(mediabox),
148
- CropBox: objects.deref!(cropbox),
149
- BleedBox: objects.deref!(attributes[:BleedBox] || cropbox),
150
- TrimBox: objects.deref!(attributes[:TrimBox] || cropbox),
151
- ArtBox: objects.deref!(attributes[:ArtBox] || cropbox)
198
+ MediaBox: mediarect,
199
+ CropBox: croprect,
200
+ BleedBox: bleedrect,
201
+ TrimBox: trimrect,
202
+ ArtBox: artrect,
152
203
  }
153
204
  end
154
205
 
155
206
  private
156
207
 
157
208
  def root
158
- root ||= objects.deref(@objects.trailer[:Root])
209
+ @root ||= objects.deref(@objects.trailer[:Root])
159
210
  end
160
211
 
161
212
  # Returns the resources that accompany this page. Includes
@@ -1,7 +1,9 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'pdf/reader/overlapping_runs_filter'
6
+ require 'pdf/reader/zero_width_runs_filter'
5
7
 
6
8
  class PDF::Reader
7
9
 
@@ -15,14 +17,17 @@ class PDF::Reader
15
17
  DEFAULT_FONT_SIZE = 12
16
18
 
17
19
  def initialize(runs, mediabox)
18
- raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
19
-
20
- @runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
20
+ # mediabox is a 4-element array for now, but it'd be nice to switch to a
21
+ # PDF::Reader::Rectangle at some point
22
+ PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
23
+
24
+ runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
25
+ runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
26
+ @mediabox = mediabox
27
+ @runs = merge_runs(runs)
21
28
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
22
29
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
23
- @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
24
- @page_width = (mediabox[2] - mediabox[0]).abs
25
- @page_height = (mediabox[3] - mediabox[1]).abs
30
+ @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
26
31
  @x_offset = @runs.map(&:x).sort.first || 0
27
32
  lowest_y = @runs.map(&:y).sort.first || 0
28
33
  @y_offset = lowest_y > 0 ? 0 : lowest_y
@@ -30,6 +35,7 @@ class PDF::Reader
30
35
 
31
36
  def to_s
32
37
  return "" if @runs.empty?
38
+ return "" if row_count == 0
33
39
 
34
40
  page = row_count.times.map { |i| " " * col_count }
35
41
  @runs.each do |run|
@@ -44,6 +50,16 @@ class PDF::Reader
44
50
 
45
51
  private
46
52
 
53
+ def page_width
54
+ # TODO once @mediabox is a Rectangle, this can be just `@mediabox.width`
55
+ (@mediabox[2].to_f - @mediabox[0].to_f).abs
56
+ end
57
+
58
+ def page_height
59
+ # TODO once @mediabox is a Rectangle, this can be just `@mediabox.height`
60
+ (@mediabox[3].to_f - @mediabox[1].to_f).abs
61
+ end
62
+
47
63
  # given an array of strings, return a new array with empty rows from the
48
64
  # beginning and end removed.
49
65
  #
@@ -62,19 +78,19 @@ class PDF::Reader
62
78
  end
63
79
 
64
80
  def row_count
65
- @row_count ||= (@page_height / @mean_font_size).floor
81
+ @row_count ||= (page_height / @mean_font_size).floor
66
82
  end
67
83
 
68
84
  def col_count
69
- @col_count ||= ((@page_width / @mean_glyph_width) * 1.05).floor
85
+ @col_count ||= ((page_width / @median_glyph_width) * 1.05).floor
70
86
  end
71
87
 
72
88
  def row_multiplier
73
- @row_multiplier ||= @page_height.to_f / row_count.to_f
89
+ @row_multiplier ||= page_height.to_f / row_count.to_f
74
90
  end
75
91
 
76
92
  def col_multiplier
77
- @col_multiplier ||= @page_width.to_f / col_count.to_f
93
+ @col_multiplier ||= page_width.to_f / col_count.to_f
78
94
  end
79
95
 
80
96
  def mean(collection)
@@ -85,12 +101,12 @@ class PDF::Reader
85
101
  end
86
102
  end
87
103
 
88
- def each_line(&block)
89
- @runs.sort.group_by { |run|
90
- run.y.to_i
91
- }.map { |y, collection|
92
- yield y, collection
93
- }
104
+ def median(collection)
105
+ if collection.size == 0
106
+ 0
107
+ else
108
+ collection.sort[(collection.size * 0.5).floor]
109
+ end
94
110
  end
95
111
 
96
112
  # take a collection of TextRun objects and merge any that are in close
@@ -104,17 +120,15 @@ class PDF::Reader
104
120
  end
105
121
 
106
122
  def group_chars_into_runs(chars)
107
- runs = []
108
- while head = chars.shift
123
+ chars.each_with_object([]) do |char, runs|
109
124
  if runs.empty?
110
- runs << head
111
- elsif runs.last.mergable?(head)
112
- runs[-1] = runs.last + head
125
+ runs << char
126
+ elsif runs.last.mergable?(char)
127
+ runs[-1] = runs.last + char
113
128
  else
114
- runs << head
129
+ runs << char
115
130
  end
116
131
  end
117
- runs
118
132
  end
119
133
 
120
134
  def local_string_insert(haystack, needle, index)
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'pdf/reader/transformation_matrix'
@@ -30,15 +31,7 @@ class PDF::Reader
30
31
  @xobject_stack = [page.xobjects]
31
32
  @cs_stack = [page.color_spaces]
32
33
  @stack = [DEFAULT_GRAPHICS_STATE.dup]
33
- if page.rotate == 0
34
- state[:ctm] = identity_matrix
35
- else
36
- rotate_cos = Math.cos(page.rotate * (Math::PI/180.0)).round(2)
37
- rotate_sin = Math.sin(page.rotate * (Math::PI/180.0)).round(2)
38
- state[:ctm] = TransformationMatrix.new(rotate_cos, rotate_sin,
39
- rotate_sin * -1, rotate_cos,
40
- 0, 0)
41
- end
34
+ state[:ctm] = identity_matrix
42
35
  end
43
36
 
44
37
  #####################################################
@@ -320,7 +313,7 @@ class PDF::Reader
320
313
  # may need to be added
321
314
  #
322
315
  def process_glyph_displacement(w0, tj, word_boundary)
323
- fs = font_size # font size
316
+ fs = state[:text_font_size]
324
317
  tc = state[:char_spacing]
325
318
  if word_boundary
326
319
  tw = state[:word_spacing]
@@ -330,22 +323,24 @@ class PDF::Reader
330
323
  th = state[:h_scaling]
331
324
  # optimise the common path to reduce Float allocations
332
325
  if th == 1 && tj == 0 && tc == 0 && tw == 0
333
- glyph_width = w0 * fs
334
- tx = glyph_width
326
+ tx = w0 * fs
327
+ elsif tj != 0
328
+ # don't apply spacing to TJ displacement
329
+ tx = (w0 - (tj/1000.0)) * fs * th
335
330
  else
336
- glyph_width = ((w0 - (tj/1000.0)) * fs) * th
337
- tx = glyph_width + ((tc + tw) * th)
331
+ # apply horizontal scaling to spacing values but not font size
332
+ tx = ((w0 * fs) + tc + tw) * th
338
333
  end
339
-
340
- # TODO: I'm pretty sure that tx shouldn't need to be divided by
341
- # ctm[0] here, but this gets my tests green and I'm out of
342
- # ideas for now
343
334
  # TODO: support ty > 0
344
- if ctm.a == 1 || ctm.a == 0
345
- @text_matrix.horizontal_displacement_multiply!(tx)
346
- else
347
- @text_matrix.horizontal_displacement_multiply!(tx/ctm.a)
348
- end
335
+ ty = 0
336
+ temp = TransformationMatrix.new(1, 0,
337
+ 0, 1,
338
+ tx, ty)
339
+ @text_matrix = temp.multiply!(
340
+ @text_matrix.a, @text_matrix.b,
341
+ @text_matrix.c, @text_matrix.d,
342
+ @text_matrix.e, @text_matrix.f
343
+ )
349
344
  @font_size = @text_rendering_matrix = nil # invalidate cached value
350
345
  end
351
346