pdf-reader 2.4.1 → 2.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +40 -0
- data/README.md +16 -1
- data/Rakefile +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/buffer.rb +63 -21
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +5 -3
- data/lib/pdf/reader/encoding.rb +3 -2
- data/lib/pdf/reader/error.rb +11 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +10 -8
- data/lib/pdf/reader/filter/flate.rb +27 -14
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +1 -0
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +16 -9
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +8 -3
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +60 -9
- data/lib/pdf/reader/page_layout.rb +37 -23
- data/lib/pdf/reader/page_state.rb +18 -23
- data/lib/pdf/reader/page_text_receiver.rb +28 -5
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +12 -7
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rectangle.rb +95 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +5 -0
- data/lib/pdf/reader/standard_security_handler.rb +1 -0
- data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +1 -0
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +8 -15
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +7 -1
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +14 -4
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1744 -0
- metadata +17 -13
- data/lib/pdf/reader/orientation_detector.rb +0 -34
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'forwardable'
|
@@ -41,16 +42,14 @@ module PDF
|
|
41
42
|
# starting a new page
|
42
43
|
def page=(page)
|
43
44
|
@state = PageState.new(page)
|
45
|
+
@page = page
|
44
46
|
@content = []
|
45
47
|
@characters = []
|
46
|
-
@mediabox = page.objects.deref(page.attributes[:MediaBox])
|
47
|
-
device_bl = @state.ctm_transform(@mediabox[0], @mediabox[1])
|
48
|
-
device_tr = @state.ctm_transform(@mediabox[2], @mediabox[3])
|
49
|
-
@device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
|
50
48
|
end
|
51
49
|
|
52
50
|
def content
|
53
|
-
|
51
|
+
mediabox = @page.rectangles[:MediaBox].to_a
|
52
|
+
PageLayout.new(@characters, mediabox).to_s
|
54
53
|
end
|
55
54
|
|
56
55
|
#####################################################
|
@@ -104,6 +103,8 @@ module PDF
|
|
104
103
|
glyphs.each_with_index do |glyph_code, index|
|
105
104
|
# paint the current glyph
|
106
105
|
newx, newy = @state.trm_transform(0,0)
|
106
|
+
newx, newy = apply_rotation(newx, newy)
|
107
|
+
|
107
108
|
utf8_chars = @state.current_font.to_utf8(glyph_code)
|
108
109
|
|
109
110
|
# apply to glyph displacement for the current glyph so the next
|
@@ -118,6 +119,28 @@ module PDF
|
|
118
119
|
end
|
119
120
|
end
|
120
121
|
|
122
|
+
# TODO: revisit this. It rotates the co-ordinates to the right direction, but I don't
|
123
|
+
# think it sets the correct x,y values. We get away with it because we don't
|
124
|
+
# return the text with co-ordinates, only the full text arranged in a string.
|
125
|
+
#
|
126
|
+
# We should provide an API for extracting the text with positioning data and spec
|
127
|
+
# that. I suspect the co-ords might be wrong for rotated pages
|
128
|
+
def apply_rotation(x, y)
|
129
|
+
if @page.rotate == 90
|
130
|
+
tmp = x
|
131
|
+
x = y
|
132
|
+
y = tmp * -1
|
133
|
+
elsif @page.rotate == 180
|
134
|
+
y *= -1
|
135
|
+
x *= -1
|
136
|
+
elsif @page.rotate == 270
|
137
|
+
tmp = y
|
138
|
+
y = x
|
139
|
+
x = tmp * -1
|
140
|
+
end
|
141
|
+
return x, y
|
142
|
+
end
|
143
|
+
|
121
144
|
end
|
122
145
|
end
|
123
146
|
end
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -166,7 +167,9 @@ class PDF::Reader
|
|
166
167
|
|
167
168
|
# add a missing digit if required, as required by the spec
|
168
169
|
str << "0" unless str.size % 2 == 0
|
169
|
-
str.
|
170
|
+
str.chars.each_slice(2).map { |nibbles|
|
171
|
+
nibbles.join("").hex.chr
|
172
|
+
}.join.force_encoding("binary")
|
170
173
|
end
|
171
174
|
################################################################################
|
172
175
|
# Reads a PDF String from the buffer and converts it to a Ruby String
|
@@ -175,15 +178,18 @@ class PDF::Reader
|
|
175
178
|
return "".dup.force_encoding("binary") if str == ")"
|
176
179
|
Error.assert_equal(parse_token, ")")
|
177
180
|
|
178
|
-
str.gsub!(/\\([nrtbf()\\\n]
|
179
|
-
|
181
|
+
str.gsub!(/\\(\r\n|[nrtbf()\\\n\r]|([0-7]{1,3}))?|\r\n?/m) do |match|
|
182
|
+
if $2.nil? # not octal digits
|
183
|
+
MAPPING[match] || "".dup
|
184
|
+
else # must be octal digits
|
185
|
+
($2.oct & 0xff).chr # ignore high level overflow
|
186
|
+
end
|
180
187
|
end
|
181
188
|
str.force_encoding("binary")
|
182
189
|
end
|
183
190
|
|
184
191
|
MAPPING = {
|
185
192
|
"\r" => "\n",
|
186
|
-
"\n\r" => "\n",
|
187
193
|
"\r\n" => "\n",
|
188
194
|
"\\n" => "\n",
|
189
195
|
"\\r" => "\r",
|
@@ -194,10 +200,9 @@ class PDF::Reader
|
|
194
200
|
"\\)" => ")",
|
195
201
|
"\\\\" => "\\",
|
196
202
|
"\\\n" => "",
|
203
|
+
"\\\r" => "",
|
204
|
+
"\\\r\n" => "",
|
197
205
|
}
|
198
|
-
0.upto(9) { |n| MAPPING["\\00"+n.to_s] = ("00"+n.to_s).oct.chr }
|
199
|
-
0.upto(99) { |n| MAPPING["\\0"+n.to_s] = ("0"+n.to_s).oct.chr }
|
200
|
-
0.upto(377) { |n| MAPPING["\\"+n.to_s] = n.to_s.oct.chr }
|
201
206
|
|
202
207
|
################################################################################
|
203
208
|
# Decodes the contents of a PDF Stream and returns it as a Ruby String.
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: true
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
module PDF
|
6
|
+
class Reader
|
7
|
+
|
8
|
+
# PDFs are all about positioning content on a page, so there's lots of need to
|
9
|
+
# work with a set of X,Y coordinates.
|
10
|
+
#
|
11
|
+
class Point
|
12
|
+
|
13
|
+
attr_reader :x, :y
|
14
|
+
|
15
|
+
def initialize(x, y)
|
16
|
+
@x, @y = x, y
|
17
|
+
end
|
18
|
+
|
19
|
+
def ==(other)
|
20
|
+
other.respond_to?(:x) && other.respond_to?(:y) && x == other.x && y == other.y
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: true
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
module PDF
|
6
|
+
class Reader
|
7
|
+
|
8
|
+
# PDFs represent rectangles all over the place. They're 4 element arrays, like this:
|
9
|
+
#
|
10
|
+
# [A, B, C, D]
|
11
|
+
#
|
12
|
+
# Four element arrays are yucky to work with though, so here's a class that's better.
|
13
|
+
# Initialize it with the 4 elements, and get utility functions (width, height, etc)
|
14
|
+
# for free.
|
15
|
+
#
|
16
|
+
# By convention the first two elements are x1, y1, the co-ords for the bottom left corner
|
17
|
+
# of the rectangle. The third and fourth elements are x2, y2, the co-ords for the top right
|
18
|
+
# corner of the rectangle. It's valid for the alternative corners to be used though, so
|
19
|
+
# we don't assume which is which.
|
20
|
+
#
|
21
|
+
class Rectangle
|
22
|
+
|
23
|
+
attr_reader :bottom_left, :bottom_right, :top_left, :top_right
|
24
|
+
|
25
|
+
def initialize(x1, y1, x2, y2)
|
26
|
+
set_corners(x1, y1, x2, y2)
|
27
|
+
end
|
28
|
+
|
29
|
+
def ==(other)
|
30
|
+
to_a == other.to_a
|
31
|
+
end
|
32
|
+
|
33
|
+
def height
|
34
|
+
top_right.y - bottom_right.y
|
35
|
+
end
|
36
|
+
|
37
|
+
def width
|
38
|
+
bottom_right.x - bottom_left.x
|
39
|
+
end
|
40
|
+
|
41
|
+
# A pdf-style 4-number array
|
42
|
+
def to_a
|
43
|
+
[
|
44
|
+
bottom_left.x,
|
45
|
+
bottom_left.y,
|
46
|
+
top_right.x,
|
47
|
+
top_right.y,
|
48
|
+
]
|
49
|
+
end
|
50
|
+
|
51
|
+
def apply_rotation(degrees)
|
52
|
+
return if degrees != 90 && degrees != 180 && degrees != 270
|
53
|
+
|
54
|
+
if degrees == 90
|
55
|
+
new_x1 = bottom_left.x
|
56
|
+
new_y1 = bottom_left.y - width
|
57
|
+
new_x2 = bottom_left.x + height
|
58
|
+
new_y2 = bottom_left.y
|
59
|
+
elsif degrees == 180
|
60
|
+
new_x1 = bottom_left.x - width
|
61
|
+
new_y1 = bottom_left.y - height
|
62
|
+
new_x2 = bottom_left.x
|
63
|
+
new_y2 = bottom_left.y
|
64
|
+
elsif degrees == 270
|
65
|
+
new_x1 = bottom_left.x - height
|
66
|
+
new_y1 = bottom_left.y
|
67
|
+
new_x2 = bottom_left.x
|
68
|
+
new_y2 = bottom_left.y + width
|
69
|
+
end
|
70
|
+
set_corners(new_x1, new_y1, new_x2, new_y2)
|
71
|
+
end
|
72
|
+
|
73
|
+
private
|
74
|
+
|
75
|
+
def set_corners(x1, y1, x2, y2)
|
76
|
+
@bottom_left = PDF::Reader::Point.new(
|
77
|
+
[x1, x2].min,
|
78
|
+
[y1, y2].min,
|
79
|
+
)
|
80
|
+
@bottom_right = PDF::Reader::Point.new(
|
81
|
+
[x1, x2].max,
|
82
|
+
[y1, y2].min,
|
83
|
+
)
|
84
|
+
@top_left = PDF::Reader::Point.new(
|
85
|
+
[x1, x2].min,
|
86
|
+
[y1, y2].max,
|
87
|
+
)
|
88
|
+
@top_right = PDF::Reader::Point.new(
|
89
|
+
[x1, x2].max,
|
90
|
+
[y1, y2].max,
|
91
|
+
)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
data/lib/pdf/reader/reference.rb
CHANGED
@@ -1,12 +1,17 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: false
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
5
|
+
# Setting this file to "typed: true" is difficult because it's a mixin that assumes some things
|
6
|
+
# are available from the class, like @objects and resources. Sorbet doesn't know about them.
|
7
|
+
|
4
8
|
module PDF
|
5
9
|
class Reader
|
6
10
|
|
7
11
|
# mixin for common methods in Page and FormXobjects
|
8
12
|
#
|
9
13
|
module ResourceMethods
|
14
|
+
|
10
15
|
# Returns a Hash of color spaces that are available to this page
|
11
16
|
#
|
12
17
|
# NOTE: this method de-serialises objects from the underlying PDF
|
data/lib/pdf/reader/stream.rb
CHANGED
data/lib/pdf/reader/text_run.rb
CHANGED
data/lib/pdf/reader/token.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'afm'
|
@@ -37,23 +38,15 @@ class PDF::Reader
|
|
37
38
|
def glyph_width(code_point)
|
38
39
|
return 0 if code_point.nil? || code_point < 0
|
39
40
|
|
40
|
-
|
41
|
-
|
42
|
-
|
41
|
+
names = @font.encoding.int_to_name(code_point)
|
42
|
+
metrics = names.map { |name|
|
43
|
+
@metrics.char_metrics[name.to_s]
|
44
|
+
}.compact.first
|
43
45
|
|
44
|
-
|
45
|
-
|
46
|
-
}.compact.first
|
47
|
-
end
|
48
|
-
|
49
|
-
if m
|
50
|
-
m[:wx]
|
51
|
-
elsif @font.widths[code_point - 1]
|
52
|
-
@font.widths[code_point - 1]
|
53
|
-
elsif control_character?(code_point)
|
54
|
-
0
|
46
|
+
if metrics
|
47
|
+
metrics[:wx]
|
55
48
|
else
|
56
|
-
0
|
49
|
+
@font.widths[code_point - 1] || 0
|
57
50
|
end
|
58
51
|
end
|
59
52
|
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -131,6 +132,9 @@ class PDF::Reader
|
|
131
132
|
generation = buf.token.to_i
|
132
133
|
state = buf.token
|
133
134
|
|
135
|
+
# Some PDF writers start numbering at 1 instead of 0. Fix up the number.
|
136
|
+
# TODO should this fix be logged?
|
137
|
+
objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
|
134
138
|
store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
|
135
139
|
objid += 1
|
136
140
|
params.clear
|
@@ -146,7 +150,9 @@ class PDF::Reader
|
|
146
150
|
end
|
147
151
|
|
148
152
|
load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
|
149
|
-
|
153
|
+
# Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
|
154
|
+
# It's not possible for an xref to appear at offset 0, so can safely skip the ref
|
155
|
+
load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
|
150
156
|
|
151
157
|
trailer
|
152
158
|
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
# There's no point rendering zero-width characters
|
7
|
+
class ZeroWidthRunsFilter
|
8
|
+
|
9
|
+
def self.exclude_zero_width_runs(runs)
|
10
|
+
runs.reject { |run| run.width == 0 }
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
data/lib/pdf/reader.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -136,7 +137,7 @@ module PDF
|
|
136
137
|
def page_count
|
137
138
|
pages = @objects.deref(root[:Pages])
|
138
139
|
unless pages.kind_of?(::Hash)
|
139
|
-
raise MalformedPDFError,
|
140
|
+
raise MalformedPDFError, "Pages structure is missing #{pages.class}"
|
140
141
|
end
|
141
142
|
@page_count ||= @objects.deref(pages[:Count])
|
142
143
|
end
|
@@ -221,7 +222,7 @@ module PDF
|
|
221
222
|
when Array then
|
222
223
|
obj.map { |item| doc_strings_to_utf8(item) }
|
223
224
|
when String then
|
224
|
-
if obj
|
225
|
+
if has_utf16_bom?(obj)
|
225
226
|
utf16_to_utf8(obj)
|
226
227
|
else
|
227
228
|
pdfdoc_to_utf8(obj)
|
@@ -231,6 +232,14 @@ module PDF
|
|
231
232
|
end
|
232
233
|
end
|
233
234
|
|
235
|
+
def has_utf16_bom?(str)
|
236
|
+
first_bytes = str[0,2]
|
237
|
+
|
238
|
+
return false if first_bytes.nil?
|
239
|
+
|
240
|
+
first_bytes.unpack("C*") == [254, 255]
|
241
|
+
end
|
242
|
+
|
234
243
|
# TODO find a PDF I can use to spec this behaviour
|
235
244
|
#
|
236
245
|
def pdfdoc_to_utf8(obj)
|
@@ -242,7 +251,7 @@ module PDF
|
|
242
251
|
# String#encode
|
243
252
|
#
|
244
253
|
def utf16_to_utf8(obj)
|
245
|
-
str = obj[2, obj.size]
|
254
|
+
str = obj[2, obj.size].to_s
|
246
255
|
str = str.unpack("n*").pack("U*")
|
247
256
|
str.force_encoding("utf-8")
|
248
257
|
str
|
@@ -286,7 +295,9 @@ require 'pdf/reader/object_hash'
|
|
286
295
|
require 'pdf/reader/object_stream'
|
287
296
|
require 'pdf/reader/pages_strategy'
|
288
297
|
require 'pdf/reader/parser'
|
298
|
+
require 'pdf/reader/point'
|
289
299
|
require 'pdf/reader/print_receiver'
|
300
|
+
require 'pdf/reader/rectangle'
|
290
301
|
require 'pdf/reader/reference'
|
291
302
|
require 'pdf/reader/register_receiver'
|
292
303
|
require 'pdf/reader/null_security_handler'
|
@@ -299,5 +310,4 @@ require 'pdf/reader/page_state'
|
|
299
310
|
require 'pdf/reader/page_text_receiver'
|
300
311
|
require 'pdf/reader/token'
|
301
312
|
require 'pdf/reader/xref'
|
302
|
-
require 'pdf/reader/orientation_detector'
|
303
313
|
require 'pdf/reader/page'
|