pdf-reader 2.6.0 → 2.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +13 -1
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/buffer.rb +1 -0
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +5 -3
- data/lib/pdf/reader/encoding.rb +2 -1
- data/lib/pdf/reader/error.rb +8 -0
- data/lib/pdf/reader/filter/ascii85.rb +2 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +7 -5
- data/lib/pdf/reader/filter/flate.rb +2 -0
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +1 -0
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +1 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +5 -2
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +60 -9
- data/lib/pdf/reader/page_layout.rb +24 -14
- data/lib/pdf/reader/page_state.rb +11 -10
- data/lib/pdf/reader/page_text_receiver.rb +13 -8
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +4 -1
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rectangle.rb +95 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +5 -0
- data/lib/pdf/reader/standard_security_handler.rb +1 -0
- data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +1 -0
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +1 -0
- data/lib/pdf/reader/zero_width_runs_filter.rb +2 -0
- data/lib/pdf/reader.rb +14 -4
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1744 -0
- metadata +12 -10
- data/lib/pdf/reader/orientation_detector.rb +0 -34
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'pdf/reader/transformation_matrix'
|
@@ -312,7 +313,7 @@ class PDF::Reader
|
|
312
313
|
# may need to be added
|
313
314
|
#
|
314
315
|
def process_glyph_displacement(w0, tj, word_boundary)
|
315
|
-
fs =
|
316
|
+
fs = state[:text_font_size]
|
316
317
|
tc = state[:char_spacing]
|
317
318
|
if word_boundary
|
318
319
|
tw = state[:word_spacing]
|
@@ -330,16 +331,16 @@ class PDF::Reader
|
|
330
331
|
# apply horizontal scaling to spacing values but not font size
|
331
332
|
tx = ((w0 * fs) + tc + tw) * th
|
332
333
|
end
|
333
|
-
|
334
|
-
# TODO: I'm pretty sure that tx shouldn't need to be divided by
|
335
|
-
# ctm[0] here, but this gets my tests green and I'm out of
|
336
|
-
# ideas for now
|
337
334
|
# TODO: support ty > 0
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
335
|
+
ty = 0
|
336
|
+
temp = TransformationMatrix.new(1, 0,
|
337
|
+
0, 1,
|
338
|
+
tx, ty)
|
339
|
+
@text_matrix = temp.multiply!(
|
340
|
+
@text_matrix.a, @text_matrix.b,
|
341
|
+
@text_matrix.c, @text_matrix.d,
|
342
|
+
@text_matrix.e, @text_matrix.f
|
343
|
+
)
|
343
344
|
@font_size = @text_rendering_matrix = nil # invalidate cached value
|
344
345
|
end
|
345
346
|
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'forwardable'
|
@@ -44,14 +45,11 @@ module PDF
|
|
44
45
|
@page = page
|
45
46
|
@content = []
|
46
47
|
@characters = []
|
47
|
-
@mediabox = page.objects.deref(page.attributes[:MediaBox])
|
48
|
-
device_bl = apply_rotation(*@state.ctm_transform(@mediabox[0], @mediabox[1]))
|
49
|
-
device_tr = apply_rotation(*@state.ctm_transform(@mediabox[2], @mediabox[3]))
|
50
|
-
@device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
|
51
48
|
end
|
52
49
|
|
53
50
|
def content
|
54
|
-
|
51
|
+
mediabox = @page.rectangles[:MediaBox].to_a
|
52
|
+
PageLayout.new(@characters, mediabox).to_s
|
55
53
|
end
|
56
54
|
|
57
55
|
#####################################################
|
@@ -121,6 +119,12 @@ module PDF
|
|
121
119
|
end
|
122
120
|
end
|
123
121
|
|
122
|
+
# TODO: revisit this. It rotates the co-ordinates to the right direction, but I don't
|
123
|
+
# think it sets the correct x,y values. We get away with it because we don't
|
124
|
+
# return the text with co-ordinates, only the full text arranged in a string.
|
125
|
+
#
|
126
|
+
# We should provide an API for extracting the text with positioning data and spec
|
127
|
+
# that. I suspect the co-ords might be wrong for rotated pages
|
124
128
|
def apply_rotation(x, y)
|
125
129
|
if @page.rotate == 90
|
126
130
|
tmp = x
|
@@ -128,10 +132,11 @@ module PDF
|
|
128
132
|
y = tmp * -1
|
129
133
|
elsif @page.rotate == 180
|
130
134
|
y *= -1
|
135
|
+
x *= -1
|
131
136
|
elsif @page.rotate == 270
|
132
|
-
tmp =
|
133
|
-
|
134
|
-
|
137
|
+
tmp = y
|
138
|
+
y = x
|
139
|
+
x = tmp * -1
|
135
140
|
end
|
136
141
|
return x, y
|
137
142
|
end
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -166,7 +167,9 @@ class PDF::Reader
|
|
166
167
|
|
167
168
|
# add a missing digit if required, as required by the spec
|
168
169
|
str << "0" unless str.size % 2 == 0
|
169
|
-
str.
|
170
|
+
str.chars.each_slice(2).map { |nibbles|
|
171
|
+
nibbles.join("").hex.chr
|
172
|
+
}.join.force_encoding("binary")
|
170
173
|
end
|
171
174
|
################################################################################
|
172
175
|
# Reads a PDF String from the buffer and converts it to a Ruby String
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: true
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
module PDF
|
6
|
+
class Reader
|
7
|
+
|
8
|
+
# PDFs are all about positioning content on a page, so there's lots of need to
|
9
|
+
# work with a set of X,Y coordinates.
|
10
|
+
#
|
11
|
+
class Point
|
12
|
+
|
13
|
+
attr_reader :x, :y
|
14
|
+
|
15
|
+
def initialize(x, y)
|
16
|
+
@x, @y = x, y
|
17
|
+
end
|
18
|
+
|
19
|
+
def ==(other)
|
20
|
+
other.respond_to?(:x) && other.respond_to?(:y) && x == other.x && y == other.y
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: true
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
module PDF
|
6
|
+
class Reader
|
7
|
+
|
8
|
+
# PDFs represent rectangles all over the place. They're 4 element arrays, like this:
|
9
|
+
#
|
10
|
+
# [A, B, C, D]
|
11
|
+
#
|
12
|
+
# Four element arrays are yucky to work with though, so here's a class that's better.
|
13
|
+
# Initialize it with the 4 elements, and get utility functions (width, height, etc)
|
14
|
+
# for free.
|
15
|
+
#
|
16
|
+
# By convention the first two elements are x1, y1, the co-ords for the bottom left corner
|
17
|
+
# of the rectangle. The third and fourth elements are x2, y2, the co-ords for the top right
|
18
|
+
# corner of the rectangle. It's valid for the alternative corners to be used though, so
|
19
|
+
# we don't assume which is which.
|
20
|
+
#
|
21
|
+
class Rectangle
|
22
|
+
|
23
|
+
attr_reader :bottom_left, :bottom_right, :top_left, :top_right
|
24
|
+
|
25
|
+
def initialize(x1, y1, x2, y2)
|
26
|
+
set_corners(x1, y1, x2, y2)
|
27
|
+
end
|
28
|
+
|
29
|
+
def ==(other)
|
30
|
+
to_a == other.to_a
|
31
|
+
end
|
32
|
+
|
33
|
+
def height
|
34
|
+
top_right.y - bottom_right.y
|
35
|
+
end
|
36
|
+
|
37
|
+
def width
|
38
|
+
bottom_right.x - bottom_left.x
|
39
|
+
end
|
40
|
+
|
41
|
+
# A pdf-style 4-number array
|
42
|
+
def to_a
|
43
|
+
[
|
44
|
+
bottom_left.x,
|
45
|
+
bottom_left.y,
|
46
|
+
top_right.x,
|
47
|
+
top_right.y,
|
48
|
+
]
|
49
|
+
end
|
50
|
+
|
51
|
+
def apply_rotation(degrees)
|
52
|
+
return if degrees != 90 && degrees != 180 && degrees != 270
|
53
|
+
|
54
|
+
if degrees == 90
|
55
|
+
new_x1 = bottom_left.x
|
56
|
+
new_y1 = bottom_left.y - width
|
57
|
+
new_x2 = bottom_left.x + height
|
58
|
+
new_y2 = bottom_left.y
|
59
|
+
elsif degrees == 180
|
60
|
+
new_x1 = bottom_left.x - width
|
61
|
+
new_y1 = bottom_left.y - height
|
62
|
+
new_x2 = bottom_left.x
|
63
|
+
new_y2 = bottom_left.y
|
64
|
+
elsif degrees == 270
|
65
|
+
new_x1 = bottom_left.x - height
|
66
|
+
new_y1 = bottom_left.y
|
67
|
+
new_x2 = bottom_left.x
|
68
|
+
new_y2 = bottom_left.y + width
|
69
|
+
end
|
70
|
+
set_corners(new_x1, new_y1, new_x2, new_y2)
|
71
|
+
end
|
72
|
+
|
73
|
+
private
|
74
|
+
|
75
|
+
def set_corners(x1, y1, x2, y2)
|
76
|
+
@bottom_left = PDF::Reader::Point.new(
|
77
|
+
[x1, x2].min,
|
78
|
+
[y1, y2].min,
|
79
|
+
)
|
80
|
+
@bottom_right = PDF::Reader::Point.new(
|
81
|
+
[x1, x2].max,
|
82
|
+
[y1, y2].min,
|
83
|
+
)
|
84
|
+
@top_left = PDF::Reader::Point.new(
|
85
|
+
[x1, x2].min,
|
86
|
+
[y1, y2].max,
|
87
|
+
)
|
88
|
+
@top_right = PDF::Reader::Point.new(
|
89
|
+
[x1, x2].max,
|
90
|
+
[y1, y2].max,
|
91
|
+
)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
data/lib/pdf/reader/reference.rb
CHANGED
@@ -1,12 +1,17 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: false
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
5
|
+
# Setting this file to "typed: true" is difficult because it's a mixin that assumes some things
|
6
|
+
# are available from the class, like @objects and resources. Sorbet doesn't know about them.
|
7
|
+
|
4
8
|
module PDF
|
5
9
|
class Reader
|
6
10
|
|
7
11
|
# mixin for common methods in Page and FormXobjects
|
8
12
|
#
|
9
13
|
module ResourceMethods
|
14
|
+
|
10
15
|
# Returns a Hash of color spaces that are available to this page
|
11
16
|
#
|
12
17
|
# NOTE: this method de-serialises objects from the underlying PDF
|
data/lib/pdf/reader/stream.rb
CHANGED
data/lib/pdf/reader/text_run.rb
CHANGED
data/lib/pdf/reader/token.rb
CHANGED
data/lib/pdf/reader/xref.rb
CHANGED
data/lib/pdf/reader.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -136,7 +137,7 @@ module PDF
|
|
136
137
|
def page_count
|
137
138
|
pages = @objects.deref(root[:Pages])
|
138
139
|
unless pages.kind_of?(::Hash)
|
139
|
-
raise MalformedPDFError,
|
140
|
+
raise MalformedPDFError, "Pages structure is missing #{pages.class}"
|
140
141
|
end
|
141
142
|
@page_count ||= @objects.deref(pages[:Count])
|
142
143
|
end
|
@@ -221,7 +222,7 @@ module PDF
|
|
221
222
|
when Array then
|
222
223
|
obj.map { |item| doc_strings_to_utf8(item) }
|
223
224
|
when String then
|
224
|
-
if obj
|
225
|
+
if has_utf16_bom?(obj)
|
225
226
|
utf16_to_utf8(obj)
|
226
227
|
else
|
227
228
|
pdfdoc_to_utf8(obj)
|
@@ -231,6 +232,14 @@ module PDF
|
|
231
232
|
end
|
232
233
|
end
|
233
234
|
|
235
|
+
def has_utf16_bom?(str)
|
236
|
+
first_bytes = str[0,2]
|
237
|
+
|
238
|
+
return false if first_bytes.nil?
|
239
|
+
|
240
|
+
first_bytes.unpack("C*") == [254, 255]
|
241
|
+
end
|
242
|
+
|
234
243
|
# TODO find a PDF I can use to spec this behaviour
|
235
244
|
#
|
236
245
|
def pdfdoc_to_utf8(obj)
|
@@ -242,7 +251,7 @@ module PDF
|
|
242
251
|
# String#encode
|
243
252
|
#
|
244
253
|
def utf16_to_utf8(obj)
|
245
|
-
str = obj[2, obj.size]
|
254
|
+
str = obj[2, obj.size].to_s
|
246
255
|
str = str.unpack("n*").pack("U*")
|
247
256
|
str.force_encoding("utf-8")
|
248
257
|
str
|
@@ -286,7 +295,9 @@ require 'pdf/reader/object_hash'
|
|
286
295
|
require 'pdf/reader/object_stream'
|
287
296
|
require 'pdf/reader/pages_strategy'
|
288
297
|
require 'pdf/reader/parser'
|
298
|
+
require 'pdf/reader/point'
|
289
299
|
require 'pdf/reader/print_receiver'
|
300
|
+
require 'pdf/reader/rectangle'
|
290
301
|
require 'pdf/reader/reference'
|
291
302
|
require 'pdf/reader/register_receiver'
|
292
303
|
require 'pdf/reader/null_security_handler'
|
@@ -299,5 +310,4 @@ require 'pdf/reader/page_state'
|
|
299
310
|
require 'pdf/reader/page_text_receiver'
|
300
311
|
require 'pdf/reader/token'
|
301
312
|
require 'pdf/reader/xref'
|
302
|
-
require 'pdf/reader/orientation_detector'
|
303
313
|
require 'pdf/reader/page'
|