pdf-reader 2.1.0 → 2.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +28 -1
- data/README.md +2 -2
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_text +1 -1
- data/lib/pdf-reader.rb +1 -0
- data/lib/pdf/reader.rb +2 -2
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/buffer.rb +12 -11
- data/lib/pdf/reader/cid_widths.rb +2 -0
- data/lib/pdf/reader/cmap.rb +22 -12
- data/lib/pdf/reader/encoding.rb +12 -9
- data/lib/pdf/reader/error.rb +1 -0
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +1 -0
- data/lib/pdf/reader/filter/flate.rb +6 -4
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +3 -1
- data/lib/pdf/reader/font.rb +11 -2
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +1 -0
- data/lib/pdf/reader/lzw.rb +2 -1
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +22 -10
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +5 -4
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +29 -0
- data/lib/pdf/reader/page_layout.rb +10 -5
- data/lib/pdf/reader/page_state.rb +10 -1
- data/lib/pdf/reader/page_text_receiver.rb +5 -1
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +5 -4
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +1 -0
- data/lib/pdf/reader/standard_security_handler_v5.rb +2 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +25 -0
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +18 -1
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/xref.rb +11 -5
- metadata +17 -13
- data/lib/pdf/hash.rb +0 -19
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -95,25 +96,34 @@ class PDF::Reader
|
|
95
96
|
Parser.new(buffer)
|
96
97
|
end
|
97
98
|
|
99
|
+
# The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
|
100
|
+
# theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
|
101
|
+
#
|
102
|
+
# str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
|
103
|
+
#
|
104
|
+
# However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
|
105
|
+
# exception when we try converting broken UTF-16 to UTF-8
|
106
|
+
#
|
98
107
|
def str_to_int(str)
|
99
108
|
return nil if str.nil? || str.size == 0
|
100
|
-
unpacked_string = if str.
|
109
|
+
unpacked_string = if str.bytesize == 1 # UTF-8
|
101
110
|
str.unpack("C*")
|
102
111
|
else # UTF-16
|
103
112
|
str.unpack("n*")
|
104
113
|
end
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
114
|
+
result = []
|
115
|
+
while unpacked_string.any? do
|
116
|
+
if unpacked_string.size >= 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
|
117
|
+
# this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
|
118
|
+
# lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
|
119
|
+
# low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
|
120
|
+
points = [unpacked_string.shift, unpacked_string.shift]
|
121
|
+
result << (points[0] - 0xD800) * 0x400 + (points[1] - 0xDC00) + 0x10000
|
122
|
+
else
|
123
|
+
result << unpacked_string.shift
|
124
|
+
end
|
116
125
|
end
|
126
|
+
result
|
117
127
|
end
|
118
128
|
|
119
129
|
def process_bfchar_instructions(instructions)
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -39,20 +40,22 @@ class PDF::Reader
|
|
39
40
|
@mapping = default_mapping # maps from character codes to Unicode codepoints
|
40
41
|
@string_cache = {} # maps from character codes to UTF-8 strings.
|
41
42
|
|
42
|
-
if enc.kind_of?(Hash)
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
enc = enc.to_sym
|
43
|
+
@enc_name = if enc.kind_of?(Hash)
|
44
|
+
enc[:Encoding] || enc[:BaseEncoding]
|
45
|
+
elsif enc && enc.respond_to?(:to_sym)
|
46
|
+
enc.to_sym
|
47
47
|
else
|
48
|
-
|
48
|
+
:StandardEncoding
|
49
49
|
end
|
50
50
|
|
51
|
-
@
|
52
|
-
@
|
53
|
-
@map_file = get_mapping_file(enc)
|
51
|
+
@unpack = get_unpack(@enc_name)
|
52
|
+
@map_file = get_mapping_file(@enc_name)
|
54
53
|
|
55
54
|
load_mapping(@map_file) if @map_file
|
55
|
+
|
56
|
+
if enc.is_a?(Hash) && enc[:Differences]
|
57
|
+
self.differences = enc[:Differences]
|
58
|
+
end
|
56
59
|
end
|
57
60
|
|
58
61
|
# set the differences table for this encoding. should be an array in the following format:
|
data/lib/pdf/reader/error.rb
CHANGED
data/lib/pdf/reader/filter.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
|
4
5
|
require 'zlib'
|
@@ -7,6 +8,8 @@ class PDF::Reader
|
|
7
8
|
module Filter # :nodoc:
|
8
9
|
# implementation of the Flate (zlib) stream filter
|
9
10
|
class Flate
|
11
|
+
ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
|
12
|
+
|
10
13
|
def initialize(options = {})
|
11
14
|
@options = options
|
12
15
|
end
|
@@ -16,16 +19,15 @@ class PDF::Reader
|
|
16
19
|
def filter(data)
|
17
20
|
deflated = nil
|
18
21
|
begin
|
19
|
-
deflated = Zlib::Inflate.new.inflate(data)
|
22
|
+
deflated = Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
|
20
23
|
rescue Zlib::DataError => e
|
21
24
|
# by default, Ruby's Zlib assumes the data it's inflating
|
22
|
-
# is RFC1951 deflated data, wrapped in a
|
23
|
-
#
|
25
|
+
# is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
|
26
|
+
# fails, then use a lightly-documented 'feature' to attempt to inflate
|
24
27
|
# the data as a raw RFC1951 stream.
|
25
28
|
#
|
26
29
|
# See
|
27
30
|
# - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
|
28
|
-
# - http://www.gzip.org/zlib/zlib_faq.html#faq38
|
29
31
|
deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
|
30
32
|
end
|
31
33
|
Depredict.new(@options).filter(deflated)
|
@@ -1,4 +1,6 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
#
|
3
5
|
class PDF::Reader # :nodoc:
|
4
6
|
module Filter # :nodoc:
|
@@ -12,7 +14,7 @@ class PDF::Reader # :nodoc:
|
|
12
14
|
# Decode the specified data with the RunLengthDecode compression algorithm
|
13
15
|
def filter(data)
|
14
16
|
pos = 0
|
15
|
-
out = ""
|
17
|
+
out = "".dup
|
16
18
|
|
17
19
|
while pos < data.length
|
18
20
|
length = data.getbyte(pos)
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -96,7 +97,13 @@ class PDF::Reader
|
|
96
97
|
elsif @subtype == :Type3
|
97
98
|
PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
|
98
99
|
elsif @subtype == :TrueType
|
99
|
-
|
100
|
+
if @font_descriptor
|
101
|
+
PDF::Reader::WidthCalculator::TrueType.new(self)
|
102
|
+
else
|
103
|
+
# A TrueType font that isn't embedded. Most readers look for a version on the
|
104
|
+
# local system and fallback to a substitute. For now, we go straight to a substitute
|
105
|
+
PDF::Reader::WidthCalculator::BuiltIn.new(self)
|
106
|
+
end
|
100
107
|
elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
|
101
108
|
PDF::Reader::WidthCalculator::Composite.new(self)
|
102
109
|
else
|
@@ -124,7 +131,9 @@ class PDF::Reader
|
|
124
131
|
if obj[:ToUnicode]
|
125
132
|
# ToUnicode is optional for Type1 and Type3
|
126
133
|
stream = @ohash.object(obj[:ToUnicode])
|
127
|
-
|
134
|
+
if stream.is_a?(PDF::Reader::Stream)
|
135
|
+
@tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
136
|
+
end
|
128
137
|
end
|
129
138
|
end
|
130
139
|
|
data/lib/pdf/reader/lzw.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
module PDF
|
4
5
|
|
@@ -82,7 +83,7 @@ module PDF
|
|
82
83
|
#
|
83
84
|
def self.decode(data)
|
84
85
|
stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
|
85
|
-
result =
|
86
|
+
result = "".dup
|
86
87
|
until (code = stream.read) == CODE_EOD
|
87
88
|
if code == CODE_CLEAR_TABLE
|
88
89
|
stream.set_bits_in_chunk(9)
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
class PDF::Reader
|
4
5
|
# Provides low level access to the objects in a PDF file via a hash-like
|
@@ -77,16 +78,7 @@ class PDF::Reader
|
|
77
78
|
key = PDF::Reader::Reference.new(key.to_i, 0)
|
78
79
|
end
|
79
80
|
|
80
|
-
|
81
|
-
@cache[key]
|
82
|
-
elsif xref[key].is_a?(Integer)
|
83
|
-
buf = new_buffer(xref[key])
|
84
|
-
@cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
|
85
|
-
elsif xref[key].is_a?(PDF::Reader::Reference)
|
86
|
-
container_key = xref[key]
|
87
|
-
object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
|
88
|
-
@cache[key] = object_streams[container_key][key.id]
|
89
|
-
end
|
81
|
+
@cache[key] ||= fetch_object(key) || fetch_object_stream(key)
|
90
82
|
rescue InvalidObjectError
|
91
83
|
return default
|
92
84
|
end
|
@@ -253,6 +245,26 @@ class PDF::Reader
|
|
253
245
|
|
254
246
|
private
|
255
247
|
|
248
|
+
# parse a traditional object from the PDF, starting from the byte offset indicated
|
249
|
+
# in the xref table
|
250
|
+
#
|
251
|
+
def fetch_object(key)
|
252
|
+
if xref[key].is_a?(Integer)
|
253
|
+
buf = new_buffer(xref[key])
|
254
|
+
decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
|
255
|
+
end
|
256
|
+
end
|
257
|
+
|
258
|
+
# parse a object that's embedded in an object stream in the PDF
|
259
|
+
#
|
260
|
+
def fetch_object_stream(key)
|
261
|
+
if xref[key].is_a?(PDF::Reader::Reference)
|
262
|
+
container_key = xref[key]
|
263
|
+
object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
|
264
|
+
object_streams[container_key][key.id]
|
265
|
+
end
|
266
|
+
end
|
267
|
+
|
256
268
|
# Private implementation of deref!, which exists to ensure the `seen` argument
|
257
269
|
# isn't publicly available. It's used to avoid endless loops in the recursion, and
|
258
270
|
# doesn't need to be part of the public API.
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
class PDF::Reader
|
4
5
|
# Small util class for detecting the orientation of a single PDF page. Accounts
|
@@ -21,12 +22,12 @@ class PDF::Reader
|
|
21
22
|
def detect_orientation
|
22
23
|
llx,lly,urx,ury = @attributes[:MediaBox]
|
23
24
|
rotation = @attributes[:Rotate].to_i
|
24
|
-
width = urx.to_i - llx.to_i
|
25
|
-
height = ury.to_i - lly.to_i
|
25
|
+
width = (urx.to_i - llx.to_i).abs
|
26
|
+
height = (ury.to_i - lly.to_i).abs
|
26
27
|
if width > height
|
27
|
-
|
28
|
+
(rotation % 180).zero? ? 'landscape' : 'portrait'
|
28
29
|
else
|
29
|
-
|
30
|
+
(rotation % 180).zero? ? 'portrait' : 'landscape'
|
30
31
|
end
|
31
32
|
end
|
32
33
|
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
# remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
|
5
|
+
# uses slightly offset overlapping characters to achieve a fake 'bold' effect.
|
6
|
+
class OverlappingRunsFilter
|
7
|
+
|
8
|
+
# This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
|
9
|
+
# have identical characters) then one will be discarded
|
10
|
+
OVERLAPPING_THRESHOLD = 0.5
|
11
|
+
|
12
|
+
def self.exclude_redundant_runs(runs)
|
13
|
+
sweep_line_status = Array.new
|
14
|
+
event_point_schedule = Array.new
|
15
|
+
to_exclude = []
|
16
|
+
|
17
|
+
runs.each do |run|
|
18
|
+
event_point_schedule << EventPoint.new(run.x, run)
|
19
|
+
event_point_schedule << EventPoint.new(run.endx, run)
|
20
|
+
end
|
21
|
+
|
22
|
+
event_point_schedule.sort! { |a,b| a.x <=> b.x }
|
23
|
+
|
24
|
+
event_point_schedule.each do |event_point|
|
25
|
+
run = event_point.run
|
26
|
+
|
27
|
+
if event_point.start?
|
28
|
+
if detect_intersection(sweep_line_status, event_point)
|
29
|
+
to_exclude << run
|
30
|
+
end
|
31
|
+
sweep_line_status.push(run)
|
32
|
+
else
|
33
|
+
sweep_line_status.delete(run)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
runs - to_exclude
|
37
|
+
end
|
38
|
+
|
39
|
+
def self.detect_intersection(sweep_line_status, event_point)
|
40
|
+
sweep_line_status.each do |open_text_run|
|
41
|
+
if event_point.x >= open_text_run.x &&
|
42
|
+
event_point.x <= open_text_run.endx &&
|
43
|
+
open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
|
44
|
+
return true
|
45
|
+
end
|
46
|
+
end
|
47
|
+
return false
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
# Utility class used to avoid modifying the underlying TextRun objects while we're
|
52
|
+
# looking for duplicates
|
53
|
+
class EventPoint
|
54
|
+
attr_reader :x, :run
|
55
|
+
|
56
|
+
def initialize x, run
|
57
|
+
@x, @run = x, run
|
58
|
+
end
|
59
|
+
|
60
|
+
def start?
|
61
|
+
@x == @run.x
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
end
|
data/lib/pdf/reader/page.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
module PDF
|
4
5
|
class Reader
|
@@ -123,6 +124,34 @@ module PDF
|
|
123
124
|
}.join(" ")
|
124
125
|
end
|
125
126
|
|
127
|
+
# returns the angle to rotate the page clockwise. Always 0, 90, 180 or 270
|
128
|
+
#
|
129
|
+
def rotate
|
130
|
+
value = attributes[:Rotate].to_i
|
131
|
+
case value
|
132
|
+
when 0, 90, 180, 270
|
133
|
+
value
|
134
|
+
else
|
135
|
+
0
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
# returns the "boxes" that define the page object.
|
140
|
+
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
141
|
+
#
|
142
|
+
def boxes
|
143
|
+
mediabox = attributes[:MediaBox]
|
144
|
+
cropbox = attributes[:Cropbox] || mediabox
|
145
|
+
|
146
|
+
{
|
147
|
+
MediaBox: objects.deref!(mediabox),
|
148
|
+
CropBox: objects.deref!(cropbox),
|
149
|
+
BleedBox: objects.deref!(attributes[:BleedBox] || cropbox),
|
150
|
+
TrimBox: objects.deref!(attributes[:TrimBox] || cropbox),
|
151
|
+
ArtBox: objects.deref!(attributes[:ArtBox] || cropbox)
|
152
|
+
}
|
153
|
+
end
|
154
|
+
|
126
155
|
private
|
127
156
|
|
128
157
|
def root
|