pdf-reader 2.4.2 → 2.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +44 -0
- data/README.md +16 -1
- data/Rakefile +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +63 -21
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +5 -3
- data/lib/pdf/reader/encoding.rb +3 -2
- data/lib/pdf/reader/error.rb +11 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +10 -8
- data/lib/pdf/reader/filter/flate.rb +4 -2
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +44 -0
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +16 -9
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +8 -3
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +73 -11
- data/lib/pdf/reader/page_layout.rb +37 -37
- data/lib/pdf/reader/page_state.rb +18 -23
- data/lib/pdf/reader/page_text_receiver.rb +68 -6
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +15 -7
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rectangle.rb +113 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +5 -0
- data/lib/pdf/reader/standard_security_handler.rb +1 -0
- data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +14 -6
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +8 -15
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +7 -1
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +29 -6
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1763 -0
- metadata +12 -7
- data/lib/pdf/reader/orientation_detector.rb +0 -34
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'afm'
|
@@ -37,23 +38,15 @@ class PDF::Reader
|
|
37
38
|
def glyph_width(code_point)
|
38
39
|
return 0 if code_point.nil? || code_point < 0
|
39
40
|
|
40
|
-
|
41
|
-
|
42
|
-
|
41
|
+
names = @font.encoding.int_to_name(code_point)
|
42
|
+
metrics = names.map { |name|
|
43
|
+
@metrics.char_metrics[name.to_s]
|
44
|
+
}.compact.first
|
43
45
|
|
44
|
-
|
45
|
-
|
46
|
-
}.compact.first
|
47
|
-
end
|
48
|
-
|
49
|
-
if m
|
50
|
-
m[:wx]
|
51
|
-
elsif @font.widths[code_point - 1]
|
52
|
-
@font.widths[code_point - 1]
|
53
|
-
elsif control_character?(code_point)
|
54
|
-
0
|
46
|
+
if metrics
|
47
|
+
metrics[:wx]
|
55
48
|
else
|
56
|
-
0
|
49
|
+
@font.widths[code_point - 1] || 0
|
57
50
|
end
|
58
51
|
end
|
59
52
|
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -131,6 +132,9 @@ class PDF::Reader
|
|
131
132
|
generation = buf.token.to_i
|
132
133
|
state = buf.token
|
133
134
|
|
135
|
+
# Some PDF writers start numbering at 1 instead of 0. Fix up the number.
|
136
|
+
# TODO should this fix be logged?
|
137
|
+
objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
|
134
138
|
store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
|
135
139
|
objid += 1
|
136
140
|
params.clear
|
@@ -146,7 +150,9 @@ class PDF::Reader
|
|
146
150
|
end
|
147
151
|
|
148
152
|
load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
|
149
|
-
|
153
|
+
# Some PDF creators seem to use '/Prev 0' in the trailer if there is no previous xref.
|
154
|
+
# It's not possible for an xref to appear at offset 0, so we can safely skip the ref
|
155
|
+
load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
|
150
156
|
|
151
157
|
trailer
|
152
158
|
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
# There's no point rendering zero-width characters
|
7
|
+
class ZeroWidthRunsFilter
|
8
|
+
|
9
|
+
def self.exclude_zero_width_runs(runs)
|
10
|
+
runs.reject { |run| run.width == 0 }
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
data/lib/pdf/reader.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -111,17 +112,25 @@ module PDF
|
|
111
112
|
#
|
112
113
|
# reader = PDF::Reader.new("somefile.pdf", :password => "apples")
|
113
114
|
#
|
115
|
+
# Using this method directly is supported, but it's more common to use
|
116
|
+
# `PDF::Reader.open`
|
117
|
+
#
|
114
118
|
def initialize(input, opts = {})
|
115
119
|
@cache = PDF::Reader::ObjectCache.new
|
116
120
|
opts.merge!(:cache => @cache)
|
117
121
|
@objects = PDF::Reader::ObjectHash.new(input, opts)
|
118
122
|
end
|
119
123
|
|
124
|
+
# Return a Hash with some basic information about the PDF file
|
125
|
+
#
|
120
126
|
def info
|
121
127
|
dict = @objects.deref(@objects.trailer[:Info])
|
122
128
|
doc_strings_to_utf8(dict)
|
123
129
|
end
|
124
130
|
|
131
|
+
# Return a Hash with extra metadata provided by the author of the PDF file. Not
|
132
|
+
# always present.
|
133
|
+
#
|
125
134
|
def metadata
|
126
135
|
stream = @objects.deref(root[:Metadata])
|
127
136
|
if stream.nil?
|
@@ -133,20 +142,24 @@ module PDF
|
|
133
142
|
end
|
134
143
|
end
|
135
144
|
|
145
|
+
# The number of pages in this PDF
|
146
|
+
#
|
136
147
|
def page_count
|
137
148
|
pages = @objects.deref(root[:Pages])
|
138
149
|
unless pages.kind_of?(::Hash)
|
139
|
-
raise MalformedPDFError,
|
150
|
+
raise MalformedPDFError, "Pages structure is missing #{pages.class}"
|
140
151
|
end
|
141
152
|
@page_count ||= @objects.deref(pages[:Count])
|
142
153
|
end
|
143
154
|
|
155
|
+
# The PDF version this file uses
|
156
|
+
#
|
144
157
|
def pdf_version
|
145
158
|
@objects.pdf_version
|
146
159
|
end
|
147
160
|
|
148
|
-
# syntactic sugar for opening a PDF file. Accepts the same arguments
|
149
|
-
# as new().
|
161
|
+
# syntactic sugar for opening a PDF file and the most common approach. Accepts the
|
162
|
+
# same arguments as new().
|
150
163
|
#
|
151
164
|
# PDF::Reader.open("somefile.pdf") do |reader|
|
152
165
|
# puts reader.pdf_version
|
@@ -221,7 +234,7 @@ module PDF
|
|
221
234
|
when Array then
|
222
235
|
obj.map { |item| doc_strings_to_utf8(item) }
|
223
236
|
when String then
|
224
|
-
if obj
|
237
|
+
if has_utf16_bom?(obj)
|
225
238
|
utf16_to_utf8(obj)
|
226
239
|
else
|
227
240
|
pdfdoc_to_utf8(obj)
|
@@ -231,6 +244,14 @@ module PDF
|
|
231
244
|
end
|
232
245
|
end
|
233
246
|
|
247
|
+
def has_utf16_bom?(str)
|
248
|
+
first_bytes = str[0,2]
|
249
|
+
|
250
|
+
return false if first_bytes.nil?
|
251
|
+
|
252
|
+
first_bytes.unpack("C*") == [254, 255]
|
253
|
+
end
|
254
|
+
|
234
255
|
# TODO find a PDF I can use to spec this behaviour
|
235
256
|
#
|
236
257
|
def pdfdoc_to_utf8(obj)
|
@@ -242,7 +263,7 @@ module PDF
|
|
242
263
|
# String#encode
|
243
264
|
#
|
244
265
|
def utf16_to_utf8(obj)
|
245
|
-
str = obj[2, obj.size]
|
266
|
+
str = obj[2, obj.size].to_s
|
246
267
|
str = str.unpack("n*").pack("U*")
|
247
268
|
str.force_encoding("utf-8")
|
248
269
|
str
|
@@ -264,6 +285,7 @@ end
|
|
264
285
|
|
265
286
|
require 'pdf/reader/resource_methods'
|
266
287
|
require 'pdf/reader/buffer'
|
288
|
+
require 'pdf/reader/bounding_rectangle_runs_filter'
|
267
289
|
require 'pdf/reader/cid_widths'
|
268
290
|
require 'pdf/reader/cmap'
|
269
291
|
require 'pdf/reader/encoding'
|
@@ -286,7 +308,9 @@ require 'pdf/reader/object_hash'
|
|
286
308
|
require 'pdf/reader/object_stream'
|
287
309
|
require 'pdf/reader/pages_strategy'
|
288
310
|
require 'pdf/reader/parser'
|
311
|
+
require 'pdf/reader/point'
|
289
312
|
require 'pdf/reader/print_receiver'
|
313
|
+
require 'pdf/reader/rectangle'
|
290
314
|
require 'pdf/reader/reference'
|
291
315
|
require 'pdf/reader/register_receiver'
|
292
316
|
require 'pdf/reader/null_security_handler'
|
@@ -299,5 +323,4 @@ require 'pdf/reader/page_state'
|
|
299
323
|
require 'pdf/reader/page_text_receiver'
|
300
324
|
require 'pdf/reader/token'
|
301
325
|
require 'pdf/reader/xref'
|
302
|
-
require 'pdf/reader/orientation_detector'
|
303
326
|
require 'pdf/reader/page'
|