pdf-reader 2.5.0 → 2.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +17 -0
- data/README.md +16 -1
- data/Rakefile +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/lib/pdf/reader/buffer.rb +62 -21
- data/lib/pdf/reader/encoding.rb +1 -1
- data/lib/pdf/reader/error.rb +3 -3
- data/lib/pdf/reader/filter/ascii85.rb +5 -1
- data/lib/pdf/reader/filter/depredict.rb +3 -3
- data/lib/pdf/reader/glyph_hash.rb +15 -9
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/page_layout.rb +12 -9
- data/lib/pdf/reader/page_text_receiver.rb +2 -2
- data/lib/pdf/reader/parser.rb +8 -6
- data/lib/pdf/reader/xref.rb +6 -1
- data/lib/pdf/reader/zero_width_runs_filter.rb +11 -0
- metadata +11 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ccc4d14f5820ca798f6eafa1c0978207759ec1668c6f6307acb7cd43bcd0626e
|
4
|
+
data.tar.gz: 466bfe0a91f57463a56d9697ccd2529f981c6917e4ed578b4103f2bc87065522
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 45d6c16b3d9ed029e6eb5a45cc64aa95e7ada2950e052053cbe0b6f5aae632f824a86f0505a5cee660abd1cd896177a0637a2f2f5a3f3633e829e8d46fb59817
|
7
|
+
data.tar.gz: e3e566344bd5560387577597dea20b2f7da40aed2a7fa8b8d074c0742486db59d7e349f6c38c91c8dcd9b0a8cf2aa4c19a00d0ee097003449504b3f06f18ca3c
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,20 @@
|
|
1
|
+
v2.6.0 (12th November 2021)
|
2
|
+
- Text extraction improvements
|
3
|
+
- Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
|
4
|
+
- Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
|
5
|
+
- Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
|
6
|
+
- Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
|
7
|
+
- Performance improvements
|
8
|
+
- Reduced memory pressure when decoding TIFF images (http://github.com/yob/pdf-reader/pull/360)
|
9
|
+
- Optional dependency on ascii85_native gem for faster processing of files using the ascii85 filter (http://github.com/yob/pdf-reader/pull/359)
|
10
|
+
- Successfully parse more files
|
11
|
+
- Gracefully handle some non-spec compliant CR/LF issues (http://github.com/yob/pdf-reader/pull/364)
|
12
|
+
- Fix parsing of some escape sequences in content streams (http://github.com/yob/pdf-reader/pull/368)
|
13
|
+
- Increase the amount of junk bytes we detect and skip at the end of a file (http://github.com/yob/pdf-reader/pull/382)
|
14
|
+
- Ignore "/Prev 0" in trailers (http://github.com/yob/pdf-reader/pull/383)
|
15
|
+
- Fix parsing of some inline images (BI ID EI tokens) (http://github.com/yob/pdf-reader/pull/389)
|
16
|
+
- Gracefully handle some xref tables that incorrectly start with 1 (http://github.com/yob/pdf-reader/pull/384)
|
17
|
+
|
1
18
|
v2.5.0 (6th June 2021)
|
2
19
|
- bump minimum ruby version to 2.0
|
3
20
|
- Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
|
data/README.md
CHANGED
@@ -166,6 +166,19 @@ http://groups.google.com/group/pdf-reader
|
|
166
166
|
The easiest way to explain how this works in practice is to show some examples.
|
167
167
|
Check out the examples/ directory for a few files.
|
168
168
|
|
169
|
+
# Alternate Decoder
|
170
|
+
|
171
|
+
For PDF files containing Ascii85 streams, the [ascii85_native](https://github.com/AnomalousBit/ascii85_native) gem can be used for increased performance. If the ascii85_native gem is detected, pdf-reader will automatically use the gem.
|
172
|
+
|
173
|
+
First, run `gem install ascii85_native` and then require the gem alongside pdf-reader:
|
174
|
+
|
175
|
+
```ruby
|
176
|
+
require "pdf-reader"
|
177
|
+
require "ascii85_native"
|
178
|
+
```
|
179
|
+
|
180
|
+
Another way of enabling native Ascii85 decoding is to place `gem 'ascii85_native'` in your project's `Gemfile`.
|
181
|
+
|
169
182
|
# Known Limitations
|
170
183
|
|
171
184
|
Occasionally some text cannot be extracted properly due to the way it has been
|
@@ -176,7 +189,9 @@ little UTF-8 friendly box to indicate an unrecognisable character.
|
|
176
189
|
|
177
190
|
* PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
178
191
|
|
179
|
-
* PDF Specification:
|
192
|
+
* PDF Specification: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
|
193
|
+
|
194
|
+
* Adobe PDF Developer Resources: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
180
195
|
|
181
196
|
* PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
182
197
|
|
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
|
|
14
14
|
Cane::RakeTask.new(:quality) do |cane|
|
15
15
|
cane.abc_max = 20
|
16
16
|
cane.style_measure = 100
|
17
|
-
cane.max_violations =
|
17
|
+
cane.max_violations = 32
|
18
18
|
|
19
19
|
cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
|
20
20
|
end
|
data/examples/extract_fonts.rb
CHANGED
@@ -17,8 +17,8 @@ module ExtractFonts
|
|
17
17
|
return count if page.fonts.nil? || page.fonts.empty?
|
18
18
|
|
19
19
|
page.fonts.each do |label, font|
|
20
|
-
next if complete_refs[
|
21
|
-
complete_refs[
|
20
|
+
next if complete_refs[label]
|
21
|
+
complete_refs[label] = true
|
22
22
|
|
23
23
|
process_font(page, font)
|
24
24
|
|
@@ -39,7 +39,7 @@ module ExtractFonts
|
|
39
39
|
when :TrueType, :CIDFontType2 then
|
40
40
|
ExtractFonts::TTF.new(page.objects, font).save("#{font[:BaseFont]}.ttf")
|
41
41
|
else
|
42
|
-
$stderr.puts "unsupported font type #{font[:Subtype]}"
|
42
|
+
$stderr.puts "unsupported font type #{font[:Subtype]} for #{font[:BaseFont]}"
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
@@ -68,10 +68,15 @@ module ExtractFonts
|
|
68
68
|
end
|
69
69
|
end
|
70
70
|
|
71
|
-
|
71
|
+
if ARGV.size == 0 # default file name
|
72
|
+
ARGV << File.expand_path(File.join(File.dirname(__dir__), "spec", "data", "cairo-unicode.pdf"))
|
73
|
+
end
|
74
|
+
|
72
75
|
extractor = ExtractFonts::Extractor.new
|
73
76
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
+
ARGV.each do |arg|
|
78
|
+
PDF::Reader.open(arg) do |reader|
|
79
|
+
page = reader.page(1)
|
80
|
+
extractor.page(page)
|
81
|
+
end
|
77
82
|
end
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -48,6 +48,15 @@ class PDF::Reader
|
|
48
48
|
ID = "ID"
|
49
49
|
FWD_SLASH = "/"
|
50
50
|
NULL_BYTE = "\x00"
|
51
|
+
CR = "\r"
|
52
|
+
LF = "\n"
|
53
|
+
CRLF = "\r\n"
|
54
|
+
WHITE_SPACE = [LF, CR, ' ']
|
55
|
+
|
56
|
+
# Quite a few PDFs have trailing junk.
|
57
|
+
# This can be several k of nuls in some cases
|
58
|
+
# Allow for this here
|
59
|
+
TRAILING_BYTECOUNT = 5000
|
51
60
|
|
52
61
|
attr_reader :pos
|
53
62
|
|
@@ -86,9 +95,12 @@ class PDF::Reader
|
|
86
95
|
#
|
87
96
|
# options:
|
88
97
|
#
|
89
|
-
# :skip_eol - if true, the IO stream is advanced past a CRLF or LF
|
90
|
-
# is sitting under the io cursor.
|
91
|
-
#
|
98
|
+
# :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
|
99
|
+
# that is sitting under the io cursor.
|
100
|
+
# Note:
|
101
|
+
# Skipping a bare CR is not spec-compliant.
|
102
|
+
# This is because the data may start with LF.
|
103
|
+
# However we check for CRLF first, so the ambiguity is avoided.
|
92
104
|
def read(bytes, opts = {})
|
93
105
|
reset_pos
|
94
106
|
|
@@ -97,9 +109,9 @@ class PDF::Reader
|
|
97
109
|
str = @io.read(2)
|
98
110
|
if str.nil?
|
99
111
|
return nil
|
100
|
-
elsif str ==
|
112
|
+
elsif str == CRLF # This MUST be done before checking for CR alone
|
101
113
|
# do nothing
|
102
|
-
elsif str[0,1] ==
|
114
|
+
elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
|
103
115
|
@io.seek(-1, IO::SEEK_CUR)
|
104
116
|
else
|
105
117
|
@io.seek(-2, IO::SEEK_CUR)
|
@@ -127,8 +139,8 @@ class PDF::Reader
|
|
127
139
|
#
|
128
140
|
def find_first_xref_offset
|
129
141
|
check_size_is_non_zero
|
130
|
-
@io.seek(-
|
131
|
-
data = @io.read(
|
142
|
+
@io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
|
143
|
+
data = @io.read(TRAILING_BYTECOUNT)
|
132
144
|
|
133
145
|
# the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
|
134
146
|
lines = data.split(/[\n\r]+/).reverse
|
@@ -217,7 +229,9 @@ class PDF::Reader
|
|
217
229
|
return if @tokens.size < 3
|
218
230
|
return if @tokens[2] != "R"
|
219
231
|
|
220
|
-
|
232
|
+
# must match whole tokens
|
233
|
+
digits_only = %r{\A\d+\z}
|
234
|
+
if @tokens[0].match(digits_only) && @tokens[1].match(digits_only)
|
221
235
|
@tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
|
222
236
|
@tokens[1] = nil
|
223
237
|
@tokens[2] = nil
|
@@ -225,24 +239,51 @@ class PDF::Reader
|
|
225
239
|
end
|
226
240
|
end
|
227
241
|
|
242
|
+
# Extract data between ID and EI
|
243
|
+
# If the EI follows white-space the space is dropped from the data
|
244
|
+
# The EI must be followed by white-space or end of buffer
|
245
|
+
# This is to reduce the chance of accidentally matching an embedded EI
|
228
246
|
def prepare_inline_token
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
247
|
+
idstart = @io.pos
|
248
|
+
chr = prevchr = nil
|
249
|
+
eisize = 0 # how many chars in the end marker
|
250
|
+
seeking = 'E' # what are we looking for now?
|
251
|
+
loop do
|
234
252
|
chr = @io.read(1)
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
253
|
+
break if chr.nil?
|
254
|
+
case seeking
|
255
|
+
when 'E'
|
256
|
+
if chr == 'E'
|
257
|
+
seeking = 'I'
|
258
|
+
if WHITE_SPACE.include? prevchr
|
259
|
+
eisize = 3 # include whitespace in delimiter, i.e. drop from data
|
260
|
+
else # assume the EI immediately follows the data
|
261
|
+
eisize = 2 # leave prevchr in data
|
262
|
+
end
|
263
|
+
end
|
264
|
+
when 'I'
|
265
|
+
if chr == 'I'
|
266
|
+
seeking = :END
|
267
|
+
else
|
268
|
+
seeking = 'E'
|
269
|
+
end
|
270
|
+
when :END
|
271
|
+
if WHITE_SPACE.include? chr
|
272
|
+
eisize += 1 # Drop trailer
|
273
|
+
break
|
274
|
+
else
|
275
|
+
seeking = 'E'
|
276
|
+
end
|
239
277
|
end
|
278
|
+
prevchr = chr
|
240
279
|
end
|
241
|
-
|
242
|
-
|
243
|
-
|
280
|
+
unless seeking == :END
|
281
|
+
raise MalformedPDFError, "EI terminator not found"
|
282
|
+
end
|
283
|
+
eiend = @io.pos
|
284
|
+
@io.seek(idstart, IO::SEEK_SET)
|
285
|
+
str = @io.read(eiend - eisize - idstart) # get the ID content
|
244
286
|
@tokens << string_token(str)
|
245
|
-
@io.seek(-3, IO::SEEK_CUR) unless chr.nil?
|
246
287
|
end
|
247
288
|
|
248
289
|
# if we're currently inside a hex string, read hex nibbles until
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -208,7 +208,7 @@ class PDF::Reader
|
|
208
208
|
def load_mapping(file)
|
209
209
|
File.open(file, "r:BINARY") do |f|
|
210
210
|
f.each do |l|
|
211
|
-
_m, single_byte, unicode = *l.match(
|
211
|
+
_m, single_byte, unicode = *l.match(/\A([0-9A-Za-z]+);([0-9A-F]{4})/)
|
212
212
|
@mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
|
213
213
|
end
|
214
214
|
end
|
data/lib/pdf/reader/error.rb
CHANGED
@@ -33,17 +33,17 @@ class PDF::Reader
|
|
33
33
|
def self.str_assert(lvalue, rvalue, chars=nil)
|
34
34
|
raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
|
35
35
|
lvalue = lvalue[0,chars] if chars
|
36
|
-
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue != rvalue
|
36
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
|
37
37
|
end
|
38
38
|
################################################################################
|
39
39
|
def self.str_assert_not(lvalue, rvalue, chars=nil)
|
40
40
|
raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
|
41
41
|
lvalue = lvalue[0,chars] if chars
|
42
|
-
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue == rvalue
|
42
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue == rvalue
|
43
43
|
end
|
44
44
|
################################################################################
|
45
45
|
def self.assert_equal(lvalue, rvalue)
|
46
|
-
raise MalformedPDFError, "PDF malformed, expected #{rvalue} but found #{lvalue} instead" if lvalue != rvalue
|
46
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
|
47
47
|
end
|
48
48
|
################################################################################
|
49
49
|
end
|
@@ -17,7 +17,11 @@ class PDF::Reader
|
|
17
17
|
#
|
18
18
|
def filter(data)
|
19
19
|
data = "<~#{data}" unless data.to_s[0,2] == "<~"
|
20
|
-
::
|
20
|
+
if defined?(::Ascii85Native)
|
21
|
+
::Ascii85Native::decode(data)
|
22
|
+
else
|
23
|
+
::Ascii85::decode(data)
|
24
|
+
end
|
21
25
|
rescue Exception => e
|
22
26
|
# Oops, there was a problem decoding the stream
|
23
27
|
raise MalformedPDFError,
|
@@ -34,7 +34,7 @@ class PDF::Reader
|
|
34
34
|
################################################################################
|
35
35
|
def tiff_depredict(data)
|
36
36
|
data = data.unpack("C*")
|
37
|
-
unfiltered =
|
37
|
+
unfiltered = ''
|
38
38
|
bpc = @options[:BitsPerComponent] || 8
|
39
39
|
pixel_bits = bpc * @options[:Colors]
|
40
40
|
pixel_bytes = pixel_bits / 8
|
@@ -51,11 +51,11 @@ class PDF::Reader
|
|
51
51
|
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
52
52
|
row_data[index] = (byte + left) % 256
|
53
53
|
end
|
54
|
-
unfiltered += row_data
|
54
|
+
unfiltered += row_data.pack("C*")
|
55
55
|
pos += line_len
|
56
56
|
end
|
57
57
|
|
58
|
-
unfiltered
|
58
|
+
unfiltered
|
59
59
|
end
|
60
60
|
################################################################################
|
61
61
|
def png_depredict(data)
|
@@ -103,19 +103,25 @@ class PDF::Reader
|
|
103
103
|
|
104
104
|
# returns a hash that maps glyph names to unicode codepoints. The mapping is based on
|
105
105
|
# a text file supplied by Adobe at:
|
106
|
-
#
|
106
|
+
# https://github.com/adobe-type-tools/agl-aglfn
|
107
107
|
def load_adobe_glyph_mapping
|
108
108
|
keyed_by_name = {}
|
109
109
|
keyed_by_codepoint = {}
|
110
110
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
111
|
+
paths = [
|
112
|
+
File.dirname(__FILE__) + "/glyphlist.txt",
|
113
|
+
File.dirname(__FILE__) + "/glyphlist-zapfdingbats.txt",
|
114
|
+
]
|
115
|
+
paths.each do |path|
|
116
|
+
File.open(path, "r:BINARY") do |f|
|
117
|
+
f.each do |l|
|
118
|
+
_m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
119
|
+
if name && code
|
120
|
+
cp = "0x#{code}".hex
|
121
|
+
keyed_by_name[name.to_sym] = cp
|
122
|
+
keyed_by_codepoint[cp] ||= []
|
123
|
+
keyed_by_codepoint[cp] << name.to_sym
|
124
|
+
end
|
119
125
|
end
|
120
126
|
end
|
121
127
|
end
|
@@ -0,0 +1,245 @@
|
|
1
|
+
# -----------------------------------------------------------
|
2
|
+
# Copyright 2002-2019 Adobe (http://www.adobe.com/).
|
3
|
+
#
|
4
|
+
# Redistribution and use in source and binary forms, with or
|
5
|
+
# without modification, are permitted provided that the
|
6
|
+
# following conditions are met:
|
7
|
+
#
|
8
|
+
# Redistributions of source code must retain the above
|
9
|
+
# copyright notice, this list of conditions and the following
|
10
|
+
# disclaimer.
|
11
|
+
#
|
12
|
+
# Redistributions in binary form must reproduce the above
|
13
|
+
# copyright notice, this list of conditions and the following
|
14
|
+
# disclaimer in the documentation and/or other materials
|
15
|
+
# provided with the distribution.
|
16
|
+
#
|
17
|
+
# Neither the name of Adobe nor the names of its contributors
|
18
|
+
# may be used to endorse or promote products derived from this
|
19
|
+
# software without specific prior written permission.
|
20
|
+
#
|
21
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
|
22
|
+
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
23
|
+
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
24
|
+
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
25
|
+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
26
|
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
27
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
28
|
+
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
29
|
+
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
30
|
+
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
31
|
+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
32
|
+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
33
|
+
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
34
|
+
# -----------------------------------------------------------
|
35
|
+
# Name: ITC Zapf Dingbats Glyph List
|
36
|
+
# Table version: 2.0
|
37
|
+
# Date: September 20, 2002
|
38
|
+
# URL: https://github.com/adobe-type-tools/agl-aglfn
|
39
|
+
#
|
40
|
+
# Format: two semicolon-delimited fields:
|
41
|
+
# (1) glyph name--upper/lowercase letters and digits
|
42
|
+
# (2) Unicode scalar value--four uppercase hexadecimal digits
|
43
|
+
#
|
44
|
+
a100;275E
|
45
|
+
a101;2761
|
46
|
+
a102;2762
|
47
|
+
a103;2763
|
48
|
+
a104;2764
|
49
|
+
a105;2710
|
50
|
+
a106;2765
|
51
|
+
a107;2766
|
52
|
+
a108;2767
|
53
|
+
a109;2660
|
54
|
+
a10;2721
|
55
|
+
a110;2665
|
56
|
+
a111;2666
|
57
|
+
a112;2663
|
58
|
+
a117;2709
|
59
|
+
a118;2708
|
60
|
+
a119;2707
|
61
|
+
a11;261B
|
62
|
+
a120;2460
|
63
|
+
a121;2461
|
64
|
+
a122;2462
|
65
|
+
a123;2463
|
66
|
+
a124;2464
|
67
|
+
a125;2465
|
68
|
+
a126;2466
|
69
|
+
a127;2467
|
70
|
+
a128;2468
|
71
|
+
a129;2469
|
72
|
+
a12;261E
|
73
|
+
a130;2776
|
74
|
+
a131;2777
|
75
|
+
a132;2778
|
76
|
+
a133;2779
|
77
|
+
a134;277A
|
78
|
+
a135;277B
|
79
|
+
a136;277C
|
80
|
+
a137;277D
|
81
|
+
a138;277E
|
82
|
+
a139;277F
|
83
|
+
a13;270C
|
84
|
+
a140;2780
|
85
|
+
a141;2781
|
86
|
+
a142;2782
|
87
|
+
a143;2783
|
88
|
+
a144;2784
|
89
|
+
a145;2785
|
90
|
+
a146;2786
|
91
|
+
a147;2787
|
92
|
+
a148;2788
|
93
|
+
a149;2789
|
94
|
+
a14;270D
|
95
|
+
a150;278A
|
96
|
+
a151;278B
|
97
|
+
a152;278C
|
98
|
+
a153;278D
|
99
|
+
a154;278E
|
100
|
+
a155;278F
|
101
|
+
a156;2790
|
102
|
+
a157;2791
|
103
|
+
a158;2792
|
104
|
+
a159;2793
|
105
|
+
a15;270E
|
106
|
+
a160;2794
|
107
|
+
a161;2192
|
108
|
+
a162;27A3
|
109
|
+
a163;2194
|
110
|
+
a164;2195
|
111
|
+
a165;2799
|
112
|
+
a166;279B
|
113
|
+
a167;279C
|
114
|
+
a168;279D
|
115
|
+
a169;279E
|
116
|
+
a16;270F
|
117
|
+
a170;279F
|
118
|
+
a171;27A0
|
119
|
+
a172;27A1
|
120
|
+
a173;27A2
|
121
|
+
a174;27A4
|
122
|
+
a175;27A5
|
123
|
+
a176;27A6
|
124
|
+
a177;27A7
|
125
|
+
a178;27A8
|
126
|
+
a179;27A9
|
127
|
+
a17;2711
|
128
|
+
a180;27AB
|
129
|
+
a181;27AD
|
130
|
+
a182;27AF
|
131
|
+
a183;27B2
|
132
|
+
a184;27B3
|
133
|
+
a185;27B5
|
134
|
+
a186;27B8
|
135
|
+
a187;27BA
|
136
|
+
a188;27BB
|
137
|
+
a189;27BC
|
138
|
+
a18;2712
|
139
|
+
a190;27BD
|
140
|
+
a191;27BE
|
141
|
+
a192;279A
|
142
|
+
a193;27AA
|
143
|
+
a194;27B6
|
144
|
+
a195;27B9
|
145
|
+
a196;2798
|
146
|
+
a197;27B4
|
147
|
+
a198;27B7
|
148
|
+
a199;27AC
|
149
|
+
a19;2713
|
150
|
+
a1;2701
|
151
|
+
a200;27AE
|
152
|
+
a201;27B1
|
153
|
+
a202;2703
|
154
|
+
a203;2750
|
155
|
+
a204;2752
|
156
|
+
a205;276E
|
157
|
+
a206;2770
|
158
|
+
a20;2714
|
159
|
+
a21;2715
|
160
|
+
a22;2716
|
161
|
+
a23;2717
|
162
|
+
a24;2718
|
163
|
+
a25;2719
|
164
|
+
a26;271A
|
165
|
+
a27;271B
|
166
|
+
a28;271C
|
167
|
+
a29;2722
|
168
|
+
a2;2702
|
169
|
+
a30;2723
|
170
|
+
a31;2724
|
171
|
+
a32;2725
|
172
|
+
a33;2726
|
173
|
+
a34;2727
|
174
|
+
a35;2605
|
175
|
+
a36;2729
|
176
|
+
a37;272A
|
177
|
+
a38;272B
|
178
|
+
a39;272C
|
179
|
+
a3;2704
|
180
|
+
a40;272D
|
181
|
+
a41;272E
|
182
|
+
a42;272F
|
183
|
+
a43;2730
|
184
|
+
a44;2731
|
185
|
+
a45;2732
|
186
|
+
a46;2733
|
187
|
+
a47;2734
|
188
|
+
a48;2735
|
189
|
+
a49;2736
|
190
|
+
a4;260E
|
191
|
+
a50;2737
|
192
|
+
a51;2738
|
193
|
+
a52;2739
|
194
|
+
a53;273A
|
195
|
+
a54;273B
|
196
|
+
a55;273C
|
197
|
+
a56;273D
|
198
|
+
a57;273E
|
199
|
+
a58;273F
|
200
|
+
a59;2740
|
201
|
+
a5;2706
|
202
|
+
a60;2741
|
203
|
+
a61;2742
|
204
|
+
a62;2743
|
205
|
+
a63;2744
|
206
|
+
a64;2745
|
207
|
+
a65;2746
|
208
|
+
a66;2747
|
209
|
+
a67;2748
|
210
|
+
a68;2749
|
211
|
+
a69;274A
|
212
|
+
a6;271D
|
213
|
+
a70;274B
|
214
|
+
a71;25CF
|
215
|
+
a72;274D
|
216
|
+
a73;25A0
|
217
|
+
a74;274F
|
218
|
+
a75;2751
|
219
|
+
a76;25B2
|
220
|
+
a77;25BC
|
221
|
+
a78;25C6
|
222
|
+
a79;2756
|
223
|
+
a7;271E
|
224
|
+
a81;25D7
|
225
|
+
a82;2758
|
226
|
+
a83;2759
|
227
|
+
a84;275A
|
228
|
+
a85;276F
|
229
|
+
a86;2771
|
230
|
+
a87;2772
|
231
|
+
a88;2773
|
232
|
+
a89;2768
|
233
|
+
a8;271F
|
234
|
+
a90;2769
|
235
|
+
a91;276C
|
236
|
+
a92;276D
|
237
|
+
a93;276A
|
238
|
+
a94;276B
|
239
|
+
a95;2774
|
240
|
+
a96;2775
|
241
|
+
a97;275B
|
242
|
+
a98;275C
|
243
|
+
a99;275D
|
244
|
+
a9;2720
|
245
|
+
# END
|
@@ -2,6 +2,7 @@
|
|
2
2
|
# frozen_string_literal: true
|
3
3
|
|
4
4
|
require 'pdf/reader/overlapping_runs_filter'
|
5
|
+
require 'pdf/reader/zero_width_runs_filter'
|
5
6
|
|
6
7
|
class PDF::Reader
|
7
8
|
|
@@ -17,10 +18,12 @@ class PDF::Reader
|
|
17
18
|
def initialize(runs, mediabox)
|
18
19
|
raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
|
19
20
|
|
20
|
-
|
21
|
+
runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
|
22
|
+
runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
|
23
|
+
@runs = merge_runs(runs)
|
21
24
|
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
|
22
25
|
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
|
23
|
-
@
|
26
|
+
@median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
|
24
27
|
@page_width = (mediabox[2] - mediabox[0]).abs
|
25
28
|
@page_height = (mediabox[3] - mediabox[1]).abs
|
26
29
|
@x_offset = @runs.map(&:x).sort.first || 0
|
@@ -67,7 +70,7 @@ class PDF::Reader
|
|
67
70
|
end
|
68
71
|
|
69
72
|
def col_count
|
70
|
-
@col_count ||= ((@page_width / @
|
73
|
+
@col_count ||= ((@page_width / @median_glyph_width) * 1.05).floor
|
71
74
|
end
|
72
75
|
|
73
76
|
def row_multiplier
|
@@ -86,12 +89,12 @@ class PDF::Reader
|
|
86
89
|
end
|
87
90
|
end
|
88
91
|
|
89
|
-
def
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
92
|
+
def median(collection)
|
93
|
+
if collection.size == 0
|
94
|
+
0
|
95
|
+
else
|
96
|
+
collection.sort[(collection.size * 0.5).floor]
|
97
|
+
end
|
95
98
|
end
|
96
99
|
|
97
100
|
# take a collection of TextRun objects and merge any that are in close
|
@@ -45,8 +45,8 @@ module PDF
|
|
45
45
|
@content = []
|
46
46
|
@characters = []
|
47
47
|
@mediabox = page.objects.deref(page.attributes[:MediaBox])
|
48
|
-
device_bl =
|
49
|
-
device_tr =
|
48
|
+
device_bl = apply_rotation(*@state.ctm_transform(@mediabox[0], @mediabox[1]))
|
49
|
+
device_tr = apply_rotation(*@state.ctm_transform(@mediabox[2], @mediabox[3]))
|
50
50
|
@device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
|
51
51
|
end
|
52
52
|
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -175,15 +175,18 @@ class PDF::Reader
|
|
175
175
|
return "".dup.force_encoding("binary") if str == ")"
|
176
176
|
Error.assert_equal(parse_token, ")")
|
177
177
|
|
178
|
-
str.gsub!(/\\([nrtbf()\\\n]
|
179
|
-
|
178
|
+
str.gsub!(/\\(\r\n|[nrtbf()\\\n\r]|([0-7]{1,3}))?|\r\n?/m) do |match|
|
179
|
+
if $2.nil? # not octal digits
|
180
|
+
MAPPING[match] || "".dup
|
181
|
+
else # must be octal digits
|
182
|
+
($2.oct & 0xff).chr # ignore high level overflow
|
183
|
+
end
|
180
184
|
end
|
181
185
|
str.force_encoding("binary")
|
182
186
|
end
|
183
187
|
|
184
188
|
MAPPING = {
|
185
189
|
"\r" => "\n",
|
186
|
-
"\n\r" => "\n",
|
187
190
|
"\r\n" => "\n",
|
188
191
|
"\\n" => "\n",
|
189
192
|
"\\r" => "\r",
|
@@ -194,10 +197,9 @@ class PDF::Reader
|
|
194
197
|
"\\)" => ")",
|
195
198
|
"\\\\" => "\\",
|
196
199
|
"\\\n" => "",
|
200
|
+
"\\\r" => "",
|
201
|
+
"\\\r\n" => "",
|
197
202
|
}
|
198
|
-
0.upto(9) { |n| MAPPING["\\00"+n.to_s] = ("00"+n.to_s).oct.chr }
|
199
|
-
0.upto(99) { |n| MAPPING["\\0"+n.to_s] = ("0"+n.to_s).oct.chr }
|
200
|
-
0.upto(377) { |n| MAPPING["\\"+n.to_s] = n.to_s.oct.chr }
|
201
203
|
|
202
204
|
################################################################################
|
203
205
|
# Decodes the contents of a PDF Stream and returns it as a Ruby String.
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -131,6 +131,9 @@ class PDF::Reader
|
|
131
131
|
generation = buf.token.to_i
|
132
132
|
state = buf.token
|
133
133
|
|
134
|
+
# Some PDF writers start numbering at 1 instead of 0. Fix up the number.
|
135
|
+
# TODO should this fix be logged?
|
136
|
+
objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
|
134
137
|
store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
|
135
138
|
objid += 1
|
136
139
|
params.clear
|
@@ -146,7 +149,9 @@ class PDF::Reader
|
|
146
149
|
end
|
147
150
|
|
148
151
|
load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
|
149
|
-
|
152
|
+
# Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
|
153
|
+
# It's not possible for an xref to appear at offset 0, so can safely skip the ref
|
154
|
+
load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
|
150
155
|
|
151
156
|
trailer
|
152
157
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-11-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -239,6 +239,7 @@ files:
|
|
239
239
|
- lib/pdf/reader/font_descriptor.rb
|
240
240
|
- lib/pdf/reader/form_xobject.rb
|
241
241
|
- lib/pdf/reader/glyph_hash.rb
|
242
|
+
- lib/pdf/reader/glyphlist-zapfdingbats.txt
|
242
243
|
- lib/pdf/reader/glyphlist.txt
|
243
244
|
- lib/pdf/reader/lzw.rb
|
244
245
|
- lib/pdf/reader/null_security_handler.rb
|
@@ -272,15 +273,16 @@ files:
|
|
272
273
|
- lib/pdf/reader/width_calculator/type_one_or_three.rb
|
273
274
|
- lib/pdf/reader/width_calculator/type_zero.rb
|
274
275
|
- lib/pdf/reader/xref.rb
|
276
|
+
- lib/pdf/reader/zero_width_runs_filter.rb
|
275
277
|
homepage: https://github.com/yob/pdf-reader
|
276
278
|
licenses:
|
277
279
|
- MIT
|
278
280
|
metadata:
|
279
281
|
bug_tracker_uri: https://github.com/yob/pdf-reader/issues
|
280
|
-
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.
|
281
|
-
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.
|
282
|
-
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.
|
283
|
-
post_install_message:
|
282
|
+
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.6.0/CHANGELOG
|
283
|
+
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.6.0
|
284
|
+
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.6.0
|
285
|
+
post_install_message:
|
284
286
|
rdoc_options:
|
285
287
|
- "--title"
|
286
288
|
- PDF::Reader Documentation
|
@@ -300,8 +302,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
300
302
|
- !ruby/object:Gem::Version
|
301
303
|
version: '0'
|
302
304
|
requirements: []
|
303
|
-
rubygems_version: 3.
|
304
|
-
signing_key:
|
305
|
+
rubygems_version: 3.1.4
|
306
|
+
signing_key:
|
305
307
|
specification_version: 4
|
306
308
|
summary: A library for accessing the content of PDF files
|
307
309
|
test_files: []
|