pdf-reader 2.1.0 → 2.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +28 -1
- data/README.md +2 -2
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_text +1 -1
- data/lib/pdf-reader.rb +1 -0
- data/lib/pdf/reader.rb +2 -2
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/buffer.rb +12 -11
- data/lib/pdf/reader/cid_widths.rb +2 -0
- data/lib/pdf/reader/cmap.rb +22 -12
- data/lib/pdf/reader/encoding.rb +12 -9
- data/lib/pdf/reader/error.rb +1 -0
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +1 -0
- data/lib/pdf/reader/filter/flate.rb +6 -4
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +3 -1
- data/lib/pdf/reader/font.rb +11 -2
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +1 -0
- data/lib/pdf/reader/lzw.rb +2 -1
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +22 -10
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +5 -4
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +29 -0
- data/lib/pdf/reader/page_layout.rb +10 -5
- data/lib/pdf/reader/page_state.rb +10 -1
- data/lib/pdf/reader/page_text_receiver.rb +5 -1
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +5 -4
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +1 -0
- data/lib/pdf/reader/standard_security_handler_v5.rb +2 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +25 -0
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +18 -1
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/xref.rb +11 -5
- metadata +17 -13
- data/lib/pdf/hash.rb +0 -19
@@ -1,4 +1,7 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'pdf/reader/overlapping_runs_filter'
|
2
5
|
|
3
6
|
class PDF::Reader
|
4
7
|
|
@@ -14,13 +17,15 @@ class PDF::Reader
|
|
14
17
|
def initialize(runs, mediabox)
|
15
18
|
raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
|
16
19
|
|
17
|
-
@runs = merge_runs(runs)
|
20
|
+
@runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
|
18
21
|
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
|
19
22
|
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
|
20
23
|
@mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
|
21
|
-
@page_width = mediabox[2] - mediabox[0]
|
22
|
-
@page_height = mediabox[3] - mediabox[1]
|
23
|
-
@x_offset = @runs.map(&:x).sort.first
|
24
|
+
@page_width = (mediabox[2] - mediabox[0]).abs
|
25
|
+
@page_height = (mediabox[3] - mediabox[1]).abs
|
26
|
+
@x_offset = @runs.map(&:x).sort.first || 0
|
27
|
+
lowest_y = @runs.map(&:y).sort.first || 0
|
28
|
+
@y_offset = lowest_y > 0 ? 0 : lowest_y
|
24
29
|
end
|
25
30
|
|
26
31
|
def to_s
|
@@ -29,7 +34,7 @@ class PDF::Reader
|
|
29
34
|
page = row_count.times.map { |i| " " * col_count }
|
30
35
|
@runs.each do |run|
|
31
36
|
x_pos = ((run.x - @x_offset) / col_multiplier).round
|
32
|
-
y_pos = row_count - (run.y / row_multiplier).round
|
37
|
+
y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
|
33
38
|
if y_pos <= row_count && y_pos >= 0 && x_pos <= col_count && x_pos >= 0
|
34
39
|
local_string_insert(page[y_pos-1], run.text, x_pos)
|
35
40
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'pdf/reader/transformation_matrix'
|
4
5
|
|
@@ -29,7 +30,15 @@ class PDF::Reader
|
|
29
30
|
@xobject_stack = [page.xobjects]
|
30
31
|
@cs_stack = [page.color_spaces]
|
31
32
|
@stack = [DEFAULT_GRAPHICS_STATE.dup]
|
32
|
-
|
33
|
+
if page.rotate == 0
|
34
|
+
state[:ctm] = identity_matrix
|
35
|
+
else
|
36
|
+
rotate_cos = Math.cos(page.rotate * (Math::PI/180.0)).round(2)
|
37
|
+
rotate_sin = Math.sin(page.rotate * (Math::PI/180.0)).round(2)
|
38
|
+
state[:ctm] = TransformationMatrix.new(rotate_cos, rotate_sin,
|
39
|
+
rotate_sin * -1, rotate_cos,
|
40
|
+
0, 0)
|
41
|
+
end
|
33
42
|
end
|
34
43
|
|
35
44
|
#####################################################
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'forwardable'
|
4
5
|
require 'pdf/reader/page_layout'
|
@@ -43,10 +44,13 @@ module PDF
|
|
43
44
|
@content = []
|
44
45
|
@characters = []
|
45
46
|
@mediabox = page.objects.deref(page.attributes[:MediaBox])
|
47
|
+
device_bl = @state.ctm_transform(@mediabox[0], @mediabox[1])
|
48
|
+
device_tr = @state.ctm_transform(@mediabox[2], @mediabox[3])
|
49
|
+
@device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
|
46
50
|
end
|
47
51
|
|
48
52
|
def content
|
49
|
-
PageLayout.new(@characters, @mediabox).to_s
|
53
|
+
PageLayout.new(@characters, @device_mediabox).to_s
|
50
54
|
end
|
51
55
|
|
52
56
|
#####################################################
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -132,7 +133,7 @@ class PDF::Reader
|
|
132
133
|
# reads a PDF name from the buffer and converts it to a Ruby Symbol
|
133
134
|
def pdf_name
|
134
135
|
tok = @buffer.token
|
135
|
-
tok.gsub!(/#([A-Fa-f0-9]{2})/) do |match|
|
136
|
+
tok = tok.dup.gsub(/#([A-Fa-f0-9]{2})/) do |match|
|
136
137
|
match[1, 2].hex.chr
|
137
138
|
end
|
138
139
|
tok.to_sym
|
@@ -154,7 +155,7 @@ class PDF::Reader
|
|
154
155
|
################################################################################
|
155
156
|
# Reads a PDF hex string from the buffer and converts it to a Ruby String
|
156
157
|
def hex_string
|
157
|
-
str = ""
|
158
|
+
str = "".dup
|
158
159
|
|
159
160
|
loop do
|
160
161
|
token = @buffer.token
|
@@ -171,11 +172,11 @@ class PDF::Reader
|
|
171
172
|
# Reads a PDF String from the buffer and converts it to a Ruby String
|
172
173
|
def string
|
173
174
|
str = @buffer.token
|
174
|
-
return "".force_encoding("binary") if str == ")"
|
175
|
+
return "".dup.force_encoding("binary") if str == ")"
|
175
176
|
Error.assert_equal(parse_token, ")")
|
176
177
|
|
177
178
|
str.gsub!(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
|
178
|
-
MAPPING[match] || ""
|
179
|
+
MAPPING[match] || "".dup
|
179
180
|
end
|
180
181
|
str.force_encoding("binary")
|
181
182
|
end
|
data/lib/pdf/reader/reference.rb
CHANGED
data/lib/pdf/reader/stream.rb
CHANGED
data/lib/pdf/reader/text_run.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
class PDF::Reader
|
4
5
|
# A value object that represents one or more consecutive characters on a page.
|
@@ -37,6 +38,10 @@ class PDF::Reader
|
|
37
38
|
@endx ||= x + width
|
38
39
|
end
|
39
40
|
|
41
|
+
def endy
|
42
|
+
@endy ||= y + font_size
|
43
|
+
end
|
44
|
+
|
40
45
|
def mean_character_width
|
41
46
|
@width / character_count
|
42
47
|
end
|
@@ -59,8 +64,28 @@ class PDF::Reader
|
|
59
64
|
"#{text} w:#{width} f:#{font_size} @#{x},#{y}"
|
60
65
|
end
|
61
66
|
|
67
|
+
def intersect?(other_run)
|
68
|
+
x <= other_run.endx && endx >= other_run.x &&
|
69
|
+
endy >= other_run.y && y <= other_run.endy
|
70
|
+
end
|
71
|
+
|
72
|
+
# return what percentage of this text run is overlapped by another run
|
73
|
+
def intersection_area_percent(other_run)
|
74
|
+
return 0 unless intersect?(other_run)
|
75
|
+
|
76
|
+
dx = [endx, other_run.endx].min - [x, other_run.x].max
|
77
|
+
dy = [endy, other_run.endy].min - [y, other_run.y].max
|
78
|
+
intersection_area = dx*dy
|
79
|
+
|
80
|
+
intersection_area.to_f / area
|
81
|
+
end
|
82
|
+
|
62
83
|
private
|
63
84
|
|
85
|
+
def area
|
86
|
+
(endx - x) * (endy - y)
|
87
|
+
end
|
88
|
+
|
64
89
|
def mergable_range
|
65
90
|
@mergable_range ||= Range.new(endx - 3, endx + font_size)
|
66
91
|
end
|
data/lib/pdf/reader/token.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'afm'
|
4
5
|
require 'pdf/reader/synchronized_cache'
|
@@ -11,11 +12,20 @@ class PDF::Reader
|
|
11
12
|
# see Section 9.6.2.2, PDF 32000-1:2008, pp 256
|
12
13
|
class BuiltIn
|
13
14
|
|
15
|
+
BUILTINS = [
|
16
|
+
:Courier, :"Courier-Bold", :"Courier-BoldOblique", :"Courier-Oblique",
|
17
|
+
:Helvetica, :"Helvetica-Bold", :"Helvetica-BoldOblique", :"Helvetica-Oblique",
|
18
|
+
:Symbol,
|
19
|
+
:"Times-Roman", :"Times-Bold", :"Times-BoldItalic", :"Times-Italic",
|
20
|
+
:ZapfDingbats
|
21
|
+
]
|
22
|
+
|
14
23
|
def initialize(font)
|
15
24
|
@font = font
|
16
25
|
@@all_metrics ||= PDF::Reader::SynchronizedCache.new
|
17
26
|
|
18
|
-
|
27
|
+
basefont = extract_basefont(font.basefont)
|
28
|
+
metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{basefont}.afm")
|
19
29
|
|
20
30
|
if File.file?(metrics_path)
|
21
31
|
@metrics = @@all_metrics[metrics_path] ||= AFM::Font.new(metrics_path)
|
@@ -53,6 +63,13 @@ class PDF::Reader
|
|
53
63
|
@font.encoding.int_to_name(code_point).first.to_s[/\Acontrol..\Z/]
|
54
64
|
end
|
55
65
|
|
66
|
+
def extract_basefont(font_name)
|
67
|
+
if BUILTINS.include?(font_name)
|
68
|
+
font_name
|
69
|
+
else
|
70
|
+
"Times-Roman"
|
71
|
+
end
|
72
|
+
end
|
56
73
|
end
|
57
74
|
end
|
58
75
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
class PDF::Reader
|
4
5
|
module WidthCalculator
|
@@ -17,8 +18,7 @@ class PDF::Reader
|
|
17
18
|
|
18
19
|
def glyph_width(code_point)
|
19
20
|
return 0 if code_point.nil? || code_point < 0
|
20
|
-
|
21
|
-
glyph_width_from_font(code_point) || glyph_width_from_descriptor(code_point)
|
21
|
+
glyph_width_from_font(code_point) || glyph_width_from_descriptor(code_point) || 0
|
22
22
|
end
|
23
23
|
|
24
24
|
private
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -203,8 +204,10 @@ class PDF::Reader
|
|
203
204
|
("\x00" + bytes).unpack("N")[0]
|
204
205
|
elsif bytes.size == 4
|
205
206
|
bytes.unpack("N")[0]
|
207
|
+
elsif bytes.size == 8
|
208
|
+
bytes.unpack("Q>")[0]
|
206
209
|
else
|
207
|
-
raise UnsupportedFeatureError, "Unable to unpack xref stream entries
|
210
|
+
raise UnsupportedFeatureError, "Unable to unpack xref stream entries of #{bytes.size} bytes"
|
208
211
|
end
|
209
212
|
end
|
210
213
|
################################################################################
|
@@ -227,18 +230,21 @@ class PDF::Reader
|
|
227
230
|
# should always be 0, but all sort of crazy junk is prefixed to PDF files
|
228
231
|
# in the real world.
|
229
232
|
#
|
230
|
-
# Checks up to
|
233
|
+
# Checks up to 1024 chars into the file,
|
234
|
+
# returns nil if no PDF data detected.
|
235
|
+
# Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
|
236
|
+
# header appear somewhere within the first 1024 bytes of the file
|
231
237
|
#
|
232
238
|
def calc_junk_offset(io)
|
233
239
|
io.rewind
|
234
240
|
offset = io.pos
|
235
|
-
until (c = io.readchar) == '%' || c == 37 || offset >
|
241
|
+
until (c = io.readchar) == '%' || c == 37 || offset > 1024
|
236
242
|
offset += 1
|
237
243
|
end
|
238
244
|
io.rewind
|
239
|
-
offset <
|
245
|
+
offset < 1024 ? offset : nil
|
240
246
|
rescue EOFError
|
241
|
-
|
247
|
+
nil
|
242
248
|
end
|
243
249
|
end
|
244
250
|
################################################################################
|
metadata
CHANGED
@@ -1,29 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.0
|
4
|
+
version: 2.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-09-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ">="
|
17
|
+
- - "<"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
19
|
+
version: '13.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ">="
|
24
|
+
- - "<"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
26
|
+
version: '13.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rspec
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -67,7 +67,7 @@ dependencies:
|
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0.2'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: pry
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - ">="
|
@@ -167,7 +167,7 @@ dependencies:
|
|
167
167
|
description: The PDF::Reader library implements a PDF parser conforming as much as
|
168
168
|
possible to the PDF specification from Adobe
|
169
169
|
email:
|
170
|
-
-
|
170
|
+
- james@yob.id.au
|
171
171
|
executables:
|
172
172
|
- pdf_object
|
173
173
|
- pdf_text
|
@@ -199,7 +199,6 @@ files:
|
|
199
199
|
- examples/text.rb
|
200
200
|
- examples/version.rb
|
201
201
|
- lib/pdf-reader.rb
|
202
|
-
- lib/pdf/hash.rb
|
203
202
|
- lib/pdf/reader.rb
|
204
203
|
- lib/pdf/reader/afm/Courier-Bold.afm
|
205
204
|
- lib/pdf/reader/afm/Courier-BoldOblique.afm
|
@@ -209,6 +208,7 @@ files:
|
|
209
208
|
- lib/pdf/reader/afm/Helvetica-BoldOblique.afm
|
210
209
|
- lib/pdf/reader/afm/Helvetica-Oblique.afm
|
211
210
|
- lib/pdf/reader/afm/Helvetica.afm
|
211
|
+
- lib/pdf/reader/afm/MustRead.html
|
212
212
|
- lib/pdf/reader/afm/Symbol.afm
|
213
213
|
- lib/pdf/reader/afm/Times-Bold.afm
|
214
214
|
- lib/pdf/reader/afm/Times-BoldItalic.afm
|
@@ -246,6 +246,7 @@ files:
|
|
246
246
|
- lib/pdf/reader/object_hash.rb
|
247
247
|
- lib/pdf/reader/object_stream.rb
|
248
248
|
- lib/pdf/reader/orientation_detector.rb
|
249
|
+
- lib/pdf/reader/overlapping_runs_filter.rb
|
249
250
|
- lib/pdf/reader/page.rb
|
250
251
|
- lib/pdf/reader/page_layout.rb
|
251
252
|
- lib/pdf/reader/page_state.rb
|
@@ -271,10 +272,14 @@ files:
|
|
271
272
|
- lib/pdf/reader/width_calculator/type_one_or_three.rb
|
272
273
|
- lib/pdf/reader/width_calculator/type_zero.rb
|
273
274
|
- lib/pdf/reader/xref.rb
|
274
|
-
homepage:
|
275
|
+
homepage: https://github.com/yob/pdf-reader
|
275
276
|
licenses:
|
276
277
|
- MIT
|
277
|
-
metadata:
|
278
|
+
metadata:
|
279
|
+
bug_tracker_uri: https://github.com/yob/pdf-reader/issues
|
280
|
+
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.4.1/CHANGELOG
|
281
|
+
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.4.1
|
282
|
+
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.4.1
|
278
283
|
post_install_message:
|
279
284
|
rdoc_options:
|
280
285
|
- "--title"
|
@@ -295,8 +300,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
295
300
|
- !ruby/object:Gem::Version
|
296
301
|
version: '0'
|
297
302
|
requirements: []
|
298
|
-
|
299
|
-
rubygems_version: 2.7.3
|
303
|
+
rubygems_version: 3.0.3
|
300
304
|
signing_key:
|
301
305
|
specification_version: 4
|
302
306
|
summary: A library for accessing the content of PDF files
|