pdf-reader 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +7 -1
- data/README.rdoc +1 -0
- data/Rakefile +23 -8
- data/lib/pdf-reader.rb +3 -1
- data/lib/pdf/hash.rb +5 -1
- data/lib/pdf/reader.rb +8 -1
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier.afm +342 -0
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -0
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
- data/lib/pdf/reader/buffer.rb +14 -6
- data/lib/pdf/reader/cid_widths.rb +61 -0
- data/lib/pdf/reader/cmap.rb +8 -2
- data/lib/pdf/reader/encoding.rb +52 -27
- data/lib/pdf/reader/error.rb +16 -1
- data/lib/pdf/reader/filter.rb +2 -0
- data/lib/pdf/reader/filter/ascii85.rb +3 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +3 -1
- data/lib/pdf/reader/filter/depredict.rb +2 -0
- data/lib/pdf/reader/filter/flate.rb +3 -1
- data/lib/pdf/reader/filter/lzw.rb +1 -0
- data/lib/pdf/reader/filter/null.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +2 -1
- data/lib/pdf/reader/font.rb +74 -18
- data/lib/pdf/reader/font_descriptor.rb +80 -0
- data/lib/pdf/reader/glyph_hash.rb +6 -0
- data/lib/pdf/reader/lzw.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -1
- data/lib/pdf/reader/object_hash.rb +1 -1
- data/lib/pdf/reader/page_layout.rb +125 -0
- data/lib/pdf/reader/page_state.rb +172 -69
- data/lib/pdf/reader/page_text_receiver.rb +50 -21
- data/lib/pdf/reader/pages_strategy.rb +17 -4
- data/lib/pdf/reader/parser.rb +25 -52
- data/lib/pdf/reader/print_receiver.rb +5 -0
- data/lib/pdf/reader/reference.rb +2 -0
- data/lib/pdf/reader/register_receiver.rb +1 -1
- data/lib/pdf/reader/standard_security_handler.rb +2 -0
- data/lib/pdf/reader/stream.rb +2 -0
- data/lib/pdf/reader/synchronized_cache.rb +32 -0
- data/lib/pdf/reader/text_receiver.rb +5 -4
- data/lib/pdf/reader/text_run.rb +80 -0
- data/lib/pdf/reader/token.rb +2 -0
- data/lib/pdf/reader/transformation_matrix.rb +194 -0
- data/lib/pdf/reader/width_calculator.rb +11 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +50 -0
- data/lib/pdf/reader/width_calculator/composite.rb +27 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +32 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +24 -0
- data/lib/pdf/reader/xref.rb +9 -2
- metadata +119 -13
data/lib/pdf/reader/error.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
1
3
|
################################################################################
|
2
4
|
#
|
3
5
|
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
@@ -21,7 +23,6 @@
|
|
21
23
|
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
24
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
25
|
#
|
24
|
-
|
25
26
|
class PDF::Reader
|
26
27
|
################################################################################
|
27
28
|
# An internal PDF::Reader class that helps to verify various parts of the PDF file
|
@@ -45,10 +46,24 @@ class PDF::Reader
|
|
45
46
|
end
|
46
47
|
################################################################################
|
47
48
|
end
|
49
|
+
|
48
50
|
################################################################################
|
51
|
+
# an exception that is raised when we believe the current PDF is not following
|
52
|
+
# the PDF spec and cannot be recovered
|
49
53
|
class MalformedPDFError < RuntimeError; end
|
54
|
+
|
55
|
+
################################################################################
|
56
|
+
# an exception that is raised when a PDF object appears to be invalid
|
50
57
|
class InvalidObjectError < MalformedPDFError; end
|
58
|
+
|
59
|
+
################################################################################
|
60
|
+
# an exception that is raised when a PDF follows the specs but uses a feature
|
61
|
+
# that we don't support just yet
|
51
62
|
class UnsupportedFeatureError < RuntimeError; end
|
63
|
+
|
64
|
+
################################################################################
|
65
|
+
# an exception that is raised when a PDF is encrypted and we don't have the
|
66
|
+
# necessary data to decrypt it
|
52
67
|
class EncryptedPDFError < UnsupportedFeatureError; end
|
53
68
|
end
|
54
69
|
################################################################################
|
data/lib/pdf/reader/filter.rb
CHANGED
@@ -4,6 +4,7 @@ require 'ascii85'
|
|
4
4
|
|
5
5
|
class PDF::Reader
|
6
6
|
module Filter # :nodoc:
|
7
|
+
# implementation of the Ascii85 filter
|
7
8
|
class Ascii85
|
8
9
|
def initialize(options = {})
|
9
10
|
@options = options
|
@@ -18,7 +19,8 @@ class PDF::Reader
|
|
18
19
|
::Ascii85::decode(data)
|
19
20
|
rescue Exception => e
|
20
21
|
# Oops, there was a problem decoding the stream
|
21
|
-
raise MalformedPDFError,
|
22
|
+
raise MalformedPDFError,
|
23
|
+
"Error occured while decoding an ASCII85 stream (#{e.class.to_s}: #{e.to_s})"
|
22
24
|
end
|
23
25
|
end
|
24
26
|
end
|
@@ -2,6 +2,7 @@
|
|
2
2
|
#
|
3
3
|
class PDF::Reader
|
4
4
|
module Filter # :nodoc:
|
5
|
+
# implementation of the AsciiHex stream filter
|
5
6
|
class AsciiHex
|
6
7
|
def initialize(options = {})
|
7
8
|
@options = options
|
@@ -18,7 +19,8 @@ class PDF::Reader
|
|
18
19
|
data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
|
19
20
|
rescue Exception => e
|
20
21
|
# Oops, there was a problem decoding the stream
|
21
|
-
raise MalformedPDFError,
|
22
|
+
raise MalformedPDFError,
|
23
|
+
"Error occured while decoding an ASCIIHex stream (#{e.class.to_s}: #{e.to_s})"
|
22
24
|
end
|
23
25
|
end
|
24
26
|
end
|
@@ -5,6 +5,7 @@ require 'zlib'
|
|
5
5
|
|
6
6
|
class PDF::Reader
|
7
7
|
module Filter # :nodoc:
|
8
|
+
# implementation of the Flate (zlib) stream filter
|
8
9
|
class Flate
|
9
10
|
def initialize(options = {})
|
10
11
|
@options = options
|
@@ -30,7 +31,8 @@ class PDF::Reader
|
|
30
31
|
Depredict.new(@options).filter(deflated)
|
31
32
|
rescue Exception => e
|
32
33
|
# Oops, there was a problem inflating the stream
|
33
|
-
raise MalformedPDFError,
|
34
|
+
raise MalformedPDFError,
|
35
|
+
"Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
|
34
36
|
end
|
35
37
|
end
|
36
38
|
end
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
1
3
|
################################################################################
|
2
4
|
#
|
3
5
|
# Copyright (C) 2008 James Healy (jimmy@deefa.com)
|
@@ -23,11 +25,16 @@
|
|
23
25
|
#
|
24
26
|
################################################################################
|
25
27
|
|
28
|
+
require 'pdf/reader/width_calculator'
|
29
|
+
|
26
30
|
class PDF::Reader
|
31
|
+
# Represents a single font PDF object and provides some useful methods
|
32
|
+
# for extracting info. Mainly used for converting text to UTF-8.
|
33
|
+
#
|
27
34
|
class Font
|
28
|
-
attr_accessor :
|
29
|
-
attr_reader :widths, :first_char, :
|
30
|
-
|
35
|
+
attr_accessor :subtype, :encoding, :descendantfonts, :tounicode
|
36
|
+
attr_reader :widths, :first_char, :last_char, :basefont, :font_descriptor,
|
37
|
+
:cid_widths, :cid_default_width
|
31
38
|
|
32
39
|
def initialize(ohash = nil, obj = nil)
|
33
40
|
if ohash.nil? || obj.nil?
|
@@ -40,6 +47,7 @@ class PDF::Reader
|
|
40
47
|
extract_base_info(obj)
|
41
48
|
extract_descriptor(obj)
|
42
49
|
extract_descendants(obj)
|
50
|
+
@width_calc = build_width_calculator
|
43
51
|
|
44
52
|
@encoding ||= PDF::Reader::Encoding.new(:StandardEncoding)
|
45
53
|
end
|
@@ -66,39 +74,79 @@ class PDF::Reader
|
|
66
74
|
end
|
67
75
|
end
|
68
76
|
|
69
|
-
def
|
70
|
-
|
71
|
-
|
72
|
-
|
77
|
+
def unpack(data)
|
78
|
+
data.unpack(encoding.unpack)
|
79
|
+
end
|
80
|
+
|
81
|
+
# looks up the specified codepoint and returns a value that is in (pdf)
|
82
|
+
# glyph space, which is 1000 glyph units = 1 text space unit
|
83
|
+
def glyph_width(code_point)
|
84
|
+
if code_point.is_a?(String)
|
85
|
+
code_point = code_point.unpack(encoding.unpack).first
|
86
|
+
end
|
87
|
+
|
88
|
+
@cached_widths ||= {}
|
89
|
+
@cached_widths[code_point] ||= @width_calc.glyph_width(code_point)
|
73
90
|
end
|
74
91
|
|
75
92
|
private
|
76
93
|
|
94
|
+
def build_width_calculator
|
95
|
+
if @subtype == :Type0
|
96
|
+
PDF::Reader::WidthCalculator::TypeZero.new(self)
|
97
|
+
elsif @subtype == :Type1
|
98
|
+
if @font_descriptor.nil?
|
99
|
+
PDF::Reader::WidthCalculator::BuiltIn.new(self)
|
100
|
+
else
|
101
|
+
PDF::Reader::WidthCalculator::TypeOneOrThree .new(self)
|
102
|
+
end
|
103
|
+
elsif @subtype == :Type3
|
104
|
+
PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
|
105
|
+
elsif @subtype == :TrueType
|
106
|
+
PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
|
107
|
+
elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
|
108
|
+
PDF::Reader::WidthCalculator::Composite.new(self)
|
109
|
+
else
|
110
|
+
PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
77
114
|
def extract_base_info(obj)
|
78
115
|
@subtype = @ohash.object(obj[:Subtype])
|
79
116
|
@basefont = @ohash.object(obj[:BaseFont])
|
80
117
|
@encoding = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
|
81
118
|
@widths = @ohash.object(obj[:Widths]) || []
|
82
119
|
@first_char = @ohash.object(obj[:FirstChar])
|
120
|
+
@last_char = @ohash.object(obj[:LastChar])
|
121
|
+
|
122
|
+
# CID Fonts are not required to have a W or DW entry, if they don't exist,
|
123
|
+
# the default cid width = 1000, see Section 9.7.4.1 PDF 32000-1:2008 pp 269
|
124
|
+
@cid_widths = @ohash.object(obj[:W]) || []
|
125
|
+
@cid_default_width = @ohash.object(obj[:DW]) || 1000
|
126
|
+
|
83
127
|
if obj[:ToUnicode]
|
128
|
+
# ToUnicode is optional for Type1 and Type3
|
84
129
|
stream = @ohash.object(obj[:ToUnicode])
|
85
130
|
@tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
86
131
|
end
|
87
132
|
end
|
88
133
|
|
89
134
|
def extract_descriptor(obj)
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
135
|
+
if obj[:FontDescriptor]
|
136
|
+
# create a font descriptor object if we can, in other words, unless this is
|
137
|
+
# a CID Font
|
138
|
+
fd = @ohash.object(obj[:FontDescriptor])
|
139
|
+
@font_descriptor = PDF::Reader::FontDescriptor.new(@ohash, fd)
|
140
|
+
else
|
141
|
+
@font_descriptor = nil
|
142
|
+
end
|
97
143
|
end
|
98
144
|
|
99
145
|
def extract_descendants(obj)
|
100
146
|
return unless obj[:DescendantFonts]
|
101
|
-
|
147
|
+
# per PDF 32000-1:2008 pp. 280 :DescendantFonts is:
|
148
|
+
# A one-element array specifying the CIDFont dictionary that is the
|
149
|
+
# descendant of this Type 0 font.
|
102
150
|
descendants = @ohash.object(obj[:DescendantFonts])
|
103
151
|
@descendantfonts = descendants.map { |desc|
|
104
152
|
PDF::Reader::Font.new(@ohash, @ohash.object(desc))
|
@@ -106,7 +154,11 @@ class PDF::Reader
|
|
106
154
|
end
|
107
155
|
|
108
156
|
def to_utf8_via_cmap(params)
|
109
|
-
if params.class ==
|
157
|
+
if params.class == Fixnum
|
158
|
+
[
|
159
|
+
@tounicode.decode(params) || PDF::Reader::Encoding::UNKNOWN_CHAR
|
160
|
+
].flatten.pack("U*")
|
161
|
+
elsif params.class == String
|
110
162
|
params.unpack(encoding.unpack).map { |c|
|
111
163
|
@tounicode.decode(c) || PDF::Reader::Encoding::UNKNOWN_CHAR
|
112
164
|
}.flatten.pack("U*")
|
@@ -118,9 +170,13 @@ class PDF::Reader
|
|
118
170
|
end
|
119
171
|
|
120
172
|
def to_utf8_via_encoding(params)
|
121
|
-
|
173
|
+
if encoding.kind_of?(String)
|
174
|
+
raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported"
|
175
|
+
end
|
122
176
|
|
123
|
-
if params.class ==
|
177
|
+
if params.class == Fixnum
|
178
|
+
encoding.int_to_utf8_string(params)
|
179
|
+
elsif params.class == String
|
124
180
|
encoding.to_utf8(params)
|
125
181
|
elsif params.class == Array
|
126
182
|
params.collect { |param| to_utf8_via_encoding(param) }
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require 'ttfunk'
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
|
7
|
+
# Font descriptors are outlined in Section 9.8, PDF 32000-1:2008, pp 281-288
|
8
|
+
class FontDescriptor
|
9
|
+
|
10
|
+
attr_reader :font_name, :font_family, :font_stretch, :font_weight,
|
11
|
+
:font_bounding_box, :cap_height, :ascent, :descent, :leading,
|
12
|
+
:avg_width, :max_width, :missing_width, :italic_angle, :stem_v,
|
13
|
+
:x_height, :font_flags
|
14
|
+
|
15
|
+
def initialize(ohash, fd_hash)
|
16
|
+
@ascent = ohash.object(fd_hash[:Ascent]) || 0
|
17
|
+
@descent = ohash.object(fd_hash[:Descent]) || 0
|
18
|
+
@missing_width = ohash.object(fd_hash[:MissingWidth]) || 0
|
19
|
+
@font_bounding_box = ohash.object(fd_hash[:FontBBox]) || [0,0,0,0]
|
20
|
+
@avg_width = ohash.object(fd_hash[:AvgWidth]) || 0
|
21
|
+
@cap_height = ohash.object(fd_hash[:CapHeight]) || 0
|
22
|
+
@font_flags = ohash.object(fd_hash[:Flags]) || 0
|
23
|
+
@italic_angle = ohash.object(fd_hash[:ItalicAngle])
|
24
|
+
@font_name = ohash.object(fd_hash[:FontName]).to_s
|
25
|
+
@leading = ohash.object(fd_hash[:Leading]) || 0
|
26
|
+
@max_width = ohash.object(fd_hash[:MaxWidth]) || 0
|
27
|
+
@stem_v = ohash.object(fd_hash[:StemV])
|
28
|
+
@x_height = ohash.object(fd_hash[:XHeight])
|
29
|
+
@font_stretch = ohash.object(fd_hash[:FontStretch]) || :Normal
|
30
|
+
@font_weight = ohash.object(fd_hash[:FontWeight]) || 400
|
31
|
+
@font_family = ohash.object(fd_hash[:FontFamily])
|
32
|
+
|
33
|
+
# A FontDescriptor may have an embedded font program in FontFile
|
34
|
+
# (Type 1 Font Program), FontFile2 (TrueType font program), or
|
35
|
+
# FontFile3 (Other font program as defined by Subtype entry)
|
36
|
+
# Subtype entries:
|
37
|
+
# 1) Type1C: Type 1 Font Program in Compact Font Format
|
38
|
+
# 2) CIDFontType0C: Type 0 Font Program in Compact Font Format
|
39
|
+
# 3) OpenType: OpenType Font Program
|
40
|
+
# see Section 9.9, PDF 32000-1:2008, pp 288-292
|
41
|
+
@font_program_stream = ohash.object(fd_hash[:FontFile2])
|
42
|
+
# TODO: handle FontFile and FontFile3
|
43
|
+
|
44
|
+
@is_ttf = true if @font_program_stream
|
45
|
+
end
|
46
|
+
|
47
|
+
def glyph_width(char_code)
|
48
|
+
if @is_ttf
|
49
|
+
if ttf_program_stream.cmap.unicode.length > 0
|
50
|
+
glyph_id = ttf_program_stream.cmap.unicode.first[char_code]
|
51
|
+
else
|
52
|
+
glyph_id = char_code
|
53
|
+
end
|
54
|
+
char_metric = ttf_program_stream.horizontal_metrics.metrics[glyph_id]
|
55
|
+
if char_metric
|
56
|
+
puts "Char Code: #{char_code} -- Advance Width: #{char_metric.advance_width}" > 0
|
57
|
+
return char_metric.advance_width
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
# PDF states that a glyph is 1000 units wide, true type doesn't enforce
|
63
|
+
# any behavior, but uses units/em to define how wide the 'M' is (the widest letter)
|
64
|
+
def glyph_to_pdf_scale_factor
|
65
|
+
if @is_ttf
|
66
|
+
@glyph_to_pdf_sf ||= (1.0 / ttf_program_stream.header.units_per_em) * 1000.0
|
67
|
+
else
|
68
|
+
@glyph_to_pdf_sf ||= 1.0
|
69
|
+
end
|
70
|
+
@glyph_to_pdf_sf
|
71
|
+
end
|
72
|
+
|
73
|
+
private
|
74
|
+
|
75
|
+
def ttf_program_stream
|
76
|
+
@ttf_program_stream ||= TTFunk::File.new(@font_program_stream.unfiltered_data)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
1
3
|
################################################################################
|
2
4
|
#
|
3
5
|
# Copyright (C) 2011 James Healy (jimmy@deefa.com)
|
@@ -24,6 +26,9 @@
|
|
24
26
|
################################################################################
|
25
27
|
|
26
28
|
class PDF::Reader
|
29
|
+
# A Hash-like object that can convert glyph names into a unicode codepoint.
|
30
|
+
# The mapping is read from a data file on disk the first time it's needed.
|
31
|
+
#
|
27
32
|
class GlyphHash # :nodoc:
|
28
33
|
def initialize
|
29
34
|
# only parse the glyph list once, and cache the results (for performance)
|
@@ -45,6 +50,7 @@ class PDF::Reader
|
|
45
50
|
# => 48
|
46
51
|
#
|
47
52
|
# h[:34]
|
53
|
+
# => 34
|
48
54
|
#
|
49
55
|
def [](name)
|
50
56
|
return nil unless name.is_a?(Symbol)
|
data/lib/pdf/reader/lzw.rb
CHANGED
@@ -41,8 +41,8 @@ class PDF::Reader
|
|
41
41
|
#
|
42
42
|
def initialize(input, opts = {})
|
43
43
|
@io = extract_io_from(input)
|
44
|
-
@pdf_version = read_version
|
45
44
|
@xref = PDF::Reader::XRef.new(@io)
|
45
|
+
@pdf_version = read_version
|
46
46
|
@trailer = @xref.trailer
|
47
47
|
@cache = opts[:cache] || PDF::Reader::ObjectCache.new
|
48
48
|
@sec_handler = build_security_handler(opts)
|
@@ -0,0 +1,125 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
|
5
|
+
# Takes a collection of TextRun objects and renders them into a single
|
6
|
+
# string that best approximates the way they'd appear on a rendered PDF page.
|
7
|
+
#
|
8
|
+
# media box should be a 4 number array that describes the dimensions of the
|
9
|
+
# page to be rendered as described by the page's MediaBox attribute
|
10
|
+
class PageLayout
|
11
|
+
def initialize(runs, mediabox)
|
12
|
+
@runs = merge_runs(runs)
|
13
|
+
@mean_font_size = mean(@runs.map(&:font_size)) || 0
|
14
|
+
@mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
|
15
|
+
@page_width = mediabox[2] - mediabox[0]
|
16
|
+
@page_height = mediabox[3] - mediabox[1]
|
17
|
+
@x_offset = @runs.map(&:x).sort.first
|
18
|
+
@current_platform_is_rbx_19 = RUBY_DESCRIPTION =~ /\Arubinius 2.0.0/ &&
|
19
|
+
RUBY_VERSION >= "1.9.0"
|
20
|
+
end
|
21
|
+
|
22
|
+
def to_s
|
23
|
+
return "" if @runs.empty?
|
24
|
+
|
25
|
+
page = row_count.times.map { |i| " " * col_count }
|
26
|
+
@runs.each do |run|
|
27
|
+
x_pos = ((run.x - @x_offset) / col_multiplier).round
|
28
|
+
y_pos = row_count - (run.y / row_multiplier).round
|
29
|
+
if y_pos < row_count && y_pos >= 0 && x_pos < col_count && x_pos >= 0
|
30
|
+
local_string_insert(page[y_pos], run.text, x_pos)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
interesting_rows(page).map(&:rstrip).join("\n")
|
34
|
+
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
# given an array of strings, return a new array with empty rows from the
|
39
|
+
# beginning and end removed.
|
40
|
+
#
|
41
|
+
# interesting_rows([ "", "one", "two", "" ])
|
42
|
+
# => [ "one", "two" ]
|
43
|
+
#
|
44
|
+
def interesting_rows(rows)
|
45
|
+
line_lengths = rows.map { |l| l.strip.length }
|
46
|
+
first_line_with_text = line_lengths.index { |l| l > 0 }
|
47
|
+
last_line_with_text = line_lengths.size - line_lengths.reverse.index { |l| l > 0 }
|
48
|
+
interesting_line_count = last_line_with_text - first_line_with_text
|
49
|
+
rows[first_line_with_text, interesting_line_count].map
|
50
|
+
end
|
51
|
+
|
52
|
+
def row_count
|
53
|
+
@row_count ||= (@page_height / @mean_font_size).floor
|
54
|
+
end
|
55
|
+
|
56
|
+
def col_count
|
57
|
+
@col_count ||= ((@page_width / @mean_glyph_width) * 1.05).floor
|
58
|
+
end
|
59
|
+
|
60
|
+
def row_multiplier
|
61
|
+
@row_multiplier ||= @page_height / row_count
|
62
|
+
end
|
63
|
+
|
64
|
+
def col_multiplier
|
65
|
+
@col_multiplier ||= @page_width / col_count
|
66
|
+
end
|
67
|
+
|
68
|
+
def mean(collection)
|
69
|
+
if collection.size == 0
|
70
|
+
0
|
71
|
+
else
|
72
|
+
collection.inject(0) { |accum, v| accum + v} / collection.size.to_f
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
def each_line(&block)
|
77
|
+
@runs.sort.group_by { |run|
|
78
|
+
run.y.to_i
|
79
|
+
}.map { |y, collection|
|
80
|
+
yield y, collection
|
81
|
+
}
|
82
|
+
end
|
83
|
+
|
84
|
+
# take a collection of TextRun objects and merge any that are in close
|
85
|
+
# proximity
|
86
|
+
def merge_runs(runs)
|
87
|
+
runs.group_by { |char|
|
88
|
+
char.y.to_i
|
89
|
+
}.map { |y, chars|
|
90
|
+
group_chars_into_runs(chars.sort)
|
91
|
+
}.flatten.sort
|
92
|
+
end
|
93
|
+
|
94
|
+
def group_chars_into_runs(chars)
|
95
|
+
runs = []
|
96
|
+
while head = chars.shift
|
97
|
+
if runs.empty?
|
98
|
+
runs << head
|
99
|
+
elsif runs.last.mergable?(head)
|
100
|
+
runs[-1] = runs.last + head
|
101
|
+
else
|
102
|
+
runs << head
|
103
|
+
end
|
104
|
+
end
|
105
|
+
runs
|
106
|
+
end
|
107
|
+
|
108
|
+
# This is a simple alternative to String#[]=. We can't use the string
|
109
|
+
# method as it's buggy on rubinius 2.0rc1 (in 1.9 mode)
|
110
|
+
#
|
111
|
+
# See my bug report at https://github.com/rubinius/rubinius/issues/1985
|
112
|
+
def local_string_insert(haystack, needle, index)
|
113
|
+
if @current_platform_is_rbx_19
|
114
|
+
char_count = needle.length
|
115
|
+
haystack.replace(
|
116
|
+
(haystack[0,index] || "") +
|
117
|
+
needle +
|
118
|
+
(haystack[index+char_count,500] || "")
|
119
|
+
)
|
120
|
+
else
|
121
|
+
haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|