pdf-reader 1.2.0 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +7 -1
- data/README.rdoc +1 -0
- data/Rakefile +23 -8
- data/lib/pdf-reader.rb +3 -1
- data/lib/pdf/hash.rb +5 -1
- data/lib/pdf/reader.rb +8 -1
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier.afm +342 -0
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -0
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
- data/lib/pdf/reader/buffer.rb +14 -6
- data/lib/pdf/reader/cid_widths.rb +61 -0
- data/lib/pdf/reader/cmap.rb +8 -2
- data/lib/pdf/reader/encoding.rb +52 -27
- data/lib/pdf/reader/error.rb +16 -1
- data/lib/pdf/reader/filter.rb +2 -0
- data/lib/pdf/reader/filter/ascii85.rb +3 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +3 -1
- data/lib/pdf/reader/filter/depredict.rb +2 -0
- data/lib/pdf/reader/filter/flate.rb +3 -1
- data/lib/pdf/reader/filter/lzw.rb +1 -0
- data/lib/pdf/reader/filter/null.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +2 -1
- data/lib/pdf/reader/font.rb +74 -18
- data/lib/pdf/reader/font_descriptor.rb +80 -0
- data/lib/pdf/reader/glyph_hash.rb +6 -0
- data/lib/pdf/reader/lzw.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -1
- data/lib/pdf/reader/object_hash.rb +1 -1
- data/lib/pdf/reader/page_layout.rb +125 -0
- data/lib/pdf/reader/page_state.rb +172 -69
- data/lib/pdf/reader/page_text_receiver.rb +50 -21
- data/lib/pdf/reader/pages_strategy.rb +17 -4
- data/lib/pdf/reader/parser.rb +25 -52
- data/lib/pdf/reader/print_receiver.rb +5 -0
- data/lib/pdf/reader/reference.rb +2 -0
- data/lib/pdf/reader/register_receiver.rb +1 -1
- data/lib/pdf/reader/standard_security_handler.rb +2 -0
- data/lib/pdf/reader/stream.rb +2 -0
- data/lib/pdf/reader/synchronized_cache.rb +32 -0
- data/lib/pdf/reader/text_receiver.rb +5 -4
- data/lib/pdf/reader/text_run.rb +80 -0
- data/lib/pdf/reader/token.rb +2 -0
- data/lib/pdf/reader/transformation_matrix.rb +194 -0
- data/lib/pdf/reader/width_calculator.rb +11 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +50 -0
- data/lib/pdf/reader/width_calculator/composite.rb +27 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +32 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +24 -0
- data/lib/pdf/reader/xref.rb +9 -2
- metadata +119 -13
data/lib/pdf/reader/error.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
1
3
|
################################################################################
|
2
4
|
#
|
3
5
|
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
@@ -21,7 +23,6 @@
|
|
21
23
|
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
24
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
25
|
#
|
24
|
-
|
25
26
|
class PDF::Reader
|
26
27
|
################################################################################
|
27
28
|
# An internal PDF::Reader class that helps to verify various parts of the PDF file
|
@@ -45,10 +46,24 @@ class PDF::Reader
|
|
45
46
|
end
|
46
47
|
################################################################################
|
47
48
|
end
|
49
|
+
|
48
50
|
################################################################################
|
51
|
+
# an exception that is raised when we believe the current PDF is not following
|
52
|
+
# the PDF spec and cannot be recovered
|
49
53
|
class MalformedPDFError < RuntimeError; end
|
54
|
+
|
55
|
+
################################################################################
|
56
|
+
# an exception that is raised when a PDF object appears to be invalid
|
50
57
|
class InvalidObjectError < MalformedPDFError; end
|
58
|
+
|
59
|
+
################################################################################
|
60
|
+
# an exception that is raised when a PDF follows the specs but uses a feature
|
61
|
+
# that we don't support just yet
|
51
62
|
class UnsupportedFeatureError < RuntimeError; end
|
63
|
+
|
64
|
+
################################################################################
|
65
|
+
# an exception that is raised when a PDF is encrypted and we don't have the
|
66
|
+
# necessary data to decrypt it
|
52
67
|
class EncryptedPDFError < UnsupportedFeatureError; end
|
53
68
|
end
|
54
69
|
################################################################################
|
data/lib/pdf/reader/filter.rb
CHANGED
@@ -4,6 +4,7 @@ require 'ascii85'
|
|
4
4
|
|
5
5
|
class PDF::Reader
|
6
6
|
module Filter # :nodoc:
|
7
|
+
# implementation of the Ascii85 filter
|
7
8
|
class Ascii85
|
8
9
|
def initialize(options = {})
|
9
10
|
@options = options
|
@@ -18,7 +19,8 @@ class PDF::Reader
|
|
18
19
|
::Ascii85::decode(data)
|
19
20
|
rescue Exception => e
|
20
21
|
# Oops, there was a problem decoding the stream
|
21
|
-
raise MalformedPDFError,
|
22
|
+
raise MalformedPDFError,
|
23
|
+
"Error occured while decoding an ASCII85 stream (#{e.class.to_s}: #{e.to_s})"
|
22
24
|
end
|
23
25
|
end
|
24
26
|
end
|
@@ -2,6 +2,7 @@
|
|
2
2
|
#
|
3
3
|
class PDF::Reader
|
4
4
|
module Filter # :nodoc:
|
5
|
+
# implementation of the AsciiHex stream filter
|
5
6
|
class AsciiHex
|
6
7
|
def initialize(options = {})
|
7
8
|
@options = options
|
@@ -18,7 +19,8 @@ class PDF::Reader
|
|
18
19
|
data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
|
19
20
|
rescue Exception => e
|
20
21
|
# Oops, there was a problem decoding the stream
|
21
|
-
raise MalformedPDFError,
|
22
|
+
raise MalformedPDFError,
|
23
|
+
"Error occured while decoding an ASCIIHex stream (#{e.class.to_s}: #{e.to_s})"
|
22
24
|
end
|
23
25
|
end
|
24
26
|
end
|
@@ -5,6 +5,7 @@ require 'zlib'
|
|
5
5
|
|
6
6
|
class PDF::Reader
|
7
7
|
module Filter # :nodoc:
|
8
|
+
# implementation of the Flate (zlib) stream filter
|
8
9
|
class Flate
|
9
10
|
def initialize(options = {})
|
10
11
|
@options = options
|
@@ -30,7 +31,8 @@ class PDF::Reader
|
|
30
31
|
Depredict.new(@options).filter(deflated)
|
31
32
|
rescue Exception => e
|
32
33
|
# Oops, there was a problem inflating the stream
|
33
|
-
raise MalformedPDFError,
|
34
|
+
raise MalformedPDFError,
|
35
|
+
"Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
|
34
36
|
end
|
35
37
|
end
|
36
38
|
end
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
1
3
|
################################################################################
|
2
4
|
#
|
3
5
|
# Copyright (C) 2008 James Healy (jimmy@deefa.com)
|
@@ -23,11 +25,16 @@
|
|
23
25
|
#
|
24
26
|
################################################################################
|
25
27
|
|
28
|
+
require 'pdf/reader/width_calculator'
|
29
|
+
|
26
30
|
class PDF::Reader
|
31
|
+
# Represents a single font PDF object and provides some useful methods
|
32
|
+
# for extracting info. Mainly used for converting text to UTF-8.
|
33
|
+
#
|
27
34
|
class Font
|
28
|
-
attr_accessor :
|
29
|
-
attr_reader :widths, :first_char, :
|
30
|
-
|
35
|
+
attr_accessor :subtype, :encoding, :descendantfonts, :tounicode
|
36
|
+
attr_reader :widths, :first_char, :last_char, :basefont, :font_descriptor,
|
37
|
+
:cid_widths, :cid_default_width
|
31
38
|
|
32
39
|
def initialize(ohash = nil, obj = nil)
|
33
40
|
if ohash.nil? || obj.nil?
|
@@ -40,6 +47,7 @@ class PDF::Reader
|
|
40
47
|
extract_base_info(obj)
|
41
48
|
extract_descriptor(obj)
|
42
49
|
extract_descendants(obj)
|
50
|
+
@width_calc = build_width_calculator
|
43
51
|
|
44
52
|
@encoding ||= PDF::Reader::Encoding.new(:StandardEncoding)
|
45
53
|
end
|
@@ -66,39 +74,79 @@ class PDF::Reader
|
|
66
74
|
end
|
67
75
|
end
|
68
76
|
|
69
|
-
def
|
70
|
-
|
71
|
-
|
72
|
-
|
77
|
+
# Splits a raw string into an array of character codes, using the unpack
# template supplied by this font's encoding object.
def unpack(data)
  template = encoding.unpack
  data.unpack(template)
end
|
80
|
+
|
81
|
+
# Looks up the width of a single code point. The result is in (pdf) glyph
# space, where 1000 glyph units = 1 text space unit.
#
# code_point may be an Integer, or a String - in which case it is unpacked
# with the encoding's template and only the first code is used.
#
# Truthy results are remembered per code point so the width calculator is
# only consulted once for each; a nil/false result is looked up again on
# the next call.
def glyph_width(code_point)
  code_point = code_point.unpack(encoding.unpack).first if code_point.is_a?(String)

  @cached_widths ||= {}
  remembered = @cached_widths[code_point]
  return remembered if remembered

  @cached_widths[code_point] = @width_calc.glyph_width(code_point)
end
|
74
91
|
|
75
92
|
private
|
76
93
|
|
94
|
+
# Chooses the width calculator for this font based on @subtype:
#
#   :Type0                        => TypeZero
#   :Type1 without a descriptor   => BuiltIn
#   :CIDFontType0, :CIDFontType2  => Composite
#   everything else               => TypeOneOrThree
#
# "everything else" covers :Type1 with a font descriptor, :Type3, :TrueType
# and any subtype that isn't recognised.
def build_width_calculator
  case @subtype
  when :Type0
    PDF::Reader::WidthCalculator::TypeZero.new(self)
  when :Type1
    if @font_descriptor.nil?
      PDF::Reader::WidthCalculator::BuiltIn.new(self)
    else
      PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
    end
  when :CIDFontType0, :CIDFontType2
    PDF::Reader::WidthCalculator::Composite.new(self)
  else
    PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
  end
end
|
113
|
+
|
77
114
|
# Copies the basic entries of the font dictionary (obj) into instance
# variables. Every value is resolved through @ohash.object first.
def extract_base_info(obj)
  @subtype    = @ohash.object(obj[:Subtype])
  @basefont   = @ohash.object(obj[:BaseFont])

  encoding_source = @ohash.object(obj[:Encoding])
  @encoding   = PDF::Reader::Encoding.new(encoding_source)

  # a missing Widths entry is treated as an empty list
  @widths     = @ohash.object(obj[:Widths]) || []
  @first_char = @ohash.object(obj[:FirstChar])
  @last_char  = @ohash.object(obj[:LastChar])

  # CID Fonts are not required to have a W or DW entry, if they don't exist,
  # the default cid width = 1000, see Section 9.7.4.1 PDF 32000-1:2008 pp 269
  @cid_widths        = @ohash.object(obj[:W])  || []
  @cid_default_width = @ohash.object(obj[:DW]) || 1000

  # ToUnicode is optional for Type1 and Type3
  return unless obj[:ToUnicode]

  cmap_stream = @ohash.object(obj[:ToUnicode])
  @tounicode = PDF::Reader::CMap.new(cmap_stream.unfiltered_data)
end
|
88
133
|
|
89
134
|
# Builds @font_descriptor from the FontDescriptor entry of the font
# dictionary (obj). When the entry is absent @font_descriptor is set to
# nil; the original comment notes this is the case for CID Fonts.
def extract_descriptor(obj)
  return @font_descriptor = nil unless obj[:FontDescriptor]

  # create a font descriptor object if we can
  descriptor_hash = @ohash.object(obj[:FontDescriptor])
  @font_descriptor = PDF::Reader::FontDescriptor.new(@ohash, descriptor_hash)
end
|
98
144
|
|
99
145
|
def extract_descendants(obj)
|
100
146
|
return unless obj[:DescendantFonts]
|
101
|
-
|
147
|
+
# per PDF 32000-1:2008 pp. 280 :DescendantFonts is:
|
148
|
+
# A one-element array specifying the CIDFont dictionary that is the
|
149
|
+
# descendant of this Type 0 font.
|
102
150
|
descendants = @ohash.object(obj[:DescendantFonts])
|
103
151
|
@descendantfonts = descendants.map { |desc|
|
104
152
|
PDF::Reader::Font.new(@ohash, @ohash.object(desc))
|
@@ -106,7 +154,11 @@ class PDF::Reader
|
|
106
154
|
end
|
107
155
|
|
108
156
|
def to_utf8_via_cmap(params)
|
109
|
-
if params.class ==
|
157
|
+
if params.class == Fixnum
|
158
|
+
[
|
159
|
+
@tounicode.decode(params) || PDF::Reader::Encoding::UNKNOWN_CHAR
|
160
|
+
].flatten.pack("U*")
|
161
|
+
elsif params.class == String
|
110
162
|
params.unpack(encoding.unpack).map { |c|
|
111
163
|
@tounicode.decode(c) || PDF::Reader::Encoding::UNKNOWN_CHAR
|
112
164
|
}.flatten.pack("U*")
|
@@ -118,9 +170,13 @@ class PDF::Reader
|
|
118
170
|
end
|
119
171
|
|
120
172
|
def to_utf8_via_encoding(params)
|
121
|
-
|
173
|
+
if encoding.kind_of?(String)
|
174
|
+
raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported"
|
175
|
+
end
|
122
176
|
|
123
|
-
if params.class ==
|
177
|
+
if params.class == Fixnum
|
178
|
+
encoding.int_to_utf8_string(params)
|
179
|
+
elsif params.class == String
|
124
180
|
encoding.to_utf8(params)
|
125
181
|
elsif params.class == Array
|
126
182
|
params.collect { |param| to_utf8_via_encoding(param) }
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require 'ttfunk'
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
|
7
|
+
# Font descriptors are outlined in Section 9.8, PDF 32000-1:2008, pp 281-288
#
# Wraps a font descriptor dictionary. Each entry is resolved through the
# supplied object hash and exposed through a reader. When the descriptor has
# a FontFile2 entry, glyph metrics are read from that embedded TrueType
# font program via TTFunk.
class FontDescriptor

  attr_reader :font_name, :font_family, :font_stretch, :font_weight,
              :font_bounding_box, :cap_height, :ascent, :descent, :leading,
              :avg_width, :max_width, :missing_width, :italic_angle, :stem_v,
              :x_height, :font_flags

  # ohash   - an object that responds to #object(ref) and returns the
  #           resolved value for a reference
  # fd_hash - the font descriptor dictionary, keyed by symbol
  #
  # Entries that are missing fall back to the defaults below; ItalicAngle,
  # StemV, XHeight and FontFamily have no default and stay nil.
  def initialize(ohash, fd_hash)
    @ascent                = ohash.object(fd_hash[:Ascent])    || 0
    @descent               = ohash.object(fd_hash[:Descent])   || 0
    @missing_width         = ohash.object(fd_hash[:MissingWidth]) || 0
    @font_bounding_box     = ohash.object(fd_hash[:FontBBox])  || [0,0,0,0]
    @avg_width             = ohash.object(fd_hash[:AvgWidth])  || 0
    @cap_height            = ohash.object(fd_hash[:CapHeight]) || 0
    @font_flags            = ohash.object(fd_hash[:Flags])     || 0
    @italic_angle          = ohash.object(fd_hash[:ItalicAngle])
    @font_name             = ohash.object(fd_hash[:FontName]).to_s
    @leading               = ohash.object(fd_hash[:Leading])   || 0
    @max_width             = ohash.object(fd_hash[:MaxWidth])  || 0
    @stem_v                = ohash.object(fd_hash[:StemV])
    @x_height              = ohash.object(fd_hash[:XHeight])
    @font_stretch          = ohash.object(fd_hash[:FontStretch]) || :Normal
    @font_weight           = ohash.object(fd_hash[:FontWeight])  || 400
    @font_family           = ohash.object(fd_hash[:FontFamily])

    # A FontDescriptor may have an embedded font program in FontFile
    # (Type 1 Font Program), FontFile2 (TrueType font program), or
    # FontFile3 (Other font program as defined by Subtype entry)
    # Subtype entries:
    # 1) Type1C:        Type 1 Font Program in Compact Font Format
    # 2) CIDFontType0C: Type 0 Font Program in Compact Font Format
    # 3) OpenType:      OpenType Font Program
    # see Section 9.9, PDF 32000-1:2008, pp 288-292
    @font_program_stream = ohash.object(fd_hash[:FontFile2])
    #TODO handle FontFile and FontFile3

    @is_ttf = true if @font_program_stream
  end

  # Returns the advance width recorded in the embedded TrueType program's
  # horizontal metrics for char_code, or nil when there is no embedded
  # TrueType program or no metric for the glyph.
  #
  # The value is not scaled here; see glyph_to_pdf_scale_factor.
  def glyph_width(char_code)
    return nil unless @is_ttf

    # map the character code to a glyph id via the first unicode cmap table
    # when the font has one, otherwise treat the code as the glyph id
    if ttf_program_stream.cmap.unicode.length > 0
      glyph_id = ttf_program_stream.cmap.unicode.first[char_code]
    else
      glyph_id = char_code
    end

    char_metric = ttf_program_stream.horizontal_metrics.metrics[glyph_id]
    char_metric.advance_width if char_metric
  end

  # PDF states that a glyph is 1000 units wide, true type doesn't enforce
  # any behavior, but uses units/em to define how wide the 'M' is (the widest letter)
  #
  # Returns the multiplier that converts a width from the embedded program's
  # units into 1000-unit glyph space, or 1.0 when there is no embedded
  # TrueType program. The value is computed once and memoised.
  def glyph_to_pdf_scale_factor
    if @is_ttf
      @glyph_to_pdf_sf ||= (1.0 / ttf_program_stream.header.units_per_em) * 1000.0
    else
      @glyph_to_pdf_sf ||= 1.0
    end
    @glyph_to_pdf_sf
  end

  private

  # Lazily parses the FontFile2 stream with TTFunk and memoises the result.
  def ttf_program_stream
    @ttf_program_stream ||= TTFunk::File.new(@font_program_stream.unfiltered_data)
  end
end
|
79
|
+
|
80
|
+
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
1
3
|
################################################################################
|
2
4
|
#
|
3
5
|
# Copyright (C) 2011 James Healy (jimmy@deefa.com)
|
@@ -24,6 +26,9 @@
|
|
24
26
|
################################################################################
|
25
27
|
|
26
28
|
class PDF::Reader
|
29
|
+
# A Hash-like object that can convert glyph names into a unicode codepoint.
|
30
|
+
# The mapping is read from a data file on disk the first time it's needed.
|
31
|
+
#
|
27
32
|
class GlyphHash # :nodoc:
|
28
33
|
def initialize
|
29
34
|
# only parse the glyph list once, and cache the results (for performance)
|
@@ -45,6 +50,7 @@ class PDF::Reader
|
|
45
50
|
# => 48
|
46
51
|
#
|
47
52
|
# h[:34]
|
53
|
+
# => 34
|
48
54
|
#
|
49
55
|
def [](name)
|
50
56
|
return nil unless name.is_a?(Symbol)
|
data/lib/pdf/reader/lzw.rb
CHANGED
@@ -41,8 +41,8 @@ class PDF::Reader
|
|
41
41
|
#
|
42
42
|
def initialize(input, opts = {})
|
43
43
|
@io = extract_io_from(input)
|
44
|
-
@pdf_version = read_version
|
45
44
|
@xref = PDF::Reader::XRef.new(@io)
|
45
|
+
@pdf_version = read_version
|
46
46
|
@trailer = @xref.trailer
|
47
47
|
@cache = opts[:cache] || PDF::Reader::ObjectCache.new
|
48
48
|
@sec_handler = build_security_handler(opts)
|
@@ -0,0 +1,125 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
|
5
|
+
# Takes a collection of TextRun objects and renders them into a single
# string that best approximates the way they'd appear on a rendered PDF page.
#
# media box should be a 4 number array that describes the dimensions of the
# page to be rendered as described by the page's MediaBox attribute
#
# The page is modelled as a grid of characters: the number of rows is derived
# from the mean font size of the runs and the number of columns from their
# mean character width.
class PageLayout
  # runs     - objects that respond to x, y, font_size, mean_character_width,
  #            text, mergable?, + and <=>
  # mediabox - [x1, y1, x2, y2]
  def initialize(runs, mediabox)
    @runs    = merge_runs(runs)
    @mean_font_size   = mean(@runs.map(&:font_size)) || 0
    @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
    @page_width  = mediabox[2] - mediabox[0]
    @page_height = mediabox[3] - mediabox[1]
    # the left-most run becomes column zero
    @x_offset = @runs.map(&:x).min
    @current_platform_is_rbx_19 = RUBY_DESCRIPTION =~ /\Arubinius 2.0.0/ &&
                                    RUBY_VERSION >= "1.9.0"
  end

  # Returns the rendered page as a string, one line per grid row, with
  # leading and trailing blank rows removed and trailing whitespace stripped
  # from each row. Returns "" when there are no runs or when no run lands
  # inside the grid with visible text.
  def to_s
    return "" if @runs.empty?

    page = row_count.times.map { |i| " " * col_count }
    @runs.each do |run|
      x_pos = ((run.x - @x_offset) / col_multiplier).round
      # PDF y grows upwards, rows are numbered from the top
      y_pos = row_count - (run.y / row_multiplier).round
      if y_pos < row_count && y_pos >= 0 && x_pos < col_count && x_pos >= 0
        local_string_insert(page[y_pos], run.text, x_pos)
      end
    end
    interesting_rows(page).map(&:rstrip).join("\n")
  end

  private

  # given an array of strings, return a new array with empty rows from the
  # beginning and end removed.
  #
  #   interesting_rows([ "", "one", "two", "" ])
  #   => [ "one", "two" ]
  #
  # Returns an empty array when no row contains visible text.
  def interesting_rows(rows)
    line_lengths = rows.map { |l| l.strip.length }
    first_line_with_text = line_lengths.index { |l| l > 0 }
    # without this guard the arithmetic below would be done on nil
    return [] if first_line_with_text.nil?

    last_line_with_text = line_lengths.size - line_lengths.reverse.index { |l| l > 0 }
    interesting_line_count = last_line_with_text - first_line_with_text
    rows[first_line_with_text, interesting_line_count]
  end

  # number of text rows on the page, memoised
  def row_count
    @row_count ||= (@page_height / @mean_font_size).floor
  end

  # number of character columns on the page (padded by 5%), memoised
  def col_count
    @col_count ||= ((@page_width / @mean_glyph_width) * 1.05).floor
  end

  # page units covered by one row
  def row_multiplier
    @row_multiplier ||= @page_height / row_count
  end

  # page units covered by one column
  def col_multiplier
    @col_multiplier ||= @page_width / col_count
  end

  # arithmetic mean of a collection of numbers as a float, or 0 when the
  # collection is empty
  def mean(collection)
    if collection.size == 0
      0
    else
      collection.inject(0) { |accum, v| accum + v} / collection.size.to_f
    end
  end

  # yields each integer y position along with the sorted runs that share it
  def each_line(&block)
    @runs.sort.group_by { |run|
      run.y.to_i
    }.map { |y, collection|
      yield y, collection
    }
  end

  # take a collection of TextRun objects and merge any that are in close
  # proximity
  def merge_runs(runs)
    runs.group_by { |char|
      char.y.to_i
    }.map { |y, chars|
      group_chars_into_runs(chars.sort)
    }.flatten.sort
  end

  # walks a sorted list of runs from a single line and joins each one onto
  # its predecessor when the predecessor reports it as mergable
  def group_chars_into_runs(chars)
    runs = []
    while head = chars.shift
      if runs.empty?
        runs << head
      elsif runs.last.mergable?(head)
        runs[-1] = runs.last + head
      else
        runs << head
      end
    end
    runs
  end

  # This is a simple alternative to String#[]=. We can't use the string
  # method as it's buggy on rubinius 2.0rc1 (in 1.9 mode)
  #
  # See my bug report at https://github.com/rubinius/rubinius/issues/1985
  def local_string_insert(haystack, needle, index)
    if @current_platform_is_rbx_19
      char_count = needle.length
      haystack.replace(
        (haystack[0,index] || "") +
        needle +
        (haystack[index+char_count,500] || "")
      )
    else
      haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
    end
  end
end
|
125
|
+
end
|