pdf-reader 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. data/CHANGELOG +7 -1
  2. data/README.rdoc +1 -0
  3. data/Rakefile +23 -8
  4. data/lib/pdf-reader.rb +3 -1
  5. data/lib/pdf/hash.rb +5 -1
  6. data/lib/pdf/reader.rb +8 -1
  7. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
  8. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
  9. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
  10. data/lib/pdf/reader/afm/Courier.afm +342 -0
  11. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
  12. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
  13. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
  14. data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
  15. data/lib/pdf/reader/afm/Symbol.afm +213 -0
  16. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
  17. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
  18. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
  19. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
  20. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
  21. data/lib/pdf/reader/buffer.rb +14 -6
  22. data/lib/pdf/reader/cid_widths.rb +61 -0
  23. data/lib/pdf/reader/cmap.rb +8 -2
  24. data/lib/pdf/reader/encoding.rb +52 -27
  25. data/lib/pdf/reader/error.rb +16 -1
  26. data/lib/pdf/reader/filter.rb +2 -0
  27. data/lib/pdf/reader/filter/ascii85.rb +3 -1
  28. data/lib/pdf/reader/filter/ascii_hex.rb +3 -1
  29. data/lib/pdf/reader/filter/depredict.rb +2 -0
  30. data/lib/pdf/reader/filter/flate.rb +3 -1
  31. data/lib/pdf/reader/filter/lzw.rb +1 -0
  32. data/lib/pdf/reader/filter/null.rb +1 -0
  33. data/lib/pdf/reader/filter/run_length.rb +2 -1
  34. data/lib/pdf/reader/font.rb +74 -18
  35. data/lib/pdf/reader/font_descriptor.rb +80 -0
  36. data/lib/pdf/reader/glyph_hash.rb +6 -0
  37. data/lib/pdf/reader/lzw.rb +1 -0
  38. data/lib/pdf/reader/object_cache.rb +1 -1
  39. data/lib/pdf/reader/object_hash.rb +1 -1
  40. data/lib/pdf/reader/page_layout.rb +125 -0
  41. data/lib/pdf/reader/page_state.rb +172 -69
  42. data/lib/pdf/reader/page_text_receiver.rb +50 -21
  43. data/lib/pdf/reader/pages_strategy.rb +17 -4
  44. data/lib/pdf/reader/parser.rb +25 -52
  45. data/lib/pdf/reader/print_receiver.rb +5 -0
  46. data/lib/pdf/reader/reference.rb +2 -0
  47. data/lib/pdf/reader/register_receiver.rb +1 -1
  48. data/lib/pdf/reader/standard_security_handler.rb +2 -0
  49. data/lib/pdf/reader/stream.rb +2 -0
  50. data/lib/pdf/reader/synchronized_cache.rb +32 -0
  51. data/lib/pdf/reader/text_receiver.rb +5 -4
  52. data/lib/pdf/reader/text_run.rb +80 -0
  53. data/lib/pdf/reader/token.rb +2 -0
  54. data/lib/pdf/reader/transformation_matrix.rb +194 -0
  55. data/lib/pdf/reader/width_calculator.rb +11 -0
  56. data/lib/pdf/reader/width_calculator/built_in.rb +50 -0
  57. data/lib/pdf/reader/width_calculator/composite.rb +27 -0
  58. data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
  59. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +32 -0
  60. data/lib/pdf/reader/width_calculator/type_zero.rb +24 -0
  61. data/lib/pdf/reader/xref.rb +9 -2
  62. metadata +119 -13
@@ -1,3 +1,5 @@
1
+ # coding: utf-8
2
+
1
3
  ################################################################################
2
4
  #
3
5
  # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
@@ -21,7 +23,6 @@
21
23
  # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
24
  # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
25
  #
24
-
25
26
  class PDF::Reader
26
27
  ################################################################################
27
28
  # An internal PDF::Reader class that helps to verify various parts of the PDF file
@@ -45,10 +46,24 @@ class PDF::Reader
45
46
  end
46
47
  ################################################################################
47
48
  end
49
+
48
50
  ################################################################################
51
+ # an exception that is raised when we believe the current PDF is not following
52
+ # the PDF spec and cannot be recovered
49
53
  class MalformedPDFError < RuntimeError; end
54
+
55
+ ################################################################################
56
+ # an exception that is raised when a PDF object appears to be invalid
50
57
  class InvalidObjectError < MalformedPDFError; end
58
+
59
+ ################################################################################
60
+ # an exception that is raised when a PDF follows the specs but uses a feature
61
+ # that we don't support just yet
51
62
  class UnsupportedFeatureError < RuntimeError; end
63
+
64
+ ################################################################################
65
+ # an exception that is raised when a PDF is encrypted and we don't have the
66
+ # necessary data to decrypt it
52
67
  class EncryptedPDFError < UnsupportedFeatureError; end
53
68
  end
54
69
  ################################################################################
@@ -1,3 +1,5 @@
1
+ # coding: utf-8
2
+
1
3
  ################################################################################
2
4
  #
3
5
  # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
@@ -4,6 +4,7 @@ require 'ascii85'
4
4
 
5
5
  class PDF::Reader
6
6
  module Filter # :nodoc:
7
+ # implementation of the Ascii85 filter
7
8
  class Ascii85
8
9
  def initialize(options = {})
9
10
  @options = options
@@ -18,7 +19,8 @@ class PDF::Reader
18
19
  ::Ascii85::decode(data)
19
20
  rescue Exception => e
20
21
  # Oops, there was a problem decoding the stream
21
- raise MalformedPDFError, "Error occured while decoding an ASCII85 stream (#{e.class.to_s}: #{e.to_s})"
22
+ raise MalformedPDFError,
23
+ "Error occured while decoding an ASCII85 stream (#{e.class.to_s}: #{e.to_s})"
22
24
  end
23
25
  end
24
26
  end
@@ -2,6 +2,7 @@
2
2
  #
3
3
  class PDF::Reader
4
4
  module Filter # :nodoc:
5
+ # implementation of the AsciiHex stream filter
5
6
  class AsciiHex
6
7
  def initialize(options = {})
7
8
  @options = options
@@ -18,7 +19,8 @@ class PDF::Reader
18
19
  data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
19
20
  rescue Exception => e
20
21
  # Oops, there was a problem decoding the stream
21
- raise MalformedPDFError, "Error occured while decoding an ASCIIHex stream (#{e.class.to_s}: #{e.to_s})"
22
+ raise MalformedPDFError,
23
+ "Error occured while decoding an ASCIIHex stream (#{e.class.to_s}: #{e.to_s})"
22
24
  end
23
25
  end
24
26
  end
@@ -2,6 +2,8 @@
2
2
 
3
3
  class PDF::Reader
4
4
  module Filter # :nodoc:
5
+ # some filter implementations support preprocessing of the data to
6
+ # improve compression
5
7
  class Depredict
6
8
  def initialize(options = {})
7
9
  @options = options || {}
@@ -5,6 +5,7 @@ require 'zlib'
5
5
 
6
6
  class PDF::Reader
7
7
  module Filter # :nodoc:
8
+ # implementation of the Flate (zlib) stream filter
8
9
  class Flate
9
10
  def initialize(options = {})
10
11
  @options = options
@@ -30,7 +31,8 @@ class PDF::Reader
30
31
  Depredict.new(@options).filter(deflated)
31
32
  rescue Exception => e
32
33
  # Oops, there was a problem inflating the stream
33
- raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
34
+ raise MalformedPDFError,
35
+ "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
34
36
  end
35
37
  end
36
38
  end
@@ -2,6 +2,7 @@
2
2
  #
3
3
  class PDF::Reader
4
4
  module Filter # :nodoc:
5
+ # implementation of the LZW stream filter
5
6
  class Lzw
6
7
  def initialize(options = {})
7
8
  @options = options
@@ -2,6 +2,7 @@
2
2
  #
3
3
  class PDF::Reader
4
4
  module Filter # :nodoc:
5
+ # implementation of the null stream filter
5
6
  class Null
6
7
  def initialize(options = {})
7
8
  @options = options
@@ -1,7 +1,8 @@
1
1
  # coding: utf-8
2
2
  #
3
- class PDF::Reader
3
+ class PDF::Reader # :nodoc:
4
4
  module Filter # :nodoc:
5
+ # implementation of the run length stream filter
5
6
  class RunLength
6
7
  def initialize(options = {})
7
8
  @options = options
@@ -1,3 +1,5 @@
1
+ # coding: utf-8
2
+
1
3
  ################################################################################
2
4
  #
3
5
  # Copyright (C) 2008 James Healy (jimmy@deefa.com)
@@ -23,11 +25,16 @@
23
25
  #
24
26
  ################################################################################
25
27
 
28
+ require 'pdf/reader/width_calculator'
29
+
26
30
  class PDF::Reader
31
+ # Represents a single font PDF object and provides some useful methods
32
+ # for extracting info. Mainly used for converting text to UTF-8.
33
+ #
27
34
  class Font
28
- attr_accessor :label, :subtype, :encoding, :descendantfonts, :tounicode
29
- attr_reader :widths, :first_char, :ascent, :descent, :missing_width, :bbox
30
- attr_reader :basefont
35
+ attr_accessor :subtype, :encoding, :descendantfonts, :tounicode
36
+ attr_reader :widths, :first_char, :last_char, :basefont, :font_descriptor,
37
+ :cid_widths, :cid_default_width
31
38
 
32
39
  def initialize(ohash = nil, obj = nil)
33
40
  if ohash.nil? || obj.nil?
@@ -40,6 +47,7 @@ class PDF::Reader
40
47
  extract_base_info(obj)
41
48
  extract_descriptor(obj)
42
49
  extract_descendants(obj)
50
+ @width_calc = build_width_calculator
43
51
 
44
52
  @encoding ||= PDF::Reader::Encoding.new(:StandardEncoding)
45
53
  end
@@ -66,39 +74,79 @@ class PDF::Reader
66
74
  end
67
75
  end
68
76
 
69
- def glyph_width(c)
70
- @missing_width ||= 0
71
- @widths ||= []
72
- @widths.fetch(c - @first_char, @missing_width)
77
+ def unpack(data)
78
+ data.unpack(encoding.unpack)
79
+ end
80
+
81
+ # looks up the specified codepoint and returns a value that is in (pdf)
82
+ # glyph space, which is 1000 glyph units = 1 text space unit
83
+ def glyph_width(code_point)
84
+ if code_point.is_a?(String)
85
+ code_point = code_point.unpack(encoding.unpack).first
86
+ end
87
+
88
+ @cached_widths ||= {}
89
+ @cached_widths[code_point] ||= @width_calc.glyph_width(code_point)
73
90
  end
74
91
 
75
92
  private
76
93
 
94
# Chooses the glyph width calculator that matches this font's subtype.
#
# Returns a new PDF::Reader::WidthCalculator::* instance wrapping self.
# Subtypes without a dedicated branch (including :Type3) use TypeOneOrThree.
def build_width_calculator
  case @subtype
  when :Type0
    PDF::Reader::WidthCalculator::TypeZero.new(self)
  when :Type1
    # no font descriptor available: use the BuiltIn calculator
    if @font_descriptor.nil?
      PDF::Reader::WidthCalculator::BuiltIn.new(self)
    else
      PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
    end
  when :TrueType
    # TrueType fonts use their dedicated calculator instead of sharing the
    # Type1/Type3 one
    PDF::Reader::WidthCalculator::TrueType.new(self)
  when :CIDFontType0, :CIDFontType2
    PDF::Reader::WidthCalculator::Composite.new(self)
  else
    PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
  end
end
113
+
77
114
  def extract_base_info(obj)
78
115
  @subtype = @ohash.object(obj[:Subtype])
79
116
  @basefont = @ohash.object(obj[:BaseFont])
80
117
  @encoding = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
81
118
  @widths = @ohash.object(obj[:Widths]) || []
82
119
  @first_char = @ohash.object(obj[:FirstChar])
120
+ @last_char = @ohash.object(obj[:LastChar])
121
+
122
+ # CID Fonts are not required to have a W or DW entry, if they don't exist,
123
+ # the default cid width = 1000, see Section 9.7.4.1 PDF 32000-1:2008 pp 269
124
+ @cid_widths = @ohash.object(obj[:W]) || []
125
+ @cid_default_width = @ohash.object(obj[:DW]) || 1000
126
+
83
127
  if obj[:ToUnicode]
128
+ # ToUnicode is optional for Type1 and Type3
84
129
  stream = @ohash.object(obj[:ToUnicode])
85
130
  @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
86
131
  end
87
132
  end
88
133
 
89
134
  def extract_descriptor(obj)
90
- return unless obj[:FontDescriptor]
91
-
92
- fd = @ohash.object(obj[:FontDescriptor])
93
- @ascent = @ohash.object(fd[:Ascent])
94
- @descent = @ohash.object(fd[:Descent])
95
- @missing_width = @ohash.object(fd[:MissingWidth])
96
- @bbox = @ohash.object(fd[:FontBBox])
135
+ if obj[:FontDescriptor]
136
+ # create a font descriptor object if we can, in other words, unless this is
137
+ # a CID Font
138
+ fd = @ohash.object(obj[:FontDescriptor])
139
+ @font_descriptor = PDF::Reader::FontDescriptor.new(@ohash, fd)
140
+ else
141
+ @font_descriptor = nil
142
+ end
97
143
  end
98
144
 
99
145
  def extract_descendants(obj)
100
146
  return unless obj[:DescendantFonts]
101
-
147
+ # per PDF 32000-1:2008 pp. 280 :DescendantFonts is:
148
+ # A one-element array specifying the CIDFont dictionary that is the
149
+ # descendant of this Type 0 font.
102
150
  descendants = @ohash.object(obj[:DescendantFonts])
103
151
  @descendantfonts = descendants.map { |desc|
104
152
  PDF::Reader::Font.new(@ohash, @ohash.object(desc))
@@ -106,7 +154,11 @@ class PDF::Reader
106
154
  end
107
155
 
108
156
  def to_utf8_via_cmap(params)
109
- if params.class == String
157
+ if params.class == Fixnum
158
+ [
159
+ @tounicode.decode(params) || PDF::Reader::Encoding::UNKNOWN_CHAR
160
+ ].flatten.pack("U*")
161
+ elsif params.class == String
110
162
  params.unpack(encoding.unpack).map { |c|
111
163
  @tounicode.decode(c) || PDF::Reader::Encoding::UNKNOWN_CHAR
112
164
  }.flatten.pack("U*")
@@ -118,9 +170,13 @@ class PDF::Reader
118
170
  end
119
171
 
120
172
  def to_utf8_via_encoding(params)
121
- raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported" if encoding.kind_of?(String)
173
+ if encoding.kind_of?(String)
174
+ raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported"
175
+ end
122
176
 
123
- if params.class == String
177
+ if params.class == Fixnum
178
+ encoding.int_to_utf8_string(params)
179
+ elsif params.class == String
124
180
  encoding.to_utf8(params)
125
181
  elsif params.class == Array
126
182
  params.collect { |param| to_utf8_via_encoding(param) }
@@ -0,0 +1,80 @@
1
+ # coding: utf-8
2
+
3
+ require 'ttfunk'
4
+
5
+ class PDF::Reader
6
+
7
+ # Font descriptors are outlined in Section 9.8, PDF 32000-1:2008, pp 281-288
8
+ class FontDescriptor
9
+
10
+ attr_reader :font_name, :font_family, :font_stretch, :font_weight,
11
+ :font_bounding_box, :cap_height, :ascent, :descent, :leading,
12
+ :avg_width, :max_width, :missing_width, :italic_angle, :stem_v,
13
+ :x_height, :font_flags
14
+
15
# Copies the entries of a font descriptor dictionary into instance
# variables, resolving every value through ohash.object.
#
# ohash   - object that resolves dictionary values via #object
# fd_hash - the font descriptor dictionary (keys such as :Ascent, :Flags)
def initialize(ohash, fd_hash)
  entry = lambda { |key| ohash.object(fd_hash[key]) }

  @ascent            = entry.call(:Ascent) || 0
  @descent           = entry.call(:Descent) || 0
  @missing_width     = entry.call(:MissingWidth) || 0
  @font_bounding_box = entry.call(:FontBBox) || [0,0,0,0]
  @avg_width         = entry.call(:AvgWidth) || 0
  @cap_height        = entry.call(:CapHeight) || 0
  @font_flags        = entry.call(:Flags) || 0
  @italic_angle      = entry.call(:ItalicAngle)
  @font_name         = entry.call(:FontName).to_s
  @leading           = entry.call(:Leading) || 0
  @max_width         = entry.call(:MaxWidth) || 0
  @stem_v            = entry.call(:StemV)
  @x_height          = entry.call(:XHeight)
  @font_stretch      = entry.call(:FontStretch) || :Normal
  @font_weight       = entry.call(:FontWeight) || 400
  @font_family       = entry.call(:FontFamily)

  # Only the FontFile2 (TrueType) font program is read here; FontFile and
  # FontFile3 are not handled yet.
  # see Section 9.9, PDF 32000-1:2008, pp 288-292
  @font_program_stream = entry.call(:FontFile2)

  if @font_program_stream
    @is_ttf = true
  end
end
46
+
47
# Looks up the advance width of a glyph in the embedded TrueType program.
#
# char_code - the character code to look up
#
# Returns the advance width recorded in the font's horizontal metrics, or
# nil when no TrueType program is embedded or no metric can be found.
def glyph_width(char_code)
  return nil unless @is_ttf

  # Prefer the first unicode cmap subtable; without one the character code
  # is used directly as the glyph id.
  unicode_tables = ttf_program_stream.cmap.unicode
  glyph_id = if unicode_tables.length > 0
    unicode_tables.first[char_code]
  else
    char_code
  end
  # nothing to look up when the cmap has no entry for this code
  return nil if glyph_id.nil?

  char_metric = ttf_program_stream.horizontal_metrics.metrics[glyph_id]
  char_metric ? char_metric.advance_width : nil
end
61
+
62
+ # PDF states that a glyph is 1000 units wide, true type doesn't enforce
63
+ # any behavior, but uses units/em to define how wide the 'M' is (the widest letter)
64
# Returns the multiplier that converts font program units to the
# 1000-unit glyph space; the result is computed once and cached.
def glyph_to_pdf_scale_factor
  return @glyph_to_pdf_sf if @glyph_to_pdf_sf

  @glyph_to_pdf_sf = if @is_ttf
    # TrueType programs declare their own units per em
    (1.0 / ttf_program_stream.header.units_per_em) * 1000.0
  else
    1.0
  end
end
72
+
73
+ private
74
+
75
# Parses the embedded font program with TTFunk on first use and reuses the
# parsed file afterwards.
def ttf_program_stream
  return @ttf_program_stream if @ttf_program_stream

  @ttf_program_stream = TTFunk::File.new(@font_program_stream.unfiltered_data)
end
78
+ end
79
+
80
+ end
@@ -1,3 +1,5 @@
1
+ # coding: utf-8
2
+
1
3
  ################################################################################
2
4
  #
3
5
  # Copyright (C) 2011 James Healy (jimmy@deefa.com)
@@ -24,6 +26,9 @@
24
26
  ################################################################################
25
27
 
26
28
  class PDF::Reader
29
+ # A Hash-like object that can convert glyph names into a unicode codepoint.
30
+ # The mapping is read from a data file on disk the first time it's needed.
31
+ #
27
32
  class GlyphHash # :nodoc:
28
33
  def initialize
29
34
  # only parse the glyph list once, and cache the results (for performance)
@@ -45,6 +50,7 @@ class PDF::Reader
45
50
  # => 48
46
51
  #
47
52
  # h[:34]
53
+ # => 34
48
54
  #
49
55
  def [](name)
50
56
  return nil unless name.is_a?(Symbol)
@@ -17,6 +17,7 @@ module PDF
17
17
  #
18
18
  class LZW # :nodoc:
19
19
 
20
+ # Wraps an LZW encoded string
20
21
  class BitStream # :nodoc:
21
22
 
22
23
  def initialize(data, bits_in_chunk)
@@ -1,6 +1,6 @@
1
1
  # coding: utf-8
2
2
 
3
- require 'hashery'
3
+ require 'hashery/lru_hash'
4
4
 
5
5
  class PDF::Reader
6
6
 
@@ -41,8 +41,8 @@ class PDF::Reader
41
41
  #
42
42
  def initialize(input, opts = {})
43
43
  @io = extract_io_from(input)
44
- @pdf_version = read_version
45
44
  @xref = PDF::Reader::XRef.new(@io)
45
+ @pdf_version = read_version
46
46
  @trailer = @xref.trailer
47
47
  @cache = opts[:cache] || PDF::Reader::ObjectCache.new
48
48
  @sec_handler = build_security_handler(opts)
@@ -0,0 +1,125 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+
5
+ # Takes a collection of TextRun objects and renders them into a single
6
+ # string that best approximates the way they'd appear on a rendered PDF page.
7
+ #
8
+ # media box should be a 4 number array that describes the dimensions of the
9
+ # page to be rendered as described by the page's MediaBox attribute
10
+ class PageLayout
11
# Merges the supplied runs and records the page metrics used for layout.
#
# runs     - collection of text run objects (they must respond to x,
#            font_size and mean_character_width)
# mediabox - 4 number array; indexes 0/2 give the horizontal extent and
#            1/3 the vertical extent of the page
def initialize(runs, mediabox)
  @runs = merge_runs(runs)

  font_sizes   = @runs.map { |run| run.font_size }
  glyph_widths = @runs.map { |run| run.mean_character_width }
  @mean_font_size   = mean(font_sizes) || 0
  @mean_glyph_width = mean(glyph_widths) || 0

  @page_width  = mediabox[2] - mediabox[0]
  @page_height = mediabox[3] - mediabox[1]

  # left-most x position of any run (nil when there are no runs)
  @x_offset = @runs.map { |run| run.x }.min

  # remembered so local_string_insert can avoid String#[]= on rubinius
  on_rubinius = RUBY_DESCRIPTION =~ /\Arubinius 2.0.0/
  @current_platform_is_rbx_19 = on_rubinius && RUBY_VERSION >= "1.9.0"
end
21
+
22
+ def to_s
23
+ return "" if @runs.empty?
24
+
25
+ page = row_count.times.map { |i| " " * col_count }
26
+ @runs.each do |run|
27
+ x_pos = ((run.x - @x_offset) / col_multiplier).round
28
+ y_pos = row_count - (run.y / row_multiplier).round
29
+ if y_pos < row_count && y_pos >= 0 && x_pos < col_count && x_pos >= 0
30
+ local_string_insert(page[y_pos], run.text, x_pos)
31
+ end
32
+ end
33
+ interesting_rows(page).map(&:rstrip).join("\n")
34
+ end
35
+
36
+ private
37
+
38
+ # given an array of strings, return a new array with empty rows from the
39
+ # beginning and end removed.
40
+ #
41
+ # interesting_rows([ "", "one", "two", "" ])
42
+ # => [ "one", "two" ]
43
+ #
44
# Trims blank rows from the start and end of rows.
#
# rows - array of strings
#
# Returns a new Array running from the first to the last row that contains
# non-whitespace text (blank rows in between are kept), or an empty Array
# when no row contains text.
def interesting_rows(rows)
  first_with_text = rows.index { |row| !row.strip.empty? }
  return [] if first_with_text.nil?

  last_with_text = rows.rindex { |row| !row.strip.empty? }
  rows[first_with_text..last_with_text]
end
51
+
52
# Number of text rows on the page: page height divided by the mean font
# size, rounded down. Computed once and cached.
def row_count
  return @row_count if @row_count

  @row_count = (@page_height / @mean_font_size).floor
end
55
+
56
# Number of text columns on the page: page width divided by the mean glyph
# width, scaled by 1.05 and rounded down. Computed once and cached.
def col_count
  return @col_count if @col_count

  glyphs_per_line = @page_width / @mean_glyph_width
  @col_count = (glyphs_per_line * 1.05).floor
end
59
+
60
# Height of one text row in page units. Computed once and cached.
#
# Both operands are converted to floats so that an integer page height is
# not truncated by integer division, which would skew row placement.
def row_multiplier
  @row_multiplier ||= @page_height.to_f / row_count.to_f
end
63
+
64
# Width of one text column in page units. Computed once and cached.
#
# Both operands are converted to floats so that an integer page width is
# not truncated by integer division, which would skew column placement.
def col_multiplier
  @col_multiplier ||= @page_width.to_f / col_count.to_f
end
67
+
68
# Arithmetic mean of a collection of numbers.
#
# Returns the integer 0 for an empty collection, otherwise a Float.
def mean(collection)
  return 0 if collection.size == 0

  total = 0
  collection.each { |value| total = total + value }
  total / collection.size.to_f
end
75
+
76
# Sorts the runs, groups them by the integer part of their y position and
# yields each y value together with the runs found there.
#
# Returns an Array holding the block's result for every group.
def each_line(&block)
  runs_by_y = @runs.sort.group_by { |run| run.y.to_i }
  runs_by_y.map do |y, collection|
    yield y, collection
  end
end
83
+
84
+ # take a collection of TextRun objects and merge any that are in close
85
+ # proximity
86
# Groups the runs by the integer part of their y position, merges
# neighbouring runs within each group and returns all results sorted.
def merge_runs(runs)
  runs_by_y = runs.group_by { |char| char.y.to_i }
  merged_lines = runs_by_y.map do |_y, chars|
    group_chars_into_runs(chars.sort)
  end
  merged_lines.flatten.sort
end
93
+
94
# Walks chars in order and folds each one into the previous run when the
# previous run reports it as mergable; otherwise it starts a new run.
#
# chars - ordered collection of run objects responding to mergable? and +
#
# Returns a new Array of runs. The chars collection itself is left
# untouched.
def group_chars_into_runs(chars)
  chars.each_with_object([]) do |char, runs|
    if !runs.empty? && runs.last.mergable?(char)
      runs[-1] = runs.last + char
    else
      runs << char
    end
  end
end
107
+
108
+ # This is a simple alternative to String#[]=. We can't use the string
109
+ # method as it's buggy on rubinius 2.0rc1 (in 1.9 mode)
110
+ #
111
+ # See my bug report at https://github.com/rubinius/rubinius/issues/1985
112
# Overwrites part of haystack, in place, with needle starting at index.
#
# haystack - the String to modify
# needle   - the String to write into haystack
# index    - position in haystack where needle starts
#
# On the rubinius platform flagged in @current_platform_is_rbx_19 the
# string is rebuilt from its parts instead of using String#[]=. The part
# after the needle is kept in full, however long the row is.
def local_string_insert(haystack, needle, index)
  if @current_platform_is_rbx_19
    char_count = needle.length
    before = haystack[0, index] || ""
    after  = haystack[(index + char_count)..-1] || ""
    haystack.replace(before + needle + after)
  else
    haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
  end
end
124
+ end
125
+ end