pdf-reader 1.2.0 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/CHANGELOG +7 -1
  2. data/README.rdoc +1 -0
  3. data/Rakefile +23 -8
  4. data/lib/pdf-reader.rb +3 -1
  5. data/lib/pdf/hash.rb +5 -1
  6. data/lib/pdf/reader.rb +8 -1
  7. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
  8. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
  9. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
  10. data/lib/pdf/reader/afm/Courier.afm +342 -0
  11. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
  12. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
  13. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
  14. data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
  15. data/lib/pdf/reader/afm/Symbol.afm +213 -0
  16. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
  17. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
  18. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
  19. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
  20. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
  21. data/lib/pdf/reader/buffer.rb +14 -6
  22. data/lib/pdf/reader/cid_widths.rb +61 -0
  23. data/lib/pdf/reader/cmap.rb +8 -2
  24. data/lib/pdf/reader/encoding.rb +52 -27
  25. data/lib/pdf/reader/error.rb +16 -1
  26. data/lib/pdf/reader/filter.rb +2 -0
  27. data/lib/pdf/reader/filter/ascii85.rb +3 -1
  28. data/lib/pdf/reader/filter/ascii_hex.rb +3 -1
  29. data/lib/pdf/reader/filter/depredict.rb +2 -0
  30. data/lib/pdf/reader/filter/flate.rb +3 -1
  31. data/lib/pdf/reader/filter/lzw.rb +1 -0
  32. data/lib/pdf/reader/filter/null.rb +1 -0
  33. data/lib/pdf/reader/filter/run_length.rb +2 -1
  34. data/lib/pdf/reader/font.rb +74 -18
  35. data/lib/pdf/reader/font_descriptor.rb +80 -0
  36. data/lib/pdf/reader/glyph_hash.rb +6 -0
  37. data/lib/pdf/reader/lzw.rb +1 -0
  38. data/lib/pdf/reader/object_cache.rb +1 -1
  39. data/lib/pdf/reader/object_hash.rb +1 -1
  40. data/lib/pdf/reader/page_layout.rb +125 -0
  41. data/lib/pdf/reader/page_state.rb +172 -69
  42. data/lib/pdf/reader/page_text_receiver.rb +50 -21
  43. data/lib/pdf/reader/pages_strategy.rb +17 -4
  44. data/lib/pdf/reader/parser.rb +25 -52
  45. data/lib/pdf/reader/print_receiver.rb +5 -0
  46. data/lib/pdf/reader/reference.rb +2 -0
  47. data/lib/pdf/reader/register_receiver.rb +1 -1
  48. data/lib/pdf/reader/standard_security_handler.rb +2 -0
  49. data/lib/pdf/reader/stream.rb +2 -0
  50. data/lib/pdf/reader/synchronized_cache.rb +32 -0
  51. data/lib/pdf/reader/text_receiver.rb +5 -4
  52. data/lib/pdf/reader/text_run.rb +80 -0
  53. data/lib/pdf/reader/token.rb +2 -0
  54. data/lib/pdf/reader/transformation_matrix.rb +194 -0
  55. data/lib/pdf/reader/width_calculator.rb +11 -0
  56. data/lib/pdf/reader/width_calculator/built_in.rb +50 -0
  57. data/lib/pdf/reader/width_calculator/composite.rb +27 -0
  58. data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
  59. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +32 -0
  60. data/lib/pdf/reader/width_calculator/type_zero.rb +24 -0
  61. data/lib/pdf/reader/xref.rb +9 -2
  62. metadata +119 -13
@@ -1,3 +1,5 @@
1
+ # coding: utf-8
2
+
1
3
  ################################################################################
2
4
  #
3
5
  # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
@@ -21,7 +23,6 @@
21
23
  # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
24
  # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
25
  #
24
-
25
26
  class PDF::Reader
26
27
  ################################################################################
27
28
  # An internal PDF::Reader class that helps to verify various parts of the PDF file
@@ -45,10 +46,24 @@ class PDF::Reader
45
46
  end
46
47
  ################################################################################
47
48
  end
49
+
48
50
  ################################################################################
51
+ # an exception that is raised when we believe the current PDF is not following
52
+ # the PDF spec and cannot be recovered
49
53
  class MalformedPDFError < RuntimeError; end
54
+
55
+ ################################################################################
56
+ # an exception that is raised when a PDF object appears to be invalid
50
57
  class InvalidObjectError < MalformedPDFError; end
58
+
59
+ ################################################################################
60
+ # an exception that is raised when a PDF follows the specs but uses a feature
61
+ # that we don't support just yet
51
62
  class UnsupportedFeatureError < RuntimeError; end
63
+
64
+ ################################################################################
65
+ # an exception that is raised when a PDF is encrypted and we don't have the
66
+ # necessary data to decrypt it
52
67
  class EncryptedPDFError < UnsupportedFeatureError; end
53
68
  end
54
69
  ################################################################################
@@ -1,3 +1,5 @@
1
+ # coding: utf-8
2
+
1
3
  ################################################################################
2
4
  #
3
5
  # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
@@ -4,6 +4,7 @@ require 'ascii85'
4
4
 
5
5
  class PDF::Reader
6
6
  module Filter # :nodoc:
7
+ # implementation of the Ascii85 filter
7
8
  class Ascii85
8
9
  def initialize(options = {})
9
10
  @options = options
@@ -18,7 +19,8 @@ class PDF::Reader
18
19
  ::Ascii85::decode(data)
19
20
  rescue Exception => e
20
21
  # Oops, there was a problem decoding the stream
21
- raise MalformedPDFError, "Error occured while decoding an ASCII85 stream (#{e.class.to_s}: #{e.to_s})"
22
+ raise MalformedPDFError,
23
+ "Error occured while decoding an ASCII85 stream (#{e.class.to_s}: #{e.to_s})"
22
24
  end
23
25
  end
24
26
  end
@@ -2,6 +2,7 @@
2
2
  #
3
3
  class PDF::Reader
4
4
  module Filter # :nodoc:
5
+ # implementation of the AsciiHex stream filter
5
6
  class AsciiHex
6
7
  def initialize(options = {})
7
8
  @options = options
@@ -18,7 +19,8 @@ class PDF::Reader
18
19
  data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
19
20
  rescue Exception => e
20
21
  # Oops, there was a problem decoding the stream
21
- raise MalformedPDFError, "Error occured while decoding an ASCIIHex stream (#{e.class.to_s}: #{e.to_s})"
22
+ raise MalformedPDFError,
23
+ "Error occured while decoding an ASCIIHex stream (#{e.class.to_s}: #{e.to_s})"
22
24
  end
23
25
  end
24
26
  end
@@ -2,6 +2,8 @@
2
2
 
3
3
  class PDF::Reader
4
4
  module Filter # :nodoc:
5
+ # some filter implementations support preprocessing of the data to
6
+ # improve compression
5
7
  class Depredict
6
8
  def initialize(options = {})
7
9
  @options = options || {}
@@ -5,6 +5,7 @@ require 'zlib'
5
5
 
6
6
  class PDF::Reader
7
7
  module Filter # :nodoc:
8
+ # implementation of the Flate (zlib) stream filter
8
9
  class Flate
9
10
  def initialize(options = {})
10
11
  @options = options
@@ -30,7 +31,8 @@ class PDF::Reader
30
31
  Depredict.new(@options).filter(deflated)
31
32
  rescue Exception => e
32
33
  # Oops, there was a problem inflating the stream
33
- raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
34
+ raise MalformedPDFError,
35
+ "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
34
36
  end
35
37
  end
36
38
  end
@@ -2,6 +2,7 @@
2
2
  #
3
3
  class PDF::Reader
4
4
  module Filter # :nodoc:
5
+ # implementation of the LZW stream filter
5
6
  class Lzw
6
7
  def initialize(options = {})
7
8
  @options = options
@@ -2,6 +2,7 @@
2
2
  #
3
3
  class PDF::Reader
4
4
  module Filter # :nodoc:
5
+ # implementation of the null stream filter
5
6
  class Null
6
7
  def initialize(options = {})
7
8
  @options = options
@@ -1,7 +1,8 @@
1
1
  # coding: utf-8
2
2
  #
3
- class PDF::Reader
3
+ class PDF::Reader # :nodoc:
4
4
  module Filter # :nodoc:
5
+ # implementation of the run length stream filter
5
6
  class RunLength
6
7
  def initialize(options = {})
7
8
  @options = options
@@ -1,3 +1,5 @@
1
+ # coding: utf-8
2
+
1
3
  ################################################################################
2
4
  #
3
5
  # Copyright (C) 2008 James Healy (jimmy@deefa.com)
@@ -23,11 +25,16 @@
23
25
  #
24
26
  ################################################################################
25
27
 
28
+ require 'pdf/reader/width_calculator'
29
+
26
30
  class PDF::Reader
31
+ # Represents a single font PDF object and provides some useful methods
32
+ # for extracting info. Mainly used for converting text to UTF-8.
33
+ #
27
34
  class Font
28
- attr_accessor :label, :subtype, :encoding, :descendantfonts, :tounicode
29
- attr_reader :widths, :first_char, :ascent, :descent, :missing_width, :bbox
30
- attr_reader :basefont
35
+ attr_accessor :subtype, :encoding, :descendantfonts, :tounicode
36
+ attr_reader :widths, :first_char, :last_char, :basefont, :font_descriptor,
37
+ :cid_widths, :cid_default_width
31
38
 
32
39
  def initialize(ohash = nil, obj = nil)
33
40
  if ohash.nil? || obj.nil?
@@ -40,6 +47,7 @@ class PDF::Reader
40
47
  extract_base_info(obj)
41
48
  extract_descriptor(obj)
42
49
  extract_descendants(obj)
50
+ @width_calc = build_width_calculator
43
51
 
44
52
  @encoding ||= PDF::Reader::Encoding.new(:StandardEncoding)
45
53
  end
@@ -66,39 +74,79 @@ class PDF::Reader
66
74
  end
67
75
  end
68
76
 
69
- def glyph_width(c)
70
- @missing_width ||= 0
71
- @widths ||= []
72
- @widths.fetch(c - @first_char, @missing_width)
77
+ def unpack(data)
78
+ data.unpack(encoding.unpack)
79
+ end
80
+
81
+ # looks up the specified codepoint and returns a value that is in (pdf)
82
+ # glyph space, which is 1000 glyph units = 1 text space unit
83
+ def glyph_width(code_point)
84
+ if code_point.is_a?(String)
85
+ code_point = code_point.unpack(encoding.unpack).first
86
+ end
87
+
88
+ @cached_widths ||= {}
89
+ @cached_widths[code_point] ||= @width_calc.glyph_width(code_point)
73
90
  end
74
91
 
75
92
  private
76
93
 
94
+ def build_width_calculator
95
+ if @subtype == :Type0
96
+ PDF::Reader::WidthCalculator::TypeZero.new(self)
97
+ elsif @subtype == :Type1
98
+ if @font_descriptor.nil?
99
+ PDF::Reader::WidthCalculator::BuiltIn.new(self)
100
+ else
101
+ PDF::Reader::WidthCalculator::TypeOneOrThree .new(self)
102
+ end
103
+ elsif @subtype == :Type3
104
+ PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
105
+ elsif @subtype == :TrueType
106
+ PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
107
+ elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
108
+ PDF::Reader::WidthCalculator::Composite.new(self)
109
+ else
110
+ PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
111
+ end
112
+ end
113
+
77
114
  def extract_base_info(obj)
78
115
  @subtype = @ohash.object(obj[:Subtype])
79
116
  @basefont = @ohash.object(obj[:BaseFont])
80
117
  @encoding = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
81
118
  @widths = @ohash.object(obj[:Widths]) || []
82
119
  @first_char = @ohash.object(obj[:FirstChar])
120
+ @last_char = @ohash.object(obj[:LastChar])
121
+
122
+ # CID Fonts are not required to have a W or DW entry, if they don't exist,
123
+ # the default cid width = 1000, see Section 9.7.4.1 PDF 32000-1:2008 pp 269
124
+ @cid_widths = @ohash.object(obj[:W]) || []
125
+ @cid_default_width = @ohash.object(obj[:DW]) || 1000
126
+
83
127
  if obj[:ToUnicode]
128
+ # ToUnicode is optional for Type1 and Type3
84
129
  stream = @ohash.object(obj[:ToUnicode])
85
130
  @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
86
131
  end
87
132
  end
88
133
 
89
134
  def extract_descriptor(obj)
90
- return unless obj[:FontDescriptor]
91
-
92
- fd = @ohash.object(obj[:FontDescriptor])
93
- @ascent = @ohash.object(fd[:Ascent])
94
- @descent = @ohash.object(fd[:Descent])
95
- @missing_width = @ohash.object(fd[:MissingWidth])
96
- @bbox = @ohash.object(fd[:FontBBox])
135
+ if obj[:FontDescriptor]
136
+ # create a font descriptor object if we can, in other words, unless this is
137
+ # a CID Font
138
+ fd = @ohash.object(obj[:FontDescriptor])
139
+ @font_descriptor = PDF::Reader::FontDescriptor.new(@ohash, fd)
140
+ else
141
+ @font_descriptor = nil
142
+ end
97
143
  end
98
144
 
99
145
  def extract_descendants(obj)
100
146
  return unless obj[:DescendantFonts]
101
-
147
+ # per PDF 32000-1:2008 pp. 280 :DescendentFonts is:
148
+ # A one-element array specifying the CIDFont dictionary that is the
149
+ # descendant of this Type 0 font.
102
150
  descendants = @ohash.object(obj[:DescendantFonts])
103
151
  @descendantfonts = descendants.map { |desc|
104
152
  PDF::Reader::Font.new(@ohash, @ohash.object(desc))
@@ -106,7 +154,11 @@ class PDF::Reader
106
154
  end
107
155
 
108
156
  def to_utf8_via_cmap(params)
109
- if params.class == String
157
+ if params.class == Fixnum
158
+ [
159
+ @tounicode.decode(params) || PDF::Reader::Encoding::UNKNOWN_CHAR
160
+ ].flatten.pack("U*")
161
+ elsif params.class == String
110
162
  params.unpack(encoding.unpack).map { |c|
111
163
  @tounicode.decode(c) || PDF::Reader::Encoding::UNKNOWN_CHAR
112
164
  }.flatten.pack("U*")
@@ -118,9 +170,13 @@ class PDF::Reader
118
170
  end
119
171
 
120
172
  def to_utf8_via_encoding(params)
121
- raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported" if encoding.kind_of?(String)
173
+ if encoding.kind_of?(String)
174
+ raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported"
175
+ end
122
176
 
123
- if params.class == String
177
+ if params.class == Fixnum
178
+ encoding.int_to_utf8_string(params)
179
+ elsif params.class == String
124
180
  encoding.to_utf8(params)
125
181
  elsif params.class == Array
126
182
  params.collect { |param| to_utf8_via_encoding(param) }
@@ -0,0 +1,80 @@
1
+ # coding: utf-8
2
+
3
+ require 'ttfunk'
4
+
5
+ class PDF::Reader
6
+
7
# Font descriptors are outlined in Section 9.8, PDF 32000-1:2008, pp 281-288
#
# Wraps a FontDescriptor dictionary, exposing its entries through readers and,
# when a TrueType font program is embedded (FontFile2), glyph widths read from
# that program.
class FontDescriptor

  attr_reader :font_name, :font_family, :font_stretch, :font_weight,
              :font_bounding_box, :cap_height, :ascent, :descent, :leading,
              :avg_width, :max_width, :missing_width, :italic_angle, :stem_v,
              :x_height, :font_flags

  # ohash   - resolves each dictionary value via ohash.object(value)
  # fd_hash - the FontDescriptor dictionary, keyed by symbols such as :Ascent
  #
  # Entries that are absent fall back to the defaults given below; entries
  # with no default here (ItalicAngle, StemV, XHeight, FontFamily) stay nil.
  def initialize(ohash, fd_hash)
    @ascent            = ohash.object(fd_hash[:Ascent])       || 0
    @descent           = ohash.object(fd_hash[:Descent])      || 0
    @missing_width     = ohash.object(fd_hash[:MissingWidth]) || 0
    @font_bounding_box = ohash.object(fd_hash[:FontBBox])     || [0,0,0,0]
    @avg_width         = ohash.object(fd_hash[:AvgWidth])     || 0
    @cap_height        = ohash.object(fd_hash[:CapHeight])    || 0
    @font_flags        = ohash.object(fd_hash[:Flags])        || 0
    @italic_angle      = ohash.object(fd_hash[:ItalicAngle])
    @font_name         = ohash.object(fd_hash[:FontName]).to_s
    @leading           = ohash.object(fd_hash[:Leading])      || 0
    @max_width         = ohash.object(fd_hash[:MaxWidth])     || 0
    @stem_v            = ohash.object(fd_hash[:StemV])
    @x_height          = ohash.object(fd_hash[:XHeight])
    @font_stretch      = ohash.object(fd_hash[:FontStretch])  || :Normal
    @font_weight       = ohash.object(fd_hash[:FontWeight])   || 400
    @font_family       = ohash.object(fd_hash[:FontFamily])

    # A FontDescriptor may have an embedded font program in FontFile
    # (Type 1 Font Program), FontFile2 (TrueType font program), or
    # FontFile3 (Other font program as defined by Subtype entry)
    # Subtype entries:
    # 1) Type1C:        Type 1 Font Program in Compact Font Format
    # 2) CIDFontType0C: Type 0 Font Program in Compact Font Format
    # 3) OpenType:      OpenType Font Program
    # see Section 9.9, PDF 32000-1:2008, pp 288-292
    @font_program_stream = ohash.object(fd_hash[:FontFile2])
    #TODO handle FontFile and FontFile3

    @is_ttf = true if @font_program_stream
  end

  # Looks up the advance width for char_code in the embedded TrueType
  # program's horizontal metrics. The value is returned unscaled; see
  # glyph_to_pdf_scale_factor for the conversion factor.
  #
  # Returns nil when there is no embedded TrueType program, when the code is
  # missing from the unicode cmap, or when no metric exists for the glyph.
  def glyph_width(char_code)
    return nil unless @is_ttf

    unicode_cmaps = ttf_program_stream.cmap.unicode
    # without a unicode cmap the char code is used directly as the glyph id
    glyph_id = unicode_cmaps.length > 0 ? unicode_cmaps.first[char_code] : char_code
    # a code the cmap doesn't know has no glyph, so there is no width to report
    return nil if glyph_id.nil?

    char_metric = ttf_program_stream.horizontal_metrics.metrics[glyph_id]
    char_metric ? char_metric.advance_width : nil
  end

  # PDF states that a glyph is 1000 units wide, true type doesn't enforce
  # any behavior, but uses units/em to define how wide the 'M' is (the widest letter)
  #
  # Returns the factor that converts a TrueType width into PDF glyph units,
  # or 1.0 when there is no embedded TrueType program.
  def glyph_to_pdf_scale_factor
    @glyph_to_pdf_sf ||= if @is_ttf
      (1.0 / ttf_program_stream.header.units_per_em) * 1000.0
    else
      1.0
    end
  end

  private

  # Lazily parses the embedded font program; only called when @is_ttf is set.
  def ttf_program_stream
    @ttf_program_stream ||= TTFunk::File.new(@font_program_stream.unfiltered_data)
  end
end
79
+
80
+ end
@@ -1,3 +1,5 @@
1
+ # coding: utf-8
2
+
1
3
  ################################################################################
2
4
  #
3
5
  # Copyright (C) 2011 James Healy (jimmy@deefa.com)
@@ -24,6 +26,9 @@
24
26
  ################################################################################
25
27
 
26
28
  class PDF::Reader
29
+ # A Hash-like object that can convert glyph names into a unicode codepoint.
30
+ # The mapping is read from a data file on disk the first time it's needed.
31
+ #
27
32
  class GlyphHash # :nodoc:
28
33
  def initialize
29
34
  # only parse the glyph list once, and cache the results (for performance)
@@ -45,6 +50,7 @@ class PDF::Reader
45
50
  # => 48
46
51
  #
47
52
  # h[:34]
53
+ # => 34
48
54
  #
49
55
  def [](name)
50
56
  return nil unless name.is_a?(Symbol)
@@ -17,6 +17,7 @@ module PDF
17
17
  #
18
18
  class LZW # :nodoc:
19
19
 
20
+ # Wraps an LZW encoded string
20
21
  class BitStream # :nodoc:
21
22
 
22
23
  def initialize(data, bits_in_chunk)
@@ -1,6 +1,6 @@
1
1
  # coding: utf-8
2
2
 
3
- require 'hashery'
3
+ require 'hashery/lru_hash'
4
4
 
5
5
  class PDF::Reader
6
6
 
@@ -41,8 +41,8 @@ class PDF::Reader
41
41
  #
42
42
  def initialize(input, opts = {})
43
43
  @io = extract_io_from(input)
44
- @pdf_version = read_version
45
44
  @xref = PDF::Reader::XRef.new(@io)
45
+ @pdf_version = read_version
46
46
  @trailer = @xref.trailer
47
47
  @cache = opts[:cache] || PDF::Reader::ObjectCache.new
48
48
  @sec_handler = build_security_handler(opts)
@@ -0,0 +1,125 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+
5
# Takes a collection of TextRun objects and renders them into a single
# string that best approximates the way they'd appear on a rendered PDF page.
#
# media box should be a 4 number array that describes the dimensions of the
# page to be rendered as described by the page's MediaBox attribute
class PageLayout
  # runs     - objects responding to x, y, font_size, mean_character_width,
  #            text, mergable?, + and <=> (they are sorted and merged here)
  # mediabox - four numbers; width is [2] - [0] and height is [3] - [1]
  def initialize(runs, mediabox)
    @runs             = merge_runs(runs)
    @mean_font_size   = mean(@runs.map(&:font_size))
    @mean_glyph_width = mean(@runs.map(&:mean_character_width))
    @page_width       = mediabox[2] - mediabox[0]
    @page_height      = mediabox[3] - mediabox[1]
    @x_offset         = @runs.map(&:x).min
    @current_platform_is_rbx_19 = RUBY_DESCRIPTION =~ /\Arubinius 2.0.0/ &&
                                    RUBY_VERSION >= "1.9.0"
  end

  # Renders the runs onto a grid of spaces (one string per row) and returns
  # the rows that contain text, right-stripped and joined with newlines.
  # Returns "" when there are no runs or nothing visible was placed.
  def to_s
    return "" if @runs.empty?

    page = row_count.times.map { |i| " " * col_count }
    @runs.each do |run|
      x_pos = ((run.x - @x_offset) / col_multiplier).round
      y_pos = row_count - (run.y / row_multiplier).round
      # runs that land outside the grid are silently dropped
      if y_pos < row_count && y_pos >= 0 && x_pos < col_count && x_pos >= 0
        local_string_insert(page[y_pos], run.text, x_pos)
      end
    end
    interesting_rows(page).map(&:rstrip).join("\n")
  end

  private

  # given an array of strings, return a new array with empty rows from the
  # beginning and end removed.
  #
  #   interesting_rows([ "", "one", "two", "" ])
  #   => [ "one", "two" ]
  #
  # Returns an empty array when no row contains visible text.
  def interesting_rows(rows)
    line_lengths = rows.map { |l| l.strip.length }
    first_line_with_text = line_lengths.index { |l| l > 0 }
    # nothing but whitespace on the page: there are no rows worth keeping
    return [] if first_line_with_text.nil?

    last_line_with_text = line_lengths.rindex { |l| l > 0 }
    rows[first_line_with_text..last_line_with_text]
  end

  # number of text rows on the page: page height divided by mean font size
  def row_count
    @row_count ||= (@page_height / @mean_font_size).floor
  end

  # number of text columns: page width divided by mean glyph width, plus 5%
  def col_count
    @col_count ||= ((@page_width / @mean_glyph_width) * 1.05).floor
  end

  # page units per row. Forced to float so an Integer MediaBox does not
  # truncate the ratio.
  def row_multiplier
    @row_multiplier ||= @page_height.to_f / row_count
  end

  # page units per column. Forced to float so an Integer MediaBox does not
  # truncate the ratio (which would push runs past the right-hand edge).
  def col_multiplier
    @col_multiplier ||= @page_width.to_f / col_count
  end

  # arithmetic mean of a collection of numbers, 0 for an empty collection
  def mean(collection)
    if collection.size == 0
      0
    else
      collection.inject(0) { |accum, v| accum + v} / collection.size.to_f
    end
  end

  # yields each integer y position along with the runs found on that line
  def each_line(&block)
    @runs.sort.group_by { |run|
      run.y.to_i
    }.map { |y, collection|
      yield y, collection
    }
  end

  # take a collection of TextRun objects and merge any that are in close
  # proximity
  def merge_runs(runs)
    runs.group_by { |char|
      char.y.to_i
    }.map { |y, chars|
      group_chars_into_runs(chars.sort)
    }.flatten.sort
  end

  # walks the sorted chars of one line, folding each one into the previous
  # run when that run reports it as mergable. Consumes the chars array.
  def group_chars_into_runs(chars)
    runs = []
    while head = chars.shift
      if !runs.empty? && runs.last.mergable?(head)
        runs[-1] = runs.last + head
      else
        runs << head
      end
    end
    runs
  end

  # This is a simple alternative to String#[]=. We can't use the string
  # method as it's buggy on rubinius 2.0rc1 (in 1.9 mode)
  #
  # See my bug report at https://github.com/rubinius/rubinius/issues/1985
  #
  # Overwrites haystack in place, starting at index, with needle.
  def local_string_insert(haystack, needle, index)
    if @current_platform_is_rbx_19
      char_count = needle.length
      # keep the whole remainder of the row, however wide the page is
      haystack.replace(
        (haystack[0,index] || "") +
        needle +
        (haystack[(index + char_count)..-1] || "")
      )
    else
      haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
    end
  end
end
125
+ end