pdf-reader 1.3.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,6 @@
1
+ v1.3.1 (12th February 2013)
2
+ - various bug fixes
3
+
1
4
  v1.3.0 (30th December 2012)
2
5
  - Numerous performance optimisations (thanks Alex Dowad)
3
6
  - Improved text extraction (thanks Nathaniel Madura)
@@ -77,17 +77,11 @@ of PDF::Reader::Page.
77
77
  receiver = RedGreenBlue.new
78
78
  page.walk(receiver)
79
79
 
80
- For low level access to the objects in a PDF file, use the ObjectHash class. You can
81
- build an ObjectHash instance directly:
82
-
83
- puts PDF::Reader::ObjectHash.new("somefile.pdf")
84
-
85
- or via a PDF::Reader instance:
80
+ For low level access to the objects in a PDF file, use the ObjectHash class like
81
+ so:
86
82
 
87
83
  reader = PDF::Reader.new("somefile.pdf")
88
- puts reader.objects
89
-
90
- The second method is preferred to increase the effectiveness of internal caching.
84
+ puts reader.objects.inspect
91
85
 
92
86
  = Text Encoding
93
87
 
@@ -61,6 +61,10 @@ module PDF
61
61
  hash.merge!(@objects.deref(obj))
62
62
  end
63
63
  }
64
+ # This shouldn't be necessary, but some non-compliant PDFs leave MediaBox
65
+ # out. Assuming 8.5" x 11" is what Acrobat does, so we do it too.
66
+ @attributes[:MediaBox] ||= [0,0,612,792]
67
+ @attributes
64
68
  end
65
69
 
66
70
  # returns the plain text content of this page encoded as UTF-8. Any
@@ -9,6 +9,8 @@ class PDF::Reader
9
9
  # page to be rendered as described by the page's MediaBox attribute
10
10
  class PageLayout
11
11
  def initialize(runs, mediabox)
12
+ raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
13
+
12
14
  @runs = merge_runs(runs)
13
15
  @mean_font_size = mean(@runs.map(&:font_size)) || 0
14
16
  @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
@@ -58,11 +60,11 @@ class PDF::Reader
58
60
  end
59
61
 
60
62
  def row_multiplier
61
- @row_multiplier ||= @page_height / row_count
63
+ @row_multiplier ||= @page_height.to_f / row_count.to_f
62
64
  end
63
65
 
64
66
  def col_multiplier
65
- @col_multiplier ||= @page_width / col_count
67
+ @col_multiplier ||= @page_width.to_f / col_count.to_f
66
68
  end
67
69
 
68
70
  def mean(collection)
@@ -63,7 +63,12 @@ class PDF::Reader
63
63
  #
64
64
  def concatenate_matrix(a, b, c, d, e, f)
65
65
  if state[:ctm]
66
- state[:ctm].multiply!(a,b,c,d,e,f)
66
+ ctm = state[:ctm]
67
+ state[:ctm] = TransformationMatrix.new(a,b,c,d,e,f).multiply!(
68
+ ctm.a, ctm.b,
69
+ ctm.c, ctm.d,
70
+ ctm.e, ctm.f
71
+ )
67
72
  else
68
73
  state[:ctm] = TransformationMatrix.new(a,b,c,d,e,f)
69
74
  end
@@ -102,7 +107,11 @@ class PDF::Reader
102
107
  end
103
108
 
104
109
  def font_size
105
- @font_size ||= state[:text_font_size] * @text_matrix.a * ctm.a
110
+ @font_size ||= begin
111
+ _, zero = trm_transform(0,0)
112
+ _, one = trm_transform(1,1)
113
+ (zero - one).abs
114
+ end
106
115
  end
107
116
 
108
117
  def set_text_leading(leading)
@@ -324,7 +333,7 @@ class PDF::Reader
324
333
  # ctm[0] here, but this gets my tests green and I'm out of
325
334
  # ideas for now
326
335
  # TODO: support ty > 0
327
- if ctm.a == 1
336
+ if ctm.a == 1 || ctm.a == 0
328
337
  @text_matrix.horizontal_displacement_multiply!(tx)
329
338
  else
330
339
  @text_matrix.horizontal_displacement_multiply!(tx/ctm.a)
@@ -341,8 +350,8 @@ class PDF::Reader
341
350
  def text_rendering_matrix
342
351
  @text_rendering_matrix ||= begin
343
352
  state_matrix = TransformationMatrix.new(
344
- font_size * state[:h_scaling], 0,
345
- 0, font_size,
353
+ state[:text_font_size] * state[:h_scaling], 0,
354
+ 0, state[:text_font_size],
346
355
  0, state[:text_rise]
347
356
  )
348
357
  state_matrix.multiply!(
@@ -58,8 +58,12 @@ module PDF
58
58
  end
59
59
 
60
60
  def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)]
61
- params.each_slice(2).each do |string, kerning|
62
- internal_show_text(string, kerning || 0)
61
+ params.each do |arg|
62
+ if arg.is_a?(String)
63
+ internal_show_text(arg)
64
+ else
65
+ @state.process_glyph_displacement(0, arg, false)
66
+ end
63
67
  end
64
68
  end
65
69
 
@@ -88,7 +92,7 @@ module PDF
88
92
 
89
93
  private
90
94
 
91
- def internal_show_text(string, kerning = 0)
95
+ def internal_show_text(string)
92
96
  if @state.current_font.nil?
93
97
  raise PDF::Reader::MalformedPDFError, "current font is invalid"
94
98
  end
@@ -102,16 +106,11 @@ module PDF
102
106
  # glyph will appear in the correct position
103
107
  glyph_width = @state.current_font.glyph_width(glyph_code) / 1000.0
104
108
  th = 1
105
- if kerning != 0 && index == glyphs.size - 1
106
- tj = kerning
107
- else
108
- tj = 0
109
- end
110
109
  scaled_glyph_width = glyph_width * @state.font_size * th
111
110
  unless utf8_chars == SPACE
112
111
  @characters << TextRun.new(newx, newy, scaled_glyph_width, @state.font_size, utf8_chars)
113
112
  end
114
- @state.process_glyph_displacement(glyph_width, tj, utf8_chars == SPACE)
113
+ @state.process_glyph_displacement(glyph_width, 0, utf8_chars == SPACE)
115
114
  end
116
115
  end
117
116
 
@@ -42,7 +42,14 @@ class PDF::Reader
42
42
  name = @font.encoding.int_to_name(code_point)
43
43
  m = @metrics.metrics_for_name(name)
44
44
  end
45
- m[:wx]
45
+
46
+ if m
47
+ m[:wx]
48
+ elsif @font.widths[code_point - 1]
49
+ @font.widths[code_point - 1]
50
+ else
51
+ raise ArgumentError, "Unknown glyph width for #{codepoint}"
52
+ end
46
53
  end
47
54
 
48
55
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.3.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-29 00:00:00.000000000 Z
12
+ date: 2013-02-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake