pdf-reader 1.3.0 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,6 @@
1
+ v1.3.1 (12th February 2013)
2
+ - various bug fixes
3
+
1
4
  v1.3.0 (30th December 2012)
2
5
  - Numerous performance optimisations (thanks Alex Dowad)
3
6
  - Improved text extraction (thanks Nathaniel Madura)
@@ -77,17 +77,11 @@ of PDF::Reader::Page.
77
77
  receiver = RedGreenBlue.new
78
78
  page.walk(receiver)
79
79
 
80
- For low level access to the objects in a PDF file, use the ObjectHash class. You can
81
- build an ObjectHash instance directly:
82
-
83
- puts PDF::Reader::ObjectHash.new("somefile.pdf")
84
-
85
- or via a PDF::Reader instance:
80
+ For low level access to the objects in a PDF file, use the ObjectHash class like
81
+ so:
86
82
 
87
83
  reader = PDF::Reader.new("somefile.pdf")
88
- puts reader.objects
89
-
90
- The second method is preferred to increase the effectiveness of internal caching.
84
+ puts reader.objects.inspect
91
85
 
92
86
  = Text Encoding
93
87
 
@@ -61,6 +61,10 @@ module PDF
61
61
  hash.merge!(@objects.deref(obj))
62
62
  end
63
63
  }
64
+ # This shouldn't be necessary, but some non-compliant PDFs leave MediaBox
65
+ # out. Assuming 8.5" x 11" is what Acrobat does, so we do it too.
66
+ @attributes[:MediaBox] ||= [0,0,612,792]
67
+ @attributes
64
68
  end
65
69
 
66
70
  # returns the plain text content of this page encoded as UTF-8. Any
@@ -9,6 +9,8 @@ class PDF::Reader
9
9
  # page to be rendered as described by the page's MediaBox attribute
10
10
  class PageLayout
11
11
  def initialize(runs, mediabox)
12
+ raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
13
+
12
14
  @runs = merge_runs(runs)
13
15
  @mean_font_size = mean(@runs.map(&:font_size)) || 0
14
16
  @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
@@ -58,11 +60,11 @@ class PDF::Reader
58
60
  end
59
61
 
60
62
  def row_multiplier
61
- @row_multiplier ||= @page_height / row_count
63
+ @row_multiplier ||= @page_height.to_f / row_count.to_f
62
64
  end
63
65
 
64
66
  def col_multiplier
65
- @col_multiplier ||= @page_width / col_count
67
+ @col_multiplier ||= @page_width.to_f / col_count.to_f
66
68
  end
67
69
 
68
70
  def mean(collection)
@@ -63,7 +63,12 @@ class PDF::Reader
63
63
  #
64
64
  def concatenate_matrix(a, b, c, d, e, f)
65
65
  if state[:ctm]
66
- state[:ctm].multiply!(a,b,c,d,e,f)
66
+ ctm = state[:ctm]
67
+ state[:ctm] = TransformationMatrix.new(a,b,c,d,e,f).multiply!(
68
+ ctm.a, ctm.b,
69
+ ctm.c, ctm.d,
70
+ ctm.e, ctm.f
71
+ )
67
72
  else
68
73
  state[:ctm] = TransformationMatrix.new(a,b,c,d,e,f)
69
74
  end
@@ -102,7 +107,11 @@ class PDF::Reader
102
107
  end
103
108
 
104
109
  def font_size
105
- @font_size ||= state[:text_font_size] * @text_matrix.a * ctm.a
110
+ @font_size ||= begin
111
+ _, zero = trm_transform(0,0)
112
+ _, one = trm_transform(1,1)
113
+ (zero - one).abs
114
+ end
106
115
  end
107
116
 
108
117
  def set_text_leading(leading)
@@ -324,7 +333,7 @@ class PDF::Reader
324
333
  # ctm[0] here, but this gets my tests green and I'm out of
325
334
  # ideas for now
326
335
  # TODO: support ty > 0
327
- if ctm.a == 1
336
+ if ctm.a == 1 || ctm.a == 0
328
337
  @text_matrix.horizontal_displacement_multiply!(tx)
329
338
  else
330
339
  @text_matrix.horizontal_displacement_multiply!(tx/ctm.a)
@@ -341,8 +350,8 @@ class PDF::Reader
341
350
  def text_rendering_matrix
342
351
  @text_rendering_matrix ||= begin
343
352
  state_matrix = TransformationMatrix.new(
344
- font_size * state[:h_scaling], 0,
345
- 0, font_size,
353
+ state[:text_font_size] * state[:h_scaling], 0,
354
+ 0, state[:text_font_size],
346
355
  0, state[:text_rise]
347
356
  )
348
357
  state_matrix.multiply!(
@@ -58,8 +58,12 @@ module PDF
58
58
  end
59
59
 
60
60
  def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)]
61
- params.each_slice(2).each do |string, kerning|
62
- internal_show_text(string, kerning || 0)
61
+ params.each do |arg|
62
+ if arg.is_a?(String)
63
+ internal_show_text(arg)
64
+ else
65
+ @state.process_glyph_displacement(0, arg, false)
66
+ end
63
67
  end
64
68
  end
65
69
 
@@ -88,7 +92,7 @@ module PDF
88
92
 
89
93
  private
90
94
 
91
- def internal_show_text(string, kerning = 0)
95
+ def internal_show_text(string)
92
96
  if @state.current_font.nil?
93
97
  raise PDF::Reader::MalformedPDFError, "current font is invalid"
94
98
  end
@@ -102,16 +106,11 @@ module PDF
102
106
  # glyph will appear in the correct position
103
107
  glyph_width = @state.current_font.glyph_width(glyph_code) / 1000.0
104
108
  th = 1
105
- if kerning != 0 && index == glyphs.size - 1
106
- tj = kerning
107
- else
108
- tj = 0
109
- end
110
109
  scaled_glyph_width = glyph_width * @state.font_size * th
111
110
  unless utf8_chars == SPACE
112
111
  @characters << TextRun.new(newx, newy, scaled_glyph_width, @state.font_size, utf8_chars)
113
112
  end
114
- @state.process_glyph_displacement(glyph_width, tj, utf8_chars == SPACE)
113
+ @state.process_glyph_displacement(glyph_width, 0, utf8_chars == SPACE)
115
114
  end
116
115
  end
117
116
 
@@ -42,7 +42,14 @@ class PDF::Reader
42
42
  name = @font.encoding.int_to_name(code_point)
43
43
  m = @metrics.metrics_for_name(name)
44
44
  end
45
- m[:wx]
45
+
46
+ if m
47
+ m[:wx]
48
+ elsif @font.widths[code_point - 1]
49
+ @font.widths[code_point - 1]
50
+ else
51
+ raise ArgumentError, "Unknown glyph width for #{codepoint}"
52
+ end
46
53
  end
47
54
 
48
55
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.3.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-29 00:00:00.000000000 Z
12
+ date: 2013-02-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake