pdf-reader 0.9.1 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,11 @@
1
+ v0.9.2 (24th April 2011)
2
+ - add basic support for fonts with Identity-V encoding.
3
+ - bug: improve robustness of text extraction
4
+ - thanks to Evan Arnold for reporting
5
+ - bug: fix loading of nested resources on XObjects
6
+ - thanks to Samuel Williams for reporting
7
+ - bug: improve parsing of files with XRef object streams
8
+
1
9
  v0.9.1 (21st December 2010)
2
10
  - force gem to only install on ruby 1.8.7 or higher
3
11
  - maintaining support for earlier versions takes more time than I have
@@ -116,7 +116,7 @@ class PDF::Reader
116
116
  def original_codepoint_to_unicode(cp, tounicode = nil)
117
117
  if tounicode && (code = tounicode.decode(cp))
118
118
  code
119
- elsif tounicode || ( tounicode.nil? && to_unicode_required? )
119
+ elsif to_unicode_required? && (tounicode.nil? || tounicode.decode(cp).nil?)
120
120
  PDF::Reader::Encoding::UNKNOWN_CHAR
121
121
  elsif mapping[cp]
122
122
  mapping[cp]
@@ -129,7 +129,7 @@ class PDF::Reader
129
129
 
130
130
  def get_unpack(enc)
131
131
  case enc
132
- when :"Identity-H", :UTF16Encoding
132
+ when :"Identity-H", :"Identity-V", :UTF16Encoding
133
133
  "n*"
134
134
  else
135
135
  "C*"
@@ -140,6 +140,7 @@ class PDF::Reader
140
140
  return File.dirname(__FILE__) + "/encodings/standard.txt" if enc.nil?
141
141
  files = {
142
142
  :"Identity-H" => nil,
143
+ :"Identity-V" => nil,
143
144
  :MacRomanEncoding => File.dirname(__FILE__) + "/encodings/mac_roman.txt",
144
145
  :MacExpertEncoding => File.dirname(__FILE__) + "/encodings/mac_expert.txt",
145
146
  :PDFDocEncoding => File.dirname(__FILE__) + "/encodings/pdf_doc.txt",
@@ -158,7 +159,7 @@ class PDF::Reader
158
159
  end
159
160
 
160
161
  def unicode_required?(enc)
161
- enc == :"Identity-H"
162
+ enc == :"Identity-H" or enc == :"Identity-V"
162
163
  end
163
164
 
164
165
  def mapping
@@ -310,11 +310,15 @@ class PDF::Reader
310
310
 
311
311
  if xobject && xobject.hash[:Subtype] == :Form
312
312
  callback(:begin_form_xobject)
313
- resources = @ohash.object(xobject.hash[:Resources])
314
- walk_resources(resources) if resources
315
- fonts = font_hash_from_resources(resources)
313
+ xobj_resources = @ohash.object(xobject.hash[:Resources])
314
+ if xobj_resources
315
+ resources.push xobj_resources
316
+ walk_resources(xobj_resources)
317
+ end
318
+ fonts = font_hash_from_resources(xobj_resources)
316
319
  content_stream(xobject, fonts)
317
320
  callback(:end_form_xobject)
321
+ resources.pop if xobj_resources
318
322
  end
319
323
  end
320
324
 
@@ -439,8 +443,11 @@ class PDF::Reader
439
443
  obj
440
444
  when PDF::Reader::Reference then
441
445
  resolve_references(@ohash.object(obj))
442
- when Hash then obj.each { |key,val| obj[key] = resolve_references(val) }
443
- when Array then obj.collect { |item| resolve_references(item) }
446
+ when Hash then
447
+ arr = obj.map { |key,val| [key, resolve_references(val)] }.flatten(1)
448
+ Hash[*arr]
449
+ when Array then
450
+ obj.collect { |item| resolve_references(item) }
444
451
  else
445
452
  obj
446
453
  end
@@ -154,23 +154,26 @@ class PDF::Reader
154
154
  trailer[:Info] = stream.hash[:Info] if stream.hash[:Info]
155
155
  trailer[:Prev] = stream.hash[:Prev] if stream.hash[:Prev]
156
156
 
157
- widths = stream.hash[:W]
157
+ widths = stream.hash[:W]
158
158
  entry_length = widths.inject(0) { |s, w| s + w }
159
- raw_data = stream.unfiltered_data
159
+ raw_data = StringIO.new(stream.unfiltered_data)
160
160
  if stream.hash[:Index]
161
- index = stream.hash[:Index][0]
161
+ index = stream.hash[:Index]
162
162
  else
163
- index = 0
163
+ index = [0, stream.hash[:Size]]
164
164
  end
165
- stream.hash[:Size].times do |i|
166
- entry = raw_data[i*entry_length, entry_length] || ""
167
- f1 = unpack_bytes(entry[0,widths[0]])
168
- f2 = unpack_bytes(entry[widths[0],widths[1]])
169
- f3 = unpack_bytes(entry[widths[0]+widths[1],widths[2]])
170
- if f1 == 1
171
- store(index + i, f3, f2)
172
- elsif f1 == 2
173
- store(index + i, 0, PDF::Reader::Reference.new(f2, 0))
165
+ index.each_slice(2) do |start_id, size|
166
+ obj_ids = (start_id..(start_id+(size-1)))
167
+ obj_ids.each do |objid|
168
+ entry = raw_data.read(entry_length) || ""
169
+ f1 = unpack_bytes(entry[0,widths[0]])
170
+ f2 = unpack_bytes(entry[widths[0],widths[1]])
171
+ f3 = unpack_bytes(entry[widths[0]+widths[1],widths[2]])
172
+ if f1 == 1 && f2 > 0
173
+ store(objid, f3, f2)
174
+ elsif f1 == 2 && f2 > 0
175
+ store(objid, 0, PDF::Reader::Reference.new(f2, 0))
176
+ end
174
177
  end
175
178
  end
176
179
 
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- hash: 57
5
4
  prerelease: false
6
5
  segments:
7
6
  - 0
8
7
  - 9
9
- - 1
10
- version: 0.9.1
8
+ - 2
9
+ version: 0.9.2
11
10
  platform: ruby
12
11
  authors:
13
12
  - James Healy
@@ -15,7 +14,7 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2010-12-21 00:00:00 +11:00
17
+ date: 2011-04-24 00:00:00 +10:00
19
18
  default_executable:
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
@@ -26,7 +25,6 @@ dependencies:
26
25
  requirements:
27
26
  - - ">="
28
27
  - !ruby/object:Gem::Version
29
- hash: 3
30
28
  segments:
31
29
  - 0
32
30
  version: "0"
@@ -40,7 +38,6 @@ dependencies:
40
38
  requirements:
41
39
  - - ">="
42
40
  - !ruby/object:Gem::Version
43
- hash: 3
44
41
  segments:
45
42
  - 0
46
43
  version: "0"
@@ -54,7 +51,6 @@ dependencies:
54
51
  requirements:
55
52
  - - ~>
56
53
  - !ruby/object:Gem::Version
57
- hash: 1
58
54
  segments:
59
55
  - 2
60
56
  - 1
@@ -69,7 +65,6 @@ dependencies:
69
65
  requirements:
70
66
  - - ">="
71
67
  - !ruby/object:Gem::Version
72
- hash: 25
73
68
  segments:
74
69
  - 0
75
70
  - 9
@@ -158,7 +153,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
158
153
  requirements:
159
154
  - - ">="
160
155
  - !ruby/object:Gem::Version
161
- hash: 57
162
156
  segments:
163
157
  - 1
164
158
  - 8
@@ -169,7 +163,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
169
163
  requirements:
170
164
  - - ">="
171
165
  - !ruby/object:Gem::Version
172
- hash: 3
173
166
  segments:
174
167
  - 0
175
168
  version: "0"