pdf-reader 0.7 → 0.7.1

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,8 @@
1
+ v0.7.1 (6th May 2008)
2
+ - Non-page strings (ie. metadata, etc) are now converted to UTF-8 more accurately
3
+ - Fixed a regression between 0.6.2 and 0.7 that prevented difference tables from being applied
4
+ correctly when translating text into UTF-8
5
+
1
6
  v0.7 (6th May 2008)
2
7
  - API INCOMPATIBLE CHANGE: any hashes that are passed to callbacks use symbols as keys instead of PDF::Reader::Name instances.
3
8
  - Improved support for converting text in some PDF files to unicode
@@ -5,7 +10,7 @@ v0.7 (6th May 2008)
5
10
  - Include some basic metadata callbacks
6
11
  - Don't interpret a comment token (%) inside a string as a comment
7
12
  - Small fixes to improve 1.9 compatibility
8
- - Improved our Zlib deflating to make it more slightly more robust - still some more issues to work out though
13
+ - Improved our Zlib deflating to make it slightly more robust - still some more issues to work out though
9
14
  - Throw an UnsupportedFeatureError if a pdf that uses XRef streams is opened
10
15
  - Added an option to PDF::Reader#file and PDF::Reader#string to enable parsing of only parts of a PDF file(ie. only metadata, etc)
11
16
 
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
6
6
  require "rake/gempackagetask"
7
7
  require 'spec/rake/spectask'
8
8
 
9
- PKG_VERSION = "0.7"
9
+ PKG_VERSION = "0.7.1"
10
10
  PKG_NAME = "pdf-reader"
11
11
  PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
12
12
 
data/TODO CHANGED
@@ -10,7 +10,7 @@ v0.8
10
10
  from the Original encoding to Unicode.
11
11
  - detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
12
12
  - Provide a way to get raw access to a particular object. Good for testing purposes
13
- - Improve interpretation of non content stream data (ie metadata). Use PDFDofEncoding, recognise UTF16 strings, recognise dates, etc
13
+ - Improve interpretation of non content stream data (ie metadata). recognise dates, etc
14
14
  - Support Cross Reference Streams (spec 3.4.7)
15
15
 
16
16
  v0.9
@@ -23,7 +23,6 @@
23
23
  #
24
24
  ################################################################################
25
25
  require 'stringio'
26
- #require 'enumerable'
27
26
 
28
27
  class PDF::Reader
29
28
  ################################################################################
@@ -254,7 +253,7 @@ class PDF::Reader
254
253
  ################################################################################
255
254
  # Begin processing the document metadata
256
255
  def metadata (info)
257
- info = utf16_to_utf8(info)
256
+ info = decode_strings(info)
258
257
  callback(:metadata, [info]) if info
259
258
  end
260
259
  ################################################################################
@@ -430,16 +429,17 @@ class PDF::Reader
430
429
  end
431
430
  ################################################################################
432
431
  private
433
- def utf16_to_utf8(obj)
432
+ # strings outside of page content should be in either PDFDocEncoding or UTF-16.
433
+ def decode_strings(obj)
434
434
  case obj
435
435
  when String then
436
436
  if obj[0,2] == "\376\377"
437
- obj[2, obj.size-2].unpack("n*").pack("U*")
437
+ PDF::Reader::Encoding::UTF16Encoding.new.to_utf8(obj)
438
438
  else
439
- obj
439
+ PDF::Reader::Encoding::PDFDocEncoding.new.to_utf8(obj)
440
440
  end
441
- when Hash then obj.each { |key,val| obj[key] = utf16_to_utf8(val) }
442
- when Array then obj.collect { |item| utf16_to_utf8(item) }
441
+ when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
442
+ when Array then obj.collect { |item| decode_strings(item) }
443
443
  else
444
444
  obj
445
445
  end
@@ -34,13 +34,13 @@ class PDF::Reader
34
34
 
35
35
  # set the differences table for this encoding. should be an array in the following format:
36
36
  #
37
- # [25, "A", 26, "B"]
37
+ # [25, :A, 26, :B]
38
38
  #
39
39
  # The array alternates between a decimal byte number and a glyph name to map to that byte
40
40
  #
41
41
  # To save space the following array is also valid and equivalent to the previous one
42
42
  #
43
- # [25, "A", "B"]
43
+ # [25, :A, :B]
44
44
  def differences=(diff)
45
45
  raise ArgumentError, "diff must be an array" unless diff.kind_of?(Array)
46
46
 
@@ -498,6 +498,84 @@ class PDF::Reader
498
498
  end
499
499
  end
500
500
 
501
+ class PDFDocEncoding < Encoding
502
+ # convert a PDFDocEncoding string into UTF-8
503
+ def to_utf8(str, tounicode = nil)
504
+ array_pdf = str.unpack('C*')
505
+ array_pdf = self.process_differences(array_pdf)
506
+ array_enc = []
507
+ array_pdf.each do |num|
508
+ if tounicode && (code = tounicode.decode(num))
509
+ array_enc << code
510
+ elsif tounicode
511
+ array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
512
+ else
513
+ case num
514
+ # change necessary characters to equivalent Unicode codepoints
515
+ when 0x18; array_enc << 0x02D8
516
+ when 0x19; array_enc << 0x02C7
517
+ when 0x1A; array_enc << 0x02C6
518
+ when 0x1B; array_enc << 0x02D9
519
+ when 0x1C; array_enc << 0x02DD
520
+ when 0x1D; array_enc << 0x02DB
521
+ when 0x1E; array_enc << 0x02DA
522
+ when 0x1F; array_enc << 0x02DC
523
+ when 0x7F; array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR # Undefined
524
+ when 0x80; array_enc << 0x2022
525
+ when 0x81; array_enc << 0x2020
526
+ when 0x82; array_enc << 0x2021
527
+ when 0x83; array_enc << 0x2026
528
+ when 0x84; array_enc << 0x2014
529
+ when 0x85; array_enc << 0x2013
530
+ when 0x86; array_enc << 0x0192
531
+ when 0x87; array_enc << 0x2044
532
+ when 0x88; array_enc << 0x2039
533
+ when 0x89; array_enc << 0x203A
534
+ when 0x8A; array_enc << 0x2212
535
+ when 0x8B; array_enc << 0x2030
536
+ when 0x8C; array_enc << 0x201E
537
+ when 0x8D; array_enc << 0x201C
538
+ when 0x8E; array_enc << 0x201D
539
+ when 0x8F; array_enc << 0x2018
540
+ when 0x90; array_enc << 0x2019
541
+ when 0x91; array_enc << 0x201A
542
+ when 0x92; array_enc << 0x2122
543
+ when 0x93; array_enc << 0xFB01
544
+ when 0x94; array_enc << 0xFB02
545
+ when 0x95; array_enc << 0x0141
546
+ when 0x96; array_enc << 0x0152
547
+ when 0x97; array_enc << 0x0160
548
+ when 0x98; array_enc << 0x0178
549
+ when 0x99; array_enc << 0x017D
550
+ when 0x9A; array_enc << 0x0131
551
+ when 0x9B; array_enc << 0x0142
552
+ when 0x9C; array_enc << 0x0153
553
+ when 0x9D; array_enc << 0x0161
554
+ when 0x9E; array_enc << 0x017E
555
+ when 0x9F; array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR # Undefined
556
+ when 0xA0; array_enc << 0x20AC
557
+ else
558
+ array_enc << num
559
+ end
560
+ end
561
+ end
562
+
563
+ # convert any glyph names to unicode codepoints
564
+ array_enc = self.process_glyphnames(array_enc)
565
+
566
+ # replace characters that didn't convert to unicode nicely with something valid
567
+ array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }
568
+
569
+ # pack all our Unicode codepoints into a UTF-8 string
570
+ ret = array_enc.pack("U*")
571
+
572
+ # set the string's encoding correctly under ruby 1.9+
573
+ ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
574
+
575
+ return ret
576
+ end
577
+ end
578
+
501
579
  class StandardEncoding < Encoding
502
580
  # convert an Adobe Standard Encoding string into UTF-8
503
581
  def to_utf8(str, tounicode = nil)
@@ -771,6 +849,23 @@ class PDF::Reader
771
849
  end
772
850
  end
773
851
 
852
+ class UTF16Encoding < Encoding
853
+ # convert a UTF-16 string into UTF-8
854
+ def to_utf8(str, tounicode = nil)
855
+
856
+ # remove the UTF-16 Byte Order Mark if it exists
857
+ str = str[2, str.size-2] if str[0,2] == "\376\377"
858
+
859
+ # convert away
860
+ str = str.unpack("n*").pack("U*")
861
+
862
+ # set the string's encoding correctly under ruby 1.9+
863
+ str.force_encoding("UTF-8") if str.respond_to?(:force_encoding)
864
+
865
+ return str
866
+ end
867
+ end
868
+
774
869
  class WinAnsiEncoding < Encoding
775
870
  # convert a WinAnsiEncoding string into UTF-8
776
871
  def to_utf8(str, tounicode = nil)
@@ -39,7 +39,7 @@ class PDF::Reader
39
39
  File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
40
40
  f.each do |l|
41
41
  m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
42
- @@glyphs[name] = "0x#{code}".hex if name
42
+ @@glyphs[name.to_sym] = "0x#{code}".hex if name
43
43
  end
44
44
  end
45
45
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: "0.7"
4
+ version: 0.7.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Jones