pdf-reader 0.7 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,8 @@
1
+ v0.7.1 (6th May 2008)
2
+ - Non-page strings (ie. metadata, etc) are now converted to UTF-8 more accurately
3
+ - Fixed a regression between 0.6.2 and 0.7 that prevented difference tables from being applied
4
+ correctly when translating text into UTF-8
5
+
1
6
  v0.7 (6th May 2008)
2
7
  - API INCOMPATIBLE CHANGE: any hashes that are passed to callbacks use symbols as keys instead of PDF::Reader::Name instances.
3
8
  - Improved support for converting text in some PDF files to unicode
@@ -5,7 +10,7 @@ v0.7 (6th May 2008)
5
10
  - Include some basic metadata callbacks
6
11
  - Don't interpret a comment token (%) inside a string as a comment
7
12
  - Small fixes to improve 1.9 compatibility
8
- - Improved our Zlib deflating to make it more slightly more robust - still some more issues to work out though
13
+ - Improved our Zlib deflating to make it slightly more robust - still some more issues to work out though
9
14
  - Throw an UnsupportedFeatureError if a pdf that uses XRef streams is opened
10
15
  - Added an option to PDF::Reader#file and PDF::Reader#string to enable parsing of only parts of a PDF file(ie. only metadata, etc)
11
16
 
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
6
6
  require "rake/gempackagetask"
7
7
  require 'spec/rake/spectask'
8
8
 
9
- PKG_VERSION = "0.7"
9
+ PKG_VERSION = "0.7.1"
10
10
  PKG_NAME = "pdf-reader"
11
11
  PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
12
12
 
data/TODO CHANGED
@@ -10,7 +10,7 @@ v0.8
10
10
  from the Original encoding to Unicode.
11
11
  - detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
12
12
  - Provide a way to get raw access to a particular object. Good for testing purposes
13
- - Improve interpretation of non content stream data (ie metadata). Use PDFDofEncoding, recognise UTF16 strings, recognise dates, etc
13
+ - Improve interpretation of non content stream data (ie metadata). Recognise dates, etc
14
14
  - Support Cross Reference Streams (spec 3.4.7)
15
15
 
16
16
  v0.9
@@ -23,7 +23,6 @@
23
23
  #
24
24
  ################################################################################
25
25
  require 'stringio'
26
- #require 'enumerable'
27
26
 
28
27
  class PDF::Reader
29
28
  ################################################################################
@@ -254,7 +253,7 @@ class PDF::Reader
254
253
  ################################################################################
255
254
  # Begin processing the document metadata
256
255
  def metadata (info)
257
- info = utf16_to_utf8(info)
256
+ info = decode_strings(info)
258
257
  callback(:metadata, [info]) if info
259
258
  end
260
259
  ################################################################################
@@ -430,16 +429,17 @@ class PDF::Reader
430
429
  end
431
430
  ################################################################################
432
431
  private
433
- def utf16_to_utf8(obj)
432
+ # strings outside of page content should be in either PDFDocEncoding or UTF-16.
433
+ def decode_strings(obj)
434
434
  case obj
435
435
  when String then
436
436
  if obj[0,2] == "\376\377"
437
- obj[2, obj.size-2].unpack("n*").pack("U*")
437
+ PDF::Reader::Encoding::UTF16Encoding.new.to_utf8(obj)
438
438
  else
439
- obj
439
+ PDF::Reader::Encoding::PDFDocEncoding.new.to_utf8(obj)
440
440
  end
441
- when Hash then obj.each { |key,val| obj[key] = utf16_to_utf8(val) }
442
- when Array then obj.collect { |item| utf16_to_utf8(item) }
441
+ when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
442
+ when Array then obj.collect { |item| decode_strings(item) }
443
443
  else
444
444
  obj
445
445
  end
@@ -34,13 +34,13 @@ class PDF::Reader
34
34
 
35
35
  # set the differences table for this encoding. should be an array in the following format:
36
36
  #
37
- # [25, "A", 26, "B"]
37
+ # [25, :A, 26, :B]
38
38
  #
39
39
  # The array alternates between a decimal byte number and a glyph name to map to that byte
40
40
  #
41
41
  # To save space the following array is also valid and equivalent to the previous one
42
42
  #
43
- # [25, "A", "B"]
43
+ # [25, :A, :B]
44
44
  def differences=(diff)
45
45
  raise ArgumentError, "diff must be an array" unless diff.kind_of?(Array)
46
46
 
@@ -498,6 +498,84 @@ class PDF::Reader
498
498
  end
499
499
  end
500
500
 
501
+ class PDFDocEncoding < Encoding
502
+ # convert a PDFDocEncoding string into UTF-8
503
+ def to_utf8(str, tounicode = nil)
504
+ array_pdf = str.unpack('C*')
505
+ array_pdf = self.process_differences(array_pdf)
506
+ array_enc = []
507
+ array_pdf.each do |num|
508
+ if tounicode && (code = tounicode.decode(num))
509
+ array_enc << code
510
+ elsif tounicode
511
+ array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
512
+ else
513
+ case num
514
+ # change necessary characters to equivalent Unicode codepoints
515
+ when 0x18; array_enc << 0x02D8
516
+ when 0x19; array_enc << 0x02C7
517
+ when 0x1A; array_enc << 0x02C6
518
+ when 0x1B; array_enc << 0x02D9
519
+ when 0x1C; array_enc << 0x02DD
520
+ when 0x1D; array_enc << 0x02DB
521
+ when 0x1E; array_enc << 0x02DA
522
+ when 0x1F; array_enc << 0x02DC
523
+ when 0x7F; array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR # Undefined
524
+ when 0x80; array_enc << 0x2022
525
+ when 0x81; array_enc << 0x2020
526
+ when 0x82; array_enc << 0x2021
527
+ when 0x83; array_enc << 0x2026
528
+ when 0x84; array_enc << 0x2014
529
+ when 0x85; array_enc << 0x2013
530
+ when 0x86; array_enc << 0x0192
531
+ when 0x87; array_enc << 0x2044
532
+ when 0x88; array_enc << 0x2039
533
+ when 0x89; array_enc << 0x203A
534
+ when 0x8A; array_enc << 0x2212
535
+ when 0x8B; array_enc << 0x2030
536
+ when 0x8C; array_enc << 0x201E
537
+ when 0x8D; array_enc << 0x201C
538
+ when 0x8E; array_enc << 0x201D
539
+ when 0x8F; array_enc << 0x2018
540
+ when 0x90; array_enc << 0x2019
541
+ when 0x91; array_enc << 0x201A
542
+ when 0x92; array_enc << 0x2122
543
+ when 0x93; array_enc << 0xFB01
544
+ when 0x94; array_enc << 0xFB02
545
+ when 0x95; array_enc << 0x0141
546
+ when 0x96; array_enc << 0x0152
547
+ when 0x97; array_enc << 0x0160
548
+ when 0x98; array_enc << 0x0178
549
+ when 0x99; array_enc << 0x017D
550
+ when 0x9A; array_enc << 0x0131
551
+ when 0x9B; array_enc << 0x0142
552
+ when 0x9C; array_enc << 0x0153
553
+ when 0x9D; array_enc << 0x0161
554
+ when 0x9E; array_enc << 0x017E
555
+ when 0x9F; array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR # Undefined
556
+ when 0xA0; array_enc << 0x20AC
557
+ else
558
+ array_enc << num
559
+ end
560
+ end
561
+ end
562
+
563
+ # convert any glyph names to unicode codepoints
564
+ array_enc = self.process_glyphnames(array_enc)
565
+
566
+ # replace characters that didn't convert to unicode nicely with something valid
567
+ array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }
568
+
569
+ # pack all our Unicode codepoints into a UTF-8 string
570
+ ret = array_enc.pack("U*")
571
+
572
+ # set the string's encoding correctly under ruby 1.9+
573
+ ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
574
+
575
+ return ret
576
+ end
577
+ end
578
+
501
579
  class StandardEncoding < Encoding
502
580
  # convert an Adobe Standard Encoding string into UTF-8
503
581
  def to_utf8(str, tounicode = nil)
@@ -771,6 +849,23 @@ class PDF::Reader
771
849
  end
772
850
  end
773
851
 
852
+ class UTF16Encoding < Encoding
853
+ # convert a UTF-16 string into UTF-8
854
+ def to_utf8(str, tounicode = nil)
855
+
856
+ # remove the UTF-16 Byte Order Mark if it exists
857
+ str = str[2, str.size-2] if str[0,2] == "\376\377"
858
+
859
+ # convert away
860
+ str = str.unpack("n*").pack("U*")
861
+
862
+ # set the string's encoding correctly under ruby 1.9+
863
+ str.force_encoding("UTF-8") if str.respond_to?(:force_encoding)
864
+
865
+ return str
866
+ end
867
+ end
868
+
774
869
  class WinAnsiEncoding < Encoding
775
870
  # convert a WinAnsiEncoding string into UTF-8
776
871
  def to_utf8(str, tounicode = nil)
@@ -39,7 +39,7 @@ class PDF::Reader
39
39
  File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
40
40
  f.each do |l|
41
41
  m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
42
- @@glyphs[name] = "0x#{code}".hex if name
42
+ @@glyphs[name.to_sym] = "0x#{code}".hex if name
43
43
  end
44
44
  end
45
45
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: "0.7"
4
+ version: 0.7.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Jones