pdf-reader 0.7 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +6 -1
- data/Rakefile +1 -1
- data/TODO +1 -1
- data/lib/pdf/reader/content.rb +7 -7
- data/lib/pdf/reader/encoding.rb +97 -2
- data/lib/pdf/reader/font.rb +1 -1
- metadata +1 -1
data/CHANGELOG
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
v0.7.1 (6th May 2008)
|
2
|
+
- Non-page strings (ie. metadata, etc) are now converted to UTF-8 more accurately
|
3
|
+
- Fixed a regression between 0.6.2 and 0.7 that prevented difference tables from being applied
|
4
|
+
correctly when translating text into UTF-8
|
5
|
+
|
1
6
|
v0.7 (6th May 2008)
|
2
7
|
- API INCOMPATIBLE CHANGE: any hashes that are passed to callbacks use symbols as keys instead of PDF::Reader::Name instances.
|
3
8
|
- Improved support for converting text in some PDF files to unicode
|
@@ -5,7 +10,7 @@ v0.7 (6th May 2008)
|
|
5
10
|
- Include some basic metadata callbacks
|
6
11
|
- Don't interpret a comment token (%) inside a string as a comment
|
7
12
|
- Small fixes to improve 1.9 compatibility
|
8
|
-
- Improved our Zlib deflating to make it
|
13
|
+
- Improved our Zlib deflating to make it slightly more robust - still some more issues to work out though
|
9
14
|
- Throw an UnsupportedFeatureError if a pdf that uses XRef streams is opened
|
10
15
|
- Added an option to PDF::Reader#file and PDF::Reader#string to enable parsing of only parts of a PDF file (i.e. only metadata, etc.)
|
11
16
|
|
data/Rakefile
CHANGED
data/TODO
CHANGED
@@ -10,7 +10,7 @@ v0.8
|
|
10
10
|
from the Original encoding to Unicode.
|
11
11
|
- detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
|
12
12
|
- Provide a way to get raw access to a particular object. Good for testing purposes
|
13
|
-
- Improve interpretation of non content stream data (ie metadata).
|
13
|
+
- Improve interpretation of non content stream data (ie metadata). recognise dates, etc
|
14
14
|
- Support Cross Reference Streams (spec 3.4.7)
|
15
15
|
|
16
16
|
v0.9
|
data/lib/pdf/reader/content.rb
CHANGED
@@ -23,7 +23,6 @@
|
|
23
23
|
#
|
24
24
|
################################################################################
|
25
25
|
require 'stringio'
|
26
|
-
#require 'enumerable'
|
27
26
|
|
28
27
|
class PDF::Reader
|
29
28
|
################################################################################
|
@@ -254,7 +253,7 @@ class PDF::Reader
|
|
254
253
|
################################################################################
|
255
254
|
# Begin processing the document metadata
|
256
255
|
def metadata (info)
|
257
|
-
info =
|
256
|
+
info = decode_strings(info)
|
258
257
|
callback(:metadata, [info]) if info
|
259
258
|
end
|
260
259
|
################################################################################
|
@@ -430,16 +429,17 @@ class PDF::Reader
|
|
430
429
|
end
|
431
430
|
################################################################################
|
432
431
|
private
|
433
|
-
|
432
|
+
# strings outside of page content should be in either PDFDocEncoding or UTF-16.
|
433
|
+
def decode_strings(obj)
|
434
434
|
case obj
|
435
435
|
when String then
|
436
436
|
if obj[0,2] == "\376\377"
|
437
|
-
|
437
|
+
PDF::Reader::Encoding::UTF16Encoding.new.to_utf8(obj)
|
438
438
|
else
|
439
|
-
obj
|
439
|
+
PDF::Reader::Encoding::PDFDocEncoding.new.to_utf8(obj)
|
440
440
|
end
|
441
|
-
when Hash then obj.each { |key,val| obj[key] =
|
442
|
-
when Array then obj.collect { |item|
|
441
|
+
when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
|
442
|
+
when Array then obj.collect { |item| decode_strings(item) }
|
443
443
|
else
|
444
444
|
obj
|
445
445
|
end
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -34,13 +34,13 @@ class PDF::Reader
|
|
34
34
|
|
35
35
|
# set the differences table for this encoding. should be an array in the following format:
|
36
36
|
#
|
37
|
-
# [25,
|
37
|
+
# [25, :A, 26, :B]
|
38
38
|
#
|
39
39
|
# The array alternates between a decimal byte number and a glyph name to map to that byte
|
40
40
|
#
|
41
41
|
# To save space the following array is also valid and equivalent to the previous one
|
42
42
|
#
|
43
|
-
# [25,
|
43
|
+
# [25, :A, :B]
|
44
44
|
def differences=(diff)
|
45
45
|
raise ArgumentError, "diff must be an array" unless diff.kind_of?(Array)
|
46
46
|
|
@@ -498,6 +498,84 @@ class PDF::Reader
|
|
498
498
|
end
|
499
499
|
end
|
500
500
|
|
501
|
+
class PDFDocEncoding < Encoding
|
502
|
+
# convert a PDFDocEncoding string into UTF-8
|
503
|
+
def to_utf8(str, tounicode = nil)
|
504
|
+
array_pdf = str.unpack('C*')
|
505
|
+
array_pdf = self.process_differences(array_pdf)
|
506
|
+
array_enc = []
|
507
|
+
array_pdf.each do |num|
|
508
|
+
if tounicode && (code = tounicode.decode(num))
|
509
|
+
array_enc << code
|
510
|
+
elsif tounicode
|
511
|
+
array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
|
512
|
+
else
|
513
|
+
case num
|
514
|
+
# change necessary characters to equivalent Unicode codepoints
|
515
|
+
when 0x18; array_enc << 0x02D8
|
516
|
+
when 0x19; array_enc << 0x02C7
|
517
|
+
when 0x1A; array_enc << 0x02C6
|
518
|
+
when 0x1B; array_enc << 0x02D9
|
519
|
+
when 0x1C; array_enc << 0x02DD
|
520
|
+
when 0x1D; array_enc << 0x02DB
|
521
|
+
when 0x1E; array_enc << 0x02DA
|
522
|
+
when 0x1F; array_enc << 0x02DC
|
523
|
+
when 0x7F; array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR # Undefined
|
524
|
+
when 0x80; array_enc << 0x2022
|
525
|
+
when 0x81; array_enc << 0x2020
|
526
|
+
when 0x82; array_enc << 0x2021
|
527
|
+
when 0x83; array_enc << 0x2026
|
528
|
+
when 0x84; array_enc << 0x2014
|
529
|
+
when 0x85; array_enc << 0x2013
|
530
|
+
when 0x86; array_enc << 0x0192
|
531
|
+
when 0x87; array_enc << 0x2044
|
532
|
+
when 0x88; array_enc << 0x2039
|
533
|
+
when 0x89; array_enc << 0x203A
|
534
|
+
when 0x8A; array_enc << 0x2212
|
535
|
+
when 0x8B; array_enc << 0x2030
|
536
|
+
when 0x8C; array_enc << 0x201E
|
537
|
+
when 0x8D; array_enc << 0x201C
|
538
|
+
when 0x8E; array_enc << 0x201D
|
539
|
+
when 0x8F; array_enc << 0x2018
|
540
|
+
when 0x90; array_enc << 0x2019
|
541
|
+
when 0x91; array_enc << 0x201A
|
542
|
+
when 0x92; array_enc << 0x2122
|
543
|
+
when 0x93; array_enc << 0xFB01
|
544
|
+
when 0x94; array_enc << 0xFB02
|
545
|
+
when 0x95; array_enc << 0x0141
|
546
|
+
when 0x96; array_enc << 0x0152
|
547
|
+
when 0x97; array_enc << 0x0160
|
548
|
+
when 0x98; array_enc << 0x0178
|
549
|
+
when 0x99; array_enc << 0x017D
|
550
|
+
when 0x9A; array_enc << 0x0131
|
551
|
+
when 0x9B; array_enc << 0x0142
|
552
|
+
when 0x9C; array_enc << 0x0153
|
553
|
+
when 0x9D; array_enc << 0x0161
|
554
|
+
when 0x9E; array_enc << 0x017E
|
555
|
+
when 0x9F; array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR # Undefined
|
556
|
+
when 0xA0; array_enc << 0x20AC
|
557
|
+
else
|
558
|
+
array_enc << num
|
559
|
+
end
|
560
|
+
end
|
561
|
+
end
|
562
|
+
|
563
|
+
# convert any glyph names to unicode codepoints
|
564
|
+
array_enc = self.process_glyphnames(array_enc)
|
565
|
+
|
566
|
+
# replace characters that didn't convert to unicode nicely with something valid
|
567
|
+
array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }
|
568
|
+
|
569
|
+
# pack all our Unicode codepoints into a UTF-8 string
|
570
|
+
ret = array_enc.pack("U*")
|
571
|
+
|
572
|
+
# set the strings encoding correctly under ruby 1.9+
|
573
|
+
ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
|
574
|
+
|
575
|
+
return ret
|
576
|
+
end
|
577
|
+
end
|
578
|
+
|
501
579
|
class StandardEncoding < Encoding
|
502
580
|
# convert an Adobe Standard Encoding string into UTF-8
|
503
581
|
def to_utf8(str, tounicode = nil)
|
@@ -771,6 +849,23 @@ class PDF::Reader
|
|
771
849
|
end
|
772
850
|
end
|
773
851
|
|
852
|
+
class UTF16Encoding < Encoding
|
853
|
+
# convert a UTF-16 string into UTF-8
|
854
|
+
def to_utf8(str, tounicode = nil)
|
855
|
+
|
856
|
+
# remove the UTF-16 Byte Order Mark if it exists
|
857
|
+
str = str[2, str.size-2] if str[0,2] == "\376\377"
|
858
|
+
|
859
|
+
# convert away
|
860
|
+
str = str.unpack("n*").pack("U*")
|
861
|
+
|
862
|
+
# set the strings encoding correctly under ruby 1.9+
|
863
|
+
str.force_encoding("UTF-8") if str.respond_to?(:force_encoding)
|
864
|
+
|
865
|
+
return str
|
866
|
+
end
|
867
|
+
end
|
868
|
+
|
774
869
|
class WinAnsiEncoding < Encoding
|
775
870
|
# convert a WinAnsiEncoding string into UTF-8
|
776
871
|
def to_utf8(str, tounicode = nil)
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -39,7 +39,7 @@ class PDF::Reader
|
|
39
39
|
File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
|
40
40
|
f.each do |l|
|
41
41
|
m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
42
|
-
@@glyphs[name] = "0x#{code}".hex if name
|
42
|
+
@@glyphs[name.to_sym] = "0x#{code}".hex if name
|
43
43
|
end
|
44
44
|
end
|
45
45
|
end
|