pdf-reader 0.7 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +6 -1
- data/Rakefile +1 -1
- data/TODO +1 -1
- data/lib/pdf/reader/content.rb +7 -7
- data/lib/pdf/reader/encoding.rb +97 -2
- data/lib/pdf/reader/font.rb +1 -1
- metadata +1 -1
data/CHANGELOG
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
v0.7.1 (6th May 2008)
|
2
|
+
- Non-page strings (ie. metadata, etc) are now converted to UTF-8 more accurately
|
3
|
+
- Fixed a regression between 0.6.2 and 0.7 that prevented difference tables from being applied
|
4
|
+
correctly when translating text into UTF-8
|
5
|
+
|
1
6
|
v0.7 (6th May 2008)
|
2
7
|
- API INCOMPATIBLE CHANGE: any hashes that are passed to callbacks use symbols as keys instead of PDF::Reader::Name instances.
|
3
8
|
- Improved support for converting text in some PDF files to unicode
|
@@ -5,7 +10,7 @@ v0.7 (6th May 2008)
|
|
5
10
|
- Include some basic metadata callbacks
|
6
11
|
- Don't interpret a comment token (%) inside a string as a comment
|
7
12
|
- Small fixes to improve 1.9 compatibility
|
8
|
-
- Improved our Zlib deflating to make it
|
13
|
+
- Improved our Zlib deflating to make it slightly more robust - still some more issues to work out though
|
9
14
|
- Throw an UnsupportedFeatureError if a pdf that uses XRef streams is opened
|
10
15
|
- Added an option to PDF::Reader#file and PDF::Reader#string to enable parsing of only parts of a PDF file(ie. only metadata, etc)
|
11
16
|
|
data/Rakefile
CHANGED
data/TODO
CHANGED
@@ -10,7 +10,7 @@ v0.8
|
|
10
10
|
from the Original encoding to Unicode.
|
11
11
|
- detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
|
12
12
|
- Provide a way to get raw access to a particular object. Good for testing purposes
|
13
|
-
- Improve interpretation of non content stream data (ie metadata).
|
13
|
+
- Improve interpretation of non content stream data (ie metadata). recognise dates, etc
|
14
14
|
- Support Cross Reference Streams (spec 3.4.7)
|
15
15
|
|
16
16
|
v0.9
|
data/lib/pdf/reader/content.rb
CHANGED
@@ -23,7 +23,6 @@
|
|
23
23
|
#
|
24
24
|
################################################################################
|
25
25
|
require 'stringio'
|
26
|
-
#require 'enumerable'
|
27
26
|
|
28
27
|
class PDF::Reader
|
29
28
|
################################################################################
|
@@ -254,7 +253,7 @@ class PDF::Reader
|
|
254
253
|
################################################################################
|
255
254
|
# Begin processing the document metadata
|
256
255
|
# Decode every string found in the document info structure via
# decode_strings, then hand the result to the :metadata callback.
# Nothing is dispatched when the decoded value is nil or false.
def metadata(info)
  decoded = decode_strings(info)
  return unless decoded

  callback(:metadata, [decoded])
end
|
260
259
|
################################################################################
|
@@ -430,16 +429,17 @@ class PDF::Reader
|
|
430
429
|
end
|
431
430
|
################################################################################
|
432
431
|
private
|
433
|
-
|
432
|
+
# strings outside of page content should be in either PDFDocEncoding or UTF-16.
|
433
|
+
def decode_strings(obj)
|
434
434
|
case obj
|
435
435
|
when String then
|
436
436
|
if obj[0,2] == "\376\377"
|
437
|
-
|
437
|
+
PDF::Reader::Encoding::UTF16Encoding.new.to_utf8(obj)
|
438
438
|
else
|
439
|
-
obj
|
439
|
+
PDF::Reader::Encoding::PDFDocEncoding.new.to_utf8(obj)
|
440
440
|
end
|
441
|
-
when Hash then obj.each { |key,val| obj[key] =
|
442
|
-
when Array then obj.collect { |item|
|
441
|
+
when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
|
442
|
+
when Array then obj.collect { |item| decode_strings(item) }
|
443
443
|
else
|
444
444
|
obj
|
445
445
|
end
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -34,13 +34,13 @@ class PDF::Reader
|
|
34
34
|
|
35
35
|
# set the differences table for this encoding. should be an array in the following format:
|
36
36
|
#
|
37
|
-
# [25,
|
37
|
+
# [25, :A, 26, :B]
|
38
38
|
#
|
39
39
|
# The array alternates between a decimal byte number and a glyph name to map to that byte
|
40
40
|
#
|
41
41
|
# To save space the following array is also valid and equivalent to the previous one
|
42
42
|
#
|
43
|
-
# [25,
|
43
|
+
# [25, :A, :B]
|
44
44
|
def differences=(diff)
|
45
45
|
raise ArgumentError, "diff must be an array" unless diff.kind_of?(Array)
|
46
46
|
|
@@ -498,6 +498,84 @@ class PDF::Reader
|
|
498
498
|
end
|
499
499
|
end
|
500
500
|
|
501
|
+
class PDFDocEncoding < Encoding
  # PDFDocEncoding bytes whose Unicode codepoint differs from the byte value.
  # Any byte that is not listed here (and is not in UNDEFINED_BYTES) is
  # passed through unchanged.
  BYTE_TO_CODEPOINT = {
    0x18 => 0x02D8, 0x19 => 0x02C7, 0x1A => 0x02C6, 0x1B => 0x02D9,
    0x1C => 0x02DD, 0x1D => 0x02DB, 0x1E => 0x02DA, 0x1F => 0x02DC,
    0x80 => 0x2022, 0x81 => 0x2020, 0x82 => 0x2021, 0x83 => 0x2026,
    0x84 => 0x2014, 0x85 => 0x2013, 0x86 => 0x0192, 0x87 => 0x2044,
    0x88 => 0x2039, 0x89 => 0x203A, 0x8A => 0x2212, 0x8B => 0x2030,
    0x8C => 0x201E, 0x8D => 0x201C, 0x8E => 0x201D, 0x8F => 0x2018,
    0x90 => 0x2019, 0x91 => 0x201A, 0x92 => 0x2122, 0x93 => 0xFB01,
    0x94 => 0xFB02, 0x95 => 0x0141, 0x96 => 0x0152, 0x97 => 0x0160,
    0x98 => 0x0178, 0x99 => 0x017D, 0x9A => 0x0131, 0x9B => 0x0142,
    0x9C => 0x0153, 0x9D => 0x0161, 0x9E => 0x017E, 0xA0 => 0x20AC
  }.freeze

  # Bytes treated as undefined; they are rendered as UNKNOWN_CHAR.
  UNDEFINED_BYTES = [0x7F, 0x9F].freeze

  # Convert a PDFDocEncoding string into UTF-8.
  #
  # str       - the raw string to convert
  # tounicode - optional object responding to decode(num). When supplied it
  #             is consulted for every entry instead of the built-in table,
  #             and entries it cannot decode become UNKNOWN_CHAR.
  #
  # Returns a new string, tagged as UTF-8 when the running ruby supports
  # String#force_encoding.
  def to_utf8(str, tounicode = nil)
    # split into byte values, then let the differences table have its say
    entries = self.process_differences(str.unpack('C*'))

    codepoints = entries.collect do |entry|
      if tounicode
        tounicode.decode(entry) || PDF::Reader::Encoding::UNKNOWN_CHAR
      elsif UNDEFINED_BYTES.include?(entry)
        PDF::Reader::Encoding::UNKNOWN_CHAR
      else
        # swap in the equivalent Unicode codepoint where one is required
        BYTE_TO_CODEPOINT.fetch(entry, entry)
      end
    end

    # convert any glyph names to unicode codepoints
    codepoints = self.process_glyphnames(codepoints)

    # replace characters that didn't convert to unicode nicely with something valid
    codepoints.collect! { |point| point || PDF::Reader::Encoding::UNKNOWN_CHAR }

    # pack all our Unicode codepoints into a UTF-8 string
    utf8 = codepoints.pack("U*")

    # set the string's encoding correctly under ruby 1.9+
    utf8.force_encoding("UTF-8") if utf8.respond_to?(:force_encoding)

    utf8
  end
end
|
578
|
+
|
501
579
|
class StandardEncoding < Encoding
|
502
580
|
# convert an Adobe Standard Encoding string into UTF-8
|
503
581
|
def to_utf8(str, tounicode = nil)
|
@@ -771,6 +849,23 @@ class PDF::Reader
|
|
771
849
|
end
|
772
850
|
end
|
773
851
|
|
852
|
+
class UTF16Encoding < Encoding
  # Convert a big-endian UTF-16 string into UTF-8.
  #
  # str       - the raw string to convert; a leading Byte Order Mark
  #             (0xFE 0xFF) is dropped if present
  # tounicode - accepted for signature parity with the other encodings;
  #             it is not consulted
  #
  # Surrogate pairs are combined into the single codepoint they represent,
  # so characters outside the Basic Multilingual Plane survive the
  # conversion. A surrogate without a matching partner cannot be expressed
  # in valid UTF-8 and is replaced with UNKNOWN_CHAR. A trailing odd byte
  # is ignored, because unpack("n*") only yields complete 16-bit units.
  #
  # Returns a new string, tagged as UTF-8 when the running ruby supports
  # String#force_encoding.
  def to_utf8(str, tounicode = nil)
    # decode to 16-bit big-endian code units first; working on integers keeps
    # the BOM check independent of whatever encoding the string is tagged with
    units = str.unpack("n*")

    # remove the UTF-16 Byte Order Mark if it exists
    units.shift if units.first == 0xFEFF

    codepoints = []
    idx = 0
    while idx < units.size
      unit   = units[idx]
      follow = units[idx + 1]

      if unit >= 0xD800 && unit <= 0xDBFF && follow && follow >= 0xDC00 && follow <= 0xDFFF
        # high surrogate followed by low surrogate: one supplementary codepoint
        codepoints << (0x10000 + ((unit - 0xD800) << 10) + (follow - 0xDC00))
        idx += 2
      elsif unit >= 0xD800 && unit <= 0xDFFF
        # unpaired surrogate: not a valid character on its own
        codepoints << PDF::Reader::Encoding::UNKNOWN_CHAR
        idx += 1
      else
        codepoints << unit
        idx += 1
      end
    end

    # pack all our Unicode codepoints into a UTF-8 string
    ret = codepoints.pack("U*")

    # set the string's encoding correctly under ruby 1.9+
    ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)

    return ret
  end
end
|
868
|
+
|
774
869
|
class WinAnsiEncoding < Encoding
|
775
870
|
# convert a WinAnsiEncoding string into UTF-8
|
776
871
|
def to_utf8(str, tounicode = nil)
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -39,7 +39,7 @@ class PDF::Reader
|
|
39
39
|
File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
|
40
40
|
f.each do |l|
|
41
41
|
m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
42
|
-
@@glyphs[name] = "0x#{code}".hex if name
|
42
|
+
@@glyphs[name.to_sym] = "0x#{code}".hex if name
|
43
43
|
end
|
44
44
|
end
|
45
45
|
end
|