rpdf2txt 0.8.2 → 0.8.3

Sign up to get free protection for your applications and to get access to all the features.
data/.gemtest ADDED
File without changes
data/History.txt CHANGED
@@ -1,3 +1,7 @@
1
+ === 0.8.3 / 05.01.2012
2
+
3
+ * Getting ready for Ruby 1.9.3, partially, see README for more.
4
+
1
5
  === 0.8.2 / 14.12.2010
2
6
 
3
7
  * Added a test for parsing encrypted object stream (PDF Ver 1.5-1.7)
data/Manifest.txt CHANGED
@@ -4,7 +4,6 @@ Manifest.txt
4
4
  README.txt
5
5
  Rakefile
6
6
  bin/rpdf2txt
7
- config.save
8
7
  install.rb
9
8
  lib/rpdf2txt-rockit/base_extensions.rb
10
9
  lib/rpdf2txt-rockit/bootstrap.rb
@@ -36,9 +35,6 @@ lib/rpdf2txt/data/cmap.grammar
36
35
  lib/rpdf2txt/data/cmap.rb
37
36
  lib/rpdf2txt/data/cmap_range.grammar
38
37
  lib/rpdf2txt/data/cmap_range.rb
39
- lib/rpdf2txt/data/_cmap.grammar
40
- lib/rpdf2txt/data/_cmap_range.grammar
41
- lib/rpdf2txt/data/_pdfattributes.grammar
42
38
  lib/rpdf2txt/data/fonts/Courier-Bold.afm
43
39
  lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm
44
40
  lib/rpdf2txt/data/fonts/Courier-Oblique.afm
data/README.txt CHANGED
@@ -12,7 +12,20 @@ Not the problems here.
12
12
 
13
13
  == REQUIREMENTS:
14
14
 
15
- * Ruby 1.8
15
+ * Ruby 1.8 or Ruby 1.9.3
16
+
17
+ NOTE for Ruby 1.9.3
18
+
19
+ * rpdf2txt on Ruby 1.8.6 creates the Parser-Script from the grammar files
20
+ (the ones without a leading underscore). Ruby 1.9.3 fails to create the Parser-Script. The
21
+ workaround is to copy each grammar file to _grammar_file; rpdf2txt (with
22
+ Ruby 1.8.6 and 1.9.3) then stops creating a new Parser-Script and just uses the
23
+ existing one. Ergo: Ruby 1.9.3 just uses the Parser-Script generated with
24
+ Ruby 1.8.6.
25
+
26
+ * If you change the grammar files then you have to create the
27
+ Parser-Script again with Ruby 1.8.6. The grammar file without the leading
28
+ underscore is the one that has to be changed.
16
29
 
17
30
  == INSTALL:
18
31
 
@@ -23,8 +36,8 @@ Not the problems here.
23
36
 
24
37
  == DEVELOPERS:
25
38
 
26
- Masaomi Hatakeyama, mhatakeyama@ywesee.com
27
- Zeno R.R. Davatz, zdavatz@ywesee.com
39
+ * Masaomi Hatakeyama, mhatakeyama@ywesee.com
40
+ * Zeno R.R. Davatz, zdavatz@ywesee.com
28
41
 
29
42
  == LICENSE:
30
43
  * GPLv2
data/bin/rpdf2txt CHANGED
@@ -49,7 +49,10 @@ if <output-file> is omitted, the extracted text is written to stdout
49
49
  EOS
50
50
  exit
51
51
  end
52
- parser = Rpdf2txt::Parser.new(File.read(ARGV[0]), 'utf8')
52
+ stream = open(ARGV[0], 'rb') do |file|
53
+ file.read
54
+ end
55
+ parser = Rpdf2txt::Parser.new(stream, 'utf-8')
53
56
  outstream = STDOUT
54
57
  if(ARGV.size == 2)
55
58
  outstream = File.open(ARGV[1], 'w')
@@ -1,12 +1,13 @@
1
+ # encoding: ascii-8bit
1
2
  require 'rpdf2txt-rockit/rockit'
2
3
  module Rpdf2txt
3
4
  # Parser for CMap
4
- # created by Rockit version 0.3.8 on Tue Dec 14 18:03:10 +0100 2010
5
+ # created by Rockit version 0.3.8 on Tue Jan 18 11:22:39 +0100 2011
5
6
  # Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se
6
7
  # and licensed under GPL
7
8
  # but this parser is under LGPL
8
9
  tokens = [
9
- t1 = EofToken.new("EOF",/^(�~~��~^^~4158671330)/),
10
+ t1 = EofToken.new("EOF",/^(�~~��~^^~195355369)/),
10
11
  t2 = Token.new("SPACE",/^(\s+)/n,:Skip),
11
12
  t3 = Token.new("HEXSNIPPET",/^([0-9A-F]+)/in),
12
13
  t4 = StringToken.new("StrToken61","<"),
@@ -14,9 +15,9 @@ module Rpdf2txt
14
15
  ]
15
16
  productions = [
16
17
  p1 = Production.new("HexArray'".intern,[:HexArray],SyntaxTreeBuilder.new("HexArray'",["hexarray"],[])),
17
- p2 = Production.new(:HexArray,[:"Plus-611119798"],LiftingSyntaxTreeBuilder.new(["values"],[])),
18
- p3 = Production.new(:"Plus-611119798",[:"Plus-611119798", :RangeDef],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
19
- p4 = Production.new(:"Plus-611119798",[:RangeDef],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
18
+ p2 = Production.new(:HexArray,[:Plus69966636668160],LiftingSyntaxTreeBuilder.new(["values"],[])),
19
+ p3 = Production.new(:Plus69966636668160,[:Plus69966636668160, :RangeDef],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
20
+ p4 = Production.new(:Plus69966636668160,[:RangeDef],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
20
21
  p5 = Production.new(:RangeDef,[:HexElement, :HexElement],SyntaxTreeBuilder.new("RangeDef",["source", "target"],[])),
21
22
  p6 = Production.new(:HexElement,[t4, t3, t5],LiftingSyntaxTreeBuilder.new(["_", "hexsnip", "_"],[]))
22
23
  ]
@@ -24,14 +25,14 @@ module Rpdf2txt
24
25
 
25
26
  ]
26
27
  priorities = ProductionPriorities.new(relations)
27
- action_table = [[9, 8], [2, 1], [25, 4], [9, 8], [9, 8, 4, 1], [12, 29], [37, 16], [16, 29], [8, 29], [20, 29]]
28
- goto_hash = {0 => {1 => 1, 2 => 4, 3 => 5, 4 => 3}, 3 => {4 => 7}, 4 => {3 => 8, 4 => 3}}
29
- @@parse_table_611168728 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
28
+ action_table = [[17, 8], [17, 8], [12, 29], [2, 1], [29, 4], [17, 8, 4, 1], [16, 29], [37, 16], [8, 29], [20, 29]]
29
+ goto_hash = {5 => {3 => 8, 4 => 1}, 0 => {1 => 3, 2 => 5, 3 => 2, 4 => 1}, 1 => {4 => 6}}
30
+ @@parse_table69966636607980 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
30
31
  :REDUCE,
31
32
  :SHIFT,
32
33
  :ACCEPT
33
34
  ])
34
35
  def Rpdf2txt._cmap_parser
35
- GeneralizedLrParser.new(@@parse_table_611168728)
36
+ GeneralizedLrParser.new(@@parse_table69966636607980)
36
37
  end
37
38
  end
@@ -1,12 +1,13 @@
1
+ # encoding: ascii-8bit
1
2
  require 'rpdf2txt-rockit/rockit'
2
3
  module Rpdf2txt
3
4
  # Parser for CMap
4
- # created by Rockit version 0.3.8 on Tue Dec 14 18:03:11 +0100 2010
5
+ # created by Rockit version 0.3.8 on Tue Jan 18 11:29:28 +0100 2011
5
6
  # Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se
6
7
  # and licensed under GPL
7
8
  # but this parser is under LGPL
8
9
  tokens = [
9
- t1 = EofToken.new("EOF",/^(�~~��~^^~6498354225)/),
10
+ t1 = EofToken.new("EOF",/^(�~~��~^^~1397495140)/),
10
11
  t2 = Token.new("SPACE",/^(\s+)/n,:Skip),
11
12
  t3 = Token.new("HEXSNIPPET",/^([0-9A-F]+)/in),
12
13
  t4 = StringToken.new("StrToken93","["),
@@ -16,28 +17,28 @@ module Rpdf2txt
16
17
  ]
17
18
  productions = [
18
19
  p1 = Production.new("HexArray'".intern,[:HexArray],SyntaxTreeBuilder.new("HexArray'",["hexarray"],[])),
19
- p2 = Production.new(:HexArray,[:"Plus-611547858"],LiftingSyntaxTreeBuilder.new(["values"],[])),
20
- p3 = Production.new(:"Plus-611547858",[:"Plus-611547858", :RangeDef],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
21
- p4 = Production.new(:"Plus-611547858",[:RangeDef],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
20
+ p2 = Production.new(:HexArray,[:Plus69948349851480],LiftingSyntaxTreeBuilder.new(["values"],[])),
21
+ p3 = Production.new(:Plus69948349851480,[:Plus69948349851480, :RangeDef],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
22
+ p4 = Production.new(:Plus69948349851480,[:RangeDef],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
22
23
  p5 = Production.new(:RangeDef,[:HexElement, :HexElement, :HexElement],SyntaxTreeBuilder.new("RangeDef",["start", "stop", "offset"],[])),
23
24
  p6 = Production.new(:RangeDef,[:HexElement, :HexElement, :Explicit],SyntaxTreeBuilder.new("RangeDef",["start", "stop", "explicit"],[])),
24
- p7 = Production.new(:Explicit,[t4, :"Plus-611556138", t5],LiftingSyntaxTreeBuilder.new(["_", "explicit", "_"],[])),
25
- p8 = Production.new(:"Plus-611556138",[:"Plus-611556138", :HexElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
26
- p9 = Production.new(:"Plus-611556138",[:HexElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
25
+ p7 = Production.new(:Explicit,[t4, :Plus69948349837620, t5],LiftingSyntaxTreeBuilder.new(["_", "explicit", "_"],[])),
26
+ p8 = Production.new(:Plus69948349837620,[:Plus69948349837620, :HexElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
27
+ p9 = Production.new(:Plus69948349837620,[:HexElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
27
28
  p10 = Production.new(:HexElement,[t6, t3, t7],LiftingSyntaxTreeBuilder.new(["_", "hexsnip", "_"],[]))
28
29
  ]
29
30
  relations = [
30
31
 
31
32
  ]
32
33
  priorities = ProductionPriorities.new(relations)
33
- action_table = [[9, 32], [2, 1], [25, 4], [9, 32], [9, 32, 4, 1], [12, 125], [37, 64], [9, 32, 49, 8], [8, 125], [36, 125], [20, 125], [16, 125], [9, 32], [32, 116], [61, 16, 9, 32], [24, 125], [28, 116]]
34
- goto_hash = {0 => {6 => 3, 1 => 1, 2 => 4, 3 => 5}, 12 => {5 => 14, 6 => 13}, 7 => {6 => 11, 4 => 10}, 14 => {6 => 16}, 3 => {6 => 7}, 4 => {6 => 3, 3 => 8}}
35
- @@parse_table_611607208 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
34
+ action_table = [[5, 32], [25, 4], [5, 32], [12, 125], [2, 1], [5, 32, 4, 1], [37, 64], [5, 32, 49, 8], [8, 125], [36, 125], [16, 125], [20, 125], [5, 32], [32, 116], [5, 32, 61, 16], [24, 125], [28, 116]]
35
+ goto_hash = {5 => {6 => 2, 3 => 8}, 0 => {6 => 2, 1 => 4, 2 => 5, 3 => 3}, 12 => {5 => 14, 6 => 13}, 7 => {6 => 10, 4 => 11}, 2 => {6 => 7}, 14 => {6 => 16}}
36
+ @@parse_table69948349751360 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
36
37
  :REDUCE,
37
38
  :SHIFT,
38
39
  :ACCEPT
39
40
  ])
40
41
  def Rpdf2txt._cmap_range_parser
41
- GeneralizedLrParser.new(@@parse_table_611607208)
42
+ GeneralizedLrParser.new(@@parse_table69948349751360)
42
43
  end
43
44
  end
@@ -1,12 +1,13 @@
1
+ # encoding: ascii-8bit
1
2
  require 'rpdf2txt-rockit/rockit'
2
3
  module Rpdf2txt
3
4
  # Parser for PdfAttributes
4
- # created by Rockit version 0.3.8 on Tue Dec 14 18:03:10 +0100 2010
5
+ # created by Rockit version 0.3.8 on Tue Jan 18 07:42:18 +0100 2011
5
6
  # Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se
6
7
  # and licensed under GPL
7
8
  # but this parser is under LGPL
8
9
  tokens = [
9
- t1 = EofToken.new("EOF",/^(�~~��~^^~1499757680)/),
10
+ t1 = EofToken.new("EOF",/^(�~~��~^^~2411366330)/),
10
11
  t2 = Token.new("IDENTIFIER",/^\/[\w:+\#\-\.]*(, ?[\w:+\#\-\.]*)*/n),
11
12
  t3 = Token.new("NUMERIC",/^(-?[0-9]+([.,][0-9]+)?)/n),
12
13
  t4 = Token.new("REFERENCE",/^([0-9]+\s+[0-9]+\s+R)/n),
@@ -21,7 +22,7 @@ module Rpdf2txt
21
22
  t13 = StringToken.new("StrToken345387270","(D:"),
22
23
  t14 = RegexpToken.new("RegexpToken1015925646",/[\d+']+/n),
23
24
  t15 = StringToken.new("StrToken42",")"),
24
- t16 = RegexpToken.new("RegexpToken-1047402015",/\(([^\)\\]|\\[\(\)\\]?)*?\)/n)
25
+ t16 = RegexpToken.new("RegexpToken1100081633",/\(([^\)\\]|\\[\(\)\\]?)*?\)/n)
25
26
  ]
26
27
  productions = [
27
28
  p1 = Production.new("Expr'".intern,[:Expr],SyntaxTreeBuilder.new("Expr'",["expr"],[])),
@@ -36,9 +37,9 @@ module Rpdf2txt
36
37
  p10 = Production.new(:Expr,[t8],LiftingSyntaxTreeBuilder.new(["val"],[])),
37
38
  p11 = Production.new(:Array,[t9, :ArrayElements, t10],SyntaxTreeBuilder.new("Array",["_", "values", "_"],[])),
38
39
  p12 = Production.new(:Array,[t9, t10],SyntaxTreeBuilder.new("Array",["_", "_"],[])),
39
- p13 = Production.new(:ArrayElements,[:"Plus-611837548"],LiftingSyntaxTreeBuilder.new(["values"],[])),
40
- p14 = Production.new(:"Plus-611837548",[:"Plus-611837548", :ArrayElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
41
- p15 = Production.new(:"Plus-611837548",[:ArrayElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
40
+ p13 = Production.new(:ArrayElements,[:Plus70010113073940],LiftingSyntaxTreeBuilder.new(["values"],[])),
41
+ p14 = Production.new(:Plus70010113073940,[:Plus70010113073940, :ArrayElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
42
+ p15 = Production.new(:Plus70010113073940,[:ArrayElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
42
43
  p16 = Production.new(:ArrayElement,[:Array],LiftingSyntaxTreeBuilder.new(["_"],[])),
43
44
  p17 = Production.new(:ArrayElement,[:Hash],LiftingSyntaxTreeBuilder.new(["_"],[])),
44
45
  p18 = Production.new(:ArrayElement,[t3],LiftingSyntaxTreeBuilder.new(["_"],[])),
@@ -47,10 +48,10 @@ module Rpdf2txt
47
48
  p21 = Production.new(:ArrayElement,[t5],LiftingSyntaxTreeBuilder.new(["_"],[])),
48
49
  p22 = Production.new(:ArrayElement,[t8],LiftingSyntaxTreeBuilder.new(["_"],[])),
49
50
  p23 = Production.new(:ArrayElement,[:Text],LiftingSyntaxTreeBuilder.new(["_"],[])),
50
- p24 = Production.new(:Hash,[t11, :"Mult-611844478", t12],SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[])),
51
+ p24 = Production.new(:Hash,[t11, :Mult70010113055620, t12],SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[])),
51
52
  p25 = Production.new(:Hash,[t11, t12],ArrayNodeBuilder.new([],nil,SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[]),1,[],true)),
52
- p26 = Production.new(:"Mult-611844478",[:"Mult-611844478", t2, :Expr],ArrayNodeBuilder.new([1, 2],0,nil,nil,[],true)),
53
- p27 = Production.new(:"Mult-611844478",[t2, :Expr],ArrayNodeBuilder.new([0, 1],nil,nil,nil,[],true)),
53
+ p26 = Production.new(:Mult70010113055620,[:Mult70010113055620, t2, :Expr],ArrayNodeBuilder.new([1, 2],0,nil,nil,[],true)),
54
+ p27 = Production.new(:Mult70010113055620,[t2, :Expr],ArrayNodeBuilder.new([0, 1],nil,nil,nil,[],true)),
54
55
  p28 = Production.new(:Date,[t13, t14, t15],SyntaxTreeBuilder.new("Date",["c1", "regexptoken1015925646", "c3"],[])),
55
56
  p29 = Production.new(:Text,[t16],SyntaxTreeBuilder.new("Text",["text"],[]))
56
57
  ]
@@ -58,14 +59,14 @@ module Rpdf2txt
58
59
 
59
60
  ]
60
61
  priorities = ProductionPriorities.new(relations)
61
- action_table = [[5, 4, 9, 1024, 13, 16, 37, 2, 41, 4096, 45, 256, 49, 8, 53, 128, 57, 32768], [20, 2051], [65, 2048, 69, 2], [32, 2051], [12, 2051], [2, 1], [24, 2051], [28, 2051], [8, 2051], [4, 2051], [73, 8192], [77, 512, 81, 4, 9, 1024, 85, 16, 101, 2, 45, 256, 109, 8, 117, 128, 57, 32768], [16, 2051], [36, 2051], [112, 65439], [125, 2048, 129, 2], [96, 65439], [5, 4, 9, 1024, 13, 16, 37, 2, 41, 4096, 45, 256, 49, 8, 53, 128, 57, 32768], [137, 16384], [44, 65439], [68, 65438], [80, 65438], [81, 4, 9, 1024, 85, 16, 101, 2, 45, 256, 109, 8, 117, 128, 57, 32768, 48, 512], [60, 65438], [88, 65438], [72, 65438], [64, 65438], [76, 65438], [56, 65438], [84, 65438], [145, 512], [92, 65439], [5, 4, 9, 1024, 13, 16, 37, 2, 41, 4096, 45, 256, 49, 8, 53, 128, 57, 32768], [104, 2050], [108, 2051], [52, 65438], [40, 65439], [100, 2050]]
62
- goto_hash = {22 => {5 => 35, 6 => 26, 2 => 23, 9 => 24}, 11 => {5 => 28, 6 => 26, 2 => 23, 3 => 30, 9 => 24, 4 => 22}, 0 => {6 => 8, 1 => 5, 2 => 4, 8 => 7, 9 => 6}, 17 => {6 => 8, 1 => 33, 2 => 4, 8 => 7, 9 => 6}, 2 => {7 => 15}, 32 => {6 => 8, 1 => 37, 2 => 4, 8 => 7, 9 => 6}}
63
- @@parse_table_611955958 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
62
+ action_table = [[5, 16, 9, 128, 13, 1024, 21, 4, 37, 32768, 41, 4096, 49, 8, 53, 256, 57, 2], [32, 2051], [36, 2051], [65, 2048, 69, 2], [24, 2051], [20, 2051], [12, 2051], [8, 2051], [2, 1], [112, 65439], [73, 8192], [28, 2051], [16, 2051], [77, 16, 85, 128, 13, 1024, 93, 512, 97, 4, 37, 32768, 113, 8, 53, 256, 121, 2], [4, 2051], [125, 2048, 129, 2], [96, 65439], [5, 16, 9, 128, 13, 1024, 21, 4, 37, 32768, 41, 4096, 49, 8, 53, 256, 57, 2], [137, 16384], [80, 65438], [77, 16, 85, 128, 13, 1024, 97, 4, 37, 32768, 113, 8, 53, 256, 121, 2, 48, 512], [84, 65438], [88, 65438], [44, 65439], [68, 65438], [64, 65438], [60, 65438], [56, 65438], [76, 65438], [145, 512], [72, 65438], [92, 65439], [5, 16, 9, 128, 13, 1024, 21, 4, 37, 32768, 41, 4096, 49, 8, 53, 256, 57, 2], [104, 2050], [108, 2051], [52, 65438], [40, 65439], [100, 2050]]
63
+ goto_hash = {0 => {6 => 7, 1 => 8, 2 => 6, 8 => 11, 9 => 4}, 17 => {6 => 7, 1 => 33, 2 => 6, 8 => 11, 9 => 4}, 13 => {5 => 27, 6 => 25, 2 => 26, 3 => 29, 9 => 22, 4 => 20}, 3 => {7 => 15}, 20 => {5 => 35, 6 => 25, 2 => 26, 9 => 22}, 32 => {6 => 7, 1 => 37, 2 => 6, 8 => 11, 9 => 4}}
64
+ @@parse_table70010113197280 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
64
65
  :REDUCE,
65
66
  :SHIFT,
66
67
  :ACCEPT
67
68
  ])
68
69
  def Rpdf2txt._attr_parser
69
- GeneralizedLrParser.new(@@parse_table_611955958)
70
+ GeneralizedLrParser.new(@@parse_table70010113197280)
70
71
  end
71
72
  end
@@ -1,12 +1,13 @@
1
+ # encoding: ascii-8bit
1
2
  require 'rpdf2txt-rockit/rockit'
2
3
  module Rpdf2txt
3
4
  # Parser for PdfText
4
- # created by Rockit version 0.3.8 on Thu Oct 01 11:19:33 +0200 2009
5
+ # created by Rockit version 0.3.8 on Tue Jan 18 07:42:19 +0100 2011
5
6
  # Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se
6
7
  # and licensed under GPL
7
8
  # but this parser is under LGPL
8
9
  tokens = [
9
- t1 = EofToken.new("EOF",/^(�~~��~^^~5511964093)/),
10
+ t1 = EofToken.new("EOF",/^(�~~��~^^~3062921542)/),
10
11
  t2 = Token.new("NUMERIC",/^(-?(([0-9]*[.,_][0-9]+)|([0-9]+)))/n),
11
12
  t3 = Token.new("SPACE",/^(\s+)/n,:Skip),
12
13
  t4 = Token.new("HEXSNIPPET",/^([0-9A-F]+)/in),
@@ -56,9 +57,9 @@ module Rpdf2txt
56
57
  productions = [
57
58
  p1 = Production.new("Target'".intern,[:Target],SyntaxTreeBuilder.new("Target'",["target"],[])),
58
59
  p2 = Production.new(:Target,[t10, :Exprs, t11],SyntaxTreeBuilder.new("Target",["_", "values", "_"],[])),
59
- p3 = Production.new(:Exprs,[:Plus70032186462260],LiftingSyntaxTreeBuilder.new(["values"],[])),
60
- p4 = Production.new(:Plus70032186462260,[:Plus70032186462260, :Expr],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
61
- p5 = Production.new(:Plus70032186462260,[:Expr],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
60
+ p3 = Production.new(:Exprs,[:Plus70010106411320],LiftingSyntaxTreeBuilder.new(["values"],[])),
61
+ p4 = Production.new(:Plus70010106411320,[:Plus70010106411320, :Expr],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
62
+ p5 = Production.new(:Plus70010106411320,[:Expr],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
62
63
  p6 = Production.new(:Expr,[:TmElement],LiftingSyntaxTreeBuilder.new(["val"],[])),
63
64
  p7 = Production.new(:Expr,[:Array],LiftingSyntaxTreeBuilder.new(["val"],[])),
64
65
  p8 = Production.new(:Expr,[:TDElement],LiftingSyntaxTreeBuilder.new(["val"],[])),
@@ -90,9 +91,9 @@ module Rpdf2txt
90
91
  p34 = Production.new(:HexElement,[t23, t4, t24],LiftingSyntaxTreeBuilder.new(["_", "hex", "_"],[])),
91
92
  p35 = Production.new(:HexElement,[t23, t24],LiftingSyntaxTreeBuilder.new(["_", "_"],[])),
92
93
  p36 = Production.new(:TjHex,[:HexElement, t22],SyntaxTreeBuilder.new("Tjhex",["hexsnippet", "_"],[])),
93
- p37 = Production.new(:TJArrayElements,[:Plus70032186325260],LiftingSyntaxTreeBuilder.new(["values"],[])),
94
- p38 = Production.new(:Plus70032186325260,[:Plus70032186325260, :TJSingleElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
95
- p39 = Production.new(:Plus70032186325260,[:TJSingleElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
94
+ p37 = Production.new(:TJArrayElements,[:Plus70010106266620],LiftingSyntaxTreeBuilder.new(["values"],[])),
95
+ p38 = Production.new(:Plus70010106266620,[:Plus70010106266620, :TJSingleElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
96
+ p39 = Production.new(:Plus70010106266620,[:TJSingleElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
96
97
  p40 = Production.new(:TJSingleElement,[t7],SyntaxTreeBuilder.new("TJSingleElement",["snippet"],[])),
97
98
  p41 = Production.new(:TJSingleElement,[t2],SyntaxTreeBuilder.new("TJSingleElement",["kerning"],[])),
98
99
  p42 = Production.new(:TJSingleElement,[:HexElement],SyntaxTreeBuilder.new("TJSingleElement",["hexsnippet"],[])),
@@ -107,10 +108,10 @@ module Rpdf2txt
107
108
  p51 = Production.new(:LineWidth,[t2, t33],SyntaxTreeBuilder.new("Width",["width", "_"],[])),
108
109
  p52 = Production.new(:BTElement,[t34],SyntaxTreeBuilder.new("BT",["_"],[])),
109
110
  p53 = Production.new(:ETElement,[t35],SyntaxTreeBuilder.new("ET",["_"],[])),
110
- p54 = Production.new(:Hash,[t36, :Mult70032186252840, t37],SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[])),
111
+ p54 = Production.new(:Hash,[t36, :Mult70010106193040, t37],SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[])),
111
112
  p55 = Production.new(:Hash,[t36, t37],ArrayNodeBuilder.new([],nil,SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[]),1,[],true)),
112
- p56 = Production.new(:Mult70032186252840,[:Mult70032186252840, t8, :HashExpr],ArrayNodeBuilder.new([1, 2],0,nil,nil,[],true)),
113
- p57 = Production.new(:Mult70032186252840,[t8, :HashExpr],ArrayNodeBuilder.new([0, 1],nil,nil,nil,[],true)),
113
+ p56 = Production.new(:Mult70010106193040,[:Mult70010106193040, t8, :HashExpr],ArrayNodeBuilder.new([1, 2],0,nil,nil,[],true)),
114
+ p57 = Production.new(:Mult70010106193040,[t8, :HashExpr],ArrayNodeBuilder.new([0, 1],nil,nil,nil,[],true)),
114
115
  p58 = Production.new(:HashExpr,[t8],LiftingSyntaxTreeBuilder.new(["val"],[])),
115
116
  p59 = Production.new(:HashExpr,[t2],LiftingSyntaxTreeBuilder.new(["val"],[])),
116
117
  p60 = Production.new(:HashExpr,[t9],LiftingSyntaxTreeBuilder.new(["val"],[])),
@@ -122,9 +123,9 @@ module Rpdf2txt
122
123
  p66 = Production.new(:UElement,[t2, t43],SyntaxTreeBuilder.new("UElement",["c1", "regexptoken-265279295"],[])),
123
124
  p67 = Production.new(:UElement,[t2, t2, t2, t44],SyntaxTreeBuilder.new("UElement",["c1", "numeric2", "numeric3", "c4"],[])),
124
125
  p68 = Production.new(:UElement,[t45, t2, t46],SyntaxTreeBuilder.new("UElement",["c1", "numeric", "c3"],[])),
125
- p69 = Production.new(:CNElements,[:Plus70032186212980],SyntaxTreeBuilder.new("CNElements",["plus"],[])),
126
- p70 = Production.new(:Plus70032186212980,[:Plus70032186212980, :CNElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
127
- p71 = Production.new(:Plus70032186212980,[:CNElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
126
+ p69 = Production.new(:CNElements,[:Plus70010106149180],SyntaxTreeBuilder.new("CNElements",["plus"],[])),
127
+ p70 = Production.new(:Plus70010106149180,[:Plus70010106149180, :CNElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
128
+ p71 = Production.new(:Plus70010106149180,[:CNElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
128
129
  p72 = Production.new(:CNElement,[t5],SyntaxTreeBuilder.new("CNElement",["c1"],[])),
129
130
  p73 = Production.new(:CNElement,[t19, t5, t5],SyntaxTreeBuilder.new("CNElement",["c1", "alphanumeric", "maybe"],[])),
130
131
  p74 = Production.new(:CNElement,[t19, t5],SyntaxTreeBuilder.new("CNElement",["c1", "alphanumeric", "maybe"],[2]))
@@ -133,14 +134,14 @@ module Rpdf2txt
133
134
 
134
135
  ]
135
136
  priorities = ProductionPriorities.new(relations)
136
- action_table = [[5, 512], [13, 16, 57, 2, 69, 4194304, 73, 262144, 85, 64, 89, 2048, 125, 1099511627776, 129, 67108864, 141, 17592186044416], [2, 1], [284, 2199023517712], [88, 70342974373338], [80, 70342974373338], [72, 70342974373338], [60, 70342974373338], [153, 2097152], [84, 70342974373338], [52, 70342974373338], [28, 70342974373338], [157, 2199023255552], [36, 70342974373338], [161, 4294967296, 165, 33554432, 169, 65536, 173, 536870912, 177, 4398046511104, 181, 2, 185, 134217728, 189, 1048576, 193, 131072], [24, 70342974373338], [16, 70342974373338], [197, 8, 201, 8388608], [205, 16], [32, 70342974373338], [13, 16, 209, 262144, 272, 2199023255552], [217, 2097152, 221, 1073741824], [233, 2, 69, 4194304, 237, 16777216, 241, 64], [92, 70342974373338], [76, 70342974373338], [64, 70342974373338], [40, 70342974373338], [20, 70342974373338], [68, 70342974373338], [13, 16, 57, 2, 69, 4194304, 73, 262144, 85, 64, 89, 2048, 125, 1099511627776, 129, 67108864, 141, 17592186044416, 8, 1024], [257, 1024], [252, 70342974373338], [176, 70342974373338], [56, 70342974373338], [280, 2199023517712], [261, 2], [48, 70342974373338], [44, 70342974373338], [140, 70342974373338], [256, 70342974373338], [200, 70342974373338], [172, 70342974373338], [112, 70342974373338], [188, 70342974373338], [260, 70342974373338], [265, 2, 269, 64, 273, 16384, 277, 32768], [180, 70342974373338], [124, 70342974373338], [116, 70342974373338], [281, 8388608], [136, 31461450], [285, 16, 289, 34359738368, 297, 2, 301, 274877906944, 292, 2199023517712], [305, 16], [276, 2199023517712], [128, 70342974373338], [192, 70342974373338], [164, 29364298], [233, 2, 69, 4194304, 237, 16777216, 241, 64, 144, 4096], [160, 29364298], [168, 29364298], [156, 29364298], [152, 29364298], [313, 4096], [12, 70342974373338], [4, 1], [317, 35184372088832], [321, 8796093022208, 325, 268435456, 329, 2], [333, 2147483648], [104, 70342974373338], [108, 70342974373338], [132, 31461450], [288, 2199023517712], [341, 68719476736, 
345, 128], [349, 549755813888], [353, 524288], [244, 70342974373338], [285, 16, 292, 2199023517712], [148, 29364298], [96, 70342974373338], [268, 70342974373338], [264, 70342974373338], [184, 70342974373338], [357, 137438953472, 361, 2], [196, 70342974373338], [365, 68719476736, 369, 128], [216, 549755813888], [373, 2, 381, 128, 385, 256], [248, 70342974373338], [120, 70342974373338], [240, 70342974373338], [389, 2], [212, 549755813888], [373, 2, 381, 128, 385, 256], [232, 68719476864], [224, 68719476864], [228, 68719476864], [236, 68719476864], [397, 8192], [220, 68719476864], [100, 70342974373338]]
137
- goto_hash = {22 => {16 => 62, 17 => 57, 18 => 61, 14 => 56}, 0 => {1 => 2}, 72 => {30 => 84}, 1 => {5 => 15, 11 => 37, 22 => 24, 33 => 12, 6 => 27, 12 => 28, 34 => 20, 23 => 10, 7 => 11, 13 => 36, 35 => 34, 2 => 30, 24 => 5, 19 => 25, 8 => 19, 3 => 29, 25 => 9, 14 => 8, 9 => 13, 20 => 7, 15 => 33, 4 => 16, 26 => 4, 10 => 26, 32 => 23, 21 => 6}, 51 => {29 => 73}, 29 => {5 => 15, 11 => 37, 22 => 24, 33 => 12, 6 => 27, 12 => 28, 34 => 20, 23 => 10, 7 => 11, 13 => 36, 35 => 34, 24 => 5, 19 => 25, 8 => 19, 25 => 9, 14 => 8, 9 => 13, 20 => 7, 15 => 33, 4 => 63, 26 => 4, 10 => 26, 32 => 23, 21 => 6}, 57 => {18 => 77, 14 => 56}, 86 => {31 => 94}, 20 => {35 => 53}, 92 => {31 => 98}}
138
- @@parse_table70032185719080 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
137
+ action_table = [[5, 512], [13, 4194304, 33, 17592186044416, 41, 262144, 65, 2, 85, 16, 113, 1099511627776, 125, 64, 137, 67108864, 141, 2048], [2, 1], [153, 8388608, 157, 8], [88, 70342974373338], [80, 70342974373338], [72, 70342974373338], [60, 70342974373338], [161, 2], [165, 2097152], [169, 16], [84, 70342974373338], [52, 70342974373338], [28, 70342974373338], [173, 2199023255552], [36, 70342974373338], [177, 131072, 181, 2, 185, 4294967296, 189, 33554432, 193, 65536, 197, 4398046511104, 201, 536870912, 205, 134217728, 209, 1048576], [24, 70342974373338], [16, 70342974373338], [32, 70342974373338], [13, 4194304, 33, 17592186044416, 41, 262144, 65, 2, 85, 16, 113, 1099511627776, 125, 64, 137, 67108864, 141, 2048, 8, 1024], [284, 2199023517712], [217, 262144, 85, 16, 272, 2199023255552], [92, 70342974373338], [76, 70342974373338], [64, 70342974373338], [40, 70342974373338], [20, 70342974373338], [252, 70342974373338], [68, 70342974373338], [225, 1024], [229, 2097152, 233, 1073741824], [56, 70342974373338], [280, 2199023517712], [176, 70342974373338], [13, 4194304, 241, 16777216, 249, 2, 261, 64], [48, 70342974373338], [44, 70342974373338], [136, 31461450], [265, 8388608], [269, 35184372088832], [140, 70342974373338], [273, 34359738368, 281, 2, 285, 16, 289, 274877906944, 292, 2199023517712], [256, 70342974373338], [116, 70342974373338], [293, 16384, 297, 2, 301, 32768, 305, 64], [200, 70342974373338], [172, 70342974373338], [112, 70342974373338], [260, 70342974373338], [188, 70342974373338], [180, 70342974373338], [124, 70342974373338], [12, 70342974373338], [309, 16], [276, 2199023517712], [4, 1], [128, 70342974373338], [192, 70342974373338], [13, 4194304, 241, 16777216, 249, 2, 261, 64, 144, 4096], [168, 29364298], [164, 29364298], [160, 29364298], [152, 29364298], [317, 4096], [156, 29364298], [132, 31461450], [268, 70342974373338], [325, 68719476736, 329, 128], [333, 549755813888], [337, 524288], [288, 2199023517712], [244, 70342974373338], [104, 
70342974373338], [341, 268435456, 345, 8796093022208, 349, 2], [108, 70342974373338], [353, 2147483648], [285, 16, 292, 2199023517712], [148, 29364298], [96, 70342974373338], [357, 68719476736, 361, 128], [216, 549755813888], [365, 256, 369, 2, 377, 128], [248, 70342974373338], [120, 70342974373338], [184, 70342974373338], [264, 70342974373338], [381, 137438953472, 385, 2], [196, 70342974373338], [212, 549755813888], [365, 256, 369, 2, 377, 128], [236, 68719476864], [232, 68719476864], [224, 68719476864], [228, 68719476864], [240, 70342974373338], [393, 2], [220, 68719476864], [397, 8192], [100, 70342974373338]]
138
+ goto_hash = {82 => {31 => 93}, 22 => {35 => 55}, 0 => {1 => 2}, 1 => {5 => 17, 11 => 37, 22 => 24, 33 => 14, 6 => 27, 12 => 29, 34 => 22, 23 => 12, 7 => 13, 13 => 36, 35 => 33, 2 => 30, 24 => 5, 19 => 25, 8 => 19, 3 => 20, 25 => 11, 14 => 9, 9 => 15, 20 => 7, 15 => 32, 4 => 18, 26 => 4, 10 => 26, 32 => 23, 21 => 6}, 90 => {31 => 97}, 68 => {30 => 80}, 35 => {16 => 64, 17 => 59, 18 => 63, 14 => 61}, 42 => {29 => 69}, 20 => {5 => 17, 11 => 37, 22 => 24, 33 => 14, 6 => 27, 12 => 29, 34 => 22, 23 => 12, 7 => 13, 13 => 36, 35 => 33, 24 => 5, 19 => 25, 8 => 19, 25 => 11, 14 => 9, 9 => 15, 20 => 7, 15 => 32, 4 => 53, 26 => 4, 10 => 26, 32 => 23, 21 => 6}, 59 => {18 => 78, 14 => 61}}
139
+ @@parse_table70010113257400 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
139
140
  :REDUCE,
140
141
  :SHIFT,
141
142
  :ACCEPT
142
143
  ])
143
144
  def Rpdf2txt._text_parser
144
- GeneralizedLrParser.new(@@parse_table70032185719080)
145
+ GeneralizedLrParser.new(@@parse_table70010113257400)
145
146
  end
146
147
  end
@@ -20,6 +20,7 @@
20
20
  # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
21
  # hwyss@ywesee.com, aschrafl@ywesee.com
22
22
  #
23
+ # PdfObject -- Rpdf2txt -- 17.05.2011 -- mhatakeyama@ywesee.com
23
24
  # PdfObject -- Rpdf2txt -- 21.11.2002 -- aschrafl@ywesee.com
24
25
 
25
26
  require 'zlib'
@@ -27,7 +28,7 @@ require 'rpdf2txt/text'
27
28
  require 'rpdf2txt/attributesparser'
28
29
  require 'rpdf2txt/cmapparser'
29
30
  require 'rpdf2txt/symbol'
30
- require 'md5'
31
+ require 'digest/md5'
31
32
  require 'matrix'
32
33
 
33
34
  module Rpdf2txt
@@ -101,13 +102,15 @@ module Rpdf2txt
101
102
  ast.values.collect { |child| extract_attributes(child) }
102
103
  elsif(ast.children_names.include?('pairs'))
103
104
  result = {}
104
- ast.pairs.each { |pair|
105
- k, v = pair
106
- keystr = k.value.strip.tr('/','')
107
- unless(keystr.empty?)
105
+ if(ast_pairs = ast.pairs)
106
+ ast_pairs.each { |pair|
107
+ k, v = pair
108
+ keystr = k.value.strip.tr('/','')
109
+ unless(keystr.empty?)
108
110
  result.store(keystr.downcase.intern, extract_attributes(v))
109
- end
110
- }
111
+ end
112
+ }
113
+ end
111
114
  result
112
115
  else
113
116
  value = ast
@@ -200,9 +203,13 @@ module Rpdf2txt
200
203
  end
201
204
  encryption_key = digest[0,keylength]
202
205
  test_key = compute_user_key encryption_key
203
- if(test_key != uk)
204
- raise DecryptionError, "test-key did not match user-key ('#{test_key.inspect}' / '#{uk.inspect}')"
205
- end
206
+ ## Comment out the following, since import_gkv (de.oddb.org) stops due to this error
207
+ # See http://dev.ywesee.com/wiki.php/Masa/20110209-debug-importGkv-rpdf2txt
208
+ # Also refer to http://trac.ywesee.com/ticket/74#comment:5
209
+ #
210
+ #if(test_key != uk)
211
+ # raise DecryptionError, "test-key did not match user-key ('#{test_key.inspect}' / '#{uk.inspect}')"
212
+ #end
206
213
  encryption_key
207
214
  end
208
215
  def file_id= (file_id)
@@ -309,7 +316,11 @@ module Rpdf2txt
309
316
  end
310
317
  def width(char)
311
318
  if(char.is_a?(String) && char.length == 1)
312
- char = char[0]
319
+ if RUBY_VERSION > "1.9"
320
+ char = char.bytes.to_a[0]
321
+ else
322
+ char = char[0]
323
+ end
313
324
  end
314
325
  _width(char) || named_width(char)
315
326
  end
@@ -442,11 +453,15 @@ module Rpdf2txt
442
453
  yield self
443
454
  end
444
455
  def extract_oids(array)
445
- array.collect{ |dirty_id|
456
+ if array.class != Array
457
+ array = [array]
458
+ end
459
+ result = array.collect{ |dirty_id|
446
460
  if(match = /\d+/on.match(dirty_id))
447
461
  match[0].to_i
448
462
  end
449
463
  }.compact
464
+ return result
450
465
  end
451
466
  def root?
452
467
  !(@parent || @attributes[:parent])
@@ -519,6 +534,36 @@ module Rpdf2txt
519
534
  parent.media_box
520
535
  end
521
536
  end
537
+ def merge_snippets(text_snippets)
538
+ # this is required for the pdf file that is written by pdfFactory 3.25
539
+ # (Windows Server 2003 R2 Standard Edition German)
540
+ # This builds up a meaningful snippet from the small snippets whose
541
+ # x, y positions are same
542
+ # See in more detail:
543
+ # * http://dev.ywesee.com/wiki.php/Masa/20110516-trace-rpdf2txt
544
+ # * http://dev.ywesee.com/wiki.php/Masa/20110517-update-rpdf2txt
545
+ new_text_snippets = []
546
+ last = nil
547
+ snippet = nil
548
+ text_snippets.each do |snip|
549
+ snippet ||= snip.txt
550
+ if last
551
+ if last == snip
552
+ snippet << snip.txt
553
+ else
554
+ last.txt = snippet
555
+ new_text_snippets << last.dup
556
+ snippet = snip.txt
557
+ last = snip
558
+ end
559
+ end
560
+ last = snip
561
+ end
562
+ # for the last element
563
+ lasttxt = snippet
564
+ new_text_snippets << last.dup
565
+ return new_text_snippets
566
+ end
522
567
  def text(callback_handler)
523
568
  concat_stream = Stream.new('')
524
569
  if(@contents.size == 1 && @contents.first.is_a?(ReferenceArray))
@@ -530,6 +575,7 @@ module Rpdf2txt
530
575
  end
531
576
  @text_state.media_box = self.media_box
532
577
  text_snippets = concat_stream.extract_text_objects(self, @text_state)
578
+ text_snippets = merge_snippets(text_snippets)
533
579
  join_snippets(text_snippets, callback_handler)
534
580
  end
535
581
  private
@@ -756,7 +802,16 @@ module Rpdf2txt
756
802
  result
757
803
  end
758
804
  def raw_stream
759
- @raw_stream ||= @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s
805
+ #@raw_stream ||= @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s
806
+ #@raw_stream ||= @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn)[0][0]
807
+ unless(@raw_stream)
808
+ if(src_scan = @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn) and !src_scan.empty?)
809
+ @raw_stream = src_scan[0][0]
810
+ else
811
+ @raw_stream = src_scan.to_s
812
+ end
813
+ end
814
+ return @raw_stream
760
815
  end
761
816
  def decode_raw_stream
762
817
  @decrypted_stream = raw_stream
@@ -20,15 +20,16 @@
20
20
  # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
21
  # hwyss@ywesee.com, aschrafl@ywesee.com
22
22
  #
23
+ # PdfParser -- Rpdf2txt-- 05.01.2012 -- mhatakeyama@ywesee.com
23
24
  # PdfParser -- Rpdf2txt-- 14.11.2002 -- aschrafl@ywesee.com
24
25
 
25
26
  require 'zlib'
26
27
  require 'rpdf2txt/object'
27
28
  require 'rpdf2txt/default_handler'
28
- require 'md5'
29
+ require 'digest/md5'
29
30
 
30
31
  module Rpdf2txt
31
- VERSION = '0.8.2'
32
+ VERSION = '0.8.3'
32
33
  class Parser
33
34
  attr_accessor :encrypt
34
35
  def initialize(pdf_stream, target_encoding='utf8')
@@ -128,6 +129,9 @@ module Rpdf2txt
128
129
  startobj=0
129
130
  endobj=0
130
131
  catalogue = {}
132
+ if RUBY_VERSION >= '1.9'
133
+ @src.force_encoding('ascii-8bit')
134
+ end
131
135
  @src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |match|
132
136
  obj = build_object(match.to_s)
133
137
  catalogue.store(obj.oid, obj)
data/lib/rpdf2txt/text.rb CHANGED
@@ -64,7 +64,6 @@ module Rpdf2txt
64
64
  elsif((map = @current_font.to_unicode) \
65
65
  && (utf8 = map.to_utf8(ascii)))
66
66
  @current_font.attributes[:encoding] = '/UTF8'
67
- #@text_state.set_font(@current_font)
68
67
  [utf8].pack('U')
69
68
  end
70
69
  end
@@ -165,7 +164,7 @@ module Rpdf2txt
165
164
  snippet_text = snippet[1..-2].gsub(/\\[nrt]/n, " ")
166
165
  snippet_text.gsub!(/\\([()])/n, '\1')
167
166
  snippet_text.gsub!(/./n) { |char|
168
- self.mapped_ascii(char[0]) || char
167
+ self.mapped_ascii(char.unpack('C*')[0]) || char
169
168
  }
170
169
  _snip(snippet_text)
171
170
  end
@@ -20,6 +20,7 @@
20
20
  # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
21
  # hwyss@ywesee.com, aschrafl@ywesee.com
22
22
  #
23
+ # TextState -- Rpdf2txt -- 05.01.2012 -- mhatakeyama@ywesee.com
23
24
  # TextState -- Rpdf2txt -- 29.11.2002 -- aschrafl@ywesee.com
24
25
 
25
26
  module Rpdf2txt
@@ -124,7 +125,8 @@ module Rpdf2txt
124
125
  end
125
126
  def char_width(char)
126
127
  if(char.is_a? String)
127
- char = char[0]
128
+ #char = char[0]
129
+ char = char.unpack('C*')[0]
128
130
  end
129
131
  w = 0.0
130
132
  if(@font && (width = @font.width(char)))
@@ -306,7 +308,10 @@ module Rpdf2txt
306
308
  @boxwidth += char_width(char)
307
309
  end
308
310
  @w = @boxwidth
309
- if white = txt[/\s+$/u]
311
+ if RUBY_VERSION >= '1.9'
312
+ txt.force_encoding('ascii-8bit')
313
+ end
314
+ if white = txt[/\s+$/n]
310
315
  white.each_byte do |char|
311
316
  @w += char_width(char)
312
317
  end
@@ -329,6 +334,9 @@ module Rpdf2txt
329
334
  @tmy -= y_val.to_f
330
335
  end
331
336
  def unescape_txt!(txt)
337
+ if RUBY_VERSION >= '1.9'
338
+ txt.force_encoding('ascii-8bit')
339
+ end
332
340
  txt.gsub!(/\\([0-9]{3})/n) { |match| $1.oct.chr }
333
341
  end
334
342
  protected
@@ -51,7 +51,7 @@ module Parse
51
51
  }
52
52
  File.open(outputFile, "w") do |f|
53
53
  time_and_puts("Writing parser to file #{outputFile}") {
54
- f.write "require 'rpdf2txt-rockit/rockit'\n" +
54
+ f.write "# encoding: ascii-8bit\nrequire 'rpdf2txt-rockit/rockit'\n" +
55
55
  parser.to_src_in_module(parserName, moduleName)
56
56
  }
57
57
  end
@@ -1,3 +1,4 @@
1
+ # encoding: ascii-8bit
1
2
  require 'rpdf2txt-rockit/rockit'
2
3
  module Parse
3
4
  # Parser for RockitGrammar
@@ -1,3 +1,4 @@
1
+ # encoding: ascii-8bit
1
2
  require 'rpdf2txt-rockit/syntax_tree'
2
3
  require 'rpdf2txt-rockit/sourcecode_dumpable'
3
4
  require 'rpdf2txt-rockit/bounded_lru_cache'