rpdf2txt 0.8.2 → 0.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gemtest ADDED
File without changes
data/History.txt CHANGED
@@ -1,3 +1,7 @@
1
+ === 0.8.3 / 05.01.2012
2
+
3
+ * Getting ready for Ruby 1.9.3, partially, see README for more.
4
+
1
5
  === 0.8.2 / 14.12.2010
2
6
 
3
7
  * Added a test for parsing encrypted object stream (PDF Ver 1.5-1.7)
data/Manifest.txt CHANGED
@@ -4,7 +4,6 @@ Manifest.txt
4
4
  README.txt
5
5
  Rakefile
6
6
  bin/rpdf2txt
7
- config.save
8
7
  install.rb
9
8
  lib/rpdf2txt-rockit/base_extensions.rb
10
9
  lib/rpdf2txt-rockit/bootstrap.rb
@@ -36,9 +35,6 @@ lib/rpdf2txt/data/cmap.grammar
36
35
  lib/rpdf2txt/data/cmap.rb
37
36
  lib/rpdf2txt/data/cmap_range.grammar
38
37
  lib/rpdf2txt/data/cmap_range.rb
39
- lib/rpdf2txt/data/_cmap.grammar
40
- lib/rpdf2txt/data/_cmap_range.grammar
41
- lib/rpdf2txt/data/_pdfattributes.grammar
42
38
  lib/rpdf2txt/data/fonts/Courier-Bold.afm
43
39
  lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm
44
40
  lib/rpdf2txt/data/fonts/Courier-Oblique.afm
data/README.txt CHANGED
@@ -12,7 +12,20 @@ Not the problems here.
12
12
 
13
13
  == REQUIREMENTS:
14
14
 
15
- * Ruby 1.8
15
+ * Ruby 1.8 or Ruby 1.9.3
16
+
17
+ NOTE for Ruby 1.9.3
18
+
19
+ * rpdf2txt on Ruby 1.8.6 creates the Parser-Script from the grammar files
20
+ (the ones without a leading underscore). Ruby 1.9.3 fails to create the
21
+ Parser-Script. The workaround is to copy each grammar file to
22
+ _<grammar_file>; rpdf2txt (on both Ruby 1.8.6 and 1.9.3) then stops creating
23
+ a new Parser-Script and just uses the existing one. In other words, Ruby
24
+ 1.9.3 just uses the Parser-Script generated with Ruby 1.8.6.
25
+
26
+ * If you change the grammar files then you have to create the
27
+ Parser-Script again with Ruby 1.8.6. Make the change in the grammar file
28
+ without the leading underscore.
16
29
 
17
30
  == INSTALL:
18
31
 
@@ -23,8 +36,8 @@ Not the problems here.
23
36
 
24
37
  == DEVELOPERS:
25
38
 
26
- Masaomi Hatakeyama, mhatakeyama@ywesee.com
27
- Zeno R.R. Davatz, zdavatz@ywesee.com
39
+ * Masaomi Hatakeyama, mhatakeyama@ywesee.com
40
+ * Zeno R.R. Davatz, zdavatz@ywesee.com
28
41
 
29
42
  == LICENSE:
30
43
  * GPLv2
data/bin/rpdf2txt CHANGED
@@ -49,7 +49,10 @@ if <output-file> is omitted, the extracted text is written to stdout
49
49
  EOS
50
50
  exit
51
51
  end
52
- parser = Rpdf2txt::Parser.new(File.read(ARGV[0]), 'utf8')
52
+ stream = open(ARGV[0], 'rb') do |file|
53
+ file.read
54
+ end
55
+ parser = Rpdf2txt::Parser.new(stream, 'utf-8')
53
56
  outstream = STDOUT
54
57
  if(ARGV.size == 2)
55
58
  outstream = File.open(ARGV[1], 'w')
@@ -1,12 +1,13 @@
1
+ # encoding: ascii-8bit
1
2
  require 'rpdf2txt-rockit/rockit'
2
3
  module Rpdf2txt
3
4
  # Parser for CMap
4
- # created by Rockit version 0.3.8 on Tue Dec 14 18:03:10 +0100 2010
5
+ # created by Rockit version 0.3.8 on Tue Jan 18 11:22:39 +0100 2011
5
6
  # Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se
6
7
  # and licensed under GPL
7
8
  # but this parser is under LGPL
8
9
  tokens = [
9
- t1 = EofToken.new("EOF",/^(�~~��~^^~4158671330)/),
10
+ t1 = EofToken.new("EOF",/^(�~~��~^^~195355369)/),
10
11
  t2 = Token.new("SPACE",/^(\s+)/n,:Skip),
11
12
  t3 = Token.new("HEXSNIPPET",/^([0-9A-F]+)/in),
12
13
  t4 = StringToken.new("StrToken61","<"),
@@ -14,9 +15,9 @@ module Rpdf2txt
14
15
  ]
15
16
  productions = [
16
17
  p1 = Production.new("HexArray'".intern,[:HexArray],SyntaxTreeBuilder.new("HexArray'",["hexarray"],[])),
17
- p2 = Production.new(:HexArray,[:"Plus-611119798"],LiftingSyntaxTreeBuilder.new(["values"],[])),
18
- p3 = Production.new(:"Plus-611119798",[:"Plus-611119798", :RangeDef],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
19
- p4 = Production.new(:"Plus-611119798",[:RangeDef],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
18
+ p2 = Production.new(:HexArray,[:Plus69966636668160],LiftingSyntaxTreeBuilder.new(["values"],[])),
19
+ p3 = Production.new(:Plus69966636668160,[:Plus69966636668160, :RangeDef],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
20
+ p4 = Production.new(:Plus69966636668160,[:RangeDef],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
20
21
  p5 = Production.new(:RangeDef,[:HexElement, :HexElement],SyntaxTreeBuilder.new("RangeDef",["source", "target"],[])),
21
22
  p6 = Production.new(:HexElement,[t4, t3, t5],LiftingSyntaxTreeBuilder.new(["_", "hexsnip", "_"],[]))
22
23
  ]
@@ -24,14 +25,14 @@ module Rpdf2txt
24
25
 
25
26
  ]
26
27
  priorities = ProductionPriorities.new(relations)
27
- action_table = [[9, 8], [2, 1], [25, 4], [9, 8], [9, 8, 4, 1], [12, 29], [37, 16], [16, 29], [8, 29], [20, 29]]
28
- goto_hash = {0 => {1 => 1, 2 => 4, 3 => 5, 4 => 3}, 3 => {4 => 7}, 4 => {3 => 8, 4 => 3}}
29
- @@parse_table_611168728 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
28
+ action_table = [[17, 8], [17, 8], [12, 29], [2, 1], [29, 4], [17, 8, 4, 1], [16, 29], [37, 16], [8, 29], [20, 29]]
29
+ goto_hash = {5 => {3 => 8, 4 => 1}, 0 => {1 => 3, 2 => 5, 3 => 2, 4 => 1}, 1 => {4 => 6}}
30
+ @@parse_table69966636607980 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
30
31
  :REDUCE,
31
32
  :SHIFT,
32
33
  :ACCEPT
33
34
  ])
34
35
  def Rpdf2txt._cmap_parser
35
- GeneralizedLrParser.new(@@parse_table_611168728)
36
+ GeneralizedLrParser.new(@@parse_table69966636607980)
36
37
  end
37
38
  end
@@ -1,12 +1,13 @@
1
+ # encoding: ascii-8bit
1
2
  require 'rpdf2txt-rockit/rockit'
2
3
  module Rpdf2txt
3
4
  # Parser for CMap
4
- # created by Rockit version 0.3.8 on Tue Dec 14 18:03:11 +0100 2010
5
+ # created by Rockit version 0.3.8 on Tue Jan 18 11:29:28 +0100 2011
5
6
  # Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se
6
7
  # and licensed under GPL
7
8
  # but this parser is under LGPL
8
9
  tokens = [
9
- t1 = EofToken.new("EOF",/^(�~~��~^^~6498354225)/),
10
+ t1 = EofToken.new("EOF",/^(�~~��~^^~1397495140)/),
10
11
  t2 = Token.new("SPACE",/^(\s+)/n,:Skip),
11
12
  t3 = Token.new("HEXSNIPPET",/^([0-9A-F]+)/in),
12
13
  t4 = StringToken.new("StrToken93","["),
@@ -16,28 +17,28 @@ module Rpdf2txt
16
17
  ]
17
18
  productions = [
18
19
  p1 = Production.new("HexArray'".intern,[:HexArray],SyntaxTreeBuilder.new("HexArray'",["hexarray"],[])),
19
- p2 = Production.new(:HexArray,[:"Plus-611547858"],LiftingSyntaxTreeBuilder.new(["values"],[])),
20
- p3 = Production.new(:"Plus-611547858",[:"Plus-611547858", :RangeDef],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
21
- p4 = Production.new(:"Plus-611547858",[:RangeDef],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
20
+ p2 = Production.new(:HexArray,[:Plus69948349851480],LiftingSyntaxTreeBuilder.new(["values"],[])),
21
+ p3 = Production.new(:Plus69948349851480,[:Plus69948349851480, :RangeDef],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
22
+ p4 = Production.new(:Plus69948349851480,[:RangeDef],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
22
23
  p5 = Production.new(:RangeDef,[:HexElement, :HexElement, :HexElement],SyntaxTreeBuilder.new("RangeDef",["start", "stop", "offset"],[])),
23
24
  p6 = Production.new(:RangeDef,[:HexElement, :HexElement, :Explicit],SyntaxTreeBuilder.new("RangeDef",["start", "stop", "explicit"],[])),
24
- p7 = Production.new(:Explicit,[t4, :"Plus-611556138", t5],LiftingSyntaxTreeBuilder.new(["_", "explicit", "_"],[])),
25
- p8 = Production.new(:"Plus-611556138",[:"Plus-611556138", :HexElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
26
- p9 = Production.new(:"Plus-611556138",[:HexElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
25
+ p7 = Production.new(:Explicit,[t4, :Plus69948349837620, t5],LiftingSyntaxTreeBuilder.new(["_", "explicit", "_"],[])),
26
+ p8 = Production.new(:Plus69948349837620,[:Plus69948349837620, :HexElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
27
+ p9 = Production.new(:Plus69948349837620,[:HexElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
27
28
  p10 = Production.new(:HexElement,[t6, t3, t7],LiftingSyntaxTreeBuilder.new(["_", "hexsnip", "_"],[]))
28
29
  ]
29
30
  relations = [
30
31
 
31
32
  ]
32
33
  priorities = ProductionPriorities.new(relations)
33
- action_table = [[9, 32], [2, 1], [25, 4], [9, 32], [9, 32, 4, 1], [12, 125], [37, 64], [9, 32, 49, 8], [8, 125], [36, 125], [20, 125], [16, 125], [9, 32], [32, 116], [61, 16, 9, 32], [24, 125], [28, 116]]
34
- goto_hash = {0 => {6 => 3, 1 => 1, 2 => 4, 3 => 5}, 12 => {5 => 14, 6 => 13}, 7 => {6 => 11, 4 => 10}, 14 => {6 => 16}, 3 => {6 => 7}, 4 => {6 => 3, 3 => 8}}
35
- @@parse_table_611607208 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
34
+ action_table = [[5, 32], [25, 4], [5, 32], [12, 125], [2, 1], [5, 32, 4, 1], [37, 64], [5, 32, 49, 8], [8, 125], [36, 125], [16, 125], [20, 125], [5, 32], [32, 116], [5, 32, 61, 16], [24, 125], [28, 116]]
35
+ goto_hash = {5 => {6 => 2, 3 => 8}, 0 => {6 => 2, 1 => 4, 2 => 5, 3 => 3}, 12 => {5 => 14, 6 => 13}, 7 => {6 => 10, 4 => 11}, 2 => {6 => 7}, 14 => {6 => 16}}
36
+ @@parse_table69948349751360 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
36
37
  :REDUCE,
37
38
  :SHIFT,
38
39
  :ACCEPT
39
40
  ])
40
41
  def Rpdf2txt._cmap_range_parser
41
- GeneralizedLrParser.new(@@parse_table_611607208)
42
+ GeneralizedLrParser.new(@@parse_table69948349751360)
42
43
  end
43
44
  end
@@ -1,12 +1,13 @@
1
+ # encoding: ascii-8bit
1
2
  require 'rpdf2txt-rockit/rockit'
2
3
  module Rpdf2txt
3
4
  # Parser for PdfAttributes
4
- # created by Rockit version 0.3.8 on Tue Dec 14 18:03:10 +0100 2010
5
+ # created by Rockit version 0.3.8 on Tue Jan 18 07:42:18 +0100 2011
5
6
  # Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se
6
7
  # and licensed under GPL
7
8
  # but this parser is under LGPL
8
9
  tokens = [
9
- t1 = EofToken.new("EOF",/^(�~~��~^^~1499757680)/),
10
+ t1 = EofToken.new("EOF",/^(�~~��~^^~2411366330)/),
10
11
  t2 = Token.new("IDENTIFIER",/^\/[\w:+\#\-\.]*(, ?[\w:+\#\-\.]*)*/n),
11
12
  t3 = Token.new("NUMERIC",/^(-?[0-9]+([.,][0-9]+)?)/n),
12
13
  t4 = Token.new("REFERENCE",/^([0-9]+\s+[0-9]+\s+R)/n),
@@ -21,7 +22,7 @@ module Rpdf2txt
21
22
  t13 = StringToken.new("StrToken345387270","(D:"),
22
23
  t14 = RegexpToken.new("RegexpToken1015925646",/[\d+']+/n),
23
24
  t15 = StringToken.new("StrToken42",")"),
24
- t16 = RegexpToken.new("RegexpToken-1047402015",/\(([^\)\\]|\\[\(\)\\]?)*?\)/n)
25
+ t16 = RegexpToken.new("RegexpToken1100081633",/\(([^\)\\]|\\[\(\)\\]?)*?\)/n)
25
26
  ]
26
27
  productions = [
27
28
  p1 = Production.new("Expr'".intern,[:Expr],SyntaxTreeBuilder.new("Expr'",["expr"],[])),
@@ -36,9 +37,9 @@ module Rpdf2txt
36
37
  p10 = Production.new(:Expr,[t8],LiftingSyntaxTreeBuilder.new(["val"],[])),
37
38
  p11 = Production.new(:Array,[t9, :ArrayElements, t10],SyntaxTreeBuilder.new("Array",["_", "values", "_"],[])),
38
39
  p12 = Production.new(:Array,[t9, t10],SyntaxTreeBuilder.new("Array",["_", "_"],[])),
39
- p13 = Production.new(:ArrayElements,[:"Plus-611837548"],LiftingSyntaxTreeBuilder.new(["values"],[])),
40
- p14 = Production.new(:"Plus-611837548",[:"Plus-611837548", :ArrayElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
41
- p15 = Production.new(:"Plus-611837548",[:ArrayElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
40
+ p13 = Production.new(:ArrayElements,[:Plus70010113073940],LiftingSyntaxTreeBuilder.new(["values"],[])),
41
+ p14 = Production.new(:Plus70010113073940,[:Plus70010113073940, :ArrayElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
42
+ p15 = Production.new(:Plus70010113073940,[:ArrayElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
42
43
  p16 = Production.new(:ArrayElement,[:Array],LiftingSyntaxTreeBuilder.new(["_"],[])),
43
44
  p17 = Production.new(:ArrayElement,[:Hash],LiftingSyntaxTreeBuilder.new(["_"],[])),
44
45
  p18 = Production.new(:ArrayElement,[t3],LiftingSyntaxTreeBuilder.new(["_"],[])),
@@ -47,10 +48,10 @@ module Rpdf2txt
47
48
  p21 = Production.new(:ArrayElement,[t5],LiftingSyntaxTreeBuilder.new(["_"],[])),
48
49
  p22 = Production.new(:ArrayElement,[t8],LiftingSyntaxTreeBuilder.new(["_"],[])),
49
50
  p23 = Production.new(:ArrayElement,[:Text],LiftingSyntaxTreeBuilder.new(["_"],[])),
50
- p24 = Production.new(:Hash,[t11, :"Mult-611844478", t12],SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[])),
51
+ p24 = Production.new(:Hash,[t11, :Mult70010113055620, t12],SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[])),
51
52
  p25 = Production.new(:Hash,[t11, t12],ArrayNodeBuilder.new([],nil,SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[]),1,[],true)),
52
- p26 = Production.new(:"Mult-611844478",[:"Mult-611844478", t2, :Expr],ArrayNodeBuilder.new([1, 2],0,nil,nil,[],true)),
53
- p27 = Production.new(:"Mult-611844478",[t2, :Expr],ArrayNodeBuilder.new([0, 1],nil,nil,nil,[],true)),
53
+ p26 = Production.new(:Mult70010113055620,[:Mult70010113055620, t2, :Expr],ArrayNodeBuilder.new([1, 2],0,nil,nil,[],true)),
54
+ p27 = Production.new(:Mult70010113055620,[t2, :Expr],ArrayNodeBuilder.new([0, 1],nil,nil,nil,[],true)),
54
55
  p28 = Production.new(:Date,[t13, t14, t15],SyntaxTreeBuilder.new("Date",["c1", "regexptoken1015925646", "c3"],[])),
55
56
  p29 = Production.new(:Text,[t16],SyntaxTreeBuilder.new("Text",["text"],[]))
56
57
  ]
@@ -58,14 +59,14 @@ module Rpdf2txt
58
59
 
59
60
  ]
60
61
  priorities = ProductionPriorities.new(relations)
61
- action_table = [[5, 4, 9, 1024, 13, 16, 37, 2, 41, 4096, 45, 256, 49, 8, 53, 128, 57, 32768], [20, 2051], [65, 2048, 69, 2], [32, 2051], [12, 2051], [2, 1], [24, 2051], [28, 2051], [8, 2051], [4, 2051], [73, 8192], [77, 512, 81, 4, 9, 1024, 85, 16, 101, 2, 45, 256, 109, 8, 117, 128, 57, 32768], [16, 2051], [36, 2051], [112, 65439], [125, 2048, 129, 2], [96, 65439], [5, 4, 9, 1024, 13, 16, 37, 2, 41, 4096, 45, 256, 49, 8, 53, 128, 57, 32768], [137, 16384], [44, 65439], [68, 65438], [80, 65438], [81, 4, 9, 1024, 85, 16, 101, 2, 45, 256, 109, 8, 117, 128, 57, 32768, 48, 512], [60, 65438], [88, 65438], [72, 65438], [64, 65438], [76, 65438], [56, 65438], [84, 65438], [145, 512], [92, 65439], [5, 4, 9, 1024, 13, 16, 37, 2, 41, 4096, 45, 256, 49, 8, 53, 128, 57, 32768], [104, 2050], [108, 2051], [52, 65438], [40, 65439], [100, 2050]]
62
- goto_hash = {22 => {5 => 35, 6 => 26, 2 => 23, 9 => 24}, 11 => {5 => 28, 6 => 26, 2 => 23, 3 => 30, 9 => 24, 4 => 22}, 0 => {6 => 8, 1 => 5, 2 => 4, 8 => 7, 9 => 6}, 17 => {6 => 8, 1 => 33, 2 => 4, 8 => 7, 9 => 6}, 2 => {7 => 15}, 32 => {6 => 8, 1 => 37, 2 => 4, 8 => 7, 9 => 6}}
63
- @@parse_table_611955958 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
62
+ action_table = [[5, 16, 9, 128, 13, 1024, 21, 4, 37, 32768, 41, 4096, 49, 8, 53, 256, 57, 2], [32, 2051], [36, 2051], [65, 2048, 69, 2], [24, 2051], [20, 2051], [12, 2051], [8, 2051], [2, 1], [112, 65439], [73, 8192], [28, 2051], [16, 2051], [77, 16, 85, 128, 13, 1024, 93, 512, 97, 4, 37, 32768, 113, 8, 53, 256, 121, 2], [4, 2051], [125, 2048, 129, 2], [96, 65439], [5, 16, 9, 128, 13, 1024, 21, 4, 37, 32768, 41, 4096, 49, 8, 53, 256, 57, 2], [137, 16384], [80, 65438], [77, 16, 85, 128, 13, 1024, 97, 4, 37, 32768, 113, 8, 53, 256, 121, 2, 48, 512], [84, 65438], [88, 65438], [44, 65439], [68, 65438], [64, 65438], [60, 65438], [56, 65438], [76, 65438], [145, 512], [72, 65438], [92, 65439], [5, 16, 9, 128, 13, 1024, 21, 4, 37, 32768, 41, 4096, 49, 8, 53, 256, 57, 2], [104, 2050], [108, 2051], [52, 65438], [40, 65439], [100, 2050]]
63
+ goto_hash = {0 => {6 => 7, 1 => 8, 2 => 6, 8 => 11, 9 => 4}, 17 => {6 => 7, 1 => 33, 2 => 6, 8 => 11, 9 => 4}, 13 => {5 => 27, 6 => 25, 2 => 26, 3 => 29, 9 => 22, 4 => 20}, 3 => {7 => 15}, 20 => {5 => 35, 6 => 25, 2 => 26, 9 => 22}, 32 => {6 => 7, 1 => 37, 2 => 6, 8 => 11, 9 => 4}}
64
+ @@parse_table70010113197280 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
64
65
  :REDUCE,
65
66
  :SHIFT,
66
67
  :ACCEPT
67
68
  ])
68
69
  def Rpdf2txt._attr_parser
69
- GeneralizedLrParser.new(@@parse_table_611955958)
70
+ GeneralizedLrParser.new(@@parse_table70010113197280)
70
71
  end
71
72
  end
@@ -1,12 +1,13 @@
1
+ # encoding: ascii-8bit
1
2
  require 'rpdf2txt-rockit/rockit'
2
3
  module Rpdf2txt
3
4
  # Parser for PdfText
4
- # created by Rockit version 0.3.8 on Thu Oct 01 11:19:33 +0200 2009
5
+ # created by Rockit version 0.3.8 on Tue Jan 18 07:42:19 +0100 2011
5
6
  # Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se
6
7
  # and licensed under GPL
7
8
  # but this parser is under LGPL
8
9
  tokens = [
9
- t1 = EofToken.new("EOF",/^(�~~��~^^~5511964093)/),
10
+ t1 = EofToken.new("EOF",/^(�~~��~^^~3062921542)/),
10
11
  t2 = Token.new("NUMERIC",/^(-?(([0-9]*[.,_][0-9]+)|([0-9]+)))/n),
11
12
  t3 = Token.new("SPACE",/^(\s+)/n,:Skip),
12
13
  t4 = Token.new("HEXSNIPPET",/^([0-9A-F]+)/in),
@@ -56,9 +57,9 @@ module Rpdf2txt
56
57
  productions = [
57
58
  p1 = Production.new("Target'".intern,[:Target],SyntaxTreeBuilder.new("Target'",["target"],[])),
58
59
  p2 = Production.new(:Target,[t10, :Exprs, t11],SyntaxTreeBuilder.new("Target",["_", "values", "_"],[])),
59
- p3 = Production.new(:Exprs,[:Plus70032186462260],LiftingSyntaxTreeBuilder.new(["values"],[])),
60
- p4 = Production.new(:Plus70032186462260,[:Plus70032186462260, :Expr],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
61
- p5 = Production.new(:Plus70032186462260,[:Expr],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
60
+ p3 = Production.new(:Exprs,[:Plus70010106411320],LiftingSyntaxTreeBuilder.new(["values"],[])),
61
+ p4 = Production.new(:Plus70010106411320,[:Plus70010106411320, :Expr],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
62
+ p5 = Production.new(:Plus70010106411320,[:Expr],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
62
63
  p6 = Production.new(:Expr,[:TmElement],LiftingSyntaxTreeBuilder.new(["val"],[])),
63
64
  p7 = Production.new(:Expr,[:Array],LiftingSyntaxTreeBuilder.new(["val"],[])),
64
65
  p8 = Production.new(:Expr,[:TDElement],LiftingSyntaxTreeBuilder.new(["val"],[])),
@@ -90,9 +91,9 @@ module Rpdf2txt
90
91
  p34 = Production.new(:HexElement,[t23, t4, t24],LiftingSyntaxTreeBuilder.new(["_", "hex", "_"],[])),
91
92
  p35 = Production.new(:HexElement,[t23, t24],LiftingSyntaxTreeBuilder.new(["_", "_"],[])),
92
93
  p36 = Production.new(:TjHex,[:HexElement, t22],SyntaxTreeBuilder.new("Tjhex",["hexsnippet", "_"],[])),
93
- p37 = Production.new(:TJArrayElements,[:Plus70032186325260],LiftingSyntaxTreeBuilder.new(["values"],[])),
94
- p38 = Production.new(:Plus70032186325260,[:Plus70032186325260, :TJSingleElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
95
- p39 = Production.new(:Plus70032186325260,[:TJSingleElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
94
+ p37 = Production.new(:TJArrayElements,[:Plus70010106266620],LiftingSyntaxTreeBuilder.new(["values"],[])),
95
+ p38 = Production.new(:Plus70010106266620,[:Plus70010106266620, :TJSingleElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
96
+ p39 = Production.new(:Plus70010106266620,[:TJSingleElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
96
97
  p40 = Production.new(:TJSingleElement,[t7],SyntaxTreeBuilder.new("TJSingleElement",["snippet"],[])),
97
98
  p41 = Production.new(:TJSingleElement,[t2],SyntaxTreeBuilder.new("TJSingleElement",["kerning"],[])),
98
99
  p42 = Production.new(:TJSingleElement,[:HexElement],SyntaxTreeBuilder.new("TJSingleElement",["hexsnippet"],[])),
@@ -107,10 +108,10 @@ module Rpdf2txt
107
108
  p51 = Production.new(:LineWidth,[t2, t33],SyntaxTreeBuilder.new("Width",["width", "_"],[])),
108
109
  p52 = Production.new(:BTElement,[t34],SyntaxTreeBuilder.new("BT",["_"],[])),
109
110
  p53 = Production.new(:ETElement,[t35],SyntaxTreeBuilder.new("ET",["_"],[])),
110
- p54 = Production.new(:Hash,[t36, :Mult70032186252840, t37],SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[])),
111
+ p54 = Production.new(:Hash,[t36, :Mult70010106193040, t37],SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[])),
111
112
  p55 = Production.new(:Hash,[t36, t37],ArrayNodeBuilder.new([],nil,SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[]),1,[],true)),
112
- p56 = Production.new(:Mult70032186252840,[:Mult70032186252840, t8, :HashExpr],ArrayNodeBuilder.new([1, 2],0,nil,nil,[],true)),
113
- p57 = Production.new(:Mult70032186252840,[t8, :HashExpr],ArrayNodeBuilder.new([0, 1],nil,nil,nil,[],true)),
113
+ p56 = Production.new(:Mult70010106193040,[:Mult70010106193040, t8, :HashExpr],ArrayNodeBuilder.new([1, 2],0,nil,nil,[],true)),
114
+ p57 = Production.new(:Mult70010106193040,[t8, :HashExpr],ArrayNodeBuilder.new([0, 1],nil,nil,nil,[],true)),
114
115
  p58 = Production.new(:HashExpr,[t8],LiftingSyntaxTreeBuilder.new(["val"],[])),
115
116
  p59 = Production.new(:HashExpr,[t2],LiftingSyntaxTreeBuilder.new(["val"],[])),
116
117
  p60 = Production.new(:HashExpr,[t9],LiftingSyntaxTreeBuilder.new(["val"],[])),
@@ -122,9 +123,9 @@ module Rpdf2txt
122
123
  p66 = Production.new(:UElement,[t2, t43],SyntaxTreeBuilder.new("UElement",["c1", "regexptoken-265279295"],[])),
123
124
  p67 = Production.new(:UElement,[t2, t2, t2, t44],SyntaxTreeBuilder.new("UElement",["c1", "numeric2", "numeric3", "c4"],[])),
124
125
  p68 = Production.new(:UElement,[t45, t2, t46],SyntaxTreeBuilder.new("UElement",["c1", "numeric", "c3"],[])),
125
- p69 = Production.new(:CNElements,[:Plus70032186212980],SyntaxTreeBuilder.new("CNElements",["plus"],[])),
126
- p70 = Production.new(:Plus70032186212980,[:Plus70032186212980, :CNElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
127
- p71 = Production.new(:Plus70032186212980,[:CNElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
126
+ p69 = Production.new(:CNElements,[:Plus70010106149180],SyntaxTreeBuilder.new("CNElements",["plus"],[])),
127
+ p70 = Production.new(:Plus70010106149180,[:Plus70010106149180, :CNElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
128
+ p71 = Production.new(:Plus70010106149180,[:CNElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
128
129
  p72 = Production.new(:CNElement,[t5],SyntaxTreeBuilder.new("CNElement",["c1"],[])),
129
130
  p73 = Production.new(:CNElement,[t19, t5, t5],SyntaxTreeBuilder.new("CNElement",["c1", "alphanumeric", "maybe"],[])),
130
131
  p74 = Production.new(:CNElement,[t19, t5],SyntaxTreeBuilder.new("CNElement",["c1", "alphanumeric", "maybe"],[2]))
@@ -133,14 +134,14 @@ module Rpdf2txt
133
134
 
134
135
  ]
135
136
  priorities = ProductionPriorities.new(relations)
136
- action_table = [[5, 512], [13, 16, 57, 2, 69, 4194304, 73, 262144, 85, 64, 89, 2048, 125, 1099511627776, 129, 67108864, 141, 17592186044416], [2, 1], [284, 2199023517712], [88, 70342974373338], [80, 70342974373338], [72, 70342974373338], [60, 70342974373338], [153, 2097152], [84, 70342974373338], [52, 70342974373338], [28, 70342974373338], [157, 2199023255552], [36, 70342974373338], [161, 4294967296, 165, 33554432, 169, 65536, 173, 536870912, 177, 4398046511104, 181, 2, 185, 134217728, 189, 1048576, 193, 131072], [24, 70342974373338], [16, 70342974373338], [197, 8, 201, 8388608], [205, 16], [32, 70342974373338], [13, 16, 209, 262144, 272, 2199023255552], [217, 2097152, 221, 1073741824], [233, 2, 69, 4194304, 237, 16777216, 241, 64], [92, 70342974373338], [76, 70342974373338], [64, 70342974373338], [40, 70342974373338], [20, 70342974373338], [68, 70342974373338], [13, 16, 57, 2, 69, 4194304, 73, 262144, 85, 64, 89, 2048, 125, 1099511627776, 129, 67108864, 141, 17592186044416, 8, 1024], [257, 1024], [252, 70342974373338], [176, 70342974373338], [56, 70342974373338], [280, 2199023517712], [261, 2], [48, 70342974373338], [44, 70342974373338], [140, 70342974373338], [256, 70342974373338], [200, 70342974373338], [172, 70342974373338], [112, 70342974373338], [188, 70342974373338], [260, 70342974373338], [265, 2, 269, 64, 273, 16384, 277, 32768], [180, 70342974373338], [124, 70342974373338], [116, 70342974373338], [281, 8388608], [136, 31461450], [285, 16, 289, 34359738368, 297, 2, 301, 274877906944, 292, 2199023517712], [305, 16], [276, 2199023517712], [128, 70342974373338], [192, 70342974373338], [164, 29364298], [233, 2, 69, 4194304, 237, 16777216, 241, 64, 144, 4096], [160, 29364298], [168, 29364298], [156, 29364298], [152, 29364298], [313, 4096], [12, 70342974373338], [4, 1], [317, 35184372088832], [321, 8796093022208, 325, 268435456, 329, 2], [333, 2147483648], [104, 70342974373338], [108, 70342974373338], [132, 31461450], [288, 2199023517712], [341, 68719476736, 
345, 128], [349, 549755813888], [353, 524288], [244, 70342974373338], [285, 16, 292, 2199023517712], [148, 29364298], [96, 70342974373338], [268, 70342974373338], [264, 70342974373338], [184, 70342974373338], [357, 137438953472, 361, 2], [196, 70342974373338], [365, 68719476736, 369, 128], [216, 549755813888], [373, 2, 381, 128, 385, 256], [248, 70342974373338], [120, 70342974373338], [240, 70342974373338], [389, 2], [212, 549755813888], [373, 2, 381, 128, 385, 256], [232, 68719476864], [224, 68719476864], [228, 68719476864], [236, 68719476864], [397, 8192], [220, 68719476864], [100, 70342974373338]]
137
- goto_hash = {22 => {16 => 62, 17 => 57, 18 => 61, 14 => 56}, 0 => {1 => 2}, 72 => {30 => 84}, 1 => {5 => 15, 11 => 37, 22 => 24, 33 => 12, 6 => 27, 12 => 28, 34 => 20, 23 => 10, 7 => 11, 13 => 36, 35 => 34, 2 => 30, 24 => 5, 19 => 25, 8 => 19, 3 => 29, 25 => 9, 14 => 8, 9 => 13, 20 => 7, 15 => 33, 4 => 16, 26 => 4, 10 => 26, 32 => 23, 21 => 6}, 51 => {29 => 73}, 29 => {5 => 15, 11 => 37, 22 => 24, 33 => 12, 6 => 27, 12 => 28, 34 => 20, 23 => 10, 7 => 11, 13 => 36, 35 => 34, 24 => 5, 19 => 25, 8 => 19, 25 => 9, 14 => 8, 9 => 13, 20 => 7, 15 => 33, 4 => 63, 26 => 4, 10 => 26, 32 => 23, 21 => 6}, 57 => {18 => 77, 14 => 56}, 86 => {31 => 94}, 20 => {35 => 53}, 92 => {31 => 98}}
138
- @@parse_table70032185719080 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
137
+ action_table = [[5, 512], [13, 4194304, 33, 17592186044416, 41, 262144, 65, 2, 85, 16, 113, 1099511627776, 125, 64, 137, 67108864, 141, 2048], [2, 1], [153, 8388608, 157, 8], [88, 70342974373338], [80, 70342974373338], [72, 70342974373338], [60, 70342974373338], [161, 2], [165, 2097152], [169, 16], [84, 70342974373338], [52, 70342974373338], [28, 70342974373338], [173, 2199023255552], [36, 70342974373338], [177, 131072, 181, 2, 185, 4294967296, 189, 33554432, 193, 65536, 197, 4398046511104, 201, 536870912, 205, 134217728, 209, 1048576], [24, 70342974373338], [16, 70342974373338], [32, 70342974373338], [13, 4194304, 33, 17592186044416, 41, 262144, 65, 2, 85, 16, 113, 1099511627776, 125, 64, 137, 67108864, 141, 2048, 8, 1024], [284, 2199023517712], [217, 262144, 85, 16, 272, 2199023255552], [92, 70342974373338], [76, 70342974373338], [64, 70342974373338], [40, 70342974373338], [20, 70342974373338], [252, 70342974373338], [68, 70342974373338], [225, 1024], [229, 2097152, 233, 1073741824], [56, 70342974373338], [280, 2199023517712], [176, 70342974373338], [13, 4194304, 241, 16777216, 249, 2, 261, 64], [48, 70342974373338], [44, 70342974373338], [136, 31461450], [265, 8388608], [269, 35184372088832], [140, 70342974373338], [273, 34359738368, 281, 2, 285, 16, 289, 274877906944, 292, 2199023517712], [256, 70342974373338], [116, 70342974373338], [293, 16384, 297, 2, 301, 32768, 305, 64], [200, 70342974373338], [172, 70342974373338], [112, 70342974373338], [260, 70342974373338], [188, 70342974373338], [180, 70342974373338], [124, 70342974373338], [12, 70342974373338], [309, 16], [276, 2199023517712], [4, 1], [128, 70342974373338], [192, 70342974373338], [13, 4194304, 241, 16777216, 249, 2, 261, 64, 144, 4096], [168, 29364298], [164, 29364298], [160, 29364298], [152, 29364298], [317, 4096], [156, 29364298], [132, 31461450], [268, 70342974373338], [325, 68719476736, 329, 128], [333, 549755813888], [337, 524288], [288, 2199023517712], [244, 70342974373338], [104, 
70342974373338], [341, 268435456, 345, 8796093022208, 349, 2], [108, 70342974373338], [353, 2147483648], [285, 16, 292, 2199023517712], [148, 29364298], [96, 70342974373338], [357, 68719476736, 361, 128], [216, 549755813888], [365, 256, 369, 2, 377, 128], [248, 70342974373338], [120, 70342974373338], [184, 70342974373338], [264, 70342974373338], [381, 137438953472, 385, 2], [196, 70342974373338], [212, 549755813888], [365, 256, 369, 2, 377, 128], [236, 68719476864], [232, 68719476864], [224, 68719476864], [228, 68719476864], [240, 70342974373338], [393, 2], [220, 68719476864], [397, 8192], [100, 70342974373338]]
138
+ goto_hash = {82 => {31 => 93}, 22 => {35 => 55}, 0 => {1 => 2}, 1 => {5 => 17, 11 => 37, 22 => 24, 33 => 14, 6 => 27, 12 => 29, 34 => 22, 23 => 12, 7 => 13, 13 => 36, 35 => 33, 2 => 30, 24 => 5, 19 => 25, 8 => 19, 3 => 20, 25 => 11, 14 => 9, 9 => 15, 20 => 7, 15 => 32, 4 => 18, 26 => 4, 10 => 26, 32 => 23, 21 => 6}, 90 => {31 => 97}, 68 => {30 => 80}, 35 => {16 => 64, 17 => 59, 18 => 63, 14 => 61}, 42 => {29 => 69}, 20 => {5 => 17, 11 => 37, 22 => 24, 33 => 14, 6 => 27, 12 => 29, 34 => 22, 23 => 12, 7 => 13, 13 => 36, 35 => 33, 24 => 5, 19 => 25, 8 => 19, 25 => 11, 14 => 9, 9 => 15, 20 => 7, 15 => 32, 4 => 53, 26 => 4, 10 => 26, 32 => 23, 21 => 6}, 59 => {18 => 78, 14 => 61}}
139
+ @@parse_table70010113257400 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
139
140
  :REDUCE,
140
141
  :SHIFT,
141
142
  :ACCEPT
142
143
  ])
143
144
  def Rpdf2txt._text_parser
144
- GeneralizedLrParser.new(@@parse_table70032185719080)
145
+ GeneralizedLrParser.new(@@parse_table70010113257400)
145
146
  end
146
147
  end
@@ -20,6 +20,7 @@
20
20
  # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
21
  # hwyss@ywesee.com, aschrafl@ywesee.com
22
22
  #
23
+ # PdfObject -- Rpdf2txt -- 17.05.2011 -- mhatakeyama@ywesee.com
23
24
  # PdfObject -- Rpdf2txt -- 21.11.2002 -- aschrafl@ywesee.com
24
25
 
25
26
  require 'zlib'
@@ -27,7 +28,7 @@ require 'rpdf2txt/text'
27
28
  require 'rpdf2txt/attributesparser'
28
29
  require 'rpdf2txt/cmapparser'
29
30
  require 'rpdf2txt/symbol'
30
- require 'md5'
31
+ require 'digest/md5'
31
32
  require 'matrix'
32
33
 
33
34
  module Rpdf2txt
@@ -101,13 +102,15 @@ module Rpdf2txt
101
102
  ast.values.collect { |child| extract_attributes(child) }
102
103
  elsif(ast.children_names.include?('pairs'))
103
104
  result = {}
104
- ast.pairs.each { |pair|
105
- k, v = pair
106
- keystr = k.value.strip.tr('/','')
107
- unless(keystr.empty?)
105
+ if(ast_pairs = ast.pairs)
106
+ ast_pairs.each { |pair|
107
+ k, v = pair
108
+ keystr = k.value.strip.tr('/','')
109
+ unless(keystr.empty?)
108
110
  result.store(keystr.downcase.intern, extract_attributes(v))
109
- end
110
- }
111
+ end
112
+ }
113
+ end
111
114
  result
112
115
  else
113
116
  value = ast
@@ -200,9 +203,13 @@ module Rpdf2txt
200
203
  end
201
204
  encryption_key = digest[0,keylength]
202
205
  test_key = compute_user_key encryption_key
203
- if(test_key != uk)
204
- raise DecryptionError, "test-key did not match user-key ('#{test_key.inspect}' / '#{uk.inspect}')"
205
- end
206
+ ## The following check is commented out because import_gkv (de.oddb.org) stops when this error is raised
207
+ # See http://dev.ywesee.com/wiki.php/Masa/20110209-debug-importGkv-rpdf2txt
208
+ # Also refer to http://trac.ywesee.com/ticket/74#comment:5
209
+ #
210
+ #if(test_key != uk)
211
+ # raise DecryptionError, "test-key did not match user-key ('#{test_key.inspect}' / '#{uk.inspect}')"
212
+ #end
206
213
  encryption_key
207
214
  end
208
215
  def file_id= (file_id)
@@ -309,7 +316,11 @@ module Rpdf2txt
309
316
  end
310
317
  def width(char)
311
318
  if(char.is_a?(String) && char.length == 1)
312
- char = char[0]
319
+ if RUBY_VERSION > "1.9"
320
+ char = char.bytes.to_a[0]
321
+ else
322
+ char = char[0]
323
+ end
313
324
  end
314
325
  _width(char) || named_width(char)
315
326
  end
@@ -442,11 +453,15 @@ module Rpdf2txt
442
453
  yield self
443
454
  end
444
455
  def extract_oids(array)
445
- array.collect{ |dirty_id|
456
+ if array.class != Array
457
+ array = [array]
458
+ end
459
+ result = array.collect{ |dirty_id|
446
460
  if(match = /\d+/on.match(dirty_id))
447
461
  match[0].to_i
448
462
  end
449
463
  }.compact
464
+ return result
450
465
  end
451
466
  def root?
452
467
  !(@parent || @attributes[:parent])
@@ -519,6 +534,36 @@ module Rpdf2txt
519
534
  parent.media_box
520
535
  end
521
536
  end
537
+ def merge_snippets(text_snippets)
538
+ # this is required for the pdf file that is written by pdfFactory 3.25
539
+ # (Windows Server 2003 R2 Standard Edition German)
540
+ # This builds up a meaningful snippet from the small snippets whose
541
+ # x, y positions are same
542
+ # See in more detail:
543
+ # * http://dev.ywesee.com/wiki.php/Masa/20110516-trace-rpdf2txt
544
+ # * http://dev.ywesee.com/wiki.php/Masa/20110517-update-rpdf2txt
545
+ new_text_snippets = []
546
+ last = nil
547
+ snippet = nil
548
+ text_snippets.each do |snip|
549
+ snippet ||= snip.txt
550
+ if last
551
+ if last == snip
552
+ snippet << snip.txt
553
+ else
554
+ last.txt = snippet
555
+ new_text_snippets << last.dup
556
+ snippet = snip.txt
557
+ last = snip
558
+ end
559
+ end
560
+ last = snip
561
+ end
562
+ # for the last element
563
+ lasttxt = snippet
564
+ new_text_snippets << last.dup
565
+ return new_text_snippets
566
+ end
522
567
  def text(callback_handler)
523
568
  concat_stream = Stream.new('')
524
569
  if(@contents.size == 1 && @contents.first.is_a?(ReferenceArray))
@@ -530,6 +575,7 @@ module Rpdf2txt
530
575
  end
531
576
  @text_state.media_box = self.media_box
532
577
  text_snippets = concat_stream.extract_text_objects(self, @text_state)
578
+ text_snippets = merge_snippets(text_snippets)
533
579
  join_snippets(text_snippets, callback_handler)
534
580
  end
535
581
  private
@@ -756,7 +802,16 @@ module Rpdf2txt
756
802
  result
757
803
  end
758
804
  def raw_stream
759
- @raw_stream ||= @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s
805
+ #@raw_stream ||= @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s
806
+ #@raw_stream ||= @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn)[0][0]
807
+ unless(@raw_stream)
808
+ if(src_scan = @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn) and !src_scan.empty?)
809
+ @raw_stream = src_scan[0][0]
810
+ else
811
+ @raw_stream = src_scan.to_s
812
+ end
813
+ end
814
+ return @raw_stream
760
815
  end
761
816
  def decode_raw_stream
762
817
  @decrypted_stream = raw_stream
@@ -20,15 +20,16 @@
20
20
  # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
21
  # hwyss@ywesee.com, aschrafl@ywesee.com
22
22
  #
23
+ # PdfParser -- Rpdf2txt-- 05.01.2012 -- mhatakeyama@ywesee.com
23
24
  # PdfParser -- Rpdf2txt-- 14.11.2002 -- aschrafl@ywesee.com
24
25
 
25
26
  require 'zlib'
26
27
  require 'rpdf2txt/object'
27
28
  require 'rpdf2txt/default_handler'
28
- require 'md5'
29
+ require 'digest/md5'
29
30
 
30
31
  module Rpdf2txt
31
- VERSION = '0.8.2'
32
+ VERSION = '0.8.3'
32
33
  class Parser
33
34
  attr_accessor :encrypt
34
35
  def initialize(pdf_stream, target_encoding='utf8')
@@ -128,6 +129,9 @@ module Rpdf2txt
128
129
  startobj=0
129
130
  endobj=0
130
131
  catalogue = {}
132
+ if RUBY_VERSION >= '1.9'
133
+ @src.force_encoding('ascii-8bit')
134
+ end
131
135
  @src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |match|
132
136
  obj = build_object(match.to_s)
133
137
  catalogue.store(obj.oid, obj)
data/lib/rpdf2txt/text.rb CHANGED
@@ -64,7 +64,6 @@ module Rpdf2txt
64
64
  elsif((map = @current_font.to_unicode) \
65
65
  && (utf8 = map.to_utf8(ascii)))
66
66
  @current_font.attributes[:encoding] = '/UTF8'
67
- #@text_state.set_font(@current_font)
68
67
  [utf8].pack('U')
69
68
  end
70
69
  end
@@ -165,7 +164,7 @@ module Rpdf2txt
165
164
  snippet_text = snippet[1..-2].gsub(/\\[nrt]/n, " ")
166
165
  snippet_text.gsub!(/\\([()])/n, '\1')
167
166
  snippet_text.gsub!(/./n) { |char|
168
- self.mapped_ascii(char[0]) || char
167
+ self.mapped_ascii(char.unpack('C*')[0]) || char
169
168
  }
170
169
  _snip(snippet_text)
171
170
  end
@@ -20,6 +20,7 @@
20
20
  # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
21
  # hwyss@ywesee.com, aschrafl@ywesee.com
22
22
  #
23
+ # TextState -- Rpdf2txt -- 05.01.2012 -- mhatakeyama@ywesee.com
23
24
  # TextState -- Rpdf2txt -- 29.11.2002 -- asschrafl@ywesee.com
24
25
 
25
26
  module Rpdf2txt
@@ -124,7 +125,8 @@ module Rpdf2txt
124
125
  end
125
126
  def char_width(char)
126
127
  if(char.is_a? String)
127
- char = char[0]
128
+ #char = char[0]
129
+ char = char.unpack('C*')[0]
128
130
  end
129
131
  w = 0.0
130
132
  if(@font && (width = @font.width(char)))
@@ -306,7 +308,10 @@ module Rpdf2txt
306
308
  @boxwidth += char_width(char)
307
309
  end
308
310
  @w = @boxwidth
309
- if white = txt[/\s+$/u]
311
+ if RUBY_VERSION >= '1.9'
312
+ txt.force_encoding('ascii-8bit')
313
+ end
314
+ if white = txt[/\s+$/n]
310
315
  white.each_byte do |char|
311
316
  @w += char_width(char)
312
317
  end
@@ -329,6 +334,9 @@ module Rpdf2txt
329
334
  @tmy -= y_val.to_f
330
335
  end
331
336
  def unescape_txt!(txt)
337
+ if RUBY_VERSION >= '1.9'
338
+ txt.force_encoding('ascii-8bit')
339
+ end
332
340
  txt.gsub!(/\\([0-9]{3})/n) { |match| $1.oct.chr }
333
341
  end
334
342
  protected
@@ -51,7 +51,7 @@ module Parse
51
51
  }
52
52
  File.open(outputFile, "w") do |f|
53
53
  time_and_puts("Writing parser to file #{outputFile}") {
54
- f.write "require 'rpdf2txt-rockit/rockit'\n" +
54
+ f.write "# encoding: ascii-8bit\nrequire 'rpdf2txt-rockit/rockit'\n" +
55
55
  parser.to_src_in_module(parserName, moduleName)
56
56
  }
57
57
  end
@@ -1,3 +1,4 @@
1
+ # encoding: ascii-8bit
1
2
  require 'rpdf2txt-rockit/rockit'
2
3
  module Parse
3
4
  # Parser for RockitGrammar
@@ -1,3 +1,4 @@
1
+ # encoding: ascii-8bit
1
2
  require 'rpdf2txt-rockit/syntax_tree'
2
3
  require 'rpdf2txt-rockit/sourcecode_dumpable'
3
4
  require 'rpdf2txt-rockit/bounded_lru_cache'