rpdf2txt 0.8.2 → 0.8.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/History.txt +4 -0
- data/Manifest.txt +0 -4
- data/README.txt +16 -3
- data/bin/rpdf2txt +4 -1
- data/lib/rpdf2txt/data/cmap.rb +10 -9
- data/lib/rpdf2txt/data/cmap_range.rb +13 -12
- data/lib/rpdf2txt/data/pdfattributes.rb +14 -13
- data/lib/rpdf2txt/data/pdftext.rb +19 -18
- data/lib/rpdf2txt/object.rb +68 -13
- data/lib/rpdf2txt/parser.rb +6 -2
- data/lib/rpdf2txt/text.rb +1 -2
- data/lib/rpdf2txt/text_state.rb +10 -2
- data/lib/rpdf2txt-rockit/rockit.rb +1 -1
- data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +1 -0
- data/lib/rpdf2txt-rockit/token.rb +1 -0
- data/test/mock.rb +19 -11
- data/test/test_object.rb +33 -0
- data/test/test_pdf_object.rb +25 -24
- data/test/test_pdf_parser.rb +8 -5
- data/test/test_pdf_text.rb +11 -10
- data/test/test_space_bug_05_2004.rb +2 -1
- data/test/test_stream.rb +6 -5
- data/test/test_text_state.rb +220 -219
- metadata +13 -14
- data/config.save +0 -12
- data/lib/rpdf2txt/data/_cmap.grammar +0 -11
- data/lib/rpdf2txt/data/_cmap_range.grammar +0 -15
- data/lib/rpdf2txt/data/_pdfattributes.grammar +0 -32
data/.gemtest
ADDED
File without changes
|
data/History.txt
CHANGED
data/Manifest.txt
CHANGED
@@ -4,7 +4,6 @@ Manifest.txt
|
|
4
4
|
README.txt
|
5
5
|
Rakefile
|
6
6
|
bin/rpdf2txt
|
7
|
-
config.save
|
8
7
|
install.rb
|
9
8
|
lib/rpdf2txt-rockit/base_extensions.rb
|
10
9
|
lib/rpdf2txt-rockit/bootstrap.rb
|
@@ -36,9 +35,6 @@ lib/rpdf2txt/data/cmap.grammar
|
|
36
35
|
lib/rpdf2txt/data/cmap.rb
|
37
36
|
lib/rpdf2txt/data/cmap_range.grammar
|
38
37
|
lib/rpdf2txt/data/cmap_range.rb
|
39
|
-
lib/rpdf2txt/data/_cmap.grammar
|
40
|
-
lib/rpdf2txt/data/_cmap_range.grammar
|
41
|
-
lib/rpdf2txt/data/_pdfattributes.grammar
|
42
38
|
lib/rpdf2txt/data/fonts/Courier-Bold.afm
|
43
39
|
lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm
|
44
40
|
lib/rpdf2txt/data/fonts/Courier-Oblique.afm
|
data/README.txt
CHANGED
@@ -12,7 +12,20 @@ Not the problems here.
|
|
12
12
|
|
13
13
|
== REQUIREMENTS:
|
14
14
|
|
15
|
-
* Ruby 1.8
|
15
|
+
* Ruby 1.8 or Ruby 1.9.3
|
16
|
+
|
17
|
+
NOTE for Ruby 1.9.3
|
18
|
+
|
19
|
+
* rpdf2txt on Ruby 1.8.6 creates the Parser-Script from the grammar files
|
20
|
+
(the ones without a leading underscore). Ruby 1.9.3 fails to create the
|
21
|
+
Parser-Script. The workaround is to copy each grammar file to _<grammar_file>
|
22
|
+
(e.g. cmap.grammar to _cmap.grammar); rpdf2txt (with Ruby 1.8.6 and 1.9.3) then
|
23
|
+
stops creating a new Parser-Script and just uses the existing one. In short:
|
24
|
+
Ruby 1.9.3 just uses the Parser-Script generated with Ruby 1.8.6.
|
25
|
+
|
26
|
+
* If you change the grammar files, you have to create the
|
27
|
+
Parser-Script again with Ruby 1.8.6. The grammar file without the leading
|
28
|
+
underscore is the one that has to be changed.
|
16
29
|
|
17
30
|
== INSTALL:
|
18
31
|
|
@@ -23,8 +36,8 @@ Not the problems here.
|
|
23
36
|
|
24
37
|
== DEVELOPERS:
|
25
38
|
|
26
|
-
Masaomi Hatakeyama, mhatakeyama@ywesee.com
|
27
|
-
Zeno R.R. Davatz, zdavatz@ywesee.com
|
39
|
+
* Masaomi Hatakeyama, mhatakeyama@ywesee.com
|
40
|
+
* Zeno R.R. Davatz, zdavatz@ywesee.com
|
28
41
|
|
29
42
|
== LICENSE:
|
30
43
|
* GPLv2
|
data/bin/rpdf2txt
CHANGED
@@ -49,7 +49,10 @@ if <output-file> is omitted, the extracted text is written to stdout
|
|
49
49
|
EOS
|
50
50
|
exit
|
51
51
|
end
|
52
|
-
|
52
|
+
# Read the whole input PDF in binary mode.
# File.open is used instead of Kernel#open: Kernel#open treats an argument
# that starts with "|" as a command to run, and ARGV[0] comes straight from
# the command line.
stream = File.open(ARGV[0], 'rb') do |file|
  file.read
end
|
55
|
+
parser = Rpdf2txt::Parser.new(stream, 'utf-8')
|
53
56
|
outstream = STDOUT
|
54
57
|
if(ARGV.size == 2)
|
55
58
|
outstream = File.open(ARGV[1], 'w')
|
data/lib/rpdf2txt/data/cmap.rb
CHANGED
@@ -1,12 +1,13 @@
|
|
1
|
+
# encoding: ascii-8bit
|
1
2
|
require 'rpdf2txt-rockit/rockit'
|
2
3
|
module Rpdf2txt
|
3
4
|
# Parser for CMap
|
4
|
-
# created by Rockit version 0.3.8 on Tue
|
5
|
+
# created by Rockit version 0.3.8 on Tue Jan 18 11:22:39 +0100 2011
|
5
6
|
# Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se
|
6
7
|
# and licensed under GPL
|
7
8
|
# but this parser is under LGPL
|
8
9
|
tokens = [
|
9
|
-
t1 = EofToken.new("EOF",/^(�~~��~^^~
|
10
|
+
t1 = EofToken.new("EOF",/^(�~~��~^^~195355369)/),
|
10
11
|
t2 = Token.new("SPACE",/^(\s+)/n,:Skip),
|
11
12
|
t3 = Token.new("HEXSNIPPET",/^([0-9A-F]+)/in),
|
12
13
|
t4 = StringToken.new("StrToken61","<"),
|
@@ -14,9 +15,9 @@ module Rpdf2txt
|
|
14
15
|
]
|
15
16
|
productions = [
|
16
17
|
p1 = Production.new("HexArray'".intern,[:HexArray],SyntaxTreeBuilder.new("HexArray'",["hexarray"],[])),
|
17
|
-
p2 = Production.new(:HexArray,[:
|
18
|
-
p3 = Production.new(:
|
19
|
-
p4 = Production.new(:
|
18
|
+
p2 = Production.new(:HexArray,[:Plus69966636668160],LiftingSyntaxTreeBuilder.new(["values"],[])),
|
19
|
+
p3 = Production.new(:Plus69966636668160,[:Plus69966636668160, :RangeDef],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
|
20
|
+
p4 = Production.new(:Plus69966636668160,[:RangeDef],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
|
20
21
|
p5 = Production.new(:RangeDef,[:HexElement, :HexElement],SyntaxTreeBuilder.new("RangeDef",["source", "target"],[])),
|
21
22
|
p6 = Production.new(:HexElement,[t4, t3, t5],LiftingSyntaxTreeBuilder.new(["_", "hexsnip", "_"],[]))
|
22
23
|
]
|
@@ -24,14 +25,14 @@ module Rpdf2txt
|
|
24
25
|
|
25
26
|
]
|
26
27
|
priorities = ProductionPriorities.new(relations)
|
27
|
-
action_table = [[
|
28
|
-
goto_hash = {
|
29
|
-
@@
|
28
|
+
action_table = [[17, 8], [17, 8], [12, 29], [2, 1], [29, 4], [17, 8, 4, 1], [16, 29], [37, 16], [8, 29], [20, 29]]
|
29
|
+
goto_hash = {5 => {3 => 8, 4 => 1}, 0 => {1 => 3, 2 => 5, 3 => 2, 4 => 1}, 1 => {4 => 6}}
|
30
|
+
@@parse_table69966636607980 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
|
30
31
|
:REDUCE,
|
31
32
|
:SHIFT,
|
32
33
|
:ACCEPT
|
33
34
|
])
|
34
35
|
# Returns a new GeneralizedLrParser built on the CMap parse table that this
# generated file stores in @@parse_table69966636607980.
def Rpdf2txt._cmap_parser
  table = @@parse_table69966636607980
  GeneralizedLrParser.new(table)
end
|
37
38
|
end
|
@@ -1,12 +1,13 @@
|
|
1
|
+
# encoding: ascii-8bit
|
1
2
|
require 'rpdf2txt-rockit/rockit'
|
2
3
|
module Rpdf2txt
|
3
4
|
# Parser for CMap
|
4
|
-
# created by Rockit version 0.3.8 on Tue
|
5
|
+
# created by Rockit version 0.3.8 on Tue Jan 18 11:29:28 +0100 2011
|
5
6
|
# Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se
|
6
7
|
# and licensed under GPL
|
7
8
|
# but this parser is under LGPL
|
8
9
|
tokens = [
|
9
|
-
t1 = EofToken.new("EOF",/^(�~~��~^^~
|
10
|
+
t1 = EofToken.new("EOF",/^(�~~��~^^~1397495140)/),
|
10
11
|
t2 = Token.new("SPACE",/^(\s+)/n,:Skip),
|
11
12
|
t3 = Token.new("HEXSNIPPET",/^([0-9A-F]+)/in),
|
12
13
|
t4 = StringToken.new("StrToken93","["),
|
@@ -16,28 +17,28 @@ module Rpdf2txt
|
|
16
17
|
]
|
17
18
|
productions = [
|
18
19
|
p1 = Production.new("HexArray'".intern,[:HexArray],SyntaxTreeBuilder.new("HexArray'",["hexarray"],[])),
|
19
|
-
p2 = Production.new(:HexArray,[:
|
20
|
-
p3 = Production.new(:
|
21
|
-
p4 = Production.new(:
|
20
|
+
p2 = Production.new(:HexArray,[:Plus69948349851480],LiftingSyntaxTreeBuilder.new(["values"],[])),
|
21
|
+
p3 = Production.new(:Plus69948349851480,[:Plus69948349851480, :RangeDef],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
|
22
|
+
p4 = Production.new(:Plus69948349851480,[:RangeDef],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
|
22
23
|
p5 = Production.new(:RangeDef,[:HexElement, :HexElement, :HexElement],SyntaxTreeBuilder.new("RangeDef",["start", "stop", "offset"],[])),
|
23
24
|
p6 = Production.new(:RangeDef,[:HexElement, :HexElement, :Explicit],SyntaxTreeBuilder.new("RangeDef",["start", "stop", "explicit"],[])),
|
24
|
-
p7 = Production.new(:Explicit,[t4, :
|
25
|
-
p8 = Production.new(:
|
26
|
-
p9 = Production.new(:
|
25
|
+
p7 = Production.new(:Explicit,[t4, :Plus69948349837620, t5],LiftingSyntaxTreeBuilder.new(["_", "explicit", "_"],[])),
|
26
|
+
p8 = Production.new(:Plus69948349837620,[:Plus69948349837620, :HexElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
|
27
|
+
p9 = Production.new(:Plus69948349837620,[:HexElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
|
27
28
|
p10 = Production.new(:HexElement,[t6, t3, t7],LiftingSyntaxTreeBuilder.new(["_", "hexsnip", "_"],[]))
|
28
29
|
]
|
29
30
|
relations = [
|
30
31
|
|
31
32
|
]
|
32
33
|
priorities = ProductionPriorities.new(relations)
|
33
|
-
action_table = [[
|
34
|
-
goto_hash = {
|
35
|
-
@@
|
34
|
+
action_table = [[5, 32], [25, 4], [5, 32], [12, 125], [2, 1], [5, 32, 4, 1], [37, 64], [5, 32, 49, 8], [8, 125], [36, 125], [16, 125], [20, 125], [5, 32], [32, 116], [5, 32, 61, 16], [24, 125], [28, 116]]
|
35
|
+
goto_hash = {5 => {6 => 2, 3 => 8}, 0 => {6 => 2, 1 => 4, 2 => 5, 3 => 3}, 12 => {5 => 14, 6 => 13}, 7 => {6 => 10, 4 => 11}, 2 => {6 => 7}, 14 => {6 => 16}}
|
36
|
+
@@parse_table69948349751360 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
|
36
37
|
:REDUCE,
|
37
38
|
:SHIFT,
|
38
39
|
:ACCEPT
|
39
40
|
])
|
40
41
|
# Returns a new GeneralizedLrParser built on the CMap range parse table that
# this generated file stores in @@parse_table69948349751360.
def Rpdf2txt._cmap_range_parser
  table = @@parse_table69948349751360
  GeneralizedLrParser.new(table)
end
|
43
44
|
end
|
@@ -1,12 +1,13 @@
|
|
1
|
+
# encoding: ascii-8bit
|
1
2
|
require 'rpdf2txt-rockit/rockit'
|
2
3
|
module Rpdf2txt
|
3
4
|
# Parser for PdfAttributes
|
4
|
-
# created by Rockit version 0.3.8 on Tue
|
5
|
+
# created by Rockit version 0.3.8 on Tue Jan 18 07:42:18 +0100 2011
|
5
6
|
# Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se
|
6
7
|
# and licensed under GPL
|
7
8
|
# but this parser is under LGPL
|
8
9
|
tokens = [
|
9
|
-
t1 = EofToken.new("EOF",/^(�~~��~^^~
|
10
|
+
t1 = EofToken.new("EOF",/^(�~~��~^^~2411366330)/),
|
10
11
|
t2 = Token.new("IDENTIFIER",/^\/[\w:+\#\-\.]*(, ?[\w:+\#\-\.]*)*/n),
|
11
12
|
t3 = Token.new("NUMERIC",/^(-?[0-9]+([.,][0-9]+)?)/n),
|
12
13
|
t4 = Token.new("REFERENCE",/^([0-9]+\s+[0-9]+\s+R)/n),
|
@@ -21,7 +22,7 @@ module Rpdf2txt
|
|
21
22
|
t13 = StringToken.new("StrToken345387270","(D:"),
|
22
23
|
t14 = RegexpToken.new("RegexpToken1015925646",/[\d+']+/n),
|
23
24
|
t15 = StringToken.new("StrToken42",")"),
|
24
|
-
t16 = RegexpToken.new("
|
25
|
+
t16 = RegexpToken.new("RegexpToken1100081633",/\(([^\)\\]|\\[\(\)\\]?)*?\)/n)
|
25
26
|
]
|
26
27
|
productions = [
|
27
28
|
p1 = Production.new("Expr'".intern,[:Expr],SyntaxTreeBuilder.new("Expr'",["expr"],[])),
|
@@ -36,9 +37,9 @@ module Rpdf2txt
|
|
36
37
|
p10 = Production.new(:Expr,[t8],LiftingSyntaxTreeBuilder.new(["val"],[])),
|
37
38
|
p11 = Production.new(:Array,[t9, :ArrayElements, t10],SyntaxTreeBuilder.new("Array",["_", "values", "_"],[])),
|
38
39
|
p12 = Production.new(:Array,[t9, t10],SyntaxTreeBuilder.new("Array",["_", "_"],[])),
|
39
|
-
p13 = Production.new(:ArrayElements,[:
|
40
|
-
p14 = Production.new(:
|
41
|
-
p15 = Production.new(:
|
40
|
+
p13 = Production.new(:ArrayElements,[:Plus70010113073940],LiftingSyntaxTreeBuilder.new(["values"],[])),
|
41
|
+
p14 = Production.new(:Plus70010113073940,[:Plus70010113073940, :ArrayElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
|
42
|
+
p15 = Production.new(:Plus70010113073940,[:ArrayElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
|
42
43
|
p16 = Production.new(:ArrayElement,[:Array],LiftingSyntaxTreeBuilder.new(["_"],[])),
|
43
44
|
p17 = Production.new(:ArrayElement,[:Hash],LiftingSyntaxTreeBuilder.new(["_"],[])),
|
44
45
|
p18 = Production.new(:ArrayElement,[t3],LiftingSyntaxTreeBuilder.new(["_"],[])),
|
@@ -47,10 +48,10 @@ module Rpdf2txt
|
|
47
48
|
p21 = Production.new(:ArrayElement,[t5],LiftingSyntaxTreeBuilder.new(["_"],[])),
|
48
49
|
p22 = Production.new(:ArrayElement,[t8],LiftingSyntaxTreeBuilder.new(["_"],[])),
|
49
50
|
p23 = Production.new(:ArrayElement,[:Text],LiftingSyntaxTreeBuilder.new(["_"],[])),
|
50
|
-
p24 = Production.new(:Hash,[t11, :
|
51
|
+
p24 = Production.new(:Hash,[t11, :Mult70010113055620, t12],SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[])),
|
51
52
|
p25 = Production.new(:Hash,[t11, t12],ArrayNodeBuilder.new([],nil,SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[]),1,[],true)),
|
52
|
-
p26 = Production.new(:
|
53
|
-
p27 = Production.new(:
|
53
|
+
p26 = Production.new(:Mult70010113055620,[:Mult70010113055620, t2, :Expr],ArrayNodeBuilder.new([1, 2],0,nil,nil,[],true)),
|
54
|
+
p27 = Production.new(:Mult70010113055620,[t2, :Expr],ArrayNodeBuilder.new([0, 1],nil,nil,nil,[],true)),
|
54
55
|
p28 = Production.new(:Date,[t13, t14, t15],SyntaxTreeBuilder.new("Date",["c1", "regexptoken1015925646", "c3"],[])),
|
55
56
|
p29 = Production.new(:Text,[t16],SyntaxTreeBuilder.new("Text",["text"],[]))
|
56
57
|
]
|
@@ -58,14 +59,14 @@ module Rpdf2txt
|
|
58
59
|
|
59
60
|
]
|
60
61
|
priorities = ProductionPriorities.new(relations)
|
61
|
-
action_table = [[5,
|
62
|
-
goto_hash = {
|
63
|
-
@@
|
62
|
+
action_table = [[5, 16, 9, 128, 13, 1024, 21, 4, 37, 32768, 41, 4096, 49, 8, 53, 256, 57, 2], [32, 2051], [36, 2051], [65, 2048, 69, 2], [24, 2051], [20, 2051], [12, 2051], [8, 2051], [2, 1], [112, 65439], [73, 8192], [28, 2051], [16, 2051], [77, 16, 85, 128, 13, 1024, 93, 512, 97, 4, 37, 32768, 113, 8, 53, 256, 121, 2], [4, 2051], [125, 2048, 129, 2], [96, 65439], [5, 16, 9, 128, 13, 1024, 21, 4, 37, 32768, 41, 4096, 49, 8, 53, 256, 57, 2], [137, 16384], [80, 65438], [77, 16, 85, 128, 13, 1024, 97, 4, 37, 32768, 113, 8, 53, 256, 121, 2, 48, 512], [84, 65438], [88, 65438], [44, 65439], [68, 65438], [64, 65438], [60, 65438], [56, 65438], [76, 65438], [145, 512], [72, 65438], [92, 65439], [5, 16, 9, 128, 13, 1024, 21, 4, 37, 32768, 41, 4096, 49, 8, 53, 256, 57, 2], [104, 2050], [108, 2051], [52, 65438], [40, 65439], [100, 2050]]
|
63
|
+
goto_hash = {0 => {6 => 7, 1 => 8, 2 => 6, 8 => 11, 9 => 4}, 17 => {6 => 7, 1 => 33, 2 => 6, 8 => 11, 9 => 4}, 13 => {5 => 27, 6 => 25, 2 => 26, 3 => 29, 9 => 22, 4 => 20}, 3 => {7 => 15}, 20 => {5 => 35, 6 => 25, 2 => 26, 9 => 22}, 32 => {6 => 7, 1 => 37, 2 => 6, 8 => 11, 9 => 4}}
|
64
|
+
@@parse_table70010113197280 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
|
64
65
|
:REDUCE,
|
65
66
|
:SHIFT,
|
66
67
|
:ACCEPT
|
67
68
|
])
|
68
69
|
# Returns a new GeneralizedLrParser built on the PdfAttributes parse table
# that this generated file stores in @@parse_table70010113197280.
def Rpdf2txt._attr_parser
  table = @@parse_table70010113197280
  GeneralizedLrParser.new(table)
end
|
71
72
|
end
|
@@ -1,12 +1,13 @@
|
|
1
|
+
# encoding: ascii-8bit
|
1
2
|
require 'rpdf2txt-rockit/rockit'
|
2
3
|
module Rpdf2txt
|
3
4
|
# Parser for PdfText
|
4
|
-
# created by Rockit version 0.3.8 on
|
5
|
+
# created by Rockit version 0.3.8 on Tue Jan 18 07:42:19 +0100 2011
|
5
6
|
# Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se
|
6
7
|
# and licensed under GPL
|
7
8
|
# but this parser is under LGPL
|
8
9
|
tokens = [
|
9
|
-
t1 = EofToken.new("EOF",/^(�~~��~^^~
|
10
|
+
t1 = EofToken.new("EOF",/^(�~~��~^^~3062921542)/),
|
10
11
|
t2 = Token.new("NUMERIC",/^(-?(([0-9]*[.,_][0-9]+)|([0-9]+)))/n),
|
11
12
|
t3 = Token.new("SPACE",/^(\s+)/n,:Skip),
|
12
13
|
t4 = Token.new("HEXSNIPPET",/^([0-9A-F]+)/in),
|
@@ -56,9 +57,9 @@ module Rpdf2txt
|
|
56
57
|
productions = [
|
57
58
|
p1 = Production.new("Target'".intern,[:Target],SyntaxTreeBuilder.new("Target'",["target"],[])),
|
58
59
|
p2 = Production.new(:Target,[t10, :Exprs, t11],SyntaxTreeBuilder.new("Target",["_", "values", "_"],[])),
|
59
|
-
p3 = Production.new(:Exprs,[:
|
60
|
-
p4 = Production.new(:
|
61
|
-
p5 = Production.new(:
|
60
|
+
p3 = Production.new(:Exprs,[:Plus70010106411320],LiftingSyntaxTreeBuilder.new(["values"],[])),
|
61
|
+
p4 = Production.new(:Plus70010106411320,[:Plus70010106411320, :Expr],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
|
62
|
+
p5 = Production.new(:Plus70010106411320,[:Expr],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
|
62
63
|
p6 = Production.new(:Expr,[:TmElement],LiftingSyntaxTreeBuilder.new(["val"],[])),
|
63
64
|
p7 = Production.new(:Expr,[:Array],LiftingSyntaxTreeBuilder.new(["val"],[])),
|
64
65
|
p8 = Production.new(:Expr,[:TDElement],LiftingSyntaxTreeBuilder.new(["val"],[])),
|
@@ -90,9 +91,9 @@ module Rpdf2txt
|
|
90
91
|
p34 = Production.new(:HexElement,[t23, t4, t24],LiftingSyntaxTreeBuilder.new(["_", "hex", "_"],[])),
|
91
92
|
p35 = Production.new(:HexElement,[t23, t24],LiftingSyntaxTreeBuilder.new(["_", "_"],[])),
|
92
93
|
p36 = Production.new(:TjHex,[:HexElement, t22],SyntaxTreeBuilder.new("Tjhex",["hexsnippet", "_"],[])),
|
93
|
-
p37 = Production.new(:TJArrayElements,[:
|
94
|
-
p38 = Production.new(:
|
95
|
-
p39 = Production.new(:
|
94
|
+
p37 = Production.new(:TJArrayElements,[:Plus70010106266620],LiftingSyntaxTreeBuilder.new(["values"],[])),
|
95
|
+
p38 = Production.new(:Plus70010106266620,[:Plus70010106266620, :TJSingleElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
|
96
|
+
p39 = Production.new(:Plus70010106266620,[:TJSingleElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
|
96
97
|
p40 = Production.new(:TJSingleElement,[t7],SyntaxTreeBuilder.new("TJSingleElement",["snippet"],[])),
|
97
98
|
p41 = Production.new(:TJSingleElement,[t2],SyntaxTreeBuilder.new("TJSingleElement",["kerning"],[])),
|
98
99
|
p42 = Production.new(:TJSingleElement,[:HexElement],SyntaxTreeBuilder.new("TJSingleElement",["hexsnippet"],[])),
|
@@ -107,10 +108,10 @@ module Rpdf2txt
|
|
107
108
|
p51 = Production.new(:LineWidth,[t2, t33],SyntaxTreeBuilder.new("Width",["width", "_"],[])),
|
108
109
|
p52 = Production.new(:BTElement,[t34],SyntaxTreeBuilder.new("BT",["_"],[])),
|
109
110
|
p53 = Production.new(:ETElement,[t35],SyntaxTreeBuilder.new("ET",["_"],[])),
|
110
|
-
p54 = Production.new(:Hash,[t36, :
|
111
|
+
p54 = Production.new(:Hash,[t36, :Mult70010106193040, t37],SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[])),
|
111
112
|
p55 = Production.new(:Hash,[t36, t37],ArrayNodeBuilder.new([],nil,SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[]),1,[],true)),
|
112
|
-
p56 = Production.new(:
|
113
|
-
p57 = Production.new(:
|
113
|
+
p56 = Production.new(:Mult70010106193040,[:Mult70010106193040, t8, :HashExpr],ArrayNodeBuilder.new([1, 2],0,nil,nil,[],true)),
|
114
|
+
p57 = Production.new(:Mult70010106193040,[t8, :HashExpr],ArrayNodeBuilder.new([0, 1],nil,nil,nil,[],true)),
|
114
115
|
p58 = Production.new(:HashExpr,[t8],LiftingSyntaxTreeBuilder.new(["val"],[])),
|
115
116
|
p59 = Production.new(:HashExpr,[t2],LiftingSyntaxTreeBuilder.new(["val"],[])),
|
116
117
|
p60 = Production.new(:HashExpr,[t9],LiftingSyntaxTreeBuilder.new(["val"],[])),
|
@@ -122,9 +123,9 @@ module Rpdf2txt
|
|
122
123
|
p66 = Production.new(:UElement,[t2, t43],SyntaxTreeBuilder.new("UElement",["c1", "regexptoken-265279295"],[])),
|
123
124
|
p67 = Production.new(:UElement,[t2, t2, t2, t44],SyntaxTreeBuilder.new("UElement",["c1", "numeric2", "numeric3", "c4"],[])),
|
124
125
|
p68 = Production.new(:UElement,[t45, t2, t46],SyntaxTreeBuilder.new("UElement",["c1", "numeric", "c3"],[])),
|
125
|
-
p69 = Production.new(:CNElements,[:
|
126
|
-
p70 = Production.new(:
|
127
|
-
p71 = Production.new(:
|
126
|
+
p69 = Production.new(:CNElements,[:Plus70010106149180],SyntaxTreeBuilder.new("CNElements",["plus"],[])),
|
127
|
+
p70 = Production.new(:Plus70010106149180,[:Plus70010106149180, :CNElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
|
128
|
+
p71 = Production.new(:Plus70010106149180,[:CNElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
|
128
129
|
p72 = Production.new(:CNElement,[t5],SyntaxTreeBuilder.new("CNElement",["c1"],[])),
|
129
130
|
p73 = Production.new(:CNElement,[t19, t5, t5],SyntaxTreeBuilder.new("CNElement",["c1", "alphanumeric", "maybe"],[])),
|
130
131
|
p74 = Production.new(:CNElement,[t19, t5],SyntaxTreeBuilder.new("CNElement",["c1", "alphanumeric", "maybe"],[2]))
|
@@ -133,14 +134,14 @@ module Rpdf2txt
|
|
133
134
|
|
134
135
|
]
|
135
136
|
priorities = ProductionPriorities.new(relations)
|
136
|
-
action_table = [[5, 512], [13,
|
137
|
-
goto_hash = {
|
138
|
-
@@
|
137
|
+
action_table = [[5, 512], [13, 4194304, 33, 17592186044416, 41, 262144, 65, 2, 85, 16, 113, 1099511627776, 125, 64, 137, 67108864, 141, 2048], [2, 1], [153, 8388608, 157, 8], [88, 70342974373338], [80, 70342974373338], [72, 70342974373338], [60, 70342974373338], [161, 2], [165, 2097152], [169, 16], [84, 70342974373338], [52, 70342974373338], [28, 70342974373338], [173, 2199023255552], [36, 70342974373338], [177, 131072, 181, 2, 185, 4294967296, 189, 33554432, 193, 65536, 197, 4398046511104, 201, 536870912, 205, 134217728, 209, 1048576], [24, 70342974373338], [16, 70342974373338], [32, 70342974373338], [13, 4194304, 33, 17592186044416, 41, 262144, 65, 2, 85, 16, 113, 1099511627776, 125, 64, 137, 67108864, 141, 2048, 8, 1024], [284, 2199023517712], [217, 262144, 85, 16, 272, 2199023255552], [92, 70342974373338], [76, 70342974373338], [64, 70342974373338], [40, 70342974373338], [20, 70342974373338], [252, 70342974373338], [68, 70342974373338], [225, 1024], [229, 2097152, 233, 1073741824], [56, 70342974373338], [280, 2199023517712], [176, 70342974373338], [13, 4194304, 241, 16777216, 249, 2, 261, 64], [48, 70342974373338], [44, 70342974373338], [136, 31461450], [265, 8388608], [269, 35184372088832], [140, 70342974373338], [273, 34359738368, 281, 2, 285, 16, 289, 274877906944, 292, 2199023517712], [256, 70342974373338], [116, 70342974373338], [293, 16384, 297, 2, 301, 32768, 305, 64], [200, 70342974373338], [172, 70342974373338], [112, 70342974373338], [260, 70342974373338], [188, 70342974373338], [180, 70342974373338], [124, 70342974373338], [12, 70342974373338], [309, 16], [276, 2199023517712], [4, 1], [128, 70342974373338], [192, 70342974373338], [13, 4194304, 241, 16777216, 249, 2, 261, 64, 144, 4096], [168, 29364298], [164, 29364298], [160, 29364298], [152, 29364298], [317, 4096], [156, 29364298], [132, 31461450], [268, 70342974373338], [325, 68719476736, 329, 128], [333, 549755813888], [337, 524288], [288, 2199023517712], [244, 70342974373338], [104, 
70342974373338], [341, 268435456, 345, 8796093022208, 349, 2], [108, 70342974373338], [353, 2147483648], [285, 16, 292, 2199023517712], [148, 29364298], [96, 70342974373338], [357, 68719476736, 361, 128], [216, 549755813888], [365, 256, 369, 2, 377, 128], [248, 70342974373338], [120, 70342974373338], [184, 70342974373338], [264, 70342974373338], [381, 137438953472, 385, 2], [196, 70342974373338], [212, 549755813888], [365, 256, 369, 2, 377, 128], [236, 68719476864], [232, 68719476864], [224, 68719476864], [228, 68719476864], [240, 70342974373338], [393, 2], [220, 68719476864], [397, 8192], [100, 70342974373338]]
|
138
|
+
goto_hash = {82 => {31 => 93}, 22 => {35 => 55}, 0 => {1 => 2}, 1 => {5 => 17, 11 => 37, 22 => 24, 33 => 14, 6 => 27, 12 => 29, 34 => 22, 23 => 12, 7 => 13, 13 => 36, 35 => 33, 2 => 30, 24 => 5, 19 => 25, 8 => 19, 3 => 20, 25 => 11, 14 => 9, 9 => 15, 20 => 7, 15 => 32, 4 => 18, 26 => 4, 10 => 26, 32 => 23, 21 => 6}, 90 => {31 => 97}, 68 => {30 => 80}, 35 => {16 => 64, 17 => 59, 18 => 63, 14 => 61}, 42 => {29 => 69}, 20 => {5 => 17, 11 => 37, 22 => 24, 33 => 14, 6 => 27, 12 => 29, 34 => 22, 23 => 12, 7 => 13, 13 => 36, 35 => 33, 24 => 5, 19 => 25, 8 => 19, 25 => 11, 14 => 9, 9 => 15, 20 => 7, 15 => 32, 4 => 53, 26 => 4, 10 => 26, 32 => 23, 21 => 6}, 59 => {18 => 78, 14 => 61}}
|
139
|
+
@@parse_table70010113257400 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
|
139
140
|
:REDUCE,
|
140
141
|
:SHIFT,
|
141
142
|
:ACCEPT
|
142
143
|
])
|
143
144
|
# Returns a new GeneralizedLrParser built on the PdfText parse table that
# this generated file stores in @@parse_table70010113257400.
def Rpdf2txt._text_parser
  table = @@parse_table70010113257400
  GeneralizedLrParser.new(table)
end
|
146
147
|
end
|
data/lib/rpdf2txt/object.rb
CHANGED
@@ -20,6 +20,7 @@
|
|
20
20
|
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
21
|
# hwyss@ywesee.com, aschrafl@ywesee.com
|
22
22
|
#
|
23
|
+
# PdfObject -- Rpdf2txt -- 17.05.2011 -- mhatakeyama@ywesee.com
|
23
24
|
# PdfObject -- Rpdf2txt -- 21.11.2002 -- aschrafl@ywesee.com
|
24
25
|
|
25
26
|
require 'zlib'
|
@@ -27,7 +28,7 @@ require 'rpdf2txt/text'
|
|
27
28
|
require 'rpdf2txt/attributesparser'
|
28
29
|
require 'rpdf2txt/cmapparser'
|
29
30
|
require 'rpdf2txt/symbol'
|
30
|
-
require 'md5'
|
31
|
+
require 'digest/md5'
|
31
32
|
require 'matrix'
|
32
33
|
|
33
34
|
module Rpdf2txt
|
@@ -101,13 +102,15 @@ module Rpdf2txt
|
|
101
102
|
ast.values.collect { |child| extract_attributes(child) }
|
102
103
|
elsif(ast.children_names.include?('pairs'))
|
103
104
|
result = {}
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
105
|
+
if(ast_pairs = ast.pairs)
|
106
|
+
ast_pairs.each { |pair|
|
107
|
+
k, v = pair
|
108
|
+
keystr = k.value.strip.tr('/','')
|
109
|
+
unless(keystr.empty?)
|
108
110
|
result.store(keystr.downcase.intern, extract_attributes(v))
|
109
|
-
|
110
|
-
|
111
|
+
end
|
112
|
+
}
|
113
|
+
end
|
111
114
|
result
|
112
115
|
else
|
113
116
|
value = ast
|
@@ -200,9 +203,13 @@ module Rpdf2txt
|
|
200
203
|
end
|
201
204
|
encryption_key = digest[0,keylength]
|
202
205
|
test_key = compute_user_key encryption_key
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
+
## The following check is commented out because import_gkv (de.oddb.org) stops on this error
|
207
|
+
# See http://dev.ywesee.com/wiki.php/Masa/20110209-debug-importGkv-rpdf2txt
|
208
|
+
# Also refer to http://trac.ywesee.com/ticket/74#comment:5
|
209
|
+
#
|
210
|
+
#if(test_key != uk)
|
211
|
+
# raise DecryptionError, "test-key did not match user-key ('#{test_key.inspect}' / '#{uk.inspect}')"
|
212
|
+
#end
|
206
213
|
encryption_key
|
207
214
|
end
|
208
215
|
def file_id= (file_id)
|
@@ -309,7 +316,11 @@ module Rpdf2txt
|
|
309
316
|
end
|
310
317
|
# Returns the width recorded for +char+.
#
# char:: a character code, or a String. A one-character String is replaced
#        by the value of its first byte before the lookup; every other
#        argument is passed on unchanged.
#
# The result is whatever _width(char) returns, or named_width(char) when
# _width yields nil/false.
def width(char)
  if char.is_a?(String) && char.length == 1
    # String#unpack('C') gives the first byte as an Integer on Ruby 1.8 and
    # on Ruby 1.9+, so no RUBY_VERSION switch is needed.
    char = char.unpack('C').first
  end
  _width(char) || named_width(char)
end
|
@@ -442,11 +453,15 @@ module Rpdf2txt
|
|
442
453
|
yield self
|
443
454
|
end
|
444
455
|
# Extracts object ids from reference strings.
#
# array:: an Array (or Array subclass) of strings, or a single value, which
#         is treated as a one-element list.
#
# Returns an Array holding, for each entry, the first run of digits as an
# Integer. Entries without any digit (and nil entries) are dropped.
def extract_oids(array)
  # is_a? rather than an exact class comparison: an Array subclass must be
  # iterated, not wrapped and handed to Regexp#match as a whole.
  array = [array] unless array.is_a?(Array)
  array.collect { |dirty_id|
    match = /\d+/on.match(dirty_id)
    match[0].to_i if match
  }.compact
end
|
451
466
|
def root?
|
452
467
|
!(@parent || @attributes[:parent])
|
@@ -519,6 +534,36 @@ module Rpdf2txt
|
|
519
534
|
parent.media_box
|
520
535
|
end
|
521
536
|
end
|
537
|
+
# Collapses each run of consecutive snippets that compare equal (==) into a
# single snippet.
#
# This is required for PDF files written by pdfFactory 3.25 (Windows Server
# 2003 R2 Standard Edition German): it builds one meaningful snippet out of
# the small snippets whose x, y positions are the same (presumably that is
# what the snippet's == compares -- confirm against the snippet class).
# For more detail, see:
# * http://dev.ywesee.com/wiki.php/Masa/20110516-trace-rpdf2txt
# * http://dev.ywesee.com/wiki.php/Masa/20110517-update-rpdf2txt
#
# text_snippets:: snippets in order; each must respond to #txt, #txt=, #dup
#                 and #==.
#
# Returns a new Array with one entry per run: a dup of the run's last
# snippet whose txt is the concatenation of the txt of every snippet in the
# run. Returns [] for empty input. The snippets passed in are not modified.
def merge_snippets(text_snippets)
  merged = []
  last = nil    # most recent snippet of the run being collected
  buffer = nil  # text accumulated for that run
  # Closes the current run: emit a copy of its last snippet carrying the
  # accumulated text.
  close_run = lambda {
    closed = last.dup
    closed.txt = buffer
    merged << closed
  }
  text_snippets.each do |snip|
    if last && last == snip
      buffer << snip.txt
    else
      close_run.call if last
      # Work on a copy so that appending never alters an input snippet.
      buffer = snip.txt && snip.txt.dup
    end
    last = snip
  end
  # The final run is still open after the loop; it has to be closed with its
  # accumulated text as well.
  close_run.call if last
  merged
end
|
522
567
|
def text(callback_handler)
|
523
568
|
concat_stream = Stream.new('')
|
524
569
|
if(@contents.size == 1 && @contents.first.is_a?(ReferenceArray))
|
@@ -530,6 +575,7 @@ module Rpdf2txt
|
|
530
575
|
end
|
531
576
|
@text_state.media_box = self.media_box
|
532
577
|
text_snippets = concat_stream.extract_text_objects(self, @text_state)
|
578
|
+
text_snippets = merge_snippets(text_snippets)
|
533
579
|
join_snippets(text_snippets, callback_handler)
|
534
580
|
end
|
535
581
|
private
|
@@ -756,7 +802,16 @@ module Rpdf2txt
|
|
756
802
|
result
|
757
803
|
end
|
758
804
|
# Returns the raw stream data found in @src, cached in @raw_stream.
#
# The data is the text between the first "stream" keyword that is followed
# by one or two CR/LF characters and the last "endstream" (the capture is
# greedy). Returns "" when @src contains no such section.
def raw_stream
  # String#[regexp, 1] is the first match's capture, or nil without a match;
  # nil.to_s is "". (Calling to_s on an empty scan result would yield the
  # literal "[]" on Ruby 1.9+.)
  @raw_stream ||= @src[/stream[\r\n]{1,2}(.*)endstream/mn, 1].to_s
end
|
761
816
|
def decode_raw_stream
|
762
817
|
@decrypted_stream = raw_stream
|
data/lib/rpdf2txt/parser.rb
CHANGED
@@ -20,15 +20,16 @@
|
|
20
20
|
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
21
|
# hwyss@ywesee.com, aschrafl@ywesee.com
|
22
22
|
#
|
23
|
+
# PdfParser -- Rpdf2txt-- 05.01.2012 -- mhatakeyama@ywesee.com
|
23
24
|
# PdfParser -- Rpdf2txt-- 14.11.2002 -- aschrafl@ywesee.com
|
24
25
|
|
25
26
|
require 'zlib'
|
26
27
|
require 'rpdf2txt/object'
|
27
28
|
require 'rpdf2txt/default_handler'
|
28
|
-
require 'md5'
|
29
|
+
require 'digest/md5'
|
29
30
|
|
30
31
|
module Rpdf2txt
|
31
|
-
VERSION = '0.8.
|
32
|
+
VERSION = '0.8.3'
|
32
33
|
class Parser
|
33
34
|
attr_accessor :encrypt
|
34
35
|
def initialize(pdf_stream, target_encoding='utf8')
|
@@ -128,6 +129,9 @@ module Rpdf2txt
|
|
128
129
|
startobj=0
|
129
130
|
endobj=0
|
130
131
|
catalogue = {}
|
132
|
+
if RUBY_VERSION >= '1.9'
|
133
|
+
@src.force_encoding('ascii-8bit')
|
134
|
+
end
|
131
135
|
@src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |match|
|
132
136
|
obj = build_object(match.to_s)
|
133
137
|
catalogue.store(obj.oid, obj)
|
data/lib/rpdf2txt/text.rb
CHANGED
@@ -64,7 +64,6 @@ module Rpdf2txt
|
|
64
64
|
elsif((map = @current_font.to_unicode) \
|
65
65
|
&& (utf8 = map.to_utf8(ascii)))
|
66
66
|
@current_font.attributes[:encoding] = '/UTF8'
|
67
|
-
#@text_state.set_font(@current_font)
|
68
67
|
[utf8].pack('U')
|
69
68
|
end
|
70
69
|
end
|
@@ -165,7 +164,7 @@ module Rpdf2txt
|
|
165
164
|
snippet_text = snippet[1..-2].gsub(/\\[nrt]/n, " ")
|
166
165
|
snippet_text.gsub!(/\\([()])/n, '\1')
|
167
166
|
snippet_text.gsub!(/./n) { |char|
|
168
|
-
self.mapped_ascii(char[0]) || char
|
167
|
+
self.mapped_ascii(char.unpack('C*')[0]) || char
|
169
168
|
}
|
170
169
|
_snip(snippet_text)
|
171
170
|
end
|
data/lib/rpdf2txt/text_state.rb
CHANGED
@@ -20,6 +20,7 @@
|
|
20
20
|
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
21
|
# hwyss@ywesee.com, aschrafl@ywesee.com
|
22
22
|
#
|
23
|
+
# TextState -- Rpdf2txt -- 05.01.2012 -- mhatakeyama@ywesee.com
|
23
24
|
# TextState -- Rpdf2txt -- 29.11.2002 -- aschrafl@ywesee.com
|
24
25
|
|
25
26
|
module Rpdf2txt
|
@@ -124,7 +125,8 @@ module Rpdf2txt
|
|
124
125
|
end
|
125
126
|
def char_width(char)
|
126
127
|
if(char.is_a? String)
|
127
|
-
char = char[0]
|
128
|
+
#char = char[0]
|
129
|
+
char = char.unpack('C*')[0]
|
128
130
|
end
|
129
131
|
w = 0.0
|
130
132
|
if(@font && (width = @font.width(char)))
|
@@ -306,7 +308,10 @@ module Rpdf2txt
|
|
306
308
|
@boxwidth += char_width(char)
|
307
309
|
end
|
308
310
|
@w = @boxwidth
|
309
|
-
if
|
311
|
+
if RUBY_VERSION >= '1.9'
|
312
|
+
txt.force_encoding('ascii-8bit')
|
313
|
+
end
|
314
|
+
if white = txt[/\s+$/n]
|
310
315
|
white.each_byte do |char|
|
311
316
|
@w += char_width(char)
|
312
317
|
end
|
@@ -329,6 +334,9 @@ module Rpdf2txt
|
|
329
334
|
@tmy -= y_val.to_f
|
330
335
|
end
|
331
336
|
# Replaces every backslash + three-digit escape (e.g. \101) in +txt+ with
# the single byte it denotes, modifying +txt+ in place.
#
# On Ruby 1.9+ the string is first tagged as ascii-8bit so the substitution
# works on raw bytes.
#
# Returns +txt+ when at least one escape was replaced, nil otherwise (the
# String#gsub! contract).
def unescape_txt!(txt)
  txt.force_encoding('ascii-8bit') if RUBY_VERSION >= '1.9'
  # Keep only the low eight bits: escapes above \377 would otherwise make
  # Integer#chr raise RangeError, and the PDF specification says high-order
  # overflow in an octal escape is ignored.
  txt.gsub!(/\\([0-9]{3})/n) { ($1.oct & 0xFF).chr }
end
|
334
342
|
protected
|
@@ -51,7 +51,7 @@ module Parse
|
|
51
51
|
}
|
52
52
|
File.open(outputFile, "w") do |f|
|
53
53
|
time_and_puts("Writing parser to file #{outputFile}") {
|
54
|
-
f.write "
|
54
|
+
f.write "# encoding: ascii-8bit\nrequire 'rpdf2txt-rockit/rockit'\n" +
|
55
55
|
parser.to_src_in_module(parserName, moduleName)
|
56
56
|
}
|
57
57
|
end
|