rpdf2txt 0.8.2 → 0.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.txt +4 -0
- data/Manifest.txt +0 -4
- data/README.txt +16 -3
- data/bin/rpdf2txt +4 -1
- data/lib/rpdf2txt/data/cmap.rb +10 -9
- data/lib/rpdf2txt/data/cmap_range.rb +13 -12
- data/lib/rpdf2txt/data/pdfattributes.rb +14 -13
- data/lib/rpdf2txt/data/pdftext.rb +19 -18
- data/lib/rpdf2txt/object.rb +68 -13
- data/lib/rpdf2txt/parser.rb +6 -2
- data/lib/rpdf2txt/text.rb +1 -2
- data/lib/rpdf2txt/text_state.rb +10 -2
- data/lib/rpdf2txt-rockit/rockit.rb +1 -1
- data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +1 -0
- data/lib/rpdf2txt-rockit/token.rb +1 -0
- data/test/mock.rb +19 -11
- data/test/test_object.rb +33 -0
- data/test/test_pdf_object.rb +25 -24
- data/test/test_pdf_parser.rb +8 -5
- data/test/test_pdf_text.rb +11 -10
- data/test/test_space_bug_05_2004.rb +2 -1
- data/test/test_stream.rb +6 -5
- data/test/test_text_state.rb +220 -219
- metadata +13 -14
- data/config.save +0 -12
- data/lib/rpdf2txt/data/_cmap.grammar +0 -11
- data/lib/rpdf2txt/data/_cmap_range.grammar +0 -15
- data/lib/rpdf2txt/data/_pdfattributes.grammar +0 -32
data/.gemtest
ADDED
File without changes
|
data/History.txt
CHANGED
data/Manifest.txt
CHANGED
@@ -4,7 +4,6 @@ Manifest.txt
|
|
4
4
|
README.txt
|
5
5
|
Rakefile
|
6
6
|
bin/rpdf2txt
|
7
|
-
config.save
|
8
7
|
install.rb
|
9
8
|
lib/rpdf2txt-rockit/base_extensions.rb
|
10
9
|
lib/rpdf2txt-rockit/bootstrap.rb
|
@@ -36,9 +35,6 @@ lib/rpdf2txt/data/cmap.grammar
|
|
36
35
|
lib/rpdf2txt/data/cmap.rb
|
37
36
|
lib/rpdf2txt/data/cmap_range.grammar
|
38
37
|
lib/rpdf2txt/data/cmap_range.rb
|
39
|
-
lib/rpdf2txt/data/_cmap.grammar
|
40
|
-
lib/rpdf2txt/data/_cmap_range.grammar
|
41
|
-
lib/rpdf2txt/data/_pdfattributes.grammar
|
42
38
|
lib/rpdf2txt/data/fonts/Courier-Bold.afm
|
43
39
|
lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm
|
44
40
|
lib/rpdf2txt/data/fonts/Courier-Oblique.afm
|
data/README.txt
CHANGED
@@ -12,7 +12,20 @@ Not the problems here.
|
|
12
12
|
|
13
13
|
== REQUIREMENTS:
|
14
14
|
|
15
|
-
* Ruby 1.8
|
15
|
+
* Ruby 1.8 or Ruby 1.9.3
|
16
|
+
|
17
|
+
NOTE for Ruby 1.9.3
|
18
|
+
|
19
|
+
* rpdf2txt on Ruby 1.8.6 creates the Parser-Script from the grammar files
|
20
|
+
(the ones without a leading underscore in their file name). Ruby 1.9.3 fails to create the Parser-Script. The work
|
21
|
+
around is to copy the grammar files to _grammar_file and then rpdf2txt (with
|
22
|
+
Ruby 1.8.6 and 1.9.3) stops creating a new Parser-Script and just uses the
|
23
|
+
existing one. Ergo: Ruby 1.9.3 just uses the parser script generated with
|
24
|
+
Ruby 1.8.6.
|
25
|
+
|
26
|
+
* If you change the grammar files then you have to create the
|
27
|
+
Parser-Script again with Ruby 1.8.6. The grammar file without the leading underscore
|
28
|
+
has to be changed.
|
16
29
|
|
17
30
|
== INSTALL:
|
18
31
|
|
@@ -23,8 +36,8 @@ Not the problems here.
|
|
23
36
|
|
24
37
|
== DEVELOPERS:
|
25
38
|
|
26
|
-
Masaomi Hatakeyama, mhatakeyama@ywesee.com
|
27
|
-
Zeno R.R. Davatz, zdavatz@ywesee.com
|
39
|
+
* Masaomi Hatakeyama, mhatakeyama@ywesee.com
|
40
|
+
* Zeno R.R. Davatz, zdavatz@ywesee.com
|
28
41
|
|
29
42
|
== LICENSE:
|
30
43
|
* GPLv2
|
data/bin/rpdf2txt
CHANGED
@@ -49,7 +49,10 @@ if <output-file> is omitted, the extracted text is written to stdout
|
|
49
49
|
EOS
|
50
50
|
exit
|
51
51
|
end
|
52
|
-
|
52
|
+
stream = open(ARGV[0], 'rb') do |file|
|
53
|
+
file.read
|
54
|
+
end
|
55
|
+
parser = Rpdf2txt::Parser.new(stream, 'utf-8')
|
53
56
|
outstream = STDOUT
|
54
57
|
if(ARGV.size == 2)
|
55
58
|
outstream = File.open(ARGV[1], 'w')
|
data/lib/rpdf2txt/data/cmap.rb
CHANGED
@@ -1,12 +1,13 @@
|
|
1
|
+
# encoding: ascii-8bit
|
1
2
|
require 'rpdf2txt-rockit/rockit'
|
2
3
|
module Rpdf2txt
|
3
4
|
# Parser for CMap
|
4
|
-
# created by Rockit version 0.3.8 on Tue
|
5
|
+
# created by Rockit version 0.3.8 on Tue Jan 18 11:22:39 +0100 2011
|
5
6
|
# Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se
|
6
7
|
# and licensed under GPL
|
7
8
|
# but this parser is under LGPL
|
8
9
|
tokens = [
|
9
|
-
t1 = EofToken.new("EOF",/^(�~~��~^^~
|
10
|
+
t1 = EofToken.new("EOF",/^(�~~��~^^~195355369)/),
|
10
11
|
t2 = Token.new("SPACE",/^(\s+)/n,:Skip),
|
11
12
|
t3 = Token.new("HEXSNIPPET",/^([0-9A-F]+)/in),
|
12
13
|
t4 = StringToken.new("StrToken61","<"),
|
@@ -14,9 +15,9 @@ module Rpdf2txt
|
|
14
15
|
]
|
15
16
|
productions = [
|
16
17
|
p1 = Production.new("HexArray'".intern,[:HexArray],SyntaxTreeBuilder.new("HexArray'",["hexarray"],[])),
|
17
|
-
p2 = Production.new(:HexArray,[:
|
18
|
-
p3 = Production.new(:
|
19
|
-
p4 = Production.new(:
|
18
|
+
p2 = Production.new(:HexArray,[:Plus69966636668160],LiftingSyntaxTreeBuilder.new(["values"],[])),
|
19
|
+
p3 = Production.new(:Plus69966636668160,[:Plus69966636668160, :RangeDef],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
|
20
|
+
p4 = Production.new(:Plus69966636668160,[:RangeDef],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
|
20
21
|
p5 = Production.new(:RangeDef,[:HexElement, :HexElement],SyntaxTreeBuilder.new("RangeDef",["source", "target"],[])),
|
21
22
|
p6 = Production.new(:HexElement,[t4, t3, t5],LiftingSyntaxTreeBuilder.new(["_", "hexsnip", "_"],[]))
|
22
23
|
]
|
@@ -24,14 +25,14 @@ module Rpdf2txt
|
|
24
25
|
|
25
26
|
]
|
26
27
|
priorities = ProductionPriorities.new(relations)
|
27
|
-
action_table = [[
|
28
|
-
goto_hash = {
|
29
|
-
@@
|
28
|
+
action_table = [[17, 8], [17, 8], [12, 29], [2, 1], [29, 4], [17, 8, 4, 1], [16, 29], [37, 16], [8, 29], [20, 29]]
|
29
|
+
goto_hash = {5 => {3 => 8, 4 => 1}, 0 => {1 => 3, 2 => 5, 3 => 2, 4 => 1}, 1 => {4 => 6}}
|
30
|
+
@@parse_table69966636607980 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
|
30
31
|
:REDUCE,
|
31
32
|
:SHIFT,
|
32
33
|
:ACCEPT
|
33
34
|
])
|
34
35
|
def Rpdf2txt._cmap_parser
|
35
|
-
GeneralizedLrParser.new(@@
|
36
|
+
GeneralizedLrParser.new(@@parse_table69966636607980)
|
36
37
|
end
|
37
38
|
end
|
@@ -1,12 +1,13 @@
|
|
1
|
+
# encoding: ascii-8bit
|
1
2
|
require 'rpdf2txt-rockit/rockit'
|
2
3
|
module Rpdf2txt
|
3
4
|
# Parser for CMap
|
4
|
-
# created by Rockit version 0.3.8 on Tue
|
5
|
+
# created by Rockit version 0.3.8 on Tue Jan 18 11:29:28 +0100 2011
|
5
6
|
# Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se
|
6
7
|
# and licensed under GPL
|
7
8
|
# but this parser is under LGPL
|
8
9
|
tokens = [
|
9
|
-
t1 = EofToken.new("EOF",/^(�~~��~^^~
|
10
|
+
t1 = EofToken.new("EOF",/^(�~~��~^^~1397495140)/),
|
10
11
|
t2 = Token.new("SPACE",/^(\s+)/n,:Skip),
|
11
12
|
t3 = Token.new("HEXSNIPPET",/^([0-9A-F]+)/in),
|
12
13
|
t4 = StringToken.new("StrToken93","["),
|
@@ -16,28 +17,28 @@ module Rpdf2txt
|
|
16
17
|
]
|
17
18
|
productions = [
|
18
19
|
p1 = Production.new("HexArray'".intern,[:HexArray],SyntaxTreeBuilder.new("HexArray'",["hexarray"],[])),
|
19
|
-
p2 = Production.new(:HexArray,[:
|
20
|
-
p3 = Production.new(:
|
21
|
-
p4 = Production.new(:
|
20
|
+
p2 = Production.new(:HexArray,[:Plus69948349851480],LiftingSyntaxTreeBuilder.new(["values"],[])),
|
21
|
+
p3 = Production.new(:Plus69948349851480,[:Plus69948349851480, :RangeDef],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
|
22
|
+
p4 = Production.new(:Plus69948349851480,[:RangeDef],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
|
22
23
|
p5 = Production.new(:RangeDef,[:HexElement, :HexElement, :HexElement],SyntaxTreeBuilder.new("RangeDef",["start", "stop", "offset"],[])),
|
23
24
|
p6 = Production.new(:RangeDef,[:HexElement, :HexElement, :Explicit],SyntaxTreeBuilder.new("RangeDef",["start", "stop", "explicit"],[])),
|
24
|
-
p7 = Production.new(:Explicit,[t4, :
|
25
|
-
p8 = Production.new(:
|
26
|
-
p9 = Production.new(:
|
25
|
+
p7 = Production.new(:Explicit,[t4, :Plus69948349837620, t5],LiftingSyntaxTreeBuilder.new(["_", "explicit", "_"],[])),
|
26
|
+
p8 = Production.new(:Plus69948349837620,[:Plus69948349837620, :HexElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
|
27
|
+
p9 = Production.new(:Plus69948349837620,[:HexElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
|
27
28
|
p10 = Production.new(:HexElement,[t6, t3, t7],LiftingSyntaxTreeBuilder.new(["_", "hexsnip", "_"],[]))
|
28
29
|
]
|
29
30
|
relations = [
|
30
31
|
|
31
32
|
]
|
32
33
|
priorities = ProductionPriorities.new(relations)
|
33
|
-
action_table = [[
|
34
|
-
goto_hash = {
|
35
|
-
@@
|
34
|
+
action_table = [[5, 32], [25, 4], [5, 32], [12, 125], [2, 1], [5, 32, 4, 1], [37, 64], [5, 32, 49, 8], [8, 125], [36, 125], [16, 125], [20, 125], [5, 32], [32, 116], [5, 32, 61, 16], [24, 125], [28, 116]]
|
35
|
+
goto_hash = {5 => {6 => 2, 3 => 8}, 0 => {6 => 2, 1 => 4, 2 => 5, 3 => 3}, 12 => {5 => 14, 6 => 13}, 7 => {6 => 10, 4 => 11}, 2 => {6 => 7}, 14 => {6 => 16}}
|
36
|
+
@@parse_table69948349751360 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
|
36
37
|
:REDUCE,
|
37
38
|
:SHIFT,
|
38
39
|
:ACCEPT
|
39
40
|
])
|
40
41
|
def Rpdf2txt._cmap_range_parser
|
41
|
-
GeneralizedLrParser.new(@@
|
42
|
+
GeneralizedLrParser.new(@@parse_table69948349751360)
|
42
43
|
end
|
43
44
|
end
|
@@ -1,12 +1,13 @@
|
|
1
|
+
# encoding: ascii-8bit
|
1
2
|
require 'rpdf2txt-rockit/rockit'
|
2
3
|
module Rpdf2txt
|
3
4
|
# Parser for PdfAttributes
|
4
|
-
# created by Rockit version 0.3.8 on Tue
|
5
|
+
# created by Rockit version 0.3.8 on Tue Jan 18 07:42:18 +0100 2011
|
5
6
|
# Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se
|
6
7
|
# and licensed under GPL
|
7
8
|
# but this parser is under LGPL
|
8
9
|
tokens = [
|
9
|
-
t1 = EofToken.new("EOF",/^(�~~��~^^~
|
10
|
+
t1 = EofToken.new("EOF",/^(�~~��~^^~2411366330)/),
|
10
11
|
t2 = Token.new("IDENTIFIER",/^\/[\w:+\#\-\.]*(, ?[\w:+\#\-\.]*)*/n),
|
11
12
|
t3 = Token.new("NUMERIC",/^(-?[0-9]+([.,][0-9]+)?)/n),
|
12
13
|
t4 = Token.new("REFERENCE",/^([0-9]+\s+[0-9]+\s+R)/n),
|
@@ -21,7 +22,7 @@ module Rpdf2txt
|
|
21
22
|
t13 = StringToken.new("StrToken345387270","(D:"),
|
22
23
|
t14 = RegexpToken.new("RegexpToken1015925646",/[\d+']+/n),
|
23
24
|
t15 = StringToken.new("StrToken42",")"),
|
24
|
-
t16 = RegexpToken.new("
|
25
|
+
t16 = RegexpToken.new("RegexpToken1100081633",/\(([^\)\\]|\\[\(\)\\]?)*?\)/n)
|
25
26
|
]
|
26
27
|
productions = [
|
27
28
|
p1 = Production.new("Expr'".intern,[:Expr],SyntaxTreeBuilder.new("Expr'",["expr"],[])),
|
@@ -36,9 +37,9 @@ module Rpdf2txt
|
|
36
37
|
p10 = Production.new(:Expr,[t8],LiftingSyntaxTreeBuilder.new(["val"],[])),
|
37
38
|
p11 = Production.new(:Array,[t9, :ArrayElements, t10],SyntaxTreeBuilder.new("Array",["_", "values", "_"],[])),
|
38
39
|
p12 = Production.new(:Array,[t9, t10],SyntaxTreeBuilder.new("Array",["_", "_"],[])),
|
39
|
-
p13 = Production.new(:ArrayElements,[:
|
40
|
-
p14 = Production.new(:
|
41
|
-
p15 = Production.new(:
|
40
|
+
p13 = Production.new(:ArrayElements,[:Plus70010113073940],LiftingSyntaxTreeBuilder.new(["values"],[])),
|
41
|
+
p14 = Production.new(:Plus70010113073940,[:Plus70010113073940, :ArrayElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
|
42
|
+
p15 = Production.new(:Plus70010113073940,[:ArrayElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
|
42
43
|
p16 = Production.new(:ArrayElement,[:Array],LiftingSyntaxTreeBuilder.new(["_"],[])),
|
43
44
|
p17 = Production.new(:ArrayElement,[:Hash],LiftingSyntaxTreeBuilder.new(["_"],[])),
|
44
45
|
p18 = Production.new(:ArrayElement,[t3],LiftingSyntaxTreeBuilder.new(["_"],[])),
|
@@ -47,10 +48,10 @@ module Rpdf2txt
|
|
47
48
|
p21 = Production.new(:ArrayElement,[t5],LiftingSyntaxTreeBuilder.new(["_"],[])),
|
48
49
|
p22 = Production.new(:ArrayElement,[t8],LiftingSyntaxTreeBuilder.new(["_"],[])),
|
49
50
|
p23 = Production.new(:ArrayElement,[:Text],LiftingSyntaxTreeBuilder.new(["_"],[])),
|
50
|
-
p24 = Production.new(:Hash,[t11, :
|
51
|
+
p24 = Production.new(:Hash,[t11, :Mult70010113055620, t12],SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[])),
|
51
52
|
p25 = Production.new(:Hash,[t11, t12],ArrayNodeBuilder.new([],nil,SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[]),1,[],true)),
|
52
|
-
p26 = Production.new(:
|
53
|
-
p27 = Production.new(:
|
53
|
+
p26 = Production.new(:Mult70010113055620,[:Mult70010113055620, t2, :Expr],ArrayNodeBuilder.new([1, 2],0,nil,nil,[],true)),
|
54
|
+
p27 = Production.new(:Mult70010113055620,[t2, :Expr],ArrayNodeBuilder.new([0, 1],nil,nil,nil,[],true)),
|
54
55
|
p28 = Production.new(:Date,[t13, t14, t15],SyntaxTreeBuilder.new("Date",["c1", "regexptoken1015925646", "c3"],[])),
|
55
56
|
p29 = Production.new(:Text,[t16],SyntaxTreeBuilder.new("Text",["text"],[]))
|
56
57
|
]
|
@@ -58,14 +59,14 @@ module Rpdf2txt
|
|
58
59
|
|
59
60
|
]
|
60
61
|
priorities = ProductionPriorities.new(relations)
|
61
|
-
action_table = [[5,
|
62
|
-
goto_hash = {
|
63
|
-
@@
|
62
|
+
action_table = [[5, 16, 9, 128, 13, 1024, 21, 4, 37, 32768, 41, 4096, 49, 8, 53, 256, 57, 2], [32, 2051], [36, 2051], [65, 2048, 69, 2], [24, 2051], [20, 2051], [12, 2051], [8, 2051], [2, 1], [112, 65439], [73, 8192], [28, 2051], [16, 2051], [77, 16, 85, 128, 13, 1024, 93, 512, 97, 4, 37, 32768, 113, 8, 53, 256, 121, 2], [4, 2051], [125, 2048, 129, 2], [96, 65439], [5, 16, 9, 128, 13, 1024, 21, 4, 37, 32768, 41, 4096, 49, 8, 53, 256, 57, 2], [137, 16384], [80, 65438], [77, 16, 85, 128, 13, 1024, 97, 4, 37, 32768, 113, 8, 53, 256, 121, 2, 48, 512], [84, 65438], [88, 65438], [44, 65439], [68, 65438], [64, 65438], [60, 65438], [56, 65438], [76, 65438], [145, 512], [72, 65438], [92, 65439], [5, 16, 9, 128, 13, 1024, 21, 4, 37, 32768, 41, 4096, 49, 8, 53, 256, 57, 2], [104, 2050], [108, 2051], [52, 65438], [40, 65439], [100, 2050]]
|
63
|
+
goto_hash = {0 => {6 => 7, 1 => 8, 2 => 6, 8 => 11, 9 => 4}, 17 => {6 => 7, 1 => 33, 2 => 6, 8 => 11, 9 => 4}, 13 => {5 => 27, 6 => 25, 2 => 26, 3 => 29, 9 => 22, 4 => 20}, 3 => {7 => 15}, 20 => {5 => 35, 6 => 25, 2 => 26, 9 => 22}, 32 => {6 => 7, 1 => 37, 2 => 6, 8 => 11, 9 => 4}}
|
64
|
+
@@parse_table70010113197280 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
|
64
65
|
:REDUCE,
|
65
66
|
:SHIFT,
|
66
67
|
:ACCEPT
|
67
68
|
])
|
68
69
|
def Rpdf2txt._attr_parser
|
69
|
-
GeneralizedLrParser.new(@@
|
70
|
+
GeneralizedLrParser.new(@@parse_table70010113197280)
|
70
71
|
end
|
71
72
|
end
|
@@ -1,12 +1,13 @@
|
|
1
|
+
# encoding: ascii-8bit
|
1
2
|
require 'rpdf2txt-rockit/rockit'
|
2
3
|
module Rpdf2txt
|
3
4
|
# Parser for PdfText
|
4
|
-
# created by Rockit version 0.3.8 on
|
5
|
+
# created by Rockit version 0.3.8 on Tue Jan 18 07:42:19 +0100 2011
|
5
6
|
# Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se
|
6
7
|
# and licensed under GPL
|
7
8
|
# but this parser is under LGPL
|
8
9
|
tokens = [
|
9
|
-
t1 = EofToken.new("EOF",/^(�~~��~^^~
|
10
|
+
t1 = EofToken.new("EOF",/^(�~~��~^^~3062921542)/),
|
10
11
|
t2 = Token.new("NUMERIC",/^(-?(([0-9]*[.,_][0-9]+)|([0-9]+)))/n),
|
11
12
|
t3 = Token.new("SPACE",/^(\s+)/n,:Skip),
|
12
13
|
t4 = Token.new("HEXSNIPPET",/^([0-9A-F]+)/in),
|
@@ -56,9 +57,9 @@ module Rpdf2txt
|
|
56
57
|
productions = [
|
57
58
|
p1 = Production.new("Target'".intern,[:Target],SyntaxTreeBuilder.new("Target'",["target"],[])),
|
58
59
|
p2 = Production.new(:Target,[t10, :Exprs, t11],SyntaxTreeBuilder.new("Target",["_", "values", "_"],[])),
|
59
|
-
p3 = Production.new(:Exprs,[:
|
60
|
-
p4 = Production.new(:
|
61
|
-
p5 = Production.new(:
|
60
|
+
p3 = Production.new(:Exprs,[:Plus70010106411320],LiftingSyntaxTreeBuilder.new(["values"],[])),
|
61
|
+
p4 = Production.new(:Plus70010106411320,[:Plus70010106411320, :Expr],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
|
62
|
+
p5 = Production.new(:Plus70010106411320,[:Expr],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
|
62
63
|
p6 = Production.new(:Expr,[:TmElement],LiftingSyntaxTreeBuilder.new(["val"],[])),
|
63
64
|
p7 = Production.new(:Expr,[:Array],LiftingSyntaxTreeBuilder.new(["val"],[])),
|
64
65
|
p8 = Production.new(:Expr,[:TDElement],LiftingSyntaxTreeBuilder.new(["val"],[])),
|
@@ -90,9 +91,9 @@ module Rpdf2txt
|
|
90
91
|
p34 = Production.new(:HexElement,[t23, t4, t24],LiftingSyntaxTreeBuilder.new(["_", "hex", "_"],[])),
|
91
92
|
p35 = Production.new(:HexElement,[t23, t24],LiftingSyntaxTreeBuilder.new(["_", "_"],[])),
|
92
93
|
p36 = Production.new(:TjHex,[:HexElement, t22],SyntaxTreeBuilder.new("Tjhex",["hexsnippet", "_"],[])),
|
93
|
-
p37 = Production.new(:TJArrayElements,[:
|
94
|
-
p38 = Production.new(:
|
95
|
-
p39 = Production.new(:
|
94
|
+
p37 = Production.new(:TJArrayElements,[:Plus70010106266620],LiftingSyntaxTreeBuilder.new(["values"],[])),
|
95
|
+
p38 = Production.new(:Plus70010106266620,[:Plus70010106266620, :TJSingleElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
|
96
|
+
p39 = Production.new(:Plus70010106266620,[:TJSingleElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
|
96
97
|
p40 = Production.new(:TJSingleElement,[t7],SyntaxTreeBuilder.new("TJSingleElement",["snippet"],[])),
|
97
98
|
p41 = Production.new(:TJSingleElement,[t2],SyntaxTreeBuilder.new("TJSingleElement",["kerning"],[])),
|
98
99
|
p42 = Production.new(:TJSingleElement,[:HexElement],SyntaxTreeBuilder.new("TJSingleElement",["hexsnippet"],[])),
|
@@ -107,10 +108,10 @@ module Rpdf2txt
|
|
107
108
|
p51 = Production.new(:LineWidth,[t2, t33],SyntaxTreeBuilder.new("Width",["width", "_"],[])),
|
108
109
|
p52 = Production.new(:BTElement,[t34],SyntaxTreeBuilder.new("BT",["_"],[])),
|
109
110
|
p53 = Production.new(:ETElement,[t35],SyntaxTreeBuilder.new("ET",["_"],[])),
|
110
|
-
p54 = Production.new(:Hash,[t36, :
|
111
|
+
p54 = Production.new(:Hash,[t36, :Mult70010106193040, t37],SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[])),
|
111
112
|
p55 = Production.new(:Hash,[t36, t37],ArrayNodeBuilder.new([],nil,SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[]),1,[],true)),
|
112
|
-
p56 = Production.new(:
|
113
|
-
p57 = Production.new(:
|
113
|
+
p56 = Production.new(:Mult70010106193040,[:Mult70010106193040, t8, :HashExpr],ArrayNodeBuilder.new([1, 2],0,nil,nil,[],true)),
|
114
|
+
p57 = Production.new(:Mult70010106193040,[t8, :HashExpr],ArrayNodeBuilder.new([0, 1],nil,nil,nil,[],true)),
|
114
115
|
p58 = Production.new(:HashExpr,[t8],LiftingSyntaxTreeBuilder.new(["val"],[])),
|
115
116
|
p59 = Production.new(:HashExpr,[t2],LiftingSyntaxTreeBuilder.new(["val"],[])),
|
116
117
|
p60 = Production.new(:HashExpr,[t9],LiftingSyntaxTreeBuilder.new(["val"],[])),
|
@@ -122,9 +123,9 @@ module Rpdf2txt
|
|
122
123
|
p66 = Production.new(:UElement,[t2, t43],SyntaxTreeBuilder.new("UElement",["c1", "regexptoken-265279295"],[])),
|
123
124
|
p67 = Production.new(:UElement,[t2, t2, t2, t44],SyntaxTreeBuilder.new("UElement",["c1", "numeric2", "numeric3", "c4"],[])),
|
124
125
|
p68 = Production.new(:UElement,[t45, t2, t46],SyntaxTreeBuilder.new("UElement",["c1", "numeric", "c3"],[])),
|
125
|
-
p69 = Production.new(:CNElements,[:
|
126
|
-
p70 = Production.new(:
|
127
|
-
p71 = Production.new(:
|
126
|
+
p69 = Production.new(:CNElements,[:Plus70010106149180],SyntaxTreeBuilder.new("CNElements",["plus"],[])),
|
127
|
+
p70 = Production.new(:Plus70010106149180,[:Plus70010106149180, :CNElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)),
|
128
|
+
p71 = Production.new(:Plus70010106149180,[:CNElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)),
|
128
129
|
p72 = Production.new(:CNElement,[t5],SyntaxTreeBuilder.new("CNElement",["c1"],[])),
|
129
130
|
p73 = Production.new(:CNElement,[t19, t5, t5],SyntaxTreeBuilder.new("CNElement",["c1", "alphanumeric", "maybe"],[])),
|
130
131
|
p74 = Production.new(:CNElement,[t19, t5],SyntaxTreeBuilder.new("CNElement",["c1", "alphanumeric", "maybe"],[2]))
|
@@ -133,14 +134,14 @@ module Rpdf2txt
|
|
133
134
|
|
134
135
|
]
|
135
136
|
priorities = ProductionPriorities.new(relations)
|
136
|
-
action_table = [[5, 512], [13,
|
137
|
-
goto_hash = {
|
138
|
-
@@
|
137
|
+
action_table = [[5, 512], [13, 4194304, 33, 17592186044416, 41, 262144, 65, 2, 85, 16, 113, 1099511627776, 125, 64, 137, 67108864, 141, 2048], [2, 1], [153, 8388608, 157, 8], [88, 70342974373338], [80, 70342974373338], [72, 70342974373338], [60, 70342974373338], [161, 2], [165, 2097152], [169, 16], [84, 70342974373338], [52, 70342974373338], [28, 70342974373338], [173, 2199023255552], [36, 70342974373338], [177, 131072, 181, 2, 185, 4294967296, 189, 33554432, 193, 65536, 197, 4398046511104, 201, 536870912, 205, 134217728, 209, 1048576], [24, 70342974373338], [16, 70342974373338], [32, 70342974373338], [13, 4194304, 33, 17592186044416, 41, 262144, 65, 2, 85, 16, 113, 1099511627776, 125, 64, 137, 67108864, 141, 2048, 8, 1024], [284, 2199023517712], [217, 262144, 85, 16, 272, 2199023255552], [92, 70342974373338], [76, 70342974373338], [64, 70342974373338], [40, 70342974373338], [20, 70342974373338], [252, 70342974373338], [68, 70342974373338], [225, 1024], [229, 2097152, 233, 1073741824], [56, 70342974373338], [280, 2199023517712], [176, 70342974373338], [13, 4194304, 241, 16777216, 249, 2, 261, 64], [48, 70342974373338], [44, 70342974373338], [136, 31461450], [265, 8388608], [269, 35184372088832], [140, 70342974373338], [273, 34359738368, 281, 2, 285, 16, 289, 274877906944, 292, 2199023517712], [256, 70342974373338], [116, 70342974373338], [293, 16384, 297, 2, 301, 32768, 305, 64], [200, 70342974373338], [172, 70342974373338], [112, 70342974373338], [260, 70342974373338], [188, 70342974373338], [180, 70342974373338], [124, 70342974373338], [12, 70342974373338], [309, 16], [276, 2199023517712], [4, 1], [128, 70342974373338], [192, 70342974373338], [13, 4194304, 241, 16777216, 249, 2, 261, 64, 144, 4096], [168, 29364298], [164, 29364298], [160, 29364298], [152, 29364298], [317, 4096], [156, 29364298], [132, 31461450], [268, 70342974373338], [325, 68719476736, 329, 128], [333, 549755813888], [337, 524288], [288, 2199023517712], [244, 70342974373338], [104, 
70342974373338], [341, 268435456, 345, 8796093022208, 349, 2], [108, 70342974373338], [353, 2147483648], [285, 16, 292, 2199023517712], [148, 29364298], [96, 70342974373338], [357, 68719476736, 361, 128], [216, 549755813888], [365, 256, 369, 2, 377, 128], [248, 70342974373338], [120, 70342974373338], [184, 70342974373338], [264, 70342974373338], [381, 137438953472, 385, 2], [196, 70342974373338], [212, 549755813888], [365, 256, 369, 2, 377, 128], [236, 68719476864], [232, 68719476864], [224, 68719476864], [228, 68719476864], [240, 70342974373338], [393, 2], [220, 68719476864], [397, 8192], [100, 70342974373338]]
|
138
|
+
goto_hash = {82 => {31 => 93}, 22 => {35 => 55}, 0 => {1 => 2}, 1 => {5 => 17, 11 => 37, 22 => 24, 33 => 14, 6 => 27, 12 => 29, 34 => 22, 23 => 12, 7 => 13, 13 => 36, 35 => 33, 2 => 30, 24 => 5, 19 => 25, 8 => 19, 3 => 20, 25 => 11, 14 => 9, 9 => 15, 20 => 7, 15 => 32, 4 => 18, 26 => 4, 10 => 26, 32 => 23, 21 => 6}, 90 => {31 => 97}, 68 => {30 => 80}, 35 => {16 => 64, 17 => 59, 18 => 63, 14 => 61}, 42 => {29 => 69}, 20 => {5 => 17, 11 => 37, 22 => 24, 33 => 14, 6 => 27, 12 => 29, 34 => 22, 23 => 12, 7 => 13, 13 => 36, 35 => 33, 24 => 5, 19 => 25, 8 => 19, 25 => 11, 14 => 9, 9 => 15, 20 => 7, 15 => 32, 4 => 53, 26 => 4, 10 => 26, 32 => 23, 21 => 6}, 59 => {18 => 78, 14 => 61}}
|
139
|
+
@@parse_table70010113257400 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[
|
139
140
|
:REDUCE,
|
140
141
|
:SHIFT,
|
141
142
|
:ACCEPT
|
142
143
|
])
|
143
144
|
def Rpdf2txt._text_parser
|
144
|
-
GeneralizedLrParser.new(@@
|
145
|
+
GeneralizedLrParser.new(@@parse_table70010113257400)
|
145
146
|
end
|
146
147
|
end
|
data/lib/rpdf2txt/object.rb
CHANGED
@@ -20,6 +20,7 @@
|
|
20
20
|
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
21
|
# hwyss@ywesee.com, aschrafl@ywesee.com
|
22
22
|
#
|
23
|
+
# PdfObject -- Rpdf2txt -- 17.05.2011 -- mhatakeyama@ywesee.com
|
23
24
|
# PdfObject -- Rpdf2txt -- 21.11.2002 -- aschrafl@ywesee.com
|
24
25
|
|
25
26
|
require 'zlib'
|
@@ -27,7 +28,7 @@ require 'rpdf2txt/text'
|
|
27
28
|
require 'rpdf2txt/attributesparser'
|
28
29
|
require 'rpdf2txt/cmapparser'
|
29
30
|
require 'rpdf2txt/symbol'
|
30
|
-
require 'md5'
|
31
|
+
require 'digest/md5'
|
31
32
|
require 'matrix'
|
32
33
|
|
33
34
|
module Rpdf2txt
|
@@ -101,13 +102,15 @@ module Rpdf2txt
|
|
101
102
|
ast.values.collect { |child| extract_attributes(child) }
|
102
103
|
elsif(ast.children_names.include?('pairs'))
|
103
104
|
result = {}
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
105
|
+
if(ast_pairs = ast.pairs)
|
106
|
+
ast_pairs.each { |pair|
|
107
|
+
k, v = pair
|
108
|
+
keystr = k.value.strip.tr('/','')
|
109
|
+
unless(keystr.empty?)
|
108
110
|
result.store(keystr.downcase.intern, extract_attributes(v))
|
109
|
-
|
110
|
-
|
111
|
+
end
|
112
|
+
}
|
113
|
+
end
|
111
114
|
result
|
112
115
|
else
|
113
116
|
value = ast
|
@@ -200,9 +203,13 @@ module Rpdf2txt
|
|
200
203
|
end
|
201
204
|
encryption_key = digest[0,keylength]
|
202
205
|
test_key = compute_user_key encryption_key
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
+
## Comment out the following, since import_gkv (de.oddb.org) stops due to this error
|
207
|
+
# See http://dev.ywesee.com/wiki.php/Masa/20110209-debug-importGkv-rpdf2txt
|
208
|
+
# Also refer to http://trac.ywesee.com/ticket/74#comment:5
|
209
|
+
#
|
210
|
+
#if(test_key != uk)
|
211
|
+
# raise DecryptionError, "test-key did not match user-key ('#{test_key.inspect}' / '#{uk.inspect}')"
|
212
|
+
#end
|
206
213
|
encryption_key
|
207
214
|
end
|
208
215
|
def file_id= (file_id)
|
@@ -309,7 +316,11 @@ module Rpdf2txt
|
|
309
316
|
end
|
310
317
|
def width(char)
  # Look up the width of +char+. A one-character String is first reduced to
  # its leading byte value; any other argument is passed through unchanged.
  # The lookup tries _width first and falls back to named_width when _width
  # returns nil or false.
  #
  # String#unpack('C*') yields byte values on Ruby 1.8 as well as 1.9+, so
  # no interpreter-version switch is needed to get the first byte.
  single_char = char.is_a?(String) && char.length == 1
  char = char.unpack('C*')[0] if single_char
  _width(char) || named_width(char)
end
|
@@ -442,11 +453,15 @@ module Rpdf2txt
|
|
442
453
|
yield self
|
443
454
|
end
|
444
455
|
def extract_oids(array)
  # Collect the first run of digits of every entry as an Integer.
  # An argument that is not exactly an Array is treated as a one-element
  # list. Entries without any digit contribute nothing to the result.
  candidates = array.instance_of?(Array) ? array : [array]
  oids = []
  candidates.each do |dirty_id|
    found = /\d+/on.match(dirty_id)
    oids << found[0].to_i if found
  end
  oids
end
|
451
466
|
def root?
|
452
467
|
!(@parent || @attributes[:parent])
|
@@ -519,6 +534,36 @@ module Rpdf2txt
|
|
519
534
|
parent.media_box
|
520
535
|
end
|
521
536
|
end
|
537
|
+
def merge_snippets(text_snippets)
  # Collapse every run of consecutive snippets that compare equal (==) into
  # a single snippet whose txt is the concatenation of the run's texts.
  #
  # This is required for pdf files written by pdfFactory 3.25
  # (Windows Server 2003 R2 Standard Edition German): it builds up a
  # meaningful snippet from the small snippets whose x, y positions are
  # the same.
  # See in more detail:
  # * http://dev.ywesee.com/wiki.php/Masa/20110516-trace-rpdf2txt
  # * http://dev.ywesee.com/wiki.php/Masa/20110517-update-rpdf2txt
  #
  # text_snippets:: enumerable of objects responding to txt, txt=, == and dup
  # Returns a new Array holding one dup of the last snippet of each run,
  # carrying the merged text. Returns [] for empty input.
  #
  # Note: the txt String of the first snippet of a run is appended to in
  # place, and the last snippet of each run gets its txt reassigned.
  merged = []
  last = nil
  buffer = nil
  text_snippets.each do |snip|
    if last && last == snip
      # same position as the previous snippet: keep accumulating
      buffer << snip.txt
    else
      # a new run starts: flush the previous one, if any
      if last
        last.txt = buffer
        merged << last.dup
      end
      buffer = snip.txt
    end
    last = snip
  end
  # flush the final run; it must receive the accumulated text as well,
  # and there is nothing to flush when the input was empty
  if last
    last.txt = buffer
    merged << last.dup
  end
  merged
end
|
522
567
|
def text(callback_handler)
|
523
568
|
concat_stream = Stream.new('')
|
524
569
|
if(@contents.size == 1 && @contents.first.is_a?(ReferenceArray))
|
@@ -530,6 +575,7 @@ module Rpdf2txt
|
|
530
575
|
end
|
531
576
|
@text_state.media_box = self.media_box
|
532
577
|
text_snippets = concat_stream.extract_text_objects(self, @text_state)
|
578
|
+
text_snippets = merge_snippets(text_snippets)
|
533
579
|
join_snippets(text_snippets, callback_handler)
|
534
580
|
end
|
535
581
|
private
|
@@ -756,7 +802,16 @@ module Rpdf2txt
|
|
756
802
|
result
|
757
803
|
end
|
758
804
|
def raw_stream
  # Returns the data found in @src between the first "stream" keyword
  # (followed by one or two CR/LF characters) and the last "endstream"
  # keyword. The result is memoized in @raw_stream. An empty String is
  # returned when @src contains no such section.
  #
  # String#[] with a capture index yields nil when nothing matches, so
  # #to_s gives '' on every Ruby version. Calling #to_s on the empty Array
  # returned by String#scan only does that on Ruby 1.8; from 1.9 on it
  # produces the literal text "[]".
  @raw_stream ||= @src[/stream[\r\n]{1,2}(.*)endstream/mn, 1].to_s
end
|
761
816
|
def decode_raw_stream
|
762
817
|
@decrypted_stream = raw_stream
|
data/lib/rpdf2txt/parser.rb
CHANGED
@@ -20,15 +20,16 @@
|
|
20
20
|
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
21
|
# hwyss@ywesee.com, aschrafl@ywesee.com
|
22
22
|
#
|
23
|
+
# PdfParser -- Rpdf2txt-- 05.01.2012 -- mhatakeyama@ywesee.com
|
23
24
|
# PdfParser -- Rpdf2txt-- 14.11.2002 -- aschrafl@ywesee.com
|
24
25
|
|
25
26
|
require 'zlib'
|
26
27
|
require 'rpdf2txt/object'
|
27
28
|
require 'rpdf2txt/default_handler'
|
28
|
-
require 'md5'
|
29
|
+
require 'digest/md5'
|
29
30
|
|
30
31
|
module Rpdf2txt
|
31
|
-
VERSION = '0.8.
|
32
|
+
VERSION = '0.8.3'
|
32
33
|
class Parser
|
33
34
|
attr_accessor :encrypt
|
34
35
|
def initialize(pdf_stream, target_encoding='utf8')
|
@@ -128,6 +129,9 @@ module Rpdf2txt
|
|
128
129
|
startobj=0
|
129
130
|
endobj=0
|
130
131
|
catalogue = {}
|
132
|
+
if RUBY_VERSION >= '1.9'
|
133
|
+
@src.force_encoding('ascii-8bit')
|
134
|
+
end
|
131
135
|
@src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |match|
|
132
136
|
obj = build_object(match.to_s)
|
133
137
|
catalogue.store(obj.oid, obj)
|
data/lib/rpdf2txt/text.rb
CHANGED
@@ -64,7 +64,6 @@ module Rpdf2txt
|
|
64
64
|
elsif((map = @current_font.to_unicode) \
|
65
65
|
&& (utf8 = map.to_utf8(ascii)))
|
66
66
|
@current_font.attributes[:encoding] = '/UTF8'
|
67
|
-
#@text_state.set_font(@current_font)
|
68
67
|
[utf8].pack('U')
|
69
68
|
end
|
70
69
|
end
|
@@ -165,7 +164,7 @@ module Rpdf2txt
|
|
165
164
|
snippet_text = snippet[1..-2].gsub(/\\[nrt]/n, " ")
|
166
165
|
snippet_text.gsub!(/\\([()])/n, '\1')
|
167
166
|
snippet_text.gsub!(/./n) { |char|
|
168
|
-
self.mapped_ascii(char[0]) || char
|
167
|
+
self.mapped_ascii(char.unpack('C*')[0]) || char
|
169
168
|
}
|
170
169
|
_snip(snippet_text)
|
171
170
|
end
|
data/lib/rpdf2txt/text_state.rb
CHANGED
@@ -20,6 +20,7 @@
|
|
20
20
|
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
21
|
# hwyss@ywesee.com, aschrafl@ywesee.com
|
22
22
|
#
|
23
|
+
# TextState -- Rpdf2txt -- 05.01.2012 -- mhatakeyama@ywesee.com
|
23
24
|
# TextState -- Rpdf2txt -- 29.11.2002 -- aschrafl@ywesee.com
|
24
25
|
|
25
26
|
module Rpdf2txt
|
@@ -124,7 +125,8 @@ module Rpdf2txt
|
|
124
125
|
end
|
125
126
|
def char_width(char)
|
126
127
|
if(char.is_a? String)
|
127
|
-
char = char[0]
|
128
|
+
#char = char[0]
|
129
|
+
char = char.unpack('C*')[0]
|
128
130
|
end
|
129
131
|
w = 0.0
|
130
132
|
if(@font && (width = @font.width(char)))
|
@@ -306,7 +308,10 @@ module Rpdf2txt
|
|
306
308
|
@boxwidth += char_width(char)
|
307
309
|
end
|
308
310
|
@w = @boxwidth
|
309
|
-
if
|
311
|
+
if RUBY_VERSION >= '1.9'
|
312
|
+
txt.force_encoding('ascii-8bit')
|
313
|
+
end
|
314
|
+
if white = txt[/\s+$/n]
|
310
315
|
white.each_byte do |char|
|
311
316
|
@w += char_width(char)
|
312
317
|
end
|
@@ -329,6 +334,9 @@ module Rpdf2txt
|
|
329
334
|
@tmy -= y_val.to_f
|
330
335
|
end
|
331
336
|
def unescape_txt!(txt)
  # Replace, in place, every backslash followed by three digits with the
  # character whose code is those digits read as an octal number.
  # On Ruby 1.9 and later the string is first tagged as binary
  # (ascii-8bit) so the byte-oriented pattern can be applied to it.
  # Returns txt when at least one replacement was made, nil otherwise
  # (the String#gsub! contract).
  txt.force_encoding('ascii-8bit') if RUBY_VERSION >= '1.9'
  txt.gsub!(/\\([0-9]{3})/n) do
    Regexp.last_match(1).oct.chr
  end
end
|
334
342
|
protected
|
@@ -51,7 +51,7 @@ module Parse
|
|
51
51
|
}
|
52
52
|
File.open(outputFile, "w") do |f|
|
53
53
|
time_and_puts("Writing parser to file #{outputFile}") {
|
54
|
-
f.write "
|
54
|
+
f.write "# encoding: ascii-8bit\nrequire 'rpdf2txt-rockit/rockit'\n" +
|
55
55
|
parser.to_src_in_module(parserName, moduleName)
|
56
56
|
}
|
57
57
|
end
|