rpdf2txt 0.8.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/LICENCE +515 -0
- data/Manifest.txt +126 -0
- data/README.txt +30 -0
- data/Rakefile +24 -0
- data/bin/rpdf2txt +58 -0
- data/config.save +12 -0
- data/install.rb +1098 -0
- data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
- data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
- data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
- data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
- data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
- data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
- data/lib/rpdf2txt-rockit/grammar.rb +644 -0
- data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
- data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
- data/lib/rpdf2txt-rockit/indexable.rb +53 -0
- data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
- data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
- data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
- data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
- data/lib/rpdf2txt-rockit/profiler.rb +168 -0
- data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
- data/lib/rpdf2txt-rockit/rockit.rb +76 -0
- data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
- data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
- data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
- data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
- data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
- data/lib/rpdf2txt-rockit/token.rb +364 -0
- data/lib/rpdf2txt-rockit/version.rb +3 -0
- data/lib/rpdf2txt/attributesparser.rb +42 -0
- data/lib/rpdf2txt/cmapparser.rb +65 -0
- data/lib/rpdf2txt/data/_cmap.grammar +11 -0
- data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/cmap.grammar +11 -0
- data/lib/rpdf2txt/data/cmap.rb +37 -0
- data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/cmap_range.rb +43 -0
- data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
- data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
- data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
- data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
- data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
- data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
- data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
- data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
- data/lib/rpdf2txt/data/pdftext.grammar +102 -0
- data/lib/rpdf2txt/data/pdftext.rb +146 -0
- data/lib/rpdf2txt/default_handler.rb +352 -0
- data/lib/rpdf2txt/lzw.rb +69 -0
- data/lib/rpdf2txt/object.rb +1114 -0
- data/lib/rpdf2txt/parser.rb +169 -0
- data/lib/rpdf2txt/symbol.rb +408 -0
- data/lib/rpdf2txt/text.rb +182 -0
- data/lib/rpdf2txt/text_state.rb +434 -0
- data/lib/rpdf2txt/textparser.rb +42 -0
- data/test/data/3392_obj +0 -0
- data/test/data/397_decrypted +15 -0
- data/test/data/450_decrypted +153 -0
- data/test/data/450_obj +0 -0
- data/test/data/452_decrypted +125 -0
- data/test/data/454_decrypted +108 -0
- data/test/data/456_decrypted +106 -0
- data/test/data/458_decrypted +111 -0
- data/test/data/458_obj +0 -0
- data/test/data/460_decrypted +118 -0
- data/test/data/460_obj +0 -0
- data/test/data/463_decrypted +117 -0
- data/test/data/465_decrypted +107 -0
- data/test/data/465_obj +0 -0
- data/test/data/90_obj +0 -0
- data/test/data/90_obj_comp +1 -0
- data/test/data/decrypted +0 -0
- data/test/data/encrypt_obj +0 -0
- data/test/data/encrypt_string +0 -0
- data/test/data/encrypt_string_128bit +0 -0
- data/test/data/encrypted_object_stream.pdf +0 -0
- data/test/data/firststream +1 -0
- data/test/data/index.pdfobj +0 -0
- data/test/data/index_2bit.pdfobj +0 -0
- data/test/data/index_masked.pdfobj +0 -0
- data/test/data/indexed.pdfobj +0 -0
- data/test/data/indexed_2bit.pdfobj +0 -0
- data/test/data/indexed_masked.pdfobj +0 -0
- data/test/data/inline.png +0 -0
- data/test/data/logo.png +0 -0
- data/test/data/lzw.pdfobj +0 -0
- data/test/data/lzw_index.pdfobj +0 -0
- data/test/data/page_tree.pdf +148 -0
- data/test/data/pdf_20.png +0 -0
- data/test/data/pdf_21.png +0 -0
- data/test/data/pdf_22.png +0 -0
- data/test/data/pdf_50.png +0 -0
- data/test/data/png.pdfobj +0 -0
- data/test/data/space_bug_stream.txt +119 -0
- data/test/data/stream.txt +292 -0
- data/test/data/stream_kerning_bug.txt +13 -0
- data/test/data/stream_kerning_bug2.txt +6 -0
- data/test/data/test.pdf +0 -0
- data/test/data/test.txt +8 -0
- data/test/data/test_text.txt +42 -0
- data/test/data/working_obj +0 -0
- data/test/data/working_obj2 +0 -0
- data/test/mock.rb +149 -0
- data/test/suite.rb +30 -0
- data/test/test_pdf_object.rb +1802 -0
- data/test/test_pdf_parser.rb +1340 -0
- data/test/test_pdf_text.rb +789 -0
- data/test/test_space_bug_05_2004.rb +87 -0
- data/test/test_stream.rb +194 -0
- data/test/test_text_state.rb +315 -0
- data/usage-en.txt +112 -0
- data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
- data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
- metadata +220 -0
@@ -0,0 +1,107 @@
|
|
1
|
+
|
2
|
+
# Builds the Graphviz node text for one parse stack.
#
# The base name is the stack's state followed by the lexer's character
# position in parentheses. When includeShape is true the name is quoted with
# String#inspect and a box-shape attribute is appended, giving a complete dot
# node declaration; otherwise the bare name is returned.
def stack_to_node(stack, includeShape = false)
  name = "#{stack.state} (#{stack.lexer.position.char_position})"
  return name unless includeShape
  name.inspect + " [shape=box]"
end
|
9
|
+
|
10
|
+
# Collects the dot node and edge declarations reachable from +stack+.
#
# stack - object responding to #links; each link responds to #stack (the
#         stack it points to) and #tree (whose #inspect_compact labels the
#         edge).
# nodes - Hash mapping stack => node declaration; filled in place.
# links - Hash mapping link => edge declaration; filled in place.
#
# Returns the two hashes as [nodes, links].
#
# A stack that is already present in +nodes+ is not walked again. Without
# this check a sub-stack shared by several links is re-traversed once per
# path leading to it, and a cyclic structure would recurse without end. The
# resulting hashes are the same either way because every visit writes the
# same keys and values.
def parsestack_as_dot_digraph(stack, nodes = Hash.new, links = Hash.new)
  return nodes, links if nodes.key?(stack)

  nodes[stack] = stack_to_node(stack, true)
  stack.links.each do |link|
    nodes, links = parsestack_as_dot_digraph(link.stack, nodes, links)
    links[link] = "#{stack_to_node(stack).inspect} -> #{stack_to_node(link.stack).inspect} [label=#{link.tree.inspect_compact.inspect}]"
  end
  return nodes, links
end
|
18
|
+
|
19
|
+
# Renders several parse stacks as one dot digraph.
#
# Every stack is fed through parsestack_as_dot_digraph with the same pair of
# accumulator hashes, so nodes and links reachable from more than one stack
# are only stored once. The distinct node declarations are emitted first,
# then the distinct edge declarations, wrapped in a "digraph G" block with a
# fixed size of 8 by 11.
def parsestacks_as_dot_digraph(stacks)
  node_map, link_map = stacks.inject([{}, {}]) do |(seen_nodes, seen_links), stack|
    parsestack_as_dot_digraph(stack, seen_nodes, seen_links)
  end

  header = "digraph G {\nsize=\"8,11\"\n"
  node_lines = node_map.values.uniq.join("\n")
  link_lines = link_map.values.uniq.join("\n")

  header + node_lines + "\n" + link_lines + "\n" + "}"
end
|
29
|
+
|
30
|
+
# Base class for printers that turn a tree into Graphviz dot source.
#
# Subclasses supply eval_to_dot(ast, parent), which is expected to fill
# @nodes and @links (hashes whose values are dot declarations). This class
# only owns the page settings and the final assembly of the digraph text.
class DotGraphPrinter
  # size        - value of the dot "size" attribute; written quoted.
  # orientation - value of the dot "orientation" attribute; written as is.
  def initialize(size = "11,9", orientation = "landscape")
    @size, @orientation = size, orientation
  end

  # Resets the node and link tables, lets the subclass walk +ast+, and
  # returns the complete digraph as a String.
  def to_graph(ast)
    @nodes, @links = Hash.new, Hash.new
    eval_to_dot(ast, nil)
    to_graph_from_nodes_and_links(@nodes, @links)
  end

  protected

  # Assembles the digraph text from the given tables.
  #
  # nodes - Hash whose values are node declarations.
  # links - Hash whose values are edge declarations.
  #
  # Duplicate declarations are emitted once. The arguments are what gets
  # rendered: the method previously ignored them and always read @nodes and
  # @links, which only worked because to_graph happens to pass exactly those.
  def to_graph_from_nodes_and_links(nodes, links)
    "digraph G {\n" +
      "size = #{@size.inspect}\n" +
      "orientation = #{@orientation}\n" +
      nodes.values.uniq.join("\n") + "\n" +
      links.values.uniq.join("\n") + "\n" +
      "}"
  end
end
|
52
|
+
|
53
|
+
# Prints a syntax tree as a dot digraph: one dot node per tree node and one
# edge from each parent to each of its children.
class SyntaxTreeAsDotGraph < DotGraphPrinter
  # Records the edge parent -> child in @links, keyed by [parent, child].
  #
  # label  - optional edge label (written with #inspect).
  # weight - optional edge weight (written with #inspect).
  #
  # Nothing is recorded for the root, i.e. when +parent+ is nil.
  def add_parent_link(parent, child, label = nil, weight = nil)
    return unless parent

    edge = "#{node_id(parent)} -> #{node_id(child)}"
    attributes = []
    attributes << "label=#{label.inspect}" if label
    attributes << "weight=#{weight.inspect}" if weight
    edge += " [" + attributes.join(",") + "]" unless attributes.empty?
    @links[[parent, child]] = edge
  end

  # Walks +ast+ recursively, filling @nodes and @links.
  #
  # ast      - a SyntaxTree, a plain Array, or nil/false (ignored). Any other
  #            object is silently skipped.
  # parent   - the node this one hangs under, or nil for the root.
  # linkname - label for the edge from +parent+.
  # weight   - weight for the edge from +parent+.
  def eval_to_dot(ast, parent = nil, linkname = nil, weight = nil)
    return unless ast

    if ast.kind_of?(SyntaxTree)
      case ast.name
      when "_ArrayNode"
        add_parent_link(parent, ast, linkname, weight)
        @nodes[ast] = "#{node_id(ast)} [label=" + '"[]"]'
        ast.each_with_index {|c, i| eval_to_dot(c, ast, i.inspect)}
      else
        # Special handling of Token nodes since we only want to print
        # the lexeme
        if ast.children_names.sort == ["lexeme", "value"].sort
          @nodes[ast] = "#{node_id(ast)} [shape=box,label=#{ast.lexeme.inspect}]"
          add_parent_link(parent, ast, linkname, weight)
        else
          add_parent_link(parent, ast, linkname, weight)
          @nodes[ast] = "#{node_id(ast)} [label=#{ast.name.inspect}]"
          ast.childrens.each_with_index {|c, i|
            eval_to_dot(c, ast, ast.children_names[i])
          }
        end
      end
    elsif ast.class == Array
      # Or nodes return array but they should return ArrayNodes...
      # NOTE(review): the index is passed as an Integer here but as a String
      # (i.inspect) in the _ArrayNode branch above, so the two kinds of array
      # node get differently quoted edge labels. Left as is.
      add_parent_link(parent, ast, linkname, weight)
      @nodes[ast] = "#{node_id(ast)} [label=\"[]\"]"
      ast.each_with_index {|c, i| eval_to_dot(c, ast, i)}
    end
  end

  private

  # Identifier used for +node+ in the dot output. Uses the node's own #id
  # when it has one and falls back to #object_id otherwise; Object#id is not
  # available on plain objects in current Ruby versions, so the Array branch
  # of eval_to_dot could not work without the fallback.
  def node_id(node)
    node.respond_to?(:id) ? node.id : node.object_id
  end
end
|
101
|
+
|
102
|
+
# Convenience wrapper: renders +syntaxtree+ as dot source using a
# SyntaxTreeAsDotGraph printer built with its default settings.
def syntaxtree_as_dot_digraph(syntaxtree)
  printer = SyntaxTreeAsDotGraph.new
  printer.to_graph(syntaxtree)
end
|
105
|
+
|
106
|
+
|
107
|
+
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# Formats a list of nodes and a list of links as a Graphviz digraph.
#
# How a node is drawn is decided by two callables (shape and label) and how a
# link is labelled by a third; each falls back to a class-wide default.
class DotGraphFormatter
  @@default_node_shaper = proc{|n| "box"}
  @@default_node_labeler = proc{|n| n.inspect}
  @@default_link_labeler = proc{|info| info ? info.inspect : nil}

  # nodeShaper  - callable(node) returning the dot shape name, or nil for
  #               the default ("box").
  # nodeLabeler - callable(node) returning the node label, or nil for the
  #               default (node.inspect).
  # linkLabeler - callable(info) returning the link label or nil for "no
  #               label"; nil selects the default (info.inspect when info is
  #               set).
  # size        - value of the dot "size" attribute; written quoted.
  # orientation - value of the dot "orientation" attribute; written as is.
  def initialize(nodeShaper = nil, nodeLabeler = nil, linkLabeler = nil,
                 size = "11,9", orientation = "landscape")
    @node_shaper = nodeShaper || @@default_node_shaper
    @node_labeler = nodeLabeler || @@default_node_labeler
    @link_labeler = linkLabeler || @@default_link_labeler
    @size, @orientation = size, orientation
  end

  # nodes is array of node objects
  # links is either array of
  #   arrays [fromNode, toNode [, infoOnLink]], or
  #   objects with attributes :from, :to, :info
  #
  # Duplicate nodes and links are formatted once. Returns a DotGraph wrapping
  # the generated source.
  def format(nodes, links)
    DotGraph.new("digraph G {\n" +
                 "size = #{@size.inspect}\n" +
                 "orientation = #{@orientation}\n" +
                 nodes.uniq.map {|n| format_node(n)}.join("\n") + "\n" +
                 links.uniq.map {|l| format_link(l)}.join("\n") + "\n" +
                 "}"
                )
  end

  protected

  # Identifier used for +node+ in the dot output: the node's own #id when it
  # has one, otherwise its #object_id (plain objects no longer respond to
  # #id in current Ruby versions).
  def node_id(node)
    node.respond_to?(:id) ? node.id : node.object_id
  end

  # One node declaration: id, shape and label.
  def format_node(node)
    node_id(node).inspect + " [" +
      "shape=" + @node_shaper.call(node).inspect + ", " +
      "label=" + @node_labeler.call(node).inspect + "]"
  end

  # Splits a link into [from, to, info].
  #
  # Objects exposing #from and #to are read through their accessors (#info
  # is optional); anything else is indexed like an array. The choice is made
  # with respond_to? rather than by rescuing every exception, so an error
  # raised inside a link's accessor is reported instead of being swallowed.
  def get_link_data(link)
    if link.respond_to?(:from) && link.respond_to?(:to)
      info = link.respond_to?(:info) ? link.info : nil
      return link.from, link.to, info
    end
    return link[0], link[1], link[2]
  end

  # One edge declaration, with a label attribute only when the link labeler
  # returns something truthy.
  def format_link(link)
    from, to, info = get_link_data(link)
    label = @link_labeler.call(info)
    node_id(from).inspect + " -> " + node_id(to).inspect +
      (label ? " [label=" + label.inspect + "]" : "")
  end
end
|
51
|
+
|
52
|
+
# A generated graph; +description+ holds its dot source text.
DotGraph = Struct.new("DotGraph", :description)
class DotGraph
  # Renders the graph to PostScript at +filename+ by running the external
  # "dot" program.
  #
  # The dot source is first written to a temporary file next to the target
  # (the target name with a random number appended) and that file is removed
  # again afterwards.
  #
  # filename - String path of the PostScript file to produce.
  #
  # Returns what Kernel#system returns for the dot invocation: true on
  # success, false on a non-zero exit status, nil if dot could not be run.
  def write_to_file(filename)
    tmpfile = nil
    begin
      candidate = filename + rand(100000).inspect
      # EXCL makes "does it exist?" and "create it" a single step, so a file
      # appearing between the check and the open cannot be overwritten.
      File.open(candidate, File::WRONLY | File::CREAT | File::EXCL) do |f|
        tmpfile = candidate # created by us, so ours to delete
        f.write description
      end
    rescue Errno::EEXIST
      retry # name taken, draw another random suffix
    end
    # Separate arguments bypass the shell, so paths containing spaces or
    # shell metacharacters reach dot unchanged.
    system("dot", "-Tps", "-o", filename, tmpfile)
  ensure
    # Also runs when writing or dot raised, so the temp file never lingers.
    File.delete(tmpfile) if tmpfile && File.exist?(tmpfile)
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# Mixin for objects that are created and numbered by an IndexableFactory.
module Indexable
  # index_number - the number assigned to the object by its factory
  #                (attr_reader_once_write instead?)
  # factory      - the factory that created the object
  attr_accessor :index_number, :factory
end
|
5
|
+
|
6
|
+
# Creates instances of one Indexable class, hands each new instance the next
# free index number, and returns the already created instance when asked
# again with the same constructor arguments.
class IndexableFactory
  # instances   - every instance created so far, in creation order
  # start_index - the index given to the first instance
  # next_index  - the index the next new instance will receive
  attr_reader :instances, :start_index, :next_index

  # klass      - class to instantiate; must have Indexable among its
  #              ancestors.
  # startIndex - index number for the first instance.
  #
  # Raises ArgumentError if +klass+ is not Indexable.
  def initialize(klass, startIndex = 0)
    unless klass.ancestors.include?(Indexable)
      raise ArgumentError, "#{klass.inspect} is not Indexable"
    end
    @klass, @start_index, @next_index = klass, startIndex, startIndex
    @instance_map, @instances = Hash.new, Array.new
  end

  # Returns the instance for +args+, creating and registering it first if no
  # instance was made with these arguments before.
  def make(*args)
    cached = @instance_map[args]
    return cached if cached

    created = make_new_obj(args)
    @instance_map[args] = created
    @instances.push created
    created
  end

  # Like #make, but returns [instance, was_created].
  #
  # The existence test uses nil? instead of "== nil": the comparison would
  # be dispatched to the cached instance's own ==, which need not cope with
  # nil.
  def make_unless_exists(*args)
    new_instance = @instance_map[args].nil?
    return make(*args), new_instance
  end

  # Returns the instance made with +args+, or nil if there is none. Never
  # creates anything.
  def get_instance(*args)
    @instance_map[args]
  end

  # Returns the instance for +args+, creating it when missing. #make already
  # registers new instances, so this simply delegates to it.
  def instance_with_args(*args)
    make(*args)
  end

  protected

  # Instantiates the class with +arguments+ and stamps the new object with
  # its index number and a reference to this factory.
  def make_new_obj(arguments)
    obj = @klass.new(*arguments)
    obj.index_number = advance_index_number
    obj.factory = self
    obj
  end

  # Returns the current index number and moves the counter on by one.
  def advance_index_number
    i = @next_index
    @next_index += 1
    i
  end
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
require 'rpdf2txt-rockit/grammar'
|
2
|
+
require 'rpdf2txt-rockit/parse_table'
|
3
|
+
require 'rpdf2txt-rockit/parsetable_generation'
|
4
|
+
require 'rpdf2txt-rockit/reduce_actions_generator'
|
5
|
+
|
6
|
+
require 'rpdf2txt-rockit/profiler'
|
7
|
+
|
8
|
+
module Parse
|
9
|
+
# Graph whose nodes are parser states. On top of what the superclass keeps,
# it remembers the start state and sorts every reduce state that is added
# into a "consistent" or an "inconsistent" list.
class StateGraph < BackLinkedDirectedGraph
  attr_reader :start_state
  attr_reader :consistent_reduce_states, :inconsistent_reduce_states

  # startState - the state the graph starts from.
  # args       - handed on unchanged to the superclass constructor.
  def initialize(startState, *args)
    super(*args)
    @start_state = startState
    @consistent_reduce_states = Array.new
    @inconsistent_reduce_states = Array.new
  end

  # Adds +state+ to the graph. A state answering true to reduce_state? is
  # additionally filed, at most once, under the list matching consistent?.
  def add_node(state)
    super(state)
    return unless state.reduce_state?

    bucket = state.consistent? ? @consistent_reduce_states : @inconsistent_reduce_states
    bucket.push state unless bucket.include?(state)
  end

  # Label showing the state number followed by its kernel items.
  @@node_labeler_with_kernels = proc{|state|
    "S" + state.index_number.inspect + ": " + state.kernel_items.inspect
  }

  # Label showing the state number only.
  @@node_labeler = proc{|state|
    "S" + state.index_number.inspect
  }

  # Delegates to the superclass, choosing the node labeler according to
  # withKernelItems.
  # NOTE(review): the nil second argument is presumably the node shaper
  # (i.e. "use the default") - confirm against BackLinkedDirectedGraph.
  def to_postscript_file(filename, withKernelItems = true)
    labeler = withKernelItems ? @@node_labeler_with_kernels : @@node_labeler
    super(filename, nil, labeler)
  end
end
|
45
|
+
|
46
|
+
# Builds a parse table for a grammar: first the graph of LR(0) item sets
# (which yields the goto and shift entries), then the reduce actions, which
# are added by a separate lookahead calculator.
class LaLr1ParseTableGenerator
  # grammar                  - the grammar to generate a table for.
  # lookaheadCalculatorKlass - class instantiated with (state graph, grammar,
  #                            parse table, items); its add_reduce_actions
  #                            is called to add the reduce actions.
  def initialize(grammar,
                 lookaheadCalculatorKlass =
                   ReduceActionsGenerator)
    @grammar = grammar
    @lookahead_calculator_klass = lookaheadCalculatorKlass
  end

  # Runs the whole generation and returns the finished parse table, an
  # instance created through parseTableKlass.new_from_grammar. The grammar
  # is augmented and normalized in place.
  def generate_parse_table(parseTableKlass = ParseTable)
    @grammar.augment
    #puts "\n NoN = #{@grammar.productions.map{|p| p.nonterminal}.uniq.length}"
    #puts " NoP = #{@grammar.productions.length}"
    time_and_puts("\n Normalizing grammar") do
      @grammar.normalize
    end
    #puts " NoPN = #{@grammar.productions.length}"
    state_graph = nil # assigned inside the block below, read after it
    time_and_puts(" Calculating states") do
      @item_factory = IndexableFactory.new(Item, 0)
      @state_factory = IndexableFactory.new(LrState, 0)
      precalc_nonkernel_items_for_nonterminals
      @parse_table = parseTableKlass.new_from_grammar(@grammar)
      state_graph = calculate_state_graph # also adds gotos and shift actions
    end
    time_and_puts(" Calculating lalr1_lookaheads") do
      calculator = @lookahead_calculator_klass.new(state_graph, @grammar,
                                                   @parse_table,
                                                   @item_factory.instances)
      calculator.add_reduce_actions
    end
    @parse_table
  end

  # Debugging aid: prints, per nonterminal name, the distinct ids of all
  # nonterminal objects with that name found in the grammar's productions.
  def test_nonterminal_uniqueness
    ids_by_name = DefaultInitHash.new {|k| Array.new}
    @grammar.productions.each do |production|
      lhs = production.nonterminal
      ids_by_name[lhs.name].push(lhs.id)
      ids_by_name[lhs.name].uniq!
      production.elements.each do |element|
        next unless element.nonterminal?
        ids_by_name[element.name].push(element.id)
        ids_by_name[element.name].uniq!
      end
    end
    puts ids_by_name.inspect
  end

  protected

  # Lets every nonterminal of the grammar compute its non-kernel items once,
  # up front.
  def precalc_nonkernel_items_for_nonterminals
    @grammar.nonterminals.each do |nonterminal|
      nonterminal.calc_nonkernel_items(@grammar, @item_factory)
    end
  end

  # Returns [state, was_created] for the state with these kernel items.
  def add_state_unless_exists(kernel_items)
    @state_factory.make_unless_exists(kernel_items)
  end

  # Calculate the state graph by constructing the sets-of-lr0-items
  # collection.
  # See page 224 (basic algorithm) and 240 (representing the states by
  # their kernel items) in the Dragon book
  def calculate_state_graph
    Profiler.__enter__(:calculate_state_graph) if $PROFILE
    start_item = @item_factory.make(@grammar.productions[0], 0)
    start_state, _created = add_state_unless_exists([start_item])
    state_graph = StateGraph.new(start_state)

    # Work list of states; it grows while being processed, hence the
    # explicit cursor.
    pending = [state_graph.start_state]
    cursor = 0
    dest_sets = DefaultInitHash.new {|k| Array.new}

    while cursor < pending.length
      state = pending[cursor]

      # Group the successor items of this state's closure by the symbol
      # that leads to them.
      dest_sets.clear
      state.closure.each do |item|
        symbol = item.symbol
        next unless symbol
        successor = item.next_item
        dest_sets[symbol].push(successor) if successor
      end

      # Every group is the kernel of a destination state.
      dest_sets.each do |symbol, kernel_item_set|
        kernel_item_set.uniq! # Needed?
        dest_state, new_state = add_state_unless_exists(kernel_item_set)
        pending.push(dest_state) if new_state
        state_graph.add_link(state, dest_state, symbol)
        if symbol.nonterminal?
          @parse_table.add_goto(state.index_number, symbol,
                                dest_state.index_number)
        else
          @parse_table.add_action(state.index_number, symbol,
                                  [:SHIFT, dest_state.index_number])
        end
      end

      cursor += 1
    end

    Profiler.__leave__(:calculate_state_graph, state_graph) if $PROFILE
    state_graph
  end
end
|
144
|
+
end
|
@@ -0,0 +1,273 @@
|
|
1
|
+
require 'rpdf2txt-rockit/sourcecode_dumpable'
|
2
|
+
require 'rpdf2txt-rockit/grammar'
|
3
|
+
require 'rpdf2txt-rockit/base_extensions'
|
4
|
+
|
5
|
+
require 'rpdf2txt-rockit/profiler'
|
6
|
+
|
7
|
+
# Table of parser actions and gotos for a grammar, stored in a compact
# numeric form (see the comment above @@default_action_map).
class ParseTable
  include SourceCodeDumpable
  attr_reader :start_state, :tokens, :priorities
  attr_accessor :language

  # Creates an empty table for +aGrammar+ and names it after the grammar
  # ("UNNAMED_LANGUAGE" if the grammar has no name).
  def ParseTable.new_from_grammar(aGrammar)
    pt = self.new(aGrammar.productions, aGrammar.tokens, aGrammar.priorities)
    pt.language = aGrammar.name || "UNNAMED_LANGUAGE"
    pt
  end

  # We save the actions in a compact numerical way to save space and time:
  # The action table is an array of arrays. Each state number is an index
  # into the array and its array contains an even number of integers.
  # Each pair of integers represent one unique action. The first of the
  # integers is the action and the second is the number representing the
  # terminals for which it apply. The least significant 'action_bits' bits
  # of the action number determines the type of action by giving an index
  # into the 'action_map'. Its default value is:
  #   [:REDUCE, :SHIFT, :ACCEPT]
  # so that 0 means REDUCE, 1 means SHIFT and 2 means ACCEPT. The remaining
  # bits hold the action's argument (action[1], shifted left by
  # 'action_bits').
  @@default_action_map = [:REDUCE, :SHIFT, :ACCEPT]

  # productions - the grammar's productions; a production number is an
  #               index into this list.
  # tokens      - the token types; a token's position is its bit in a
  #               terminal set.
  # priorities  - stored and dumped by to_src; not otherwise used here.
  # actionTable - per state, a flat list of (action number, terminal set)
  #               pairs.
  # gotoHash    - state => { nonterminal number => new state }.
  # actionBits  - number of low bits of an action number that select the
  #               action type.
  # actionMap   - action types, indexed by those low bits.
  def initialize(productions, tokens, priorities = nil,
                 actionTable = ArrayOfArrays.new, gotoHash = Hash.new,
                 actionBits = 2, actionMap = @@default_action_map)
    @productions, @start_state, @language = productions, 0, "UNNAMED_LANGUAGE"
    @priorities = priorities
    @tokens, @nonterminals = tokens, nonterminals(productions)
    @action_table, @goto_hash = actionTable, gotoHash
    @action_cache = ArrayOfHashes.new
    @mask = Array.new
    @action_map, @action_bits, @action_mask = actionMap, actionBits, 0
    # Build a mask with the lowest actionBits bits set.
    actionBits.times { @action_mask = (@action_mask << 1) | 1 }
    init_productionnum_to_nonterminal_number_hash
    init_tokentype_to_token_number_hash
  end

  # Number of states that have an entry in the action table.
  def num_states
    @action_table.length
  end

  # Two tables are equal when they have the same class, productions, tokens,
  # action table, goto hash and start state.
  def ==(other)
    other.class == self.class &&
      other.productions == @productions &&
      other.tokens == @tokens &&
      other.action_table == @action_table &&
      other.goto_hash == @goto_hash &&
      other.start_state == @start_state
  end

  # Adds +action+ (e.g. [:SHIFT, 4]) to +state+ for the single token type
  # +aTokenType+.
  def add_action(state, aTokenType, action)
    Profiler.__enter__(:ParseTable_add_action, state, aTokenType, action) if $PROFILE
    @action_cache[state].clear
    actionnum = action_to_actionnum(action)
    @action_table[state] << actionnum << token_to_terminalset(aTokenType)
    Profiler.__leave__(:ParseTable_add_action) if $PROFILE
  end

  # Adds +action+ to +state+ for every token in +terminalSet+ (anything
  # whose to_i yields the bit set).
  def add_action_for_terminalset(state, action, terminalSet)
    Profiler.__enter__(:ParseTable_add_action_for_terminalset, state, action, terminalSet) if $PROFILE
    # Drop the memoized lookups for this state, exactly as add_action does;
    # otherwise #actions keeps answering from results computed before this
    # action existed.
    @action_cache[state].clear
    @action_table[state] << action_to_actionnum(action) << terminalSet.to_i
    Profiler.__leave__(:ParseTable_add_action_for_terminalset) if $PROFILE
  end

  # Unify terminal sets for identical actions
  def compact!
    Profiler.__enter__(:ParseTable_compact!) if $PROFILE
    @action_table.map! do |actionnums|
      slot_of = Hash.new # action number => its index in merged
      merged = Array.new
      i = 0
      while i < actionnums.length
        actionnum, terminal_set = actionnums[i], actionnums[i + 1]
        if (slot = slot_of[actionnum])
          merged[slot + 1] |= terminal_set
        else
          slot_of[actionnum] = merged.length
          merged << actionnum << terminal_set
        end
        i += 2
      end
      merged
    end
    Profiler.__leave__(:ParseTable_compact!) if $PROFILE
  end

  # Terminal set containing only +aTokenType+.
  def token_to_terminalset(aTokenType)
    mask(@token_to_number[aTokenType])
  end

  # Integer with only bit number +index+ set; memoized per index.
  def mask(index)
    @mask[index] ||= (1 << index)
  end

  # Records that +state+ goes to +newState+ on +aNonTerminal+. The per-state
  # hash is created on first use; this replaces a rescue/retry construct
  # that could loop without end if the error came from anywhere else.
  def add_goto(state, aNonTerminal, newState)
    (@goto_hash[state] ||= Hash.new)[@nonterminals.index(aNonTerminal)] = newState
  end

  # Returns the actions of +state+ that apply to +tokenType+, as an array of
  # [type, argument] pairs (empty if there are none). Results are memoized
  # per state and token type.
  def actions(state, tokenType)
    cached = @action_cache[state][tokenType]
    return cached if cached

    found = Array.new
    actionnums = @action_table[state]
    token_mask = mask(@token_to_number[tokenType])
    i = 0
    while i < actionnums.length
      if (actionnums[i + 1] & token_mask) > 0
        found.push actionnum_to_action(actionnums[i])
      end
      i += 2
    end
    @action_cache[state][tokenType] = found
  end

  # Token types for which +state+ has at least one action.
  def valid_tokens(state)
    terminal_set = 0
    each_terminalset(state) {|ts| terminal_set |= ts}
    terminalset_to_terminals(terminal_set)
  end

  # Yields the terminal set of every action stored for +state+ (the second
  # integer of each pair).
  def each_terminalset(state)
    @action_table[state].each_with_index {|e, i| yield(e) if i % 2 == 1}
  end

  # Token types whose bit is set in +terminalSet+.
  def terminalset_to_terminals(terminalSet)
    @tokens.select {|t| terminalSet & mask(@token_to_number[t]) > 0}
  end

  # Decodes an action number into [type, argument].
  def actionnum_to_action(actionNumber)
    [@action_map[actionNumber & @action_mask], actionNumber >> @action_bits]
  end

  # Encodes [type, argument] as an action number.
  def action_to_actionnum(action)
    Profiler.__enter__(:ParseTable_action_to_actionnum, action) if $PROFILE
    res = @action_map.index(action[0]) + (action[1] << @action_bits)
    Profiler.__leave__(:ParseTable_action_to_actionnum) if $PROFILE
    res
  end

  # State to go to from +state+ after reducing by production number
  # +productionNumber+, or nil if the table has no such entry. A missing
  # entry is detected explicitly rather than by rescuing every exception,
  # so genuine errors are no longer hidden.
  def goto(state, productionNumber)
    row = @goto_hash[state]
    return nil unless row
    row[@productionnum_to_nonterminal_num[productionNumber]]
  end

  # The production with the given number.
  def production(number)
    @productions[number]
  end

  # Returns Ruby source that rebuilds this table and assigns it to +name+.
  # NOTE(review): nameHash is accepted but not used.
  def to_src(name = "parse_table", nameHash = {})
    names = name_hash(@tokens) {|t| "t"}
    str = @tokens.to_src("tokens", names) + "\n"
    names.update(name_hash(@productions) {|p| "p"})
    str << @productions.to_src("productions", names) + "\n"
    str << @priorities.to_src("priorities", names) + "\n"
    #str << "r = :REDUCE\n"
    #str << "s = :SHIFT\n"
    str << @action_table.to_compact_src("action_table") + "\n"
    str << @goto_hash.to_compact_src("goto_hash") + "\n"
    str << assign_to(name,
                     new_of_my_type(as_code("productions"),
                                    as_code("tokens"),
                                    as_code("priorities"),
                                    as_code("action_table"),
                                    as_code("goto_hash"),
                                    @action_bits,
                                    @action_map))
    str
  end

  # Human-readable dump: tokens, nonterminals, productions, then one line
  # per state listing its actions per token and its gotos per nonterminal.
  def inspect
    str = "ParseTable\n".dup
    str << "Tokens: #{@tokens.inspect}\n"
    str << "NonTerminals: #{@nonterminals.inspect}\n"
    str << "Productions:\n#{productions_inspect}\n"
    str << "Actions: \n"
    @action_table.length.times do |state|
      str << "#{state}:\t"
      @tokens.each do |t|
        str << inspect_actions(actions(state, t)) << ","
      end
      str << "| "
      @nonterminals.each do |nt|
        # goto wants a production number, so use the first production that
        # derives this nonterminal.
        i = @productions.index(@productions.detect {|p| p.nonterminal == nt})
        ns = goto(state, i)
        str << (ns ? "#{ns}" : " ") << ","
      end
      str << "\n"
    end
    str
  end

  protected

  # Numbered list of all productions, one per line.
  def productions_inspect
    str = "".dup
    @productions.each_with_index do |production, i|
      str << " #{i}: #{production.inspect}\n"
    end
    str
  end

  # Short text for a list of actions: a blank for none, "a" for accept,
  # "s<n>" for shift, "r<n>" for reduce, and a bracketed list when there is
  # more than one.
  def inspect_actions(actions)
    return " " unless actions
    return " " if actions.length == 0
    if actions.length > 1
      return "[" + actions.map {|a| inspect_actions([a])}.join(',') + "]"
    end

    kind, argument = actions[0][0], actions[0][1]
    case kind
    when :ACCEPT
      " a "
    when :SHIFT
      "s#{argument} "
    when :REDUCE
      "r#{argument} "
    end
  end

  # Protected readers; used by #== to compare against another table.
  attr_reader :productions, :action_table, :goto_hash

  # The distinct nonterminals on the left-hand sides of the productions.
  def nonterminals(anArrayOfProductions)
    anArrayOfProductions.map {|p| p.nonterminal}.equality_uniq
  end

  # Builds and returns @nonterminal_index: production number => position of
  # its nonterminal in +nonterminals+.
  def init_nonterminal_index(nonterminals, productions)
    @nonterminal_index = Hash.new
    productions.each_with_index do |prod, i|
      @nonterminal_index[i] = nonterminals.index(prod.nonterminal)
    end
    @nonterminal_index
  end

  # Builds and returns @token_index: token => its position in +tokens+.
  def init_token_index(tokens)
    @token_index = Hash.new
    tokens.each_with_index {|t, i| @token_index[t] = i}
    @token_index
  end

  # Builds the production number => nonterminal number lookup used by goto.
  def init_productionnum_to_nonterminal_number_hash
    @productionnum_to_nonterminal_num = Hash.new
    @productions.each_with_index do |p, n|
      @productionnum_to_nonterminal_num[n] = @nonterminals.index(p.nonterminal)
    end
  end

  # Builds the token type => token number lookup used for terminal sets.
  def init_tokentype_to_token_number_hash
    @token_to_number = Hash.new
    @tokens.each_with_index {|t, i| @token_to_number[t] = i}
  end
end
|