rpdf2txt 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/LICENCE +515 -0
- data/Manifest.txt +126 -0
- data/README.txt +30 -0
- data/Rakefile +24 -0
- data/bin/rpdf2txt +58 -0
- data/config.save +12 -0
- data/install.rb +1098 -0
- data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
- data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
- data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
- data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
- data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
- data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
- data/lib/rpdf2txt-rockit/grammar.rb +644 -0
- data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
- data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
- data/lib/rpdf2txt-rockit/indexable.rb +53 -0
- data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
- data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
- data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
- data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
- data/lib/rpdf2txt-rockit/profiler.rb +168 -0
- data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
- data/lib/rpdf2txt-rockit/rockit.rb +76 -0
- data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
- data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
- data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
- data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
- data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
- data/lib/rpdf2txt-rockit/token.rb +364 -0
- data/lib/rpdf2txt-rockit/version.rb +3 -0
- data/lib/rpdf2txt/attributesparser.rb +42 -0
- data/lib/rpdf2txt/cmapparser.rb +65 -0
- data/lib/rpdf2txt/data/_cmap.grammar +11 -0
- data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/cmap.grammar +11 -0
- data/lib/rpdf2txt/data/cmap.rb +37 -0
- data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/cmap_range.rb +43 -0
- data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
- data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
- data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
- data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
- data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
- data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
- data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
- data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
- data/lib/rpdf2txt/data/pdftext.grammar +102 -0
- data/lib/rpdf2txt/data/pdftext.rb +146 -0
- data/lib/rpdf2txt/default_handler.rb +352 -0
- data/lib/rpdf2txt/lzw.rb +69 -0
- data/lib/rpdf2txt/object.rb +1114 -0
- data/lib/rpdf2txt/parser.rb +169 -0
- data/lib/rpdf2txt/symbol.rb +408 -0
- data/lib/rpdf2txt/text.rb +182 -0
- data/lib/rpdf2txt/text_state.rb +434 -0
- data/lib/rpdf2txt/textparser.rb +42 -0
- data/test/data/3392_obj +0 -0
- data/test/data/397_decrypted +15 -0
- data/test/data/450_decrypted +153 -0
- data/test/data/450_obj +0 -0
- data/test/data/452_decrypted +125 -0
- data/test/data/454_decrypted +108 -0
- data/test/data/456_decrypted +106 -0
- data/test/data/458_decrypted +111 -0
- data/test/data/458_obj +0 -0
- data/test/data/460_decrypted +118 -0
- data/test/data/460_obj +0 -0
- data/test/data/463_decrypted +117 -0
- data/test/data/465_decrypted +107 -0
- data/test/data/465_obj +0 -0
- data/test/data/90_obj +0 -0
- data/test/data/90_obj_comp +1 -0
- data/test/data/decrypted +0 -0
- data/test/data/encrypt_obj +0 -0
- data/test/data/encrypt_string +0 -0
- data/test/data/encrypt_string_128bit +0 -0
- data/test/data/encrypted_object_stream.pdf +0 -0
- data/test/data/firststream +1 -0
- data/test/data/index.pdfobj +0 -0
- data/test/data/index_2bit.pdfobj +0 -0
- data/test/data/index_masked.pdfobj +0 -0
- data/test/data/indexed.pdfobj +0 -0
- data/test/data/indexed_2bit.pdfobj +0 -0
- data/test/data/indexed_masked.pdfobj +0 -0
- data/test/data/inline.png +0 -0
- data/test/data/logo.png +0 -0
- data/test/data/lzw.pdfobj +0 -0
- data/test/data/lzw_index.pdfobj +0 -0
- data/test/data/page_tree.pdf +148 -0
- data/test/data/pdf_20.png +0 -0
- data/test/data/pdf_21.png +0 -0
- data/test/data/pdf_22.png +0 -0
- data/test/data/pdf_50.png +0 -0
- data/test/data/png.pdfobj +0 -0
- data/test/data/space_bug_stream.txt +119 -0
- data/test/data/stream.txt +292 -0
- data/test/data/stream_kerning_bug.txt +13 -0
- data/test/data/stream_kerning_bug2.txt +6 -0
- data/test/data/test.pdf +0 -0
- data/test/data/test.txt +8 -0
- data/test/data/test_text.txt +42 -0
- data/test/data/working_obj +0 -0
- data/test/data/working_obj2 +0 -0
- data/test/mock.rb +149 -0
- data/test/suite.rb +30 -0
- data/test/test_pdf_object.rb +1802 -0
- data/test/test_pdf_parser.rb +1340 -0
- data/test/test_pdf_text.rb +789 -0
- data/test/test_space_bug_05_2004.rb +87 -0
- data/test/test_stream.rb +194 -0
- data/test/test_text_state.rb +315 -0
- data/usage-en.txt +112 -0
- data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
- data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
- metadata +220 -0
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
|
|
2
|
+
# Render a parse stack as a DOT node label: "<state> (<char position>)".
# When includeShape is true the label is quoted (via String#inspect) and a
# box shape attribute is appended, ready for use as a DOT node declaration.
def stack_to_node(stack, includeShape = false)
  label = "#{stack.state} (#{stack.lexer.position.char_position})"
  return label unless includeShape
  label.inspect + " [shape=box]"
end
|
|
9
|
+
|
|
10
|
+
# Recursively collect DOT fragments for +stack+ and everything reachable
# through its links. +nodes+ maps stack objects to DOT node declarations;
# +links+ maps link objects to DOT edge declarations (labelled with the
# compact inspect of the link's parse tree). Returns [nodes, links].
def parsestack_as_dot_digraph(stack, nodes = Hash.new, links = Hash.new)
  nodes[stack] = stack_to_node(stack, true)
  stack.links.each do |link|
    nodes, links = parsestack_as_dot_digraph(link.stack, nodes, links)
    source = stack_to_node(stack).inspect
    target = stack_to_node(link.stack).inspect
    label = link.tree.inspect_compact.inspect
    links[link] = "#{source} -> #{target} [label=#{label}]"
  end
  return nodes, links
end
|
|
18
|
+
|
|
19
|
+
# Build a complete DOT digraph covering every stack in +stacks+ (their
# node/link sets are merged, duplicates removed). Returns the DOT source
# as a String.
def parsestacks_as_dot_digraph(stacks)
  nodes, links = {}, {}
  stacks.each do |stack|
    nodes, links = parsestack_as_dot_digraph(stack, nodes, links)
  end
  declarations = [nodes.values.uniq.join("\n"), links.values.uniq.join("\n")]
  "digraph G {\nsize=\"8,11\"\n" + declarations.join("\n") + "\n}"
end
|
|
29
|
+
|
|
30
|
+
# Base class for printers that walk a tree and emit a Graphviz DOT digraph.
# Subclasses must implement eval_to_dot(ast, parent) and fill @nodes and
# @links (hashes mapping objects to DOT fragment strings).
class DotGraphPrinter
  # size:        DOT "size" attribute, e.g. "11,9" (inches).
  # orientation: DOT "orientation" attribute, e.g. "landscape".
  def initialize(size = "11,9", orientation = "landscape")
    @size, @orientation = size, orientation
  end

  # Walk +ast+ (via the subclass's eval_to_dot), then return the complete
  # DOT digraph source as a String.
  def to_graph(ast)
    @nodes, @links = Hash.new, Hash.new
    eval_to_dot(ast, nil)
    to_graph_from_nodes_and_links(@nodes, @links)
  end

  protected

  # Assemble DOT source from the given node and link fragment hashes.
  # Fix: use the +nodes+/+links+ parameters. The original read @nodes and
  # @links and silently ignored its parameters; behavior is unchanged for
  # to_graph (which passes those same ivars) but the method now works when
  # called with any other node/link sets.
  def to_graph_from_nodes_and_links(nodes, links)
    "digraph G {\n" +
      "size = #{@size.inspect}\n" +
      "orientation = #{@orientation}\n" +
      nodes.values.uniq.join("\n") + "\n" +
      links.values.uniq.join("\n") + "\n" +
      "}"
  end
end
|
|
52
|
+
|
|
53
|
+
# Renders a SyntaxTree as a Graphviz DOT digraph. Fills the @nodes/@links
# hashes declared by DotGraphPrinter#to_graph while recursing over the tree.
class SyntaxTreeAsDotGraph < DotGraphPrinter
  # Record a DOT edge from +parent+ to +child+ in @links, keyed by the
  # [parent, child] pair. Optional +label+/+weight+ are appended as DOT
  # edge attributes. No-op when +parent+ is nil (the root node).
  # NOTE(review): relies on +parent.id+/+child.id+ — on Ruby 1.8 Object#id
  # was an alias for object_id; verify the node classes define #id before
  # running on a modern Ruby.
  def add_parent_link(parent, child, label = nil, weight = nil)
    if parent
      @links[[parent, child]] =
        "#{parent.id} -> #{child.id}"
      if label or weight
        @links[[parent, child]] += " [" +
          (label ? "label=#{label.inspect}" : "") +
          ((label and weight) ? "," : "") +
          (weight ? "weight=#{weight.inspect}" : "") +
          "]"
      end
    end
  end

  # Recursive DOT emitter: adds a node declaration for +ast+ and an edge
  # from +parent+, then recurses into the children. Handles three shapes:
  # "_ArrayNode" trees, ordinary SyntaxTree nodes (with token nodes shown
  # by their lexeme only), and plain Ruby Arrays.
  def eval_to_dot(ast, parent = nil, linkname = nil, weight = nil)
    if ast
      if ast.kind_of?(SyntaxTree)
        case ast.name
        when "_ArrayNode"
          # Array-valued node: label it "[]" and link each child by index.
          add_parent_link(parent, ast, linkname, weight)
          @nodes[ast] = "#{ast.id} [label=" + '"[]"]'
          ast.each_with_index {|c,i| eval_to_dot(c, ast, i.inspect)}
        else
          # NOTE(review): empty conditional — dead code, presumably left
          # over from an earlier edit.
          if parent
          end
          # Special handling of Token nodes since we only want to print
          # the lexeme
          if ast.children_names.sort == ["lexeme", "value"].sort
            @nodes[ast] = "#{ast.id} [shape=box,label=#{ast.lexeme.inspect}]"
            add_parent_link(parent, ast, linkname, weight)
          else
            # Interior node: label with the production name and recurse
            # into children, using each child's name as the edge label.
            add_parent_link(parent, ast, linkname, weight)
            @nodes[ast] = "#{ast.id} [label=#{ast.name.inspect}]"
            ast.childrens.each_with_index {|c,i|
              eval_to_dot(c, ast, ast.children_names[i])
            }
          end
        end
      elsif ast.class == Array
        # Or nodes return array but they should return ArrayNodes...
        add_parent_link(parent, ast, linkname, weight)
        @nodes[ast] = "#{ast.id} [label=\"[]\"]"
        ast.each_with_index {|c,i| eval_to_dot(c, ast, i)}
      end
    end
  end
end
|
|
101
|
+
|
|
102
|
+
# Convenience wrapper: render +syntaxtree+ as DOT digraph source using a
# fresh SyntaxTreeAsDotGraph printer.
def syntaxtree_as_dot_digraph(syntaxtree)
  printer = SyntaxTreeAsDotGraph.new
  printer.to_graph(syntaxtree)
end
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# Formats arbitrary node/link collections as a Graphviz DOT digraph.
# Shapes and labels are produced by pluggable procs so callers can adapt
# the output to their own node types.
class DotGraphFormatter
  @@default_node_shaper = proc{|n| "box"}
  @@default_node_labeler = proc{|n| n.inspect}
  @@default_link_labeler = proc{|info| info ? info.inspect : nil}

  # nodeShaper:  proc(node)  -> DOT shape name (default: "box").
  # nodeLabeler: proc(node)  -> node label (default: node.inspect).
  # linkLabeler: proc(info)  -> edge label or nil (default: info.inspect).
  # size/orientation: DOT graph attributes.
  def initialize(nodeShaper = nil, nodeLabeler = nil, linkLabeler = nil,
                 size = "11,9", orientation = "landscape")
    @node_shaper = nodeShaper || @@default_node_shaper
    @node_labeler = nodeLabeler || @@default_node_labeler
    @link_labeler = linkLabeler || @@default_link_labeler
    @size, @orientation = size, orientation
  end

  # nodes is array of node objects
  # links is either array of
  #   arrays [fromNode, toNode [, infoOnLink]], or
  #   objects with attributes :from, :to, :info
  # Returns a DotGraph wrapping the generated DOT source.
  def format(nodes, links)
    DotGraph.new("digraph G {\n" +
      "size = #{@size.inspect}\n" +
      "orientation = #{@orientation}\n" +
      nodes.uniq.map {|n| format_node(n)}.join("\n") + "\n" +
      links.uniq.map {|l| format_link(l)}.join("\n") + "\n" +
      "}"
    )
  end

  protected

  # DOT declaration for a single node: quoted id plus shape/label.
  def format_node(node)
    node.id.inspect + " [" +
      "shape=" + @node_shaper.call(node).inspect + ", " +
      "label=" + @node_labeler.call(node).inspect + "]"
  end

  # Normalize a link to [from, to, info]. Accepts either an object with
  # from/to/info readers or an indexable triple.
  # Fix: the original used `rescue Exception` to fall back to indexing,
  # which also swallowed SignalException/SystemExit and hid real errors
  # raised inside the readers. Dispatch on respond_to? instead.
  def get_link_data(link)
    if link.respond_to?(:from) && link.respond_to?(:to) && link.respond_to?(:info)
      return link.from, link.to, link.info
    else
      return link[0], link[1], link[2]
    end
  end

  # DOT declaration for a single edge, with an optional label attribute.
  def format_link(link)
    from, to, info = get_link_data(link)
    label = @link_labeler.call(info)
    from.id.inspect + " -> " + to.id.inspect +
      (label ? " [label=" + label.inspect + "]" : "")
  end
end
|
|
51
|
+
|
|
52
|
+
# Value object holding generated DOT source in +description+.
DotGraph = Struct.new("DotGraph", :description)
class DotGraph
  # Write the DOT source to a unique temp file next to +filename+, run
  # Graphviz `dot` to produce PostScript at +filename+, then remove the
  # temp file (even if `dot` fails).
  # Fix: the output path interpolation in the `dot` command was broken
  # ("#(unknown)"); it must be "#{filename}" so -o receives the target.
  # NOTE(review): +filename+ is interpolated into a shell command — only
  # pass trusted paths.
  def write_to_file(filename)
    tmpfile = filename + rand(100000).inspect
    while test(?f, tmpfile)
      tmpfile = filename + rand(100000).inspect
    end
    File.open(tmpfile, "w") {|f| f.write description}
    begin
      system "dot -Tps -o #{filename} #{tmpfile}"
    ensure
      File.delete(tmpfile)
    end
  end
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Mixin for objects created by an IndexableFactory: gives each instance a
# factory-assigned sequence number and a back reference to its factory.
module Indexable
  attr_accessor :index_number # attr_reader_once_write instead?
  attr_accessor :factory
end
|
|
5
|
+
|
|
6
|
+
# Creates and interns instances of an Indexable class: identical
# constructor arguments always yield the same cached object, and every
# newly built object is stamped with a consecutive index_number starting
# at the factory's start index.
class IndexableFactory
  attr_reader :instances, :start_index, :next_index

  # +klass+ must mix in Indexable (ArgumentError otherwise);
  # +startIndex+ seeds the numbering sequence.
  def initialize(klass, startIndex = 0)
    unless klass.ancestors.include?(Indexable)
      raise ArgumentError, "#{klass.inspect} is not Indexable"
    end
    @klass, @start_index, @next_index = klass, startIndex, startIndex
    @instance_map, @instances = Hash.new, Array.new
  end

  # Return the interned instance for +args+, building and numbering a new
  # one on first request.
  def make(*args)
    cached = @instance_map[args]
    return cached if cached
    built = make_new_obj(args)
    @instance_map[args] = built
    @instances.push built
    built
  end

  # Like make, but also reports whether a fresh instance was created.
  # Returns [instance, was_new].
  def make_unless_exists(*args)
    was_new = @instance_map[args].nil?
    return make(*args), was_new
  end

  # Cached instance for +args+, or nil if none has been made yet.
  def get_instance(*args)
    @instance_map[args]
  end

  # Cached instance for +args+, creating one on demand.
  def instance_with_args(*args)
    @instance_map[args] || (@instance_map[args] = make(*args))
  end

  protected

  # Instantiate @klass and stamp it with the next index and this factory.
  def make_new_obj(arguments)
    instance = @klass.new(*arguments)
    instance.index_number = advance_index_number
    instance.factory = self
    instance
  end

  # Return the current index value and bump the counter.
  def advance_index_number
    current, @next_index = @next_index, @next_index + 1
    current
  end
end
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
require 'rpdf2txt-rockit/grammar'
|
|
2
|
+
require 'rpdf2txt-rockit/parse_table'
|
|
3
|
+
require 'rpdf2txt-rockit/parsetable_generation'
|
|
4
|
+
require 'rpdf2txt-rockit/reduce_actions_generator'
|
|
5
|
+
|
|
6
|
+
require 'rpdf2txt-rockit/profiler'
|
|
7
|
+
|
|
8
|
+
module Parse
|
|
9
|
+
# Graph of LR parser states. Besides the plain directed-graph behavior
# inherited from BackLinkedDirectedGraph it tracks the start state and
# partitions reduce states into consistent (single action) and
# inconsistent (conflicting) sets as nodes are added.
class StateGraph < BackLinkedDirectedGraph
  attr_reader :start_state
  attr_reader :consistent_reduce_states, :inconsistent_reduce_states

  # startState becomes the graph's entry state; remaining args are
  # forwarded to the superclass constructor.
  def initialize(startState, *args)
    super(*args)
    @start_state = startState
    @consistent_reduce_states = Array.new
    @inconsistent_reduce_states = Array.new
  end

  # Add a state node; reduce states are additionally bucketed by whether
  # they are consistent (duplicates are not recorded twice).
  def add_node(state)
    super(state)
    if state.reduce_state?
      if state.consistent?
        a = @consistent_reduce_states
      else
        a = @inconsistent_reduce_states
      end
      a.push state unless a.include?(state)
    end
  end

  # Node labelers for graph drawing: with or without each state's kernel
  # items in the label.
  @@node_labeler_with_kernels = proc{|state|
    "S" + state.index_number.inspect + ": " + state.kernel_items.inspect
  }

  @@node_labeler = proc{|state|
    "S" + state.index_number.inspect
  }

  # Render the state graph to PostScript (via the superclass), labeling
  # states either with or without their kernel items.
  def to_postscript_file(filename, withKernelItems = true)
    super(filename, nil,
          withKernelItems ? @@node_labeler_with_kernels : @@node_labeler )
  end
end
|
|
45
|
+
|
|
46
|
+
# Generates a LALR(1) parse table for a grammar: builds the LR(0) state
# graph (with gotos and shift actions), then delegates lookahead
# computation and reduce-action insertion to a pluggable calculator
# (ReduceActionsGenerator by default).
class LaLr1ParseTableGenerator
  # grammar:                   the Grammar to build a table for.
  # lookaheadCalculatorKlass:  class used to compute lookaheads and add
  #                            reduce actions in generate_parse_table.
  def initialize(grammar,
                 lookaheadCalculatorKlass =
                 ReduceActionsGenerator)
    @grammar, @lookahead_calculator_klass = grammar, lookaheadCalculatorKlass
  end

  # Build and return the parse table: augment and normalize the grammar,
  # compute the LR(0) state graph (which fills in gotos/shift actions),
  # then run the lookahead calculator to add reduce actions.
  # Progress for each phase is reported via time_and_puts.
  def generate_parse_table(parseTableKlass = ParseTable)
    @grammar.augment
    #puts "\n NoN = #{@grammar.productions.map{|p| p.nonterminal}.uniq.length}"
    #puts " NoP = #{@grammar.productions.length}"
    time_and_puts("\n Normalizing grammar") {
      @grammar.normalize
    }
    #puts " NoPN = #{@grammar.productions.length}"
    state_graph = nil
    time_and_puts(" Calculating states") {
      @item_factory = IndexableFactory.new(Item, 0)
      @state_factory = IndexableFactory.new(LrState, 0)
      precalc_nonkernel_items_for_nonterminals
      @parse_table = parseTableKlass.new_from_grammar(@grammar)
      state_graph = calculate_state_graph # also adds gotos and shift actions
    }
    time_and_puts(" Calculating lalr1_lookaheads") {
      lookahead_alg =
        @lookahead_calculator_klass.new(state_graph, @grammar, @parse_table,
                                        @item_factory.instances)
      lookahead_alg.add_reduce_actions
    }
    @parse_table
  end

  # Debugging aid: print, for each nonterminal name, the distinct object
  # ids used for it across all productions (should be one per name if
  # nonterminals are properly interned).
  def test_nonterminal_uniqueness
    nts = DefaultInitHash.new {|k| Array.new}
    @grammar.productions.each do |production|
      nts[production.nonterminal.name].push(production.nonterminal.id)
      nts[production.nonterminal.name].uniq!
      production.elements.each do |e|
        if e.nonterminal?
          nts[e.name].push(e.id)
          nts[e.name].uniq!
        end
      end
    end
    puts nts.inspect
  end

  protected

  # Precompute each nonterminal's nonkernel items once, so state closure
  # computation does not have to rederive them.
  def precalc_nonkernel_items_for_nonterminals
    @grammar.nonterminals.each do |nt|
      nt.calc_nonkernel_items(@grammar, @item_factory)
    end
  end

  # Intern a state by its kernel item set; returns [state, was_new].
  def add_state_unless_exists(kernel_items)
    @state_factory.make_unless_exists(kernel_items)
  end

  # Calculate the state graph by constructing the sets-of-lr0-items
  # collection.
  # See page 224 (basic algorithm) and 240 (representing the states by
  # their kernel items) in the Dragon book
  def calculate_state_graph
    Profiler.__enter__(:calculate_state_graph) if $PROFILE
    # Start from the item [S' -> . S, position 0] of the augmented grammar.
    start_item = @item_factory.make(@grammar.productions[0], 0)
    state_graph = StateGraph.new(add_state_unless_exists([start_item]).first)
    # Worklist loop: +states+ grows as new states are discovered; +current+
    # indexes the next unprocessed state.
    states, current = [state_graph.start_state], 0
    dest_sets, next_item = DefaultInitHash.new {|k| Array.new}, nil
    while current < states.length
      state = states[current]
      dest_sets.clear
      # Group the successor items of this state's closure by the grammar
      # symbol consumed to reach them.
      state.closure.each do |item|
        symbol = item.symbol
        if symbol
          next_item = item.next_item
          dest_sets[symbol].push(next_item) if next_item
        end
      end
      # Each symbol's kernel item set determines (or creates) the
      # destination state; record the transition as a goto (nonterminal)
      # or a SHIFT action (terminal).
      dest_sets.each do |symbol, kernel_item_set|
        kernel_item_set.uniq! # Needed?
        dest_state, new_state = add_state_unless_exists(kernel_item_set)
        states.push(dest_state) if new_state
        state_graph.add_link(state, dest_state, symbol)
        if symbol.nonterminal?
          @parse_table.add_goto(state.index_number, symbol,
                                dest_state.index_number)
        else
          @parse_table.add_action(state.index_number, symbol,
                                  [:SHIFT, dest_state.index_number])
        end
      end
      current += 1
    end
    Profiler.__leave__(:calculate_state_graph, state_graph) if $PROFILE
    state_graph
  end
end
|
|
144
|
+
end
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
require 'rpdf2txt-rockit/sourcecode_dumpable'
|
|
2
|
+
require 'rpdf2txt-rockit/grammar'
|
|
3
|
+
require 'rpdf2txt-rockit/base_extensions'
|
|
4
|
+
|
|
5
|
+
require 'rpdf2txt-rockit/profiler'
|
|
6
|
+
|
|
7
|
+
# LALR parse table: maps (state, terminal) pairs to parser actions and
# (state, nonterminal) pairs to goto states. Actions are stored in a
# compact numeric encoding (see comment above @@default_action_map).
class ParseTable
  include SourceCodeDumpable
  attr_reader :start_state, :tokens, :priorities
  attr_accessor :language

  # Build an empty table sized for +aGrammar+'s productions/tokens/
  # priorities, naming it after the grammar.
  def ParseTable.new_from_grammar(aGrammar)
    pt = self.new(aGrammar.productions, aGrammar.tokens, aGrammar.priorities)
    pt.language = aGrammar.name || "UNNAMED_LANGUAGE"
    pt
  end

  # We save the actions in a compact numerical way to save space and time:
  # The action table is an array of arrays. Each state number is an index
  # into the array and its array contains an even number of integers.
  # Each pair of integers represent one unique action. The first of the
  # integers is the action and the second is the number representing the
  # terminals for which it apply. The least significant 'action_bits' bits
  # of the action number determines the type of action by giving an index
  # into the 'action_map'. Its default value is:
  #  [:REDUCE, :SHIFT, :ACCEPT]
  # so that
  @@default_action_map = [:REDUCE, :SHIFT, :ACCEPT]

  # productions/tokens/priorities describe the grammar; actionTable and
  # gotoHash allow reloading a previously dumped table; actionBits and
  # actionMap control the numeric action encoding.
  def initialize(productions, tokens, priorities = nil,
                 actionTable = ArrayOfArrays.new, gotoHash = Hash.new,
                 actionBits = 2, actionMap = @@default_action_map)
    @productions, @start_state, @language = productions, 0, "UNNAMED_LANGUAGE"
    @priorities = priorities
    @tokens, @nonterminals = tokens, nonterminals(productions)
    @action_table, @goto_hash = actionTable, gotoHash
    @action_cache = ArrayOfHashes.new
    @mask = Array.new
    # Build a bit mask with +actionBits+ low bits set, used to separate
    # the action type from its argument in an encoded action number.
    @action_map, @action_bits, @action_mask = actionMap, actionBits, 0
    while actionBits > 0
      @action_mask = (@action_mask << 1) | 1
      actionBits -= 1
    end
    init_productionnum_to_nonterminal_number_hash
    init_tokentype_to_token_number_hash
  end

  # Number of states currently in the table.
  def num_states
    @action_table.length
  end

  def ==(other)
    other.class == self.class and
      other.productions == @productions and
      other.tokens == @tokens and
      other.action_table == @action_table and
      other.goto_hash == @goto_hash and
      other.start_state == @start_state
  end

  # Record +action+ (e.g. [:SHIFT, n]) for +aTokenType+ in +state+,
  # invalidating that state's action cache.
  def add_action(state, aTokenType, action)
    Profiler.__enter__(:ParseTable_add_action, state, aTokenType, action) if $PROFILE
    @action_cache[state].clear
    actionnum = action_to_actionnum(action)
    @action_table[state] << actionnum << token_to_terminalset(aTokenType)
    Profiler.__leave__(:ParseTable_add_action) if $PROFILE
  end

  # Record +action+ for a whole set of terminals at once (terminalSet is
  # the bit-set representation).
  def add_action_for_terminalset(state, action, terminalSet)
    Profiler.__enter__(:ParseTable_add_action_for_terminalset, state, action, terminalSet) if $PROFILE
    @action_table[state] << action_to_actionnum(action) << terminalSet.to_i
    Profiler.__leave__(:ParseTable_add_action_for_terminalset) if $PROFILE
  end

  # Unify terminal sets for identical actions
  def compact!
    Profiler.__enter__(:ParseTable_compact!) if $PROFILE
    actions, i, new_index = Hash.new, 0, 0
    @action_table.map! do |actionnums|
      actions.clear;
      i, new_actionnums, new_index = 0, Array.new, 0
      while i < actionnums.length
        if (index = actions[actionnums[i]])
          # Same encoded action seen before in this state: OR the terminal
          # sets together instead of storing a duplicate pair.
          new_actionnums[index+1] |= actionnums[i+1]
        else
          actions[actionnums[i]] = new_index
          new_index += 2
          new_actionnums << actionnums[i] << actionnums[i+1]
        end
        i += 2
      end
      new_actionnums
    end
    Profiler.__leave__(:ParseTable_compact!) if $PROFILE
  end

  # Bit-set with only +aTokenType+'s bit set.
  def token_to_terminalset(aTokenType)
    mask(@token_to_number[aTokenType])
  end

  # Memoized single-bit mask for token number +index+.
  def mask(index)
    @mask[index] || (@mask[index] = (1 << index))
  end

  # Record a goto transition; lazily creates the per-state hash on first
  # use (nil[...] raises NoMethodError, a NameError subclass).
  def add_goto(state, aNonTerminal, newState)
    begin
      @goto_hash[state][@nonterminals.index(aNonTerminal)] = newState
    rescue NameError
      @goto_hash[state] = Hash.new
      retry
    end
  end

  # All actions applicable in +state+ on +tokenType+ (cached per state).
  def actions(state, tokenType)
    actions = @action_cache[state][tokenType]
    unless actions
      actions = Array.new
      actionnums = @action_table[state]
      token_mask = mask(@token_to_number[tokenType])
      i = 0
      while i < actionnums.length
        if(actionnums[i+1] & token_mask > 0)
          actions.push actionnum_to_action(actionnums[i])
        end
        i += 2
      end
      @action_cache[state][tokenType] = actions
    end
    actions
  end

  # Tokens for which +state+ has at least one action.
  def valid_tokens(state)
    terminal_set = 0
    each_terminalset(state) {|ts| terminal_set |= ts}
    terminalset_to_terminals(terminal_set)
  end

  # Yield each terminal bit-set stored for +state+ (the odd-indexed
  # entries of the state's action array).
  def each_terminalset(state)
    @action_table[state].each_with_index {|e,i| yield(e) if i % 2 == 1}
  end

  # Expand a terminal bit-set back into token objects.
  def terminalset_to_terminals(terminalSet)
    @tokens.select {|t| terminalSet & mask(@token_to_number[t]) > 0}
  end

  # Decode an encoded action number into [type, argument].
  def actionnum_to_action(actionNumber)
    [@action_map[actionNumber & @action_mask], actionNumber >> @action_bits]
  end

  # Encode [type, argument] into a single action number.
  def action_to_actionnum(action)
    Profiler.__enter__(:ParseTable_action_to_actionnum, action) if $PROFILE
    res = @action_map.index(action[0]) + (action[1] << @action_bits)
    Profiler.__leave__(:ParseTable_action_to_actionnum) if $PROFILE
    res
  end

  # Goto state after reducing by +productionNumber+ in +state+, or nil if
  # no transition is recorded.
  # Fix: the failure being guarded is @goto_hash[state] == nil, whose
  # indexing raises NoMethodError; rescuing Exception also swallowed
  # SignalException/SystemExit and masked unrelated bugs, so the rescue
  # has been narrowed to the error actually expected.
  def goto(state, productionNumber)
    begin
      @goto_hash[state][@productionnum_to_nonterminal_num[productionNumber]]
    rescue NoMethodError
      nil
    end
  end

  # Production object for a production number.
  def production(number)
    @productions[number]
  end

  # Dump the table as Ruby source that reconstructs it when evaluated
  # (via SourceCodeDumpable helpers).
  def to_src(name = "parse_table", nameHash = {})
    names = name_hash(@tokens) {|t| "t"}
    str = @tokens.to_src("tokens", names) + "\n"
    names.update(name_hash(@productions) {|p| "p"})
    str << @productions.to_src("productions", names) + "\n"
    str << @priorities.to_src("priorities", names) + "\n"
    #str << "r = :REDUCE\n"
    #str << "s = :SHIFT\n"
    str << @action_table.to_compact_src("action_table") + "\n"
    str << @goto_hash.to_compact_src("goto_hash") + "\n"
    str << assign_to(name,
                     new_of_my_type(as_code("productions"),
                                    as_code("tokens"),
                                    as_code("priorities"),
                                    as_code("action_table"),
                                    as_code("goto_hash"),
                                    @action_bits,
                                    @action_map))
    str
  end

  # Human-readable dump: one row per state, one column per token
  # (actions) then per nonterminal (gotos).
  def inspect
    str = "ParseTable\n"
    str += "Tokens: #{@tokens.inspect}\n"
    str += "NonTerminals: #{@nonterminals.inspect}\n"
    str += "Productions:\n#{productions_inspect}\n"
    str += "Actions: \n"
    max_state = @action_table.length-1
    (max_state+1).times do |state|
      str += "#{state}:\t"
      @tokens.each do |t|
        str += inspect_actions(actions(state, t)) + ","
      end
      str += "| "
      @nonterminals.each do |nt|
        i = @productions.index(@productions.detect {|p| p.nonterminal == nt})
        str += ((ns=goto(state, i)) ? "#{ns}" : " ") + ","
      end
      str += "\n"
    end
    str
  end

  protected

  # Numbered listing of all productions.
  def productions_inspect
    str = ""
    @productions.each_with_index do |production, i|
      str += " #{i}: #{production.inspect}\n"
    end
    str
  end

  # Compact cell text for an action list: "a" accept, "sN" shift,
  # "rN" reduce, bracketed list for conflicts, spaces when empty.
  def inspect_actions(actions)
    unless actions
      "   "
    else
      return "   " if actions.length == 0
      if actions.length > 1
        "[" + actions.map {|a| inspect_actions([a])}.join(',') + "]"
      else
        case actions[0][0]
        when :ACCEPT
          " a "
        when :SHIFT
          "s#{actions[0][1]} "
        when :REDUCE
          "r#{actions[0][1]} "
        end
      end
    end
  end

  attr_reader :productions, :action_table, :goto_hash

  # Distinct nonterminals appearing on the left-hand sides.
  def nonterminals(anArrayOfProductions)
    anArrayOfProductions.map {|p| p.nonterminal}.equality_uniq
  end

  # Map production index -> nonterminal index.
  def init_nonterminal_index(nonterminals, productions)
    @nonterminal_index = Hash.new
    productions.each_with_index do |prod, i|
      @nonterminal_index[i] = nonterminals.index(prod.nonterminal)
    end
    @nonterminal_index
  end

  # Map token -> token index.
  def init_token_index(tokens)
    @token_index = Hash.new
    tokens.each_with_index {|t,i| @token_index[t] = i}
    @token_index
  end

  # Cache of production number -> nonterminal number (used by goto).
  def init_productionnum_to_nonterminal_number_hash
    @productionnum_to_nonterminal_num = Hash.new
    @productions.each_with_index do |p, n|
      @productionnum_to_nonterminal_num[n] = @nonterminals.index(p.nonterminal)
    end
  end

  # Cache of token type -> token number (bit position in terminal sets).
  def init_tokentype_to_token_number_hash
    @token_to_number = Hash.new
    @tokens.each_with_index {|t,i| @token_to_number[t] = i}
  end
end
|