rpdf2txt 0.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (127) hide show
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,164 @@
1
+ # Common classes used in many of the parse table generation classes.
2
+ #
3
+ require 'rpdf2txt-rockit/indexable'
4
+ require 'rpdf2txt-rockit/token'
5
+ require 'rpdf2txt-rockit/grammar'
6
+
7
# An item pairs a production with a position inside its elements and an
# optional lookahead. `symbol` is the element found at that position (nil when
# the position is past the last element).
class Item
  include Indexable
  attr_reader :symbol, :production, :position, :lookahead

  # Shared result for items that have no following symbols.
  @@empty_array = Array.new

  # nextItem is accepted for interface compatibility but is not used here.
  def initialize(production, position, lookahead = nil, nextItem = nil)
    @production = production
    @position = position
    @lookahead = lookahead
    @symbol = production.elements[position]
    @hash_value = [production, position, lookahead].hash
  end

  # Memoized result of calc_followers.
  def direct_following_symbols
    @direct_following_symbols ||= calc_followers
  end

  # Collects the elements after the current position, stopping after the first
  # one that does not derive epsilon (that element is included).
  def calc_followers
    return @@empty_array if position >= production.elements.length - 1
    collected = Array.new
    production.elements[position + 1..-1].each do |element|
      collected.push element
      break unless element.derives_epsilon?
    end
    collected
  end

  # All elements after the current position, or nil when there are none.
  def suffix
    return nil unless position < production.elements.length - 1
    production.elements[position + 1..-1]
  end

  # An item is final when no next item can be created from it.
  def final?
    next_item == nil
  end

  # Memoized result of calc_next_item.
  def next_item
    @next_item ||= calc_next_item
  end

  # Builds an item for the same production and position with the given
  # lookahead symbol.
  def lookahead_item(lookaheadSymbol)
    make_new_item(production, position, lookaheadSymbol, next_item)
  end

  # Renders the item as "Nonterminal->elements .elements" with the dot at the
  # current position.
  def inspect
    at_edge = (@position >= production.elements.length) || (@position == 0)
    head = production.nonterminal.inspect + "->"
    before_dot = inspect_elements(production.elements[0...@position])
    dot = at_edge ? "." : " ."
    head + before_dot + dot + inspect_elements(production.elements[@position..-1])
  end

  protected

  # Space separated inspect strings of the elements ("" for nil).
  def inspect_elements(elements)
    return "" unless elements
    elements.map { |element| element.inspect }.join(" ")
  end

  # Uses @factory when one has been set on this item, otherwise Item.new.
  def make_new_item(*args)
    return @factory.make(*args) if @factory
    Item.new(*args)
  end

  # The item with the position advanced by one, or nil when the position is
  # already past the last element. The lookahead is not carried over.
  def calc_next_item
    return nil unless position < production.elements.length
    make_new_item(production, position + 1)
  end
end
81
+
82
# A parser state built from a set of kernel items. The closure of the kernel
# items is computed eagerly in the constructor.
class LrState
  include Indexable
  attr_reader :kernel_items, :closure, :final_items

  def initialize(kernelItems)
    @kernel_items = kernelItems
    calc_closure(kernelItems) # Do it lazily instead?
    @final_items = kernelItems.select { |item| item.final? }
    @reduce_state = !@final_items.empty?
    @consistent = (kernelItems.length == 1)
  end

  # True when at least one kernel item is final.
  def reduce_state?
    @reduce_state
  end

  # True when the state has exactly one kernel item.
  def consistent?
    @consistent
  end

  # Number of kernel items.
  def length
    @kernel_items.length
  end

  def inspect
    "State(#{@kernel_items.inspect})"
  end

  protected

  @@empty_array = Array.new

  # Fills @closure with the kernel items plus the nonkernel items of every
  # symbol reachable from them.
  def calc_closure(itemset)
    @closure = itemset.clone
    seen = Hash.new
    itemset.each { |item| seen = recursive_calc_closure(item.symbol, seen) }
  end

  # Adds the nonkernel items of symbol (and, recursively, of the symbols those
  # items point at) to @closure. `checked` records the symbols already handled
  # and is returned.
  def recursive_calc_closure(symbol, checked)
    return checked if !symbol || checked[symbol]
    checked[symbol] = true
    added = symbol.nonkernel_items
    @closure.concat(added)
    added.each { |item| checked = recursive_calc_closure(item.symbol, checked) }
    checked
  end
end
127
+
128
+ # Decorate the grammar symbols with some additional info we're gonna need
129
# Reopens NonTerminal to add the bookkeeping used during parse table
# generation: nonkernel items and the derives-epsilon flag.
class NonTerminal
  include Indexable
  attr_reader :nonkernel_items
  attr_writer :derives_epsilon

  # Stores one item (at position 0) per alternative production of this
  # nonterminal, created through itemFactory.
  def calc_nonkernel_items(grammar, itemFactory)
    items = Array.new
    @nonkernel_items = items
    grammar.alternatives(self).each do |production|
      items.push itemFactory.make(production, 0)
    end
  end

  def nonterminal?
    true
  end

  def terminal?
    false
  end

  # Value last assigned through derives_epsilon= (nil until assigned).
  def derives_epsilon?
    @derives_epsilon
  end
end
146
+
147
# Reopens Token so terminals answer the same questions as nonterminals during
# parse table generation.
class Token
  include Indexable

  # Shared empty result returned by nonkernel_items.
  @@empty_array = []

  def nonterminal?
    false
  end

  def terminal?
    true
  end

  # Tokens contribute no nonkernel items.
  def nonkernel_items
    @@empty_array
  end

  def derives_epsilon?
    false
  end
end
160
+
161
+ # Decorate Productions
162
# Reopens Production only to mix in Indexable.
class Production
  include Indexable
end
@@ -0,0 +1,84 @@
1
+ require 'rpdf2txt-rockit/syntax_tree'
2
+
3
# Syntax tree node named "_Ambiguity" whose single child, "ambigous_trees",
# is the list of alternative trees.
class AmbiguityNode < SyntaxTree
  def initialize(alt1, alt2)
    @ambigous_trees = [alt1, alt2]
    super("_Ambiguity", ["ambigous_trees"], [@ambigous_trees])
  end

  # Appends tree to the alternatives unless it is already present.
  def add_ambigoustree(tree)
    return if @ambigous_trees.include?(tree)
    @ambigous_trees.push tree
  end
end
13
+
14
# Raised for an ambiguous parse. Carries the string being parsed, the full
# tree and the alternative trees, and can render a report of the alternatives.
#
# NOTE(review): this derives from Exception (not StandardError), so a bare
# `rescue` does not catch it. Left unchanged because callers may rely on it.
class AmbigousParseException < Exception
  attr_reader :alternatives, :substring

  # stringBeingParsed:: stored as +substring+
  # fullTree::          tree shown at the end of the report
  # alternativeTrees::  the competing parses
  def initialize(stringBeingParsed, fullTree, *alternativeTrees)
    super("Ambigous parse")
    @alternatives, @full_tree = alternativeTrees, fullTree
    init_substring(stringBeingParsed)
  end

  # Builds the report: one line per alternative, then, for alternatives whose
  # printed form is identical, a description of where the trees differ, then
  # the full tree. Note that this calls compact! on the full tree.
  def inspect(prettyPrinter = nil)
    str = "Ambiguity: The substring '#{substring}' can be parsed as:\n"
    strings_to_show, same_strings = Array.new, Array.new
    alternatives.each_with_index do |alt, i|
      s = prettyPrinter ? prettyPrinter.print(alt) : alt.inspect
      # Remember pairs that print identically; they need a structural diff.
      if (j = strings_to_show.index(s))
        same_strings.push [j, i]
      end
      strings_to_show.push s
    end
    alternatives.each_with_index do |alt, i|
      str << " Alternative #{i+1}: #{strings_to_show[i]}"
      str << ", or" if i < alternatives.length-1
      str << "\n"
    end
    same_strings.each do |i, j|
      str += report_on_tree_differences(alternatives[i], alternatives[j],
                                        i, j)
    end
    @full_tree.compact!
    str + "The full tree looks like:\n" + @full_tree.inspect
  end

  # Formats one difference. i and j are 0-based indices into alternatives and
  # are shown 1-based, matching the "Alternative N" lines of the report.
  def difference_description(i, j, str0, o1, o2, childPath = "")
    child_str =
      childPath.length > 0 ? "in the childrens '#{childPath[1..-1]}'" : ""
    " Alternatives #{i+1} and #{j+1} differ #{child_str} by not having" +
      " the same #{str0} (#{o1.inspect} and #{o2.inspect})"
  end

  # Returns a description of the first difference found between t1 and t2
  # (class, then name, then children names, then children recursively), or ""
  # when none is found. i and j are the 0-based indices of the alternatives.
  def report_on_tree_differences(t1, t2, i, j, childPath = "")
    if t1.class != t2.class
      difference_description(i, j, "type", t1.class, t2.class, childPath)
    elsif not t1.kind_of?(SyntaxTree)
      if t1 != t2
        # Fixed: show 1-based numbers like every other message in the report.
        " Alternatives #{i+1} and #{j+1} are not SyntaxTree's and differ"
      else
        ""
      end
    elsif t1.name != t2.name
      difference_description(i, j, "name", t1.name, t2.name, childPath)
    elsif t1.children_names != t2.children_names
      difference_description(i, j, "children_names", t1.children_names,
                             t2.children_names, childPath)
    else
      t1.childrens.each_with_index do |child, k|
        report = report_on_tree_differences(child, t2[k], i, j,
                                            childPath + "." +
                                            t1.children_names[k])
        return report if report.length > 0
      end
      return ""
    end
  end

  protected

  def init_substring(string)
    @substring = string
  end
end
@@ -0,0 +1,168 @@
1
# TimesClass is whichever class provides #times: Time before Ruby 1.7,
# Process afterwards. `defined?` is required: a bare reference to an undefined
# constant raises NameError instead of evaluating to false.
unless defined?(TimesClass)
  TimesClass = ((RUBY_VERSION < "1.7") ? Time : Process)
end

# Hand-instrumented profiler. Profiled methods call Profiler.__enter__ on entry
# and Profiler.__leave__ on exit; Profiler.profile_summary renders the report.
module Profiler
  @@start = TimesClass.times.utime

  # Method invocation stack with one entry for each invocation:
  # Time at entry, Total times in subfunction also being logged, MethodId
  @@invocation_stack = [[0, 0, "#toplevel".intern]]

  # One entry for each method: NumCalls, TotalTime, OnlyMyTime, Callers
  @@map = {"#toplevel".intern => [1, 0, 0]}

  # One entry for each method: hash mapping args.inspect to count
  @@arguments = Hash.new

  @@time_limit = 5 * 60

  # Restarts the clock. With a limit (in minutes) __enter__ terminates the
  # process once that much user CPU time has passed; without one there is no
  # limit.
  def start(timeLimitInMinutes = nil)
    @@time_limit = timeLimitInMinutes ? timeLimitInMinutes * 60 : nil
    @@start = Float(TimesClass.times.utime)
  end
  module_function :start

  # Records entry into +method+ with the given arguments. Returns the number
  # of times this exact argument list (by inspect) has been seen for method.
  def __enter__(method, *args)
    now = TimesClass.times.utime
    if @@time_limit and now - @@start > @@time_limit
      STDERR.puts "Profiling time limit violated. Run terminated."
      STDERR.puts Profiler.profile_summary(true, true)
      exit(-1)
    end
    @@invocation_stack.push [now, 0.0, method]
    # Lazily create the per-method counter. The previous rescue/retry could
    # loop forever if args.inspect itself raised.
    (@@arguments[method] ||= Hash.new(0))[args.inspect] += 1
  end
  module_function :__enter__

  # Records exit from +method+: pops the matching stack entry, updates call
  # count, total time and self time, and charges the elapsed time to the
  # enclosing invocation. Passes retargs through as the return value.
  def __leave__(method, *retargs)
    now = TimesClass.times.utime
    tick = @@invocation_stack.pop
    data = (@@map[method] ||= [0.0, 0.0, 0.0, Hash.new(0)])
    data[0] += 1
    total_time_this_invocation = now - tick[0]
    data[1] += total_time_this_invocation
    data[2] += total_time_this_invocation - tick[1]
    # caller[1] is nil when the profiled code has no caller frame above it;
    # use a placeholder so the summary can still print the call site.
    data[3][caller[1] || "(unknown call site)"] += 1
    @@invocation_stack[-1][1] += total_time_this_invocation
    return *retargs
  end
  module_function :__leave__

  # Go through the invocation stack and leave all methods.
  def unwind_invocation_stack
    while @@invocation_stack.length > 1
      # __leave__ pops the entry itself, so only peek here. Popping in both
      # places skipped every other entry and could empty the stack.
      __leave__(@@invocation_stack.last[2])
    end
  end
  module_function :unwind_invocation_stack

  # Returns the textual report. writeCallers adds the call sites per method,
  # writeArguments adds statistics on repeated argument lists. Any invocations
  # still open are closed first.
  def Profiler.profile_summary(writeCallers = false, writeArguments = false)
    total_elapsed = TimesClass.times.utime - @@start
    str = "Profiling summary\n"
    str += "*****************\n"
    str += "Total elapsed time: #{total_elapsed} seconds\n"
    unwind_invocation_stack if @@invocation_stack.length > 1
    total = @@invocation_stack.last[1]
    time_in_nonprofiled = total_elapsed - total
    str += "Time spent in non-profiled methods: #{time_in_nonprofiled} sec\n"
    str += "Time in profiled methods:\n"
    total = 0.01 if total == 0 # avoid dividing by zero below
    @@map["#toplevel".intern][1] = total
    data = @@map.to_a.sort{|a,b| b[1][2] <=> a[1][2]}
    sum = 0
    str += " %% cumulative self self total\n"
    str += " time seconds seconds calls ms/call ms/call name\n"
    str += " ---------------------------------------------------------\n"
    data.each do |method, d|
      next if method == "#toplevel".intern
      sum += d[2]
      str += "%6.2f %8.2f %8.2f %8d " % [d[2]/total*100, sum, d[2], d[0]]
      str += "%8.2f %8.2f %s\n" % [d[2]*1000/d[0], d[1]*1000/d[0],
                                   method.id2name]
      if writeCallers
        str += " Call sites:\n"
        d[3].to_a.sort {|a,b| b[1] <=> a[1]}.each do |callersite, count|
          str += " #{count}: " + callersite.split("/").last + "\n"
        end
      end
      if writeArguments and d[0] > 1
        str += " Arguments:\n"
        counts, num_prev_seen = Hash.new(0), 0
        @@arguments[method].to_a.sort {|a,b| b[1] <=> a[1]}.each do |args, cnt|
          counts[cnt] += cnt
          num_prev_seen += cnt if cnt > 1
        end
        proportion_prev_seen = num_prev_seen*100.0/d[0]
        proportion_unique = 100.0 - proportion_prev_seen
        str += " %3.2f%% (#{d[0].to_i - num_prev_seen}) of calls with unique args" % proportion_unique
        if proportion_unique != 100.0
          str += ", and\n"
          str += " %3.2f%% (#{num_prev_seen}) of calls with args that were used several times\n" % proportion_prev_seen
          str += " distr: #{counts.inspect}"
        end
        str += "\n"
      end
      str += "\n" if writeCallers or writeArguments
    end
    return str
  end
end
128
+
129
+ #############################################################################
130
+ # Simple test
131
+ #############################################################################
132
if __FILE__ == $0
  # Small accumulator class whose methods report to the Profiler by hand.
  class ComplexTest
    attr_reader :real, :imaginary

    def initialize(real, imaginary)
      Profiler.__enter__(:initialize, real, imaginary)
      @real = real
      @imaginary = imaginary
      Profiler.__leave__(:initialize, self)
    end

    # Profiled wrapper around real_add.
    def add(other)
      Profiler.__enter__(:add, other)
      real_add(other)
      Profiler.__leave__(:add, self)
    end

    # Adds other's parts to this object's parts in place.
    def real_add(other)
      Profiler.__enter__(:real_add, other)
      @real += other.real
      @imaginary += other.imaginary
      Profiler.__leave__(:real_add, self)
    end

    def inspect
      "#{real} + i*#{imaginary}"
    end
  end

  Profiler.start

  10.times do
    total = ComplexTest.new(rand, rand)
    100.times { total.add(ComplexTest.new(rand, rand)) }
    puts "It finished"
  end
  # Two constructions with identical arguments, so the argument statistics
  # have a repeated entry to report.
  c = ComplexTest.new(1,1)
  c = ComplexTest.new(1,1)
  puts Profiler.profile_summary(true, true)
end
@@ -0,0 +1,523 @@
1
+ # Find the reductions in a parse table for a StateGraph.
2
+ #
3
+ # There can be reductions in all states with kernel items that are final, ie.
4
+ # where the position cannot be advanced any further
5
+ # (example: State(3, [S -> R.])).
6
+ #
7
+ # For states with only one kernel item (which is final which is always the
8
+ # case if its only one) we should reduce for
9
+ # all terminals in the follow set of the nonterminal (Follow(S) in the
10
+ # example above). These states are called consistent reduce states.
11
+ #
12
+ # When there are multiple kernel items and one (or several) of them is a final
13
+ # item we should reduce for terminals in Follow(state, nonterminal). These
14
+ # states are called inconsistent reduce states. An example is
15
+ # State(5, [S -> L .= R, R -> L.]) where we dont know whether to reduce
16
+ # or shift. Here, we should reduce by R -> L for all terminals in Follow(5, R).
17
+ #
18
+ # To reduce the amount of computation needed we use a hybrid, lazy approach.
19
+ # We calculate all first and follow sets for all symbols in the grammar.
20
+ # With this we can find the reductions for consistent reduce states. For the
21
+ # inconsistent ones we build the first to follow relations "from the back"
22
+ # (ie. from the actual follow sets we need) up to the first sets. This way
23
+ # we don't need to calculate relations that are not relevant.
24
+ #
25
+ require 'rpdf2txt-rockit/directed_graph'
26
+ require 'rpdf2txt-rockit/base_extensions'
27
+ require 'rpdf2txt-rockit/parsetable_generation'
28
+
29
+ require 'rpdf2txt-rockit/profiler'
30
+
31
+ #$PROFILE = true
32
+
33
+ class ReduceActionsGenerator
34
+ def initialize(stateGraph, grammar, parseTable, allItems)
35
+ @state_graph, @grammar, @parsetable = stateGraph, grammar, parseTable
36
+ set_index_numbers
37
+ precalc_items_at_nonterminal(allItems)
38
+ @lalr_pair_factory = IndexableFactory.new(LaLrPair, 0)
39
+ init_traverse_cache
40
+ end
41
+
42
+ def add_reduce_actions
43
+ Profiler.start if $PROFILE
44
+ add_actions_for_consistent_states
45
+ add_actions_for_inconsistent_states
46
+ @parsetable.compact!
47
+ puts Profiler.profile_summary(true, true) if $PROFILE
48
+ @parsetable
49
+ end
50
+
51
+ protected
52
+
53
+ def init_traverse_cache
54
+ num_states = @state_graph.nodes.length
55
+ @traverse_cache = Array.new(num_states)
56
+ num_states.times {|i| @traverse_cache[i] = Hash.new}
57
+ end
58
+
59
+ def set_index_numbers
60
+ cnt = -1
61
+ @grammar.terminals.each {|t| t.index_number = (cnt += 1)}
62
+ @grammar.nonterminals.each {|nt| nt.index_number = (cnt += 1)}
63
+ @grammar.productions.each_with_index {|p,i| p.index_number = i}
64
+ end
65
+
66
+ def add_actions_for_consistent_states
67
+ Profiler.__enter__(:add_actions_for_consistent_states) if $PROFILE
68
+ crs = @state_graph.consistent_reduce_states
69
+ build_follow_sets(crs).each do |state|
70
+ state.final_items.each do |item|
71
+ prod = item.production
72
+ add_reduce_action_for_terminalset(state,
73
+ follow_set(prod.nonterminal), prod)
74
+ end
75
+ end
76
+ Profiler.__leave__(:add_actions_for_consistent_states) if $PROFILE
77
+ end
78
+
79
+ def build_follow_sets(states)
80
+ create_first_sets
81
+ precalc_epsilon_derivation
82
+ create_follow_sets
83
+ build_relations # Currently we build the full relations graphs
84
+ update_first_sets
85
+ update_follow_sets
86
+ states
87
+ end
88
+
89
+ def epsilon
90
+ return @epsilon if @epsilon
91
+ @epsilon = EpsilonToken.new
92
+ @epsilon.index_number = @grammar.terminals.map{|t| t.index_number}.max+1
93
+ @epsilon
94
+ end
95
+
96
+ def all_terminals
97
+ @all_terminals || (@all_terminals = @grammar.terminals + [epsilon])
98
+ end
99
+
100
+ def precalc_epsilon_derivation
101
+ @i2i_relations = DirectedGraph.new
102
+ @grammar.productions.each do |production|
103
+ first_element = production.elements.first
104
+ unless first_element
105
+ i2i_relation(@epsilon, production.nonterminal)
106
+ end
107
+ end
108
+ propagate_terminal_sets([@epsilon], @i2i_relations) {|n| first_set(n)}
109
+ @grammar.nonterminals.each do |nt|
110
+ nt.derives_epsilon = first_set(nt).include?(@epsilon)
111
+ end
112
+ end
113
+
114
+ def update_first_sets
115
+ propagate_terminal_sets(@i2i_relations.roots, @i2i_relations) {|n| first_set(n)}
116
+ end
117
+
118
+ def update_follow_sets
119
+ @i2o_relations.each do |src, dests|
120
+ next unless dests
121
+ source_set = first_set(src)
122
+ dests.uniq.each {|d| follow_set(d).update source_set}
123
+ end
124
+ propagate_terminal_sets(@o2o_relations.roots, @o2o_relations) {|n| follow_set(n)}
125
+ end
126
+
127
+ def create_first_sets
128
+ nonterms = @grammar.nonterminals
129
+ ntfs = @nonterminal_first_sets = Array.new(nonterms.length)
130
+ nonterms.each {|nt| ntfs[nt.index_number] = new_terminal_set}
131
+ terms = @grammar.terminals
132
+ tfs = @terminal_first_sets = Array.new(nonterms.length)
133
+ terms.each {|t| tfs[t.index_number] = new_terminal_set(t)}
134
+ end
135
+
136
+ def create_follow_sets
137
+ nonterms = @grammar.nonterminals
138
+ fs = @follow_sets = Array.new(nonterms.length)
139
+ nonterms.each {|nt| fs[nt.index_number] = new_terminal_set}
140
+ end
141
+
142
+ def build_relations
143
+ @i2o_relations = Hash.new
144
+ @o2o_relations = DirectedGraph.new
145
+ @productions_ending_with = Array.new
146
+ i = elements = nonterminal = last_nonterm = len = update_first = nil
147
+ @grammar.productions.each do |production|
148
+ elements = production.elements
149
+ nonterminal = production.nonterminal
150
+ i, updating, len, last_nonterm = 0, true, elements.length, nil
151
+ while i < len
152
+ x = elements[i]
153
+ i2i_relation(x, nonterminal) if updating
154
+ i2o_relation(x, last_nonterm) if last_nonterm
155
+ last_nonterm = x.nonterminal? ? x : nil
156
+ updating = updating and x.nonterminal? and x.derives_epsilon?
157
+ i += 1
158
+ end
159
+ i, updating = len-1, true
160
+ while updating and i >= 0
161
+ x = elements[i]
162
+ if x.nonterminal?
163
+ o2o_relation(nonterminal, x)
164
+ update_productions_ending_with(x, production, elements[0...i])
165
+ end
166
+ updating = x.nonterminal? and x.derives_epsilon?
167
+ i -= 1
168
+ end
169
+ end
170
+ # Should be @grammar.original_start_symbol but this will always work
171
+ # since Follow(S) includes Follow(S'). It will take care of the accept.
172
+ i2o_relation(@grammar.eof_terminal, @grammar.start_symbol)
173
+ end
174
+
175
+ def i2i_relation(src, dest)
176
+ @i2i_relations.link_nodes(src, dest)
177
+ end
178
+
179
+ def o2o_relation(src, dest)
180
+ @o2o_relations.link_nodes(src, dest)
181
+ end
182
+
183
+ def i2o_relation(src, dest)
184
+ a = @i2o_relations[src]
185
+ if a
186
+ a.push dest
187
+ else
188
+ @i2o_relations[src] = [dest]
189
+ end
190
+ end
191
+
192
+ def new_terminal_set(terminal = nil)
193
+ if terminal
194
+ TerminalSet.new(all_terminals, [terminal])
195
+ else
196
+ TerminalSet.new(all_terminals)
197
+ end
198
+ end
199
+
200
+ def first_set(symbol)
201
+ if symbol.nonterminal?
202
+ @nonterminal_first_sets[symbol.index_number]
203
+ else
204
+ @terminal_first_sets[symbol.index_number]
205
+ end
206
+ end
207
+
208
+ def add_actions_for_inconsistent_states
209
+ Profiler.__enter__(:add_actions_for_inconsistent_states) if $PROFILE
210
+ irs = @state_graph.inconsistent_reduce_states
211
+ build_lalr_follow_sets(irs).each do |state, pair|
212
+ state.final_items.each do |item|
213
+ add_reduce_action_for_terminalset(state, lalr_follow_set(pair),
214
+ item.production)
215
+ end
216
+ end
217
+ Profiler.__leave__(:add_actions_for_inconsistent_states) if $PROFILE
218
+ end
219
+
220
+ def add_reduce_action_for_terminalset(state, terminalSet, production)
221
+ Profiler.__enter__(:add_reduce_action_for_terminalset, state, terminalSet, production) if $PROFILE
222
+ if production.nonterminal == @grammar.start_symbol
223
+ a = [:ACCEPT, 0]
224
+ else
225
+ a = [:REDUCE, production.index_number]
226
+ end
227
+ @parsetable.add_action_for_terminalset(state.index_number, a, terminalSet)
228
+ Profiler.__leave__(:add_reduce_action_for_terminalset) if $PROFILE
229
+ end
230
+
231
+ def propagate_terminal_sets(roots, graph = @relations, &getTerminalSet)
232
+ Profiler.__enter__(:propagate_terminal_sets, roots, graph) if $PROFILE
233
+ # Logothetis-Bermudez-style propagation. Does NOT exploit the fact that
234
+ # many propagations share a majority of paths. Exploit if speed needed!
235
+ # Note that this is not garantueed to work if there is a strongly connected
236
+ # component with more than two nodes. The intermediate nodes (only linked
237
+ # within the component) will not be updated after the backlink.
238
+ # Solve by doing a real reachability graph instead.
239
+ roots.each do |root|
240
+ graph.each_reachable_node_once_breadth_first(root) do |parent|
241
+ parent_fset = getTerminalSet.call(parent)
242
+ graph.children(parent).each do |child|
243
+ getTerminalSet.call(child).update parent_fset
244
+ end
245
+ end
246
+ end
247
+ Profiler.__leave__(:propagate_terminal_sets) if $PROFILE
248
+ end
249
+
250
+ def build_lalr_follow_sets(states)
251
+ Profiler.__enter__(:build_lalr_follow_sets) if $PROFILE
252
+ follow_sets_needed = Array.new
253
+ states.each do |state|
254
+ state.final_items.each do |item|
255
+ production = item.production
256
+ src_state = back_traverse(state, production.elements)
257
+ follow_sets_needed.push [state,
258
+ lalr_pair(src_state, production.nonterminal)]
259
+ end
260
+ end
261
+ build_o2o_lalr_relations
262
+ create_lalr_follow_sets
263
+ update_lalr_follow_sets_with_direct_followers
264
+ propagate_terminal_sets(@o2o_lalr_relations.roots, @o2o_lalr_relations) {|n| lalr_follow_set(n)}
265
+ Profiler.__leave__(:build_lalr_follow_sets) if $PROFILE
266
+ follow_sets_needed
267
+ end
268
+
269
+ def o2o_lalr_relation(src, dest)
270
+ @o2o_lalr_relations.link_nodes(src, dest)
271
+ end
272
+
273
+ LaLrPair = Struct.new("LaLrPair", :state, :nonterminal)
274
+ class LaLrPair
275
+ include Indexable
276
+ def inspect
277
+ "(#{state.index_number.inspect}, #{nonterminal.inspect})"
278
+ end
279
+ end
280
+
281
+ def lalr_pair(state, nonterminal)
282
+ @lalr_pair_factory.instance_with_args(state, nonterminal)
283
+ end
284
+
285
+ def precalc_items_at_nonterminal(allItems) # O(I)
286
+ @items_at_nonterminal = Array.new
287
+ allItems.each do |item|
288
+ symbol = item.symbol
289
+ if symbol and symbol.nonterminal?
290
+ update_items_at_nonterminal(symbol, item)
291
+ end
292
+ end
293
+ @items_at_nonterminal
294
+ end
295
+
296
+ def update_items_at_nonterminal(nonterminal, item)
297
+ i = nonterminal.index_number
298
+ if (a = @items_at_nonterminal[i])
299
+ a.push item
300
+ else
301
+ @items_at_nonterminal[i] = [item]
302
+ end
303
+ end
304
+
305
+ def items_at_nonterminal(nonterminal)
306
+ @items_at_nonterminal[nonterminal.index_number]
307
+ end
308
+
309
+ def create_lalr_follow_sets
310
+ Profiler.__enter__(:create_lalr_follow_sets) if $PROFILE
311
+ lalr_pairs = @lalr_pair_factory.instances
312
+ @lalr_follow_sets = Array.new(lalr_pairs.length)
313
+ lalr_pairs.each do |pair|
314
+ @lalr_follow_sets[pair.index_number] = new_terminal_set
315
+ end
316
+ Profiler.__leave__(:create_lalr_follow_sets) if $PROFILE
317
+ end
318
+
319
+ def lalr_follow_set(lalrPair)
320
+ @lalr_follow_sets[lalrPair.index_number]
321
+ end
322
+
323
+ def update_lalr_follow_sets_with_direct_followers
324
+ Profiler.__enter__(:update_lalr_follow_sets_with_direct_followers) if $PROFILE
325
+ @lalr_pair_factory.instances.each do |pair|
326
+ if pair.nonterminal == @grammar.start_symbol
327
+ lalr_follow_set(pair).update first_set(@grammar.eof_terminal)
328
+ end
329
+ items = items_at_nonterminal(pair.nonterminal)
330
+ next unless items
331
+ items.each do |item|
332
+ item.direct_following_symbols.each do |symbol|
333
+ lalr_follow_set(pair).update first_set(symbol)
334
+ end
335
+ end
336
+ end
337
+ Profiler.__leave__(:update_lalr_follow_sets_with_direct_followers) if $PROFILE
338
+ end
339
+
340
+ def productions_ending_with2(nonterminal, withLastPrefix = nil)
341
+ if withLastPrefix
342
+ @productions_ending_with[nonterminal.index_number][withLastPrefix.index_number]
343
+ else
344
+ @productions_ending_with_and_having_empty_prefix
345
+ end
346
+ end
347
+
348
+ def update_productions_ending_with2(nonterminal, production, prefix)
349
+ ntindex = nonterminal.index_number
350
+ pindex = prefix.last.index_number
351
+ new_element = [production, prefix]
352
+ if a = @productions_ending_with[ntindex]
353
+ if a2 = @productions_ending_with[ntindex][pindex]
354
+ a2.push(new_element) unless a2.include?(new_element)
355
+ else
356
+ @productions_ending_with[ntindex][pindex] = [new_element]
357
+ end
358
+ else
359
+ @productions_ending_with[ntindex] = Array.new
360
+ @productions_ending_with[ntindex][pindex] = [new_element]
361
+ end
362
+ end
363
+
364
+ def productions_ending_with(nonterminal)
365
+ @productions_ending_with[nonterminal.index_number]
366
+ end
367
+
368
+ def update_productions_ending_with(nonterminal, production, prefix)
369
+ ntindex = nonterminal.index_number
370
+ new_element = [production, prefix]
371
+ if a = @productions_ending_with[ntindex]
372
+ a.push(new_element) unless a.index(new_element)
373
+ else
374
+ @productions_ending_with[ntindex] = [new_element]
375
+ end
376
+ end
377
+
378
+ @@empty_array = Array.new
379
+
380
+ # We discard prefixes where the first symbol is not valid directly
381
+ # to save time in back_traverse.
382
# Returns the [production, prefix] entries recorded for the pair's
# nonterminal whose prefix is either empty or ends in a symbol found among
# the incoming links of the pair's state. When nothing is recorded for the
# nonterminal, the shared @@empty_array is returned.
def productions_with_valid_prefixes(pair)
  Profiler.__enter__(:productions_with_valid_prefixes) if $PROFILE
  candidates = productions_ending_with(pair.nonterminal)
  if candidates
    links = @state_graph.incoming_links_info(pair.state)
    valid = candidates.select do |_production, prefix|
      prefix.length == 0 || links.include?(prefix.last)
    end
  else
    valid = @@empty_array
  end
  Profiler.__leave__(:productions_with_valid_prefixes) if $PROFILE
  valid
end
396
+
397
# Rebuilds @o2o_lalr_relations from scratch. For every LALR pair and each of
# its productions with a valid prefix, the prefix is traversed backwards from
# the pair's state; when a source state is found, a relation is registered
# from the pair (source state, production's nonterminal) to the current pair.
def build_o2o_lalr_relations
  Profiler.__enter__(:build_o2o_lalr_relations) if $PROFILE
  @o2o_lalr_relations = DirectedGraph.new
  cursor = 0
  # The instance list is re-read and its length re-checked on every pass
  # rather than iterated once.
  # NOTE(review): presumably lalr_pair can register new pairs while we walk
  # the list -- confirm before turning this into a plain #each.
  until cursor >= @lalr_pair_factory.instances.length
    target = @lalr_pair_factory.instances[cursor]
    productions_with_valid_prefixes(target).each do |production, prefix|
      origin = back_traverse(target.state, prefix)
      next unless origin

      source = lalr_pair(origin, production.nonterminal)
      o2o_lalr_relation(source, target)
    end
    cursor += 1
  end
  Profiler.__leave__(:build_o2o_lalr_relations) if $PROFILE
end
416
+
417
# Cached wrapper around @state_graph.back_traverse. Results are memoized in
# @traverse_cache under the state's index_number and the +elements+ key.
# Returns nil when the graph raises GraphTraversalException; such failures
# are not stored in the cache.
def back_traverse(state, elements)
  Profiler.__enter__(:back_traverse, state, elements) if $PROFILE
  slot = state.index_number
  dest = @traverse_cache[slot][elements]
  dest ||= begin
    found = @state_graph.back_traverse(state, elements)
    @traverse_cache[slot][elements] = found
  rescue GraphTraversalException
    nil
  end
  Profiler.__leave__(:back_traverse) if $PROFILE
  dest
end
432
+
433
# Returns the entry of @follow_sets stored under the symbol's index_number.
def follow_set(symbol)
  slot = symbol.index_number
  @follow_sets[slot]
end
436
+ end
437
+
438
# A set of objects with fast union and add operations, stored as a bit vector
# in an Integer. Nothing here is specific to terminals: members only need to
# respond to #index_number, so this is a general implementation that should
# probably be called IndexableObjectSet.
class TerminalSet
  # allPossibleMembers:: the universe of objects that may be added; each must
  #                      respond to #index_number.
  # members::            objects initially in the set (each must belong to
  #                      the universe, otherwise ArgumentError is raised).
  # all::                optional precomputed bit vector with one bit set per
  #                      possible index (derived from +max+ when omitted).
  # max::                optional precomputed highest index_number in the
  #                      universe (derived from the universe when omitted).
  def initialize(allPossibleMembers, members = [], all = nil, max = nil)
    @all_possible_members = allPossibleMembers
    # An empty universe has no highest index; -1 makes @all come out as 0
    # instead of failing on nil + 1.
    @max_index = max || allPossibleMembers.map { |m| m.index_number }.max || -1
    @all = all || ((1 << (@max_index + 1)) - 1)
    @included = 0 # We represent the set as a bit vector in an Integer.
    members.each { |t| add(t) }
  end

  @@empty_array = Array.new

  # Builds a set directly from the bit vector +included+, skipping the
  # per-member work done by the regular constructor. +all+ and +max+ have
  # the same meaning as in #initialize.
  def TerminalSet.new_from_integer(allPossibleMembers, included, all = nil,
                                   max = nil)
    ts = new(allPossibleMembers, @@empty_array, all, max)
    ts.set_include_vector(included)
    ts
  end

  # Adds +terminal+ to the set.
  # Raises ArgumentError when +terminal+ is not one of the possible members.
  def add(terminal)
    unless @all_possible_members.include?(terminal)
      raise ArgumentError, "#{terminal.inspect} is not a possible member of this set"
    end
    begin
      @included |= mask(terminal.index_number)
    rescue StandardError
      # Best effort: report the problem but keep going. Only StandardError is
      # rescued so that interrupts and SystemExit are not swallowed.
      puts "TerminalSet: #{@all_possible_members.inspect} but was #{terminal.inspect}"
    end
  end

  # In-place union with +other+ (anything responding to #to_i with a bit
  # vector).
  def update(other)
    @included |= other.to_i
  end

  # Set difference: a new set holding the members of this set that are not
  # in +other+. A bit is 1 in the result if it is 1 in @included and 0 in
  # other; the result is also limited to the bits of @all.
  def -(other)
    TerminalSet.new_from_integer(@all_possible_members,
                                 @included & @all & ~other.to_i, @all,
                                 @max_index)
  end

  # True when no member is included.
  def empty?
    @included == 0
  end

  # Returns the included members as an Array, in universe order.
  def terminals
    @all_possible_members.select { |t| index_included?(t.index_number) }
  end

  # True when the bit for +index+ is set.
  def index_included?(index)
    (@included & mask(index)) != 0
  end

  # True when +terminal+ (looked up by its index_number) is in the set.
  def include?(terminal)
    index_included?(terminal.index_number)
  end

  def inspect
    terminals.inspect
  end

  # Yields every included member in universe order and returns self.
  # Without a block an Enumerator is returned.
  def each
    return to_enum(:each) unless block_given?

    Profiler.__enter__(:TerminalSet_each) if $PROFILE
    begin
      @all_possible_members.each do |t|
        yield(t) if index_included?(t.index_number)
      end
    ensure
      # Keep the profiler balanced even when the block breaks or raises.
      Profiler.__leave__(:TerminalSet_each) if $PROFILE
    end
    self
  end

  # Replaces the whole bit vector with +newVector+.
  def set_include_vector(newVector)
    @included = newVector
  end

  # The set as its raw bit vector.
  def to_i
    @included
  end

  protected

  # Bit masks memoized per index and shared by all sets.
  @@masks = Array.new

  def mask(index)
    @@masks[index] ||= (1 << index)
  end
end