rpdf2txt 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,164 @@
1
+ # Common classes used in many of the parse table generation classes.
2
+ #
3
+ require 'rpdf2txt-rockit/indexable'
4
+ require 'rpdf2txt-rockit/token'
5
+ require 'rpdf2txt-rockit/grammar'
6
+
7
# An LR item: a production plus a position inside it and an optional
# lookahead symbol. +symbol+ is the production element found at +position+
# (nil when the position is past the last element).
class Item
  include Indexable
  attr_reader :symbol, :production, :position, :lookahead

  # Shared result returned by calc_followers when nothing follows the symbol.
  @@empty_array = Array.new

  # nextItem is accepted for callers that pass it but is not used here.
  def initialize(production, position, lookahead = nil, nextItem = nil)
    @production = production
    @position = position
    @lookahead = lookahead
    @symbol = production.elements[position]
    @hash_value = [production, position, lookahead].hash
  end

  # Memoized list of the symbols that can directly follow +symbol+.
  def direct_following_symbols
    @direct_following_symbols ||= calc_followers
  end

  # Collects the elements after +position+, up to and including the first
  # one that does not derive epsilon.
  def calc_followers
    return @@empty_array if position >= production.elements.length - 1
    followers = []
    production.elements[position + 1..-1].each do |element|
      followers << element
      break unless element.derives_epsilon?
    end
    followers
  end

  # All elements after +position+, or nil when there are none.
  def suffix
    return nil unless position < production.elements.length - 1
    production.elements[position + 1..-1]
  end

  # True when the position cannot be advanced any further.
  def final?
    next_item == nil
  end

  # The item with the position advanced one step (nil for a final item).
  def next_item
    @next_item ||= calc_next_item
  end

  # Same production and position, but with the given lookahead symbol.
  def lookahead_item(lookaheadSymbol)
    make_new_item(production, position, lookaheadSymbol, next_item)
  end

  # Renders the item as "Nonterminal->before .after".
  def inspect
    at_edge = ((@position >= production.elements.length) or
               (@position == 0))
    dot = at_edge ? "." : " ."
    before = inspect_elements(production.elements[0...@position])
    after = inspect_elements(production.elements[@position..-1])
    production.nonterminal.inspect + "->" + before + dot + after
  end

  protected

  # Space-separated inspect strings; "" when elements is nil.
  def inspect_elements(elements)
    return "" unless elements
    elements.map { |element| element.inspect }.join(" ")
  end

  # Builds items through @factory when one has been set, else directly.
  def make_new_item(*args)
    return @factory.make(*args) if @factory
    Item.new(*args)
  end

  def calc_next_item
    return nil unless position < production.elements.length
    make_new_item(production, position + 1)
  end
end
81
+
82
# A parser state described by its kernel items. The closure is computed
# eagerly in the constructor.
class LrState
  include Indexable
  attr_reader :kernel_items, :closure, :final_items

  def initialize(kernelItems)
    @kernel_items = kernelItems
    calc_closure(kernelItems) # Do it lazily instead?
    @final_items = kernelItems.select { |item| item.final? }
    @reduce_state = !@final_items.empty?
    @consistent = (kernelItems.length == 1)
  end

  # True when at least one kernel item is final.
  def reduce_state?
    @reduce_state
  end

  # True when the state has exactly one kernel item.
  def consistent?
    @consistent
  end

  # Number of kernel items.
  def length
    @kernel_items.length
  end

  def inspect
    "State(#{@kernel_items.inspect})"
  end

  protected

  @@empty_array = Array.new

  # Starts @closure as a copy of the kernel items and then appends the
  # non-kernel items reachable from each item's symbol.
  def calc_closure(itemset)
    @closure = itemset.clone
    visited = Hash.new
    itemset.each do |item|
      visited = recursive_calc_closure(item.symbol, visited)
    end
  end

  # Depth-first expansion; +checked+ records symbols already expanded so
  # each symbol contributes its non-kernel items only once.
  def recursive_calc_closure(symbol, checked)
    return checked unless symbol and !checked[symbol]
    checked[symbol] = true
    added = symbol.nonkernel_items
    @closure.concat(added)
    added.each do |item|
      checked = recursive_calc_closure(item.symbol, checked)
    end
    checked
  end
end
127
+
128
+ # Decorate the grammar symbols with some additional info we're gonna need
129
# Reopens NonTerminal to add what parse table generation needs: the list of
# non-kernel items and an epsilon-derivation flag set from outside.
class NonTerminal
  include Indexable
  attr_reader :nonkernel_items
  attr_writer :derives_epsilon

  # Creates one position-0 item per alternative of this nonterminal.
  def calc_nonkernel_items(grammar, itemFactory)
    @nonkernel_items = []
    grammar.alternatives(self).each do |alternative|
      @nonkernel_items << itemFactory.make(alternative, 0)
    end
  end

  def nonterminal?
    true
  end

  def terminal?
    false
  end

  # Value last assigned through derives_epsilon= (nil until assigned).
  def derives_epsilon?
    @derives_epsilon
  end
end
146
+
147
# Reopens Token with the terminal-side answers to the same questions
# NonTerminal answers during parse table generation.
class Token
  include Indexable

  # Shared empty list: tokens never contribute non-kernel items.
  @@empty_array = []

  def nonterminal?
    false
  end

  def terminal?
    true
  end

  def nonkernel_items
    @@empty_array
  end

  def derives_epsilon?
    false
  end
end
160
+
161
+ # Decorate Productions
162
# Reopens Production only to mix in Indexable; no other behavior is added here.
class Production
  include Indexable
end
@@ -0,0 +1,84 @@
1
+ require 'rpdf2txt-rockit/syntax_tree'
2
+
3
# Syntax tree node named "_Ambiguity" whose single child, "ambigous_trees",
# is the list of alternative trees.
class AmbiguityNode < SyntaxTree
  def initialize(alt1, alt2)
    @ambigous_trees = [alt1, alt2]
    super("_Ambiguity", ["ambigous_trees"], [@ambigous_trees])
  end

  # Appends +tree+ unless an equal tree is already recorded.
  def add_ambigoustree(tree)
    return if @ambigous_trees.include?(tree)
    @ambigous_trees.push tree
  end
end
13
+
14
# Raised when a parse is ambiguous. Carries the alternative trees and the
# string being parsed, and can render a report of where alternatives differ.
class AmbigousParseException < Exception
  attr_reader :alternatives, :substring

  # stringBeingParsed:: stored as +substring+
  # fullTree::          tree printed at the end of the report
  # alternativeTrees::  the competing trees
  def initialize(stringBeingParsed, fullTree, *alternativeTrees)
    super("Ambigous parse")
    @alternatives, @full_tree = alternativeTrees, fullTree
    init_substring(stringBeingParsed)
  end

  # Builds the human-readable report. Alternatives are printed with
  # +prettyPrinter+ when given, otherwise with #inspect. For alternatives
  # whose printed form is identical, a tree-difference line is appended.
  # Note that this compacts the stored full tree in place.
  def inspect(prettyPrinter = nil)
    str = "Ambiguity: The substring '#{substring}' can be parsed as:\n"
    strings_to_show, same_strings = Array.new, Array.new
    alternatives.each_with_index {|alt, i|
      s = prettyPrinter ? prettyPrinter.print(alt) : alt.inspect
      if (j = strings_to_show.index(s))
        same_strings.push [j,i]
      end
      strings_to_show.push s
    }
    alternatives.each_with_index do |alt,i|
      str << " Alternative #{i+1}: #{strings_to_show[i]}"
      str << ", or" if i < alternatives.length-1
      str << "\n"
    end
    same_strings.each do |i,j|
      str += report_on_tree_differences(alternatives[i], alternatives[j],
                                        i, j)
    end
    @full_tree.compact!
    str + "The full tree looks like:\n" + @full_tree.inspect
  end

  # One-line description of a difference; i and j are 0-based indices and
  # are printed 1-based. childPath is a "."-prefixed path of child names.
  def difference_description(i, j, str0, o1, o2, childPath = "")
    child_str =
      childPath.length > 0 ? "in the childrens '#{childPath[1..-1]}'" : ""
    " Alternatives #{i+1} and #{j+1} differ #{child_str} by not having" +
      " the same #{str0} (#{o1.inspect} and #{o2.inspect})"
  end

  # Returns a description of the first difference found between t1 and t2
  # (comparing class, then name, then children names, then children
  # recursively), or "" when none is found. i and j are 0-based indices.
  def report_on_tree_differences(t1, t2, i, j, childPath = "")
    if t1.class != t2.class
      difference_description(i, j, "type", t1.class, t2.class, childPath)
    elsif not t1.kind_of?(SyntaxTree)
      if t1 != t2
        # Printed 1-based so the numbers match the "Alternative N:" lines
        # and difference_description.
        " Alternatives #{i+1} and #{j+1} are not SyntaxTree's and differ"
      else
        ""
      end
    elsif t1.name != t2.name
      difference_description(i, j, "name", t1.name, t2.name, childPath)
    elsif t1.children_names != t2.children_names
      difference_description(i, j, "children_names",t1.children_names,
                             t2.children_names, childPath)
    else
      t1.childrens.each_with_index do |child, k|
        report = report_on_tree_differences(child, t2[k], i, j,
                                            childPath + "." +
                                            t1.children_names[k])
        return report if report.length > 0
      end
      return ""
    end
  end

  protected

  def init_substring(string)
    @substring = string
  end
end
@@ -0,0 +1,168 @@
1
# Object whose #times reports CPU times. Guarded with defined? so that the
# constant is created when missing (a bare reference to an undefined
# constant raises NameError) and left alone when already set.
unless defined?(TimesClass)
  TimesClass = ((RUBY_VERSION < "1.7") ? Time : Process)
end

# Manual profiler: instrumented methods call Profiler.__enter__ on entry and
# Profiler.__leave__ on exit; Profiler.profile_summary renders the report.
module Profiler
  @@start = TimesClass.times.utime

  # Method invocation stack with one entry for each invocation:
  # Time at entry, Total times in subfunction also being logged, MethodId
  @@invocation_stack = [[0, 0, "#toplevel".intern]]

  # One entry for each method: NumCalls, TotalTime, OnlyMyTime, Callers
  @@map = {"#toplevel".intern => [1, 0, 0]}

  # One entry for each method: hash mapping args.inspect to count
  @@arguments = Hash.new

  @@time_limit = 5 * 60

  # Restarts the clock. With an argument, sets the time limit (in minutes)
  # after which __enter__ aborts the run; without one, disables the limit.
  def start(timeLimitInMinutes = nil)
    @@time_limit = timeLimitInMinutes ? timeLimitInMinutes * 60 : nil
    @@start = Float(TimesClass.times.utime)
  end
  module_function :start

  # Records entry into +method+ and counts the argument list it was
  # called with. Prints the summary and exits when the limit is exceeded.
  def __enter__(method, *args)
    now = TimesClass.times.utime
    if @@time_limit and now - @@start > @@time_limit
      STDERR.puts "Profiling time limit violated. Run terminated."
      STDERR.puts profile_summary(true, true)
      exit(-1)
    end
    @@invocation_stack.push [now, 0.0, method]
    # Create the per-method counter on first use instead of rescuing the
    # failure and retrying (which could loop forever on any other error).
    (@@arguments[method] ||= Hash.new(0))[args.inspect] += 1
  end
  module_function :__enter__

  # Records exit from +method+: pops the matching stack entry, updates the
  # call count, total time and self time, and charges the elapsed time to
  # the enclosing invocation. Returns nil for no retargs, the value itself
  # for one, and an Array for several.
  def __leave__(method, *retargs)
    now = TimesClass.times.utime
    tick = @@invocation_stack.pop
    data = (@@map[method] ||= [0.0, 0.0, 0.0, Hash.new(0)])
    data[0] += 1
    total_time_this_invocation = now - tick[0]
    data[1] += total_time_this_invocation
    data[2] += total_time_this_invocation - tick[1]
    # caller[1] is nil when the instrumented code runs at the top level;
    # use a printable placeholder so the call-site report can split it.
    data[3][caller[1] || "(toplevel)"] += 1
    @@invocation_stack[-1][1] += total_time_this_invocation
    # `return *retargs` yields an Array for a single value on Ruby >= 1.9;
    # unwrap explicitly so one return value passes through unchanged.
    retargs.length > 1 ? retargs : retargs.first
  end
  module_function :__leave__

  # Go through the invocation stack and leave all methods.
  def unwind_invocation_stack
    while @@invocation_stack.length > 1
      # __leave__ pops the entry itself, so only peek at it here.
      __leave__(@@invocation_stack.last[2])
    end
  end
  module_function :unwind_invocation_stack

  # Returns the report as a String. Any invocation still on the stack is
  # closed first. writeCallers adds per-method call sites; writeArguments
  # adds argument-repetition statistics for methods called more than once.
  def Profiler.profile_summary(writeCallers = false, writeArguments = false)
    total_elapsed = TimesClass.times.utime - @@start
    str = "Profiling summary\n"
    str += "*****************\n"
    str += "Total elapsed time: #{total_elapsed} seconds\n"
    unwind_invocation_stack if @@invocation_stack.length > 1
    total = @@invocation_stack.last[1]
    time_in_nonprofiled = total_elapsed - total
    str += "Time spent in non-profiled methods: #{time_in_nonprofiled} sec\n"
    str += "Time in profiled methods:\n"
    total = 0.01 if total == 0 # avoid dividing by zero below
    @@map["#toplevel".intern][1] = total
    data = @@map.to_a.sort{|a,b| b[1][2] <=> a[1][2]}
    sum = 0
    str += " %% cumulative self self total\n"
    str += " time seconds seconds calls ms/call ms/call name\n"
    str += " ---------------------------------------------------------\n"
    data.each do |method, d|
      next if method == "#toplevel".intern
      sum += d[2]
      str += "%6.2f %8.2f %8.2f %8d " % [d[2]/total*100, sum, d[2], d[0]]
      str += "%8.2f %8.2f %s\n" % [d[2]*1000/d[0], d[1]*1000/d[0],
                                   method.id2name]
      if writeCallers
        str += " Call sites:\n"
        d[3].to_a.sort {|a,b| b[1] <=> a[1]}.each do |callersite, count|
          str += " #{count}: " + callersite.split("/").last + "\n"
        end
      end
      if writeArguments and d[0] > 1
        str += " Arguments:\n"
        counts, num_prev_seen = Hash.new(0), 0
        @@arguments[method].to_a.sort {|a,b| b[1] <=> a[1]}.each do |args, cnt|
          counts[cnt] += cnt
          num_prev_seen += cnt if cnt > 1
        end
        proportion_prev_seen = num_prev_seen*100.0/d[0]
        proportion_unique = 100.0 - proportion_prev_seen
        str += " %3.2f%% (#{d[0].to_i - num_prev_seen}) of calls with unique args" % proportion_unique
        if proportion_unique != 100.0
          str += ", and\n"
          str += " %3.2f%% (#{num_prev_seen}) of calls with args that were used several times\n" % proportion_prev_seen
          str += " distr: #{counts.inspect}"
        end
        str += "\n"
      end
      str += "\n" if writeCallers or writeArguments
    end
    return str
  end
end
128
+
129
+ #############################################################################
130
+ # Simple test
131
+ #############################################################################
132
# Runs only when this file is executed directly: exercises the profiler with
# a small hand-instrumented class and prints the resulting summary.
if __FILE__ == $0
  # Toy complex number whose methods report to the profiler by hand.
  class ComplexTest
    attr_reader :real, :imaginary
    def initialize(real, imaginary)
      Profiler.__enter__(:initialize, real, imaginary)
      @real, @imaginary = real, imaginary
      Profiler.__leave__(:initialize, self)
    end
    # Delegates to real_add so that nested profiled calls are exercised.
    def add(other)
      Profiler.__enter__(:add, other)
      real_add(other)
      Profiler.__leave__(:add, self)
    end
    def real_add(other)
      Profiler.__enter__(:real_add, other)
      @real += other.real
      @imaginary += other.imaginary
      Profiler.__leave__(:real_add, self)
    end
    def inspect
      "#{real} + i*#{imaginary}"
    end
  end

  Profiler.start

  10.times do
    c = ComplexTest.new(rand, rand)
    100.times do
      c.add(ComplexTest.new(rand, rand))
    end
    puts "It finished"
  end
  # Two constructions with identical arguments, so the argument statistics
  # contain a repeated argument list.
  c = ComplexTest.new(1,1)
  c = ComplexTest.new(1,1)
  puts Profiler.profile_summary(true, true)
end
@@ -0,0 +1,523 @@
1
+ # Find the reductions in a parse table for a StateGraph.
2
+ #
3
+ # There can be reductions in all states with kernel items that are final, ie.
4
+ # where the position cannot be advanced any further
5
+ # (example: State(3, [S -> R.])).
6
+ #
7
+ # For states with only one kernel item (which is final which is always the
8
+ # case if its only one) we should reduce for
9
+ # all terminals in the follow set of the nonterminal (Follow(S) in the
10
+ # example above). These states are called consistent reduce states.
11
+ #
12
+ # When there are multiple kernel items and one (or several) of them is a final
13
+ # item we should reduce for terminals in Follow(state, nonterminal). These
14
+ # states are called inconsistent reduce states. An example is
15
+ # State(5, [S -> L .= R, R -> L.]) where we dont know whether to reduce
16
+ # or shift. Here, we should reduce by R -> L for all terminals in Follow(5, R).
17
+ #
18
+ # To reduce the amount of computation needed we use a hybrid, lazy approach.
19
+ # We calculate all first and follow sets for all symbols in the grammar.
20
+ # With this we can find the reductions for consistent reduce states. For the
21
+ # inconsistent ones we build the first to follow relations "from the back"
22
+ # (ie. from the actual follow sets we need) up to the first sets. This way
23
+ # we don't need to calculate relations that are not relevant.
24
+ #
25
+ require 'rpdf2txt-rockit/directed_graph'
26
+ require 'rpdf2txt-rockit/base_extensions'
27
+ require 'rpdf2txt-rockit/parsetable_generation'
28
+
29
+ require 'rpdf2txt-rockit/profiler'
30
+
31
+ #$PROFILE = true
32
+
33
+ class ReduceActionsGenerator
34
# Stores the collaborators, numbers the grammar symbols and productions,
# indexes allItems by the nonterminal at their position, and prepares the
# LaLrPair factory and the back-traverse cache.
def initialize(stateGraph, grammar, parseTable, allItems)
  @state_graph = stateGraph
  @grammar = grammar
  @parsetable = parseTable
  set_index_numbers
  precalc_items_at_nonterminal(allItems)
  @lalr_pair_factory = IndexableFactory.new(LaLrPair, 0)
  init_traverse_cache
end
41
+
42
# Adds the reduce actions for consistent and then inconsistent reduce
# states, compacts the parse table and returns it. When $PROFILE is set the
# profiler is restarted first and its summary printed afterwards.
def add_reduce_actions
  Profiler.start if $PROFILE
  add_actions_for_consistent_states
  add_actions_for_inconsistent_states
  @parsetable.compact!
  if $PROFILE
    puts Profiler.profile_summary(true, true)
  end
  @parsetable
end
50
+
51
+ protected
52
+
53
# Creates @traverse_cache with one empty Hash per state graph node.
# Returns the number of nodes.
def init_traverse_cache
  node_count = @state_graph.nodes.length
  @traverse_cache = Array.new(node_count) { Hash.new }
  node_count
end
58
+
59
# Numbers terminals from 0, continues the same sequence through the
# nonterminals, and numbers productions separately from 0.
def set_index_numbers
  next_index = 0
  [@grammar.terminals, @grammar.nonterminals].each do |symbols|
    symbols.each do |symbol|
      symbol.index_number = next_index
      next_index += 1
    end
  end
  @grammar.productions.each_with_index do |production, position|
    production.index_number = position
  end
end
65
+
66
# For every final item of every consistent reduce state, adds a reduce
# action on the follow set of the item's production nonterminal.
def add_actions_for_consistent_states
  Profiler.__enter__(:add_actions_for_consistent_states) if $PROFILE
  reduce_states = build_follow_sets(@state_graph.consistent_reduce_states)
  reduce_states.each do |state|
    state.final_items.each do |item|
      production = item.production
      lookaheads = follow_set(production.nonterminal)
      add_reduce_action_for_terminalset(state, lookaheads, production)
    end
  end
  Profiler.__leave__(:add_actions_for_consistent_states) if $PROFILE
end
78
+
79
# Runs the first/follow set computation for the whole grammar and returns
# +states+ unchanged so the caller can chain on it. The steps are
# order-dependent: the sets are created before anything is propagated into
# them, and the relations are built before the update steps read them.
def build_follow_sets(states)
  create_first_sets
  precalc_epsilon_derivation
  create_follow_sets
  build_relations # Currently we build the full relations graphs
  update_first_sets
  update_follow_sets
  states
end
88
+
89
# Lazily creates the single EpsilonToken, numbered one above the highest
# terminal index number, and returns it on every call.
def epsilon
  unless @epsilon
    @epsilon = EpsilonToken.new
    highest = @grammar.terminals.map { |terminal| terminal.index_number }.max
    @epsilon.index_number = highest + 1
  end
  @epsilon
end
95
+
96
# Memoized list of the grammar's terminals followed by the epsilon token.
def all_terminals
  @all_terminals ||= @grammar.terminals + [epsilon]
end
99
+
100
# Starts a fresh @i2i_relations graph, links @epsilon to the nonterminal of
# every production that has no first element, propagates from @epsilon
# through the first sets, and finally records on each nonterminal whether
# its first set contains @epsilon.
# Reads @epsilon directly, so it relies on #epsilon having run earlier.
def precalc_epsilon_derivation
  @i2i_relations = DirectedGraph.new
  @grammar.productions.each do |production|
    next if production.elements.first
    i2i_relation(@epsilon, production.nonterminal)
  end
  propagate_terminal_sets([@epsilon], @i2i_relations) { |node| first_set(node) }
  @grammar.nonterminals.each do |nonterminal|
    nonterminal.derives_epsilon = first_set(nonterminal).include?(@epsilon)
  end
end
113
+
114
# Propagates first sets along @i2i_relations, starting from its roots.
def update_first_sets
  propagate_terminal_sets(@i2i_relations.roots, @i2i_relations) {|n| first_set(n)}
end
117
+
118
# Seeds follow sets from first sets: for each @i2o_relations entry, the
# first set of the source is merged into the follow set of each distinct
# destination. Then propagates follow sets along @o2o_relations.
def update_follow_sets
  @i2o_relations.each do |src, dests|
    next unless dests
    terminals = first_set(src)
    dests.uniq.each { |dest| follow_set(dest).update terminals }
  end
  propagate_terminal_sets(@o2o_relations.roots, @o2o_relations) { |node| follow_set(node) }
end
126
+
127
# Creates the first-set tables, both indexed by index_number: an empty
# terminal set per nonterminal, and for each terminal a set holding just
# that terminal.
def create_first_sets
  nonterms = @grammar.nonterminals
  ntfs = @nonterminal_first_sets = Array.new(nonterms.length)
  nonterms.each {|nt| ntfs[nt.index_number] = new_terminal_set}
  terms = @grammar.terminals
  # Sized by the number of terminals (it was sized by the number of
  # nonterminals, which padded the table with nils when that was larger).
  tfs = @terminal_first_sets = Array.new(terms.length)
  terms.each {|t| tfs[t.index_number] = new_terminal_set(t)}
end
135
+
136
# Creates @follow_sets with one empty terminal set per nonterminal, stored
# at the nonterminal's index_number.
def create_follow_sets
  nonterminals = @grammar.nonterminals
  @follow_sets = Array.new(nonterminals.length)
  nonterminals.each do |nonterminal|
    @follow_sets[nonterminal.index_number] = new_terminal_set
  end
end
141
+
142
# Builds the relations used to compute first and follow sets:
# * first-to-first links (via i2i_relation) from production elements to the
#   production's nonterminal,
# * first-to-follow entries (@i2o_relations) from an element to the
#   nonterminal immediately before it,
# * follow-to-follow links (@o2o_relations) from the production's
#   nonterminal to nonterminals at the end of the production,
# and records, per trailing nonterminal, the production and its prefix.
#
# NOTE(review): `and` binds looser than `=`, so `updating = updating and ...`
# parses as `(updating = updating) and ...` and `updating = x.nonterminal?
# and ...` as `(updating = x.nonterminal?) and ...`. As written, the forward
# loop therefore links every element, and the backward loop continues across
# any trailing nonterminal regardless of derives_epsilon?. This makes the
# sets larger than the comments suggest. Tightening it with && is only safe
# if derives_epsilon is complete for indirectly nullable nonterminals --
# TODO confirm before changing.
def build_relations
  @i2o_relations = Hash.new
  @o2o_relations = DirectedGraph.new
  @productions_ending_with = Array.new
  i = elements = nonterminal = last_nonterm = len = update_first = nil
  @grammar.productions.each do |production|
    elements = production.elements
    nonterminal = production.nonterminal
    i, updating, len, last_nonterm = 0, true, elements.length, nil
    # Forward pass over the elements.
    while i < len
      x = elements[i]
      i2i_relation(x, nonterminal) if updating
      i2o_relation(x, last_nonterm) if last_nonterm
      last_nonterm = x.nonterminal? ? x : nil
      updating = updating and x.nonterminal? and x.derives_epsilon?
      i += 1
    end
    # Backward pass from the last element.
    i, updating = len-1, true
    while updating and i >= 0
      x = elements[i]
      if x.nonterminal?
        o2o_relation(nonterminal, x)
        update_productions_ending_with(x, production, elements[0...i])
      end
      updating = x.nonterminal? and x.derives_epsilon?
      i -= 1
    end
  end
  # Should be @grammar.original_start_symbol but this will always work
  # since Follow(S) includes Follow(S'). It will take care of the accept.
  i2o_relation(@grammar.eof_terminal, @grammar.start_symbol)
end
174
+
175
# Adds a src -> dest link to the @i2i_relations graph (the graph that
# update_first_sets propagates first sets along).
def i2i_relation(src, dest)
  @i2i_relations.link_nodes(src, dest)
end
178
+
179
# Adds a src -> dest link to the @o2o_relations graph (the graph that
# update_follow_sets propagates follow sets along).
def o2o_relation(src, dest)
  @o2o_relations.link_nodes(src, dest)
end
182
+
183
# Appends dest to the list stored for src in @i2o_relations, creating the
# list on first use. Duplicates are kept.
def i2o_relation(src, dest)
  (@i2o_relations[src] ||= []).push dest
end
191
+
192
# Builds a TerminalSet over all_terminals; when a terminal is given it is
# passed as the single initial member.
def new_terminal_set(terminal = nil)
  return TerminalSet.new(all_terminals) unless terminal
  TerminalSet.new(all_terminals, [terminal])
end
199
+
200
# Looks up the first set of a symbol by its index_number, in the
# nonterminal table or the terminal table depending on the symbol.
def first_set(symbol)
  table = symbol.nonterminal? ? @nonterminal_first_sets : @terminal_first_sets
  table[symbol.index_number]
end
207
+
208
# Adds reduce actions for the inconsistent reduce states, using the LALR
# follow sets computed by build_lalr_follow_sets.
#
# NOTE(review): build_lalr_follow_sets returns one [state, pair] entry per
# final item, but the item the pair was computed for is not carried along.
# Each entry is therefore applied to every final item of its state, so in a
# state with several final items each production is also reduced on the
# lookaheads computed for the others. Looks like an over-approximation
# rather than the intent -- confirm before narrowing.
def add_actions_for_inconsistent_states
  Profiler.__enter__(:add_actions_for_inconsistent_states) if $PROFILE
  irs = @state_graph.inconsistent_reduce_states
  build_lalr_follow_sets(irs).each do |state, pair|
    state.final_items.each do |item|
      add_reduce_action_for_terminalset(state, lalr_follow_set(pair),
                                        item.production)
    end
  end
  Profiler.__leave__(:add_actions_for_inconsistent_states) if $PROFILE
end
219
+
220
# Registers, for every terminal in terminalSet, the action to take in
# +state+: [:ACCEPT, 0] when the production belongs to the grammar's start
# symbol, otherwise [:REDUCE, <production index>].
def add_reduce_action_for_terminalset(state, terminalSet, production)
  Profiler.__enter__(:add_reduce_action_for_terminalset, state, terminalSet, production) if $PROFILE
  action =
    if production.nonterminal == @grammar.start_symbol
      [:ACCEPT, 0]
    else
      [:REDUCE, production.index_number]
    end
  @parsetable.add_action_for_terminalset(state.index_number, action, terminalSet)
  Profiler.__leave__(:add_reduce_action_for_terminalset) if $PROFILE
end
230
+
231
# For each root, visits every node reachable from it (once, breadth first)
# and merges the visited node's terminal set into the sets of its children.
# The block maps a graph node to its terminal set.
def propagate_terminal_sets(roots, graph = @relations, &getTerminalSet)
  Profiler.__enter__(:propagate_terminal_sets, roots, graph) if $PROFILE
  # Logothetis-Bermudez-style propagation. Does NOT exploit the fact that
  # many propagations share a majority of paths. Exploit if speed needed!
  # Note that this is not guaranteed to work if there is a strongly connected
  # component with more than two nodes. The intermediate nodes (only linked
  # within the component) will not be updated after the backlink.
  # Solve by doing a real reachability graph instead.
  roots.each do |root|
    graph.each_reachable_node_once_breadth_first(root) do |node|
      node_set = getTerminalSet.call(node)
      graph.children(node).each do |successor|
        getTerminalSet.call(successor).update node_set
      end
    end
  end
  Profiler.__leave__(:propagate_terminal_sets) if $PROFILE
end
249
+
250
# For every final item of the given states, back-traverses the item's
# production from the state and records [state, LaLrPair(origin state,
# production nonterminal)]. Then builds the pair relations, creates and
# seeds the LALR follow sets and propagates them. Returns the recorded list.
def build_lalr_follow_sets(states)
  Profiler.__enter__(:build_lalr_follow_sets) if $PROFILE
  needed = []
  states.each do |state|
    state.final_items.each do |item|
      production = item.production
      origin = back_traverse(state, production.elements)
      needed << [state, lalr_pair(origin, production.nonterminal)]
    end
  end
  build_o2o_lalr_relations
  create_lalr_follow_sets
  update_lalr_follow_sets_with_direct_followers
  propagate_terminal_sets(@o2o_lalr_relations.roots, @o2o_lalr_relations) { |node| lalr_follow_set(node) }
  Profiler.__leave__(:build_lalr_follow_sets) if $PROFILE
  needed
end
268
+
269
# Adds a src -> dest link to the @o2o_lalr_relations graph (the graph that
# build_lalr_follow_sets propagates LALR follow sets along).
def o2o_lalr_relation(src, dest)
  @o2o_lalr_relations.link_nodes(src, dest)
end
272
+
273
# A (state, nonterminal) pair; LALR follow sets are kept per pair.
# The Struct is created with the explicit name "LaLrPair" and then reopened
# to mix in Indexable and to print the state by its index number.
LaLrPair = Struct.new("LaLrPair", :state, :nonterminal)
class LaLrPair
  include Indexable
  def inspect
    "(#{state.index_number.inspect}, #{nonterminal.inspect})"
  end
end
280
+
281
# Obtains the LaLrPair for (state, nonterminal) from the pair factory.
# NOTE(review): presumably the factory returns the same instance for equal
# arguments and assigns index numbers -- confirm in IndexableFactory.
def lalr_pair(state, nonterminal)
  @lalr_pair_factory.instance_with_args(state, nonterminal)
end
284
+
285
# Groups the items whose current symbol is a nonterminal by that
# nonterminal's index number. Returns the resulting table.
def precalc_items_at_nonterminal(allItems) # O(I)
  @items_at_nonterminal = Array.new
  allItems.each do |item|
    current = item.symbol
    next unless current and current.nonterminal?
    update_items_at_nonterminal(current, item)
  end
  @items_at_nonterminal
end
295
+
296
# Appends item to the list kept for the nonterminal's index number,
# creating the list on first use.
def update_items_at_nonterminal(nonterminal, item)
  slot = nonterminal.index_number
  (@items_at_nonterminal[slot] ||= []).push item
end
304
+
305
# Items recorded for the nonterminal by precalc_items_at_nonterminal, or
# nil when none were recorded.
def items_at_nonterminal(nonterminal)
  @items_at_nonterminal[nonterminal.index_number]
end
308
+
309
# Creates one empty terminal set per LaLrPair known to the factory, stored
# at the pair's index number.
def create_lalr_follow_sets
  Profiler.__enter__(:create_lalr_follow_sets) if $PROFILE
  pairs = @lalr_pair_factory.instances
  @lalr_follow_sets = Array.new(pairs.length)
  pairs.each { |pair| @lalr_follow_sets[pair.index_number] = new_terminal_set }
  Profiler.__leave__(:create_lalr_follow_sets) if $PROFILE
end
318
+
319
# The LALR follow set stored for the pair (looked up by its index number).
def lalr_follow_set(lalrPair)
  @lalr_follow_sets[lalrPair.index_number]
end
322
+
323
# Seeds each pair's LALR follow set: pairs on the start symbol receive the
# first set of the EOF terminal, and every pair receives the first sets of
# the symbols that directly follow its nonterminal in any recorded item.
def update_lalr_follow_sets_with_direct_followers
  Profiler.__enter__(:update_lalr_follow_sets_with_direct_followers) if $PROFILE
  @lalr_pair_factory.instances.each do |pair|
    nonterminal = pair.nonterminal
    if nonterminal == @grammar.start_symbol
      lalr_follow_set(pair).update first_set(@grammar.eof_terminal)
    end
    (items_at_nonterminal(nonterminal) || next).each do |item|
      item.direct_following_symbols.each do |follower|
        lalr_follow_set(pair).update first_set(follower)
      end
    end
  end
  Profiler.__leave__(:update_lalr_follow_sets_with_direct_followers) if $PROFILE
end
339
+
340
# Two-level lookup variant of productions_ending_with: indexes first by the
# nonterminal and then by the last prefix symbol.
# NOTE(review): without withLastPrefix this returns
# @productions_ending_with_and_having_empty_prefix, which is not assigned
# anywhere in the visible code, so it would be nil -- presumably unfinished
# or unused; verify before relying on it.
def productions_ending_with2(nonterminal, withLastPrefix = nil)
  if withLastPrefix
    @productions_ending_with[nonterminal.index_number][withLastPrefix.index_number]
  else
    @productions_ending_with_and_having_empty_prefix
  end
end
347
+
348
# Two-level variant of update_productions_ending_with: stores
# [production, prefix] under the nonterminal's index and then under the
# index of the prefix's last symbol, skipping duplicates.
# NOTE(review): prefix.last is nil for an empty prefix, so the
# index_number call would raise there; the visible code only calls the
# one-level variant -- confirm this method is unused.
def update_productions_ending_with2(nonterminal, production, prefix)
  ntindex = nonterminal.index_number
  pindex = prefix.last.index_number
  new_element = [production, prefix]
  if a = @productions_ending_with[ntindex]
    if a2 = @productions_ending_with[ntindex][pindex]
      a2.push(new_element) unless a2.include?(new_element)
    else
      @productions_ending_with[ntindex][pindex] = [new_element]
    end
  else
    @productions_ending_with[ntindex] = Array.new
    @productions_ending_with[ntindex][pindex] = [new_element]
  end
end
363
+
364
# The [production, prefix] entries recorded for the nonterminal by
# update_productions_ending_with, or nil when none were recorded.
def productions_ending_with(nonterminal)
  @productions_ending_with[nonterminal.index_number]
end
367
+
368
# Records [production, prefix] under the nonterminal's index number unless
# an equal entry is already present.
def update_productions_ending_with(nonterminal, production, prefix)
  entry = [production, prefix]
  bucket = (@productions_ending_with[nonterminal.index_number] ||= [])
  bucket.push(entry) unless bucket.include?(entry)
end
377
+
378
# Shared empty array handed back by productions_with_valid_prefixes when a
# nonterminal has no recorded productions.
@@empty_array = []
379
+
380
# Returns the [production, prefix] entries recorded for pair.nonterminal whose
# prefix is either empty or ends in a symbol listed by
# @state_graph.incoming_links_info(pair.state). Entries failing that check are
# dropped here so that back_traverse is not attempted for them.
#
# When nothing is recorded for the nonterminal, the shared @@empty_array is
# returned.
def productions_with_valid_prefixes(pair)
  Profiler.__enter__(:productions_with_valid_prefixes) if $PROFILE
  candidates = productions_ending_with(pair.nonterminal)
  if candidates
    reachable_via = @state_graph.incoming_links_info(pair.state)
    result = candidates.select do |_production, prefix|
      prefix.length.zero? || reachable_via.include?(prefix.last)
    end
  else
    result = @@empty_array
  end
  Profiler.__leave__(:productions_with_valid_prefixes) if $PROFILE
  result
end
396
+
397
# Rebuilds @o2o_lalr_relations as a fresh DirectedGraph. For every LALR pair
# and every (production, prefix) returned by productions_with_valid_prefixes,
# the prefix is traversed backwards from the pair's state; when that yields a
# state, a relation is recorded from the pair (that state, the production's
# nonterminal) to the current pair.
def build_o2o_lalr_relations
  Profiler.__enter__(:build_o2o_lalr_relations) if $PROFILE
  @o2o_lalr_relations = DirectedGraph.new
  # The instance list is re-read on every pass instead of being iterated with
  # each: presumably lalr_pair can register new pairs while we walk the list,
  # and those must be visited too -- TODO confirm against the pair factory.
  position = 0
  until position >= @lalr_pair_factory.instances.length
    target = @lalr_pair_factory.instances[position]
    productions_with_valid_prefixes(target).each do |production, prefix|
      origin_state = back_traverse(target.state, prefix)
      next unless origin_state

      origin = lalr_pair(origin_state, production.nonterminal)
      o2o_lalr_relation(origin, target)
    end
    position += 1
  end
  Profiler.__leave__(:build_o2o_lalr_relations) if $PROFILE
end
416
+
417
# Memoised wrapper around @state_graph.back_traverse.
#
# Results are cached in @traverse_cache[state.index_number][elements]. When
# the graph raises GraphTraversalException the method returns nil and nothing
# is written to the cache for that lookup.
def back_traverse(state, elements)
  Profiler.__enter__(:back_traverse, state, elements) if $PROFILE
  slot = state.index_number
  destination = @traverse_cache[slot][elements]
  destination ||= begin
    fresh = @state_graph.back_traverse(state, elements)
    @traverse_cache[slot][elements] = fresh
  rescue GraphTraversalException
    nil
  end
  Profiler.__leave__(:back_traverse) if $PROFILE
  destination
end
432
+
433
# Returns the entry of @follow_sets stored at +symbol+'s index_number.
def follow_set(symbol)
  slot = symbol.index_number
  @follow_sets[slot]
end
436
+ end
437
+
438
# A set with fast union and add operations, stored as a bit vector inside a
# single Integer. The implementation is general and should probably be called
# IndexableObjectSet: any object responding to +index_number+ can be a member,
# and bit +index_number+ of the vector is set when that object is in the set.
class TerminalSet
  # allPossibleMembers - the universe of objects this set may contain.
  # members            - objects to add right away (each must be in the
  #                      universe, see #add).
  # all                - optional precomputed vector with every universe bit
  #                      set; derived from the highest index when omitted.
  # max                - optional precomputed highest index_number in the
  #                      universe; computed from the universe when omitted.
  def initialize(allPossibleMembers, members = [], all = nil, max = nil)
    @all_possible_members = allPossibleMembers
    # An empty universe has no maximum index; -1 makes the "all" vector 0
    # instead of failing on nil + 1.
    @max_index = max || allPossibleMembers.map { |m| m.index_number }.max || -1
    @all = all || (2**(@max_index + 1)) - 1
    @included = 0 # We represent as bit vector in Integer.
    members.each { |t| add(t) }
  end

  @@empty_array = Array.new

  # Builds a set over +allPossibleMembers+ whose bit vector is +included+.
  # +all+ and +max+ are passed through to #initialize.
  def self.new_from_integer(allPossibleMembers, included, all = nil,
                            max = nil)
    ts = new(allPossibleMembers, @@empty_array, all, max)
    ts.set_include_vector(included)
    ts
  end

  # Adds +terminal+ to the set.
  #
  # Raises ArgumentError when +terminal+ is not part of the universe.
  # A failure while computing the bit mask is reported on stdout and the set
  # is left unchanged; only StandardError is handled, so interrupts and exit
  # requests are no longer swallowed.
  def add(terminal)
    unless @all_possible_members.include?(terminal)
      raise ArgumentError, "#{terminal.inspect} is not a possible member"
    end
    begin
      @included |= mask(terminal.index_number)
    rescue StandardError
      puts "TerminalSet: #{@all_possible_members.inspect} but was #{terminal.inspect}"
    end
  end

  # In-place union: sets every bit that is set in other.to_i.
  def update(other)
    @included |= other.to_i
  end

  # Set difference: a new set holding the members of self that are not in
  # +other+.
  def -(other)
    # Keep a bit only when it is 1 here and 0 in other. The bitwise
    # complement is used (rather than subtracting from @all) so the result is
    # right even if other carries bits outside this universe.
    TerminalSet.new_from_integer(@all_possible_members,
                                 @included & ~other.to_i, @all,
                                 @max_index)
  end

  # True when no bit is set.
  def empty?
    @included == 0
  end

  # Members of the universe that are in the set, in universe order.
  def terminals
    @all_possible_members.select { |t| index_included?(t.index_number) }
  end

  # True when the bit for +index+ is set.
  def index_included?(index)
    (@included & mask(index)) != 0
  end

  # True when the bit for terminal.index_number is set.
  def include?(terminal)
    index_included?(terminal.index_number)
  end

  def inspect
    terminals.inspect
  end

  # Yields every member of the set in universe order. Without a block an
  # Enumerator over the members is returned.
  def each
    return enum_for(:each) unless block_given?

    Profiler.__enter__(:TerminalSet_each) if $PROFILE
    @all_possible_members.each do |t|
      yield(t) if index_included?(t.index_number)
    end
    Profiler.__leave__(:TerminalSet_each) if $PROFILE
  end

  # Replaces the whole bit vector with +newVector+.
  def set_include_vector(newVector)
    @included = newVector
  end

  # The bit vector as an Integer.
  def to_i
    @included
  end

  protected

  # Cache of single-bit masks, shared by all sets.
  @@masks = Array.new

  # Integer with only bit +index+ set.
  def mask(index)
    @@masks[index] || (@@masks[index] = (1 << index))
  end
end