rpdf2txt 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,401 @@
1
+ require 'rpdf2txt-rockit/base_extensions'
2
+ require 'rpdf2txt-rockit/graphviz_dot'
3
+
4
# Two-level auto-vivifying hash: h[a][b] works without explicit setup.
# When a default block is supplied the inner hashes are DefaultInitHash
# instances using that block; otherwise they are plain Hashes.
class HashOfHash < DefaultInitHash
  def initialize(&inner_default)
    super() do
      inner_default ? DefaultInitHash.new(&inner_default) : Hash.new
    end
  end
end
15
+
16
# A single directed edge from +from+ to +to+, optionally annotated with
# extra information in +info+.
GraphLink = Struct.new("GraphLink", :from, :to, :info)
class GraphLink
  # Render as "from-info->to"; the info segment is omitted when info is
  # nil (or false).
  def inspect
    label = info ? "#{info.inspect}-" : ""
    "#{from.inspect}-#{label}>#{to.inspect}"
  end
end
23
+
24
# Raised when a graph transition is requested along a link that does not
# exist. NOTE(review): inherits Exception rather than StandardError, so a
# bare `rescue` will not catch it — kept as-is for compatibility.
class GraphTraversalException < Exception
  attr_reader :node, :links, :link_info

  def initialize(node, links, link_info)
    @node = node
    @links = links
    @link_info = link_info
    super(message)
  end

  # Human-readable description built from the offending node, the
  # requested link info and the links that actually exist.
  def message
    "There is no link from #{@node.inspect} having info " \
      "#{@link_info.inspect} (valid links are #{@links.inspect})"
  end
  alias inspect message
end
36
+
37
# A directed graph optimized for fast queries after construction.
class DirectedGraph
  # This is a memory expensive variant that manages several additional
  # information data structures to cut down on processing when the graph
  # has been built.

  # All GraphLink objects in the graph, in insertion order.
  attr_reader :links

  def initialize
    @link_map = HashOfHash.new {Array.new} # [from][to] -> array of links
    @links = Array.new # All links in one array
    @is_root = Hash.new # true iff root node
    @is_leaf = Hash.new # true iff leaf node
  end

  # Every node in the graph (all nodes appear as keys of @is_root).
  def nodes
    @is_root.keys
  end

  # Register +node+; a fresh node counts as both root and leaf until linked.
  def add_node(node)
    unless include_node?(node)
      @is_root[node] = @is_leaf[node] = true
    end
  end

  def root?(node)
    @is_root[node]
  end

  def leaf?(node)
    @is_leaf[node]
  end

  def include_node?(node)
    @is_root.has_key?(node)
  end

  # Array (possibly empty) of all links going from +from+ to +to+.
  def links_from_to(from, to)
    @link_map[from][to]
  end

  # All outgoing links of +node+, over all destinations.
  def links_from(node)
    @link_map[node].map {|to, links| links}.flatten
  end

  # Destination nodes directly reachable from +node+.
  def children(node)
    @link_map[node].keys.select {|k| @link_map[node][k].length > 0}
  end

  # (Forced) add link will always add link even if there are already links
  # between the nodes.
  def add_link(from, to, informationOnLink = nil)
    add_link_nodes(from, to)
    link = GraphLink.new(from, to, informationOnLink)
    links_from_to(from, to).push link
    add_to_links(link)
    link
  end

  # Ensure both endpoints exist and update their root/leaf status.
  def add_link_nodes(from, to)
    add_node(from)
    add_node(to)
    @is_leaf[from] = @is_root[to] = false
  end

  # Add link if not already linked
  def link_nodes(from, to, info = nil)
    links_from_to?(from, to) ? nil : add_link(from, to, info)
  end

  def links_from_to?(from, to)
    not links_from_to(from, to).empty?
  end
  alias linked? links_from_to?

  def add_to_links(link)
    @links.push link
  end
  private :add_to_links

  # Yield every node reachable from +node+ depth-first (children before
  # their parent); +node+ itself is yielded last when +inclusive+.
  # NOTE(review): the visited set is not shared between the top-level
  # children, so a node reachable from several children may be yielded
  # more than once despite the "once" in the name — left unchanged to
  # preserve existing behavior.
  def each_reachable_node_once_depth_first(node, inclusive = true, &block)
    children(node).each do |c|
      recurse_each_reachable_depth_first_visited(c, Hash.new, &block)
    end
    block.call(node) if inclusive
  end
  alias each_reachable_node each_reachable_node_once_depth_first

  def recurse_each_reachable_depth_first_visited(node, visited, &block)
    visited[node] = true
    children(node).each do |c|
      unless visited[c]
        recurse_each_reachable_depth_first_visited(c, visited, &block)
      end
    end
    block.call(node)
  end

  # Breadth-first flavor: yields each node before descending into it.
  def each_reachable_node_once_breadth_first(node, inclusive = true, &block)
    block.call(node) if inclusive
    children(node).each do |c|
      recurse_each_reachable_breadth_first_visited(c, Hash.new, &block)
    end
  end
  # NOTE(review): this alias repeats the depth-first binding and was
  # probably intended to point at each_reachable_node_once_breadth_first.
  # Left unchanged so existing callers keep the depth-first behavior.
  alias each_reachable_node each_reachable_node_once_depth_first

  def recurse_each_reachable_breadth_first_visited(node, visited, &block)
    visited[node] = true
    block.call(node)
    children(node).each do |c|
      unless visited[c]
        recurse_each_reachable_breadth_first_visited(c, visited, &block)
      end
    end
  end

  # Nodes with no incoming links.
  def root_nodes
    @is_root.reject {|key,val| val == false}.keys
  end
  alias_method :roots, :root_nodes

  # Nodes with no outgoing links.
  def leaf_nodes
    @is_leaf.reject {|key,val| val == false}.keys
  end
  alias_method :leafs, :leaf_nodes

  def internal_node?(node)
    !root?(node) and !leaf?(node)
  end

  def internal_nodes
    nodes.reject {|n| root?(n) or leaf?(n)}
  end

  def recurse_cyclic?(node, visited)
    visited[node] = true
    children(node).each do |c|
      return true if visited[c] || recurse_cyclic?(c, visited)
    end
    false
  end

  # True when a cycle is reachable from some root node.
  def cyclic?
    visited = Hash.new
    root_nodes.each {|root| return true if recurse_cyclic?(root, visited)}
    false
  end

  def acyclic?
    not cyclic?
  end

  # Follow the link from +state+ whose info equals +linkInfo+ and return
  # its destination. Raises GraphTraversalException when no such link
  # exists. BUG FIX: the original wrapped `link.to` in a rescue of
  # Exception to detect the nil link via NoMethodError — which would also
  # swallow unrelated errors; the nil check is now explicit.
  def transition(state, linkInfo)
    link = links_from(state).detect {|l| l.info == linkInfo}
    unless link
      raise GraphTraversalException.new(state, links_from(state), linkInfo)
    end
    link.to
  end

  # Repeatedly transition from +fromState+ along the given infos (consumed
  # front-to-back) and return the final state reached.
  def traverse(fromState, alongLinksWithInfo = [])
    state, len = fromState, alongLinksWithInfo.length
    alongLinksWithInfo = alongLinksWithInfo.clone
    while len > 0
      state = transition(state, alongLinksWithInfo.shift)
      len -= 1
    end
    state
  end

  def to_dot(nodeShaper = nil, nodeLabeler = nil, linkLabeler = nil)
    f = DotGraphFormatter.new(nodeShaper, nodeLabeler, linkLabeler)
    f.format(nodes, links)
  end

  def to_postscript_file(filename, nodeShaper = nil, nodeLabeler = nil,
                         linkLabeler = nil)
    to_dot(nodeShaper, nodeLabeler, linkLabeler).write_to_file(filename)
  end

  # Floyd-Warshal algorithm which should be O(n^3) where n is the number of
  # nodes. We can probably work a bit on the constant factors!
  # (for-loops replaced with Range#each — `for` leaks its loop variable.)
  def transitive_closure_floyd_warshal
    vertices = nodes
    tcg = DirectedGraph.new
    num_nodes = vertices.length

    # Direct links (every node is also linked to itself in the closure)
    (0...num_nodes).each do |k|
      (0...num_nodes).each do |s|
        vk, vs = vertices[k], vertices[s]
        tcg.link_nodes(vk, vs) if vk == vs or linked?(vk, vs)
      end
    end

    # Indirect links
    (0...num_nodes).each do |i|
      (0...num_nodes).each do |j|
        (0...num_nodes).each do |k|
          vi, vj, vk = vertices[i], vertices[j], vertices[k]
          if not tcg.linked?(vi, vj)
            tcg.link_nodes(vi, vj) if linked?(vi, vk) and linked?(vk, vj)
          end
        end
      end
    end
    tcg
  end
  alias_method :transitive_closure, :transitive_closure_floyd_warshal
end
251
+
252
+ # Parallel propagation in directed acyclic graphs. Should be faster than
253
+ # traversing all links from each start node if the graph is dense so that
254
+ # many traversals can be merged.
255
# Parallel propagation in directed acyclic graphs. Should be faster than
# traversing all links from each start node if the graph is dense so that
# many traversals can be merged.
class DagPropagator
  def initialize(directedGraph, startNodes, &propagationBlock)
    @graph = directedGraph
    @block = propagationBlock
    init_start_nodes(startNodes)
    @visited = Hash.new
  end

  def init_start_nodes(startNodes)
    @startnodes = startNodes
  end

  # Run one full propagation; the visited set is reset every call.
  def propagate
    @visited.clear
    propagate_recursive
  end

  # Fire the block for every (parent, child) edge on the current frontier,
  # then advance to the unvisited children and repeat until the frontier
  # is exhausted.
  def propagate_recursive
    frontier = Array.new
    @startnodes.each do |parent|
      @visited[parent] = true
      @graph.children(parent).each do |child|
        @block.call(parent, child)
        unless @visited[child] or frontier.include?(child)
          frontier.push(child)
        end
      end
    end
    unless frontier.empty?
      @startnodes = frontier
      propagate_recursive
    end
  end
end
288
+
289
+ # Directed graph with fast traversal from children to parents (back)
290
# Directed graph with fast traversal from children to parents (back)
class BackLinkedDirectedGraph < DirectedGraph
  def initialize(*args)
    super
    @back_link_map = HashOfHash.new {Array.new} # [to][from] -> array of links
    @incoming_links_info = DefaultInitHash.new {Array.new}
  end

  # As DirectedGraph#add_link, but also indexes the link in the back-link
  # map and records each distinct non-nil link info arriving at +to+.
  def add_link(from, to, informationOnLink = nil)
    link = super
    links_to_from(to, from).push link
    if informationOnLink and
      !@incoming_links_info[to].include?(informationOnLink)
      @incoming_links_info[to].push informationOnLink
    end
    link
  end

  # All distinct infos seen on links coming into +node+ (deduplicated).
  def incoming_links_info(node)
    @incoming_links_info[node]
  end

  # Follow an incoming link of +node+ backwards: find the link whose info
  # equals +backLinkInfo+ and return its source. The rescue converts the
  # NoMethodError raised when no link matched (link is nil) into a
  # GraphTraversalException.
  def back_transition(node, backLinkInfo)
    link = links_to(node).detect {|l| l.info == backLinkInfo}
    begin
      link.from
    rescue Exception
      raise GraphTraversalException.new(node, links_to(node), backLinkInfo)
    end
  end

  # Walk backwards from +state+, consuming +alongLinksWithInfo+ from the
  # END of the array (pop), and return the state reached.
  def back_traverse(state, alongLinksWithInfo = [])
    len = alongLinksWithInfo.length
    alongLinksWithInfo = alongLinksWithInfo.clone
    while len > 0
      state = back_transition(state, alongLinksWithInfo.pop)
      len -= 1
    end
    state
  end

  # All incoming links of +node+, over all sources.
  def links_to(node)
    @back_link_map[node].map {|from, links| links}.flatten
  end

  protected

  # Array of links from +from+ to +to+ in the back-link index.
  def links_to_from(to, from)
    @back_link_map[to][from]
  end
end
340
+
341
# Fill +masks+ so that masks[i] == (1 << i) for every i in start..stop,
# and return the (possibly pre-existing) masks array.
def calc_masks(start, stop, masks = Array.new)
  mask = 1 << start
  (start..stop).each {|i| masks[i] = mask; mask <<= 1}
  masks
end

# Square boolean matrix over a fixed collection of objects, with each row
# packed into a single arbitrary-precision Integer so row operations are
# cheap bitwise ops.
class BooleanMatrix
  def initialize(objects)
    @index, @objects, @matrix = Hash.new, objects, Array.new
    cnt = 0
    objects.each do |o|
      @index[o] = cnt
      @matrix[cnt] = 0 # Use Integers to represent the booleans
      cnt += 1
    end
    @num_objects = cnt # renamed from the misspelled @num_obects
  end

  # Shared cache of single-bit masks, precomputed for indices 0..1000.
  @@masks_max = 1000
  @@masks = calc_masks(0, @@masks_max)

  # Bit mask for +index+, extending the shared cache on demand.
  def mask(index)
    mask = @@masks[index]
    unless mask
      calc_masks(@@masks_max + 1, index, @@masks)
      mask = @@masks[index] # BUG FIX: was @masks (instance var), always nil
    end
    mask
  end

  # OR row +index2+ into row +index1+.
  def or(index1, index2)
    @matrix[index1] |= @matrix[index2]
  end

  # Yield the position of every set bit in +anInteger+, lowest bit first.
  def indices(anInteger)
    index = 0
    while anInteger > 0
      # BUG FIX: was `yeild(index) if anInteger & 1` — a misspelled yield
      # that would raise NoMethodError, and `anInteger & 1` is always
      # truthy in Ruby (0 is truthy), so the guard never filtered.
      yield(index) if (anInteger & 1) == 1
      anInteger >>= 1
      index += 1
    end
  end

  # Build a DirectedGraph with a link objects[i] -> objects[j] for every
  # set bit j in row i.
  def directed_graph
    dg = DirectedGraph.new # BUG FIX: was the misspelled Directedgraph
    @matrix.each_with_index do |v, i|
      indices(v) do |index|
        dg.link_nodes(@objects[i], @objects[index])
      end
    end
    dg
  end

  # NOTE(review): unfinished in the original — the nested loops have empty
  # bodies (and the 0..@num_objects ranges overrun by one). Left
  # unimplemented rather than guessing at the intended algorithm.
  def transitive_closure
    for i in (0..@num_objects)
      for j in (0..@num_objects)

      end
    end
  end
end
@@ -0,0 +1,393 @@
1
+ require 'rpdf2txt-rockit/graphdrawing'
2
+ require 'rpdf2txt-rockit/version'
3
+ require 'rpdf2txt-rockit/sourcecode_dumpable'
4
+ require 'rpdf2txt-rockit/parsing_ambiguities'
5
+
6
+ class ParseException < Exception; end;
7
+
8
+ # Generalized LR Parsing class
9
+ #
10
+ # This is a modification of Jan Rekers and Eelco Vissers Generalized LR
11
+ # parsers which in turn are derived from the Tomita parsing algorithm. The
12
+ # main feature of these kinds of parsers is that arbitrary long lookahead is
13
+ # used (when needed) since a parser is forked off every time there is an
14
+ # ambiguity.
15
+ #
16
+ # This implementation assumes that the ambiguities (arising from lack of
17
+ # lookahead) are resolved later; it does not handle ambiguities arising
18
+ # from the grammar. However, it can easily be extended to return a parse tree
19
+ # forest with all possible parse trees if there is a need for that.
20
+ # Alternatively, the user can resolve ambiguities in the grammar by specifying
21
+ # production priorities.
22
+ #
23
+ # The modification I've done is so that multiple token streams from the lexer
24
+ # can be handled. This allows simpler specification of lexers while still
25
+ # leading to valid parses as long as the grammar is unambiguous.
26
+ #
27
+ # The algorithm used is copyright (c) 2001 Robert Feldt.
28
+ #
29
# Generalized LR (Tomita-style) parser driven by a precomputed parse
# table; see the file header comment for the algorithm background.
class GeneralizedLrParser
  include SourceCodeDumpable

  # aParseTable - the parse table (states, actions, gotos) to drive parsing.
  # aLexer - optional; when nil a ForkingRegexpLexer is built from the
  # table's token set with the synthetic :EOF token removed.
  def initialize(aParseTable, aLexer = nil)
    @parse_table = aParseTable
    # puts @parse_table.inspect
    if aLexer
      @lexer = aLexer
    else
      tokens = @parse_table.tokens.clone
      tokens.delete(:EOF)
      @lexer = ForkingRegexpLexer.new(tokens)
    end
  end

  # Header comment placed at the top of generated parser source.
  def parser_src_header
    "# Parser for #{@parse_table.language}\n" +
    "# created by Rockit version #{rockit_version} on #{Time.new.inspect}\n" +
    "# Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se\n" +
    "# and licensed under GPL\n" +
    "# but this parser is under LGPL\n"
  end

  # Dump this parser (parse table included) as loadable Ruby source.
  def to_src(assignToName = nil, nameHash = {})
    ptname = "@@parse_table" + self.object_id.inspect.gsub('-', '_')
    parser_src_header + @parse_table.to_src(ptname) + "\n" +
    assign_to(assignToName,
              new_of_my_type(as_code(ptname)))
  end

  # Parse +aString+ and return its syntax tree. Raises
  # AmbigousParseException when the parse is ambiguous and ParseException
  # when no parse exists.
  def parse(aString)
    @string_being_parsed = aString
    @stacks_to_act_on, @accepted_stacks, @stacks_to_shift = [], [], []
    @lexer.init(aString)
    start_state = @parse_table.start_state
    @active_stacks = [ParseStack.new(start_state, @lexer)]
    @cnt, @reducer_cnt = -1, 0
    # Classic GLR main loop: run reduce/accept actions on every active
    # stack, then shift all stacks in lockstep, until none remain active.
    while @active_stacks.length > 0
      # File.open("as#{@cnt+=1}.graph", "w") {|f| f.write parsestacks_as_dot_digraph(@active_stacks)}
      @stacks_to_shift.clear
      @stacks_to_act_on = @active_stacks.clone
      actor(@stacks_to_act_on.shift) while @stacks_to_act_on.length > 0
      shifter
    end
    if @accepted_stacks.length > 0
      tree = @accepted_stacks.first.links_to_stack_in_state?(start_state).tree
      check_and_report_ambiguity tree
      return tree
    else
      handle_parse_error
    end
  end

  protected

  # Walk the finished tree and raise if any ambiguity nodes survived.
  def check_and_report_ambiguity(tree)
    tree.each_node do |node|
      if node.kind_of?(AmbiguityNode)
        raise AmbigousParseException.new(@string_being_parsed, tree,
                                         *(node.ambigous_trees))
      end
    end
  end

  # Build and raise a ParseException describing the stacks that were
  # active when parsing got stuck (when available).
  def handle_parse_error
    if @last_active_stacks
      str = "No valid token found on stacks:\n"
      @last_active_stacks.each_with_index do |stack, i|
        tokens = stack.lexer.peek
        str += "stack #{i}: #{context(stack.lexer.position.char_position)}" +
          "in state #{stack.state}\n" +
          "the lexer returns tokens = #{tokens.inspect} (#{tokens.map{|t| t.token_type.to_s}.inspect})\n" +
          "and the valid tokens = #{@parse_table.valid_tokens(stack.state).inspect}\n"
      end
    else
      str = "String could not be parsed"
    end
    raise ParseException, str
  end

  # Pretty-print the source line around +position+ with an arrow marker
  # pointing at the error column.
  def context(position)
    line, startpos = get_line_with_position(@string_being_parsed, position)
    indent = (" " * (position - startpos + 1))
    "on line:\n '" + line.to_s + "'\n " +
    indent + "^\n " +
    indent + "|\n " +
    indent + "--- Parse error!!\n"
  end

  # Return [line_text, line_start_index] for the line of +string+ that
  # contains +position+.
  def get_line_with_position(string, position)
    startpos = position
    len = string.length
    while string[startpos,1] != "\n" and startpos > 1
      startpos -= 1
    end
    endpos = position
    while string[endpos,1] != "\n" and endpos < len
      endpos += 1
    end
    return string[startpos+1...endpos], startpos+1
  end

  # Apply every parse-table action (SHIFT is deferred to the shifter;
  # REDUCE and ACCEPT happen immediately) matching any token the stack's
  # lexer currently offers.
  def actor(stack)
    #puts "actor(#{stack.state}) @stacks_to_act_on = #{@stacks_to_act_on.map{|s| s.state}.inspect}, @active_stacks = #{@active_stacks.map{|s| s.state}.inspect}"
    tokens = stack.lexer.peek
    #print "tokens = #{tokens.inspect}, "
    tokens.each do |token|
      #print "state = #{stack.state.inspect}, "
      actions = @parse_table.actions(stack.state, token.token_type)
      next unless actions
      #puts "lexer = #{stack.lexer.inspect} (#{stack.lexer.id}), token = #{token.inspect}, actions = #{actions.inspect}"
      actions.each do |action|
        case action[0]
        when :SHIFT
          @stacks_to_shift.push [stack, action[1], token]
        when :REDUCE
          do_reductions(stack, action[1], stack.lexer)
        when :ACCEPT
          @accepted_stacks.push stack
        end
      end
    end
  end

  # Reduce +stack+ by the given production: collect all stack paths of the
  # production's length (pruning paths that violate priority conflicts)
  # and run the reducer on each.
  def do_reductions(stack, productionNumber, lexer, pathsInludingLink = nil)
    production = @parse_table.production(productionNumber)
    #puts @parse_table.priorities.inspect
    paths =
      if @parse_table.priorities.in_some_conflict?(production)
        # Only return valid paths, ie. paths without conflicts. Prune the
        # invalid paths.
        stack.valid_paths_of_length_with_pruning(production.length,
                                                 pathsInludingLink) do |l,p|
          not @parse_table.priorities.conflict?(l.production, p, production)
        end
      else
        stack.paths_of_length(production.length, pathsInludingLink)
      end
    paths.each do |path|
      reducer(path.to, @parse_table.goto(path.to.state, productionNumber),
              lexer, production, trees_on_path(path), productionNumber)
    end
  end

  # Perform one reduction: build the tree, then either merge into an
  # existing stack with the same (state, lexer) — detecting ambiguity if a
  # parallel link already exists — or push a brand new stack.
  def reducer(stack, newState, newLexer, production, childTrees, productionNum)
    # puts "#{@reducer_cnt+=1}: reducer(#{stack.state}, #{production.inspect})\n"
    tree = production.create_tree(childTrees)

    # ywesee: attach the raw source slice the reduction consumed.
    if(tree)
      range = stack.pointer..newLexer.scanner.pointer
      tree.raw_src = (@lexer.scanner.string[range] || '').strip
    end

    existing_stack = @active_stacks.detect do |as|
      as.state==newState and as.lexer==newLexer
    end
    if existing_stack
      # There is already a stack with the same state and lexer, so instead
      # of creating a new stack we re-use the existing one.
      if (existing_link = existing_stack.links_to_stack?(stack))
        # There is already a link to the stack => Ambiguity unless existing
        # tree same as new tree. The latter can happen when multiple token
        # types that all match the current string are in the follow set of
        # the same production.
        if tree != existing_link.tree
          handle_ambiguity(existing_stack, existing_link, tree, production)
        end
      else
        new_link = existing_stack.add_link(stack, tree, production)
        recheck_stacks(@active_stacks - @stacks_to_act_on, new_link)
      end
    else
      new_stack = ParseStack.new(newState, newLexer)
      new_stack.add_link(stack, tree, production)
      @stacks_to_act_on.push new_stack
      @active_stacks.push new_stack
    end
    # File.open("as#{@cnt}_#{@reducer_cnt}.graph", "w") {|f| f.write parsestacks_as_dot_digraph(@active_stacks)}
  end

  def recheck_stacks(stacks, link)
    # Recheck stacks to see if new reductions are possible including the new
    # link
    stacks.each do |stack|
      actions(stack).each do |action|
        if action[0] == :REDUCE
          do_reductions(stack, action[1], stack.lexer, link)
        end
      end
    end
  end

  # All table actions available to +stack+ for its currently peeked tokens.
  def actions(stack)
    actions, state = [], stack.state
    stack.lexer.peek.each do |token|
      actions.concat @parse_table.actions(state, token.token_type)
    end
    actions
  end

  # Record an ambiguity on +existingLink+ by wrapping (or extending) an
  # AmbiguityNode holding every alternative tree.
  def handle_ambiguity(existingStack, existingLink, newTree, production)
    # We can extend the parser here to return the full parse forest.
    # For now we simplify things and raise an exception.
    existing_tree = existingLink.tree
    if existing_tree.kind_of?(AmbiguityNode)
      existing_tree.add_ambigoustree(newTree)
    else
      existingLink.tree = AmbiguityNode.new(existing_tree, newTree)
    end
    #alternatives = [newTree.compact!]
    #existingStack.links.each {|link| alternatives.push link.tree.compact!}
    #raise AmbigousParseException.new(@string_being_parsed, *alternatives)
  end

  # Debug helper (all output currently commented out).
  def write_graphs
    #File.open("atree1.graph", "w") {|f| f.write syntaxtree_as_dot_digraph(newTree)}
    #File.open("atree2.graph", "w") {|f| f.write syntaxtree_as_dot_digraph(existingStack.links[0].tree)}
    #File.open("as_at_ambiguity.graph", "w") {|f| f.write parsestacks_as_dot_digraph(@active_stacks)}
  end

  # Consume the deferred SHIFT actions: advance every shifting stack,
  # merging stacks that land in the same (state, lexer) pair.
  def shifter
    # Save current active_stacks if there is a parse error
    @last_active_stacks = @active_stacks.clone
    @active_stacks.clear
    @stacks_to_shift.each do |stack, newstate, lexertoken|
      tree = lexertoken.create_tree
      existing_stack = @active_stacks.detect do |as|
        as.state==newstate and as.lexer==lexertoken.lexer
      end
      if existing_stack
        existing_stack.add_link(stack, tree)
      else
        new_stack = ParseStack.new(newstate, lexertoken.lexer)
        new_stack.add_link(stack, tree)
        @active_stacks.push new_stack
      end
    end
  end

  # Trees along a path, reordered from parse order (links are stored
  # newest-first) back into source order.
  def trees_on_path(path)
    path.links.reverse.map {|link| link.tree}
  end
end
273
+
274
# One node in the graph-structured stack of the GLR parser. Each stack
# node knows its LR state, its lexer, and the links leading back towards
# the start of the parse.
class ParseStack
  # A back link: the previous stack node, the syntax tree built for the
  # consumed input, and (for reduce links) the production applied.
  Link = Struct.new("Link", :stack, :tree, :production)
  class Link
    def inspect
      "-#{tree.inspect}->#{stack.state.inspect}"
    end
  end

  attr_reader :state, :lexer, :links
  attr_reader :pointer # ywesee

  def initialize(aState, aLexer)
    @state, @lexer = aState, aLexer
    @links = Array.new
  end

  # ywesee
  # Write-once setter: only the FIRST assigned value is kept (||=), so the
  # pointer marks where this stack node first entered the input.
  def pointer=(value)
    @pointer ||= value
  end

  # Link this node back to +aParseStack+ and return the new Link. Also
  # stamps the target's input pointer from this node's scanner (ywesee).
  def add_link(aParseStack, aTree, aProduction = nil)
    aParseStack.pointer = @lexer.scanner.pointer # ywesee
    @links.push(l = Link.new(aParseStack, aTree, aProduction))
    l
  end

  # First link leading to a stack node in +state+ (or nil).
  def links_to_stack_in_state?(state)
    @links.detect {|link| link.stack.state == state}
  end

  # First link leading to +stack+ (or nil).
  def links_to_stack?(stack)
    @links.detect {|link| link.stack == stack}
  end

  # All paths of exactly +length+ links starting at this node; when
  # +aLink+ is given, only paths containing it are returned.
  def paths_of_length(length, aLink = nil)
    return [] if length == 0
    paths = Array.new
    @links.each do |link|
      child_paths = link.stack.paths_of_length(length-1)
      if child_paths.length > 0
        child_paths.each {|cpath| paths.push(StackPath.new(self, [link]).add_path(cpath))}
      else
        paths.push StackPath.new(self, [link])
      end
    end
    delete_paths_without_link(paths, aLink)
  end

  # Like paths_of_length, but keeps only paths whose links all pass
  # +validity_checker+. NOTE: prunes failing links from @links as a side
  # effect (the link list is replaced with the surviving links).
  def valid_paths_of_length_with_pruning(length, aLink = nil,
                                         &validity_checker)
    return [] if length == 0
    paths, new_links = Array.new, Array.new
    @links.each do |link|
      if validity_checker.call(link, length-1)
        if length == 1
          new_links.push(link)
          paths.push(StackPath.new(self, [link]))
        else
          child_paths =
            link.stack.valid_paths_of_length_with_pruning(length-1,
                                                          &validity_checker)
          if child_paths.length > 0
            new_links.push(link)
            child_paths.each do |cpath|
              paths.push(StackPath.new(self, [link]).add_path(cpath))
            end
          end
        end
      end
    end
    @links = new_links
    delete_paths_without_link(paths, aLink)
  end

  def inspect
    "PSt(#{@state}, #{@links.inspect}, #{@lexer.inspect})"
  end

  private

  # When +aLink+ is given, keep only the paths that contain it.
  def delete_paths_without_link(paths, aLink)
    if aLink
      return paths.find_all {|path| path.includes_link?(aLink)}
    else
      return paths
    end
  end
end
363
+
364
# A path through the graph-structured parse stack: a starting node
# (+from+) plus an ordered list of links; +to+ always points at the stack
# node the last link leads to.
class StackPath
  attr_reader :to, :from, :links

  def initialize(from, links = [])
    @from = from
    @links = Array.new
    links.each { |link| add_link(link) }
  end

  # Append one link and advance the path endpoint.
  def add_link(link)
    @links << link
    @to = link.stack
  end

  # Append every link of another path; returns self so calls can chain.
  def add_path(aStackPath)
    @links.concat(aStackPath.links)
    @to = @links.last.stack
    self
  end

  # Truthy (the link itself) when +link+ is part of this path.
  def includes_link?(link)
    @links.find { |l| l == link }
  end

  def inspect
    segments = @links.map do |l|
      "-#{l.tree.inspect}-> #{l.stack.state} (#{l.stack.lexer.position.char_position}) "
    end
    "#{from.state}(#{from.lexer.position.char_position}) " + segments.join
  end
end