rpdf2txt 0.8.2

Files changed (127)
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,401 @@
+ require 'rpdf2txt-rockit/base_extensions'
+ require 'rpdf2txt-rockit/graphviz_dot'
+
+ class HashOfHash < DefaultInitHash
+   def initialize(&initBlock)
+     super do
+       if initBlock
+         DefaultInitHash.new(&initBlock)
+       else
+         Hash.new
+       end
+     end
+   end
+ end
+
+ GraphLink = Struct.new("GraphLink", :from, :to, :info)
+ class GraphLink
+   def inspect
+     info_str = info ? info.inspect + "-" : ""
+     "#{from.inspect}-#{info_str}>#{to.inspect}"
+   end
+ end
+
+ class GraphTraversalException < Exception
+   attr_reader :node, :links, :link_info
+   def initialize(node, links, linkInfo)
+     @node, @links, @link_info = node, links, linkInfo
+     super(message)
+   end
+
+   def message
+     "There is no link from #{@node.inspect} having info #{@link_info.inspect} (valid links are #{@links.inspect})"
+   end
+   alias inspect message
+ end
+
+ class DirectedGraph
+   # This is a memory-expensive variant that maintains several additional
+   # data structures in order to cut down on processing once the graph
+   # has been built.
+
+   attr_reader :links
+
+   def initialize
+     @link_map = HashOfHash.new {Array.new} # [from][to] -> array of links
+     @links = Array.new                     # All links in one array
+     @is_root = Hash.new                    # true iff root node
+     @is_leaf = Hash.new                    # true iff leaf node
+   end
+
+   def nodes
+     @is_root.keys
+   end
+
+   def add_node(node)
+     unless include_node?(node)
+       @is_root[node] = @is_leaf[node] = true
+     end
+   end
+
+   def root?(node)
+     @is_root[node]
+   end
+
+   def leaf?(node)
+     @is_leaf[node]
+   end
+
+   def include_node?(node)
+     @is_root.has_key?(node)
+   end
+
+   def links_from_to(from, to)
+     @link_map[from][to]
+   end
+
+   def links_from(node)
+     @link_map[node].map {|to, links| links}.flatten
+   end
+
+   def children(node)
+     @link_map[node].keys.select {|k| @link_map[node][k].length > 0}
+   end
+
+   # A (forced) add_link always adds a link, even if there are already links
+   # between the nodes.
+   def add_link(from, to, informationOnLink = nil)
+     add_link_nodes(from, to)
+     link = GraphLink.new(from, to, informationOnLink)
+     links_from_to(from, to).push link
+     add_to_links(link)
+     link
+   end
+
+   def add_link_nodes(from, to)
+     add_node(from)
+     add_node(to)
+     @is_leaf[from] = @is_root[to] = false
+   end
+
+   # Add a link only if the nodes are not already linked
+   def link_nodes(from, to, info = nil)
+     links_from_to?(from, to) ? nil : add_link(from, to, info)
+   end
+
+   def links_from_to?(from, to)
+     not links_from_to(from, to).empty?
+   end
+   alias linked? links_from_to?
+
+   def add_to_links(link)
+     @links.push link
+   end
+   private :add_to_links
+
+   def each_reachable_node_once_depth_first(node, inclusive = true, &block)
+     children(node).each do |c|
+       recurse_each_reachable_depth_first_visited(c, Hash.new, &block)
+     end
+     block.call(node) if inclusive
+   end
+   alias each_reachable_node each_reachable_node_once_depth_first
+
+   def recurse_each_reachable_depth_first_visited(node, visited, &block)
+     visited[node] = true
+     children(node).each do |c|
+       unless visited[c]
+         recurse_each_reachable_depth_first_visited(c, visited, &block)
+       end
+     end
+     block.call(node)
+   end
+
+   def each_reachable_node_once_breadth_first(node, inclusive = true, &block)
+     block.call(node) if inclusive
+     children(node).each do |c|
+       recurse_each_reachable_breadth_first_visited(c, Hash.new, &block)
+     end
+   end
+   alias each_reachable_node each_reachable_node_once_depth_first
+
+   def recurse_each_reachable_breadth_first_visited(node, visited, &block)
+     visited[node] = true
+     block.call(node)
+     children(node).each do |c|
+       unless visited[c]
+         recurse_each_reachable_breadth_first_visited(c, visited, &block)
+       end
+     end
+   end
+
+   def root_nodes
+     @is_root.reject {|key,val| val == false}.keys
+   end
+   alias_method :roots, :root_nodes
+
+   def leaf_nodes
+     @is_leaf.reject {|key,val| val == false}.keys
+   end
+   alias_method :leafs, :leaf_nodes
+
+   def internal_node?(node)
+     !root?(node) and !leaf?(node)
+   end
+
+   def internal_nodes
+     nodes.reject {|n| root?(n) or leaf?(n)}
+   end
+
+   def recurse_cyclic?(node, visited)
+     visited[node] = true
+     children(node).each do |c|
+       return true if visited[c] || recurse_cyclic?(c, visited)
+     end
+     false
+   end
+
+   def cyclic?
+     visited = Hash.new
+     root_nodes.each {|root| return true if recurse_cyclic?(root, visited)}
+     false
+   end
+
+   def acyclic?
+     not cyclic?
+   end
+
+   def transition(state, linkInfo)
+     link = links_from(state).detect {|l| l.info == linkInfo}
+     begin
+       link.to
+     rescue Exception
+       raise GraphTraversalException.new(state, links_from(state), linkInfo)
+     end
+   end
+
+   def traverse(fromState, alongLinksWithInfo = [])
+     state, len = fromState, alongLinksWithInfo.length
+     alongLinksWithInfo = alongLinksWithInfo.clone
+     while len > 0
+       state = transition(state, alongLinksWithInfo.shift)
+       len -= 1
+     end
+     state
+   end
+
+   def to_dot(nodeShaper = nil, nodeLabeler = nil, linkLabeler = nil)
+     f = DotGraphFormatter.new(nodeShaper, nodeLabeler, linkLabeler)
+     f.format(nodes, links)
+   end
+
+   def to_postscript_file(filename, nodeShaper = nil, nodeLabeler = nil,
+                          linkLabeler = nil)
+     to_dot(nodeShaper, nodeLabeler, linkLabeler).write_to_file(filename)
+   end
+
+   # Floyd-Warshall algorithm, which should be O(n^3) where n is the number
+   # of nodes. We can probably work a bit on the constant factors!
+   def transitive_closure_floyd_warshal
+     vertices = nodes
+     tcg = DirectedGraph.new
+     num_nodes = vertices.length
+
+     # Direct links
+     for k in (0...num_nodes)
+       for s in (0...num_nodes)
+         vk, vs = vertices[k], vertices[s]
+         if vk == vs
+           tcg.link_nodes(vk,vs)
+         elsif linked?(vk, vs)
+           tcg.link_nodes(vk,vs)
+         end
+       end
+     end
+
+     # Indirect links
+     for i in (0...num_nodes)
+       for j in (0...num_nodes)
+         for k in (0...num_nodes)
+           vi, vj, vk = vertices[i], vertices[j], vertices[k]
+           if not tcg.linked?(vi,vj)
+             tcg.link_nodes(vi, vj) if linked?(vi,vk) and linked?(vk,vj)
+           end
+         end
+       end
+     end
+     tcg
+   end
+   alias_method :transitive_closure, :transitive_closure_floyd_warshal
+ end
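
For orientation, here is a brief usage sketch of the DirectedGraph API defined above. It is illustrative only and not part of the gem's files; the node names and link infos are made up.

  require 'rpdf2txt-rockit/directed_graph'

  g = DirectedGraph.new
  g.add_link(:a, :b, "ab")     # forced add; returns the GraphLink
  g.link_nodes(:b, :c, "bc")   # adds a link only when :b and :c are not yet linked
  g.link_nodes(:a, :d)

  g.roots                      # => [:a]
  g.leafs                      # => [:c, :d]
  g.children(:a)               # => [:b, :d]
  g.linked?(:a, :b)            # => true
  g.internal_nodes             # => [:b]

  # Walk the graph by link info from a start node
  g.traverse(:a, ["ab", "bc"]) # => :c

  # Reachability graph built by transitive_closure (self-loops are added too)
  tc = g.transitive_closure
  tc.linked?(:a, :c)           # => true
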
+
+ # Parallel propagation in directed acyclic graphs. Should be faster than
+ # traversing all links from each start node if the graph is dense so that
+ # many traversals can be merged.
+ class DagPropagator
+   def initialize(directedGraph, startNodes, &propagationBlock)
+     @graph, @block = directedGraph, propagationBlock
+     init_start_nodes(startNodes)
+     @visited = Hash.new
+   end
+
+   def init_start_nodes(startNodes)
+     @startnodes = startNodes
+   end
+
+   def propagate
+     @visited.clear
+     propagate_recursive
+   end
+
+   def propagate_recursive
+     next_start_nodes = Array.new
+     @startnodes.each do |parent|
+       @visited[parent] = true
+       @graph.children(parent).each do |child|
+         @block.call(parent, child)
+         unless @visited[child] or next_start_nodes.include?(child)
+           next_start_nodes.push(child)
+         end
+       end
+     end
+     if next_start_nodes.length > 0
+       @startnodes = next_start_nodes
+       propagate_recursive
+     end
+   end
+ end
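
A sketch of how DagPropagator might be driven (again illustrative, not code from the gem): the block is called for each (parent, child) link, level by level, starting from the given start nodes.

  g = DirectedGraph.new
  g.link_nodes(:a, :b)
  g.link_nodes(:a, :c)
  g.link_nodes(:b, :d)
  g.link_nodes(:c, :d)

  depth = Hash.new(0)
  prop = DagPropagator.new(g, g.roots) do |parent, child|
    # propagate a depth estimate along every link
    depth[child] = [depth[child], depth[parent] + 1].max
  end
  prop.propagate
  depth[:d]   # => 2
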
+
+ # Directed graph with fast traversal from children to parents (back)
+ class BackLinkedDirectedGraph < DirectedGraph
+   def initialize(*args)
+     super
+     @back_link_map = HashOfHash.new {Array.new} # [to][from] -> array of links
+     @incoming_links_info = DefaultInitHash.new {Array.new}
+   end
+
+   def add_link(from, to, informationOnLink = nil)
+     link = super
+     links_to_from(to, from).push link
+     if informationOnLink and
+        !@incoming_links_info[to].include?(informationOnLink)
+       @incoming_links_info[to].push informationOnLink
+     end
+     link
+   end
+
+   def incoming_links_info(node)
+     @incoming_links_info[node]
+   end
+
+   def back_transition(node, backLinkInfo)
+     link = links_to(node).detect {|l| l.info == backLinkInfo}
+     begin
+       link.from
+     rescue Exception
+       raise GraphTraversalException.new(node, links_to(node), backLinkInfo)
+     end
+   end
+
+   def back_traverse(state, alongLinksWithInfo = [])
+     len = alongLinksWithInfo.length
+     alongLinksWithInfo = alongLinksWithInfo.clone
+     while len > 0
+       state = back_transition(state, alongLinksWithInfo.pop)
+       len -= 1
+     end
+     state
+   end
+
+   def links_to(node)
+     @back_link_map[node].map {|from, links| links}.flatten
+   end
+
+   protected
+
+   def links_to_from(to, from)
+     @back_link_map[to][from]
+   end
+ end
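
Likewise, a small illustration of the back-linked variant (not part of the gem). Every link is also recorded in a reverse map, so traversal by link info works from child to parent; note that back_traverse pops the info list from the end, so it is given in forward order.

  g = BackLinkedDirectedGraph.new
  g.add_link(:s0, :s1, "x")
  g.add_link(:s1, :s2, "y")

  g.incoming_links_info(:s2)       # => ["y"]
  g.back_transition(:s2, "y")      # => :s1
  g.back_traverse(:s2, ["x", "y"]) # => :s0
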
340
+
341
+ def calc_masks(start, stop, masks = Array.new)
342
+ mask = 1 << start
343
+ (start..stop).each {|i| masks[i] = mask; mask <<= 1}
344
+ masks
345
+ end
346
+
347
+ class BooleanMatrix
348
+ def initialize(objects)
349
+ @index, @objects, @matrix = Hash.new, objects, Array.new
350
+ cnt = 0
351
+ objects.each do |o|
352
+ @index[o] = cnt
353
+ @matrix[cnt] = 0 # Use Integers to represent the booleans
354
+ cnt += 1
355
+ end
356
+ @num_obects = cnt
357
+ end
358
+
359
+ @@masks_max = 1000
360
+ @@masks = calc_masks(0,@@masks_max)
361
+
362
+ def mask(index)
363
+ mask = @@masks[index]
364
+ unless mask
365
+ calc_masks(@@masks_max+1, index, @@masks)
366
+ mask = @masks[index]
367
+ end
368
+ mask
369
+ end
370
+
371
+ def or(index1, index2)
372
+ @matrix[index1] |= @matrix[index2]
373
+ end
374
+
375
+ def indices(anInteger)
376
+ index = 0
377
+ while anInteger > 0
378
+ yeild(index) if anInteger & 1
379
+ anInteger >>= 1
380
+ index += 1
381
+ end
382
+ end
383
+
384
+ def directed_graph
385
+ dg = Directedgraph.new
386
+ @matrix.each_with_index do |v,i|
387
+ indices(v) do |index|
388
+ dg.link_nodes(@objects[i], @objects[index])
389
+ end
390
+ end
391
+ dg
392
+ end
393
+
+   def transitive_closure
+     for i in (0..@num_obects)
+       for j in (0..@num_obects)
+
+       end
+     end
+   end
+ end
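
calc_masks just precomputes powers of two (calc_masks(0, 3) returns [1, 2, 4, 8]), and BooleanMatrix packs each row into a single Integer so that a whole row can be merged with one OR. The transitive_closure method above is only an empty loop skeleton in this release; a Warshall-style completion over the bitmask rows could look like the sketch below. This is an assumption about the intended behaviour, not code from the gem.

  # Hypothetical completion of BooleanMatrix#transitive_closure (assumed intent):
  # whenever row i has bit k set (object i reaches object k), merge row k into
  # row i, so that i also reaches everything k reaches.
  class BooleanMatrix
    def transitive_closure
      for k in (0...@num_obects)
        for i in (0...@num_obects)
          @matrix[i] |= @matrix[k] if (@matrix[i] & mask(k)) != 0
        end
      end
      self
    end
  end
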
@@ -0,0 +1,393 @@
+ require 'rpdf2txt-rockit/graphdrawing'
+ require 'rpdf2txt-rockit/version'
+ require 'rpdf2txt-rockit/sourcecode_dumpable'
+ require 'rpdf2txt-rockit/parsing_ambiguities'
+
+ class ParseException < Exception; end
+
+ # Generalized LR Parsing class
+ #
+ # This is a modification of Jan Rekers' and Eelco Visser's Generalized LR
+ # parsers, which in turn are derived from the Tomita parsing algorithm. The
+ # main feature of these kinds of parsers is that arbitrarily long lookahead
+ # is used (when needed), since a parser is forked off every time there is an
+ # ambiguity.
+ #
+ # This implementation assumes that the ambiguities (arising from lack of
+ # lookahead) are resolved later; it does not handle ambiguities arising
+ # from the grammar. However, it can easily be extended to return a parse
+ # tree forest with all possible parse trees if there is a need for that.
+ # Alternatively, the user can resolve ambiguities in the grammar by
+ # specifying production priorities.
+ #
+ # The modification I've made is that multiple token streams from the lexer
+ # can be handled. This allows simpler specification of lexers while still
+ # leading to valid parses as long as the grammar is unambiguous.
+ #
+ # The algorithm used is copyright (c) 2001 Robert Feldt.
+ #
+ class GeneralizedLrParser
+   include SourceCodeDumpable
+
+   def initialize(aParseTable, aLexer = nil)
+     @parse_table = aParseTable
+     # puts @parse_table.inspect
+     if aLexer
+       @lexer = aLexer
+     else
+       tokens = @parse_table.tokens.clone
+       tokens.delete(:EOF)
+       @lexer = ForkingRegexpLexer.new(tokens)
+     end
+   end
+
+   def parser_src_header
+     "# Parser for #{@parse_table.language}\n" +
+       "# created by Rockit version #{rockit_version} on #{Time.new.inspect}\n" +
+       "# Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se\n" +
+       "# and licensed under GPL\n" +
+       "# but this parser is under LGPL\n"
+   end
+
+   def to_src(assignToName = nil, nameHash = {})
+     ptname = "@@parse_table" + self.object_id.inspect.gsub('-', '_')
+     parser_src_header + @parse_table.to_src(ptname) + "\n" +
+       assign_to(assignToName,
+                 new_of_my_type(as_code(ptname)))
+   end
+
+   def parse(aString)
+     @string_being_parsed = aString
+     @stacks_to_act_on, @accepted_stacks, @stacks_to_shift = [], [], []
+     @lexer.init(aString)
+     start_state = @parse_table.start_state
+     @active_stacks = [ParseStack.new(start_state, @lexer)]
+     @cnt, @reducer_cnt = -1, 0
+     while @active_stacks.length > 0
+       # File.open("as#{@cnt+=1}.graph", "w") {|f| f.write parsestacks_as_dot_digraph(@active_stacks)}
+       @stacks_to_shift.clear
+       @stacks_to_act_on = @active_stacks.clone
+       actor(@stacks_to_act_on.shift) while @stacks_to_act_on.length > 0
+       shifter
+     end
+     if @accepted_stacks.length > 0
+       tree = @accepted_stacks.first.links_to_stack_in_state?(start_state).tree
+       check_and_report_ambiguity tree
+       return tree
+     else
+       handle_parse_error
+     end
+   end
+
+   protected
+
+   def check_and_report_ambiguity(tree)
+     tree.each_node do |node|
+       if node.kind_of?(AmbiguityNode)
+         raise AmbigousParseException.new(@string_being_parsed, tree,
+                                          *(node.ambigous_trees))
+       end
+     end
+   end
+
+   def handle_parse_error
+     if @last_active_stacks
+       str = "No valid token found on stacks:\n"
+       @last_active_stacks.each_with_index do |stack, i|
+         tokens = stack.lexer.peek
+         str += "stack #{i}: #{context(stack.lexer.position.char_position)}" +
+           "in state #{stack.state}\n" +
+           "the lexer returns tokens = #{tokens.inspect} (#{tokens.map{|t| t.token_type.to_s}.inspect})\n" +
+           "and the valid tokens = #{@parse_table.valid_tokens(stack.state).inspect}\n"
+       end
+     else
+       str = "String could not be parsed"
+     end
+     raise ParseException, str
+   end
+
+   def context(position)
+     line, startpos = get_line_with_position(@string_being_parsed, position)
+     indent = (" " * (position - startpos + 1))
+     "on line:\n '" + line.to_s + "'\n " +
+       indent + "^\n " +
+       indent + "|\n " +
+       indent + "--- Parse error!!\n"
+   end
+
+   def get_line_with_position(string, position)
+     startpos = position
+     len = string.length
+     while string[startpos,1] != "\n" and startpos > 1
+       startpos -= 1
+     end
+     endpos = position
+     while string[endpos,1] != "\n" and endpos < len
+       endpos += 1
+     end
+     return string[startpos+1...endpos], startpos+1
+   end
+
+   def actor(stack)
+     #puts "actor(#{stack.state}) @stacks_to_act_on = #{@stacks_to_act_on.map{|s| s.state}.inspect}, @active_stacks = #{@active_stacks.map{|s| s.state}.inspect}"
+     tokens = stack.lexer.peek
+     #print "tokens = #{tokens.inspect}, "
+     tokens.each do |token|
+       #print "state = #{stack.state.inspect}, "
+       actions = @parse_table.actions(stack.state, token.token_type)
+       next unless actions
+       #puts "lexer = #{stack.lexer.inspect} (#{stack.lexer.id}), token = #{token.inspect}, actions = #{actions.inspect}"
+       actions.each do |action|
+         case action[0]
+         when :SHIFT
+           @stacks_to_shift.push [stack, action[1], token]
+         when :REDUCE
+           do_reductions(stack, action[1], stack.lexer)
+         when :ACCEPT
+           @accepted_stacks.push stack
+         end
+       end
+     end
+   end
+
+   def do_reductions(stack, productionNumber, lexer, pathsIncludingLink = nil)
+     production = @parse_table.production(productionNumber)
+     #puts @parse_table.priorities.inspect
+     paths =
+       if @parse_table.priorities.in_some_conflict?(production)
+         # Only return valid paths, i.e. paths without conflicts. Prune the
+         # invalid paths.
+         stack.valid_paths_of_length_with_pruning(production.length,
+                                                  pathsIncludingLink) do |l,p|
+           not @parse_table.priorities.conflict?(l.production, p, production)
+         end
+       else
+         stack.paths_of_length(production.length, pathsIncludingLink)
+       end
+     paths.each do |path|
+       reducer(path.to, @parse_table.goto(path.to.state, productionNumber),
+               lexer, production, trees_on_path(path), productionNumber)
+     end
+   end
+
+   def reducer(stack, newState, newLexer, production, childTrees, productionNum)
+     # puts "#{@reducer_cnt+=1}: reducer(#{stack.state}, #{production.inspect})\n"
+     tree = production.create_tree(childTrees)
+
+     # ywesee
+     if(tree)
+       range = stack.pointer..newLexer.scanner.pointer
+       tree.raw_src = (@lexer.scanner.string[range] || '').strip
+     end
+
+     existing_stack = @active_stacks.detect do |as|
+       as.state==newState and as.lexer==newLexer
+     end
+     if existing_stack
+       # There is already a stack with the same state and lexer, so instead
+       # of creating a new stack we re-use the existing one.
+       if (existing_link = existing_stack.links_to_stack?(stack))
+         # There is already a link to the stack => Ambiguity unless existing
+         # tree same as new tree. The latter can happen when multiple token
+         # types that all match the current string are in the follow set of
+         # the same production.
+         if tree != existing_link.tree
+           handle_ambiguity(existing_stack, existing_link, tree, production)
+         end
+       else
+         new_link = existing_stack.add_link(stack, tree, production)
+         recheck_stacks(@active_stacks - @stacks_to_act_on, new_link)
+       end
+     else
+       new_stack = ParseStack.new(newState, newLexer)
+       new_stack.add_link(stack, tree, production)
+       @stacks_to_act_on.push new_stack
+       @active_stacks.push new_stack
+     end
+     # File.open("as#{@cnt}_#{@reducer_cnt}.graph", "w") {|f| f.write parsestacks_as_dot_digraph(@active_stacks)}
+   end
+
+   def recheck_stacks(stacks, link)
+     # Recheck stacks to see if new reductions are possible including the new
+     # link
+     stacks.each do |stack|
+       actions(stack).each do |action|
+         if action[0] == :REDUCE
+           do_reductions(stack, action[1], stack.lexer, link)
+         end
+       end
+     end
+   end
+
+   def actions(stack)
+     actions, state = [], stack.state
+     stack.lexer.peek.each do |token|
+       actions.concat @parse_table.actions(state, token.token_type)
+     end
+     actions
+   end
+
+   def handle_ambiguity(existingStack, existingLink, newTree, production)
+     # We can extend the parser here to return the full parse forest.
+     # For now we simplify things and raise an exception.
+     existing_tree = existingLink.tree
+     if existing_tree.kind_of?(AmbiguityNode)
+       existing_tree.add_ambigoustree(newTree)
+     else
+       existingLink.tree = AmbiguityNode.new(existing_tree, newTree)
+     end
+     #alternatives = [newTree.compact!]
+     #existingStack.links.each {|link| alternatives.push link.tree.compact!}
+     #raise AmbigousParseException.new(@string_being_parsed, *alternatives)
+   end
+
+   def write_graphs
+     #File.open("atree1.graph", "w") {|f| f.write syntaxtree_as_dot_digraph(newTree)}
+     #File.open("atree2.graph", "w") {|f| f.write syntaxtree_as_dot_digraph(existingStack.links[0].tree)}
+     #File.open("as_at_ambiguity.graph", "w") {|f| f.write parsestacks_as_dot_digraph(@active_stacks)}
+   end
+
+   def shifter
+     # Save current active_stacks if there is a parse error
+     @last_active_stacks = @active_stacks.clone
+     @active_stacks.clear
+     @stacks_to_shift.each do |stack, newstate, lexertoken|
+       tree = lexertoken.create_tree
+       existing_stack = @active_stacks.detect do |as|
+         as.state==newstate and as.lexer==lexertoken.lexer
+       end
+       if existing_stack
+         existing_stack.add_link(stack, tree)
+       else
+         new_stack = ParseStack.new(newstate, lexertoken.lexer)
+         new_stack.add_link(stack, tree)
+         @active_stacks.push new_stack
+       end
+     end
+   end
+
+   def trees_on_path(path)
+     path.links.reverse.map {|link| link.tree}
+   end
+ end
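
The parser is normally not instantiated by hand: to_src above exists to dump a generated parser back out as Ruby source, and the paired *.grammar / *.rb files under data/lib/rpdf2txt/data/ suggest that is how the bundled parsers were produced. At runtime the flow reduces to roughly the following sketch; parse_table and source_text are hypothetical placeholders.

  # Hypothetical driver; parse_table stands for a parse table produced by the
  # rockit grammar tooling (see rockit.rb and parse_table.rb in this gem).
  parser = GeneralizedLrParser.new(parse_table) # builds a ForkingRegexpLexer from the table's tokens
  begin
    tree = parser.parse(source_text)            # syntax tree of the single accepted parse
  rescue AmbigousParseException => e
    # more than one parse survived; the exception carries the competing trees
  rescue ParseException => e
    puts e.message                              # shows the offending line and the expected tokens
  end
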
272
+ end
273
+
274
+ class ParseStack
275
+ Link = Struct.new("Link", :stack, :tree, :production)
276
+ class Link
277
+ def inspect
278
+ "-#{tree.inspect}->#{stack.state.inspect}"
279
+ end
280
+ end
281
+
282
+ attr_reader :state, :lexer, :links
283
+ attr_reader :pointer # ywesee
284
+
285
+ def initialize(aState, aLexer)
286
+ @state, @lexer = aState, aLexer
287
+ @links = Array.new
288
+ end
289
+
290
+ # ywesee
291
+ def pointer=(value)
292
+ @pointer ||= value
293
+ end
294
+
295
+ def add_link(aParseStack, aTree, aProduction = nil)
296
+ aParseStack.pointer = @lexer.scanner.pointer # ywesee
297
+ @links.push(l = Link.new(aParseStack, aTree, aProduction))
298
+ l
299
+ end
300
+
301
+ def links_to_stack_in_state?(state)
302
+ @links.detect {|link| link.stack.state == state}
303
+ end
304
+
305
+ def links_to_stack?(stack)
306
+ @links.detect {|link| link.stack == stack}
307
+ end
308
+
309
+ def paths_of_length(length, aLink = nil)
310
+ return [] if length == 0
311
+ paths = Array.new
312
+ @links.each do |link|
313
+ child_paths = link.stack.paths_of_length(length-1)
314
+ if child_paths.length > 0
315
+ child_paths.each {|cpath| paths.push(StackPath.new(self, [link]).add_path(cpath))}
316
+ else
317
+ paths.push StackPath.new(self, [link])
318
+ end
319
+ end
320
+ delete_paths_without_link(paths, aLink)
321
+ end
322
+
323
+ def valid_paths_of_length_with_pruning(length, aLink = nil,
324
+ &validity_checker)
325
+ return [] if length == 0
326
+ paths, new_links = Array.new, Array.new
327
+ @links.each do |link|
328
+ if validity_checker.call(link, length-1)
329
+ if length == 1
330
+ new_links.push(link)
331
+ paths.push(StackPath.new(self, [link]))
332
+ else
333
+ child_paths =
334
+ link.stack.valid_paths_of_length_with_pruning(length-1,
335
+ &validity_checker)
336
+ if child_paths.length > 0
337
+ new_links.push(link)
338
+ child_paths.each do |cpath|
339
+ paths.push(StackPath.new(self, [link]).add_path(cpath))
340
+ end
341
+ end
342
+ end
343
+ end
344
+ end
345
+ @links = new_links
346
+ delete_paths_without_link(paths, aLink)
347
+ end
348
+
349
+ def inspect
350
+ "PSt(#{@state}, #{@links.inspect}, #{@lexer.inspect})"
351
+ end
352
+
353
+ private
354
+
355
+ def delete_paths_without_link(paths, aLink)
356
+ if aLink
357
+ return paths.find_all {|path| path.includes_link?(aLink)}
358
+ else
359
+ return paths
360
+ end
361
+ end
362
+ end
+
+ class StackPath
+   attr_reader :to, :from
+   attr_reader :links
+
+   def initialize(from, links = [])
+     @from, @links = from, Array.new
+     links.each {|link| self.add_link(link)}
+   end
+
+   def add_link(link)
+     @links.push link
+     @to = link.stack
+   end
+
+   def add_path(aStackPath)
+     @links.concat aStackPath.links
+     @to = @links.last.stack
+     self
+   end
+
+   def includes_link?(link)
+     @links.detect {|l| l==link}
+   end
+
+   def inspect
+     "#{from.state}(#{from.lexer.position.char_position}) " +
+       @links.map {|l| "-#{l.tree.inspect}-> #{l.stack.state} (#{l.stack.lexer.position.char_position}) "}.join
+   end
+ end
+
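ParseStack and StackPath together form the graph-structured stack of the GLR algorithm: each stack node records a state and a lexer, links carry the partial trees, and paths_of_length enumerates the link sequences a reduction can pop. A toy illustration (not from the gem; the trees are plain strings and a stub lexer stands in for ForkingRegexpLexer):

  # Build a tiny stack chain by hand: s0 <-"a"- s1 <-"b"- s2.
  # ParseStack#add_link reads lexer.scanner.pointer, so a minimal stub is enough.
  StubScanner = Struct.new(:pointer)
  StubLexer   = Struct.new(:scanner)
  lexer = StubLexer.new(StubScanner.new(0))

  s0 = ParseStack.new(0, lexer)
  s1 = ParseStack.new(1, lexer)
  s2 = ParseStack.new(2, lexer)
  s1.add_link(s0, "a")
  s2.add_link(s1, "b")

  path = s2.paths_of_length(2).first
  path.to.state               # => 0   (the stack a length-2 reduction pops back to)
  path.links.map {|l| l.tree} # => ["b", "a"]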