rpdf2txt 0.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (127) hide show
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,644 @@
1
+ require 'rpdf2txt-rockit/token'
2
+ require 'rpdf2txt-rockit/base_extensions'
3
+ require 'rpdf2txt-rockit/conflict_resolution'
4
+
5
+ require 'rpdf2txt-rockit/profiler'
6
+
7
+ # A Grammar has a name, a set of tokens, a start symbol and a set of
8
+ # productions. Grammars are modular and can be merged. To help in resolving
9
+ # conflicts when grammars are merged the symbols that are exported from
10
+ # a grammar can be specified. By default all the nonterminals (left hand sides
11
+ # of the productions) are exported.
12
+ #
13
+ # Productions map a NonTerminal to a sequence of Element's.
14
+ # Elements can be either symbols (NonTerminals or terminals represented by
15
+ # their string name or symbol) or OperatorElements.
16
+ #
17
+ # OperatorElements are one of:
18
+ # Plus - corresponds to the EBNF operator '+', ie. one or several
19
+ # Mult - corresponds to the EBNF operator '*', ie. zero or several
20
+ # Maybe - corresponds to the EBNF operator '?', ie. zero or one
21
+ # List(Elements, separator) - A list of Elements separated by separator
22
+ # Or - One of a sequence (at least 2) elements
23
+ #
24
+ # A grammar is in normal form when no OperatorElements are in its productions.
25
+ # Converting a grammar to normal form is called normalization. Converting
26
+ # a syntax tree back to the unnormalized form of its grammar is called
27
+ # denormalization. Normalization is part of this file while denormalization
28
+ # is in a file of its own.
29
+ #
30
+ # Productions include a syntax tree specification describing how to build a
31
+ # (sub)tree for the syntax tree of the production when it has been matched.
32
+ # The SyntaxTreeSpecification needs to be known at this level since it
33
+ # is affected by normalization.
34
+ #
35
# Common base class for the things that can appear in a production:
# GrammarSymbol and OperatorElement both derive from it.
class Element
  include SourceCodeDumpable
  attr_accessor :sub_elements, :tree_specification

  # subElements:: the elements nested inside this element
  # treeSpecification:: optional tree specification for this element
  def initialize(subElements, treeSpecification = nil)
    @sub_elements = subElements
    @tree_specification = treeSpecification
  end

  # Normalize the element in the context of the given productions. The
  # context is needed to give information for naming of extra productions
  # that (may) need to be created. Returns a pair of arrays: first the
  # normalization of the element in the existing productions, then the
  # additional productions needed.
  def normalize(productions)
    # By default nothing has to be rewritten and nothing extra is added.
    return productions, []
  end

  protected

  # Builds, for every production, a copy in which this element is replaced
  # by +substitute+. When a block is given it is called with the copy and
  # the position this element had in the source production (only for
  # productions that actually contain the element) and its result is used
  # instead of the copy. All resulting productions are normalized and
  # returned as one flat array.
  def clone_and_substitute_productions(productions, substitute, &updater)
    rewritten = productions.map do |production|
      replaced = production.clone_and_substitute(self, substitute)
      position = production.elements.index(self)
      replaced = updater.call(replaced, position) if updater && position
      replaced
    end
    rewritten.map { |production| production.normalize }.flatten
  end
end
67
+
68
+ # To make it simple to enter productions some standard Ruby objects can be
69
+ # used in place of the "proper" objects. This function converts to the right
70
+ # type or raises an exception if object cannot be made into an element.
71
# Converts a convenience object into a proper grammar element:
#   String  -> string token (via string_token)
#   Regexp  -> regexp token (via regexp_token)
#   Symbol  -> NonTerminal named after the symbol
# NonTerminals, Tokens and OperatorElements are returned untouched.
# Raises ArgumentError for anything else.
def make_element(anObject)
  case anObject
  when String
    string_token(anObject)
  when Regexp
    regexp_token(anObject)
  when Symbol
    NonTerminal.new(anObject.to_s)
  when NonTerminal, Token, OperatorElement
    anObject
  else
    raise ArgumentError, "cannot make an element from #{anObject.inspect}"
  end
end
85
+
86
# Converts one object or an array of objects, followed by the objects in
# +anArray+, into an array of elements using make_element. The arguments
# themselves are not modified.
def make_elements(anArrayOrElement, anArray = [])
  leading = anArrayOrElement.kind_of?(Array) ? anArrayOrElement : [anArrayOrElement]
  (leading + anArray).map { |candidate| make_element(candidate) }
end
94
+
95
# Base class of the EBNF-style operators (Plus, Mult, Maybe, List, Or).
# The operands are kept in +sub_elements+.
class OperatorElement < Element
  attr_reader :sub_elements

  # Accepts either an array of operands or the operands one by one; each is
  # converted with make_elements.
  def initialize(subElements, *rest)
    @sub_elements = make_elements(subElements, rest)
  end

  # The part of type_to_src that precedes "Element".
  def name
    type_to_src.split("Element").first
  end

  def to_src(name = nil, nameHash = {})
    operands_src = @sub_elements.to_src(nil, nameHash)
    assign_to(name, new_of_my_type(as_code(operands_src)))
  end

  # Equal when the other object has exactly the same class and equal
  # sub-elements.
  def ==(other)
    other.class == self.class && other.sub_elements == @sub_elements
  end

  protected

  # Inspect strings of the sub-elements joined by +separator+; wrapped in
  # parentheses when there is more than one sub-element.
  def inspect_sub_elements(separator = " ")
    joined = @sub_elements.map { |element| element.inspect }.join(separator)
    return joined unless @sub_elements.length > 1
    "(" + joined + ")"
  end

  # A nonterminal named from +name+ and this element's object_id.
  # The +production+ argument is currently unused.
  def temp_nonterminal(name, production)
    NonTerminal.new("#{name}#{object_id.inspect}")
  end
end
127
+
128
# EBNF '+': one or several occurrences of the sub-elements.
class PlusElement < OperatorElement
  def inspect
    inspect_sub_elements + "+"
  end

  # NT -> a b (se1 ... sen)+ c d
  #
  # is normalized to
  #
  # NT -> a b NT-Plus-X c d
  # NT-Plus-X -> NT-Plus-X se1 ... sen
  #            | se1 ... sen
  def normalize(productions)
    repeat_nt = temp_nonterminal("Plus", productions.first)
    count = @sub_elements.length
    # NT-Plus-X -> NT-Plus-X se1 ... sen
    recursive_case =
      Production.new(repeat_nt, [repeat_nt] + @sub_elements,
                     ArrayNodeBuilder.new((1..count).to_a, 0))
    # NT-Plus-X -> se1 ... sen
    base_case =
      Production.new(repeat_nt, @sub_elements,
                     ArrayNodeBuilder.new((0...count).to_a))
    substituted = clone_and_substitute_productions(productions, [repeat_nt])
    [substituted, recursive_case.normalize + base_case.normalize]
  end
end
153
+
154
# EBNF '*': zero or several occurrences of the sub-elements.
class MultElement < OperatorElement
  def inspect
    inspect_sub_elements + "*"
  end

  # NT -> a b (se1 ... sen)* c d
  #
  # is normalized to
  #
  # NT -> a b NT-Mult-X c d
  #     | a b c d
  # NT-Mult-X -> NT-Mult-X se1 ... sen
  #            | se1 ... sen
  def normalize(productions)
    repeat_nt = temp_nonterminal("Mult", productions.first)
    count = @sub_elements.length
    # NT-Mult-X -> NT-Mult-X se1 ... sen
    recursive_case =
      Production.new(repeat_nt, [repeat_nt] + @sub_elements,
                     ArrayNodeBuilder.new((1..count).to_a, 0))
    # NT-Mult-X -> se1 ... sen
    base_case =
      Production.new(repeat_nt, @sub_elements,
                     ArrayNodeBuilder.new((0...count).to_a))
    # Variant of the productions with the element left out altogether.
    without_element =
      clone_and_substitute_productions(productions, []) do |production, position|
        # Will insert empty ArrayNode
        builder = ArrayNodeBuilder.new([], nil, production.tree_builder, position)
        production.tree_builder = builder
        builder.shifting_insert = true
        production
      end
    with_element = clone_and_substitute_productions(productions, [repeat_nt])
    [with_element + without_element,
     recursive_case.normalize + base_case.normalize]
  end
end
187
+
188
# EBNF '?': zero or one occurrence of the sub-elements.
class MaybeElement < OperatorElement
  def inspect
    inspect_sub_elements + "?"
  end

  # NT -> a b (se1 ... sen)? c d
  #
  # is normalized to
  #
  # NT -> a b se1 ... sen c d
  #     | a b c d
  def normalize(productions)
    # Variant in which the optional elements are present; they are grouped
    # by a GroupingSyntaxTreeBuilder spanning their positions.
    present =
      clone_and_substitute_productions(productions, @sub_elements) do |production, first_index|
        last_index = first_index + @sub_elements.length - 1
        production.tree_builder =
          GroupingSyntaxTreeBuilder.new(first_index, last_index,
                                        production.tree_builder)
        production
      end
    # Variant in which the optional elements are absent.
    absent =
      clone_and_substitute_productions(productions, []) do |production, position|
        production.tree_builder.inactivate_child(position)
        production
      end
    [present + absent, []]
  end
end
216
+
217
# A list of the sub-elements separated by a separator element
# (a "," string token unless another separator is given).
class ListElement < OperatorElement
  # The element that separates consecutive list items.
  attr_reader :separator

  def initialize(subElements, separatorElement = ",")
    super(subElements)
    @separator = make_element(separatorElement)
  end

  def to_src(name = nil, nameHash = {})
    assign_to(name,
              new_of_my_type(as_code(@sub_elements.to_src(nil, nameHash)),
                             @separator))
  end

  # Two lists are equal only if they also agree on the separator. The
  # inherited comparison looks at class and sub-elements only, which made
  # list(x, ",") equal to list(x, ";"); element lookup by equality
  # (elements.index) and production de-duplication could then confuse the
  # two.
  def ==(other)
    super && @separator == other.separator
  end

  # NT -> a b list(se1 ... sen, sep) c d
  #
  # is normalized to
  #
  # NT -> a b se1 ... sen NT-List-X c d
  #     | a b se1 ... sen c d
  # NT-List-X -> NT-List-X sep se1 ... sen
  #            | sep se1 ... sen
  def normalize(productions)
    list_nonterm = temp_nonterminal("List", productions.first)
    num_sub_elements = @sub_elements.length
    # First item followed by the NT-List-X tail.
    ps1 =
      clone_and_substitute_productions(productions, @sub_elements +
                                       [list_nonterm]) do |prod, elemindex|
        endindex = elemindex + @sub_elements.length - 1
        prod.tree_builder =
          ArrayNodeBuilder.new((elemindex..endindex).to_a,
                               endindex + 1, prod.tree_builder,
                               elemindex, ((elemindex + 1)..(endindex + 1)).to_a)
        prod.tree_builder.append_element = false
        prod
      end
    # A single item without any tail.
    ps2 =
      clone_and_substitute_productions(productions,
                                       @sub_elements) do |prod, elemindex|
        endindex = elemindex + @sub_elements.length - 1
        prod.tree_builder =
          ArrayNodeBuilder.new((elemindex..endindex).to_a,
                               nil, prod.tree_builder,
                               elemindex, ((elemindex + 1)..endindex).to_a)
        prod
      end
    # NT-List-X -> NT-List-X sep se1 ... sen
    temp_production1 =
      Production.new(list_nonterm,
                     [list_nonterm, @separator] + @sub_elements,
                     ArrayNodeBuilder.new((2...(2 + num_sub_elements)).to_a, 0))
    # NT-List-X -> sep se1 ... sen
    temp_production2 =
      Production.new(list_nonterm, [@separator] + @sub_elements,
                     ArrayNodeBuilder.new((1...(1 + num_sub_elements)).to_a))
    [ps1 + ps2,
     temp_production1.normalize + temp_production2.normalize]
  end

  def inspect
    "list(#{inspect_sub_elements}, #{@separator.inspect})"
  end
end
275
+
276
# One of a sequence of (at least two) alternative elements.
class OrElement < OperatorElement
  # Raises ArgumentError unless at least two sub-elements result.
  def initialize(*args)
    super(*args)
    if @sub_elements.length < 2
      raise ArgumentError, "At least two sub-elements needed"
    end
  end

  # NT -> a b (se1 | ... | sen) c d
  #
  # is normalized to
  #
  # NT -> a b se1 c d
  #     | a b ... c d
  #     | a b sen c d
  def normalize(productions)
    per_alternative = @sub_elements.map do |alternative|
      clone_and_substitute_productions(productions, [alternative])
    end
    [per_alternative.flatten, []]
  end

  def inspect
    inspect_sub_elements(" | ")
  end
end
300
+
301
+ # Short hand funcs
302
# Lets the short hand functions accept either several arguments or one
# array: when the only argument is itself an array that array is returned,
# otherwise the argument list is returned as it is.
def as_array(subElements)
  single_array = subElements.length == 1 && subElements[0].kind_of?(Array)
  single_array ? subElements[0] : subElements
end
309
+
310
# plus(a, b) or plus([a, b]) builds a PlusElement of the given elements.
def plus(*subElements)
  PlusElement.new(as_array(subElements))
end

# mult(a, b) or mult([a, b]) builds a MultElement of the given elements.
def mult(*subElements)
  MultElement.new(as_array(subElements))
end

# maybe(a, b) or maybe([a, b]) builds a MaybeElement of the given elements.
def maybe(*subElements)
  MaybeElement.new(as_array(subElements))
end

# ore(a, b) or ore([a, b]) builds an OrElement of the given alternatives.
def ore(*subElements)
  OrElement.new(as_array(subElements))
end

# liste(elements, separator) builds a ListElement.
def liste(subElements, separator)
  ListElement.new(subElements, separator)
end
315
+
316
# A named symbol of the grammar. Symbols are elements without sub-elements
# and without a tree specification.
class GrammarSymbol < Element
  attr_reader :name

  # The name is stored as a String (name.to_s).
  def initialize(name)
    super(nil, nil) # Symbols have no sub-elements or tree-specification
    @name = name.to_s
  end

  # Equal to any other GrammarSymbol carrying the same name.
  #
  # The previous form, "res = a and b", parsed as "(res = a) and b": the
  # local only ever held the kind_of? result and was never used. The value
  # returned is the same as before.
  def ==(other)
    other.kind_of?(GrammarSymbol) && name == other.name
  end

  # Symbols hash like their name.
  def hash
    name.hash
  end

  # NOTE(review): compares hash values only, so any object whose hash equals
  # name.hash counts as eql? (not only GrammarSymbols). Left unchanged
  # because other code may rely on it -- confirm before tightening.
  def eql?(other)
    other.hash == hash
  end

  def inspect
    name
  end

  def to_src(assignToName = nil, nameHash = {})
    assign_to(assignToName, new_of_my_type(name))
  end
end
345
+
346
# A GrammarSymbol used as a nonterminal; it adds no behaviour of its own.
class NonTerminal < GrammarSymbol; end
348
+
349
+ # When initializing productions, nonterminals can be specified with symbols both
350
+ # as nonterminal or as elements. String tokens can be specified as strings.
351
+ #
352
# A production maps a nonterminal (left hand side) to a sequence of elements
# (right hand side) and carries the tree builder used to create the syntax
# tree node for a match.
class Production
  include SourceCodeDumpable
  attr_accessor :elements # Right hand side
  attr_accessor :nonterminal # Left hand side
  attr_accessor :tree_builder

  # nonterminal:: a NonTerminal, or a String/Symbol naming one
  # elements:: one object or an array of objects accepted by make_element
  # treeBuilder:: nil (a default builder is created), :^ or "^", or a
  #               builder object (see init_tree_builder)
  def initialize(nonterminal, elements, treeBuilder = nil)
    # A String here names a nonterminal; make_element alone would turn it
    # into a string token.
    nonterminal = NonTerminal.new(nonterminal) if nonterminal.class == String
    @nonterminal = make_element(nonterminal)
    @elements = make_elements(elements)
    init_tree_builder(treeBuilder)
  end

  # Disabled in the original source:
  #
  #   def clone
  #     Production.new(@nonterminal.clone, @elements.clone, @tree_builder.clone)
  #   end

  # Returns the productions (without duplicates) that result from letting
  # every element of the right hand side normalize the productions produced
  # so far, followed by the extra productions the elements asked for.
  def normalize
    maybe_normalize, extra_productions = [self], []
    @elements.each do |element|
      if element.kind_of?(Element)
        maybe_normalize, new_extra = element.normalize(maybe_normalize)
        extra_productions += new_extra
      end
    end
    (maybe_normalize + extra_productions).equality_uniq
  end

  # Number of elements on the right hand side.
  def length
    elements.length
  end

  # Productions are equal when class, nonterminal, elements and tree builder
  # are all equal.
  #
  # The previous form assigned only the class comparison to a never-used
  # local ("res = a and b" parses as "(res = a) and b"); the value returned
  # is the same as before.
  def ==(other)
    other.class == self.class &&
      nonterminal == other.nonterminal &&
      elements == other.elements &&
      tree_builder == other.tree_builder
  end

  # Returns a new production in which the first element equal to +element+
  # is replaced by +substitute+ (a single element or an array of elements),
  # with a copy of the tree builder. Returns self when the element does not
  # occur.
  def clone_and_substitute(element, substitute)
    index = elements.index element
    substitute = [substitute] unless substitute.kind_of?(Array)
    if index
      Production.new(nonterminal,
                     elements[0...index] + substitute +
                     elements[(index + 1)..-1], tree_builder.copy)
    else
      self
    end
  end

  def inspect
    elements_inspect = @elements.map { |e| e.inspect }.join(" ")
    "#{nonterminal.inspect} -> #{elements_inspect}"
  end

  # Delegates to the tree builder.
  def create_tree(childrenValues)
    tree_builder.create_tree(childrenValues)
  end

  def to_src(name = nil, nameHash = {})
    assign_to(name,
              new_of_my_type(nonterm_to_symbol(nonterminal),
                             as_code(elements_to_src(nameHash)), tree_builder))
  end

  protected

  # Source code for the array of elements. Tokens that have an entry in
  # +nameHash+ are emitted as that entry, nonterminals as symbols.
  def elements_to_src(nameHash)
    sources = elements.map do |element|
      if element.kind_of?(Token) and nameHash[element]
        nameHash[element]
      elsif element.kind_of?(NonTerminal)
        nonterm_to_symbol(element).to_src
      else
        element.to_compact_src(nil, nameHash)
      end
    end
    "[" + sources.join(", ") + "]"
  end

  # Symbol for a NonTerminal (anything else is returned untouched). Names
  # containing a quote character are emitted as code that interns the name.
  def nonterm_to_symbol(o)
    return o unless o.class == NonTerminal
    if o.name.include? "'"
      as_code('"' + o.name + '".intern')
    else
      o.name.intern
    end
  end

  # nil      -> builder named after the nonterminal
  # :^ / "^" -> builder named "^" with string tokens inactivated
  # else     -> the given builder; a node name of "_", nil or "" is replaced
  #             by the nonterminal's name (this modifies the given builder)
  def init_tree_builder(treeBuilder)
    if treeBuilder == nil
      @tree_builder = stb(nonterminal.name, assign_element_names)
    elsif [:^, "^"].include?(treeBuilder)
      @tree_builder = stb("^", assign_element_names(true))
    else
      if ["_", nil, ""].include?(treeBuilder.node_name)
        treeBuilder.node_name = nonterminal.name
      end
      @tree_builder = treeBuilder
    end
  end

  # Child names for the default tree builder, one per element:
  #   string tokens  -> "c<position>" (or "_" when inactivated)
  #   other tokens   -> downcased token name, or "c<position>" if unnamed
  #                     (or "_" when inactivated)
  #   nonterminals / operators -> downcased name
  # A repeated name gets its occurrence count appended. If no nonterminal or
  # operator element is present the first name is forced to "c1".
  def assign_element_names(inactivateStringTokens = false)
    at_least_one_active, count = false, 0
    name_count = Hash.new(0)
    element_names = elements.map do |element|
      count += 1
      if element.kind_of?(StringToken)
        inactivateStringTokens ? "_" : "c#{count}"
      elsif element.kind_of?(Token)
        name = element.name || "c#{count}"
        name_count[name] += 1
        name += "#{name_count[name]}" if name_count[name] > 1
        inactivateStringTokens ? "_" : name.downcase
      elsif element.kind_of?(NonTerminal) or element.kind_of?(OperatorElement)
        at_least_one_active = true
        n = element.name.downcase
        name_count[n] += 1
        n += "#{name_count[n]}" if name_count[n] > 1
        n
      end
    end
    element_names[0] = "c1" unless at_least_one_active
    element_names
  end
end
482
+
483
# Short hand for Production.new with the same arguments.
def prod(nonterminal, elements, treeSpec = nil)
  Production.new(nonterminal, elements, treeSpec)
end
486
+
487
# A Grammar has a name, a set of tokens, a start symbol and a set of
# productions (plus production priorities and a list of exported symbols).
class Grammar
  attr_reader :tokens, :productions, :exports, :start_symbol, :priorities
  attr_reader :eof_terminal, :original_start_symbol
  attr_accessor :name
  alias_method :terminals, :tokens

  # Without an explicit +startSymbol+ the nonterminal of the first production
  # becomes the start symbol.
  def initialize(name = nil, productions = [], tokens = [],
                 priorities = nil,
                 startSymbol = nil, exports = [])
    # NOTE! Beware that the EofToken below won't get a unique index number!!
    @eof_terminal = EofToken.new
    @name, @tokens, @alternatives = name, [@eof_terminal], Hash.new
    @priorities = priorities || ProductionPriorities.new
    @nonterminals = Array.new
    add_tokens(tokens)
    clear_productions
    add_productions(productions)
    init_start_symbol(startSymbol)
    @exports = exports
  end

  # The productions recorded for +nonterminal+ (looked up by its name), or
  # nil if there are none.
  def alternatives(nonterminal)
    @alternatives[nonterminal.name]
  end

  # Replaces the productions with their normalized forms. Returns self.
  def normalize
    old_productions = @productions
    clear_productions
    old_productions.each do |production|
      add_productions(production.normalize)
    end
    self
  end

  # Adds the token unless an equal token is already known.
  def add_token(token)
    @tokens.push token unless @tokens.include?(token)
  end

  # Makes the production refer to the grammar's own instance of every symbol
  # it uses (see unique_symbol).
  def add_unique_symbols(production)
    production.elements = production.elements.map { |e| unique_symbol(e) }
    production.nonterminal = unique_symbol(production.nonterminal)
  end

  # Returns the already known token or nonterminal equal to +symbol+. An
  # unknown Token or NonTerminal is registered, and +symbol+ itself is
  # returned.
  def unique_symbol(symbol)
    existing_symbol = (@tokens + @nonterminals).detect { |e| e == symbol }
    if not existing_symbol
      add_token(symbol) if symbol.kind_of?(Token)
      @nonterminals.push(symbol) if symbol.kind_of?(NonTerminal)
      return symbol
    else
      return existing_symbol
    end
  end

  # Adds the production unless an equal one is present. The nonterminal of
  # the first production seen becomes the start symbol if none is set yet.
  def add_production(production)
    unless @productions.include?(production)
      add_unique_symbols(production)
      @productions.push production
      nt = production.nonterminal
      unless @alternatives[nt.name]
        @alternatives[nt.name] = [production]
      else
        @alternatives[nt.name].push production
      end
    end
    @start_symbol = production.nonterminal unless @start_symbol
  end

  # Merges the tokens, productions and exports of +otherGrammar+ into this
  # grammar. Modifies and returns self.
  def +(otherGrammar)
    # NOTE: Maybe check for and/or handle naming conflicts?
    add_tokens(otherGrammar.tokens)
    add_productions(otherGrammar.productions)
    otherGrammar.exports.each { |e| @exports.push(e) }
    self
  end

  # The left hand sides of the productions without duplicates, optionally
  # without the start symbol.
  def nonterminals(includeStartSymbol = true)
    nts = (@productions.map { |p| p.nonterminal }).equality_uniq
    nts.delete(@start_symbol) unless includeStartSymbol
    nts
  end

  def augmented?
    @augmented ? true : false
  end

  def inspect
    str = "Grammar #{@name}\n"
    str += "Start symbol: #{@start_symbol.inspect}\n" if @start_symbol
    str += "Tokens:\n" + @tokens.map { |t| " " + t.inspect }.join("\n")
    str += "\nProductions:\n"
    @productions.each do |prod|
      str += " " + prod.inspect + "\n"
    end
    str
  end

  # Token type of the epsilon token; all instances compare equal.
  class EpsilonTokenType < Token
    def initialize
    end

    def ==(other)
      other.class == self.class
    end
  end
  EpsilonToken = EpsilonTokenType.new

  def Grammar.epsilon
    EpsilonToken
  end

  # Augment the grammar by adding new start symbol and production from it
  # to previous start symbol. The added production gets index 0.
  # Returns true, also when the grammar was augmented already.
  def augment
    return true if augmented?
    @original_start_symbol = self.start_symbol
    # Add primes until the nonterminal name is unique. The set of names does
    # not change while searching, so it is collected only once.
    taken_names = nonterminals.collect { |nt| nt.name }
    new_name = @original_start_symbol.name + "'"
    new_name += "'" while taken_names.include?(new_name)
    @start_symbol = NonTerminal.new(new_name)
    augmented_production =
      Production.new(@start_symbol, [@original_start_symbol])
    add_production(augmented_production)
    # Make sure it's on top
    @productions.delete augmented_production
    @productions.unshift augmented_production
    @augmented = true
  end

  # Undo augment: restore the original start symbol and drop the production
  # at index 0. Returns false.
  def unaugment
    return false unless augmented?
    augmented_name = @start_symbol.name
    @start_symbol = @original_start_symbol
    augmented_production = @productions.shift
    # Also forget the production in the alternatives table; otherwise a later
    # augment would record the same production a second time.
    remaining = @alternatives[augmented_name]
    if remaining
      remaining.delete(augmented_production)
      @alternatives.delete(augmented_name) if remaining.empty?
    end
    @augmented = false
  end

  private

  def init_start_symbol(startSymbol)
    if startSymbol
      @start_symbol = make_element(startSymbol)
    elsif @productions.length > 0
      @start_symbol = @productions.first.nonterminal
    end
  end

  def add_tokens(tokens)
    tokens.each { |token| add_token(token) }
  end

  def add_productions(productions)
    productions.each { |production| add_production(production) }
  end

  # Forget all productions and the alternatives table built from them.
  def clear_productions
    @productions = Array.new
    @alternatives = Hash.new
  end
end