rpdf2txt 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,644 @@
1
+ require 'rpdf2txt-rockit/token'
2
+ require 'rpdf2txt-rockit/base_extensions'
3
+ require 'rpdf2txt-rockit/conflict_resolution'
4
+
5
+ require 'rpdf2txt-rockit/profiler'
6
+
7
+ # A Grammar has a name, a set of tokens, a start symbol and a set of
8
+ # productions. Grammars are modular and can be merged. To help in resolving
9
+ # conflicts when grammars are merged the symbols that are exported from
10
+ # a grammar can be specified. By default all the nonterminals (left hand sides
11
+ # of the productions) are exported.
12
+ #
13
+ # Productions map a NonTerminal to a sequence of Element's.
14
+ # Elements can be either symbols (NonTerminals or terminals represented by
15
+ # their string name or symbol) or OperatorElements.
16
+ #
17
+ # OperatorElements are one of:
18
+ # Plus - corresponds to the EBNF operator '+', ie. one or several
19
+ # Mult - corresponds to the EBNF operator '*', ie. zero or several
20
+ # Maybe - corresponds to the EBNF operator '?', ie. zero or one
21
+ # List(Elements, separator) - A list of Elements separated by separator
22
+ # Or - One of a sequence (at least 2) elements
23
+ #
24
+ # A grammar is in normal form when no OperatorElements are in its productions.
25
+ # Converting a grammar to normal form is called normalization. Converting
26
+ # a syntax tree back to the unnormalized form of its grammar is called
27
+ # denormalization. Normalization is part of this file while denormalization
28
+ # is in a file of its own.
29
+ #
30
+ # Productions include a syntax tree specification describing how to build a
31
+ # (sub)tree for the syntax tree of the production when it has been matched.
32
+ # The SyntaxTreeSpecification needs to be known at this level since it
33
+ # is affected by normalization.
34
+ #
35
# Common base class for the things that can appear in a production.
# It stores the sub-elements and an optional tree specification, and
# supplies the default (no-op) normalization.
class Element
  include SourceCodeDumpable
  attr_accessor :sub_elements, :tree_specification

  def initialize(subElements, treeSpecification = nil)
    @sub_elements = subElements
    @tree_specification = treeSpecification
  end

  # Normalize this element in the context of the given productions.
  #
  # Returns a two-element array: first the productions with this element
  # normalized, then any additional productions that had to be created.
  # The default implementation needs no normalization, so the productions
  # are handed back untouched together with an empty list of extras.
  def normalize(productions)
    [productions, []]
  end

  protected

  # For every production, build a copy in which this element is replaced
  # by +substitute+. When a block is given and the element really occurs
  # in the production, the block receives the copy and the element's index
  # and its return value is used instead of the copy. Each resulting
  # production is then normalized and everything is returned as one flat
  # array.
  def clone_and_substitute_productions(productions, substitute, &updater)
    substituted = productions.map do |original|
      copy = original.clone_and_substitute(self, substitute)
      position = original.elements.index(self)
      # Index 0 is truthy in Ruby, so only a missing element skips the hook.
      (updater && position) ? updater.call(copy, position) : copy
    end
    substituted.map { |production| production.normalize }.flatten
  end
end
67
+
68
# Convenience conversion so that plain Ruby objects can be used when
# writing productions:
#
#   String  -> string_token(anObject)
#   Regexp  -> regexp_token(anObject)
#   Symbol  -> NonTerminal named after the symbol
#
# NonTerminal, Token and OperatorElement instances are returned unchanged.
# Anything else raises an ArgumentError.
def make_element(anObject)
  return string_token(anObject) if anObject.kind_of?(String)
  return regexp_token(anObject) if anObject.kind_of?(Regexp)
  return NonTerminal.new(anObject.to_s) if anObject.kind_of?(Symbol)

  already_element = anObject.kind_of?(NonTerminal) ||
                    anObject.kind_of?(Token) ||
                    anObject.kind_of?(OperatorElement)
  return anObject if already_element

  raise ArgumentError, "cannot make an element from #{anObject.inspect}"
end
85
+
86
# Convert a single object or an array of objects, followed by the objects
# in +anArray+, into elements via make_element. The argument arrays are
# not modified; a new array is returned.
def make_elements(anArrayOrElement, anArray = [])
  leading = anArrayOrElement.kind_of?(Array) ? anArrayOrElement : [anArrayOrElement]
  (leading + anArray).map { |item| make_element(item) }
end
94
+
95
# Base class for the EBNF-style operators (plus, mult, maybe, list, or).
# The operands are converted with make_elements, so strings, regexps and
# symbols may be given directly.
class OperatorElement < Element
  attr_reader :sub_elements

  # +subElements+ may be a single object or an array; any further
  # arguments are appended as additional sub-elements.
  def initialize(subElements, *rest)
    @sub_elements = make_elements(subElements, rest)
  end

  # The part of type_to_src that precedes "Element".
  def name
    type_to_src.split("Element")[0]
  end

  def to_src(name = nil, nameHash = {})
    sub_elements_src = @sub_elements.to_src(nil, nameHash)
    assign_to(name, new_of_my_type(as_code(sub_elements_src)))
  end

  # Equal when the other object has exactly the same class and equal
  # sub-elements.
  def ==(other)
    return false unless other.class == self.class
    other.sub_elements == @sub_elements
  end

  protected

  # The inspected sub-elements joined by +separator+; wrapped in
  # parentheses when there is more than one sub-element.
  def inspect_sub_elements(separator = " ")
    joined = @sub_elements.map { |element| element.inspect }.join(separator)
    @sub_elements.length > 1 ? "(" + joined + ")" : joined
  end

  # Create a helper nonterminal whose name is +name+ followed by this
  # element's object_id. The +production+ argument is not used here.
  def temp_nonterminal(name, production)
    suffix = object_id.inspect
    NonTerminal.new("#{name}#{suffix}")
  end
end
127
+
128
# EBNF '+' operator: one or several occurrences of the sub-elements.
class PlusElement < OperatorElement
  def inspect
    inspect_sub_elements + "+"
  end

  # NT -> a b (se1 ... sen)+ c d
  #
  # is normalized to
  #
  # NT        -> a b NT-Plus-X c d
  # NT-Plus-X -> NT-Plus-X se1 ... sen
  #            | se1 ... sen
  #
  # Returns [rewritten productions, extra productions for NT-Plus-X].
  def normalize(productions)
    plus_nonterm = temp_nonterminal("Plus", productions.first)
    count = @sub_elements.length

    # Left-recursive alternative: NT-Plus-X -> NT-Plus-X se1 ... sen
    recursive_production =
      Production.new(plus_nonterm, [plus_nonterm] + @sub_elements,
                     ArrayNodeBuilder.new((1..count).to_a, 0))
    # Base alternative: NT-Plus-X -> se1 ... sen
    base_production =
      Production.new(plus_nonterm, @sub_elements,
                     ArrayNodeBuilder.new((0...count).to_a))

    rewritten = clone_and_substitute_productions(productions, [plus_nonterm])
    extras = recursive_production.normalize + base_production.normalize
    [rewritten, extras]
  end
end
153
+
154
# EBNF '*' operator: zero or several occurrences of the sub-elements.
class MultElement < OperatorElement
  def inspect
    inspect_sub_elements + "*"
  end

  # NT -> a b (se1 ... sen)* c d
  #
  # is normalized to
  #
  # NT        -> a b NT-Mult-X c d
  #            | a b c d
  # NT-Mult-X -> NT-Mult-X se1 ... sen
  #            | se1 ... sen
  #
  # Returns [rewritten productions, extra productions for NT-Mult-X].
  def normalize(productions)
    mult_nonterm = temp_nonterminal("Mult", productions.first)
    count = @sub_elements.length

    # Left-recursive alternative: NT-Mult-X -> NT-Mult-X se1 ... sen
    recursive_production =
      Production.new(mult_nonterm, [mult_nonterm] + @sub_elements,
                     ArrayNodeBuilder.new((1..count).to_a, 0))
    # Base alternative: NT-Mult-X -> se1 ... sen
    base_production =
      Production.new(mult_nonterm, @sub_elements,
                     ArrayNodeBuilder.new((0...count).to_a))

    # The "zero occurrences" variants: the element is dropped and the
    # tree builder is wrapped so that an empty ArrayNode is inserted at
    # the position where the element used to be.
    without_element =
      clone_and_substitute_productions(productions, []) do |production, position|
        builder = ArrayNodeBuilder.new([], nil, production.tree_builder, position)
        production.tree_builder = builder
        builder.shifting_insert = true
        production
      end

    with_element = clone_and_substitute_productions(productions, [mult_nonterm])
    extras = recursive_production.normalize + base_production.normalize
    [with_element + without_element, extras]
  end
end
187
+
188
# EBNF '?' operator: zero or one occurrence of the sub-elements.
class MaybeElement < OperatorElement
  def inspect
    inspect_sub_elements + "?"
  end

  # NT -> a b (se1 ... sen)? c d
  #
  # is normalized to
  #
  # NT -> a b se1 ... sen c d
  #     | a b c d
  #
  # No extra productions are needed, so the second returned array is empty.
  def normalize(productions)
    # Variant with the optional part present: the sub-elements are spliced
    # in and grouped by a GroupingSyntaxTreeBuilder spanning their indices.
    present =
      clone_and_substitute_productions(productions, @sub_elements) do |production, first|
        last = first + @sub_elements.length - 1
        production.tree_builder =
          GroupingSyntaxTreeBuilder.new(first, last, production.tree_builder)
        production
      end

    # Variant with the optional part absent: the corresponding child of
    # the tree builder is inactivated.
    absent =
      clone_and_substitute_productions(productions, []) do |production, position|
        production.tree_builder.inactivate_child(position)
        production
      end

    [present + absent, []]
  end
end
216
+
217
# A list of the sub-elements separated by a separator element
# (a "," string token unless another separator is given).
class ListElement < OperatorElement
  # +subElements+ is converted by OperatorElement#initialize and
  # +separatorElement+ by make_element.
  def initialize(subElements, separatorElement = ",")
    super(subElements)
    @separator = make_element(separatorElement)
  end

  def to_src(name = nil, nameHash = {})
    assign_to(name,
              new_of_my_type(as_code(@sub_elements.to_src(nil, nameHash)),
                             @separator))
  end

  # Two lists are equal only when class, sub-elements AND separator are
  # equal. The inherited comparison looks at class and sub-elements only,
  # which would make list(x, ",") and list(x, ";") indistinguishable.
  def ==(other)
    super && other.separator == @separator
  end

  # NT -> a b list(se1 ... sen, sep) c d
  #
  # is normalized to
  #
  # NT        -> a b se1 ... sen NT-List-X c d
  #            | a b se1 ... sen c d
  # NT-List-X -> NT-List-X sep se1 ... sen
  #            | sep se1 ... sen
  #
  # Returns [rewritten productions, extra productions for NT-List-X].
  def normalize(productions)
    # Named list_nonterm so the local does not shadow the
    # temp_nonterminal helper method.
    list_nonterm = temp_nonterminal("List", productions.first)
    num_sub_elements = @sub_elements.length

    # First item followed by the tail nonterminal (two or more items).
    ps1 =
      clone_and_substitute_productions(productions, @sub_elements +
                                       [list_nonterm]) do |prod, elemindex|
        endindex = elemindex + @sub_elements.length - 1
        prod.tree_builder =
          ArrayNodeBuilder.new((elemindex..endindex).to_a,
                               endindex + 1, prod.tree_builder,
                               elemindex, ((elemindex + 1)..(endindex + 1)).to_a)
        prod.tree_builder.append_element = false
        prod
      end

    # Only the first item (a list of exactly one item).
    ps2 =
      clone_and_substitute_productions(productions,
                                       @sub_elements) do |prod, elemindex|
        endindex = elemindex + @sub_elements.length - 1
        prod.tree_builder =
          ArrayNodeBuilder.new((elemindex..endindex).to_a,
                               nil, prod.tree_builder,
                               elemindex, ((elemindex + 1)..endindex).to_a)
        prod
      end

    # NT-List-X -> NT-List-X sep se1 ... sen
    temp_production1 =
      Production.new(list_nonterm,
                     [list_nonterm, @separator] + @sub_elements,
                     ArrayNodeBuilder.new((2...(2 + num_sub_elements)).to_a, 0))
    # NT-List-X -> sep se1 ... sen
    temp_production2 =
      Production.new(list_nonterm, [@separator] + @sub_elements,
                     ArrayNodeBuilder.new((1...(1 + num_sub_elements)).to_a))

    [ps1 + ps2,
     temp_production1.normalize + temp_production2.normalize]
  end

  def inspect
    "list(#{inspect_sub_elements}, #{@separator.inspect})"
  end

  protected

  # Exposed to other ListElement instances only, for use by #==.
  attr_reader :separator
end
275
+
276
# Alternation: exactly one of at least two sub-elements.
class OrElement < OperatorElement
  # Accepts the same arguments as OperatorElement#initialize and raises
  # an ArgumentError when fewer than two sub-elements result.
  def initialize(*args)
    super(*args)
    if @sub_elements.length < 2
      raise ArgumentError, "At least two sub-elements needed"
    end
  end

  # NT -> a b (se1 | ... | sen) c d
  #
  # is normalized to
  #
  # NT -> a b se1 c d
  #     | a b ... c d
  #     | a b sen c d
  #
  # No extra productions are needed, so the second returned array is empty.
  def normalize(productions)
    expanded = []
    @sub_elements.each do |alternative|
      expanded.concat(clone_and_substitute_productions(productions, [alternative]))
    end
    [expanded, []]
  end

  def inspect
    inspect_sub_elements(" | ")
  end
end
300
+
301
+ # Short hand funcs
302
# Lets the shorthand functions accept either a list of arguments or one
# array: when the argument list consists of exactly one array, that array
# is returned; otherwise the argument list itself is returned.
def as_array(subElements)
  single_array = subElements.length == 1 && subElements[0].kind_of?(Array)
  single_array ? subElements[0] : subElements
end
309
+
310
# plus(a, b) or plus([a, b]): one or several occurrences.
def plus(*subElements)
  PlusElement.new(as_array(subElements))
end

# mult(a, b) or mult([a, b]): zero or several occurrences.
def mult(*subElements)
  MultElement.new(as_array(subElements))
end

# maybe(a, b) or maybe([a, b]): zero or one occurrence.
def maybe(*subElements)
  MaybeElement.new(as_array(subElements))
end

# ore(a, b) or ore([a, b]): one of the given alternatives.
def ore(*subElements)
  OrElement.new(as_array(subElements))
end

# liste(elements, separator): elements separated by separator.
def liste(subElements, separator)
  ListElement.new(subElements, separator)
end
315
+
316
# A named grammar symbol. Symbols carry no sub-elements and no tree
# specification; they are identified by their name (stored as a String).
class GrammarSymbol < Element
  attr_reader :name

  def initialize(name)
    super(nil, nil) # Symbols have no sub-elements or tree-specification
    @name = name.to_s
  end

  # Equal to any other GrammarSymbol with the same name.
  #
  # Written with && rather than `res = a and b`: because `and` binds
  # looser than `=`, the old form only returned the right value by
  # accident and left a misleading partial result in `res`.
  def ==(other)
    other.kind_of?(GrammarSymbol) && name == other.name
  end

  # Hash on the name so that equal symbols land in the same hash bucket.
  def hash
    name.hash
  end

  # Hash-key equality. Must agree with #== : comparing hash values alone
  # would treat colliding names, and any foreign object that happens to
  # have the same hash (e.g. the name String itself), as the same key.
  def eql?(other)
    self == other
  end

  def inspect
    name
  end

  def to_src(assignToName = nil, nameHash = {})
    assign_to(assignToName, new_of_my_type(name))
  end
end
345
+
346
# A nonterminal grammar symbol. It adds nothing to GrammarSymbol; the
# class exists so that nonterminals can be told apart by type.
class NonTerminal < GrammarSymbol; end
348
+
349
# A production maps a nonterminal (left hand side) to a sequence of
# elements (right hand side) and carries the tree builder used to create
# the syntax tree node for a match.
#
# When creating productions, the nonterminal may be given as a String or
# a Symbol, and the elements may contain Symbols (nonterminals), Strings
# (string tokens) and Regexps; they are converted with make_element.
class Production
  include SourceCodeDumpable
  attr_accessor :elements     # Right hand side
  attr_accessor :nonterminal  # Left hand side
  attr_accessor :tree_builder

  # +treeBuilder+ may be nil (a default builder is created), :^ or "^",
  # or a ready-made tree builder object; see init_tree_builder.
  def initialize(nonterminal, elements, treeBuilder = nil)
    left_hand_side =
      nonterminal.class == String ? NonTerminal.new(nonterminal) : nonterminal
    @nonterminal = make_element(left_hand_side)
    @elements = make_elements(elements)
    init_tree_builder(treeBuilder)
  end

  # Disabled in the original source:
  #
  #   def clone
  #     Production.new(@nonterminal.clone, @elements.clone, @tree_builder.clone)
  #   end

  # Let every element of the right hand side normalize the productions
  # produced so far, collecting the extra productions they create.
  # Returns the resulting productions followed by the extras, with
  # duplicates removed by equality_uniq.
  def normalize
    pending = [self]
    extras = []
    @elements.each do |element|
      next unless element.kind_of?(Element)
      pending, added = element.normalize(pending)
      extras += added
    end
    (pending + extras).equality_uniq
  end

  # Number of elements on the right hand side.
  def length
    elements.length
  end

  # Equal when class, nonterminal, elements and tree builder are equal.
  def ==(other)
    other.class == self.class &&
      nonterminal == other.nonterminal &&
      elements == other.elements &&
      tree_builder == other.tree_builder
  end

  # Return a new production in which the first occurrence of +element+ is
  # replaced by +substitute+ (a single element or an array of elements)
  # and the tree builder is a copy. When +element+ does not occur, self
  # is returned.
  def clone_and_substitute(element, substitute)
    position = elements.index(element)
    replacement = substitute.kind_of?(Array) ? substitute : [substitute]
    return self unless position

    right_hand_side =
      elements[0...position] + replacement + elements[(position + 1)..-1]
    Production.new(nonterminal, right_hand_side, tree_builder.copy)
  end

  def inspect
    right_hand_side = @elements.map { |element| element.inspect }.join(" ")
    "#{nonterminal.inspect} -> #{right_hand_side}"
  end

  # Delegate tree creation to the tree builder.
  def create_tree(childrenValues)
    tree_builder.create_tree(childrenValues)
  end

  def to_src(name = nil, nameHash = {})
    left_src = nonterm_to_symbol(nonterminal)
    right_src = as_code(elements_to_src(nameHash))
    assign_to(name, new_of_my_type(left_src, right_src, tree_builder))
  end

  protected

  # Source text of the elements array. Tokens that have an entry in
  # +nameHash+ are emitted as that entry, nonterminals as symbols and
  # everything else through to_compact_src.
  def elements_to_src(nameHash)
    sources = elements.map do |element|
      if element.kind_of?(Token) && nameHash[element]
        nameHash[element]
      elsif element.kind_of?(NonTerminal)
        nonterm_to_symbol(element).to_src
      else
        element.to_compact_src(nil, nameHash)
      end
    end
    "[" + sources.join(", ") + "]"
  end

  # Turn a NonTerminal into a Ruby Symbol, or - when its name contains a
  # single quote - into code that interns the name string. Objects whose
  # class is not exactly NonTerminal are returned unchanged.
  def nonterm_to_symbol(o)
    return o unless o.class == NonTerminal
    return o.name.intern unless o.name.include?("'")

    as_code('"' + o.name + '".intern')
  end

  # Set @tree_builder:
  #   nil         -> stb named after the nonterminal
  #   :^ or "^"   -> stb named "^" with string tokens inactivated
  #   otherwise   -> the given builder; its node_name is set to the
  #                  nonterminal's name when it is "_", nil or ""
  def init_tree_builder(treeBuilder)
    @tree_builder =
      if treeBuilder == nil
        stb(nonterminal.name, assign_element_names)
      elsif [:^, "^"].include?(treeBuilder)
        stb("^", assign_element_names(true))
      else
        if ["_", nil, ""].include?(treeBuilder.node_name)
          treeBuilder.node_name = nonterminal.name
        end
        treeBuilder
      end
  end

  # Build the list of child names, one per element:
  #   StringToken                  -> "c<position>" (1-based), or "_" when
  #                                   string tokens are inactivated
  #   other Token                  -> its downcased name (or "c<position>"),
  #                                   numbered from the second occurrence;
  #                                   "_" when inactivated
  #   NonTerminal/OperatorElement  -> its downcased name, numbered from the
  #                                   second occurrence
  #   anything else                -> nil
  # When no NonTerminal or OperatorElement is present, the first name is
  # forced to "c1".
  def assign_element_names(inactivateStringTokens = false)
    any_active = false
    occurrences = Hash.new(0)
    names = []

    elements.each_with_index do |element, index|
      positional = "c#{index + 1}"
      names <<
        if element.kind_of?(StringToken)
          inactivateStringTokens ? "_" : positional
        elsif element.kind_of?(Token)
          token_name = element.name || positional
          occurrences[token_name] += 1
          seen = occurrences[token_name]
          token_name += seen.to_s if seen > 1
          inactivateStringTokens ? "_" : token_name.downcase
        elsif element.kind_of?(NonTerminal) || element.kind_of?(OperatorElement)
          any_active = true
          symbol_name = element.name.downcase
          occurrences[symbol_name] += 1
          seen = occurrences[symbol_name]
          symbol_name += seen.to_s if seen > 1
          symbol_name
        end
    end

    names[0] = "c1" unless any_active
    names
  end
end
482
+
483
# Shorthand for Production.new; +treeSpec+ is passed on as the
# production's tree builder argument.
def prod(nonterminal, elements, treeSpec = nil)
  production_arguments = [nonterminal, elements, treeSpec]
  Production.new(*production_arguments)
end
486
+
487
# A grammar: a name, a set of tokens, a start symbol, a list of
# productions, production priorities and a list of exported symbols.
class Grammar
  attr_reader :tokens, :productions, :exports, :start_symbol, :priorities
  attr_reader :eof_terminal, :original_start_symbol
  attr_accessor :name
  alias_method :terminals, :tokens

  # All arguments are optional. When +startSymbol+ is not given, the
  # nonterminal of the first production becomes the start symbol. When
  # +priorities+ is not given, a new ProductionPriorities is created.
  def initialize(name = nil, productions = [], tokens = [],
                 priorities = nil,
                 startSymbol = nil, exports = [])
    # NOTE! Beware that the EofToken below wont get a unique index number!!
    @eof_terminal = EofToken.new
    @name, @tokens, @alternatives = name, [@eof_terminal], Hash.new
    @priorities = priorities || ProductionPriorities.new
    @nonterminals = Array.new
    add_tokens(tokens)
    clear_productions
    add_productions(productions)
    init_start_symbol(startSymbol)
    @exports = exports
  end

  # The productions registered for +nonterminal+ (looked up by name), or
  # nil when there are none.
  def alternatives(nonterminal)
    @alternatives[nonterminal.name]
  end

  # Replace every production by its normalized form. Returns self.
  def normalize
    old_productions = @productions
    clear_productions
    old_productions.each do |production|
      add_productions(production.normalize)
    end
    self
  end

  # Add +token+ unless an equal token is already present.
  def add_token(token)
    @tokens.push token unless @tokens.include?(token)
  end

  # Replace the symbols of +production+ by the grammar's own equal
  # symbols, registering symbols not seen before.
  def add_unique_symbols(production)
    production.elements = production.elements.map { |e| unique_symbol(e) }
    production.nonterminal = unique_symbol(production.nonterminal)
  end

  # Return the already registered token or nonterminal equal to +symbol+.
  # When there is none, +symbol+ is registered (if it is a Token or a
  # NonTerminal) and returned itself.
  def unique_symbol(symbol)
    existing_symbol = (@tokens + @nonterminals).detect { |e| e == symbol }
    return existing_symbol if existing_symbol

    add_token(symbol) if symbol.kind_of?(Token)
    @nonterminals.push(symbol) if symbol.kind_of?(NonTerminal)
    symbol
  end

  # Add +production+ unless an equal production is already present. The
  # first production ever added also determines the start symbol when
  # none has been set yet.
  def add_production(production)
    unless @productions.include?(production)
      add_unique_symbols(production)
      @productions.push production
      nt = production.nonterminal
      (@alternatives[nt.name] ||= []).push(production)
    end
    @start_symbol = production.nonterminal unless @start_symbol
  end

  # Merge +otherGrammar+ INTO this grammar (the receiver is modified) and
  # return self.
  def +(otherGrammar)
    # NOTE: Maybe check for and/or handle naming conflicts?
    add_tokens(otherGrammar.tokens)
    add_productions(otherGrammar.productions)
    otherGrammar.exports.each { |e| @exports.push(e) }
    self
  end

  # The distinct left hand sides of the productions, optionally without
  # the start symbol.
  def nonterminals(includeStartSymbol = true)
    nts = (@productions.map { |p| p.nonterminal }).equality_uniq
    nts.delete(@start_symbol) unless includeStartSymbol
    nts
  end

  def augmented?
    @augmented ? true : false
  end

  def inspect
    str = "Grammar #{@name}\n"
    str += "Start symbol: #{@start_symbol.inspect}\n" if @start_symbol
    str += "Tokens:\n" + @tokens.map { |t| " " + t.inspect }.join("\n")
    str += "\nProductions:\n"
    @productions.each do |prod|
      str += " " + prod.inspect + "\n"
    end
    str
  end

  # Token type for the empty string; all instances compare equal.
  class EpsilonTokenType < Token
    def initialize
    end

    def ==(other)
      other.class == self.class
    end
  end
  EpsilonToken = EpsilonTokenType.new

  def Grammar.epsilon
    EpsilonToken
  end

  # Augment the grammar by adding new start symbol and production from it
  # to previous start symbol. The added production gets index 0.
  # Returns true (also when the grammar was already augmented).
  def augment
    return true if augmented?
    @original_start_symbol = self.start_symbol
    # Add prim's until unique nonterminal name
    new_name = @original_start_symbol.name + "'"
    while nonterminals.collect { |nt| nt.name }.include?(new_name)
      new_name += "'"
    end
    @start_symbol = NonTerminal.new(new_name)
    add_production(p = Production.new(@start_symbol, [@original_start_symbol]))
    # Make sure its on top
    @productions.delete p
    @productions.unshift p
    @augmented = true
  end

  # Undo #augment: restore the original start symbol and remove the
  # production that was added at index 0. Returns false.
  def unaugment
    return false unless augmented?
    @start_symbol = @original_start_symbol
    removed = @productions.shift
    # Also forget the production in the alternatives table. Otherwise a
    # later #augment (which reuses the same primed name) would register
    # the start production a second time under that name.
    if removed
      key = removed.nonterminal.name
      registered = @alternatives[key]
      if registered
        registered.delete_if { |production| production.equal?(removed) }
        @alternatives.delete(key) if registered.empty?
      end
    end
    @augmented = false
  end

  private

  # Use the explicit start symbol when given, otherwise the nonterminal
  # of the first production (if any).
  def init_start_symbol(startSymbol)
    if startSymbol
      @start_symbol = make_element(startSymbol)
    elsif @productions.length > 0
      @start_symbol = @productions.first.nonterminal
    end
  end

  def add_tokens(tokens)
    tokens.each { |token| add_token(token) }
  end

  def add_productions(productions)
    productions.each { |production| add_production(production) }
  end

  # Forget all productions and the alternatives table built from them.
  def clear_productions
    @productions = Array.new
    @alternatives = Hash.new
  end
end