rpdf2txt 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/LICENCE +515 -0
- data/Manifest.txt +126 -0
- data/README.txt +30 -0
- data/Rakefile +24 -0
- data/bin/rpdf2txt +58 -0
- data/config.save +12 -0
- data/install.rb +1098 -0
- data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
- data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
- data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
- data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
- data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
- data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
- data/lib/rpdf2txt-rockit/grammar.rb +644 -0
- data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
- data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
- data/lib/rpdf2txt-rockit/indexable.rb +53 -0
- data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
- data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
- data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
- data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
- data/lib/rpdf2txt-rockit/profiler.rb +168 -0
- data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
- data/lib/rpdf2txt-rockit/rockit.rb +76 -0
- data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
- data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
- data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
- data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
- data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
- data/lib/rpdf2txt-rockit/token.rb +364 -0
- data/lib/rpdf2txt-rockit/version.rb +3 -0
- data/lib/rpdf2txt/attributesparser.rb +42 -0
- data/lib/rpdf2txt/cmapparser.rb +65 -0
- data/lib/rpdf2txt/data/_cmap.grammar +11 -0
- data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/cmap.grammar +11 -0
- data/lib/rpdf2txt/data/cmap.rb +37 -0
- data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/cmap_range.rb +43 -0
- data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
- data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
- data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
- data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
- data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
- data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
- data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
- data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
- data/lib/rpdf2txt/data/pdftext.grammar +102 -0
- data/lib/rpdf2txt/data/pdftext.rb +146 -0
- data/lib/rpdf2txt/default_handler.rb +352 -0
- data/lib/rpdf2txt/lzw.rb +69 -0
- data/lib/rpdf2txt/object.rb +1114 -0
- data/lib/rpdf2txt/parser.rb +169 -0
- data/lib/rpdf2txt/symbol.rb +408 -0
- data/lib/rpdf2txt/text.rb +182 -0
- data/lib/rpdf2txt/text_state.rb +434 -0
- data/lib/rpdf2txt/textparser.rb +42 -0
- data/test/data/3392_obj +0 -0
- data/test/data/397_decrypted +15 -0
- data/test/data/450_decrypted +153 -0
- data/test/data/450_obj +0 -0
- data/test/data/452_decrypted +125 -0
- data/test/data/454_decrypted +108 -0
- data/test/data/456_decrypted +106 -0
- data/test/data/458_decrypted +111 -0
- data/test/data/458_obj +0 -0
- data/test/data/460_decrypted +118 -0
- data/test/data/460_obj +0 -0
- data/test/data/463_decrypted +117 -0
- data/test/data/465_decrypted +107 -0
- data/test/data/465_obj +0 -0
- data/test/data/90_obj +0 -0
- data/test/data/90_obj_comp +1 -0
- data/test/data/decrypted +0 -0
- data/test/data/encrypt_obj +0 -0
- data/test/data/encrypt_string +0 -0
- data/test/data/encrypt_string_128bit +0 -0
- data/test/data/encrypted_object_stream.pdf +0 -0
- data/test/data/firststream +1 -0
- data/test/data/index.pdfobj +0 -0
- data/test/data/index_2bit.pdfobj +0 -0
- data/test/data/index_masked.pdfobj +0 -0
- data/test/data/indexed.pdfobj +0 -0
- data/test/data/indexed_2bit.pdfobj +0 -0
- data/test/data/indexed_masked.pdfobj +0 -0
- data/test/data/inline.png +0 -0
- data/test/data/logo.png +0 -0
- data/test/data/lzw.pdfobj +0 -0
- data/test/data/lzw_index.pdfobj +0 -0
- data/test/data/page_tree.pdf +148 -0
- data/test/data/pdf_20.png +0 -0
- data/test/data/pdf_21.png +0 -0
- data/test/data/pdf_22.png +0 -0
- data/test/data/pdf_50.png +0 -0
- data/test/data/png.pdfobj +0 -0
- data/test/data/space_bug_stream.txt +119 -0
- data/test/data/stream.txt +292 -0
- data/test/data/stream_kerning_bug.txt +13 -0
- data/test/data/stream_kerning_bug2.txt +6 -0
- data/test/data/test.pdf +0 -0
- data/test/data/test.txt +8 -0
- data/test/data/test_text.txt +42 -0
- data/test/data/working_obj +0 -0
- data/test/data/working_obj2 +0 -0
- data/test/mock.rb +149 -0
- data/test/suite.rb +30 -0
- data/test/test_pdf_object.rb +1802 -0
- data/test/test_pdf_parser.rb +1340 -0
- data/test/test_pdf_text.rb +789 -0
- data/test/test_space_bug_05_2004.rb +87 -0
- data/test/test_stream.rb +194 -0
- data/test/test_text_state.rb +315 -0
- data/usage-en.txt +112 -0
- data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
- data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
- metadata +220 -0
|
@@ -0,0 +1,644 @@
|
|
|
1
|
+
require 'rpdf2txt-rockit/token'
|
|
2
|
+
require 'rpdf2txt-rockit/base_extensions'
|
|
3
|
+
require 'rpdf2txt-rockit/conflict_resolution'
|
|
4
|
+
|
|
5
|
+
require 'rpdf2txt-rockit/profiler'
|
|
6
|
+
|
|
7
|
+
# A Grammar has a name, a set of tokens, a start symbol and a set of
|
|
8
|
+
# productions. Grammars are modular and can be merged. To help in resolving
|
|
9
|
+
# conflicts when grammars are merged the symbols that are exported from
|
|
10
|
+
# a grammar can be specified. By default all the nonterminals (left hand sides
|
|
11
|
+
# of the productions) are exported.
|
|
12
|
+
#
|
|
13
|
+
# Productions map a NonTerminal to a sequence of Element's.
|
|
14
|
+
# Elements can be either symbols (NonTerminals or terminals represented by
|
|
15
|
+
# their string name or symbol) or OperatorElements.
|
|
16
|
+
#
|
|
17
|
+
# OperatorElements are one of:
|
|
18
|
+
# Plus - corresponds to the EBNF operator '+', ie. one or several
|
|
19
|
+
# Mult - corresponds to the EBNF operator '*', ie. zero or several
|
|
20
|
+
# Maybe - corresponds to the EBNF operator '?', ie. zero or one
|
|
21
|
+
# List(Elements, separator) - A list of Elements separated by separator
|
|
22
|
+
# Or - One of a sequence (at least 2) elements
|
|
23
|
+
#
|
|
24
|
+
# A grammar is in normal form when no OperatorElements are in its productions.
|
|
25
|
+
# Converting a grammar to normal form is called normalization. Converting
|
|
26
|
+
# a syntax tree back to the unnormalized form of its grammar is called
|
|
27
|
+
# denormalization. Normalization is part of this file while denormalization
|
|
28
|
+
# is in a file of its own.
|
|
29
|
+
#
|
|
30
|
+
# Productions include a syntax tree specification describing how to build a
|
|
31
|
+
# (sub)tree for the syntax tree of the production when it has been matched.
|
|
32
|
+
# The SyntaxTreeSpecification needs to be known at this level since it
|
|
33
|
+
# is affected by normalization.
|
|
34
|
+
#
|
|
35
|
+
# Common base class of everything that can appear in a production:
# grammar symbols and operator elements.
class Element
  include SourceCodeDumpable
  attr_accessor :sub_elements, :tree_specification

  def initialize(subElements, treeSpecification = nil)
    @sub_elements = subElements
    @tree_specification = treeSpecification
  end

  # Normalize this element in the context of the given productions.
  # Returns a two-element array: the (possibly rewritten) productions and
  # the additional productions that had to be created. The default is that
  # nothing needs rewriting and no extra productions are added.
  def normalize(productions)
    [productions, []]
  end

  protected

  # Substitute +substitute+ for this element in every production, then
  # normalize the results and return them as one flat array.
  #
  # When a block is given it is called with the substituted production and
  # the position this element had in the original production, but only for
  # productions that actually contained the element. The block's return
  # value replaces the substituted production.
  def clone_and_substitute_productions(productions, substitute,
                                       &updater)
    substituted = productions.map do |original|
      replaced = original.clone_and_substitute(self, substitute)
      position = original.elements.index(self)
      if updater and position
        updater.call(replaced, position)
      else
        replaced
      end
    end
    substituted.map { |production| production.normalize }.flatten
  end
end
|
|
67
|
+
|
|
68
|
+
# To keep production definitions short, some standard Ruby objects may be
# used in place of the "proper" element objects:
#
#   String  -> string_token(string)
#   Regexp  -> regexp_token(regexp)
#   Symbol  -> NonTerminal named after the symbol
#
# NonTerminals, Tokens and OperatorElements are returned unchanged. Anything
# else raises an ArgumentError.
def make_element(anObject)
  case anObject
  when String
    string_token(anObject)
  when Regexp
    regexp_token(anObject)
  when Symbol
    NonTerminal.new(anObject.to_s)
  when NonTerminal, Token, OperatorElement
    anObject
  else
    raise ArgumentError, "cannot make an element from #{anObject.inspect}"
  end
end
|
|
85
|
+
|
|
86
|
+
# Convert a single object or an array of objects, followed by the objects
# in +anArray+, into an array of elements via make_element. Neither input
# array is modified.
def make_elements(anArrayOrElement, anArray = [])
  leading = anArrayOrElement.kind_of?(Array) ? anArrayOrElement : [anArrayOrElement]
  (leading + anArray).map { |item| make_element(item) }
end
|
|
94
|
+
|
|
95
|
+
# Base class of the EBNF-style operators (Plus, Mult, Maybe, List, Or).
# The sub-elements are converted with make_elements on construction.
class OperatorElement < Element
  attr_reader :sub_elements

  # +subElements+ may be a single object or an array; any further arguments
  # are appended as additional sub-elements.
  def initialize(subElements, *rest)
    @sub_elements = make_elements(subElements, rest)
  end

  # Source-level type name with everything from "Element" on cut off.
  def name
    type_to_src.split("Element").first
  end

  def to_src(name = nil, nameHash = {})
    sub_elements_src = @sub_elements.to_src(nil, nameHash)
    assign_to(name, new_of_my_type(as_code(sub_elements_src)))
  end

  # Equal when both are of exactly the same class and have equal
  # sub-elements.
  def ==(other)
    other.class == self.class && other.sub_elements == @sub_elements
  end

  protected

  # Inspect strings of the sub-elements joined by +separator+; wrapped in
  # parentheses when there is more than one sub-element.
  def inspect_sub_elements(separator = " ")
    joined = @sub_elements.map { |element| element.inspect }.join(separator)
    @sub_elements.length > 1 ? "(" + joined + ")" : joined
  end

  # Nonterminal for helper productions, named +name+ followed by this
  # element's object id. The +production+ argument is currently unused.
  def temp_nonterminal(name, production)
    NonTerminal.new("#{name}#{object_id}")
  end
end
|
|
127
|
+
|
|
128
|
+
# EBNF '+': one or several occurrences of the sub-elements.
class PlusElement < OperatorElement
  def inspect
    inspect_sub_elements + "+"
  end

  #  NT -> a b (se1 ... sen)+ c d
  #
  # is normalized to
  #
  #  NT        -> a b NT-Plus-X c d
  #  NT-Plus-X -> NT-Plus-X se1 ... sen
  #            |  se1 ... sen
  #
  # Returns [rewritten productions, helper productions].
  def normalize(productions)
    repetition = temp_nonterminal("Plus", productions.first)
    count = @sub_elements.length
    # NT-Plus-X -> NT-Plus-X se1 ... sen
    recursive_case = Production.new(
      repetition, [repetition] + @sub_elements,
      ArrayNodeBuilder.new((1..count).to_a, 0)
    )
    # NT-Plus-X -> se1 ... sen
    base_case = Production.new(
      repetition, @sub_elements,
      ArrayNodeBuilder.new((0...count).to_a)
    )
    rewritten = clone_and_substitute_productions(productions, [repetition])
    [rewritten, recursive_case.normalize + base_case.normalize]
  end
end
|
|
153
|
+
|
|
154
|
+
# EBNF '*': zero or several occurrences of the sub-elements.
class MultElement < OperatorElement
  def inspect
    inspect_sub_elements + "*"
  end

  #  NT -> a b (se1 ... sen)* c d
  #
  # is normalized to
  #
  #  NT        -> a b NT-Mult-X c d
  #            |  a b c d
  #  NT-Mult-X -> NT-Mult-X se1 ... sen
  #            |  se1 ... sen
  #
  # Returns [rewritten productions, helper productions].
  def normalize(productions)
    repetition = temp_nonterminal("Mult", productions.first)
    count = @sub_elements.length
    # NT-Mult-X -> NT-Mult-X se1 ... sen
    recursive_case = Production.new(
      repetition, [repetition] + @sub_elements,
      ArrayNodeBuilder.new((1..count).to_a, 0)
    )
    # NT-Mult-X -> se1 ... sen
    base_case = Production.new(
      repetition, @sub_elements,
      ArrayNodeBuilder.new((0...count).to_a)
    )
    # Variant with the element left out entirely.
    omitted = clone_and_substitute_productions(productions, []) do |production, position|
      # Will insert empty ArrayNode
      builder = ArrayNodeBuilder.new([], nil, production.tree_builder, position)
      production.tree_builder = builder
      production.tree_builder.shifting_insert = true
      production
    end
    included = clone_and_substitute_productions(productions, [repetition])
    [included + omitted, recursive_case.normalize + base_case.normalize]
  end
end
|
|
187
|
+
|
|
188
|
+
# EBNF '?': zero or one occurrence of the sub-elements.
class MaybeElement < OperatorElement
  def inspect
    inspect_sub_elements + "?"
  end

  #  NT -> a b (se1 ... sen)? c d
  #
  # is normalized to
  #
  #  NT -> a b se1 ... sen c d
  #     |  a b c d
  #
  # No helper productions are needed, so the second returned array is empty.
  def normalize(productions)
    present = clone_and_substitute_productions(productions, @sub_elements) do |production, first|
      last = first + @sub_elements.length - 1
      production.tree_builder =
        GroupingSyntaxTreeBuilder.new(first, last, production.tree_builder)
      production
    end
    absent = clone_and_substitute_productions(productions, []) do |production, position|
      production.tree_builder.inactivate_child(position)
      production
    end
    [present + absent, []]
  end
end
|
|
216
|
+
|
|
217
|
+
# list(se1 ... sen, sep): the sub-elements repeated one or more times with a
# separator element between repetitions. The separator defaults to ",".
class ListElement < OperatorElement
  # The separator element; readable so that == can compare it.
  attr_reader :separator

  def initialize(subElements, separatorElement = ",")
    super(subElements)
    @separator = make_element(separatorElement)
  end

  def to_src(name = nil, nameHash = {})
    assign_to(name,
              new_of_my_type(as_code(@sub_elements.to_src(nil, nameHash)),
                             @separator))
  end

  # Lists are equal only when class, sub-elements AND separator agree.
  # The comparison inherited from OperatorElement looks at class and
  # sub-elements only, which made lists differing solely in their separator
  # compare equal (and so be treated as duplicates).
  def ==(other)
    super && other.separator == @separator
  end

  #  NT -> a b list(se1 ... sen, sep) c d
  #
  # is normalized to
  #
  #  NT        -> a b se1 ... sen NT-List-X c d
  #            |  a b se1 ... sen c d
  #  NT-List-X -> NT-List-X sep se1 ... sen
  #            |  sep se1 ... sen
  #
  # Returns [rewritten productions, helper productions].
  def normalize(productions)
    # Named differently from the temp_nonterminal method to avoid shadowing.
    list_nonterm = temp_nonterminal("List", productions.first)
    num_sub_elements = @sub_elements.length
    # NT -> a b se1 ... sen NT-List-X c d
    with_tail =
      clone_and_substitute_productions(productions, @sub_elements +
                                       [list_nonterm]) do |prod, elemindex|
        endindex = elemindex + @sub_elements.length - 1
        prod.tree_builder =
          ArrayNodeBuilder.new((elemindex..endindex).to_a,
                               endindex + 1, prod.tree_builder,
                               elemindex, ((elemindex + 1)..(endindex + 1)).to_a)
        prod.tree_builder.append_element = false
        prod
      end
    # NT -> a b se1 ... sen c d
    without_tail =
      clone_and_substitute_productions(productions,
                                       @sub_elements) do |prod, elemindex|
        endindex = elemindex + @sub_elements.length - 1
        prod.tree_builder =
          ArrayNodeBuilder.new((elemindex..endindex).to_a,
                               nil, prod.tree_builder,
                               elemindex, ((elemindex + 1)..endindex).to_a)
        prod
      end
    # NT-List-X -> NT-List-X sep se1 ... sen
    recursive_case =
      Production.new(list_nonterm,
                     [list_nonterm, @separator] + @sub_elements,
                     ArrayNodeBuilder.new((2...(2 + num_sub_elements)).to_a, 0))
    # NT-List-X -> sep se1 ... sen
    base_case =
      Production.new(list_nonterm, [@separator] + @sub_elements,
                     ArrayNodeBuilder.new((1...(1 + num_sub_elements)).to_a))
    [with_tail + without_tail,
     recursive_case.normalize + base_case.normalize]
  end

  def inspect
    "list(#{inspect_sub_elements}, #{@separator.inspect})"
  end
end
|
|
275
|
+
|
|
276
|
+
# Or: exactly one of a sequence of (at least two) alternative elements.
class OrElement < OperatorElement
  # Raises ArgumentError when fewer than two sub-elements are given.
  def initialize(*args)
    super(*args)
    if @sub_elements.length <= 1
      raise ArgumentError, "At least two sub-elements needed"
    end
  end

  #  NT -> a b (se1 | ... | sen) c d
  #
  # is normalized to
  #
  #  NT -> a b se1 c d
  #     |  a b ... c d
  #     |  a b sen c d
  #
  # No helper productions are needed, so the second returned array is empty.
  def normalize(productions)
    per_alternative = @sub_elements.map do |alternative|
      clone_and_substitute_productions(productions, [alternative])
    end
    [per_alternative.flatten, []]
  end

  def inspect
    inspect_sub_elements(" | ")
  end
end
|
|
300
|
+
|
|
301
|
+
# Short hand funcs
|
|
302
|
+
# Unwrap a splatted argument list: when the only argument is itself an
# array, return that array; otherwise return the argument list unchanged.
def as_array(subElements)
  lone_array = subElements.length == 1 && subElements[0].kind_of?(Array)
  lone_array ? subElements[0] : subElements
end
|
|
309
|
+
|
|
310
|
+
# plus(a, b) or plus([a, b]): one or several occurrences.
def plus(*subElements)
  PlusElement.new(as_array(subElements))
end

# mult(a, b) or mult([a, b]): zero or several occurrences.
def mult(*subElements)
  MultElement.new(as_array(subElements))
end

# maybe(a, b) or maybe([a, b]): zero or one occurrence.
def maybe(*subElements)
  MaybeElement.new(as_array(subElements))
end

# ore(a, b) or ore([a, b]): one of the given alternatives.
def ore(*subElements)
  OrElement.new(as_array(subElements))
end

# liste(elements, separator): list of elements separated by separator.
def liste(subElements, separator)
  ListElement.new(subElements, separator)
end
|
|
315
|
+
|
|
316
|
+
# A named grammar symbol. Symbols are elements without sub-elements or a
# tree specification.
class GrammarSymbol < Element
  attr_reader :name

  def initialize(name)
    super(nil, nil) # Symbols have no sub-elements or tree-specification
    @name = name.to_s
  end

  # Equal to any other GrammarSymbol with the same name.
  def ==(other)
    other.kind_of?(GrammarSymbol) && name == other.name
  end

  # Hash of the name, so equally named symbols land in the same hash bucket.
  def hash
    name.hash
  end

  # NOTE(review): compares hash values only, so any object whose hash
  # equals the name's hash is considered eql? -- confirm whether callers
  # depend on this before tightening it.
  def eql?(other)
    other.hash == hash
  end

  def inspect
    name
  end

  def to_src(assignToName = nil, nameHash = {})
    assign_to(assignToName, new_of_my_type(name))
  end
end

# A symbol on the left hand side of productions.
class NonTerminal < GrammarSymbol
end
|
|
348
|
+
|
|
349
|
+
# A production maps a nonterminal (left hand side) to a sequence of elements
# (right hand side) and carries the tree builder used to create the syntax
# tree node for a match.
#
# When initializing productions, nonterminals can be given as Ruby symbols
# (both as the left hand side and as elements) or, for the left hand side,
# as strings. String tokens among the elements can be given as strings.
#
class Production
  include SourceCodeDumpable
  attr_accessor :elements     # Right hand side
  attr_accessor :nonterminal  # Left hand side
  attr_accessor :tree_builder

  # +treeBuilder+ may be nil (default builder named after the nonterminal),
  # :^ or "^", or a builder object (see init_tree_builder).
  def initialize(nonterminal, elements, treeBuilder = nil)
    # A plain String would otherwise be turned into a string token.
    nonterminal = NonTerminal.new(nonterminal) if nonterminal.instance_of?(String)
    @nonterminal = make_element(nonterminal)
    @elements = make_elements(elements)
    init_tree_builder(treeBuilder)
  end

  # (A custom #clone building a new Production from clones of the
  # nonterminal, elements and tree builder was disabled in the original.)

  # Normalize every element in turn. Each element may rewrite the set of
  # productions produced so far and contribute helper productions. Returns
  # all resulting productions with duplicates (by ==) removed.
  def normalize
    pending = [self]
    helpers = []
    @elements.each do |element|
      next unless element.kind_of?(Element)
      pending, added = element.normalize(pending)
      helpers += added
    end
    (pending + helpers).equality_uniq
  end

  def length
    elements.length
  end

  # Equal when class, nonterminal, elements and tree builder are all equal.
  def ==(other)
    other.class == self.class &&
      nonterminal == other.nonterminal &&
      elements == other.elements &&
      tree_builder == other.tree_builder
  end

  # Return a new production in which the first occurrence of +element+ is
  # replaced by +substitute+ (a single element or an array of elements).
  # Returns self when the element does not occur.
  def clone_and_substitute(element, substitute)
    position = elements.index(element)
    replacement = substitute.kind_of?(Array) ? substitute : [substitute]
    return self unless position
    Production.new(nonterminal,
                   elements[0...position] + replacement +
                   elements[(position + 1)..-1], tree_builder.copy)
  end

  def inspect
    right_hand_side = @elements.map { |element| element.inspect }.join(" ")
    "#{nonterminal.inspect} -> #{right_hand_side}"
  end

  def create_tree(childrenValues)
    tree_builder.create_tree(childrenValues)
  end

  def to_src(name = nil, nameHash = {})
    left_hand_side = nonterm_to_symbol(nonterminal)
    right_hand_side = as_code(elements_to_src(nameHash))
    assign_to(name,
              new_of_my_type(left_hand_side, right_hand_side, tree_builder))
  end

  protected

  # Source code for the elements array. Tokens with an entry in +nameHash+
  # are emitted as that entry, nonterminals as symbols, and everything else
  # via to_compact_src.
  def elements_to_src(nameHash)
    sources = elements.map do |element|
      if element.kind_of?(Token) && nameHash[element]
        nameHash[element]
      elsif element.kind_of?(NonTerminal)
        nonterm_to_symbol(element).to_src
      else
        element.to_compact_src(nil, nameHash)
      end
    end
    "[" + sources.join(", ") + "]"
  end

  # Turn a NonTerminal (exactly that class) into a Ruby symbol; names that
  # contain a "'" are emitted as code that interns a string instead.
  # Anything else is returned unchanged.
  def nonterm_to_symbol(o)
    return o unless o.class == NonTerminal
    return as_code('"' + o.name + '".intern') if o.name.include?("'")
    o.name.intern
  end

  # Set up @tree_builder:
  #   nil        -> stb named after the nonterminal
  #   :^ or "^"  -> stb named "^", with token names inactivated
  #   otherwise  -> the given builder; its node name is set to the
  #                 nonterminal's name when it is "_", nil or ""
  def init_tree_builder(treeBuilder)
    @tree_builder =
      if treeBuilder == nil
        stb(nonterminal.name, assign_element_names)
      elsif [:^, "^"].include?(treeBuilder)
        stb("^", assign_element_names(true))
      else
        if ["_", nil, ""].include?(treeBuilder.node_name)
          treeBuilder.node_name = nonterminal.name
        end
        treeBuilder
      end
  end

  # Build one name per element. String tokens are named "c<position>" and
  # other tokens by their (downcased) name; both become "_" when
  # +inactivateStringTokens+ is set. Nonterminals and operator elements get
  # their downcased name. Repeated names get a running number appended.
  # When there is no nonterminal or operator element at all, the first name
  # is forced to "c1".
  def assign_element_names(inactivateStringTokens = false)
    any_active = false
    position = 0
    seen = Hash.new(0)
    # Count the label and append the count from the second use onwards.
    numbered = lambda do |label|
      seen[label] += 1
      seen[label] > 1 ? label + "#{seen[label]}" : label
    end
    names = elements.map do |element|
      position += 1
      case element
      when StringToken
        inactivateStringTokens ? "_" : "c#{position}"
      when Token
        label = numbered.call(element.name || "c#{position}")
        inactivateStringTokens ? "_" : label.downcase
      when NonTerminal, OperatorElement
        any_active = true
        numbered.call(element.name.downcase)
      end
    end
    names[0] = "c1" unless any_active
    names
  end
end
|
|
482
|
+
|
|
483
|
+
# Shorthand for Production.new(nonterminal, elements, treeSpec).
def prod(nonterminal, elements, treeSpec = nil)
  Production.new(nonterminal, elements, treeSpec)
end
|
|
486
|
+
|
|
487
|
+
# A grammar: a name, a set of tokens, a start symbol, a set of productions,
# production priorities and a list of exported symbols.
class Grammar
  attr_reader :tokens, :productions, :exports, :start_symbol, :priorities
  attr_reader :eof_terminal, :original_start_symbol
  attr_accessor :name
  alias_method :terminals, :tokens

  # When no start symbol is given, the nonterminal of the first production
  # added becomes the start symbol.
  def initialize(name = nil, productions = [], tokens = [],
                 priorities = nil,
                 startSymbol = nil, exports = [])
    # NOTE! Beware that the EofToken below wont get a unique index number!!
    @eof_terminal = EofToken.new
    @name, @tokens, @alternatives = name, [@eof_terminal], Hash.new
    @priorities = priorities || ProductionPriorities.new
    @nonterminals = Array.new
    add_tokens(tokens)
    clear_productions
    add_productions(productions)
    init_start_symbol(startSymbol)
    @exports = exports
  end

  # The productions that have +nonterminal+ as their left hand side, or nil
  # when there are none.
  def alternatives(nonterminal)
    @alternatives[nonterminal.name]
  end

  # Replace all productions by their normalized form. Returns self.
  def normalize
    previous = @productions
    clear_productions
    previous.each { |production| add_productions(production.normalize) }
    self
  end

  def add_token(token)
    @tokens.push token unless @tokens.include?(token)
  end

  # Make the production refer to the symbol objects already registered in
  # this grammar (see unique_symbol).
  def add_unique_symbols(production)
    production.elements = production.elements.map { |e| unique_symbol(e) }
    production.nonterminal = unique_symbol(production.nonterminal)
  end

  # Return the already registered token or nonterminal that is == to
  # +symbol+. When there is none, register +symbol+ (tokens and
  # nonterminals only) and return it.
  def unique_symbol(symbol)
    # Tokens are searched before nonterminals.
    existing = @tokens.detect { |e| e == symbol } ||
               @nonterminals.detect { |e| e == symbol }
    return existing if existing
    add_token(symbol) if symbol.kind_of?(Token)
    @nonterminals.push(symbol) if symbol.kind_of?(NonTerminal)
    symbol
  end

  # Add the production unless an equal one is already present. The first
  # production ever added also defines the start symbol if none is set.
  def add_production(production)
    unless @productions.include?(production)
      add_unique_symbols(production)
      @productions.push production
      (@alternatives[production.nonterminal.name] ||= []).push production
    end
    @start_symbol = production.nonterminal unless @start_symbol
  end

  # Merge +otherGrammar+ into this grammar. NOTE: this modifies the
  # receiver (and its exports array) and returns self; it does not create a
  # new grammar.
  def +(otherGrammar)
    # NOTE: Maybe check for and/or handle naming conflicts?
    add_tokens(otherGrammar.tokens)
    add_productions(otherGrammar.productions)
    otherGrammar.exports.each { |e| @exports.push(e) }
    self
  end

  # The distinct left hand sides of the productions, optionally without the
  # start symbol.
  def nonterminals(includeStartSymbol = true)
    nts = (@productions.map { |p| p.nonterminal }).equality_uniq
    nts.delete(@start_symbol) unless includeStartSymbol
    nts
  end

  def augmented?
    @augmented ? true : false
  end

  def inspect
    str = "Grammar #{@name}\n"
    str << "Start symbol: #{@start_symbol.inspect}\n" if @start_symbol
    str << "Tokens:\n" << @tokens.map { |t| " " + t.inspect }.join("\n")
    str << "\nProductions:\n"
    @productions.each { |prod| str << " " << prod.inspect << "\n" }
    str
  end

  # Token type whose instances are all equal to each other.
  class EpsilonTokenType < Token
    def initialize
    end

    def ==(other)
      other.class == self.class
    end
  end
  EpsilonToken = EpsilonTokenType.new

  def Grammar.epsilon
    EpsilonToken
  end

  # Augment the grammar by adding new start symbol and production from it
  # to previous start symbol. The added production gets index 0.
  # Returns true.
  def augment
    return true if augmented?
    @original_start_symbol = self.start_symbol
    # Add primes until the nonterminal name is unique. The set of existing
    # names does not change inside the loop, so it is computed once.
    taken_names = nonterminals.collect { |nt| nt.name }
    new_name = @original_start_symbol.name + "'"
    new_name += "'" while taken_names.include?(new_name)
    @start_symbol = NonTerminal.new(new_name)
    augmented_production = Production.new(@start_symbol, [@original_start_symbol])
    add_production(augmented_production)
    # Make sure its on top
    @productions.delete augmented_production
    @productions.unshift augmented_production
    @augmented = true
  end

  # Undo augment: drop the augmenting production and restore the original
  # start symbol. Returns false.
  def unaugment
    return false unless augmented?
    removed = @productions.shift
    # Also forget the augmenting production in the alternatives table;
    # otherwise a stale entry stays behind and a later augment would add a
    # duplicate alternative for the augmented start symbol.
    augmented_name = @start_symbol.name
    remaining = @alternatives[augmented_name]
    if remaining
      remaining.delete(removed)
      @alternatives.delete(augmented_name) if remaining.empty?
    end
    @start_symbol = @original_start_symbol
    @augmented = false
  end

  private

  # An explicitly given start symbol wins; otherwise fall back to the
  # nonterminal of the first production, if any.
  def init_start_symbol(startSymbol)
    if startSymbol
      @start_symbol = make_element(startSymbol)
    elsif @productions.length > 0
      @start_symbol = @productions.first.nonterminal
    end
  end

  def add_tokens(tokens)
    tokens.each { |token| add_token(token) }
  end

  def add_productions(productions)
    productions.each { |production| add_production(production) }
  end

  def clear_productions
    @productions = Array.new
    @alternatives = Hash.new
  end
end
|