asmodis-rlsm 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENCE +674 -0
- data/README +35 -0
- data/lib/dfa.rb +686 -0
- data/lib/mgen.rb +130 -0
- data/lib/monkey_patching.rb +109 -0
- data/lib/monoid.rb +533 -0
- data/lib/rlsm.rb +22 -0
- data/lib/rlsm_regexp.rb +584 -0
- data/spec/dfa_spec.rb +99 -0
- data/spec/monoid_spec.rb +270 -0
- data/spec/regexp_spec.rb +25 -0
- metadata +64 -0
data/lib/rlsm.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# Copyright 2008 Gunther Diemant
|
2
|
+
#
|
3
|
+
# This file is part of the RLSM module.
|
4
|
+
#
|
5
|
+
# RLSM is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# RLSM is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with RLSM. If not, see <http://www.gnu.org/licenses/>.
|
17
|
+
|
18
|
+
|
19
|
+
require File.join(File.dirname(__FILE__), 'monoid')
|
20
|
+
require File.join(File.dirname(__FILE__), 'mgen')
|
21
|
+
require File.join(File.dirname(__FILE__), 'dfa')
|
22
|
+
require File.join(File.dirname(__FILE__), 'rlsm_regexp')
|
data/lib/rlsm_regexp.rb
ADDED
@@ -0,0 +1,584 @@
|
|
1
|
+
# Copyright 2008 Gunther Diemant
|
2
|
+
#
|
3
|
+
# This file is part of the RLSM module.
|
4
|
+
#
|
5
|
+
# RLSM is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# RLSM is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with RLSM. If not, see <http://www.gnu.org/licenses/>.
|
17
|
+
|
18
|
+
|
19
|
+
require File.join(File.dirname(__FILE__), 'monkey_patching')
|
20
|
+
|
21
|
+
module RLSM
|
22
|
+
class RegExp
|
23
|
+
#Creates a new RegExp from a string description. Metacharacters are
#  & * | ( )
#Here & is the empty word and an empty string represents the empty set.
#
#The description is checked with _well_formed? first, which raises an
#exception for unbalanced parentheses or forbidden character sequences.
def initialize(str = "")
  #Is the argument a well formed RegExp?
  _well_formed?(str)

  #More than one & or * in a row is useless
  re = str.squeeze('&*')

  #* on a & is &
  re = re.gsub('&*', '&')

  #Collapsing '&*' can put two & next to each other again
  #(e.g. '&*&*' becomes '&&'). PrimExp only recognizes the exact
  #string '&' as the empty word, so squeeze the runs of & once more.
  re = re.squeeze('&')

  @re = NodeFactory.new_node(nil, re)
end
|
38
|
+
|
39
|
+
#--
|
40
|
+
#Operations of a regexp
|
41
|
+
#++
|
42
|
+
|
43
|
+
#Kleene star of the regexp. Alters the regexp in place and returns self.
def star!
  #For empty set and empty word a star changes nothing.
  #A double star is also useless.
  #Return self also in this case, like in the changing case below,
  #so that the result can always be used as a regexp.
  return self if empty? or lambda? or (@re.class == Star)

  #to_s is evaluated before @re is replaced
  @re = NodeFactory.new_node(nil, '(' + to_s + ')*')

  #Unset the str rep
  @re_str = nil

  self
end
|
56
|
+
|
57
|
+
#Returns the kleene star of this regexp. Leaves the regexp untouched.
def star
  #Do not use the return value of star! here: it may be nil when the
  #star is a no-op (empty set, empty word, already starred regexp).
  copy = self.deep_copy
  copy.star!
  copy
end
|
61
|
+
|
62
|
+
#Concatenation of two regexps, returned as a new RegExp.
#If one of the operands is the empty set, the result is the empty set.
def *(other)
  if empty? or other.empty?
    RegExp.new
  else
    RegExp.new("(#{to_s})(#{other.to_s})")
  end
end
|
67
|
+
|
68
|
+
#Union of two regexps, returned as a new RegExp.
#If one operand is the empty set, a deep copy of the other one is returned.
def +(other)
  if other.empty?
    self.deep_copy
  elsif empty?
    other.deep_copy
  else
    RegExp.new("(#{to_s})|(#{other.to_s})")
  end
end
|
74
|
+
|
75
|
+
#--
|
76
|
+
#Some small flags
|
77
|
+
#++
|
78
|
+
#Is this regexp the empty word? The answer is given by the root node
#of the syntax tree.
def lambda?
  @re.lambda?
end
|
82
|
+
|
83
|
+
#Is this regexp the empty set? The answer is given by the root node
#of the syntax tree.
def empty?
  @re.empty?
end
|
87
|
+
|
88
|
+
#Does the empty word match this regexp? The answer is given by the
#root node of the syntax tree.
def null?
  @re.null?
end
|
92
|
+
|
93
|
+
#--
|
94
|
+
#Some properties of a regexp
|
95
|
+
#++
|
96
|
+
|
97
|
+
#Symbols a word matched by the regexp can begin with, as computed by
#the root node of the syntax tree.
def first
  @re.first
end
|
101
|
+
|
102
|
+
#Symbols a word matched by the regexp can end with, as computed by
#the root node of the syntax tree.
def last
  @re.last
end
|
106
|
+
|
107
|
+
#All pairs of symbols which can stand next to each other in a word
#matched by the regexp. Every pair is listed only once.
def follow
  pairs = @re.follow
  pairs.uniq
end
|
111
|
+
|
112
|
+
#--
|
113
|
+
#Conversion methods
|
114
|
+
#++
|
115
|
+
#String representation of the regexp. The string is built once from
#the syntax tree and then kept in @re_str until it is reset.
def to_s
  @re_str = @re.to_s unless @re_str
  @re_str
end
|
119
|
+
|
120
|
+
#Returns a minimal DFA which accepts the same language as the regexp.
#
#The letters of the regexp are numbered, an automaton is read off the
#first/last/follow information of the numbered regexp, it is made
#deterministic by the subset construction and finally minimized by
#RLSM::DFA#minimize.
def to_dfa
  #Step 1: Replace the i-th letter by the number i, so every letter is
  #unique. The leading 0 is a beginning marker. Metacharacters are kept.
  metachars = ['(', ')', '|', '*', '&']
  letters = []
  numbered = [0]
  to_s.each_char do |ch|
    if metachars.include?(ch)
      numbered << ch
    else
      letters << ch
      numbered << letters.size
    end
  end

  numbered_re = NodeFactory.new_node(nil, numbered)

  #Step 2: Read off the automaton of the numbered regexp and label each
  #transition with the original letter again (yields (maybe) a NFA).
  #The state 0 (the beginning marker) is the initial state.
  alphabet = letters.uniq
  initial = 0
  nfa_finals = numbered_re.last
  nfa_trans = numbered_re.follow.map { |from, to| [letters[to - 1], from, to] }

  #Step 3: Transform the NFA to a DFA. A DFA state is a sorted list of
  #NFA states, its number is its index in subsets.
  subsets = [[0]]
  frontier = [[0]]
  dfa_trans = []
  until frontier.empty?
    current = frontier.deep_copy
    frontier = []

    current.each do |subset|
      alphabet.each do |letter|
        moves = nfa_trans.select { |c, s1, _| c == letter && subset.include?(s1) }
        next if moves.empty?

        target = moves.map { |_, _, s2| s2 }.sort

        #Found a new state?
        unless subsets.include?(target)
          frontier << target
          subsets << target
        end

        #Found new trans?
        edge = [letter, subsets.index(subset), subsets.index(target)]
        dfa_trans << edge unless dfa_trans.include?(edge)
      end
    end
  end

  #A DFA state is final if it contains a final NFA state
  final_subsets = subsets.select do |subset|
    nfa_finals.any? { |nfa_final| subset.include?(nfa_final) }
  end
  finals = final_subsets.map { |subset| subsets.index(subset) }

  state_ids = (0...subsets.size).to_a

  #Step 4: Return the result
  dfa = RLSM::DFA.new(alphabet, state_ids, initial, finals, dfa_trans)
  dfa.minimize(:rename_states => true)
end
|
191
|
+
|
192
|
+
def inspect # :nodoc:
  #Class name followed by the quoted string representation
  "<%s : '%s' >" % [self.class, to_s]
end
|
195
|
+
|
196
|
+
#Returns true if the two regexps are the same, i.e. the DFAs returned
#by to_dfa compare equal.
#An object which can not be converted to a DFA is never equal to a regexp.
def ==(other)
  #Comparing with nil or an unrelated object must answer false instead
  #of failing with a NoMethodError on to_dfa.
  return false unless other.respond_to?(:to_dfa)

  to_dfa == other.to_dfa
end
|
200
|
+
|
201
|
+
|
202
|
+
private
|
203
|
+
#Checks that +str+ is a well formed regexp description.
#
#Returns true for a well formed string. Otherwise an ArgumentError is
#raised (a subclass of Exception, so clauses rescuing Exception still match).
def _well_formed?(str)
  #parentheses must be balanced, something like |) or *a or (| isn't allowed
  #1 balanced parentheses
  depth = 0
  str.each_char do |c|
    depth += 1 if c == '('
    depth -= 1 if c == ')'

    #A closing parenthesis without an opening one, e.g. ')('.
    #The sum alone would not notice it, it is zero again at the end.
    break if depth < 0
  end

  if depth != 0
    raise ArgumentError, "Unbalanced parenthesis in #{str}"
  end

  #2 bad sequences
  if str =~ /\(\)|\|\)|\(\||\|\*|^\*|\(\*/
    raise ArgumentError, "Bad character sequence #{$&} found in #{str}"
  end

  true
end
|
223
|
+
|
224
|
+
#Leaf of the syntax tree: a sequence of symbols without metacharacters,
#the empty word (&) or the empty set (no symbol at all).
class PrimExp
  #parent:: the enclosing node, stored but not used by this class
  #str:: the symbols of the leaf; '&' or ['&'] stands for the empty word.
  #      Everything else has to respond to reject (presumably an
  #      array of symbols).
  def initialize(parent, str)
    @parent = parent
    if str == '&' or str == ['&']
      @content = '&'
      @null = true
    else
      #An empty word between other symbols changes nothing
      @content = str.reject { |c| c == '&' }
      @null = false
    end
  end

  #Returns true if the empty word is matched, i.e. the leaf is &.
  def null?
    @null
  end

  #Returns the first symbol as a slice of length one,
  #nothing for the empty word.
  def first
    @null ? [] : @content[0,1]
  end

  #Returns the last symbol as a slice of length one,
  #nothing for the empty word.
  def last
    return [] if @null

    #For an empty content the slice from the back is nil. Return an
    #empty slice instead (like first does), the other nodes iterate
    #over the result.
    @content[-1,1] || @content[0,0]
  end

  #Returns the pairs of neighbouring symbols of the sequence.
  def follow
    (1...@content.length).map do |i|
      [@content[i-1,1], @content[i,1]]
    end
  end

  #Returns the symbols written one after another.
  def to_s
    #Array#to_s concatenates the elements only in Ruby 1.8,
    #Array#join does it in every version.
    @content.respond_to?(:join) ? @content.join : @content.to_s
  end

  #Returns true if this leaf is the empty word.
  def lambda?
    @null
  end

  #Returns true if this leaf is the empty set, i.e. has no symbols.
  def empty?
    @content == '' or @content == []
  end
end
|
270
|
+
|
271
|
+
#Inner node of the syntax tree for the kleene star of a sub expression.
class Star
  #str is the starred expression including the trailing *. The star is
  #cut off and the rest is parsed into the only child of this node.
  def initialize(parent, str)
    @parent = parent
    @child = NodeFactory.new_node(self, str[0...-1])
  end

  #A star always matches the empty word.
  def null?
    true
  end

  #The first symbols are those of the child.
  def first
    @child.first
  end

  #The last symbols are those of the child.
  def last
    @child.last
  end

  #The pairs of the child plus every pair (last symbol, first symbol),
  #because the child may be repeated.
  def follow
    pairs = @child.follow

    #Cross of last and first
    first.each do |head|
      last.each { |tail| pairs << [tail, head] }
    end

    pairs
  end

  #A single symbol gets the star directly, everything else is put
  #in parentheses.
  def to_s
    single_symbol = (@child.class == PrimExp and @child.to_s.length == 1)
    single_symbol ? "#{@child.to_s}*" : "(#{@child.to_s})*"
  end

  def lambda?
    false
  end

  def empty?
    false
  end
end
|
318
|
+
|
319
|
+
#Inner node of the syntax tree for the alternatives of a union.
class Union
  #str is cut at every | outside of parentheses, each piece is parsed
  #into one child.
  def initialize(parent, str)
    @parent = parent
    @childs = _split(str).map { |part| NodeFactory.new_node(self, part) }
  end

  #The empty word is matched if one alternative matches it.
  def null?
    @childs.any? { |child| child.null? }
  end

  #The first symbols of all alternatives.
  def first
    _gather(:first)
  end

  #The last symbols of all alternatives.
  def last
    _gather(:last)
  end

  #The follow pairs of all alternatives.
  def follow
    _gather(:follow)
  end

  #The alternatives joined by |. Parentheses are added unless this
  #node is the root or the parent is a union as well.
  def to_s
    alternatives = @childs.map { |child| child.to_s }.join('|')

    if @parent.nil? or @parent.class == Union
      alternatives
    else
      "(#{alternatives})"
    end
  end

  def lambda?
    false
  end

  def empty?
    false
  end

  private
  #Cuts str at every | which is not inside of parentheses.
  #Returns an array with one array of characters per alternative.
  def _split(str)
    depth = 0
    parts = [[]]

    str.each_char do |c|
      depth += 1 if c == '('
      depth -= 1 if c == ')'

      if c == '|' and depth == 0
        parts << []
      else
        parts.last << c
      end
    end

    parts
  end

  #Calls message on every child and collects the elements of all
  #results in one array, in the order of the childs.
  def _gather(message)
    collected = []
    @childs.each do |child|
      child.send(message).each { |item| collected << item }
    end
    collected
  end
end
|
401
|
+
|
402
|
+
#Inner node of the syntax tree for a concatenation of sub expressions.
class Concat
  #str is cut into its factors, each factor is parsed into one child.
  #Childs which are the empty word are dropped.
  def initialize(parent, str)
    @parent = parent
    nodes = _split(str).map { |part| NodeFactory.new_node(self, part) }
    @childs = nodes.reject { |node| node.lambda? }
  end

  #The empty word is matched if every factor matches it.
  def null?
    @childs.all? { |child| child.null? }
  end

  #The first symbols of the leading factors, up to and including the
  #first factor which does not match the empty word.
  def first
    _border(@childs, :first)
  end

  #The last symbols of the trailing factors, back to and including the
  #last factor which does not match the empty word.
  def last
    _border(@childs.reverse, :last)
  end

  #The follow pairs of all factors, plus for every factor the pairs of
  #its last symbols with the first symbols reachable behind it.
  def follow
    pairs = []

    @childs.each do |child|
      child.follow.each { |pair| pairs << pair }
    end

    (0...(@childs.size - 1)).each do |i|
      @childs[i].last.each do |l|
        _border(@childs[(i + 1)..-1], :first).each { |f| pairs << [l, f] }
      end
    end

    pairs
  end

  #The factors written one after another.
  def to_s
    @childs.map { |child| child.to_s }.join
  end

  def lambda?
    false
  end

  def empty?
    false
  end

  private
  #Cuts str into factors: a group in parentheses (with its star),
  #a starred symbol, or a run of plain symbols.
  #Returns an array with one non-empty array of characters per factor.
  def _split(str)
    depth = 0
    parts = [[]]
    previous = nil

    str.each_char do |c|
      depth += 1 if c == '('
      depth -= 1 if c == ')'

      if c == '(' and depth == 1
        #An outermost group starts a new factor
        parts << [c]
      elsif c == '*' and depth == 0
        if previous == ')'
          #The group was closed already, the star belongs to it
          parts[-2] << c
        else
          #The star binds only the symbol before it
          parts << [parts.last.pop, c]
          parts << []
        end
      elsif c == ')' and depth == 0
        #An outermost group ends here
        parts.last << c
        parts << []
      else
        parts.last << c
      end

      previous = c
    end

    parts.reject { |part| part.empty? }
  end

  #Calls message on the nodes in the given order and collects the
  #elements of the results. Stops behind the first node which does not
  #match the empty word.
  def _border(nodes, message)
    collected = []
    nodes.each do |node|
      node.send(message).each { |sym| collected << sym }

      break unless node.null?
    end
    collected
  end
end
|
511
|
+
|
512
|
+
#Decides which kind of syntax tree node a regexp description is
#and creates that node.
class NodeFactory
  #Returns a PrimExp, Star, Union or Concat node for arg. Parentheses
  #around the whole description are removed first. The checks are done
  #in this order: plain symbols, star, union; all the rest is a
  #concatenation.
  def self.new_node(parent, arg)
    #Remove parentheses
    str = arg.dup
    str = str[1...-1] while sp(str)

    #Choose the right node type
    node_class =
      if prim?(str)
        PrimExp
      elsif star?(str)
        Star
      elsif union?(str)
        Union
      else
        Concat
      end

    node_class.new(parent, str)
  end

  #NOTE: private has no effect on methods defined with def self., so
  #the following helpers can be called from outside as well.
  private
  #Returns true if str is surrounded by one matching pair of
  #parentheses, like (ab|c) but not like (a)(b).
  def self.sp(str)
    return false unless str[0,1].include?('(') && str[-1,1].include?(')')

    depth = 0
    consumed = 0
    str.each_char do |c|
      depth += 1 if c == '('
      depth -= 1 if c == ')'
      consumed += 1

      #The parenthesis opened at the beginning is closed here
      break if depth == 0
    end

    str.length == consumed
  end

  #Returns true if str contains none of the characters ( ) | *
  def self.prim?(str)
    ['(', ')', '|', '*'].none? { |c| str.include?(c) }
  end

  #Returns true if the star at the end of str belongs to the
  #whole of str, like in (....)* or a*
  def self.star?(str)
    return false unless str[-1,1].include?('*')

    sp(str[0...-1]) || str.length == 2
  end

  #Returns true if str contains a | outside of all parentheses.
  def self.union?(str)
    depth = 0

    str.each_char do |c|
      depth += 1 if c == '('
      depth -= 1 if c == ')'

      return true if depth == 0 and c == '|'
    end

    false
  end
end
|
583
|
+
end
|
584
|
+
end
|