stamina-induction 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.md +78 -0
- data/LICENCE.md +22 -0
- data/lib/stamina-induction/stamina-induction.rb +1 -0
- data/lib/stamina-induction/stamina/abbadingo.rb +2 -0
- data/lib/stamina-induction/stamina/abbadingo/random_dfa.rb +55 -0
- data/lib/stamina-induction/stamina/abbadingo/random_sample.rb +146 -0
- data/lib/stamina-induction/stamina/classifier.rb +55 -0
- data/lib/stamina-induction/stamina/command.rb +6 -0
- data/lib/stamina-induction/stamina/command/abbadingo_dfa.rb +80 -0
- data/lib/stamina-induction/stamina/command/abbadingo_samples.rb +39 -0
- data/lib/stamina-induction/stamina/command/classify.rb +47 -0
- data/lib/stamina-induction/stamina/command/infer.rb +140 -0
- data/lib/stamina-induction/stamina/command/metrics.rb +50 -0
- data/lib/stamina-induction/stamina/command/score.rb +34 -0
- data/lib/stamina-induction/stamina/dsl.rb +2 -0
- data/lib/stamina-induction/stamina/dsl/induction.rb +29 -0
- data/lib/stamina-induction/stamina/dsl/reg_lang.rb +69 -0
- data/lib/stamina-induction/stamina/induction.rb +13 -0
- data/lib/stamina-induction/stamina/induction/blue_fringe.rb +265 -0
- data/lib/stamina-induction/stamina/induction/commons.rb +156 -0
- data/lib/stamina-induction/stamina/induction/rpni.rb +186 -0
- data/lib/stamina-induction/stamina/induction/union_find.rb +377 -0
- data/lib/stamina-induction/stamina/input_string.rb +123 -0
- data/lib/stamina-induction/stamina/reg_lang.rb +226 -0
- data/lib/stamina-induction/stamina/reg_lang/canonical_info.rb +181 -0
- data/lib/stamina-induction/stamina/reg_lang/parser.rb +10 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/alternative.rb +19 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/node.rb +22 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/parenthesized.rb +12 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/parser.citrus +49 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/plus.rb +14 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/question.rb +17 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/regexp.rb +12 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/sequence.rb +15 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/star.rb +15 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/symbol.rb +14 -0
- data/lib/stamina-induction/stamina/sample.rb +309 -0
- data/lib/stamina-induction/stamina/scoring.rb +213 -0
- metadata +106 -0
@@ -0,0 +1,10 @@
|
|
1
|
+
require_relative "parser/node"
|
2
|
+
require_relative "parser/parenthesized"
|
3
|
+
require_relative "parser/symbol"
|
4
|
+
require_relative "parser/question"
|
5
|
+
require_relative "parser/plus"
|
6
|
+
require_relative "parser/star"
|
7
|
+
require_relative "parser/sequence"
|
8
|
+
require_relative "parser/alternative"
|
9
|
+
require_relative "parser/regexp"
|
10
|
+
Citrus.require File.expand_path("../parser/parser", __FILE__)
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Stamina
|
2
|
+
class RegLang
|
3
|
+
module Alternative
|
4
|
+
include Node
|
5
|
+
|
6
|
+
def to_fa!(fa)
|
7
|
+
from, to = fa.add_n_states(2)
|
8
|
+
f1, t1 = self.head.to_fa!(fa)
|
9
|
+
f2, t2 = self.tail.to_fa!(fa)
|
10
|
+
fa.connect(from, f1, nil)
|
11
|
+
fa.connect(from, f2, nil)
|
12
|
+
fa.connect(t1, to, nil)
|
13
|
+
fa.connect(t2, to, nil)
|
14
|
+
[from, to]
|
15
|
+
end
|
16
|
+
|
17
|
+
end # module Alternative
|
18
|
+
end # class RegLang
|
19
|
+
end # module Stamina
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Stamina
|
2
|
+
class RegLang
|
3
|
+
module Node
|
4
|
+
|
5
|
+
def to_fa
|
6
|
+
from, to = to_fa!(fa = Automaton.new)
|
7
|
+
from.initial!
|
8
|
+
to.accepting!
|
9
|
+
fa
|
10
|
+
end
|
11
|
+
|
12
|
+
def to_dfa
|
13
|
+
to_fa.to_dfa
|
14
|
+
end
|
15
|
+
|
16
|
+
def to_cdfa
|
17
|
+
to_fa.to_cdfa
|
18
|
+
end
|
19
|
+
|
20
|
+
end # module Node
|
21
|
+
end # class RegLang
|
22
|
+
end # module Stamina
|
@@ -0,0 +1,49 @@
|
|
1
|
+
grammar Stamina::RegLang::Parser
|
2
|
+
|
3
|
+
rule regexp
|
4
|
+
(space* alt:alternative space*) <Stamina::RegLang::Regexp>
|
5
|
+
end
|
6
|
+
|
7
|
+
rule alternative
|
8
|
+
(head:sequence space* '|' space* tail:alternative) <Stamina::RegLang::Alternative>
|
9
|
+
| sequence
|
10
|
+
end
|
11
|
+
|
12
|
+
rule sequence
|
13
|
+
(head:monadic space+ tail:sequence) <Stamina::RegLang::Sequence>
|
14
|
+
| monadic
|
15
|
+
end
|
16
|
+
|
17
|
+
rule monadic
|
18
|
+
star | plus | question | term
|
19
|
+
end
|
20
|
+
|
21
|
+
rule star
|
22
|
+
(term '*') <Stamina::RegLang::Star>
|
23
|
+
end
|
24
|
+
|
25
|
+
rule plus
|
26
|
+
(term '+') <Stamina::RegLang::Plus>
|
27
|
+
end
|
28
|
+
|
29
|
+
rule question
|
30
|
+
(term '?') <Stamina::RegLang::Question>
|
31
|
+
end
|
32
|
+
|
33
|
+
rule term
|
34
|
+
symbol | parenthesized
|
35
|
+
end
|
36
|
+
|
37
|
+
rule symbol
|
38
|
+
[a-zA-Z0-9$_-]+ <Stamina::RegLang::Symbol>
|
39
|
+
end
|
40
|
+
|
41
|
+
rule parenthesized
|
42
|
+
('(' space* expr:regexp space* ')') <Stamina::RegLang::Parenthesized>
|
43
|
+
end
|
44
|
+
|
45
|
+
rule space
|
46
|
+
[ \t\n]
|
47
|
+
end
|
48
|
+
|
49
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Stamina
|
2
|
+
class RegLang
|
3
|
+
module Question
|
4
|
+
include Node
|
5
|
+
|
6
|
+
def to_fa!(fa)
|
7
|
+
f1, t1 = fa.add_n_states(2)
|
8
|
+
f2, t2 = self.term.to_fa!(fa)
|
9
|
+
fa.connect(f1,f2,nil)
|
10
|
+
fa.connect(t2,t1,nil)
|
11
|
+
fa.connect(f1,t1,nil)
|
12
|
+
[f1, t1]
|
13
|
+
end
|
14
|
+
|
15
|
+
end # module Question
|
16
|
+
end # class RegLang
|
17
|
+
end # module Stamina
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Stamina
|
2
|
+
class RegLang
|
3
|
+
module Sequence
|
4
|
+
include Node
|
5
|
+
|
6
|
+
def to_fa!(fa)
|
7
|
+
f1, t1 = self.head.to_fa!(fa)
|
8
|
+
f2, t2 = self.tail.to_fa!(fa)
|
9
|
+
fa.connect(t1, f2, nil)
|
10
|
+
[f1, t2]
|
11
|
+
end
|
12
|
+
|
13
|
+
end # module Sequence
|
14
|
+
end # class RegLang
|
15
|
+
end # module Stamina
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Stamina
|
2
|
+
class RegLang
|
3
|
+
module Star
|
4
|
+
include Node
|
5
|
+
|
6
|
+
# Builds, inside `fa`, the automaton fragment recognizing zero or more
# repetitions of `term` and returns its [entry, exit] states.
#
# Two fresh states wrap the fragment built by `term` instead of reusing the
# term's own entry/exit states. Adding the skip and loop edges directly on
# the term's states lets nested constructs leak into each other: for
# `(a b*)*` the outer skip edge combined with the inner loop edge made the
# automaton accept "b".
#
# All edges added here carry a nil symbol, as in the sibling Question and
# Alternative constructions.
def to_fa!(fa)
  from, to = fa.add_n_states(2)
  inner_from, inner_to = self.term.to_fa!(fa)
  fa.connect(from, inner_from, nil)     # enter the repeated fragment
  fa.connect(inner_to, to, nil)         # leave after an iteration
  fa.connect(inner_to, inner_from, nil) # loop back for another iteration
  fa.connect(from, to, nil)             # zero iterations
  [from, to]
end
|
12
|
+
|
13
|
+
end # module Star
|
14
|
+
end # class RegLang
|
15
|
+
end # module Stamina
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Stamina
|
2
|
+
class RegLang
|
3
|
+
module Symbol
|
4
|
+
include Node
|
5
|
+
|
6
|
+
def to_fa!(fa)
|
7
|
+
from, to = fa.add_n_states(2, :initial => false, :accepting => false)
|
8
|
+
fa.connect(from, to, to_s)
|
9
|
+
[from, to]
|
10
|
+
end
|
11
|
+
|
12
|
+
end # module Symbol
|
13
|
+
end # class RegLang
|
14
|
+
end # module Stamina
|
@@ -0,0 +1,309 @@
|
|
1
|
+
module Stamina
|
2
|
+
|
3
|
+
#
|
4
|
+
# A sample as an ordered collection of InputString labeled as positive or negative.
|
5
|
+
#
|
6
|
+
# == Tips and tricks
|
7
|
+
# - loading samples from disk is easy thanks to ADL !
|
8
|
+
#
|
9
|
+
# == Detailed API
|
10
|
+
class Sample
|
11
|
+
include Enumerable
|
12
|
+
|
13
|
+
# Number of strings in the sample
|
14
|
+
attr_reader :size
|
15
|
+
|
16
|
+
# Number of positive strings in the sample
|
17
|
+
attr_reader :positive_count
|
18
|
+
|
19
|
+
# Number of negative strings in the sample
|
20
|
+
attr_reader :negative_count
|
21
|
+
|
22
|
+
#
|
23
|
+
# Creates an empty sample and appends args to it, by calling Sample#<< on
|
24
|
+
# each of them.
|
25
|
+
#
|
26
|
+
def self.[](*args) Sample.new << args end
|
27
|
+
|
28
|
+
#
|
29
|
+
# Creates a sample. It starts empty; when `strings` is given, each of its elements is then added with Sample#<<.
|
30
|
+
#
|
31
|
+
def initialize(strings = nil)
|
32
|
+
@strings = []
|
33
|
+
@size, @positive_count, @negative_count = 0, 0, 0
|
34
|
+
strings.each{|s| self << s } unless strings.nil?
|
35
|
+
end
|
36
|
+
|
37
|
+
#
|
38
|
+
# Coerces `arg` to a Sample instance.
|
39
|
+
#
|
40
|
+
def self.coerce(arg)
|
41
|
+
if arg.is_a?(Sample)
|
42
|
+
arg
|
43
|
+
elsif arg.is_a?(String)
|
44
|
+
parse(arg)
|
45
|
+
else
|
46
|
+
raise ArgumentError, "Invalid argument #{arg} for `Sample`"
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
#
|
51
|
+
# Parses an ADL input
|
52
|
+
#
|
53
|
+
def self.parse(adl)
|
54
|
+
ADL::parse_sample(adl)
|
55
|
+
end
|
56
|
+
|
57
|
+
#
|
58
|
+
# Returns true if this sample does not contain any string,
|
59
|
+
# false otherwise.
|
60
|
+
#
|
61
|
+
def empty?()
|
62
|
+
@size==0
|
63
|
+
end
|
64
|
+
|
65
|
+
#
|
66
|
+
# Adds a string to the sample. The _str_ argument may be an InputString instance,
|
67
|
+
# a String (parsed using ADL), a Sample instance (all strings are added) or an
|
68
|
+
# Array (recurses on each element).
|
69
|
+
#
|
70
|
+
# NOTE: the consistency check (an InconsistencyError when the same string already
|
71
|
+
# exists with the opposite label) is currently commented out below, so it is not raised. Raises an ArgumentError if the _str_ argument is not recognized.
|
72
|
+
#
|
73
|
+
def <<(str)
|
74
|
+
case str
|
75
|
+
when InputString
|
76
|
+
#raise(InconsistencyError, "Inconsistent sample on #{str}", caller) if self.include?(str.negate)
|
77
|
+
@size += 1
|
78
|
+
str.positive? ? (@positive_count += 1) : (@negative_count += 1)
|
79
|
+
@strings << str
|
80
|
+
when String
|
81
|
+
self << ADL::parse_string(str)
|
82
|
+
when Sample
|
83
|
+
str.each {|s| self << s}
|
84
|
+
when Array
|
85
|
+
str.each {|s| self << s}
|
86
|
+
else
|
87
|
+
raise(ArgumentError, "#{str} is not a valid argument.", caller)
|
88
|
+
end
|
89
|
+
self
|
90
|
+
end
|
91
|
+
|
92
|
+
#
|
93
|
+
# Returns true if a given string is included in the sample, false otherwise.
|
94
|
+
# This method allows the same flexibility as << for the _str_ argument.
|
95
|
+
#
|
96
|
+
def include?(str)
|
97
|
+
case str
|
98
|
+
when InputString
|
99
|
+
@strings.include?(str)
|
100
|
+
when String
|
101
|
+
include?(ADL::parse_string(str))
|
102
|
+
when Array
|
103
|
+
str.each {|s| return false unless include?(s)}
|
104
|
+
true
|
105
|
+
when Sample
|
106
|
+
str.each {|s| return false unless include?(s)}
|
107
|
+
true
|
108
|
+
else
|
109
|
+
raise(ArgumentError, "#{str} is not a valid argument.", caller)
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
#
|
114
|
+
# Returns a new sample as the union of both `self` and `other`
|
115
|
+
#
|
116
|
+
def +(other)
|
117
|
+
s = Sample.new
|
118
|
+
each{|x| s << x}
|
119
|
+
other.each{|x| s << x}
|
120
|
+
s
|
121
|
+
end
|
122
|
+
|
123
|
+
#
# Compares with another sample _other_. Returns true if the two samples contain
# the same strings (including labels), false otherwise.
#
# Any _other_ that is not a Sample instance is considered different: false is
# returned instead of letting include? raise, so that expressions such as
# `sample == nil` are safe.
#
def ==(other)
  return false unless other.is_a?(Sample)
  include?(other) && other.include?(self)
end
|
131
|
+
alias :eql? :==
|
132
|
+
|
133
|
+
#
|
134
|
+
# Computes a hash code for this sample.
|
135
|
+
#
|
136
|
+
def hash
|
137
|
+
self.inject(37){|memo,str| memo + 17*str.hash}
|
138
|
+
end
|
139
|
+
|
140
|
+
#
|
141
|
+
# Yields the block with each string. This method has no effect if no
|
142
|
+
# block is given.
|
143
|
+
#
|
144
|
+
def each
|
145
|
+
return unless block_given?
|
146
|
+
@strings.each {|str| yield str}
|
147
|
+
end
|
148
|
+
|
149
|
+
#
|
150
|
+
# Yields the block with each positive string. This method has no effect if no
|
151
|
+
# block is given.
|
152
|
+
#
|
153
|
+
def each_positive
|
154
|
+
return unless block_given?
|
155
|
+
each {|str| yield str if str.positive?}
|
156
|
+
end
|
157
|
+
|
158
|
+
#
# Returns an enumerator on positive strings.
#
# The enumerator is built with Object#to_enum over each_positive. The former
# two-argument form Enumerator.new(object, method) is deprecated and is no
# longer accepted by current Ruby versions, where it raises an ArgumentError.
#
def positive_enumerator
  to_enum(:each_positive)
end
|
168
|
+
|
169
|
+
#
# Yields the block with each negative string. This method has no effect if no
# block is given.
#
def each_negative
  # Guard as in each and each_positive: without it, the yield below raises a
  # LocalJumpError as soon as a negative string is met.
  return unless block_given?
  each { |str| yield str if str.negative? }
end
|
176
|
+
|
177
|
+
#
# Returns an enumerator on negative strings.
#
# The enumerator is built with Object#to_enum over each_negative. The former
# two-argument form Enumerator.new(object, method) is deprecated and is no
# longer accepted by current Ruby versions, where it raises an ArgumentError.
#
def negative_enumerator
  to_enum(:each_negative)
end
|
187
|
+
|
188
|
+
#
|
189
|
+
# Checks if the sample is correctly classified by a given classifier
|
190
|
+
# (expected to include the Stamina::Classifier module).
|
191
|
+
# Unlabeled strings are simply ignored.
|
192
|
+
#
|
193
|
+
def correctly_classified_by?(classifier)
|
194
|
+
classifier.correctly_classify?(self)
|
195
|
+
end
|
196
|
+
|
197
|
+
#
|
198
|
+
# Computes and returns the binary signature of the sample. The signature
|
199
|
+
# is a String having one character for each string in the sample. A '1'
|
200
|
+
# is used for positive strings, '0' for negative ones and '?' for unlabeled.
|
201
|
+
#
|
202
|
+
def signature
|
203
|
+
signature = ''
|
204
|
+
each do |str|
|
205
|
+
signature << (str.unlabeled? ? '?' : str.positive? ? '1' : '0')
|
206
|
+
end
|
207
|
+
signature
|
208
|
+
end
|
209
|
+
|
210
|
+
#
|
211
|
+
# Takes only a given proportion of this sample and returns it as a new Sample.
|
212
|
+
#
|
213
|
+
def take(proportion = 0.5)
|
214
|
+
taken = Stamina::Sample.new
|
215
|
+
each_positive{|s| taken << s if Kernel.rand < proportion}
|
216
|
+
each_negative{|s| taken << s if Kernel.rand < proportion}
|
217
|
+
taken
|
218
|
+
end
|
219
|
+
|
220
|
+
#
|
221
|
+
# Prints an ADL description of this sample on the buffer.
|
222
|
+
#
|
223
|
+
def to_adl(buffer="")
|
224
|
+
self.inject(buffer) {|memo,str| memo << "\n" << str.to_adl}
|
225
|
+
end
|
226
|
+
alias :to_s :to_adl
|
227
|
+
alias :inspect :to_adl
|
228
|
+
|
229
|
+
#
|
230
|
+
# Converts a Sample to an (augmented) prefix tree acceptor. This method ensures
|
231
|
+
# that the states of the PTA are in lexical order, according to the <code><=></code>
|
232
|
+
# operator defined on symbols. States reached by negative strings are tagged as
|
233
|
+
# non accepting and error.
|
234
|
+
#
|
235
|
+
def self.to_pta(sample)
|
236
|
+
thepta = Automaton.new do |pta|
|
237
|
+
initial_state = add_state(:initial => true, :accepting => false)
|
238
|
+
|
239
|
+
# Fill the PTA with each string
|
240
|
+
sample.each do |str|
|
241
|
+
# split string using the dfa
|
242
|
+
parsed, reached, remaining = pta.dfa_split(str, initial_state)
|
243
|
+
|
244
|
+
# remaining symbols are not empty -> build the PTA
|
245
|
+
unless remaining.empty?
|
246
|
+
remaining.each do |symbol|
|
247
|
+
newone = pta.add_state(:initial => false, :accepting => false, :error => false)
|
248
|
+
pta.connect(reached, newone, symbol)
|
249
|
+
reached = newone
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
253
|
+
# flag state
|
254
|
+
str.positive? ? reached.accepting! : reached.error!
|
255
|
+
|
256
|
+
# check consistency, should not arrive as Sample does not allow
|
257
|
+
# inconsistencies. Should appear only if _sample_ is not a Sample
|
258
|
+
# instance but some other enumerable.
|
259
|
+
raise(InconsistencyError, "Inconsistent sample on #{str}", caller)\
|
260
|
+
if (reached.error? and reached.accepting?)
|
261
|
+
end
|
262
|
+
|
263
|
+
# Reindex states by applying BFS
|
264
|
+
to_index, index = [initial_state], 0
|
265
|
+
until to_index.empty?
|
266
|
+
state = to_index.shift
|
267
|
+
state[:__index__] = index
|
268
|
+
state.out_edges.sort{|e,f| e.symbol<=>f.symbol}.each{|e| to_index << e.target}
|
269
|
+
index += 1
|
270
|
+
end
|
271
|
+
end
|
272
|
+
|
273
|
+
# Now we rebuild a fresh one with states in order.
|
274
|
+
# This looks more efficient than reordering the states of the PTA
|
275
|
+
Automaton.new do |ordered|
|
276
|
+
ordered.add_n_states(thepta.state_count)
|
277
|
+
thepta.each_state do |pta_state|
|
278
|
+
source = ordered.ith_state(pta_state[:__index__])
|
279
|
+
source.initial! if pta_state.initial?
|
280
|
+
source.accepting! if pta_state.accepting?
|
281
|
+
source.error! if pta_state.error?
|
282
|
+
pta_state.out_edges.each do |e|
|
283
|
+
target = ordered.ith_state(e.target[:__index__])
|
284
|
+
ordered.connect(source, target, e.symbol)
|
285
|
+
end
|
286
|
+
end
|
287
|
+
end
|
288
|
+
|
289
|
+
end
|
290
|
+
|
291
|
+
# Converts this sample to a PTA
|
292
|
+
def to_pta
|
293
|
+
Sample.to_pta(self)
|
294
|
+
end
|
295
|
+
alias :to_fa :to_pta
|
296
|
+
alias :to_dfa :to_pta
|
297
|
+
|
298
|
+
# Converts this sample to a canonical dfa
|
299
|
+
def to_cdfa
|
300
|
+
to_pta.to_cdfa
|
301
|
+
end
|
302
|
+
|
303
|
+
# Converts this sample to a dot output
|
304
|
+
def to_dot
|
305
|
+
to_pta.to_dot
|
306
|
+
end
|
307
|
+
|
308
|
+
end # class Sample
|
309
|
+
end # module Stamina
|