stamina 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/CHANGELOG.md +22 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +33 -0
- data/LICENCE.md +22 -0
- data/Manifest.txt +16 -0
- data/README.md +78 -0
- data/Rakefile +23 -0
- data/bin/adl2dot +12 -0
- data/bin/classify +12 -0
- data/bin/redblue +12 -0
- data/bin/rpni +12 -0
- data/example/adl/automaton.adl +49 -0
- data/example/adl/sample.adl +53 -0
- data/example/basic/characteristic_sample.adl +32 -0
- data/example/basic/target.adl +9 -0
- data/example/competition/31_test.adl +1500 -0
- data/example/competition/31_training.adl +1759 -0
- data/lib/stamina.rb +19 -0
- data/lib/stamina/adl.rb +298 -0
- data/lib/stamina/automaton.rb +1237 -0
- data/lib/stamina/automaton/walking.rb +336 -0
- data/lib/stamina/classifier.rb +37 -0
- data/lib/stamina/command/adl2dot_command.rb +73 -0
- data/lib/stamina/command/classify_command.rb +57 -0
- data/lib/stamina/command/redblue_command.rb +58 -0
- data/lib/stamina/command/rpni_command.rb +58 -0
- data/lib/stamina/command/stamina_command.rb +79 -0
- data/lib/stamina/errors.rb +20 -0
- data/lib/stamina/induction/commons.rb +170 -0
- data/lib/stamina/induction/redblue.rb +264 -0
- data/lib/stamina/induction/rpni.rb +188 -0
- data/lib/stamina/induction/union_find.rb +377 -0
- data/lib/stamina/input_string.rb +123 -0
- data/lib/stamina/loader.rb +0 -0
- data/lib/stamina/markable.rb +42 -0
- data/lib/stamina/sample.rb +190 -0
- data/lib/stamina/version.rb +14 -0
- data/stamina.gemspec +190 -0
- data/stamina.noespec +35 -0
- data/tasks/debug_mail.rake +78 -0
- data/tasks/debug_mail.txt +13 -0
- data/tasks/gem.rake +68 -0
- data/tasks/spec_test.rake +79 -0
- data/tasks/unit_test.rake +77 -0
- data/tasks/yard.rake +51 -0
- data/test/stamina/adl_test.rb +491 -0
- data/test/stamina/automaton_additional_test.rb +190 -0
- data/test/stamina/automaton_classifier_test.rb +155 -0
- data/test/stamina/automaton_test.rb +1092 -0
- data/test/stamina/automaton_to_dot_test.rb +64 -0
- data/test/stamina/automaton_walking_test.rb +206 -0
- data/test/stamina/exit.rb +3 -0
- data/test/stamina/induction/induction_test.rb +70 -0
- data/test/stamina/induction/redblue_mergesamestatebug_expected.adl +19 -0
- data/test/stamina/induction/redblue_mergesamestatebug_pta.dot +64 -0
- data/test/stamina/induction/redblue_mergesamestatebug_sample.adl +9 -0
- data/test/stamina/induction/redblue_test.rb +83 -0
- data/test/stamina/induction/redblue_universal_expected.adl +4 -0
- data/test/stamina/induction/redblue_universal_sample.adl +5 -0
- data/test/stamina/induction/rpni_inria_expected.adl +7 -0
- data/test/stamina/induction/rpni_inria_sample.adl +9 -0
- data/test/stamina/induction/rpni_test.rb +129 -0
- data/test/stamina/induction/rpni_test_pta.dot +22 -0
- data/test/stamina/induction/rpni_universal_expected.adl +4 -0
- data/test/stamina/induction/rpni_universal_sample.adl +4 -0
- data/test/stamina/induction/union_find_test.rb +124 -0
- data/test/stamina/input_string_test.rb +323 -0
- data/test/stamina/markable_test.rb +70 -0
- data/test/stamina/randdfa.adl +66 -0
- data/test/stamina/sample.adl +4 -0
- data/test/stamina/sample_classify_test.rb +149 -0
- data/test/stamina/sample_test.rb +218 -0
- data/test/stamina/small_dfa.dot +16 -0
- data/test/stamina/small_dfa.gif +0 -0
- data/test/stamina/small_nfa.dot +18 -0
- data/test/stamina/small_nfa.gif +0 -0
- data/test/stamina/stamina_test.rb +69 -0
- data/test/test_all.rb +7 -0
- metadata +279 -0
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'stamina/command/stamina_command'
|
2
|
+
require 'stamina/induction/redblue'
|
3
|
+
module Stamina
|
4
|
+
module Command
|
5
|
+
|
6
|
+
# Implementation of the redblue command line tool
|
7
|
+
class RedBlueCommand < StaminaCommand
|
8
|
+
|
9
|
+
# Creates a redblue command instance
|
10
|
+
def initialize
  # Name, usage line and help text are handed over to the parent constructor.
  name = "redblue"
  usage = "[options] sample.adl"
  description = "Executes RedBlue (Regular Positive and Negative Inference) on a ADL sample and\n" \
                "flushes the induced DFA on the standard output in ADL format as well"
  super(name, usage, description)
end
|
15
|
+
|
16
|
+
# Installs additional options
|
17
|
+
def options
  # Hands a block to the inherited options method to register the
  # switches specific to this tool.
  super do |parser|
    parser.on("-v", "--verbose", "Verbose mode") { @verbose = true }

    parser.on("-o", "--output=OUTPUT", "Flush induced DFA in output file") do |path|
      # Validate the target before remembering it.
      assert_writable_file(path)
      @output_file = path
    end
  end
end
|
29
|
+
|
30
|
+
# Sets the sample file
|
31
|
+
def sample_file=(file)
  # Checks the file, parses it as an ADL sample and keeps the result in @sample.
  assert_readable_file(file)
  if @verbose
    puts "Parsing sample and building PTA"
  end
  @sample = Stamina::ADL.parse_sample_file(file)
rescue Stamina::ADL::ParseError
  # Parse failures are re-raised as argument errors naming the offending file.
  message = "#{file} is not a valid ADL sample file"
  raise ArgumentError, message
end
|
38
|
+
|
39
|
+
# Executes the command
|
40
|
+
def main(argv)
  # Parses the command line; the single remaining argument is installed
  # through the sample_file= writer.
  parse(argv, :sample_file)

  # Run the induction, timing it for the verbose report.
  started_at = Time.now
  dfa = Stamina::Induction::RedBlue.execute(@sample, {:verbose => @verbose})
  finished_at = Time.now

  # Print the induced automaton to the requested file, or to standard output.
  if @output_file
    File.open(@output_file, 'w') { |io| Stamina::ADL.print_automaton(dfa, io) }
  else
    Stamina::ADL.print_automaton(dfa, STDOUT)
  end

  puts "Executed in #{finished_at - started_at} sec" if @verbose
end
|
54
|
+
|
55
|
+
end # class RedBlueCommand
|
56
|
+
|
57
|
+
end # module Command
|
58
|
+
end # module Stamina
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'stamina/command/stamina_command'
|
2
|
+
require 'stamina/induction/rpni'
|
3
|
+
module Stamina
|
4
|
+
module Command
|
5
|
+
|
6
|
+
# Implementation of the rpni command line tool
|
7
|
+
class RPNICommand < StaminaCommand
|
8
|
+
|
9
|
+
# Creates an rpni command instance
|
10
|
+
def initialize
  # Help text shown in the banner; passed with the command name and usage
  # line to the parent constructor.
  help = "Executes RPNI (Regular Positive and Negative Inference) on a ADL sample and\n" \
         "flushes the induced DFA on the standard output in ADL format as well"
  super("rpni", "[options] sample.adl", help)
end
|
15
|
+
|
16
|
+
# Installs additional options
|
17
|
+
def options
  # Registers the rpni-specific switches through the block given to the
  # inherited options method.
  super do |parser|
    parser.on("-v", "--verbose", "Verbose mode") { @verbose = true }

    parser.on("-o", "--output=OUTPUT", "Flush induced DFA in output file") do |target|
      # Validate the target before remembering it.
      assert_writable_file(target)
      @output_file = target
    end
  end
end
|
29
|
+
|
30
|
+
# Sets the sample file
|
31
|
+
def sample_file=(file)
  # Validates the path, then loads the ADL sample into @sample.
  assert_readable_file(file)
  if @verbose
    puts "Parsing sample and building PTA"
  end
  @sample = Stamina::ADL.parse_sample_file(file)
rescue Stamina::ADL::ParseError
  # An unparsable file is reported as an invalid argument.
  reason = "#{file} is not a valid ADL sample file"
  raise ArgumentError, reason
end
|
38
|
+
|
39
|
+
# Executes the command
|
40
|
+
def main(argv)
  # Parses the command line; the single remaining argument is installed
  # through the sample_file= writer.
  parse(argv, :sample_file)

  # Run the induction, timing it for the verbose report.
  before = Time.now
  dfa = Stamina::Induction::RPNI.execute(@sample, {:verbose => @verbose})
  after = Time.now

  # Print the induced automaton to the requested file, or to standard output.
  if @output_file
    File.open(@output_file, 'w') { |io| Stamina::ADL.print_automaton(dfa, io) }
  else
    Stamina::ADL.print_automaton(dfa, STDOUT)
  end

  puts "Executed in #{after - before} sec" if @verbose
end
|
54
|
+
|
55
|
+
end # class RPNICommand
|
56
|
+
|
57
|
+
end # module Command
|
58
|
+
end # module Stamina
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'stamina'
|
2
|
+
require 'optparse'
|
3
|
+
module Stamina
|
4
|
+
module Command
|
5
|
+
|
6
|
+
# Helper to create stamina commands
|
7
|
+
class StaminaCommand
|
8
|
+
|
9
|
+
# Command name
|
10
|
+
attr_reader :name
|
11
|
+
|
12
|
+
# Command description
|
13
|
+
attr_reader :description
|
14
|
+
|
15
|
+
# Command usage
|
16
|
+
attr_reader :usage
|
17
|
+
|
18
|
+
# Creates a command with a name, usage and description
|
19
|
+
def initialize(name, usage, description)
  # Store the three banner ingredients; they are exposed through the
  # attribute readers declared on the class.
  @name, @usage = name, usage
  @description = description
end
|
24
|
+
|
25
|
+
# Creates options
|
26
|
+
def options(&block)
  # Builds a fresh OptionParser carrying the command banner, lets the
  # caller's block register extra switches, then appends -h/--help.
  OptionParser.new do |parser|
    parser.program_name = name
    parser.version = Stamina::VERSION
    parser.release = nil
    parser.summary_indent = ' ' * 4

    # Each heredoc line starts with indentation followed by "# "; the gsub
    # below strips that prefix so the banner starts at column zero.
    template = <<-EOF
      # usage: #{parser.program_name} #{usage}
      # #{description}
    EOF
    parser.banner = template.gsub(/[ \t]+# /, "")

    block.call(parser) unless block.nil?

    parser.on_tail("-h", "--help", "Show this message") do
      puts parser
      exit
    end
  end
end
|
44
|
+
|
45
|
+
# Prints usage (and optionally exits)
|
46
|
+
def show_usage(and_exit=true)
  # Print the option parser (banner plus switches) on standard output.
  puts(options)
  return unless and_exit
  Kernel.exit
end
|
50
|
+
|
51
|
+
# Checks that a given file is readable or raises an ArgumentError
|
52
|
+
def assert_readable_file(file)
  # Raises ArgumentError when +file+ is missing or unreadable; returns nil
  # otherwise.
  #
  # File.exist? is used instead of File.exists?: the latter was deprecated
  # and then removed in Ruby 3.2, where calling it raises NoMethodError.
  raise ArgumentError, "File #{file} does not exists" unless File.exist?(file)
  raise ArgumentError, "File #{file} cannot be read" unless File.readable?(file)
end
|
56
|
+
|
57
|
+
# Checks that a given file is writable or raises an ArgumentError
|
58
|
+
def assert_writable_file(file)
  # Raises ArgumentError when +file+ already exists but cannot be written.
  # A path that does not exist yet is accepted; returns nil on success.
  #
  # File.exist? is used instead of File.exists?: the latter was deprecated
  # and then removed in Ruby 3.2, where calling it raises NoMethodError.
  if File.exist?(file) && !File.writable?(file)
    raise ArgumentError, "File #{file} cannot be written"
  end
end
|
62
|
+
|
63
|
+
# Parses arguments and installs the remaining ones as instance variables
|
64
|
+
def parse(argv, *variables)
  # Parses +argv+ with the command's option parser, then passes each
  # remaining argument to the writer named after the matching entry of
  # +variables+ (e.g. :sample_file -> sample_file=). Usage is shown when the
  # number of remaining arguments does not match.
  rest = options.parse(argv)
  show_usage(true) unless rest.size == variables.size
  variables.each_with_index do |var, i|
    send("#{var}=".to_sym, rest[i])
  end
rescue ArgumentError, OptionParser::ParseError => ex
  # OptionParser::ParseError (unknown switch, missing argument, ...) is not
  # an ArgumentError, so it is rescued explicitly; otherwise a mistyped
  # option would surface as a raw exception instead of the usage message.
  puts ex.message
  puts
  show_usage(true)
end
|
75
|
+
|
76
|
+
end # class StaminaCommand
|
77
|
+
|
78
|
+
end # module Command
|
79
|
+
end # module Stamina
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Stamina

  # Root of the stamina exception hierarchy.
  class StaminaError < StandardError
  end

  # Signals an inconsistent sample, i.e. one where the same string is
  # labeled both positive and negative. Raised by sample implementations
  # and by induction algorithms.
  class InconsistencyError < StaminaError
  end

  # Errors specific to the ADL module.
  module ADL

    # Raised at parsing time when an automaton, string or sample does not
    # respect the ADL format.
    class ParseError < StaminaError
    end

  end

end # module Stamina
|
@@ -0,0 +1,170 @@
|
|
1
|
+
module Stamina
|
2
|
+
module Induction
|
3
|
+
|
4
|
+
#
|
5
|
+
# Defines common utilities used by rpni and redblue. About acronyms:
|
6
|
+
# - _pta_ stands for Prefix Tree Acceptor
|
7
|
+
# - _ufds_ stands for Union-Find Data Structure
|
8
|
+
#
|
9
|
+
# Methods pta2ufds, sample2pta and sample2ufds are simply conversion methods used
|
10
|
+
# when the induction algorithm starts (executed on a sample, it first builds a pta
|
11
|
+
# then converts it to a union find). Method ufds2dfa is used when the algorithm ends,
|
12
|
+
# to convert refined union find to a dfa.
|
13
|
+
#
|
14
|
+
# The merge_user_data method is probably the most important as it actually computes
|
15
|
+
# the merging of two states and builds information about merging for determinization.
|
16
|
+
#
|
17
|
+
module Commons
|
18
|
+
|
19
|
+
#
|
20
|
+
# Factors and returns a UnionFind data structure from a PTA, keeping natural order
|
21
|
+
# of its states for union-find elements. The resulting UnionFind contains a Hash as
|
22
|
+
# mergeable user data, presenting the following keys:
|
23
|
+
# - :initial, :accepting and :error flags of each state
|
24
|
+
# - :master indicating the index of the state in the PTA
|
25
|
+
# - :delta a delta function through a Hash {symbol => state_index}
|
26
|
+
#
|
27
|
+
# In this version, other user data attached to PTA states is lost during the
|
28
|
+
# conversion.
|
29
|
+
#
|
30
|
+
def pta2ufds(pta)
  # One union-find element per PTA state, in state order; the block result
  # is the user data hash attached to element +index+.
  Stamina::Induction::UnionFind.new(pta.state_count) do |index|
    state = pta.ith_state(index)
    user_data = {
      :initial   => state.initial?,
      :accepting => state.accepting?,
      :error     => state.error?,
      :master    => index,
      :delta     => {}
    }
    # Outgoing transitions are recorded as symbol => index of target state.
    transitions = user_data[:delta]
    state.out_edges.each do |edge|
      transitions[edge.symbol] = edge.target.index
    end
    user_data
  end
end
|
42
|
+
|
43
|
+
#
|
44
|
+
# Converts a Sample to an (augmented) prefix tree acceptor. This method ensures
|
45
|
+
# that the states of the PTA are in lexical order, according to the <code><=></code>
|
46
|
+
# operator defined on symbols. States reached by negative strings are tagged as
|
47
|
+
# non accepting and error.
|
48
|
+
#
|
49
|
+
def sample2pta(sample)
  Automaton.new do |pta|
    # Root of the prefix tree. Created through the yielded automaton, like
    # every other state below, so the block does not depend on how
    # Automaton.new evaluates it.
    initial_state = pta.add_state(:initial => true, :accepting => false)

    # Fill the PTA with each string
    sample.each do |str|
      # Split the string using what is already in the tree.
      # NOTE(review): dfa_split presumably returns the parsed prefix, the
      # state reached and the unparsed suffix -- confirm against Automaton.
      _parsed, reached, remaining = pta.dfa_split(str, initial_state)

      # Extend the tree with one fresh state per remaining symbol
      unless remaining.empty?
        remaining.each do |symbol|
          newone = pta.add_state(:initial => false, :accepting => false, :error => false)
          pta.connect(reached, newone, symbol)
          reached = newone
        end
      end

      # Flag the final state: accepting for positive strings, error otherwise
      str.positive? ? reached.accepting! : reached.error!

      # A state flagged both ways means the same string is labeled positive
      # and negative.
      raise(InconsistencyError, "Inconsistent sample on #{str}", caller)\
        if (reached.error? and reached.accepting?)
    end

    # Number states breadth-first, visiting outgoing edges in symbol order
    to_index, index = [initial_state], 0
    until to_index.empty?
      state = to_index.shift
      state[:__index__] = index
      state.out_edges.sort{|e,f| e.symbol<=>f.symbol}.each {|e| to_index << e.target}
      index += 1
    end
    # Force the automaton to reindex according to the temporary marks
    pta.order_states{|s0,s1| s0[:__index__]<=>s1[:__index__]}
    # Remove marks
    pta.states.each{|s| s.remove_mark(:__index__)}
  end
end
|
91
|
+
|
92
|
+
#
|
93
|
+
# Converts a Sample instance to a 'ready to refine' union find data structure.
|
94
|
+
# This method is simply a shortcut for <code>pta2ufds(sample2pta(sample))</code>.
|
95
|
+
#
|
96
|
+
def sample2ufds(sample)
  # Two-step conversion: sample -> prefix tree acceptor -> union find.
  pta = sample2pta(sample)
  pta2ufds(pta)
end
|
99
|
+
|
100
|
+
#
|
101
|
+
# Computes the quotient automaton from a refined UnionFind data structure.
|
102
|
+
#
|
103
|
+
# In this version, only accepting and initial flags are taken into account
|
104
|
+
# when creating quotient automaton states. Other user data is lost during
|
105
|
+
# the conversion.
|
106
|
+
#
|
107
|
+
def ufds2dfa(ufds)
  Automaton.new(false) do |fa|
    groups = ufds.mergeable_datas
    # Keys that describe the union find itself and are not state attributes.
    internal_keys = [:master, :count, :delta]

    # First pass: one state per group, named after the group's master index.
    groups.each do |group|
      attributes = group.reject { |key, _value| internal_keys.include?(key) }
      attributes[:name] = group[:master].to_s
      attributes[:error] = false
      fa.add_state(attributes)
    end

    # Second pass: connect states, redirecting every transition to the
    # state named after the leader of its target.
    groups.each do |group|
      source = fa.get_state(group[:master].to_s)
      group[:delta].each_pair do |symbol, target_index|
        destination = fa.get_state(ufds.find(target_index).to_s)
        fa.connect(source, destination, symbol)
      end
    end
  end
end
|
125
|
+
|
126
|
+
#
|
127
|
+
# Merges two user data hashes _d1_ and _d2_ according to rules defined
|
128
|
+
# below. Also fills a _determinization_ array with pairs of state indices
|
129
|
+
# that are reached from d1 and d2 through the same symbol and should be
|
130
|
+
# merged for determinization. This method does NOT ensure that those pairs
|
131
|
+
# correspond to distinct states according to the union find. In other
|
132
|
+
# words state indices in these pairs do not necessarily correspond to master
|
133
|
+
# states (see UnionFind for this term).
|
134
|
+
#
|
135
|
+
# Returns the resulting data if the merge is successful (does not lead to
|
136
|
+
# merging an error state with an accepting one), nil otherwise.
|
137
|
+
#
|
138
|
+
# The merging procedure for the different hash keys is as follows:
|
139
|
+
# - result[:initial] = d1[:initial] or d2[:initial]
|
140
|
+
# - result[:accepting] = d1[:accepting] or d2[:accepting]
|
141
|
+
# - result[:error] = d1[:error] or d2[:error]
|
142
|
+
# - result[:master] = min(d1[:master], d2[:master])
|
143
|
+
# - result[:delta] = merging of delta hashes, keeping smaller target index
|
144
|
+
# on key collisions.
|
145
|
+
#
|
146
|
+
def merge_user_data(d1, d2, determinization)
  # Flags are or-ed together; the group keeps the smaller master index.
  merged = {
    :initial   => d1[:initial] || d2[:initial],
    :accepting => d1[:accepting] || d2[:accepting],
    :error     => d1[:error] || d2[:error],
    :master    => [d1[:master], d2[:master]].min
  }

  # An accepting state cannot be merged with an error one.
  return nil if merged[:accepting] && merged[:error]

  # Union of both delta functions. When both states leave through the same
  # symbol, the two targets are queued for a later determinization merge
  # and the smaller target index is kept.
  merged[:delta] = d1[:delta].merge(d2[:delta]) do |_symbol, left, right|
    determinization << [left, right]
    [left, right].min
  end

  merged
end
|
166
|
+
|
167
|
+
end # module Commons
|
168
|
+
|
169
|
+
end # module Induction
|
170
|
+
end # module Stamina
|
@@ -0,0 +1,264 @@
|
|
1
|
+
module Stamina
|
2
|
+
module Induction
|
3
|
+
|
4
|
+
#
|
5
|
+
# Implementation of the RedBlue variant of the RPNI algorithm (with the blue-fringe
|
6
|
+
# heuristics).
|
7
|
+
#
|
8
|
+
# See Lang, K., B. Pearlmutter, and R. Price. 1998. Results of the Abbadingo One DFA
|
9
|
+
# Learning Competition and a New Evidence-Driven State Merging Algorithm, In Grammatical
|
10
|
+
# Inference, pp. 1–12. Ames, IO: Springer-Verlag.
|
11
|
+
#
|
12
|
+
# Example:
|
13
|
+
# # sample typically comes from an ADL file
|
14
|
+
# sample = Stamina::ADL.parse_sample_file('sample.adl')
|
15
|
+
#
|
16
|
+
# # let RedBlue build the smallest dfa
|
17
|
+
# dfa = Stamina::Induction::RedBlue.execute(sample, {:verbose => true})
|
18
|
+
#
|
19
|
+
# Remarks:
|
20
|
+
# - Constructor and instance methods of this class are public but not intended
|
21
|
+
# to be used directly. They are left public for testing purposes only.
|
22
|
+
# - Having read the Stamina::Induction::RPNI base algorithm may help understanding
|
23
|
+
# this variant.
|
24
|
+
# - This class intensively uses the Stamina::Induction::UnionFind class and
|
25
|
+
# methods defined in the Stamina::Induction::Commons module which are worth
|
26
|
+
# reading to understand the algorithm implementation.
|
27
|
+
#
|
28
|
+
class RedBlue
|
29
|
+
include Stamina::Induction::Commons
|
30
|
+
|
31
|
+
# Union-find data structure used internally
|
32
|
+
attr_reader :ufds
|
33
|
+
|
34
|
+
# Additional options of the algorithm
|
35
|
+
attr_reader :options
|
36
|
+
|
37
|
+
#
|
38
|
+
# Creates an algorithm instance with specific options
|
39
|
+
#
|
40
|
+
def initialize(options={})
  # Keep the caller's hash as is; an explicit nil falls back to an empty
  # hash so that later @options[:verbose] lookups cannot fail on nil.
  @options = options || {}
end
|
43
|
+
|
44
|
+
#
|
45
|
+
# Computes the score of a single (group) merge. Returned value is 1 if both are
|
46
|
+
# accepting states or both are error states and 0 otherwise. Note that d1 and d2
|
47
|
+
# are expected to be merge compatible as this method does not distinguish this
|
48
|
+
# case.
|
49
|
+
#
|
50
|
+
def merge_score(d1, d2)
  # A merge scores 1 when it joins two accepting states...
  return 1 if d1[:accepting] && d2[:accepting]
  # ...or two error states, and 0 in every other case.
  (d1[:error] && d2[:error]) ? 1 : 0
end
|
54
|
+
|
55
|
+
#
|
56
|
+
# Merges a state of rank j with a state of lower rank i. This merge method
|
57
|
+
# includes merging for determinization. It returns nil if the merge is
|
58
|
+
# incompatible, a merge score otherwise.
|
59
|
+
#
|
60
|
+
# Preconditions:
|
61
|
+
# - States denoted by i and j are expected leader states (non merged ones)
|
62
|
+
# - States denoted by i and j are expected to be different
|
63
|
+
#
|
64
|
+
# Postconditions:
|
65
|
+
# - Union find is refined, states i and j having been merged, as well as all
|
66
|
+
# state pairs that need to be merged to ensure the deterministic property
|
67
|
+
# of the quotient automaton.
|
68
|
+
# - If the resulting quotient automaton is consistent with the negative sample,
|
69
|
+
# this method returns the number of accepting pairs + the number of error pairs
|
70
|
+
# that have been merged. The refined union-find correctly encodes the quotient
|
71
|
+
# automaton. Otherwise, the method returns nil and the union-find information
|
72
|
+
# must be considered inaccurate.
|
73
|
+
#
|
74
|
+
def merge_and_determinize(i, j)
  # +determinization+ collects the state pairs that must be merged next to
  # keep the quotient automaton deterministic (filled as a side effect of
  # merge_user_data); +score+ is the score of this single merge.
  determinization, score = [], nil
  @ufds.union(i, j) do |d1, d2|
    new_data = merge_user_data(d1, d2, determinization)
    # Incompatible states (an error state merged with an accepting one):
    # leave the whole method with nil, straight from inside the block.
    return nil unless new_data
    score = merge_score(d1, d2)
    # the union find keeps new_data as the user data of the merged group
    new_data
  end

  # Merge for determinization, on the leader states of each recorded pair.
  # The block parameter is named +index+ so that it does not shadow the
  # method parameter +i+.
  determinization.each do |pair|
    left, right = pair.collect { |index| @ufds.find(index) }
    # nothing to do when both already belong to the same group
    next if left == right
    subscore = merge_and_determinize(left, right)
    # a nested merge joined an error state with an accepting one
    return nil if subscore.nil?
    score += subscore
  end

  score
end
|
108
|
+
|
109
|
+
#
|
110
|
+
# Evaluates the score of merging states i and j. Returns nil if the states
|
111
|
+
# cannot be merged, a positive score otherwise.
|
112
|
+
#
|
113
|
+
# Preconditions:
|
114
|
+
# - States denoted by i and j are expected leader states (non merged ones)
|
115
|
+
# - States denoted by i and j are expected to be different
|
116
|
+
#
|
117
|
+
# Postconditions:
|
118
|
+
# - Returned value is nil if the quotient automaton would be incompatible with
|
119
|
+
# the sample. Otherwise a positive number is returned, encoding the number of
|
120
|
+
# interesting pairs that have been merged (interesting = both accepting or both
|
121
|
+
# error)
|
122
|
+
# - The union find is ALWAYS restored to its previous value after merging has
|
123
|
+
# been evaluated and is then seen unchanged by the caller.
|
124
|
+
#
|
125
|
+
def merge_and_determinize_score(i, j)
  outcome = nil
  @ufds.transactional do
    outcome = merge_and_determinize(i, j)
    # The block always answers false: the merge is only evaluated, the
    # transaction is never meant to be kept.
    false
  end
  outcome
end
|
134
|
+
|
135
|
+
#
|
136
|
+
# Computes the fringe given the current union find. The fringe is returned as an
|
137
|
+
# array of state indices.
|
138
|
+
#
|
139
|
+
# Postconditions:
|
140
|
+
# - Returned array contains indices of leader states only.
|
141
|
+
# - Returned array is disjoint with the kernel.
|
142
|
+
#
|
143
|
+
def fringe
  # Leaders of every state reachable in one transition from a kernel state.
  reached = @kernel.collect do |kernel_state|
    targets = @ufds.mergeable_data(kernel_state)[:delta].values
    targets.collect { |target| @ufds.find(target) }
  end.flatten

  # Kernel states themselves are not part of the fringe.
  (reached - @kernel).sort
end
|
151
|
+
|
152
|
+
#
|
153
|
+
# Main method of the algorithm. Refines the union find passed as first argument
|
154
|
+
# by merging well chosen state pairs. Returns the refined union find.
|
155
|
+
#
|
156
|
+
# Preconditions:
|
157
|
+
# - The union find _ufds_ is correctly initialized (contains :initial, :accepting,
|
158
|
+
# and :error boolean flags as well as a :delta sub hash)
|
159
|
+
#
|
160
|
+
# Postconditions:
|
161
|
+
# - The union find has been refined. It encodes a quotient automaton (of the PTA
|
162
|
+
# it comes from) such that all positive and negative strings of the underlying
|
163
|
+
# sample are correctly classified by it.
|
164
|
+
#
|
165
|
+
def main(ufds)
  puts "Starting RedBlue (#{ufds.size} states)" if @options[:verbose]
  @ufds = ufds
  @kernel = [0]

  # One step per iteration, until the fringe is empty. The fringe is
  # computed once per step.
  the_fringe = fringe
  until the_fringe.empty?
    # fringe state that no kernel state can absorb (if any)
    pending = nil
    # best merge found so far
    best_candidate, best_target, best_score = nil, nil, -1

    the_fringe.each do |candidate|
      pending = candidate

      # try the candidate against every kernel state
      @kernel.each do |target|
        score = merge_and_determinize_score(candidate, target)
        next if score.nil?
        # mergeable with at least one kernel state: no consolidation
        pending = nil
        if score > best_score
          best_candidate, best_target, best_score = candidate, target, score
        end
      end

      # no possible target for this candidate: consolidate it right away
      break unless pending.nil?
    end

    if pending.nil?
      puts "Merging #{best_candidate} and #{best_target} [#{best_score}]" if @options[:verbose]
      # this merge was scored successfully just above, so it should not fail
      raise "Unexpected case" unless merge_and_determinize(best_candidate, best_target)
    else
      puts "Consolidation of #{pending}" if @options[:verbose]
      @kernel << pending
    end

    # A merge may have changed the leader of a kernel state: map the kernel
    # back to leaders and keep it sorted.
    @kernel = @kernel.collect { |k| @ufds.find(k) }.sort

    the_fringe = fringe
  end

  # return the refined union find
  @ufds
end
|
216
|
+
|
217
|
+
#
|
218
|
+
# Build the smallest DFA compatible with the sample given as input.
|
219
|
+
#
|
220
|
+
# Preconditions:
|
221
|
+
# - The sample is consistent (does not contain the same string both labeled as
|
222
|
+
# positive and negative) and contains at least one string.
|
223
|
+
#
|
224
|
+
# Postconditions:
|
225
|
+
# - The returned DFA is the smallest DFA that correctly labels the learning sample
|
226
|
+
# given as input.
|
227
|
+
#
|
228
|
+
# Remarks:
|
229
|
+
# - This instance version of RedBlue.execute is not intended to be used directly and
|
230
|
+
# is mainly provided for testing purposes. Please use the class variant of this
|
231
|
+
# method if possible.
|
232
|
+
#
|
233
|
+
def execute(sample)
  puts "Creating PTA and UnionFind structure" if @options[:verbose]
  # sample -> union find, refined by main, then turned into the quotient
  # automaton
  ufds2dfa(main(sample2ufds(sample)))
end
|
242
|
+
|
243
|
+
#
|
244
|
+
# Build the smallest DFA compatible with the sample given as input.
|
245
|
+
#
|
246
|
+
# Options (the _options_ hash):
|
247
|
+
# - :verbose can be set to true to trace algorithm execution on standard output.
|
248
|
+
#
|
249
|
+
# Preconditions:
|
250
|
+
# - The sample is consistent (does not contain the same string both labeled as
|
251
|
+
# positive and negative) and contains at least one string.
|
252
|
+
#
|
253
|
+
# Postconditions:
|
254
|
+
# - The returned DFA is the smallest DFA that correctly labels the learning sample
|
255
|
+
# given as input.
|
256
|
+
#
|
257
|
+
def self.execute(sample, options={})
  # Convenience entry point: build a configured instance and run it.
  algorithm = RedBlue.new(options)
  algorithm.execute(sample)
end
|
260
|
+
|
261
|
+
end # class RedBlue
|
262
|
+
|
263
|
+
end # module Induction
|
264
|
+
end # module Stamina
|