stamina 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/CHANGELOG.md +22 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +33 -0
- data/LICENCE.md +22 -0
- data/Manifest.txt +16 -0
- data/README.md +78 -0
- data/Rakefile +23 -0
- data/bin/adl2dot +12 -0
- data/bin/classify +12 -0
- data/bin/redblue +12 -0
- data/bin/rpni +12 -0
- data/example/adl/automaton.adl +49 -0
- data/example/adl/sample.adl +53 -0
- data/example/basic/characteristic_sample.adl +32 -0
- data/example/basic/target.adl +9 -0
- data/example/competition/31_test.adl +1500 -0
- data/example/competition/31_training.adl +1759 -0
- data/lib/stamina.rb +19 -0
- data/lib/stamina/adl.rb +298 -0
- data/lib/stamina/automaton.rb +1237 -0
- data/lib/stamina/automaton/walking.rb +336 -0
- data/lib/stamina/classifier.rb +37 -0
- data/lib/stamina/command/adl2dot_command.rb +73 -0
- data/lib/stamina/command/classify_command.rb +57 -0
- data/lib/stamina/command/redblue_command.rb +58 -0
- data/lib/stamina/command/rpni_command.rb +58 -0
- data/lib/stamina/command/stamina_command.rb +79 -0
- data/lib/stamina/errors.rb +20 -0
- data/lib/stamina/induction/commons.rb +170 -0
- data/lib/stamina/induction/redblue.rb +264 -0
- data/lib/stamina/induction/rpni.rb +188 -0
- data/lib/stamina/induction/union_find.rb +377 -0
- data/lib/stamina/input_string.rb +123 -0
- data/lib/stamina/loader.rb +0 -0
- data/lib/stamina/markable.rb +42 -0
- data/lib/stamina/sample.rb +190 -0
- data/lib/stamina/version.rb +14 -0
- data/stamina.gemspec +190 -0
- data/stamina.noespec +35 -0
- data/tasks/debug_mail.rake +78 -0
- data/tasks/debug_mail.txt +13 -0
- data/tasks/gem.rake +68 -0
- data/tasks/spec_test.rake +79 -0
- data/tasks/unit_test.rake +77 -0
- data/tasks/yard.rake +51 -0
- data/test/stamina/adl_test.rb +491 -0
- data/test/stamina/automaton_additional_test.rb +190 -0
- data/test/stamina/automaton_classifier_test.rb +155 -0
- data/test/stamina/automaton_test.rb +1092 -0
- data/test/stamina/automaton_to_dot_test.rb +64 -0
- data/test/stamina/automaton_walking_test.rb +206 -0
- data/test/stamina/exit.rb +3 -0
- data/test/stamina/induction/induction_test.rb +70 -0
- data/test/stamina/induction/redblue_mergesamestatebug_expected.adl +19 -0
- data/test/stamina/induction/redblue_mergesamestatebug_pta.dot +64 -0
- data/test/stamina/induction/redblue_mergesamestatebug_sample.adl +9 -0
- data/test/stamina/induction/redblue_test.rb +83 -0
- data/test/stamina/induction/redblue_universal_expected.adl +4 -0
- data/test/stamina/induction/redblue_universal_sample.adl +5 -0
- data/test/stamina/induction/rpni_inria_expected.adl +7 -0
- data/test/stamina/induction/rpni_inria_sample.adl +9 -0
- data/test/stamina/induction/rpni_test.rb +129 -0
- data/test/stamina/induction/rpni_test_pta.dot +22 -0
- data/test/stamina/induction/rpni_universal_expected.adl +4 -0
- data/test/stamina/induction/rpni_universal_sample.adl +4 -0
- data/test/stamina/induction/union_find_test.rb +124 -0
- data/test/stamina/input_string_test.rb +323 -0
- data/test/stamina/markable_test.rb +70 -0
- data/test/stamina/randdfa.adl +66 -0
- data/test/stamina/sample.adl +4 -0
- data/test/stamina/sample_classify_test.rb +149 -0
- data/test/stamina/sample_test.rb +218 -0
- data/test/stamina/small_dfa.dot +16 -0
- data/test/stamina/small_dfa.gif +0 -0
- data/test/stamina/small_nfa.dot +18 -0
- data/test/stamina/small_nfa.gif +0 -0
- data/test/stamina/stamina_test.rb +69 -0
- data/test/test_all.rb +7 -0
- metadata +279 -0
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'stamina/command/stamina_command'
|
2
|
+
require 'stamina/induction/redblue'
|
3
|
+
module Stamina
|
4
|
+
module Command
|
5
|
+
|
6
|
+
# Implementation of the redblue command line tool
|
7
|
+
class RedBlueCommand < StaminaCommand
|
8
|
+
|
9
|
+
# Creates a redblue command instance, handing its name, usage line and
# help description to the StaminaCommand constructor.
def initialize
  # RedBlue is the blue-fringe variant of RPNI; the help text says so
  # instead of presenting RPNI's acronym expansion as RedBlue's own.
  super("redblue", "[options] sample.adl",
        "Executes RedBlue (RPNI with the blue-fringe heuristics) on an ADL sample and\n"\
        "flushes the induced DFA on the standard output in ADL format as well")
end
|
15
|
+
|
16
|
+
# Extends the option parser built by the superclass with the options
# specific to this command:
# - -v / --verbose sets @verbose to true
# - -o / --output=OUTPUT checks that the target can be written and
#   remembers it in @output_file
def options
  super do |parser|
    parser.on("-v", "--verbose", "Verbose mode") { @verbose = true }
    parser.on("-o", "--output=OUTPUT", "Flush induced DFA in output file") do |path|
      assert_writable_file(path)
      @output_file = path
    end
  end
end
|
29
|
+
|
30
|
+
# Loads the ADL sample found in _file_ and keeps it in @sample.
#
# The file is first checked for readability. A Stamina::ADL::ParseError
# raised while parsing is converted to an ArgumentError naming the file.
def sample_file=(file)
  begin
    assert_readable_file(file)
    puts("Parsing sample and building PTA") if @verbose
    @sample = Stamina::ADL.parse_sample_file(file)
  rescue Stamina::ADL::ParseError
    raise ArgumentError, "#{file} is not a valid ADL sample file"
  end
end
|
38
|
+
|
39
|
+
# Runs the command: parses _argv_ (the single remaining argument is the
# sample file), executes RedBlue on the loaded sample and prints the
# induced DFA in ADL format, either to @output_file when it has been set
# or to STDOUT. In verbose mode the elapsed time is printed afterwards.
def main(argv)
  parse(argv, :sample_file)

  started = Time.now
  dfa = Stamina::Induction::RedBlue.execute(@sample, {:verbose => @verbose})
  finished = Time.now

  if @output_file
    File.open(@output_file, 'w') { |io| Stamina::ADL.print_automaton(dfa, io) }
  else
    Stamina::ADL.print_automaton(dfa, STDOUT)
  end

  puts "Executed in #{finished - started} sec" if @verbose
end
|
54
|
+
|
55
|
+
end # class RedBlueCommand
|
56
|
+
|
57
|
+
end # module Command
|
58
|
+
end # module Stamina
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'stamina/command/stamina_command'
|
2
|
+
require 'stamina/induction/rpni'
|
3
|
+
module Stamina
|
4
|
+
module Command
|
5
|
+
|
6
|
+
# Implementation of the rpni command line tool
|
7
|
+
class RPNICommand < StaminaCommand
|
8
|
+
|
9
|
+
# Creates a rpni command instance, handing its name, usage line and
# help description to the StaminaCommand constructor.
def initialize
  summary = "Executes RPNI (Regular Positive and Negative Inference) on a ADL sample and\n" +
            "flushes the induced DFA on the standard output in ADL format as well"
  super("rpni", "[options] sample.adl", summary)
end
|
15
|
+
|
16
|
+
# Extends the option parser built by the superclass with the options
# specific to this command:
# - -v / --verbose sets @verbose to true
# - -o / --output=OUTPUT checks that the target can be written and
#   remembers it in @output_file
def options
  super do |parser|
    parser.on("-v", "--verbose", "Verbose mode") { @verbose = true }
    parser.on("-o", "--output=OUTPUT", "Flush induced DFA in output file") do |path|
      assert_writable_file(path)
      @output_file = path
    end
  end
end
|
29
|
+
|
30
|
+
# Loads the ADL sample found in _file_ and keeps it in @sample.
#
# The file is first checked for readability. A Stamina::ADL::ParseError
# raised while parsing is converted to an ArgumentError naming the file.
def sample_file=(file)
  begin
    assert_readable_file(file)
    puts("Parsing sample and building PTA") if @verbose
    @sample = Stamina::ADL.parse_sample_file(file)
  rescue Stamina::ADL::ParseError
    raise ArgumentError, "#{file} is not a valid ADL sample file"
  end
end
|
38
|
+
|
39
|
+
# Runs the command: parses _argv_ (the single remaining argument is the
# sample file), executes RPNI on the loaded sample and prints the induced
# DFA in ADL format, either to @output_file when it has been set or to
# STDOUT. In verbose mode the elapsed time is printed afterwards.
def main(argv)
  parse(argv, :sample_file)

  started = Time.now
  dfa = Stamina::Induction::RPNI.execute(@sample, {:verbose => @verbose})
  finished = Time.now

  if @output_file
    File.open(@output_file, 'w') { |io| Stamina::ADL.print_automaton(dfa, io) }
  else
    Stamina::ADL.print_automaton(dfa, STDOUT)
  end

  puts "Executed in #{finished - started} sec" if @verbose
end
|
54
|
+
|
55
|
+
end # class RPNICommand
|
56
|
+
|
57
|
+
end # module Command
|
58
|
+
end # module Stamina
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'stamina'
|
2
|
+
require 'optparse'
|
3
|
+
module Stamina
|
4
|
+
module Command
|
5
|
+
|
6
|
+
# Helper to create stamina commands
|
7
|
+
class StaminaCommand
|
8
|
+
|
9
|
+
# Command name
|
10
|
+
attr_reader :name
|
11
|
+
|
12
|
+
# Command description
|
13
|
+
attr_reader :description
|
14
|
+
|
15
|
+
# Command usage
|
16
|
+
attr_reader :usage
|
17
|
+
|
18
|
+
# Builds a command from its _name_, its _usage_ line and its
# _description_; the three values are exposed through the readers of the
# same names.
def initialize(name, usage, description)
  @name, @usage, @description = name, usage, description
end
|
24
|
+
|
25
|
+
# Builds and returns the OptionParser of the command.
#
# The parser is configured with the command name, the Stamina version and
# a banner made of the usage line followed by the description. When a
# block is given it receives the parser so that extra options can be
# installed before the trailing -h / --help option, which prints the
# parser and exits.
def options(&block)
  OptionParser.new do |parser|
    parser.program_name   = name
    parser.version        = Stamina::VERSION
    parser.release        = nil
    parser.summary_indent = "    "

    # The leading "<blanks># " of each heredoc line is only there for source
    # layout and is stripped before the text becomes the banner.
    raw_banner = <<-EOF
      # usage: #{parser.program_name} #{usage}
      # #{description}
    EOF
    parser.banner = raw_banner.gsub(/[ \t]+# /, "")

    block.call(parser) unless block.nil?

    parser.on_tail("-h", "--help", "Show this message") do
      puts parser
      exit
    end
  end
end
|
44
|
+
|
45
|
+
# Prints the option parser (usage banner and options) on standard output,
# then terminates the process through Kernel.exit unless _and_exit_ is
# false or nil.
def show_usage(and_exit=true)
  puts(options)
  return unless and_exit
  Kernel.exit
end
|
50
|
+
|
51
|
+
# Checks that _file_ exists and can be read.
#
# Raises an ArgumentError when the file is missing or unreadable; returns
# nil otherwise.
def assert_readable_file(file)
  # File.exist? is used because File.exists? is no longer available on
  # recent Ruby versions.
  raise ArgumentError, "File #{file} does not exists" unless File.exist?(file)
  raise ArgumentError, "File #{file} cannot be read" unless File.readable?(file)
end
|
56
|
+
|
57
|
+
# Checks that _file_ can be written: either it does not exist yet or it
# exists and is writable.
#
# Raises an ArgumentError when the file exists but cannot be written;
# returns nil otherwise.
def assert_writable_file(file)
  # File.exist? is used because File.exists? is no longer available on
  # recent Ruby versions.
  if File.exist?(file) && !File.writable?(file)
    raise ArgumentError, "File #{file} cannot be written"
  end
end
|
62
|
+
|
63
|
+
# Parses _argv_ with the command options, then assigns the remaining
# (non option) arguments, in order, through the setters named after
# _variables_ (:sample_file leads to a call to sample_file=).
#
# Usage is shown (with exit) when the number of remaining arguments does
# not match the number of variables. An ArgumentError raised along the way
# is reported on standard output, followed by the usage (with exit).
def parse(argv, *variables)
  begin
    leftovers = options.parse(argv)
    show_usage(true) unless leftovers.size == variables.size
    variables.each_with_index do |variable, position|
      send(:"#{variable}=", leftovers[position])
    end
  rescue ArgumentError => error
    puts error.message
    puts
    show_usage(true)
  end
end
|
75
|
+
|
76
|
+
end # class StaminaCommand
|
77
|
+
|
78
|
+
end # module Command
|
79
|
+
end # module Stamina
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Stamina

  # Root of the stamina exception hierarchy; every error class declared
  # in this module derives from it.
  class StaminaError < StandardError
  end

  # Signals an inconsistent sample, i.e. a same string labeled as being
  # both positive and negative. Raised by sample implementations and by
  # induction algorithms.
  class InconsistencyError < StaminaError
  end

  # Errors specific to the ADL module.
  module ADL

    # Signals that an automaton, string or sample does not respect the
    # ADL format; raised at parsing time.
    class ParseError < StaminaError
    end

  end

end # module Stamina
|
@@ -0,0 +1,170 @@
|
|
1
|
+
module Stamina
|
2
|
+
module Induction
|
3
|
+
|
4
|
+
#
|
5
|
+
# Defines common utilities used by rpni and redblue. About acronyms:
|
6
|
+
# - _pta_ stands for Prefix Tree Acceptor
|
7
|
+
# - _ufds_ stands for Union-Find Data Structure
|
8
|
+
#
|
9
|
+
# Methods pta2ufds, sample2pta and sample2ufds are simply conversion methods used
|
10
|
+
# when the induction algorithm starts (executed on a sample, it first builds a pta
|
11
|
+
# then converts it to a union find). Method ufds2dfa is used when the algorithm ends,
|
12
|
+
# to convert refined union find to a dfa.
|
13
|
+
#
|
14
|
+
# The merge_user_data method is probably the most important as it actually computes
|
15
|
+
# the merging of two states and builds information about merging for determinization.
|
16
|
+
#
|
17
|
+
module Commons
|
18
|
+
|
19
|
+
#
# Builds a UnionFind data structure out of a PTA, with one element per
# PTA state, in state index order. The user data attached to the i-th
# element is a Hash with the following keys:
# - :initial, :accepting and :error, the flags of the i-th state
# - :master, the index i itself
# - :delta, the outgoing transitions as a Hash {symbol => target index}
#
# Any other user data attached to the PTA states is not copied.
#
def pta2ufds(pta)
  Stamina::Induction::UnionFind.new(pta.state_count) do |index|
    state = pta.ith_state(index)
    data = {
      :initial   => state.initial?,
      :accepting => state.accepting?,
      :error     => state.error?,
      :master    => index
    }
    data[:delta] = state.out_edges.inject({}) do |delta, edge|
      delta[edge.symbol] = edge.target.index
      delta
    end
    data
  end
end
|
42
|
+
|
43
|
+
#
# Builds the (augmented) prefix tree acceptor of a sample. Each string is
# followed in the tree as far as possible, missing states are created for
# its remaining symbols, and the state finally reached is flagged as
# accepting (positive string) or error (negative string).
#
# States are then renumbered through a breadth-first walk from the initial
# state, visiting outgoing edges in the order given by <code><=></code> on
# their symbols.
#
# Raises an InconsistencyError if a state ends up flagged both accepting
# and error, which may only happen when _sample_ is some enumerable that
# does not prevent inconsistencies by itself.
#
def sample2pta(sample)
  Automaton.new do |pta|
    root = pta.add_state(:initial => true, :accepting => false)

    # Insert every string of the sample in the tree
    sample.each do |str|
      # walk the existing tree as far as the string allows
      _parsed, current, suffix = pta.dfa_split(str, root)

      # one fresh state per symbol that could not be followed
      suffix.each do |symbol|
        fresh = pta.add_state(:initial => false, :accepting => false, :error => false)
        pta.connect(current, fresh, symbol)
        current = fresh
      end

      # flag the state reached by the whole string
      if str.positive?
        current.accepting!
      else
        current.error!
      end

      # a state cannot be both accepting and error
      if current.error? and current.accepting?
        raise(InconsistencyError, "Inconsistent sample on #{str}", caller)
      end
    end

    # Breadth-first numbering, kept as a temporary mark on each state
    queue, rank = [root], 0
    while (state = queue.shift)
      state[:__index__] = rank
      rank += 1
      ordered = state.out_edges.sort { |a, b| a.symbol <=> b.symbol }
      ordered.each { |edge| queue << edge.target }
    end

    # Let the automaton reorder its states according to the marks, then
    # drop the marks
    pta.order_states { |s0, s1| s0[:__index__] <=> s1[:__index__] }
    pta.states.each { |s| s.remove_mark(:__index__) }
  end
end
|
91
|
+
|
92
|
+
#
# Shortcut chaining sample2pta and pta2ufds: converts a sample to its PTA
# and the PTA to a union find data structure.
#
def sample2ufds(sample)
  pta = sample2pta(sample)
  pta2ufds(pta)
end
|
99
|
+
|
100
|
+
#
# Builds the quotient automaton encoded by a refined UnionFind.
#
# One state is created per mergeable data of the union find. It is named
# after the :master index, receives the remaining user data except :count
# and :delta, and always has its :error flag forced to false. The :delta
# entries are then turned into edges whose targets are resolved through
# <code>ufds.find</code>.
#
def ufds2dfa(ufds)
  Automaton.new(false) do |fa|
    groups = ufds.mergeable_datas

    # first pass: one state per group
    groups.each do |data|
      attributes = data.reject { |key, _| key == :master || key == :count || key == :delta }
      fa.add_state(attributes.merge(:name => data[:master].to_s, :error => false))
    end

    # second pass: edges between the states created above
    groups.each do |data|
      source = fa.get_state(data[:master].to_s)
      data[:delta].each_pair do |symbol, target_index|
        fa.connect(source, fa.get_state(ufds.find(target_index).to_s), symbol)
      end
    end
  end
end
|
125
|
+
|
126
|
+
#
# Merges the user data hashes _d1_ and _d2_ of two states and returns the
# data of the merged state, or nil when the merge would yield a state that
# is both accepting and error.
#
# The resulting hash is computed as follows:
# - :initial, :accepting and :error are the logical or of both flags
# - :master is the smaller of both :master indices
# - :delta is the union of both delta hashes; when a symbol is present on
#   both sides the smaller target index is kept
#
# Every time a symbol is present on both sides, the pair of targets is also
# appended to the _determinization_ array: those states will have to be
# merged to keep the automaton deterministic. The indices in these pairs
# are taken as is from the delta hashes, nothing here resolves them
# through the union find.
#
def merge_user_data(d1, d2, determinization)
  merged = {
    :initial   => d1[:initial] || d2[:initial],
    :accepting => d1[:accepting] || d2[:accepting],
    :error     => d1[:error] || d2[:error],
    :master    => [d1[:master], d2[:master]].min
  }

  # an accepting state may not be merged with an error one
  return nil if merged[:accepting] && merged[:error]

  # union of the transitions, recording collisions for determinization
  delta = d1[:delta].dup
  d2[:delta].each_pair do |symbol, target2|
    if delta.key?(symbol)
      target1 = delta[symbol]
      determinization << [target1, target2]
      delta[symbol] = [target1, target2].min
    else
      delta[symbol] = target2
    end
  end
  merged[:delta] = delta

  merged
end
|
166
|
+
|
167
|
+
end # module Commons
|
168
|
+
|
169
|
+
end # module Induction
|
170
|
+
end # module Stamina
|
@@ -0,0 +1,264 @@
|
|
1
|
+
module Stamina
|
2
|
+
module Induction
|
3
|
+
|
4
|
+
#
|
5
|
+
# Implementation of the RedBlue variant of the RPNI algorithm (with the blue-fringe
|
6
|
+
# heuristics).
|
7
|
+
#
|
8
|
+
# See Lang, K., B. Pearlmutter, and R. Price. 1998. Results of the Abbadingo One DFA
|
9
|
+
# Learning Competition and a New Evidence-Driven State Merging Algorithm, In Grammatical
|
10
|
+
# Inference, pp. 1–12. Ames, IO: Springer-Verlag.
|
11
|
+
#
|
12
|
+
# Example:
|
13
|
+
# # sample typically comes from an ADL file
|
14
|
+
# sample = Stamina::ADL.parse_sample_file('sample.adl')
|
15
|
+
#
|
16
|
+
# # let RedBlue build the smallest dfa
|
17
|
+
# dfa = Stamina::Induction::RedBlue.execute(sample, {:verbose => true})
|
18
|
+
#
|
19
|
+
# Remarks:
|
20
|
+
# - Constructor and instance methods of this class are public but not intended
|
21
|
+
# to be used directly. They are left public for testing purposes only.
|
22
|
+
# - Having read the Stamina::Induction::RPNI base algorithm may help understanding
|
23
|
+
# this variant.
|
24
|
+
# - This class intensively uses the Stamina::Induction::UnionFind class and
|
25
|
+
# methods defined in the Stamina::Induction::Commons module which are worth
|
26
|
+
# reading to understand the algorithm implementation.
|
27
|
+
#
|
28
|
+
class RedBlue
|
29
|
+
include Stamina::Induction::Commons
|
30
|
+
|
31
|
+
# Union-find data structure used internally
|
32
|
+
attr_reader :ufds
|
33
|
+
|
34
|
+
# Additional options of the algorithm
|
35
|
+
attr_reader :options
|
36
|
+
|
37
|
+
#
# Creates an algorithm instance. The _options_ hash is stored as is and
# exposed through the options reader; :verbose is the option read by this
# class.
#
def initialize(options = {})
  @options = options
end
|
43
|
+
|
44
|
+
#
# Scores the merge of two states from their user data: 1 when both are
# accepting or both are error, 0 otherwise. Compatibility of the two
# states is not checked here.
#
def merge_score(d1, d2)
  both_accepting = d1[:accepting] && d2[:accepting]
  both_error     = d1[:error] && d2[:error]
  if both_accepting || both_error
    1
  else
    0
  end
end
|
54
|
+
|
55
|
+
#
# Merges the states of indices _i_ and _j_ in the union find, then
# recursively merges every pair of states that must be merged to keep the
# quotient automaton deterministic.
#
# Preconditions:
# - i and j are expected to denote two different leader (non merged)
#   states
#
# Returns nil as soon as a merge would put an accepting and an error state
# together; the union find must then be considered inaccurate. Otherwise
# returns the sum of the merge scores (see merge_score) of all merges that
# have been performed.
#
def merge_and_determinize(i, j)
  pending, score = [], nil

  # The union block computes the data of the merged group. merge_user_data
  # also fills _pending_ with the target pairs to merge afterwards.
  @ufds.union(i, j) do |d1, d2|
    merged = merge_user_data(d1, d2, pending)
    # incompatible states: leave the whole method with nil
    return nil unless merged
    score = merge_score(d1, d2)
    merged
  end

  # Merging for determinization
  pending.each do |(first, second)|
    # pairs hold raw indices, merge their current leaders
    leader1 = @ufds.find(first)
    leader2 = @ufds.find(second)
    next if leader1 == leader2

    subscore = merge_and_determinize(leader1, leader2)
    return nil if subscore.nil?
    score += subscore
  end

  score
end
|
108
|
+
|
109
|
+
#
# Evaluates what merging states _i_ and _j_ would score, without keeping
# the merge: returns the result of merge_and_determinize, that is nil for
# an incompatible merge and the merge score otherwise.
#
# Preconditions:
# - i and j are expected to denote two different leader (non merged)
#   states
#
# The merge is attempted inside <code>@ufds.transactional</code> with a
# block that always evaluates to false, which is meant to always roll the
# union find back to its previous state.
#
def merge_and_determinize_score(i, j)
  outcome = nil
  @ufds.transactional do
    outcome = merge_and_determinize(i, j)
    # never keep the changes
    false
  end
  outcome
end
|
134
|
+
|
135
|
+
#
# Computes the current fringe: the states reached in one transition from
# a kernel state that are not kernel states themselves.
#
# Targets are resolved through <code>@ufds.find</code>, kernel indices are
# removed and the result is returned as a sorted array of state indices.
#
def fringe
  reached = @kernel.inject([]) do |targets, kernel_state|
    delta = @ufds.mergeable_data(kernel_state)[:delta]
    targets.concat(delta.values.collect { |target| @ufds.find(target) })
  end
  (reached - @kernel).sort
end
|
151
|
+
|
152
|
+
#
# Main loop of the algorithm: refines the union find given as argument by
# merging state pairs, and returns it.
#
# Preconditions:
# - _ufds_ carries the :initial, :accepting and :error flags as well as
#   the :delta hash of each state (see Commons#pta2ufds)
#
# The kernel starts with state 0. At each step the fringe is computed and
# its states are tried in order against every kernel state:
# - if a fringe state cannot be merged with any kernel state, it is
#   consolidated (added to the kernel) right away;
# - otherwise the best scoring merge found over the whole fringe is
#   performed.
# The loop ends when the fringe is empty.
#
def main(ufds)
  verbose = @options[:verbose]
  puts "Starting RedBlue (#{ufds.size} states)" if verbose
  @ufds = ufds
  @kernel = [0]

  # the fringe is computed once per step
  until (blue_states = fringe).empty?
    unmergeable = nil
    best_candidate, best_target, best_score = nil, nil, -1

    blue_states.each do |candidate|
      # assume the candidate cannot be merged until a score is found
      unmergeable = candidate

      @kernel.each do |target|
        score = merge_and_determinize_score(candidate, target)
        next if score.nil?

        unmergeable = nil
        if score > best_score
          best_candidate, best_target, best_score = candidate, target, score
        end
      end

      # no kernel state accepts this candidate: stop and consolidate it
      break unless unmergeable.nil?
    end

    if unmergeable.nil?
      puts "Merging #{best_candidate} and #{best_target} [#{best_score}]" if verbose
      # the same merge has just been scored, it is not expected to fail
      raise "Unexpected case" unless merge_and_determinize(best_candidate, best_target)
    else
      puts "Consolidation of #{unmergeable}" if verbose
      @kernel << unmergeable
    end

    # a merge may have changed the leader of kernel states: resolve them
    # again and keep the kernel sorted
    @kernel = @kernel.collect { |k| @ufds.find(k) }.sort
  end

  @ufds
end
|
216
|
+
|
217
|
+
#
# Induces a DFA from the sample given as input: the sample is converted
# to a union find (through its PTA), the union find is refined by main and
# the quotient automaton is computed and returned.
#
# Preconditions:
# - The sample is expected to be consistent (no string labeled both
#   positive and negative) and to contain at least one string.
#
# Remarks:
# - This instance method is mainly provided for testing purposes; the
#   class method RedBlue.execute is the intended entry point.
#
def execute(sample)
  puts "Creating PTA and UnionFind structure" if @options[:verbose]
  ufds2dfa(main(sample2ufds(sample)))
end
|
242
|
+
|
243
|
+
#
# Induces a DFA from the sample given as input, using a fresh RedBlue
# instance created with _options_.
#
# Options (the _options_ hash):
# - :verbose can be set to true to trace algorithm execution on standard
#   output.
#
# Preconditions:
# - The sample is expected to be consistent (no string labeled both
#   positive and negative) and to contain at least one string.
#
def self.execute(sample, options = {})
  algorithm = RedBlue.new(options)
  algorithm.execute(sample)
end
|
260
|
+
|
261
|
+
end # class RedBlue
|
262
|
+
|
263
|
+
end # module Induction
|
264
|
+
end # module Stamina
|