stamina-induction 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.md +78 -0
- data/LICENCE.md +22 -0
- data/lib/stamina-induction/stamina-induction.rb +1 -0
- data/lib/stamina-induction/stamina/abbadingo.rb +2 -0
- data/lib/stamina-induction/stamina/abbadingo/random_dfa.rb +55 -0
- data/lib/stamina-induction/stamina/abbadingo/random_sample.rb +146 -0
- data/lib/stamina-induction/stamina/classifier.rb +55 -0
- data/lib/stamina-induction/stamina/command.rb +6 -0
- data/lib/stamina-induction/stamina/command/abbadingo_dfa.rb +80 -0
- data/lib/stamina-induction/stamina/command/abbadingo_samples.rb +39 -0
- data/lib/stamina-induction/stamina/command/classify.rb +47 -0
- data/lib/stamina-induction/stamina/command/infer.rb +140 -0
- data/lib/stamina-induction/stamina/command/metrics.rb +50 -0
- data/lib/stamina-induction/stamina/command/score.rb +34 -0
- data/lib/stamina-induction/stamina/dsl.rb +2 -0
- data/lib/stamina-induction/stamina/dsl/induction.rb +29 -0
- data/lib/stamina-induction/stamina/dsl/reg_lang.rb +69 -0
- data/lib/stamina-induction/stamina/induction.rb +13 -0
- data/lib/stamina-induction/stamina/induction/blue_fringe.rb +265 -0
- data/lib/stamina-induction/stamina/induction/commons.rb +156 -0
- data/lib/stamina-induction/stamina/induction/rpni.rb +186 -0
- data/lib/stamina-induction/stamina/induction/union_find.rb +377 -0
- data/lib/stamina-induction/stamina/input_string.rb +123 -0
- data/lib/stamina-induction/stamina/reg_lang.rb +226 -0
- data/lib/stamina-induction/stamina/reg_lang/canonical_info.rb +181 -0
- data/lib/stamina-induction/stamina/reg_lang/parser.rb +10 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/alternative.rb +19 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/node.rb +22 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/parenthesized.rb +12 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/parser.citrus +49 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/plus.rb +14 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/question.rb +17 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/regexp.rb +12 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/sequence.rb +15 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/star.rb +15 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/symbol.rb +14 -0
- data/lib/stamina-induction/stamina/sample.rb +309 -0
- data/lib/stamina-induction/stamina/scoring.rb +213 -0
- metadata +106 -0
@@ -0,0 +1,39 @@
|
|
1
|
+
module Stamina
|
2
|
+
class Command
|
3
|
+
#
|
4
|
+
# Generates samples following Abbadingo's protocol
|
5
|
+
#
|
6
|
+
# SYNOPSIS
|
7
|
+
# #{program_name} #{command_name} target.adl
|
8
|
+
#
|
9
|
+
# OPTIONS
|
10
|
+
# #{summarized_options}
|
11
|
+
#
|
12
|
+
class AbbadingoSamples < Quickl::Command(__FILE__, __LINE__)
|
13
|
+
|
14
|
+
# Install options
|
15
|
+
options do |opt|
|
16
|
+
|
17
|
+
end # options
|
18
|
+
|
19
|
+
# Command execution
|
20
|
+
# Generates a training and a test sample for the automaton stored in the
# ADL file named by the single argument, and writes them next to that file
# as `<basename>-training.adl` and `<basename>-test.adl`.
#
# Raises Quickl::Help unless exactly one argument is given.
def execute(args)
  raise Quickl::Help if args.size != 1

  # Where the target lives, and how the generated files will be named
  adl_path = args.first
  stem     = File.basename(adl_path, '.adl')
  out_dir  = File.dirname(adl_path)
  automaton = Stamina::ADL::parse_automaton_file(adl_path)

  # The Abbadingo generator is only loaded when this command actually runs
  require 'stamina/abbadingo'
  training_set, test_set = Stamina::Abbadingo::RandomSample.execute(automaton)

  # Write both samples beside the target automaton file
  training_path = File.join(out_dir, "#{stem}-training.adl")
  Stamina::ADL::print_sample_in_file(training_set, training_path)
  test_path = File.join(out_dir, "#{stem}-test.adl")
  Stamina::ADL::print_sample_in_file(test_set, test_path)
end
|
36
|
+
|
37
|
+
end # class AbbadingoSamples
|
38
|
+
end # class Command
|
39
|
+
end # module Stamina
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Stamina
|
2
|
+
class Command
|
3
|
+
#
|
4
|
+
# Classifies a sample thanks to an automaton
|
5
|
+
#
|
6
|
+
# SYNOPSIS
|
7
|
+
# #{program_name} #{command_name} sample.adl automaton.adl
|
8
|
+
#
|
9
|
+
# OPTIONS
|
10
|
+
# #{summarized_options}
|
11
|
+
#
|
12
|
+
class Classify < Quickl::Command(__FILE__, __LINE__)
|
13
|
+
include Robustness
|
14
|
+
|
15
|
+
# Where to flush the output
|
16
|
+
attr_accessor :output_file
|
17
|
+
|
18
|
+
# Install options
|
19
|
+
options do |opt|
|
20
|
+
|
21
|
+
@output_file = nil
|
22
|
+
opt.on("-o", "--output=OUTPUT",
|
23
|
+
"Flush classification signature in output file") do |value|
|
24
|
+
assert_writable_file(value)
|
25
|
+
@output_file = value
|
26
|
+
end
|
27
|
+
|
28
|
+
end # options
|
29
|
+
|
30
|
+
# Command execution
|
31
|
+
# Labels the strings of a sample with an automaton and emits the resulting
# signature, either in the file set through --output or on standard output.
#
# `args` must hold exactly two paths: the sample file, then the automaton
# file; Quickl::Help is raised otherwise.
def execute(args)
  raise Quickl::Help if args.size != 2
  sample    = Stamina::ADL::parse_sample_file(assert_readable_file(args.first))
  automaton = Stamina::ADL::parse_automaton_file(assert_readable_file(args.last))

  destination = output_file
  if destination
    # The signature is computed once the output file has been opened
    File.open(destination, 'w') do |io|
      io << automaton.signature(sample)
    end
  else
    $stdout << automaton.signature(sample)
  end
end
|
44
|
+
|
45
|
+
end # class Classify
|
46
|
+
end # class Command
|
47
|
+
end # module Stamina
|
@@ -0,0 +1,140 @@
|
|
1
|
+
module Stamina
|
2
|
+
class Command
|
3
|
+
#
|
4
|
+
# Grammar inference, induces a DFA from a training sample using a
|
5
|
+
# chosen algorithm.
|
6
|
+
#
|
7
|
+
# SYNOPSIS
|
8
|
+
# #{program_name} #{command_name} sample.adl
|
9
|
+
#
|
10
|
+
# OPTIONS
|
11
|
+
# #{summarized_options}
|
12
|
+
#
|
13
|
+
class Infer < Quickl::Command(__FILE__, __LINE__)
|
14
|
+
include Robustness
|
15
|
+
|
16
|
+
attr_accessor :algorithm
|
17
|
+
attr_accessor :take
|
18
|
+
attr_accessor :score
|
19
|
+
attr_accessor :verbose
|
20
|
+
attr_accessor :drop
|
21
|
+
attr_accessor :output_file
|
22
|
+
|
23
|
+
# Install options
|
24
|
+
options do |opt|
|
25
|
+
|
26
|
+
@algorithm = :rpni
|
27
|
+
opt.on("--algorithm=X", "Sets the induction algorithm to use (rpni, bluefringe)") do |x|
|
28
|
+
@algorithm = x.to_sym
|
29
|
+
end
|
30
|
+
|
31
|
+
@take = 1.0
|
32
|
+
opt.on("--take=X", Float, "Take only X% of available strings") do |x|
|
33
|
+
@take = x.to_f
|
34
|
+
unless @take > 0.0 and @take <= 1.0
|
35
|
+
raise Quickl::InvalidOption, "Invalid --take option: #{@take}"
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
@score = nil
|
40
|
+
opt.on("--score=test.adl", "Add scoring information to metadata, using test.adl file") do |x|
|
41
|
+
@score = assert_readable_file(x)
|
42
|
+
end
|
43
|
+
|
44
|
+
@verbose = true
|
45
|
+
opt.on("-v", "--[no-]verbose", "Verbose mode") do |x|
|
46
|
+
@verbose = x
|
47
|
+
end
|
48
|
+
|
49
|
+
@drop = false
|
50
|
+
opt.on("-d", "--drop", "Drop result") do |x|
|
51
|
+
@drop = x
|
52
|
+
end
|
53
|
+
|
54
|
+
@output_file = nil
|
55
|
+
opt.on("-o", "--output=OUTPUT",
|
56
|
+
"Flush induced DFA in output file") do |value|
|
57
|
+
@output_file = assert_writable_file(value)
|
58
|
+
end
|
59
|
+
|
60
|
+
end # options
|
61
|
+
|
62
|
+
# Runs the induction algorithm selected by the `algorithm` accessor on
# `sample` and measures how long it takes.
#
# Returns a two-element array `[dfa, tms]`: the value returned by the
# algorithm's `execute` and the result of `Benchmark.measure` around it.
#
# Raises Quickl::InvalidOption when `algorithm` is neither :rpni nor
# :bluefringe.
def launch_induction(sample)
  require 'benchmark'

  algo_clazz = case algorithm
    when :rpni
      Stamina::Induction::RPNI
    when :bluefringe
      Stamina::Induction::BlueFringe
    else
      # The message must interpolate the accessor: there is no `algo` local
      # here, so referring to it raised NameError instead of this error.
      raise Quickl::InvalidOption, "Unknown induction algorithm: #{algorithm}"
  end

  dfa = nil
  tms = Benchmark.measure do
    dfa = algo_clazz.execute(sample, {:verbose => verbose})
  end
  [dfa, tms]
end
|
80
|
+
|
81
|
+
# Parses the sample stored in `file`. When the --take ratio differs from
# 1.0, a new sample is built in which each positive and each negative
# string is kept only when a fresh `Kernel.rand` draw is below that ratio;
# otherwise the parsed sample is returned as is.
def load_sample(file)
  parsed = Stamina::ADL.parse_sample_file(file)
  return parsed unless @take != 1.0

  subset = Stamina::Sample.new
  # Positive strings are visited first, then negative ones
  [:each_positive, :each_negative].each do |walker|
    parsed.public_send(walker) do |str|
      subset << str if Kernel.rand < @take
    end
  end
  subset
end
|
91
|
+
|
92
|
+
# Command execution
|
93
|
+
# Induces a DFA from the sample file named by the single argument, prints
# it (unless --drop was given) in the --output file or on standard output,
# then prints a hash of meta information (algorithm, sample figures,
# timings and, when --score was given, scoring figures).
#
# Raises Quickl::Help unless exactly one argument is given.
def execute(args)
  raise Quickl::Help if args.size != 1

  # Load the training sample
  $stderr << "Parsing sample...\n" if verbose
  sample = load_sample(assert_readable_file(args.first))

  # Learn the automaton, keeping the timing figures
  dfa, tms = launch_induction(sample)

  # Print the automaton
  if drop
    # --drop: the induced automaton is not printed at all
  elsif output_file
    File.open(output_file, 'w') do |io|
      Stamina::ADL.print_automaton(dfa, io)
    end
  else
    Stamina::ADL.print_automaton(dfa, $stdout)
  end

  # Collect meta information, in display order
  meta = {}
  meta[:algorithm]      = algorithm
  meta[:sample]         = File.basename(args.first)
  meta[:take]           = take
  meta[:sample_size]    = sample.size
  meta[:positive_count] = sample.positive_count
  meta[:negative_count] = sample.negative_count
  meta[:real_time]      = tms.real
  meta[:total_time]     = tms.total
  meta[:user_time]      = tms.utime + tms.cutime
  meta[:system_time]    = tms.stime + tms.cstime

  # Optionally score the learned automaton against a test sample
  test_file = score
  if test_file
    test_sample = Stamina::ADL::parse_sample_file(test_file)
    predicted = dfa.signature(test_sample)
    expected  = test_sample.signature
    meta.merge!(Scoring.scoring(predicted, expected).to_h)
  end

  # Show the collected information
  puts meta.inspect
end
|
137
|
+
|
138
|
+
end # class Infer
|
139
|
+
end # class Command
|
140
|
+
end # module Stamina
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Stamina
|
2
|
+
class Command
|
3
|
+
#
|
4
|
+
# Prints metrics about an automaton or sample
|
5
|
+
#
|
6
|
+
# SYNOPSIS
|
7
|
+
# #{program_name} #{command_name} [file.adl]
|
8
|
+
#
|
9
|
+
# OPTIONS
|
10
|
+
# #{summarized_options}
|
11
|
+
#
|
12
|
+
class Metrics < Quickl::Command(__FILE__, __LINE__)
|
13
|
+
include Robustness
|
14
|
+
|
15
|
+
# Install options
|
16
|
+
options do |opt|
|
17
|
+
|
18
|
+
end # options
|
19
|
+
|
20
|
+
# Command execution
|
21
|
+
# Prints metrics about an automaton or a sample given as ADL text.
#
# The text is read from the file named by the single argument when there is
# one, from standard input otherwise. It is first parsed as an automaton;
# when that raises ADL::ParseError it is parsed as a sample instead.
#
# Raises Quickl::Help when more than one argument is given.
def execute(args)
  raise Quickl::Help unless args.size <= 1

  # Reads the ADL text
  input = if args.size == 1
    File.read assert_readable_file(args.first)
  else
    # Read standard input verbatim: `readlines` keeps line terminators, so
    # joining its lines with "\n" would double every newline.
    $stdin.read
  end

  # Flush metrics
  begin
    target = Stamina::ADL::parse_automaton(input)
    puts "Alphabet size: #{target.alphabet_size}"
    puts "State count: #{target.state_count}"
    puts "Edge count: #{target.edge_count}"
    puts "Degree (avg): #{target.avg_degree}"
    puts "Accepting ratio: #{target.accepting_ratio}"
    puts "Depth: #{target.depth}"
  rescue ADL::ParseError
    # Not an automaton: fall back on sample metrics
    sample = Stamina::ADL::parse_sample(input)
    puts "Size: #{sample.size}"
    puts "Positive: #{sample.positive_count} (#{sample.positive_count.to_f / sample.size})"
    puts "Negative: #{sample.negative_count} (#{sample.negative_count.to_f / sample.size})"
  end
end
|
47
|
+
|
48
|
+
end # class Metrics
|
49
|
+
end # class Command
|
50
|
+
end # module Stamina
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Stamina
|
2
|
+
class Command
|
3
|
+
#
|
4
|
+
# Scores the labelling of a sample by an automaton
|
5
|
+
#
|
6
|
+
# SYNOPSIS
|
7
|
+
# #{program_name} #{command_name} sample.adl automaton.adl
|
8
|
+
#
|
9
|
+
# OPTIONS
|
10
|
+
# #{summarized_options}
|
11
|
+
#
|
12
|
+
class Score < Quickl::Command(__FILE__, __LINE__)
|
13
|
+
include Robustness
|
14
|
+
|
15
|
+
# Install options
|
16
|
+
options do |opt|
|
17
|
+
|
18
|
+
end # options
|
19
|
+
|
20
|
+
# Command execution
|
21
|
+
# Scores how an automaton labels a sample: the signature computed by the
# automaton is compared with the sample's own signature and the resulting
# scoring is printed on standard output.
#
# `args` must hold exactly two paths: the sample file, then the automaton
# file; Quickl::Help is raised otherwise.
def execute(args)
  raise Quickl::Help if args.size != 2
  sample    = Stamina::ADL::parse_sample_file(assert_readable_file(args.first))
  automaton = Stamina::ADL::parse_automaton_file(assert_readable_file(args.last))

  # Labelling by the automaton first, reference labelling second
  predicted = automaton.signature(sample)
  expected  = sample.signature
  puts Scoring.scoring(predicted, expected).to_s
end
|
31
|
+
|
32
|
+
end # class Score
|
33
|
+
end # class Command
|
34
|
+
end # module Stamina
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Stamina
|
2
|
+
module Dsl
|
3
|
+
module Induction

  #
  # Coerces `arg` to a Sample
  #
  def sample(arg)
    Sample.coerce(arg)
  end

  #
  # Learn a regular language from `arg` using the RPNI algorithm. `arg` is
  # first coerced to a Sample; the induced automaton is passed to `regular`.
  #
  def rpni(arg)
    regular Stamina::Induction::RPNI.execute(sample(arg))
  end

  #
  # Learn a regular language from `arg` using the BlueFringe algorithm. `arg`
  # is first coerced to a Sample; the induced automaton is passed to `regular`.
  #
  def blue_fringe(arg)
    regular Stamina::Induction::BlueFringe.execute(sample(arg))
  end

end # module Induction
|
27
|
+
include Induction
|
28
|
+
end # module Dsl
|
29
|
+
end # module Stamina
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module Stamina
|
2
|
+
module Dsl
|
3
|
+
module RegLang

  # Shortcut for the empty regular language
  EMPTY_LANG = ::Stamina::RegLang::EMPTY

  #
  # Coerces `arg` to a regular language.
  #
  def regular(arg)
    Stamina::RegLang.coerce(arg)
  end

  #
  # Returns the universal language on a given alphabet.
  #
  def sigma_star(alphabet)
    Stamina::RegLang.sigma_star(alphabet)
  end

  #
  # Coerces `arg` to a prefix-closed regular language.
  #
  def prefix_closed(arg)
    regular(arg).prefix_closed
  end

  #
  # Extracts the short prefixes of a regular language (coerced from `arg`)
  # as a Sample instance.
  #
  def short_prefixes(arg)
    regular(arg).short_prefixes
  end

  #
  # Extracts the kernel of a regular language (coerced from `arg`) as
  # a Sample instance.
  #
  def kernel(arg)
    regular(arg).kernel
  end

  #
  # Extracts a characteristic sample for a regular language (coerced from
  # `arg`) as a Sample instance.
  #
  def characteristic_sample(arg)
    regular(arg).characteristic_sample
  end

  #
  # Hides all but the `alph` symbols in the regular language `arg`, i.e. the
  # counterpart of `hide`, which hides the `alph` symbols themselves.
  #
  def project(arg, alph)
    regular(arg).project(alph)
  end

  #
  # Hides `alph` symbols in the regular language `arg`
  #
  def hide(arg, alph)
    regular(arg).hide(alph)
  end

end # module RegLang
|
67
|
+
include RegLang
|
68
|
+
end # module Dsl
|
69
|
+
end # module Stamina
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require_relative 'sample'
|
2
|
+
require_relative 'input_string'
|
3
|
+
require_relative 'classifier'
|
4
|
+
require_relative 'scoring'
|
5
|
+
require_relative 'induction/union_find'
|
6
|
+
require_relative 'induction/commons'
|
7
|
+
require_relative 'induction/rpni'
|
8
|
+
require_relative 'induction/blue_fringe'
|
9
|
+
require_relative 'abbadingo'
|
10
|
+
require_relative 'dsl/induction'
|
11
|
+
require_relative 'reg_lang'
|
12
|
+
require_relative 'dsl/reg_lang'
|
13
|
+
require_relative 'command'
|
@@ -0,0 +1,265 @@
|
|
1
|
+
module Stamina
|
2
|
+
module Induction
|
3
|
+
|
4
|
+
#
|
5
|
+
# Implementation of the BlueFringe variant of the RPNI algorithm (with the blue-fringe
|
6
|
+
# heuristics).
|
7
|
+
#
|
8
|
+
# See Lang, K., B. Pearlmutter, and R. Price. 1998. Results of the Abbadingo One DFA
|
9
|
+
# Learning Competition and a New Evidence-Driven State Merging Algorithm, In Grammatical
|
10
|
+
# Inference, pp. 1–12. Ames, IO: Springer-Verlag.
|
11
|
+
#
|
12
|
+
# Example:
|
13
|
+
# # sample typically comes from an ADL file
|
14
|
+
# sample = Stamina::ADL.parse_sample_file('sample.adl')
|
15
|
+
#
|
16
|
+
# # let BlueFringe build the smallest dfa
|
17
|
+
# dfa = Stamina::Induction::BlueFringe.execute(sample, {:verbose => true})
|
18
|
+
#
|
19
|
+
# Remarks:
|
20
|
+
# - Constructor and instance methods of this class are public but not intended
|
21
|
+
# to be used directly. They are left public for testing purposes only.
|
22
|
+
# - Having read the Stamina::Induction::RPNI base algorithm may help understanding
|
23
|
+
# this variant.
|
24
|
+
# - This class intensively uses the Stamina::Induction::UnionFind class and
|
25
|
+
# methods defined in the Stamina::Induction::Commons module which are worth
|
26
|
+
# reading to understand the algorithm implementation.
|
27
|
+
#
|
28
|
+
class BlueFringe
|
29
|
+
include Stamina::Induction::Commons
|
30
|
+
|
31
|
+
# Union-find data structure used internally
|
32
|
+
attr_reader :ufds
|
33
|
+
|
34
|
+
# Creates an algorithm instance with given options.
|
35
|
+
# Builds an algorithm instance. `options` must be a Hash; it is merged over
# DEFAULT_OPTIONS. The merge score cache starts empty.
#
# Raises ArgumentError when `options` is not a Hash.
def initialize(options={})
  unless options.kind_of?(Hash)
    raise ArgumentError, "Invalid options #{options.inspect}"
  end
  @options     = DEFAULT_OPTIONS.merge(options)
  @score_cache = Hash.new
end
|
40
|
+
|
41
|
+
#
|
42
|
+
# Computes the score of a single (group) merge. Returned value is 1 if both are
|
43
|
+
# accepting states or both are error states and 0 otherwise. Note that d1 and d2
|
44
|
+
# are expected to be merge compatible as this method does not distinguish this
|
45
|
+
# case.
|
46
|
+
#
|
47
|
+
# Scores the merge of two state groups described by their user data `d1`
# and `d2`: 1 when both are flagged :accepting or both are flagged :error,
# 0 in every other case.
def merge_score(d1, d2)
  # Both accepting?
  return 1 if d1[:accepting] && d2[:accepting]
  # Both error?
  return 1 if d1[:error] && d2[:error]
  0
end
|
51
|
+
|
52
|
+
#
|
53
|
+
# Merges a state of rank j with a state of lower rank i. This merge method
|
54
|
+
# includes merging for determinization. It returns nil if the merge is
|
55
|
+
# incompatible, a merge score otherwise.
|
56
|
+
#
|
57
|
+
# Preconditions:
|
58
|
+
# - States denoted by i and j are expected leader states (non merged ones)
|
59
|
+
# - States denoted by i and j are expected to be different
|
60
|
+
#
|
61
|
+
# Postconditions:
|
62
|
+
# - Union find is refined, states i and j having been merged, as well as all
|
63
|
+
# state pairs that need to be merged to ensure the deterministic property
|
64
|
+
# of the quotient automaton.
|
65
|
+
# - If the resulting quotient automaton is consistent with the negative sample,
|
66
|
+
# this method returns the number of accepting pairs + the number of error pairs
|
67
|
+
# that have been merged. The refined union-find correctly encodes the quotient
|
68
|
+
# automaton. Otherwise, the method returns nil and the union-find information
|
69
|
+
# must be considered inaccurate.
|
70
|
+
#
|
71
|
+
# Merges state j into state i in the union-find, then recursively merges
# every state pair that `merge_user_data` reported as needing a merge for
# determinization.
#
# Returns nil as soon as one of those merges is refused by
# `merge_user_data` (the union-find is then left partially merged);
# otherwise returns the sum of `merge_score` over all merges performed.
def merge_and_determinize(i, j)
  # `determinization` collects, as a side effect of merge_user_data, the
  # state pairs that must be merged next; `score` is set by the union block.
  determinization, score = [], nil
  @ufds.union(i, j) do |d1, d2|
    # States are incompatible when no merged data can be built for them;
    # this `return` leaves the whole method with nil.
    return nil unless (new_data = merge_user_data(d1, d2, determinization))
    # otherwise, score this single merge
    score = merge_score(d1, d2)
    # and let the union find keep the merged data for the group
    new_data
  end

  # Merging for determinization
  determinization.each do |pair|
    # Work on the leader states of the pair. The block parameter is named
    # `state` so that it does not shadow the method parameter `i`.
    left, right = pair.collect{|state| @ufds.find(state)}
    # nothing to do when both already belong to the same group
    next if left == right
    # otherwise recurse and keep the subscore
    subscore = merge_and_determinize(left, right)
    # a refused determinization merge makes the whole merge fail
    return nil if subscore.nil?
    score += subscore
  end

  score
end
|
105
|
+
|
106
|
+
#
|
107
|
+
# Evaluates the score of merging states i and j. Returns nil if the states
|
108
|
+
# cannot be merged, a non-negative score otherwise.
|
109
|
+
#
|
110
|
+
# Preconditions:
|
111
|
+
# - States denoted by i and j are expected leader states (non merged ones)
|
112
|
+
# - States denoted by i and j are expected to be different
|
113
|
+
#
|
114
|
+
# Postconditions:
|
115
|
+
# - Returned value is nil if the quotient automaton would be incompatible with
|
116
|
+
# the sample. Otherwise a positive number is returned, encoding the number of
|
117
|
+
# interesting pairs that have been merged (interesting = both accepting or both
|
118
|
+
# error)
|
119
|
+
# - The union find is ALWAYS restored to its previous value after merging has
|
120
|
+
# been evaluated and is then seen unchanged by the caller.
|
121
|
+
#
|
122
|
+
# Returns the score `merge_and_determinize(i, j)` would give, or nil when
# that merge is refused. The merge is attempted inside
# `@ufds.transactional` with a block that evaluates to false. Results are
# memoized in @score_cache under the key [i, j], a refused merge being
# stored as -1.
def merge_and_determinize_score(i, j)
  key = [i, j]
  cached = @score_cache[key]
  unless cached
    outcome = nil
    @ufds.transactional do
      outcome = merge_and_determinize(i, j)
      # the block always answers false to the transaction
      false
    end
    cached = @score_cache[key] = (outcome || -1)
  end
  cached == -1 ? nil : cached
end
|
134
|
+
|
135
|
+
#
|
136
|
+
# Computes the fringe given the current union find. The fringe is returned as an
|
137
|
+
# array of state indices.
|
138
|
+
#
|
139
|
+
# Postconditions:
|
140
|
+
# - Returned array contains indices of leader states only.
|
141
|
+
# - Returned array is disjoint with the kernel.
|
142
|
+
#
|
143
|
+
# Computes the current fringe: for every kernel state, the leaders (as
# given by `@ufds.find`) of the targets found in its :delta data, minus
# the kernel states themselves. The result is a sorted array; a leader
# reached several times appears several times.
def fringe
  reached = []
  @kernel.each do |kernel_state|
    outgoing = @ufds.mergeable_data(kernel_state)[:delta]
    outgoing.each_pair do |_symbol, target|
      reached.push(@ufds.find(target))
    end
  end
  outside_kernel = reached - @kernel
  outside_kernel.sort
end
|
151
|
+
|
152
|
+
#
|
153
|
+
# Main method of the algorithm. Refines the union find passed as first argument
|
154
|
+
# by merging well chosen state pairs. Returns the refined union find.
|
155
|
+
#
|
156
|
+
# Preconditions:
|
157
|
+
# - The union find _ufds_ is correctly initialized (contains :initial, :accepting,
|
158
|
+
# and :error boolean flags as well as a :delta sub hash)
|
159
|
+
#
|
160
|
+
# Postconditions:
|
161
|
+
# - The union find has been refined. It encodes a quotient automaton (of the PTA
|
162
|
+
# it comes from) such that all positive and negative strings of the underlying
|
163
|
+
# sample are correctly classified by it.
|
164
|
+
#
|
165
|
+
# Main loop of the algorithm. Starting from a kernel made of state 0, it
# repeatedly computes the fringe and either consolidates a fringe state
# into the kernel or merges a fringe state with a kernel state, until the
# fringe is empty. Returns the refined union find.
def main(ufds)
  info("Starting BlueFringe (#{ufds.size} states)")
  @ufds = ufds
  @kernel = [0]
  @score_cache = {}

  # the fringe is computed once per step
  until (candidates = fringe).empty?
    # first fringe state found that no kernel state accepts (if any)
    unmergeable = nil
    # best merge seen so far
    best_source, best_target, best_score = nil, nil, -1

    candidates.each do |candidate|
      mergeable = false

      # try the candidate against every kernel state
      @kernel.each do |target|
        score = merge_and_determinize_score(candidate, target)
        next if score.nil?
        mergeable = true
        if score > best_score
          best_source, best_target, best_score = candidate, target, score
        end
      end

      # a candidate without any possible target stops the search: it will
      # be consolidated right away
      unless mergeable
        unmergeable = candidate
        break
      end
    end

    if unmergeable.nil?
      # every candidate had a target: perform the best merge
      @score_cache.clear
      info("Merging #{best_source} and #{best_target} [#{best_score}]")
      # not expected to fail, since this merge was scored just before
      raise "Unexpected case" unless merge_and_determinize(best_source, best_target)
    else
      info("Consolidation of #{unmergeable}")
      @kernel << unmergeable
    end

    # a merge may change the leader of a kernel state, so the kernel is
    # mapped to leaders again at each step, and kept sorted
    @kernel = @kernel.map { |k| @ufds.find(k) }.sort
  end

  @ufds
end
|
217
|
+
|
218
|
+
#
|
219
|
+
# Build the smallest DFA compatible with the sample given as input.
|
220
|
+
#
|
221
|
+
# Preconditions:
|
222
|
+
# - The sample is consistent (does not contain the same string both labeled as
|
223
|
+
# positive and negative) and contains at least one string.
|
224
|
+
#
|
225
|
+
# Postconditions:
|
226
|
+
# - The returned DFA is the smallest DFA that correctly labels the learning sample
|
227
|
+
# given as input.
|
228
|
+
#
|
229
|
+
# Remarks:
|
230
|
+
# - This instance version of BlueFringe.execute is not intended to be used directly and
|
231
|
+
# is mainly provided for testing purposes. Please use the class variant of this
|
232
|
+
# method if possible.
|
233
|
+
#
|
234
|
+
# Instance-level entry point: turns `sample` into a union-find with
# `sample2ufds`, refines it with `main` and converts the result to a DFA
# with `ufds2dfa`, whose value is returned.
def execute(sample)
  info("Creating PTA and UnionFind structure")
  initial_ufds = sample2ufds(sample)
  refined_ufds = main(initial_ufds)
  # the quotient automaton is the result
  ufds2dfa(refined_ufds)
end
|
243
|
+
|
244
|
+
#
|
245
|
+
# Build the smallest DFA compatible with the sample given as input.
|
246
|
+
#
|
247
|
+
# Options (the _options_ hash):
|
248
|
+
# - :verbose can be set to true to trace algorithm execution on standard output.
|
249
|
+
#
|
250
|
+
# Preconditions:
|
251
|
+
# - The sample is consistent (does not contain the same string both labeled as
|
252
|
+
# positive and negative) and contains at least one string.
|
253
|
+
#
|
254
|
+
# Postconditions:
|
255
|
+
# - The returned DFA is the smallest DFA that correctly labels the learning sample
|
256
|
+
# given as input.
|
257
|
+
#
|
258
|
+
# Class-level entry point: creates a BlueFringe instance configured with
# `options` and returns what its `execute` gives for `sample`.
def self.execute(sample, options={})
  learner = BlueFringe.new(options)
  learner.execute(sample)
end
|
261
|
+
|
262
|
+
end # class BlueFringe
|
263
|
+
|
264
|
+
end # module Induction
|
265
|
+
end # module Stamina
|