stamina-induction 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.md +78 -0
- data/LICENCE.md +22 -0
- data/lib/stamina-induction/stamina-induction.rb +1 -0
- data/lib/stamina-induction/stamina/abbadingo.rb +2 -0
- data/lib/stamina-induction/stamina/abbadingo/random_dfa.rb +55 -0
- data/lib/stamina-induction/stamina/abbadingo/random_sample.rb +146 -0
- data/lib/stamina-induction/stamina/classifier.rb +55 -0
- data/lib/stamina-induction/stamina/command.rb +6 -0
- data/lib/stamina-induction/stamina/command/abbadingo_dfa.rb +80 -0
- data/lib/stamina-induction/stamina/command/abbadingo_samples.rb +39 -0
- data/lib/stamina-induction/stamina/command/classify.rb +47 -0
- data/lib/stamina-induction/stamina/command/infer.rb +140 -0
- data/lib/stamina-induction/stamina/command/metrics.rb +50 -0
- data/lib/stamina-induction/stamina/command/score.rb +34 -0
- data/lib/stamina-induction/stamina/dsl.rb +2 -0
- data/lib/stamina-induction/stamina/dsl/induction.rb +29 -0
- data/lib/stamina-induction/stamina/dsl/reg_lang.rb +69 -0
- data/lib/stamina-induction/stamina/induction.rb +13 -0
- data/lib/stamina-induction/stamina/induction/blue_fringe.rb +265 -0
- data/lib/stamina-induction/stamina/induction/commons.rb +156 -0
- data/lib/stamina-induction/stamina/induction/rpni.rb +186 -0
- data/lib/stamina-induction/stamina/induction/union_find.rb +377 -0
- data/lib/stamina-induction/stamina/input_string.rb +123 -0
- data/lib/stamina-induction/stamina/reg_lang.rb +226 -0
- data/lib/stamina-induction/stamina/reg_lang/canonical_info.rb +181 -0
- data/lib/stamina-induction/stamina/reg_lang/parser.rb +10 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/alternative.rb +19 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/node.rb +22 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/parenthesized.rb +12 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/parser.citrus +49 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/plus.rb +14 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/question.rb +17 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/regexp.rb +12 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/sequence.rb +15 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/star.rb +15 -0
- data/lib/stamina-induction/stamina/reg_lang/parser/symbol.rb +14 -0
- data/lib/stamina-induction/stamina/sample.rb +309 -0
- data/lib/stamina-induction/stamina/scoring.rb +213 -0
- metadata +106 -0
@@ -0,0 +1,39 @@
|
|
1
|
+
module Stamina
|
2
|
+
class Command
|
3
|
+
#
|
4
|
+
# Generates samples following Abbadingo's protocol
|
5
|
+
#
|
6
|
+
# SYNOPSIS
|
7
|
+
# #{program_name} #{command_name} target.adl
|
8
|
+
#
|
9
|
+
# OPTIONS
|
10
|
+
# #{summarized_options}
|
11
|
+
#
|
12
|
+
class AbbadingoSamples < Quickl::Command(__FILE__, __LINE__)
|
13
|
+
|
14
|
+
# Install options
|
15
|
+
options do |opt|
|
16
|
+
|
17
|
+
end # options
|
18
|
+
|
19
|
+
# Command execution
|
20
|
+
def execute(args)
|
21
|
+
raise Quickl::Help unless args.size == 1
|
22
|
+
|
23
|
+
# Loads the target automaton
|
24
|
+
target_file = args.first
|
25
|
+
basename = File.basename(target_file, '.adl')
|
26
|
+
dirname = File.dirname(target_file)
|
27
|
+
target = Stamina::ADL::parse_automaton_file(target_file)
|
28
|
+
|
29
|
+
require 'stamina/abbadingo'
|
30
|
+
training, test = Stamina::Abbadingo::RandomSample.execute(target)
|
31
|
+
|
32
|
+
# Flush results aside the target automaton file
|
33
|
+
Stamina::ADL::print_sample_in_file(training, File.join(dirname, "#{basename}-training.adl"))
|
34
|
+
Stamina::ADL::print_sample_in_file(test, File.join(dirname, "#{basename}-test.adl"))
|
35
|
+
end
|
36
|
+
|
37
|
+
end # class AbbadingoSamples
|
38
|
+
end # class Command
|
39
|
+
end # module Stamina
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Stamina
|
2
|
+
class Command
|
3
|
+
#
|
4
|
+
# Classifies a sample thanks to an automaton
|
5
|
+
#
|
6
|
+
# SYNOPSIS
|
7
|
+
# #{program_name} #{command_name} sample.adl automaton.adl
|
8
|
+
#
|
9
|
+
# OPTIONS
|
10
|
+
# #{summarized_options}
|
11
|
+
#
|
12
|
+
class Classify < Quickl::Command(__FILE__, __LINE__)
|
13
|
+
include Robustness
|
14
|
+
|
15
|
+
# Where to flush the output
|
16
|
+
attr_accessor :output_file
|
17
|
+
|
18
|
+
# Install options
|
19
|
+
options do |opt|
|
20
|
+
|
21
|
+
@output_file = nil
|
22
|
+
opt.on("-o", "--output=OUTPUT",
|
23
|
+
"Flush classification signature in output file") do |value|
|
24
|
+
assert_writable_file(value)
|
25
|
+
@output_file = value
|
26
|
+
end
|
27
|
+
|
28
|
+
end # options
|
29
|
+
|
30
|
+
# Command execution
|
31
|
+
def execute(args)
|
32
|
+
raise Quickl::Help unless args.size == 2
|
33
|
+
sample = Stamina::ADL::parse_sample_file assert_readable_file(args.first)
|
34
|
+
automaton = Stamina::ADL::parse_automaton_file assert_readable_file(args.last)
|
35
|
+
|
36
|
+
if of = output_file
|
37
|
+
File.open(of, 'w'){|io|
|
38
|
+
io << automaton.signature(sample)
|
39
|
+
}
|
40
|
+
else
|
41
|
+
$stdout << automaton.signature(sample)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
end # class Classify
|
46
|
+
end # class Command
|
47
|
+
end # module Stamina
|
@@ -0,0 +1,140 @@
|
|
1
|
+
module Stamina
|
2
|
+
class Command
|
3
|
+
#
|
4
|
+
# Grammar inference, induces a DFA from a training sample using a
|
5
|
+
# chosen algorithm.
|
6
|
+
#
|
7
|
+
# SYNOPSIS
|
8
|
+
# #{program_name} #{command_name} sample.adl
|
9
|
+
#
|
10
|
+
# OPTIONS
|
11
|
+
# #{summarized_options}
|
12
|
+
#
|
13
|
+
class Infer < Quickl::Command(__FILE__, __LINE__)
|
14
|
+
include Robustness
|
15
|
+
|
16
|
+
attr_accessor :algorithm
|
17
|
+
attr_accessor :take
|
18
|
+
attr_accessor :score
|
19
|
+
attr_accessor :verbose
|
20
|
+
attr_accessor :drop
|
21
|
+
attr_accessor :output_file
|
22
|
+
|
23
|
+
# Install options
|
24
|
+
options do |opt|
|
25
|
+
|
26
|
+
@algorithm = :rpni
|
27
|
+
opt.on("--algorithm=X", "Sets the induction algorithm to use (rpni, bluefringe)") do |x|
|
28
|
+
@algorithm = x.to_sym
|
29
|
+
end
|
30
|
+
|
31
|
+
@take = 1.0
|
32
|
+
opt.on("--take=X", Float, "Take only X% of available strings") do |x|
|
33
|
+
@take = x.to_f
|
34
|
+
unless @take > 0.0 and @take <= 1.0
|
35
|
+
raise Quickl::InvalidOption, "Invalid --take option: #{@take}"
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
@score = nil
|
40
|
+
opt.on("--score=test.adl", "Add scoring information to metadata, using test.adl file") do |x|
|
41
|
+
@score = assert_readable_file(x)
|
42
|
+
end
|
43
|
+
|
44
|
+
@verbose = true
|
45
|
+
opt.on("-v", "--[no-]verbose", "Verbose mode") do |x|
|
46
|
+
@verbose = x
|
47
|
+
end
|
48
|
+
|
49
|
+
@drop = false
|
50
|
+
opt.on("-d", "--drop", "Drop result") do |x|
|
51
|
+
@drop = x
|
52
|
+
end
|
53
|
+
|
54
|
+
@output_file = nil
|
55
|
+
opt.on("-o", "--output=OUTPUT",
|
56
|
+
"Flush induced DFA in output file") do |value|
|
57
|
+
@output_file = assert_writable_file(value)
|
58
|
+
end
|
59
|
+
|
60
|
+
end # options
|
61
|
+
|
62
|
+
# Runs the induction algorithm selected through the `algorithm` accessor on
# `sample` and measures the time spent in that call.
#
# Returns a pair [dfa, tms]: `dfa` is whatever the algorithm class's
# `execute` returned and `tms` is the result of Benchmark.measure around it.
#
# Raises Quickl::InvalidOption when `algorithm` is neither :rpni nor
# :bluefringe.
def launch_induction(sample)
  require 'benchmark'

  algo_clazz = case algorithm
               when :rpni
                 Stamina::Induction::RPNI
               when :bluefringe
                 Stamina::Induction::BlueFringe
               else
                 # Interpolate the accessor: there is no local named `algo`, so
                 # referencing it here would raise NameError instead of the
                 # intended option error.
                 raise Quickl::InvalidOption, "Unknown induction algorithm: #{algorithm}"
               end

  # `dfa` is declared outside the block so that the measured block can
  # hand the induced automaton back to us.
  dfa = nil
  tms = Benchmark.measure do
    dfa = algo_clazz.execute(sample, {:verbose => verbose})
  end
  [dfa, tms]
end
|
80
|
+
|
81
|
+
# Parses the sample stored in `file` and, when @take differs from 1.0,
# returns a fresh Stamina::Sample in which each string of the parsed sample
# has been kept only if a Kernel.rand draw fell below @take (positive strings
# are visited first, then negative ones). With @take equal to 1.0 the parsed
# sample is returned as is.
def load_sample(file)
  parsed = Stamina::ADL.parse_sample_file(file)
  return parsed if @take == 1.0

  subset = Stamina::Sample.new
  # One random draw per string decides whether it is kept
  keep = proc { |str| subset << str if Kernel.rand < @take }
  parsed.each_positive(&keep)
  parsed.each_negative(&keep)
  subset
end
|
91
|
+
|
92
|
+
# Command execution
|
93
|
+
def execute(args)
|
94
|
+
raise Quickl::Help unless args.size == 1
|
95
|
+
|
96
|
+
# Parses the sample
|
97
|
+
$stderr << "Parsing sample...\n" if verbose
|
98
|
+
sample = load_sample(assert_readable_file(args.first))
|
99
|
+
|
100
|
+
# Induce the DFA
|
101
|
+
dfa, tms = launch_induction(sample)
|
102
|
+
|
103
|
+
# Flush result
|
104
|
+
unless drop
|
105
|
+
if output_file
|
106
|
+
File.open(output_file, 'w') do |file|
|
107
|
+
Stamina::ADL.print_automaton(dfa, file)
|
108
|
+
end
|
109
|
+
else
|
110
|
+
Stamina::ADL.print_automaton(dfa, $stdout)
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
# build meta information
|
115
|
+
meta = {:algorithm => algorithm,
|
116
|
+
:sample => File.basename(args.first),
|
117
|
+
:take => take,
|
118
|
+
:sample_size => sample.size,
|
119
|
+
:positive_count => sample.positive_count,
|
120
|
+
:negative_count => sample.negative_count,
|
121
|
+
:real_time => tms.real,
|
122
|
+
:total_time => tms.total,
|
123
|
+
:user_time => tms.utime + tms.cutime,
|
124
|
+
:system_time => tms.stime + tms.cstime}
|
125
|
+
|
126
|
+
if score
|
127
|
+
test = Stamina::ADL::parse_sample_file(score)
|
128
|
+
classified_as = dfa.signature(test)
|
129
|
+
reference = test.signature
|
130
|
+
scoring = Scoring.scoring(classified_as, reference)
|
131
|
+
meta.merge!(scoring.to_h)
|
132
|
+
end
|
133
|
+
|
134
|
+
# Display information
|
135
|
+
puts meta.inspect
|
136
|
+
end
|
137
|
+
|
138
|
+
end # class Infer
|
139
|
+
end # class Command
|
140
|
+
end # module Stamina
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Stamina
|
2
|
+
class Command
|
3
|
+
#
|
4
|
+
# Prints metrics about an automaton or sample
|
5
|
+
#
|
6
|
+
# SYNOPSIS
|
7
|
+
# #{program_name} #{command_name} [file.adl]
|
8
|
+
#
|
9
|
+
# OPTIONS
|
10
|
+
# #{summarized_options}
|
11
|
+
#
|
12
|
+
class Metrics < Quickl::Command(__FILE__, __LINE__)
|
13
|
+
include Robustness
|
14
|
+
|
15
|
+
# Install options
|
16
|
+
options do |opt|
|
17
|
+
|
18
|
+
end # options
|
19
|
+
|
20
|
+
# Command execution
|
21
|
+
def execute(args)
  # Prints metrics about the ADL content named by the single optional
  # argument, or read from standard input when no argument is given.
  # The content is first parsed as an automaton; if that raises
  # ADL::ParseError it is parsed as a sample instead.
  #
  # Raises Quickl::Help when more than one argument is given.
  raise Quickl::Help unless args.size <= 1

  # Loads the raw ADL text. Standard input is read verbatim: `readlines`
  # keeps each line terminator, so joining the lines with "\n" would insert
  # a spurious empty line after every input line.
  input = if args.size == 1
            File.read assert_readable_file(args.first)
          else
            $stdin.read
          end

  # Flush metrics
  begin
    target = Stamina::ADL::parse_automaton(input)
    puts "Alphabet size: #{target.alphabet_size}"
    puts "State count: #{target.state_count}"
    puts "Edge count: #{target.edge_count}"
    puts "Degree (avg): #{target.avg_degree}"
    puts "Accepting ratio: #{target.accepting_ratio}"
    puts "Depth: #{target.depth}"
  rescue ADL::ParseError
    # Not an automaton: fall back on sample metrics
    sample = Stamina::ADL::parse_sample(input)
    puts "Size: #{sample.size}"
    puts "Positive: #{sample.positive_count} (#{sample.positive_count.to_f / sample.size})"
    puts "Negative: #{sample.negative_count} (#{sample.negative_count.to_f / sample.size})"
  end
end
|
47
|
+
|
48
|
+
end # class Metrics
|
49
|
+
end # class Command
|
50
|
+
end # module Stamina
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Stamina
|
2
|
+
class Command
|
3
|
+
#
|
4
|
+
# Scores the labelling of a sample by an automaton
|
5
|
+
#
|
6
|
+
# SYNOPSIS
|
7
|
+
# #{program_name} #{command_name} sample.adl automaton.adl
|
8
|
+
#
|
9
|
+
# OPTIONS
|
10
|
+
# #{summarized_options}
|
11
|
+
#
|
12
|
+
class Score < Quickl::Command(__FILE__, __LINE__)
|
13
|
+
include Robustness
|
14
|
+
|
15
|
+
# Install options
|
16
|
+
options do |opt|
|
17
|
+
|
18
|
+
end # options
|
19
|
+
|
20
|
+
# Command execution
|
21
|
+
def execute(args)
|
22
|
+
raise Quickl::Help unless args.size == 2
|
23
|
+
sample = Stamina::ADL::parse_sample_file assert_readable_file(args.first)
|
24
|
+
automaton = Stamina::ADL::parse_automaton_file assert_readable_file(args.last)
|
25
|
+
|
26
|
+
classified_as = automaton.signature(sample)
|
27
|
+
reference = sample.signature
|
28
|
+
scoring = Scoring.scoring(classified_as, reference)
|
29
|
+
puts scoring.to_s
|
30
|
+
end
|
31
|
+
|
32
|
+
end # class Score
|
33
|
+
end # class Command
|
34
|
+
end # module Stamina
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Stamina
|
2
|
+
module Dsl
|
3
|
+
module Induction
|
4
|
+
|
5
|
+
#
|
6
|
+
# Coerces `arg` to a Sample
|
7
|
+
#
|
8
|
+
def sample(arg)
|
9
|
+
Sample.coerce(arg)
|
10
|
+
end
|
11
|
+
|
12
|
+
#
|
13
|
+
# Learn a regular language from `arg` using the RPNI algorithm.
|
14
|
+
#
|
15
|
+
def rpni(arg)
|
16
|
+
regular Stamina::Induction::RPNI.execute(sample(arg))
|
17
|
+
end
|
18
|
+
|
19
|
+
#
|
20
|
+
# Learn a regular language from `arg` using the BlueFringe algorithm.
|
21
|
+
#
|
22
|
+
def blue_fringe(arg)
|
23
|
+
regular Stamina::Induction::BlueFringe.execute(sample(arg))
|
24
|
+
end
|
25
|
+
|
26
|
+
end # module Induction
|
27
|
+
include Induction
|
28
|
+
end # module Dsl
|
29
|
+
end # module Stamina
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module Stamina
|
2
|
+
module Dsl
|
3
|
+
module RegLang
|
4
|
+
|
5
|
+
EMPTY_LANG = ::Stamina::RegLang::EMPTY
|
6
|
+
|
7
|
+
#
|
8
|
+
# Coerces `arg` to a regular language.
|
9
|
+
#
|
10
|
+
def regular(arg)
|
11
|
+
Stamina::RegLang.coerce(arg)
|
12
|
+
end
|
13
|
+
|
14
|
+
#
|
15
|
+
# Returns the universal language on a given alphabet.
|
16
|
+
#
|
17
|
+
def sigma_star(alphabet)
|
18
|
+
Stamina::RegLang.sigma_star(alphabet)
|
19
|
+
end
|
20
|
+
|
21
|
+
#
|
22
|
+
# Coerces `arg` to a prefix-closed regular language.
|
23
|
+
#
|
24
|
+
def prefix_closed(arg)
|
25
|
+
regular(arg).prefix_closed
|
26
|
+
end
|
27
|
+
|
28
|
+
#
|
29
|
+
# Extracts the short prefixes of a regular language (coerced from `arg`)
|
30
|
+
# as a Sample instance.
|
31
|
+
#
|
32
|
+
def short_prefixes(arg)
|
33
|
+
regular(arg).short_prefixes
|
34
|
+
end
|
35
|
+
|
36
|
+
#
|
37
|
+
# Extracts the kernel of a regular language (coerced from `arg`) as
|
38
|
+
# a Sample instance.
|
39
|
+
#
|
40
|
+
def kernel(arg)
|
41
|
+
regular(arg).kernel
|
42
|
+
end
|
43
|
+
|
44
|
+
#
|
45
|
+
# Extracts a characteristic sample for a regular language (coerced from
|
46
|
+
# `arg`) as a Sample instance.
|
47
|
+
#
|
48
|
+
def characteristic_sample(arg)
|
49
|
+
regular(arg).characteristic_sample
|
50
|
+
end
|
51
|
+
|
52
|
+
#
|
53
|
+
# Hides all but `alph` symbols in the regular language `arg`
|
54
|
+
#
|
55
|
+
def project(arg, alph)
|
56
|
+
regular(arg).project(alph)
|
57
|
+
end
|
58
|
+
|
59
|
+
#
|
60
|
+
# Hides `alph` symbols in the regular language `arg`
|
61
|
+
#
|
62
|
+
def hide(arg, alph)
|
63
|
+
regular(arg).hide(alph)
|
64
|
+
end
|
65
|
+
|
66
|
+
end # module RegLang
|
67
|
+
include RegLang
|
68
|
+
end # module Dsl
|
69
|
+
end # module Stamina
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require_relative 'sample'
|
2
|
+
require_relative 'input_string'
|
3
|
+
require_relative 'classifier'
|
4
|
+
require_relative 'scoring'
|
5
|
+
require_relative 'induction/union_find'
|
6
|
+
require_relative 'induction/commons'
|
7
|
+
require_relative 'induction/rpni'
|
8
|
+
require_relative 'induction/blue_fringe'
|
9
|
+
require_relative 'abbadingo'
|
10
|
+
require_relative 'dsl/induction'
|
11
|
+
require_relative 'reg_lang'
|
12
|
+
require_relative 'dsl/reg_lang'
|
13
|
+
require_relative 'command'
|
@@ -0,0 +1,265 @@
|
|
1
|
+
module Stamina
|
2
|
+
module Induction
|
3
|
+
|
4
|
+
#
|
5
|
+
# Implementation of the BlueFringe variant of the RPNI algorithm (with the blue-fringe
|
6
|
+
# heuristics).
|
7
|
+
#
|
8
|
+
# See Lang, K., B. Pearlmutter, and R. Price. 1998. Results of the Abbadingo One DFA
|
9
|
+
# Learning Competition and a New Evidence-Driven State Merging Algorithm, In Grammatical
|
10
|
+
# Inference, pp. 1–12. Ames, IO: Springer-Verlag.
|
11
|
+
#
|
12
|
+
# Example:
|
13
|
+
# # sample typically comes from an ADL file
|
14
|
+
# sample = Stamina::ADL.parse_sample_file('sample.adl')
|
15
|
+
#
|
16
|
+
# # let BlueFringe build the smallest dfa
|
17
|
+
# dfa = Stamina::Induction::BlueFringe.execute(sample, {:verbose => true})
|
18
|
+
#
|
19
|
+
# Remarks:
|
20
|
+
# - Constructor and instance methods of this class are public but not intended
|
21
|
+
# to be used directly. They are left public for testing purposes only.
|
22
|
+
# - Having read the Stamina::Induction::RPNI base algorithm may help understanding
|
23
|
+
# this variant.
|
24
|
+
# - This class intensively uses the Stamina::Induction::UnionFind class and
|
25
|
+
# methods defined in the Stamina::Induction::Commons module which are worth
|
26
|
+
# reading to understand the algorithm implementation.
|
27
|
+
#
|
28
|
+
class BlueFringe
|
29
|
+
include Stamina::Induction::Commons
|
30
|
+
|
31
|
+
# Union-find data structure used internally
|
32
|
+
attr_reader :ufds
|
33
|
+
|
34
|
+
# Creates an algorithm instance with given options.
|
35
|
+
def initialize(options={})
|
36
|
+
raise ArgumentError, "Invalid options #{options.inspect}" unless options.is_a?(Hash)
|
37
|
+
@options = DEFAULT_OPTIONS.merge(options)
|
38
|
+
@score_cache = {}
|
39
|
+
end
|
40
|
+
|
41
|
+
#
|
42
|
+
# Computes the score of a single (group) merge. Returned value is 1 if both are
|
43
|
+
# accepting states or both are error states and 0 otherwise. Note that d1 and d2
|
44
|
+
# are expected to be merge compatible as this method does not distinguish this
|
45
|
+
# case.
|
46
|
+
#
|
47
|
+
def merge_score(d1, d2)
  # A merge is worth one point when the two groups agree on being accepting,
  # or agree on being error groups; any other combination is worth nothing.
  both_accepting = d1[:accepting] && d2[:accepting]
  both_error = d1[:error] && d2[:error]
  return 1 if both_accepting || both_error
  0
end
|
51
|
+
|
52
|
+
#
|
53
|
+
# Merges a state of rank j with a state of lower rank i. This merge method
|
54
|
+
# includes merging for determinization. It returns nil if the merge is
|
55
|
+
# incompatible, a merge score otherwise.
|
56
|
+
#
|
57
|
+
# Preconditions:
|
58
|
+
# - States denoted by i and j are expected leader states (non merged ones)
|
59
|
+
# - States denoted by i and j are expected to be different
|
60
|
+
#
|
61
|
+
# Postconditions:
|
62
|
+
# - Union find is refined, states i and j having been merged, as well as all
|
63
|
+
# state pairs that need to be merged to ensure the deterministic property
|
64
|
+
# of the quotient automaton.
|
65
|
+
# - If the resulting quotient automaton is consistent with the negative sample,
|
66
|
+
# this method returns the number of accepting pairs + the number of error pairs
|
67
|
+
# that have been merged. The refined union-find correctly encodes the quotient
|
68
|
+
# automaton. Otherwise, the method returns nil and the union-find information
|
69
|
+
# must be considered inaccurate.
|
70
|
+
#
|
71
|
+
def merge_and_determinize(i, j)
  # `pending` receives, as a side effect of merge_user_data, the state pairs
  # that still have to be merged to keep the quotient deterministic; `total`
  # accumulates the score of every merge performed on behalf of this call.
  pending = []
  total = nil

  @ufds.union(i, j) do |d1, d2|
    merged = merge_user_data(d1, d2, pending)
    # No merged data means the two groups are incompatible (the merge would
    # combine an error state with an accepting state): give up on the whole
    # merge.
    return nil unless merged
    total = merge_score(d1, d2)
    # the union find keeps the merged data for the new group
    merged
  end

  # Merging for determinization
  pending.each do |pair|
    # always work on the leader states of the pair
    left, right = pair.collect { |state| @ufds.find(state) }
    # nothing to do when both already belong to the same group
    next if left == right
    # otherwise recurse; an incompatible sub-merge fails the whole merge
    sub = merge_and_determinize(left, right)
    return nil if sub.nil?
    total += sub
  end

  total
end
|
105
|
+
|
106
|
+
#
|
107
|
+
# Evaluates the score of merging states i and j. Returns nil if the states
|
108
|
+
# cannot be merged, a positive score otherwise.
|
109
|
+
#
|
110
|
+
# Preconditions:
|
111
|
+
# - States denoted by i and j are expected leader states (non merged ones)
|
112
|
+
# - States denoted by i and j are expected to be different
|
113
|
+
#
|
114
|
+
# Postconditions:
|
115
|
+
# - Returned value is nil if the quotient automaton would be incompatible with
|
116
|
+
# the sample. Otherwise a positive number is returned, encoding the number of
|
117
|
+
# interesting pairs that have been merged (interesting = both accepting or both
|
118
|
+
# error)
|
119
|
+
# - The union find is ALWAYS restored to its previous value after merging has
|
120
|
+
# been evaluated and is then seen unchanged by the caller.
|
121
|
+
#
|
122
|
+
def merge_and_determinize_score(i, j)
  key = [i, j]
  unless @score_cache[key]
    # Score the merge inside a union-find transaction whose block answers
    # false, so that the merge is always rolled back.
    outcome = nil
    @ufds.transactional do
      outcome = merge_and_determinize(i, j)
      false
    end
    # -1 stands for "incompatible" in the cache, so that a failed merge is
    # remembered too instead of being recomputed on every lookup.
    @score_cache[key] = outcome || -1
  end
  cached = @score_cache[key]
  cached == -1 ? nil : cached
end
|
134
|
+
|
135
|
+
#
|
136
|
+
# Computes the fringe given the current union find. The fringe is returned as an
|
137
|
+
# array of state indices.
|
138
|
+
#
|
139
|
+
# Postconditions:
|
140
|
+
# - Returned array contains indices of leader states only.
|
141
|
+
# - Returned array is disjoint with the kernel.
|
142
|
+
#
|
143
|
+
def fringe
  # Collect the leader of every state reached through one transition from a
  # kernel state...
  successors = @kernel.each_with_object([]) do |state, acc|
    @ufds.mergeable_data(state)[:delta].each_pair do |_symbol, target|
      acc << @ufds.find(target)
    end
  end
  # ...and keep, in increasing index order, those lying outside the kernel.
  (successors - @kernel).sort
end
|
151
|
+
|
152
|
+
#
|
153
|
+
# Main method of the algorithm. Refines the union find passed as first argument
|
154
|
+
# by merging well chosen state pairs. Returns the refined union find.
|
155
|
+
#
|
156
|
+
# Preconditions:
|
157
|
+
# - The union find _ufds_ is correctly initialized (contains :initial, :accepting,
|
158
|
+
# and :error boolean flags as well as a :delta sub hash)
|
159
|
+
#
|
160
|
+
# Postconditions:
|
161
|
+
# - The union find has been refined. It encodes a quotient automaton (of the PTA
|
162
|
+
# it comes from) such that all positive and negative strings of the underlying
|
163
|
+
# sample are correctly classified by it.
|
164
|
+
#
|
165
|
+
def main(ufds)
|
166
|
+
info("Starting BlueFringe (#{ufds.size} states)")
|
167
|
+
@ufds, @kernel, @score_cache = ufds, [0], {}
|
168
|
+
|
169
|
+
# we do it until the fringe is empty (compute it only once each step)
|
170
|
+
until (the_fringe=fringe).empty?
|
171
|
+
# state to consolidate (if any)
|
172
|
+
to_consolidate = nil
|
173
|
+
# best candidate [source index, target index, score]
|
174
|
+
best = [nil, nil, -1]
|
175
|
+
|
176
|
+
# for each state on the fringe as merge candidate
|
177
|
+
the_fringe.each do |candidate|
|
178
|
+
to_consolidate = candidate
|
179
|
+
|
180
|
+
# evaluate score of merging candidate with each kernel state
|
181
|
+
@kernel.each do |target|
|
182
|
+
score = merge_and_determinize_score(candidate, target)
|
183
|
+
unless score.nil?
|
184
|
+
# if a score has been found, the candidate will not be
|
185
|
+
# consolidated. We keep it as best if its better than the
|
186
|
+
# previous one
|
187
|
+
to_consolidate = nil
|
188
|
+
best = [candidate, target, score] if score > best[2]
|
189
|
+
end
|
190
|
+
end
|
191
|
+
|
192
|
+
# No possible target, break the loop (will consolidate right now)!
|
193
|
+
break unless to_consolidate.nil?
|
194
|
+
end
|
195
|
+
|
196
|
+
# If not found, the last candidate must be consolidated. Otherwise, we
|
197
|
+
# do the best merging
|
198
|
+
unless to_consolidate.nil?
|
199
|
+
info("Consolidation of #{to_consolidate}")
|
200
|
+
@kernel << to_consolidate
|
201
|
+
else
|
202
|
+
@score_cache.clear
|
203
|
+
info("Merging #{best[0]} and #{best[1]} [#{best[2]}]")
|
204
|
+
# this one should never fail because its score was positive before
|
205
|
+
raise "Unexpected case" unless merge_and_determinize(best[0], best[1])
|
206
|
+
end
|
207
|
+
|
208
|
+
# blue_fringe does not guarantee that it will not merge a state of lower rank
|
209
|
+
# with a kernel state. The kernel should then be updated at each step to keep
|
210
|
+
# lowest indices for the whole kernel, and we sort it
|
211
|
+
@kernel = @kernel.collect{|k| @ufds.find(k)}.sort
|
212
|
+
end
|
213
|
+
|
214
|
+
# return the refined union find now
|
215
|
+
@ufds
|
216
|
+
end
|
217
|
+
|
218
|
+
#
|
219
|
+
# Build the smallest DFA compatible with the sample given as input.
|
220
|
+
#
|
221
|
+
# Preconditions:
|
222
|
+
# - The sample is consistent (does not contain the same string both labeled as
|
223
|
+
# positive and negative) and contains at least one string.
|
224
|
+
#
|
225
|
+
# Postconditions:
|
226
|
+
# - The returned DFA is the smallest DFA that correctly labels the learning sample
|
227
|
+
# given as input.
|
228
|
+
#
|
229
|
+
# Remarks:
|
230
|
+
# - This instance version of BlueFringe.execute is not intended to be used directly and
|
231
|
+
# is mainly provided for testing purposes. Please use the class variant of this
|
232
|
+
# method if possible.
|
233
|
+
#
|
234
|
+
def execute(sample)
|
235
|
+
# create union-find
|
236
|
+
info("Creating PTA and UnionFind structure")
|
237
|
+
ufds = sample2ufds(sample)
|
238
|
+
# refine it
|
239
|
+
ufds = main(ufds)
|
240
|
+
# compute and return quotient automaton
|
241
|
+
ufds2dfa(ufds)
|
242
|
+
end
|
243
|
+
|
244
|
+
#
|
245
|
+
# Build the smallest DFA compatible with the sample given as input.
|
246
|
+
#
|
247
|
+
# Options (the _options_ hash):
|
248
|
+
# - :verbose can be set to true to trace algorithm execution on standard output.
|
249
|
+
#
|
250
|
+
# Preconditions:
|
251
|
+
# - The sample is consistent (does not contain the same string both labeled as
|
252
|
+
# positive and negative) and contains at least one string.
|
253
|
+
#
|
254
|
+
# Postconditions:
|
255
|
+
# - The returned DFA is the smallest DFA that correctly labels the learning sample
|
256
|
+
# given as input.
|
257
|
+
#
|
258
|
+
def self.execute(sample, options={})
|
259
|
+
BlueFringe.new(options).execute(sample)
|
260
|
+
end
|
261
|
+
|
262
|
+
end # class BlueFringe
|
263
|
+
|
264
|
+
end # module Induction
|
265
|
+
end # module Stamina
|