ariel 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +49 -83
- data/bin/ariel +29 -20
- data/examples/google_calculator/structure.rb +2 -2
- data/examples/google_calculator/structure.yaml +13 -15
- data/examples/raa/labeled/highline.html +5 -4
- data/examples/raa/labeled/mongrel.html +9 -8
- data/examples/raa/structure.rb +4 -2
- data/examples/raa/structure.yaml +94 -78
- data/lib/ariel.rb +71 -33
- data/lib/ariel/{candidate_selector.rb → candidate_refiner.rb} +39 -38
- data/lib/ariel/label_utils.rb +46 -18
- data/lib/ariel/labeled_document_loader.rb +77 -0
- data/lib/ariel/learner.rb +60 -38
- data/lib/ariel/log.rb +67 -0
- data/lib/ariel/node.rb +52 -0
- data/lib/ariel/node/extracted.rb +90 -0
- data/lib/ariel/node/structure.rb +91 -0
- data/lib/ariel/rule.rb +114 -32
- data/lib/ariel/rule_set.rb +34 -15
- data/lib/ariel/token.rb +9 -3
- data/lib/ariel/token_stream.rb +32 -17
- data/lib/ariel/wildcards.rb +19 -15
- data/test/fixtures.rb +45 -3
- data/test/specs/candidate_refiner_spec.rb +48 -0
- data/test/specs/label_utils_spec.rb +97 -0
- data/test/specs/learner_spec.rb +39 -0
- data/test/specs/node_extracted_spec.rb +90 -0
- data/test/specs/node_spec.rb +76 -0
- data/test/specs/node_structure_spec.rb +74 -0
- data/test/specs/rule_set_spec.rb +85 -0
- data/test/specs/rule_spec.rb +110 -0
- data/test/specs/token_stream_spec.rb +100 -7
- metadata +21 -28
- data/lib/ariel/example_document_loader.rb +0 -59
- data/lib/ariel/extracted_node.rb +0 -20
- data/lib/ariel/node_like.rb +0 -26
- data/lib/ariel/structure_node.rb +0 -75
- data/test/ariel_test_case.rb +0 -15
- data/test/test_candidate_selector.rb +0 -58
- data/test/test_example_document_loader.rb +0 -7
- data/test/test_label_utils.rb +0 -15
- data/test/test_learner.rb +0 -38
- data/test/test_rule.rb +0 -38
- data/test/test_structure_node.rb +0 -81
- data/test/test_token.rb +0 -16
- data/test/test_token_stream.rb +0 -82
- data/test/test_wildcards.rb +0 -18
data/lib/ariel.rb
CHANGED
@@ -1,32 +1,16 @@
|
|
1
|
+
require 'ariel/log'
|
2
|
+
require 'ariel/wildcards'
|
3
|
+
require 'ariel/label_utils'
|
1
4
|
require 'ariel/token'
|
2
5
|
require 'ariel/token_stream'
|
3
6
|
require 'ariel/learner'
|
4
|
-
require 'ariel/
|
5
|
-
require 'ariel/
|
6
|
-
require 'ariel/structure_node'
|
7
|
+
require 'ariel/node/structure'
|
8
|
+
require 'ariel/node/extracted'
|
7
9
|
require 'ariel/rule'
|
8
|
-
require 'ariel/
|
9
|
-
require 'ariel/
|
10
|
-
require 'ariel/label_utils'
|
11
|
-
require 'ariel/example_document_loader'
|
10
|
+
require 'ariel/candidate_refiner'
|
11
|
+
require 'ariel/labeled_document_loader'
|
12
12
|
require 'ariel/rule_set'
|
13
13
|
|
14
|
-
if $DEBUG
|
15
|
-
# require 'logger'
|
16
|
-
|
17
|
-
# DEBUGLOG = Logger.new(File.open('debug.log', 'wb'))
|
18
|
-
# DEBUGLOG.datetime_format = " \010"
|
19
|
-
# DEBUGLOG.progname = "\010\010\010"
|
20
|
-
|
21
|
-
def debug(message)
|
22
|
-
p message
|
23
|
-
#DEBUGLOG.debug message
|
24
|
-
end
|
25
|
-
else
|
26
|
-
def debug(message)
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
14
|
# = Ariel - A Ruby Information Extraction Library
|
31
15
|
# Ariel intends to assist in extracting information from semi-structured
|
32
16
|
# documents including (but not in any way limited to) web pages. Although you
|
@@ -41,29 +25,83 @@ end
|
|
41
25
|
# 1. Define a structure for the data you wish to extract. For example:
|
42
26
|
#
|
43
27
|
# @structure = Ariel::StructureNode.new do |r|
|
44
|
-
# r.article do |a|
|
45
|
-
# a.title
|
46
|
-
# a.author
|
47
|
-
# a.date
|
48
|
-
# a.body
|
28
|
+
# r.item :article do |a|
|
29
|
+
# a.item :title
|
30
|
+
# a.item :author
|
31
|
+
# a.item :date
|
32
|
+
# a.item :body
|
49
33
|
# end
|
50
|
-
# r.
|
51
|
-
# c.
|
52
|
-
#
|
53
|
-
#
|
34
|
+
# r.list :comments do |c|
|
35
|
+
# c.list_item :comment do |c|
|
36
|
+
# c.item :author
|
37
|
+
# c.item :date
|
38
|
+
# c.item :body
|
39
|
+
# end
|
54
40
|
# end
|
55
41
|
# end
|
56
42
|
# 2. Label these fields in a few example documents (normally at least 3).
|
57
43
|
# Labels are in the form of <tt><l:label_name>...</l:label_name></tt>
|
58
44
|
# 3. Ariel will read these examples, and try to generate suitable rules that can
|
59
|
-
# be used to extract this data from other similarly structured documents.
|
45
|
+
# be used to extract this data from other similarly structured documents. Use
|
46
|
+
# Ariel#learn to initiate rule learning.
|
60
47
|
# 4. A wrapper has been generated - we can now happily load documents with the
|
61
48
|
# same structure (normally documents generated by the same rules, so
|
62
49
|
# different pages from a single site perhaps) and query the extracted data.
|
50
|
+
# See Ariel#extract.
|
63
51
|
module Ariel
|
64
52
|
|
53
|
+
class << self
|
54
|
+
# Given a root Node::Structure and a list of labeled_files (either IO objects
|
55
|
+
# or strings representing files that can be opened with File.read), will learn
|
56
|
+
# rules using the labeled examples. The passed Node::Structure tree is
|
57
|
+
# returned with new RuleSets added containing the learnt rules. This structure
|
58
|
+
# can now be used with Ariel#extract on unlabeled documents.
|
59
|
+
#
|
60
|
+
# <tt>Ariel.learn structure, 'file1.html', fileobj, 'file2.html'</tt>
|
61
|
+
def learn(structure, *labeled_files)
|
62
|
+
raise ArgumentError, "Passed structure is not the parent of the document tree" unless structure.parent.nil?
|
63
|
+
labeled_strings=collect_strings(labeled_files)
|
64
|
+
return LabeledDocumentLoader.supervise_learning(structure, *labeled_strings)
|
65
|
+
end
|
65
66
|
|
67
|
+
# Will use the given root Node::Structure to extract information from each of
|
68
|
+
# the given files (can be any object responding to #read, and if passed a
|
69
|
+
# string the parameter will be opened using File.read). If a block is given,
|
70
|
+
# each root Node::Extracted is yielded. An array of each root extracted node
|
71
|
+
# is returned.
|
72
|
+
#
|
73
|
+
# <tt>Ariel.extract structure, 'file1.txt', fileobj, 'file2.html' # =></tt> an
|
74
|
+
# array of 3 Node::Extracted objects
|
75
|
+
def extract(structure, *files_to_extract)
|
76
|
+
raise ArgumentError, "Passed structure is not the parent of the document tree" unless structure.parent.nil?
|
77
|
+
extractions=[]
|
78
|
+
collect_strings(files_to_extract).each do |string|
|
79
|
+
tokenstream = TokenStream.new
|
80
|
+
tokenstream.tokenize string
|
81
|
+
root_node=Ariel::Node::Extracted.new :root, tokenstream, structure
|
82
|
+
structure.apply_extraction_tree_on root_node
|
83
|
+
extractions << root_node
|
84
|
+
yield root_node if block_given?
|
85
|
+
end
|
86
|
+
return extractions
|
87
|
+
end
|
66
88
|
|
89
|
+
private
|
90
|
+
def collect_strings(files)
|
91
|
+
strings=[]
|
92
|
+
files.each do |file|
|
93
|
+
if file.kind_of? String
|
94
|
+
next unless File.file? file
|
95
|
+
strings << File.read(file)
|
96
|
+
elsif file.respond_to? :read
|
97
|
+
strings << file.read
|
98
|
+
else
|
99
|
+
raise ArgumentError, "Don't know how to handle #{file.inspect}"
|
100
|
+
end
|
101
|
+
end
|
102
|
+
return strings
|
103
|
+
end
|
104
|
+
end
|
67
105
|
end
|
68
106
|
|
69
107
|
|
@@ -1,13 +1,13 @@
|
|
1
1
|
module Ariel
|
2
2
|
|
3
3
|
# Given an array of candidate Rules, and an array of LabeledStreams,
|
4
|
-
# allows heuristics to be applied to select the ideal Rule. All
|
4
|
+
# allows heuristics to be applied to select the ideal Rule. All refine_* instance
|
5
5
|
# methods will remove candidates from the internal candidates array.
|
6
|
-
class
|
6
|
+
class CandidateRefiner
|
7
7
|
|
8
8
|
attr_accessor :candidates
|
9
9
|
def initialize(candidates, examples)
|
10
|
-
@candidates=candidates.dup #Just in case
|
10
|
+
@candidates=candidates.dup #Just in case we directly modify the array. Shouldn't happen.
|
11
11
|
@examples=examples
|
12
12
|
end
|
13
13
|
|
@@ -15,8 +15,8 @@ module Ariel
|
|
15
15
|
# against the given examples. e.g. refine_by_match_type(:early, :perfect)
|
16
16
|
# will select the rules that have the most matches that are early or
|
17
17
|
# perfect.
|
18
|
-
def
|
19
|
-
debug "
|
18
|
+
def refine_by_match_type(*match_types)
|
19
|
+
Log.debug "Refining by match types #{match_types.inspect}"
|
20
20
|
return @candidates if @candidates.size==1
|
21
21
|
@candidates = highest_scoring_by do |rule|
|
22
22
|
rule_score=0
|
@@ -28,44 +28,19 @@ module Ariel
|
|
28
28
|
return @candidates
|
29
29
|
end
|
30
30
|
|
31
|
-
|
32
|
-
|
33
|
-
# Each rule is yielded to the given block, which is expected to return that
|
34
|
-
# rule's score.
|
35
|
-
def score_by
|
36
|
-
score_hash={}
|
37
|
-
@candidates.each_with_index do |rule, index|
|
38
|
-
score_hash[index]= yield rule
|
39
|
-
end
|
40
|
-
return score_hash
|
41
|
-
end
|
42
|
-
|
43
|
-
# Takes a scoring function as a block, and yields each rule to it. Returns
|
44
|
-
# an array of the Rule candidates that have the highest score.
|
45
|
-
def highest_scoring_by(&scorer)
|
46
|
-
score_hash = score_by &scorer
|
47
|
-
best_score = score_hash.values.sort.last
|
48
|
-
highest_scorers=[]
|
49
|
-
score_hash.each do |candidate_index, score|
|
50
|
-
highest_scorers << @candidates[candidate_index] if score==best_score
|
51
|
-
end
|
52
|
-
debug "#{highest_scorers.size} highest_scorers were found, with a score of #{best_score}"
|
53
|
-
return highest_scorers
|
54
|
-
end
|
55
|
-
|
56
|
-
def select_with_fewer_wildcards
|
57
|
-
debug "Selecting the rules with the fewest wildcards"
|
31
|
+
def refine_by_fewer_wildcards
|
32
|
+
Log.debug "Refining to the rules with the fewest wildcards"
|
58
33
|
@candidates = highest_scoring_by {|rule| -rule.wildcard_count} #hack or not?
|
59
34
|
return @candidates
|
60
35
|
end
|
61
36
|
|
62
|
-
def
|
63
|
-
debug "Selecting rules that match the examples closest to the label"
|
37
|
+
def refine_by_label_proximity
|
38
|
+
Log.debug "Selecting rules that match the examples closest to the label"
|
64
39
|
@candidates = highest_scoring_by do |rule|
|
65
40
|
rule_score=0
|
66
41
|
matched_examples=0
|
67
42
|
@examples.each do |example|
|
68
|
-
match_index = rule.
|
43
|
+
match_index = rule.closest_match(example)
|
69
44
|
if match_index.nil?
|
70
45
|
next
|
71
46
|
else
|
@@ -79,16 +54,42 @@ module Ariel
|
|
79
54
|
return @candidates
|
80
55
|
end
|
81
56
|
|
82
|
-
def
|
83
|
-
debug "Selecting rules that have longer end landmarks"
|
57
|
+
def refine_by_longer_end_landmarks
|
58
|
+
Log.debug "Selecting rules that have longer end landmarks"
|
84
59
|
@candidates = highest_scoring_by {|rule| rule.landmarks.last.size unless rule.landmarks.last.nil?}
|
85
60
|
end
|
86
61
|
|
87
62
|
# Returns a random candidate. Meant for making the final choice in case
|
88
63
|
# previous selections have still left multiple candidates.
|
89
64
|
def random_from_remaining
|
90
|
-
debug "Selecting random from last #{candidates.size} candidate rules"
|
65
|
+
Log.debug "Selecting random from last #{candidates.size} candidate rules"
|
91
66
|
@candidates.sort_by {rand}.first
|
92
67
|
end
|
68
|
+
|
69
|
+
private
|
70
|
+
# All scoring functions use this indirectly. It iterates over each
|
71
|
+
# Rule candidate, and assigns it a score in a hash of index:score pairs.
|
72
|
+
# Each rule is yielded to the given block, which is expected to return that
|
73
|
+
# rule's score.
|
74
|
+
def score_by
|
75
|
+
score_hash={}
|
76
|
+
@candidates.each_with_index do |rule, index|
|
77
|
+
score_hash[index]= yield rule
|
78
|
+
end
|
79
|
+
return score_hash
|
80
|
+
end
|
81
|
+
|
82
|
+
# Takes a scoring function as a block, and yields each rule to it. Returns
|
83
|
+
# an array of the Rule candidates that have the highest score.
|
84
|
+
def highest_scoring_by(&scorer)
|
85
|
+
score_hash = score_by &scorer
|
86
|
+
best_score = score_hash.values.sort.last
|
87
|
+
highest_scorers=[]
|
88
|
+
score_hash.each do |candidate_index, score|
|
89
|
+
highest_scorers << @candidates[candidate_index] if score==best_score
|
90
|
+
end
|
91
|
+
Log.debug "#{highest_scorers.size} highest_scorers were found, with a score of #{best_score}"
|
92
|
+
return highest_scorers
|
93
|
+
end
|
93
94
|
end
|
94
95
|
end
|
data/lib/ariel/label_utils.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
module Ariel
|
2
2
|
|
3
|
-
# A set of methods for use when dealing with strings from labeled documents
|
3
|
+
# A set of methods for use when dealing with strings from labeled documents.
|
4
4
|
module LabelUtils
|
5
5
|
S_LABEL="<"
|
6
6
|
E_LABEL=">"
|
@@ -15,7 +15,7 @@ module Ariel
|
|
15
15
|
/#{S_LABEL}\/#{namespace}:#{tag_contents}#{E_LABEL}/i]
|
16
16
|
end
|
17
17
|
|
18
|
-
# Helper function that returns a regex that will
|
18
|
+
# Helper function that returns a regex that will match any open or closing
|
19
19
|
# label tags.
|
20
20
|
def self.any_label_regex()
|
21
21
|
Regexp.union(*self.label_regex)
|
@@ -28,17 +28,42 @@ module Ariel
|
|
28
28
|
end
|
29
29
|
|
30
30
|
# Extracts the labeled region representing the given structure node from the
|
31
|
-
# parent_extracted_node. A new
|
31
|
+
# parent_extracted_node. A new Node::Extracted is returned to be added as a
|
32
32
|
# child to the parent_extracted_node. Used when loading labeled documents.
|
33
33
|
def self.extract_labeled_region(structure, parent_extracted_node)
|
34
34
|
tokenstream=parent_extracted_node.tokenstream
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
35
|
+
start_idxs=[]
|
36
|
+
end_idxs=[]
|
37
|
+
tokenstream.rewind
|
38
|
+
while start_idx = self.skip_to_label_tag(tokenstream, structure.node_name, :open)
|
39
|
+
start_idxs << start_idx
|
40
|
+
break unless structure.node_type==:list_item
|
41
|
+
end
|
42
|
+
tokenstream.rewind
|
43
|
+
while end_idx=self.skip_to_label_tag(tokenstream, structure.node_name, :closed)
|
44
|
+
end_idxs << (end_idx -2) #rewind to token before the label tag token
|
45
|
+
break unless structure.node_type==:list_item
|
46
|
+
end
|
47
|
+
result=[]
|
48
|
+
i=0
|
49
|
+
start_idxs.zip(end_idxs) do |start_idx, end_idx|
|
50
|
+
if start_idx && end_idx && (start_idx <= end_idx)
|
51
|
+
newstream=tokenstream.slice_by_token_index(start_idx, end_idx)
|
52
|
+
if structure.node_type==:list_item
|
53
|
+
new_name="#{structure.node_name}_#{i}"
|
54
|
+
i+=1
|
55
|
+
else
|
56
|
+
new_name = structure.node_name
|
57
|
+
end
|
58
|
+
child_node = Node::Extracted.new(new_name, newstream, structure)
|
59
|
+
result << child_node
|
60
|
+
parent_extracted_node.add_child child_node
|
61
|
+
yield child_node if block_given?
|
62
|
+
else
|
63
|
+
break
|
64
|
+
end
|
65
|
+
end
|
66
|
+
return result
|
42
67
|
end
|
43
68
|
|
44
69
|
private
|
@@ -50,22 +75,25 @@ module Ariel
|
|
50
75
|
when :closed
|
51
76
|
re_index=1
|
52
77
|
end
|
53
|
-
tokenstream.rewind
|
54
78
|
regex = self.label_regex(name.to_s)[re_index]
|
55
|
-
debug "Seeking #{name.to_s} of type #{type}"
|
79
|
+
Log.debug "Seeking #{name.to_s} of type #{type}"
|
56
80
|
nesting_level=0
|
57
81
|
tokenstream.each do |token|
|
58
|
-
if token.matches?(regex)
|
59
|
-
|
82
|
+
if token.matches?(regex) && nesting_level==0
|
83
|
+
Log.debug "Found a match"
|
84
|
+
return tokenstream.cur_pos
|
60
85
|
end
|
61
86
|
if token.matches?(self.label_regex[0])
|
62
|
-
|
63
|
-
|
87
|
+
# Don't increase nesting if encounter the unnested start tag that
|
88
|
+
# pairs with the end tag we're searching for.
|
89
|
+
nesting_level+=1 unless nesting_level==0 && token.matches?(self.label_regex(name.to_s)[0])
|
90
|
+
Log.debug "Encountered token \"#{token.text}\", nesting level=#{nesting_level}"
|
64
91
|
elsif token.matches?(self.label_regex[1])
|
65
|
-
nesting_level-=1
|
66
|
-
debug "Encountered token \"#{token.text}\", nesting level=#{nesting_level}"
|
92
|
+
nesting_level-=1 unless nesting_level==0 && token.matches?(self.label_regex(name.to_s)[1])
|
93
|
+
Log.debug "Encountered token \"#{token.text}\", nesting level=#{nesting_level}"
|
67
94
|
end
|
68
95
|
end
|
96
|
+
return nil
|
69
97
|
end
|
70
98
|
end
|
71
99
|
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module Ariel
|
2
|
+
|
3
|
+
# Provides methods that read an example document, using a Node::Structure tree
|
4
|
+
# to populate a tree of Nodes with each labeled example.
|
5
|
+
class LabeledDocumentLoader
|
6
|
+
|
7
|
+
class << self
|
8
|
+
|
9
|
+
# As its first argument it takes a root Node::Structure to which any
|
10
|
+
# learnt rules will be added. The following arguments are strings
|
11
|
+
# containing labeled examples for members of the passed Node::Structure
|
12
|
+
# tree. Ariel#learn is the preferred interface for rule-learning - this
|
13
|
+
# one may change.
|
14
|
+
def supervise_learning(structure, *labeled_strings)
|
15
|
+
raise ArgumentError, "No labeled strings were given" if labeled_strings.size==0
|
16
|
+
loaded_example_hash=process_labeled_strings(structure, *labeled_strings)
|
17
|
+
loaded_example_hash.each_pair do |structure_node, example_nodes|
|
18
|
+
if structure_node.node_type==:list_item
|
19
|
+
exhaustive=true
|
20
|
+
else
|
21
|
+
exhaustive=false
|
22
|
+
end
|
23
|
+
examples = collect_labeled_tokenstreams(example_nodes, :start)
|
24
|
+
Log.info "Learning #{"exhaustive " if exhaustive}rules for node #{structure_node.node_name} with #{example_nodes.size} examples"
|
25
|
+
learner = Learner.new(*examples)
|
26
|
+
start_rules = learner.learn_rule :forward, exhaustive
|
27
|
+
Log.info "Learnt start rules #{start_rules.inspect}"
|
28
|
+
examples = collect_labeled_tokenstreams(example_nodes, :end)
|
29
|
+
learner = Learner.new(*examples)
|
30
|
+
end_rules = learner.learn_rule :back, exhaustive
|
31
|
+
Log.info "Learnt end rules, #{end_rules.inspect}"
|
32
|
+
structure_node.ruleset=RuleSet.new(start_rules, end_rules)
|
33
|
+
end
|
34
|
+
return structure
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
# Processes the given labeled strings by creating a Node::Extracted tree.
|
39
|
+
# A hash is returned with each child of the passed Node::Structure as a key,
|
40
|
+
# and an array of the relevant extracted examples (as Node::Extracted
|
41
|
+
# objects).
|
42
|
+
def process_labeled_strings(structure, *labeled_strings)
|
43
|
+
loaded_example_hash = Hash.new {|h, k| h[k]=[]}
|
44
|
+
labeled_strings.each do |string|
|
45
|
+
tokenstream = TokenStream.new
|
46
|
+
tokenstream.tokenize(string, true)
|
47
|
+
root = Node::Extracted.new(:root, tokenstream, structure)
|
48
|
+
structure.apply_extraction_tree_on(root, true)
|
49
|
+
root.each_descendant(true) do |extracted_node|
|
50
|
+
if extracted_node.parent
|
51
|
+
loaded_example_hash[extracted_node.structure_node] << extracted_node
|
52
|
+
end
|
53
|
+
extracted_node.tokenstream.remove_label_tags
|
54
|
+
end
|
55
|
+
end
|
56
|
+
return loaded_example_hash
|
57
|
+
end
|
58
|
+
|
59
|
+
# Given an array of example nodes, will return an array of tokenstreams
|
60
|
+
# labeled for learning, at either the start or end. The example node
|
61
|
+
# passed are actually the nodes to be extracted. This method then looks up
|
62
|
+
# the parent, and labels their position in the parent so rules to extract
|
63
|
+
# the given node can be learnt. Type is either :start or :end
|
64
|
+
def collect_labeled_tokenstreams(example_nodes, type)
|
65
|
+
example_nodes.collect do |node|
|
66
|
+
tokenstream=node.parent.tokenstream #Rules are based on extracting from the parent
|
67
|
+
if type==:start
|
68
|
+
tokenstream.set_label_at(node.tokenstream.tokens.first.start_loc)
|
69
|
+
elsif type==:end
|
70
|
+
tokenstream.set_label_at(node.tokenstream.tokens.last.start_loc)
|
71
|
+
end
|
72
|
+
tokenstream
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
data/lib/ariel/learner.rb
CHANGED
@@ -15,7 +15,7 @@ module Ariel
|
|
15
15
|
if examples.any? {|example| example.label_index.nil?}
|
16
16
|
raise ArgumentError, "Passed a TokenStream with no label"
|
17
17
|
end
|
18
|
-
debug "ATTENTION: New Learner instantiated with #{examples.size} labeled examples"
|
18
|
+
Log.debug "ATTENTION: New Learner instantiated with #{examples.size} labeled examples"
|
19
19
|
@examples=examples
|
20
20
|
@candidates=[]
|
21
21
|
set_seed
|
@@ -25,22 +25,30 @@ module Ariel
|
|
25
25
|
# to use as its seed example, then finds a rule that matches the maximum
|
26
26
|
# number of examples correctly and fails on all others. All matched examples
|
27
27
|
# are then removed and the process is repeated considering all examples that
|
28
|
-
# remain. Returns an array of the rules found (in order).
|
29
|
-
|
30
|
-
|
28
|
+
# remain. Returns an array of the rules found (in order). learn_rule will
|
29
|
+
# take care of reversing the given examples if necessary.
|
30
|
+
def learn_rule(direction, exhaustive=false)
|
31
|
+
Log.debug "Searching for a #{direction} rule"
|
32
|
+
@examples=@examples.collect {|tokenstream| Rule.prepare_tokenstream(tokenstream, direction)}
|
31
33
|
@direction=direction
|
32
|
-
@
|
34
|
+
@exhaustive=exhaustive
|
35
|
+
if exhaustive
|
36
|
+
@examples.delete_if {|example| example_is_unsuitable?(example)}
|
37
|
+
raise StandardError, "No examples are suitable for exhaustive rule learning" if @examples.empty?
|
38
|
+
end
|
39
|
+
@current_rule=Rule.new([], direction, exhaustive)
|
33
40
|
combined_rules=[]
|
34
41
|
while not @examples.empty?
|
35
42
|
set_seed unless @examples.include? @current_seed
|
36
43
|
rule = find_best_rule() # Find the rule that matches the most examples and fails on the others
|
37
44
|
prev_size = @examples.size
|
38
45
|
@examples.delete_if {|example| rule.apply_to(example)} #separate and conquer!
|
39
|
-
debug "Removing #{prev_size - @examples.size} examples matched by the generated rule, #{@examples.size} remain"
|
46
|
+
Log.debug "Removing #{prev_size - @examples.size} examples matched by the generated rule, #{@examples.size} remain"
|
40
47
|
combined_rules << rule
|
41
48
|
end
|
42
49
|
# rule = order_rule(rule) #STALKER paper suggests that the generated rules should be ordered. This doesn't make sense, seeing as they are all generated based only on examples not matched by previous rules
|
43
|
-
debug "Generated rules: #{combined_rules.inspect}"
|
50
|
+
Log.debug "Generated rules: #{combined_rules.inspect}"
|
51
|
+
Rule.clear_cache
|
44
52
|
return combined_rules
|
45
53
|
end
|
46
54
|
|
@@ -49,7 +57,7 @@ module Ariel
|
|
49
57
|
def set_seed
|
50
58
|
sorted = @examples.sort_by {|example| example.label_index}
|
51
59
|
self.current_seed=sorted.first
|
52
|
-
debug "current_seed=#{current_seed.text}"
|
60
|
+
Log.debug "current_seed=#{current_seed.text}"
|
53
61
|
return current_seed
|
54
62
|
end
|
55
63
|
|
@@ -59,13 +67,13 @@ module Ariel
|
|
59
67
|
# token's text or any of its matching wildcards.
|
60
68
|
def generate_initial_candidates
|
61
69
|
if current_seed.label_index==0
|
62
|
-
@candidates << Rule.new(@direction)
|
70
|
+
@candidates << Rule.new([], @direction, @exhaustive)
|
63
71
|
else
|
64
72
|
end_token=current_seed.tokens[current_seed.label_index-1]
|
65
|
-
debug "Creating initial candidates based on #{end_token.text}"
|
66
|
-
@candidates<< Rule.new(
|
73
|
+
Log.debug "Creating initial candidates based on #{end_token.text}"
|
74
|
+
@candidates<< Rule.new([[end_token.text]], @direction, @exhaustive)
|
67
75
|
@candidates.concat(@candidates[0].generalise_feature(0))
|
68
|
-
debug "Initial candidates: #{@candidates.inspect} created"
|
76
|
+
Log.debug "Initial candidates: #{@candidates.inspect} created"
|
69
77
|
end
|
70
78
|
return @candidates.size
|
71
79
|
end
|
@@ -83,7 +91,7 @@ module Ariel
|
|
83
91
|
refine
|
84
92
|
end
|
85
93
|
# return post_process(best_solution)
|
86
|
-
debug "Rule found: #{best_solution.inspect}"
|
94
|
+
Log.debug "Rule found: #{best_solution.inspect}"
|
87
95
|
return best_solution
|
88
96
|
end
|
89
97
|
|
@@ -95,16 +103,14 @@ module Ariel
|
|
95
103
|
@examples.each do |example|
|
96
104
|
if rule.matches(example, :perfect)
|
97
105
|
perfect_count+=1
|
98
|
-
debug "#{rule.inspect} matches #{example.text} perfectly"
|
99
106
|
elsif rule.matches(example, :fail)
|
100
107
|
fail_count+=1
|
101
|
-
debug "#{rule.inspect} fails to match #{example.text}"
|
102
108
|
end
|
103
109
|
end
|
104
110
|
if (perfect_count >= 1) && (fail_count == (@examples.size - perfect_count))
|
105
111
|
return true
|
106
112
|
else
|
107
|
-
debug "Rule was not perfect, perfect_count=#{perfect_count}, fail_count=#{fail_count}"
|
113
|
+
Log.debug "Rule was not perfect, perfect_count=#{perfect_count}, fail_count=#{fail_count}"
|
108
114
|
return false
|
109
115
|
end
|
110
116
|
end
|
@@ -121,15 +127,15 @@ module Ariel
|
|
121
127
|
# document structure.
|
122
128
|
# * longer end landmarks - prefer "local context" landmarks.
|
123
129
|
def get_best_refiner
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
best_refiner =
|
132
|
-
debug "best_refiner found => #{best_refiner.inspect}"
|
130
|
+
r = CandidateRefiner.new(@candidates, @examples)
|
131
|
+
r.refine_by_match_type :early, :perfect #Discriminate on coverage
|
132
|
+
r.refine_by_match_type :early
|
133
|
+
r.refine_by_match_type :fail
|
134
|
+
r.refine_by_fewer_wildcards
|
135
|
+
r.refine_by_label_proximity
|
136
|
+
r.refine_by_longer_end_landmarks
|
137
|
+
best_refiner = r.random_from_remaining #just pick a random one for now if still multiple
|
138
|
+
Log.debug "best_refiner found => #{best_refiner.inspect}"
|
133
139
|
return best_refiner
|
134
140
|
end
|
135
141
|
|
@@ -141,14 +147,14 @@ module Ariel
|
|
141
147
|
# * longer end landmarks
|
142
148
|
# * shorter unconsumed prefixes
|
143
149
|
def get_best_solution
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
best_solution =
|
151
|
-
debug "best_solution found => #{best_solution.inspect}"
|
150
|
+
r = CandidateRefiner.new(@candidates, @examples)
|
151
|
+
r.refine_by_match_type :perfect
|
152
|
+
r.refine_by_match_type :fail
|
153
|
+
r.refine_by_fewer_wildcards
|
154
|
+
r.refine_by_label_proximity
|
155
|
+
r.refine_by_longer_end_landmarks
|
156
|
+
best_solution = r.random_from_remaining
|
157
|
+
Log.debug "best_solution found => #{best_solution.inspect}"
|
152
158
|
return best_solution
|
153
159
|
end
|
154
160
|
|
@@ -180,7 +186,7 @@ module Ariel
|
|
180
186
|
# alternative landmark extensions that use relevant wildcards.
|
181
187
|
def lengthen_landmark(landmark, index)
|
182
188
|
current_seed.rewind #In case apply_rule isn't called as index=0
|
183
|
-
result = @current_rule.partial(0..(index-1)).
|
189
|
+
result = @current_rule.partial(0..(index-1)).closest_match current_seed if index > 0 #Don't care about already matched tokens
|
184
190
|
return 0 unless result # Rule doesn't match, no point refining
|
185
191
|
refined_rules=[]
|
186
192
|
width = landmark.size
|
@@ -202,7 +208,7 @@ module Ariel
|
|
202
208
|
refined_rules.concat b.generalise_feature(index, -1)
|
203
209
|
end
|
204
210
|
@candidates.concat refined_rules
|
205
|
-
debug "#{refined_rules.size} landmark refinements generated"
|
211
|
+
Log.debug "#{refined_rules.size} landmark refinements generated"
|
206
212
|
return refined_rules.size
|
207
213
|
end
|
208
214
|
|
@@ -219,7 +225,7 @@ module Ariel
|
|
219
225
|
# is also done for each of that token's matching wildcards.
|
220
226
|
def add_new_landmarks(landmark, index)
|
221
227
|
topology_refs=[]
|
222
|
-
start_pos = current_rule.partial(0..index).
|
228
|
+
start_pos = current_rule.partial(0..index).closest_match(current_seed, :early)
|
223
229
|
end_pos = current_seed.label_index #No point adding tokens that occur after the label_index
|
224
230
|
current_seed.tokens[start_pos...end_pos].each do |token|
|
225
231
|
r=current_rule.deep_clone
|
@@ -227,11 +233,27 @@ module Ariel
|
|
227
233
|
topology_refs << r
|
228
234
|
topology_refs.concat r.generalise_feature(index+1)
|
229
235
|
end
|
230
|
-
debug "Topology refinements before uniq! #{topology_refs.size}"
|
236
|
+
Log.debug "Topology refinements before uniq! #{topology_refs.size}"
|
231
237
|
topology_refs.uniq!
|
232
238
|
@candidates.concat topology_refs
|
233
|
-
debug "#{topology_refs.size} topology refinements generated"
|
239
|
+
Log.debug "#{topology_refs.size} topology refinements generated"
|
234
240
|
return topology_refs.size
|
235
241
|
end
|
242
|
+
|
243
|
+
# When learning list iteration rules, some examples may be unsuitable. For
|
244
|
+
# instance if there is a list item at the start of an example with no tokens
|
245
|
+
# before it, a skip_to(nil) start rule would be generated that wouldn't make
|
246
|
+
# sense for exhaustive rules. The example should be caught by the
|
247
|
+
# corresponding end rule. This should only be run after tokenstreams have
|
248
|
+
# been prepared (reversed based on whether a :forward or :back rule is being
|
249
|
+
# searched for). Only returns a valid conclusion if the examples are
|
250
|
+
# intended to be used for exhaustive rule learning
|
251
|
+
def example_is_unsuitable?(tokenstream)
|
252
|
+
if tokenstream.label_index==0
|
253
|
+
return true
|
254
|
+
else
|
255
|
+
return false
|
256
|
+
end
|
257
|
+
end
|
236
258
|
end
|
237
259
|
end
|