ariel 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. data/README +49 -83
  2. data/bin/ariel +29 -20
  3. data/examples/google_calculator/structure.rb +2 -2
  4. data/examples/google_calculator/structure.yaml +13 -15
  5. data/examples/raa/labeled/highline.html +5 -4
  6. data/examples/raa/labeled/mongrel.html +9 -8
  7. data/examples/raa/structure.rb +4 -2
  8. data/examples/raa/structure.yaml +94 -78
  9. data/lib/ariel.rb +71 -33
  10. data/lib/ariel/{candidate_selector.rb → candidate_refiner.rb} +39 -38
  11. data/lib/ariel/label_utils.rb +46 -18
  12. data/lib/ariel/labeled_document_loader.rb +77 -0
  13. data/lib/ariel/learner.rb +60 -38
  14. data/lib/ariel/log.rb +67 -0
  15. data/lib/ariel/node.rb +52 -0
  16. data/lib/ariel/node/extracted.rb +90 -0
  17. data/lib/ariel/node/structure.rb +91 -0
  18. data/lib/ariel/rule.rb +114 -32
  19. data/lib/ariel/rule_set.rb +34 -15
  20. data/lib/ariel/token.rb +9 -3
  21. data/lib/ariel/token_stream.rb +32 -17
  22. data/lib/ariel/wildcards.rb +19 -15
  23. data/test/fixtures.rb +45 -3
  24. data/test/specs/candidate_refiner_spec.rb +48 -0
  25. data/test/specs/label_utils_spec.rb +97 -0
  26. data/test/specs/learner_spec.rb +39 -0
  27. data/test/specs/node_extracted_spec.rb +90 -0
  28. data/test/specs/node_spec.rb +76 -0
  29. data/test/specs/node_structure_spec.rb +74 -0
  30. data/test/specs/rule_set_spec.rb +85 -0
  31. data/test/specs/rule_spec.rb +110 -0
  32. data/test/specs/token_stream_spec.rb +100 -7
  33. metadata +21 -28
  34. data/lib/ariel/example_document_loader.rb +0 -59
  35. data/lib/ariel/extracted_node.rb +0 -20
  36. data/lib/ariel/node_like.rb +0 -26
  37. data/lib/ariel/structure_node.rb +0 -75
  38. data/test/ariel_test_case.rb +0 -15
  39. data/test/test_candidate_selector.rb +0 -58
  40. data/test/test_example_document_loader.rb +0 -7
  41. data/test/test_label_utils.rb +0 -15
  42. data/test/test_learner.rb +0 -38
  43. data/test/test_rule.rb +0 -38
  44. data/test/test_structure_node.rb +0 -81
  45. data/test/test_token.rb +0 -16
  46. data/test/test_token_stream.rb +0 -82
  47. data/test/test_wildcards.rb +0 -18
@@ -1,32 +1,16 @@
1
+ require 'ariel/log'
2
+ require 'ariel/wildcards'
3
+ require 'ariel/label_utils'
1
4
  require 'ariel/token'
2
5
  require 'ariel/token_stream'
3
6
  require 'ariel/learner'
4
- require 'ariel/node_like'
5
- require 'ariel/extracted_node'
6
- require 'ariel/structure_node'
7
+ require 'ariel/node/structure'
8
+ require 'ariel/node/extracted'
7
9
  require 'ariel/rule'
8
- require 'ariel/wildcards'
9
- require 'ariel/candidate_selector'
10
- require 'ariel/label_utils'
11
- require 'ariel/example_document_loader'
10
+ require 'ariel/candidate_refiner'
11
+ require 'ariel/labeled_document_loader'
12
12
  require 'ariel/rule_set'
13
13
 
14
- if $DEBUG
15
- # require 'logger'
16
-
17
- # DEBUGLOG = Logger.new(File.open('debug.log', 'wb'))
18
- # DEBUGLOG.datetime_format = " \010"
19
- # DEBUGLOG.progname = "\010\010\010"
20
-
21
- def debug(message)
22
- p message
23
- #DEBUGLOG.debug message
24
- end
25
- else
26
- def debug(message)
27
- end
28
- end
29
-
30
14
  # = Ariel - A Ruby Information Extraction Library
31
15
  # Ariel intends to assist in extracting information from semi-structured
32
16
  # documents including (but not in any way limited to) web pages. Although you
@@ -41,29 +25,83 @@ end
41
25
  # 1. Define a structure for the data you wish to extract. For example:
42
26
  #
43
27
  # @structure = Ariel::StructureNode.new do |r|
44
- # r.article do |a|
45
- # a.title
46
- # a.author
47
- # a.date
48
- # a.body
28
+ # r.item :article do |a|
29
+ # a.item :title
30
+ # a.item :author
31
+ # a.item :date
32
+ # a.item :body
49
33
  # end
50
- # r.comment_list do |c|
51
- # c.author
52
- # c.date
53
- # c.body
34
+ # r.list :comments do |c|
35
+ # c.list_item :comment do |c|
36
+ # c.item :author
37
+ # c.item :date
38
+ # c.item :body
39
+ # end
54
40
  # end
55
41
  # end
56
42
  # 2. Label these fields in a few example documents (normally at least 3).
57
43
  # Labels are in the form of <tt><l:label_name>...</l:label_name></tt>
58
44
  # 3. Ariel will read these examples, and try to generate suitable rules that can
59
- # be used to extract this data from other similarly structured documents.
45
+ # be used to extract this data from other similarly structured documents. Use
46
+ # Ariel#learn to initiate rule learning.
60
47
  # 4. A wrapper has been generated - we can now happily load documents with the
61
48
  # same structure (normally documents generated by the same rules, so
62
49
  # different pages from a single site perhaps) and query the extracted data.
50
+ # See Ariel#extract.
63
51
  module Ariel
64
52
 
53
+ class << self
54
+ # Given a root Node::Structure and a list of labeled_files (either IO objects
55
+ # or strings representing files that can be opened with File.read, will learn
56
+ # rules using the labeled examples. The passed Node::Structure tree is
57
+ # returned with new RuleSets added containing the learnt rules. This structure
58
+ # can now be used with Ariel#extract on unlabeled documents.
59
+ #
60
+ # <tt>Ariel.learn structure, 'file1.html', fileobj, 'file2.html'</tt>
61
+ def learn(structure, *labeled_files)
62
+ raise ArgumentError, "Passed structure is not the parent of the document tree" unless structure.parent.nil?
63
+ labeled_strings=collect_strings(labeled_files)
64
+ return LabeledDocumentLoader.supervise_learning(structure, *labeled_strings)
65
+ end
65
66
 
67
+ # Will use the given root Node::Structure to extract information from each of
68
+ # the given files (can be any object responding to #read, and if passed a
69
+ # string the parameter will be opened using File.read). If a block is given,
70
+ # each root Node::Extracted is yielded. An array of each root extracted node
71
+ # is returned.
72
+ #
73
+ # <tt>Ariel.extract structure, 'file1.txt', fileobj, 'file2.html' # =></tt> an
74
+ # array of 3 Node::Extracted objects
75
+ def extract(structure, *files_to_extract)
76
+ raise ArgumentError, "Passed structure is not the parent of the document tree" unless structure.parent.nil?
77
+ extractions=[]
78
+ collect_strings(files_to_extract).each do |string|
79
+ tokenstream = TokenStream.new
80
+ tokenstream.tokenize string
81
+ root_node=Ariel::Node::Extracted.new :root, tokenstream, structure
82
+ structure.apply_extraction_tree_on root_node
83
+ extractions << root_node
84
+ yield root_node if block_given?
85
+ end
86
+ return extractions
87
+ end
66
88
 
89
+ private
90
+ def collect_strings(files)
91
+ strings=[]
92
+ files.each do |file|
93
+ if file.kind_of? String
94
+ next unless File.file? file
95
+ strings << File.read(file)
96
+ elsif file.respond_to? :read
97
+ strings << file.read
98
+ else
99
+ raise ArgumentError, "Don't know how to handle #{file.inspect}"
100
+ end
101
+ end
102
+ return strings
103
+ end
104
+ end
67
105
  end
68
106
 
69
107
 
@@ -1,13 +1,13 @@
1
1
  module Ariel
2
2
 
3
3
  # Given an array of candidate Rules, and an array of LabeledStreams,
4
- # allows heuristics to be applied to select the ideal Rule. All select_* instance
4
+ # allows heuristics to be applied to select the ideal Rule. All refine_* instance
5
5
  # methods will remove candidates from the internal candidates array.
6
- class CandidateSelector
6
+ class CandidateRefiner
7
7
 
8
8
  attr_accessor :candidates
9
9
  def initialize(candidates, examples)
10
- @candidates=candidates.dup #Just in case a CandidateSelector function directly modifies the array, affecting the original. Shouldn't happen.
10
+ @candidates=candidates.dup #Just in case we directly modify the array. Shouldn't happen.
11
11
  @examples=examples
12
12
  end
13
13
 
@@ -15,8 +15,8 @@ module Ariel
15
15
  # against the given examples. e.g. select_best_by_match_type(:early, :perfect)
16
16
  # will select the rules that have the most matches that are early or
17
17
  # perfect.
18
- def select_best_by_match_type(*match_types)
19
- debug "Selecting best by match types #{match_types}"
18
+ def refine_by_match_type(*match_types)
19
+ Log.debug "Refining by match types #{match_types.inspect}"
20
20
  return @candidates if @candidates.size==1
21
21
  @candidates = highest_scoring_by do |rule|
22
22
  rule_score=0
@@ -28,44 +28,19 @@ module Ariel
28
28
  return @candidates
29
29
  end
30
30
 
31
- # All scoring functions use this indirectly. It iterates over each
32
- # Rule candidate, and assigns it a score in a hash of index:score pairs.
33
- # Each rule is yielded to the given block, which is expected to return that
34
- # rule's score.
35
- def score_by
36
- score_hash={}
37
- @candidates.each_with_index do |rule, index|
38
- score_hash[index]= yield rule
39
- end
40
- return score_hash
41
- end
42
-
43
- # Takes a scoring function as a block, and yields each rule to it. Returns
44
- # an array of the Rule candidates that have the highest score.
45
- def highest_scoring_by(&scorer)
46
- score_hash = score_by &scorer
47
- best_score = score_hash.values.sort.last
48
- highest_scorers=[]
49
- score_hash.each do |candidate_index, score|
50
- highest_scorers << @candidates[candidate_index] if score==best_score
51
- end
52
- debug "#{highest_scorers.size} highest_scorers were found, with a score of #{best_score}"
53
- return highest_scorers
54
- end
55
-
56
- def select_with_fewer_wildcards
57
- debug "Selecting the rules with the fewest wildcards"
31
+ def refine_by_fewer_wildcards
32
+ Log.debug "Refining to the rules with the fewest wildcards"
58
33
  @candidates = highest_scoring_by {|rule| -rule.wildcard_count} #hack or not?
59
34
  return @candidates
60
35
  end
61
36
 
62
- def select_closest_to_label
63
- debug "Selecting rules that match the examples closest to the label"
37
+ def refine_by_label_proximity
38
+ Log.debug "Selecting rules that match the examples closest to the label"
64
39
  @candidates = highest_scoring_by do |rule|
65
40
  rule_score=0
66
41
  matched_examples=0
67
42
  @examples.each do |example|
68
- match_index = rule.apply_to(example)
43
+ match_index = rule.closest_match(example)
69
44
  if match_index.nil?
70
45
  next
71
46
  else
@@ -79,16 +54,42 @@ module Ariel
79
54
  return @candidates
80
55
  end
81
56
 
82
- def select_with_longer_end_landmarks
83
- debug "Selecting rules that have longer end landmarks"
57
+ def refine_by_longer_end_landmarks
58
+ Log.debug "Selecting rules that have longer end landmarks"
84
59
  @candidates = highest_scoring_by {|rule| rule.landmarks.last.size unless rule.landmarks.last.nil?}
85
60
  end
86
61
 
87
62
  # Returns a random candidate. Meant for making the final choice in case
88
63
  # previous selections have still left multiple candidates.
89
64
  def random_from_remaining
90
- debug "Selecting random from last #{candidates.size} candidate rules"
65
+ Log.debug "Selecting random from last #{candidates.size} candidate rules"
91
66
  @candidates.sort_by {rand}.first
92
67
  end
68
+
69
+ private
70
+ # All scoring functions use this indirectly. It iterates over each
71
+ # Rule candidate, and assigns it a score in a hash of index:score pairs.
72
+ # Each rule is yielded to the given block, which is expected to return that
73
+ # rule's score.
74
+ def score_by
75
+ score_hash={}
76
+ @candidates.each_with_index do |rule, index|
77
+ score_hash[index]= yield rule
78
+ end
79
+ return score_hash
80
+ end
81
+
82
+ # Takes a scoring function as a block, and yields each rule to it. Returns
83
+ # an array of the Rule candidates that have the highest score.
84
+ def highest_scoring_by(&scorer)
85
+ score_hash = score_by &scorer
86
+ best_score = score_hash.values.sort.last
87
+ highest_scorers=[]
88
+ score_hash.each do |candidate_index, score|
89
+ highest_scorers << @candidates[candidate_index] if score==best_score
90
+ end
91
+ Log.debug "#{highest_scorers.size} highest_scorers were found, with a score of #{best_score}"
92
+ return highest_scorers
93
+ end
93
94
  end
94
95
  end
@@ -1,6 +1,6 @@
1
1
  module Ariel
2
2
 
3
- # A set of methods for use when dealing with strings from labeled documents
3
+ # A set of methods for use when dealing with strings from labeled documents.
4
4
  module LabelUtils
5
5
  S_LABEL="<"
6
6
  E_LABEL=">"
@@ -15,7 +15,7 @@ module Ariel
15
15
  /#{S_LABEL}\/#{namespace}:#{tag_contents}#{E_LABEL}/i]
16
16
  end
17
17
 
18
- # Helper function that returns a regex that will return any open or closing
18
+ # Helper function that returns a regex that will match any open or closing
19
19
  # label tags.
20
20
  def self.any_label_regex()
21
21
  Regexp.union(*self.label_regex)
@@ -28,17 +28,42 @@ module Ariel
28
28
  end
29
29
 
30
30
  # Extracts the labeled region representing the given structure node from the
31
- # parent_extracted_node. A new ExtractedNode is returned to be added as a
31
+ # parent_extracted_node. A new Node::Extracted is returned to be added as a
32
32
  # child to the parent_extracted_node. Used when loading labeled documents.
33
33
  def self.extract_labeled_region(structure, parent_extracted_node)
34
34
  tokenstream=parent_extracted_node.tokenstream
35
- start_idx=self.skip_to_label_tag(tokenstream, structure.meta.name, :open)
36
- end_idx=self.skip_to_label_tag(tokenstream.reverse, structure.meta.name, :closed)
37
- end_idx=tokenstream.reverse_pos end_idx
38
- newstream=tokenstream.slice_by_token_index(start_idx, end_idx)
39
- child_node=ExtractedNode.new(structure.meta.name, newstream, structure)
40
- parent_extracted_node.add_child child_node
41
- return child_node
35
+ start_idxs=[]
36
+ end_idxs=[]
37
+ tokenstream.rewind
38
+ while start_idx = self.skip_to_label_tag(tokenstream, structure.node_name, :open)
39
+ start_idxs << start_idx
40
+ break unless structure.node_type==:list_item
41
+ end
42
+ tokenstream.rewind
43
+ while end_idx=self.skip_to_label_tag(tokenstream, structure.node_name, :closed)
44
+ end_idxs << (end_idx -2) #rewind to token before the label tag token
45
+ break unless structure.node_type==:list_item
46
+ end
47
+ result=[]
48
+ i=0
49
+ start_idxs.zip(end_idxs) do |start_idx, end_idx|
50
+ if start_idx && end_idx && (start_idx <= end_idx)
51
+ newstream=tokenstream.slice_by_token_index(start_idx, end_idx)
52
+ if structure.node_type==:list_item
53
+ new_name="#{structure.node_name}_#{i}"
54
+ i+=1
55
+ else
56
+ new_name = structure.node_name
57
+ end
58
+ child_node = Node::Extracted.new(new_name, newstream, structure)
59
+ result << child_node
60
+ parent_extracted_node.add_child child_node
61
+ yield child_node if block_given?
62
+ else
63
+ break
64
+ end
65
+ end
66
+ return result
42
67
  end
43
68
 
44
69
  private
@@ -50,22 +75,25 @@ module Ariel
50
75
  when :closed
51
76
  re_index=1
52
77
  end
53
- tokenstream.rewind
54
78
  regex = self.label_regex(name.to_s)[re_index]
55
- debug "Seeking #{name.to_s} of type #{type}"
79
+ Log.debug "Seeking #{name.to_s} of type #{type}"
56
80
  nesting_level=0
57
81
  tokenstream.each do |token|
58
- if token.matches?(regex)
59
- return tokenstream.cur_pos if nesting_level==0
82
+ if token.matches?(regex) && nesting_level==0
83
+ Log.debug "Found a match"
84
+ return tokenstream.cur_pos
60
85
  end
61
86
  if token.matches?(self.label_regex[0])
62
- nesting_level+=1
63
- debug "Encountered token \"#{token.text}\", nesting level=#{nesting_level}"
87
+ # Don't increase nesting if we encounter the unnested start tag that
88
+ # pairs with the end tag we're searching for.
89
+ nesting_level+=1 unless nesting_level==0 && token.matches?(self.label_regex(name.to_s)[0])
90
+ Log.debug "Encountered token \"#{token.text}\", nesting level=#{nesting_level}"
64
91
  elsif token.matches?(self.label_regex[1])
65
- nesting_level-=1
66
- debug "Encountered token \"#{token.text}\", nesting level=#{nesting_level}"
92
+ nesting_level-=1 unless nesting_level==0 && token.matches?(self.label_regex(name.to_s)[1])
93
+ Log.debug "Encountered token \"#{token.text}\", nesting level=#{nesting_level}"
67
94
  end
68
95
  end
96
+ return nil
69
97
  end
70
98
  end
71
99
  end
@@ -0,0 +1,77 @@
1
+ module Ariel
2
+
3
+ # Provides methods that read an example document, using a Node::Structure tree
4
+ # to populate a tree of Nodes with each labeled example.
5
+ class LabeledDocumentLoader
6
+
7
+ class << self
8
+
9
+ # As its first argument it takes a root Node::Structure to which any
10
+ # learnt rules will be added. The following arguments are strings
11
+ # containing labeled examples for members of the passed Node::Structure
12
+ # tree. Ariel#learn is the preferred interface for rule-learning - this
13
+ # one may change.
14
+ def supervise_learning(structure, *labeled_strings)
15
+ raise ArgumentError, "No labeled strings were given" if labeled_strings.size==0
16
+ loaded_example_hash=process_labeled_strings(structure, *labeled_strings)
17
+ loaded_example_hash.each_pair do |structure_node, example_nodes|
18
+ if structure_node.node_type==:list_item
19
+ exhaustive=true
20
+ else
21
+ exhaustive=false
22
+ end
23
+ examples = collect_labeled_tokenstreams(example_nodes, :start)
24
+ Log.info "Learning #{"exhaustive " if exhaustive}rules for node #{structure_node.node_name} with #{example_nodes.size} examples"
25
+ learner = Learner.new(*examples)
26
+ start_rules = learner.learn_rule :forward, exhaustive
27
+ Log.info "Learnt start rules #{start_rules.inspect}"
28
+ examples = collect_labeled_tokenstreams(example_nodes, :end)
29
+ learner = Learner.new(*examples)
30
+ end_rules = learner.learn_rule :back, exhaustive
31
+ Log.info "Learnt end rules, #{end_rules.inspect}"
32
+ structure_node.ruleset=RuleSet.new(start_rules, end_rules)
33
+ end
34
+ return structure
35
+ end
36
+
37
+ private
38
+ # Processes the given labeled strings by creating a Node::Extracted tree.
39
+ # A hash is returned with each child of the passed Node::Structure as a key,
40
+ # and an array of the relevant extracted examples (as Node::Extracted
41
+ # objects).
42
+ def process_labeled_strings(structure, *labeled_strings)
43
+ loaded_example_hash = Hash.new {|h, k| h[k]=[]}
44
+ labeled_strings.each do |string|
45
+ tokenstream = TokenStream.new
46
+ tokenstream.tokenize(string, true)
47
+ root = Node::Extracted.new(:root, tokenstream, structure)
48
+ structure.apply_extraction_tree_on(root, true)
49
+ root.each_descendant(true) do |extracted_node|
50
+ if extracted_node.parent
51
+ loaded_example_hash[extracted_node.structure_node] << extracted_node
52
+ end
53
+ extracted_node.tokenstream.remove_label_tags
54
+ end
55
+ end
56
+ return loaded_example_hash
57
+ end
58
+
59
+ # Given an array of example nodes, will return an array of tokenstreams
60
+ # labeled for learning, at either the start or end. The example node
61
+ # passed are actually the nodes to be extracted. This method then looks up
62
+ # the parent, and labels their position in the parent so rules to extract
63
+ # the given node can be learnt. Type is either :start or :end
64
+ def collect_labeled_tokenstreams(example_nodes, type)
65
+ example_nodes.collect do |node|
66
+ tokenstream=node.parent.tokenstream #Rules are based on extracting from the parent
67
+ if type==:start
68
+ tokenstream.set_label_at(node.tokenstream.tokens.first.start_loc)
69
+ elsif type==:end
70
+ tokenstream.set_label_at(node.tokenstream.tokens.last.start_loc)
71
+ end
72
+ tokenstream
73
+ end
74
+ end
75
+ end
76
+ end
77
+ end
@@ -15,7 +15,7 @@ module Ariel
15
15
  if examples.any? {|example| example.label_index.nil?}
16
16
  raise ArgumentError, "Passed a TokenStream with no label"
17
17
  end
18
- debug "ATTENTION: New Learner instantiated with #{examples.size} labeled examples"
18
+ Log.debug "ATTENTION: New Learner instantiated with #{examples.size} labeled examples"
19
19
  @examples=examples
20
20
  @candidates=[]
21
21
  set_seed
@@ -25,22 +25,30 @@ module Ariel
25
25
  # to use as its seed example, then finds a rule that matches the maximum
26
26
  # number of examples correctly and fails on all others. All matched examples
27
27
  # are then removed and the process is repeated considering all examples that
28
- # remain. Returns an array of the rules found (in order).
29
- def learn_rule(direction)
30
- debug "Searching for a #{direction} rule"
28
+ # remain. Returns an array of the rules found (in order). learn_rule will
29
+ # take care of reversing the given examples if necessary.
30
+ def learn_rule(direction, exhaustive=false)
31
+ Log.debug "Searching for a #{direction} rule"
32
+ @examples=@examples.collect {|tokenstream| Rule.prepare_tokenstream(tokenstream, direction)}
31
33
  @direction=direction
32
- @current_rule=Rule.new(direction)
34
+ @exhaustive=exhaustive
35
+ if exhaustive
36
+ @examples.delete_if {|example| example_is_unsuitable?(example)}
37
+ raise StandardError, "No examples are suitable for exhaustive rule learning" if @examples.empty?
38
+ end
39
+ @current_rule=Rule.new([], direction, exhaustive)
33
40
  combined_rules=[]
34
41
  while not @examples.empty?
35
42
  set_seed unless @examples.include? @current_seed
36
43
  rule = find_best_rule() # Find the rule that matches the most examples and fails on the others
37
44
  prev_size = @examples.size
38
45
  @examples.delete_if {|example| rule.apply_to(example)} #separate and conquer!
39
- debug "Removing #{prev_size - @examples.size} examples matched by the generated rule, #{@examples.size} remain"
46
+ Log.debug "Removing #{prev_size - @examples.size} examples matched by the generated rule, #{@examples.size} remain"
40
47
  combined_rules << rule
41
48
  end
42
49
  # rule = order_rule(rule) #STALKER paper suggests that the generated rules should be ordered. This doesn't make sense, seeing as they are all generated based only on examples not matched by previous rules
43
- debug "Generated rules: #{combined_rules.inspect}"
50
+ Log.debug "Generated rules: #{combined_rules.inspect}"
51
+ Rule.clear_cache
44
52
  return combined_rules
45
53
  end
46
54
 
@@ -49,7 +57,7 @@ module Ariel
49
57
  def set_seed
50
58
  sorted = @examples.sort_by {|example| example.label_index}
51
59
  self.current_seed=sorted.first
52
- debug "current_seed=#{current_seed.text}"
60
+ Log.debug "current_seed=#{current_seed.text}"
53
61
  return current_seed
54
62
  end
55
63
 
@@ -59,13 +67,13 @@ module Ariel
59
67
  # token's text or any of its matching wildcards.
60
68
  def generate_initial_candidates
61
69
  if current_seed.label_index==0
62
- @candidates << Rule.new(@direction)
70
+ @candidates << Rule.new([], @direction, @exhaustive)
63
71
  else
64
72
  end_token=current_seed.tokens[current_seed.label_index-1]
65
- debug "Creating initial candidates based on #{end_token.text}"
66
- @candidates<< Rule.new(@direction, [[end_token.text]])
73
+ Log.debug "Creating initial candidates based on #{end_token.text}"
74
+ @candidates<< Rule.new([[end_token.text]], @direction, @exhaustive)
67
75
  @candidates.concat(@candidates[0].generalise_feature(0))
68
- debug "Initial candidates: #{@candidates.inspect} created"
76
+ Log.debug "Initial candidates: #{@candidates.inspect} created"
69
77
  end
70
78
  return @candidates.size
71
79
  end
@@ -83,7 +91,7 @@ module Ariel
83
91
  refine
84
92
  end
85
93
  # return post_process(best_solution)
86
- debug "Rule found: #{best_solution.inspect}"
94
+ Log.debug "Rule found: #{best_solution.inspect}"
87
95
  return best_solution
88
96
  end
89
97
 
@@ -95,16 +103,14 @@ module Ariel
95
103
  @examples.each do |example|
96
104
  if rule.matches(example, :perfect)
97
105
  perfect_count+=1
98
- debug "#{rule.inspect} matches #{example.text} perfectly"
99
106
  elsif rule.matches(example, :fail)
100
107
  fail_count+=1
101
- debug "#{rule.inspect} fails to match #{example.text}"
102
108
  end
103
109
  end
104
110
  if (perfect_count >= 1) && (fail_count == (@examples.size - perfect_count))
105
111
  return true
106
112
  else
107
- debug "Rule was not perfect, perfect_count=#{perfect_count}, fail_count=#{fail_count}"
113
+ Log.debug "Rule was not perfect, perfect_count=#{perfect_count}, fail_count=#{fail_count}"
108
114
  return false
109
115
  end
110
116
  end
@@ -121,15 +127,15 @@ module Ariel
121
127
  # document structure.
122
128
  # * longer end landmarks - prefer "local context" landmarks.
123
129
  def get_best_refiner
124
- selector = CandidateSelector.new(@candidates, @examples)
125
- selector.select_best_by_match_type :early, :perfect #Discriminate on coverage
126
- selector.select_best_by_match_type :early
127
- selector.select_best_by_match_type :fail
128
- selector.select_with_fewer_wildcards
129
- selector.select_closest_to_label
130
- selector.select_with_longer_end_landmarks
131
- best_refiner = selector.random_from_remaining #just pick a random one for now if still multiple
132
- debug "best_refiner found => #{best_refiner.inspect}"
130
+ r = CandidateRefiner.new(@candidates, @examples)
131
+ r.refine_by_match_type :early, :perfect #Discriminate on coverage
132
+ r.refine_by_match_type :early
133
+ r.refine_by_match_type :fail
134
+ r.refine_by_fewer_wildcards
135
+ r.refine_by_label_proximity
136
+ r.refine_by_longer_end_landmarks
137
+ best_refiner = r.random_from_remaining #just pick a random one for now if still multiple
138
+ Log.debug "best_refiner found => #{best_refiner.inspect}"
133
139
  return best_refiner
134
140
  end
135
141
 
@@ -141,14 +147,14 @@ module Ariel
141
147
  # * longer end landmarks
142
148
  # * shorter unconsumed prefixes
143
149
  def get_best_solution
144
- selector = CandidateSelector.new(@candidates, @examples)
145
- selector.select_best_by_match_type :perfect
146
- selector.select_best_by_match_type :fail
147
- selector.select_with_fewer_wildcards
148
- selector.select_closest_to_label
149
- selector.select_with_longer_end_landmarks
150
- best_solution = selector.random_from_remaining
151
- debug "best_solution found => #{best_solution.inspect}"
150
+ r = CandidateRefiner.new(@candidates, @examples)
151
+ r.refine_by_match_type :perfect
152
+ r.refine_by_match_type :fail
153
+ r.refine_by_fewer_wildcards
154
+ r.refine_by_label_proximity
155
+ r.refine_by_longer_end_landmarks
156
+ best_solution = r.random_from_remaining
157
+ Log.debug "best_solution found => #{best_solution.inspect}"
152
158
  return best_solution
153
159
  end
154
160
 
@@ -180,7 +186,7 @@ module Ariel
180
186
  # alternative landmark extensions that use relevant wildcards.
181
187
  def lengthen_landmark(landmark, index)
182
188
  current_seed.rewind #In case apply_rule isn't called as index=0
183
- result = @current_rule.partial(0..(index-1)).apply_to current_seed if index > 0 #Don't care about already matched tokens
189
+ result = @current_rule.partial(0..(index-1)).closest_match current_seed if index > 0 #Don't care about already matched tokens
184
190
  return 0 unless result # Rule doesn't match, no point refining
185
191
  refined_rules=[]
186
192
  width = landmark.size
@@ -202,7 +208,7 @@ module Ariel
202
208
  refined_rules.concat b.generalise_feature(index, -1)
203
209
  end
204
210
  @candidates.concat refined_rules
205
- debug "#{refined_rules.size} landmark refinements generated"
211
+ Log.debug "#{refined_rules.size} landmark refinements generated"
206
212
  return refined_rules.size
207
213
  end
208
214
 
@@ -219,7 +225,7 @@ module Ariel
219
225
  # is also done for each of that token's matching wildcards.
220
226
  def add_new_landmarks(landmark, index)
221
227
  topology_refs=[]
222
- start_pos = current_rule.partial(0..index).apply_to(current_seed)
228
+ start_pos = current_rule.partial(0..index).closest_match(current_seed, :early)
223
229
  end_pos = current_seed.label_index #No point adding tokens that occur after the label_index
224
230
  current_seed.tokens[start_pos...end_pos].each do |token|
225
231
  r=current_rule.deep_clone
@@ -227,11 +233,27 @@ module Ariel
227
233
  topology_refs << r
228
234
  topology_refs.concat r.generalise_feature(index+1)
229
235
  end
230
- debug "Topology refinements before uniq! #{topology_refs.size}"
236
+ Log.debug "Topology refinements before uniq! #{topology_refs.size}"
231
237
  topology_refs.uniq!
232
238
  @candidates.concat topology_refs
233
- debug "#{topology_refs.size} topology refinements generated"
239
+ Log.debug "#{topology_refs.size} topology refinements generated"
234
240
  return topology_refs.size
235
241
  end
242
+
243
+ # When learning list iteration rules, some examples may be unsuitable. For
244
+ # instance if there is a list item at the start of an example with no tokens
245
+ # before it, a skip_to(nil) start rule would be generated that wouldn't make
246
+ # sense for exhaustive rules. The example should be caught by the
247
+ # corresponding end rule. This should only be run after tokenstreams have
248
+ # been prepared (reversed based on whether a :forward or :back rule is being
249
+ # searched for). Only returns a valid conclusion if the examples are
250
+ # intended to be used for exhaustive rule learning
251
+ def example_is_unsuitable?(tokenstream)
252
+ if tokenstream.label_index==0
253
+ return true
254
+ else
255
+ return false
256
+ end
257
+ end
236
258
  end
237
259
  end