RubyGems - ariel - Versions diffs - 0.0.1 → 0.1.0 - Mend

ariel 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

data/README +49 -83
data/bin/ariel +29 -20
data/examples/google_calculator/structure.rb +2 -2
data/examples/google_calculator/structure.yaml +13 -15
data/examples/raa/labeled/highline.html +5 -4
data/examples/raa/labeled/mongrel.html +9 -8
data/examples/raa/structure.rb +4 -2
data/examples/raa/structure.yaml +94 -78
data/lib/ariel.rb +71 -33
data/lib/ariel/{candidate_selector.rb → candidate_refiner.rb} +39 -38
data/lib/ariel/label_utils.rb +46 -18
data/lib/ariel/labeled_document_loader.rb +77 -0
data/lib/ariel/learner.rb +60 -38
data/lib/ariel/log.rb +67 -0
data/lib/ariel/node.rb +52 -0
data/lib/ariel/node/extracted.rb +90 -0
data/lib/ariel/node/structure.rb +91 -0
data/lib/ariel/rule.rb +114 -32
data/lib/ariel/rule_set.rb +34 -15
data/lib/ariel/token.rb +9 -3
data/lib/ariel/token_stream.rb +32 -17
data/lib/ariel/wildcards.rb +19 -15
data/test/fixtures.rb +45 -3
data/test/specs/candidate_refiner_spec.rb +48 -0
data/test/specs/label_utils_spec.rb +97 -0
data/test/specs/learner_spec.rb +39 -0
data/test/specs/node_extracted_spec.rb +90 -0
data/test/specs/node_spec.rb +76 -0
data/test/specs/node_structure_spec.rb +74 -0
data/test/specs/rule_set_spec.rb +85 -0
data/test/specs/rule_spec.rb +110 -0
data/test/specs/token_stream_spec.rb +100 -7
metadata +21 -28
data/lib/ariel/example_document_loader.rb +0 -59
data/lib/ariel/extracted_node.rb +0 -20
data/lib/ariel/node_like.rb +0 -26
data/lib/ariel/structure_node.rb +0 -75
data/test/ariel_test_case.rb +0 -15
data/test/test_candidate_selector.rb +0 -58
data/test/test_example_document_loader.rb +0 -7
data/test/test_label_utils.rb +0 -15
data/test/test_learner.rb +0 -38
data/test/test_rule.rb +0 -38
data/test/test_structure_node.rb +0 -81
data/test/test_token.rb +0 -16
data/test/test_token_stream.rb +0 -82
data/test/test_wildcards.rb +0 -18

data/lib/ariel.rb CHANGED

@@ -1,32 +1,16 @@
+require 'ariel/log'
+require 'ariel/wildcards'
+require 'ariel/label_utils'
 require 'ariel/token'
 require 'ariel/token_stream'
 require 'ariel/learner'
-require 'ariel/node_like'
-require 'ariel/extracted_node'
-require 'ariel/structure_node'
+require 'ariel/node/structure'
+require 'ariel/node/extracted'
 require 'ariel/rule'
-require 'ariel/wildcards'
-require 'ariel/candidate_selector'
-require 'ariel/label_utils'
-require 'ariel/example_document_loader'
+require 'ariel/candidate_refiner'
+require 'ariel/labeled_document_loader'
 require 'ariel/rule_set'
-if $DEBUG
-#  require 'logger'
-#  DEBUGLOG = Logger.new(File.open('debug.log', 'wb'))
-#  DEBUGLOG.datetime_format = " \010"
-#  DEBUGLOG.progname = "\010\010\010"
-  def debug(message)
-     p message
-    #DEBUGLOG.debug message
-  end
-else
-  def debug(message)
-  end
-end
 # = Ariel - A Ruby Information Extraction Library
 # Ariel intends to assist in extracting information from semi-structured
 # documents including (but not in any way limited to) web pages. Although you
@@ -41,29 +25,83 @@ end
 # 1. Define a structure for the data you wish to extract. For example:
 #
 #     @structure = Ariel::StructureNode.new do |r|
-#       r.article do |a|
-#         a.title
-#         a.author
-#         a.date
-#         a.body
+#       r.item :article do |a|
+#         a.item :title
+#         a.item :author
+#         a.item :date
+#         a.item :body
 #       end
-#       r.comment_list do |c|
-#         c.author
-#         c.date
-#         c.body
+#       r.list :comments do |c|
+#         c.list_item :comment do |c|
+#           c.item :author
+#           c.item :date
+#           c.item :body
+#         end
 #       end
 #     end
 # 2. Label these fields in a few example documents (normally at least 3).
 #    Labels are in the form of <tt><l:label_name>...</l:label_name></tt>
 # 3. Ariel will read these examples, and try to generate suitable rules that can
-#    be used to extract this data from other similarly structured documents.
+#    be used to extract this data from other similarly structured documents. Use
+#    Ariel#learn to initiate rule learning.
 # 4. A wrapper has been generated - we can now happily load documents with the
 #    same structure (normally documents generated by the same rules, so
 #    different pages from a single site perhaps) and query the extracted data.
+#    See Ariel#extract.
 module Ariel
+  class << self
+    # Given a root Node::Structure and a list of labeled_files (either IO objects
+    # or strings representing files that can be opened with File.read), will learn
+    # rules using the labeled examples. The passed Node::Structure tree is
+    # returned with new RuleSets added containing the learnt rules. This structure
+    # can now be used with Ariel#extract on unlabeled documents.
+    #
+    # <tt>Ariel.learn structure, 'file1.html', fileobj, 'file2.html'</tt>
+    def learn(structure, *labeled_files)
+      raise ArgumentError, "Passed structure is not the parent of the document tree" unless structure.parent.nil?
+      labeled_strings=collect_strings(labeled_files)
+      return LabeledDocumentLoader.supervise_learning(structure, *labeled_strings)
+    end
+    # Will use the given root Node::Structure to extract information from each of
+    # the given files (can be any object responding to #read, and if passed a
+    # string the parameter will be opened using File.read). If a block is given,
+    # each root Node::Extracted is yielded. An array of each root extracted node
+    # is returned.
+    #
+    # <tt>Ariel.extract structure, 'file1.txt', fileobj, 'file2.html'  # =></tt> an
+    # array of 3 Node::Extracted objects
+    def extract(structure, *files_to_extract)
+      raise ArgumentError, "Passed structure is not the parent of the document tree" unless structure.parent.nil?
+      extractions=[]
+      collect_strings(files_to_extract).each do |string|
+        tokenstream = TokenStream.new
+        tokenstream.tokenize string
+        root_node=Ariel::Node::Extracted.new :root, tokenstream, structure
+        structure.apply_extraction_tree_on root_node
+        extractions << root_node
+        yield root_node if block_given?
+      end
+      return extractions
+    end
+    private
+    def collect_strings(files)
+      strings=[]
+      files.each do |file|
+        if file.kind_of? String
+          next unless File.file? file
+          strings << File.read(file)
+        elsif file.respond_to? :read
+          strings << file.read
+        else
+          raise ArgumentError, "Don't know how to handle #{file.inspect}"
+        end
+      end
+      return strings
+    end
+  end
 end

data/lib/ariel/{candidate_selector.rb → candidate_refiner.rb} RENAMED

@@ -1,13 +1,13 @@
 module Ariel
   # Given an array of candidate Rules, and an array of LabeledStreams,
-  # allows heuristics to be applied to select the ideal Rule. All select_* instance
+  # allows heuristics to be applied to select the ideal Rule. All refine_* instance
   # methods will remove candidates from the internal candidates array.
-  class CandidateSelector
+  class CandidateRefiner
     attr_accessor :candidates
     def initialize(candidates, examples)
-      @candidates=candidates.dup #Just in case a CandidateSelector function directly modifies the array, affecting the original. Shouldn't happen.
+      @candidates=candidates.dup #Just in case we directly modify the array. Shouldn't happen.
       @examples=examples
     end
@@ -15,8 +15,8 @@ module Ariel
     # against the given examples. e.g. refine_by_match_type(:early, :perfect)
     # will select the rules that have the most matches that are early or
     # perfect.
-    def select_best_by_match_type(*match_types)
-      debug "Selecting best by match types #{match_types}"
+    def refine_by_match_type(*match_types)
+      Log.debug "Refining by match types #{match_types.inspect}"
       return @candidates if @candidates.size==1
       @candidates = highest_scoring_by do |rule|
         rule_score=0
@@ -28,44 +28,19 @@ module Ariel
       return @candidates
     end
-    # All scoring functions use this indirectly. It iterates over each
-    # Rule candidate, and assigns it a score in a hash of index:score pairs.
-    # Each rule is yielded to the given block, which is expected to return that
-    # rule's score.
-    def score_by
-      score_hash={}
-      @candidates.each_with_index do |rule, index|
-        score_hash[index]= yield rule
-      end
-      return score_hash
-    end
-    # Takes a scoring function as a block, and yields each rule to it. Returns
-    # an array of the Rule candidates that have the highest score.
-    def highest_scoring_by(&scorer)
-      score_hash = score_by &scorer
-      best_score = score_hash.values.sort.last
-      highest_scorers=[]
-      score_hash.each do |candidate_index, score|
-        highest_scorers << @candidates[candidate_index] if score==best_score
-      end
-      debug "#{highest_scorers.size} highest_scorers were found, with a score of #{best_score}"
-      return highest_scorers
-    end
-    def select_with_fewer_wildcards
-      debug "Selecting the rules with the fewest wildcards"
+    def refine_by_fewer_wildcards
+      Log.debug "Refining to the rules with the fewest wildcards"
       @candidates = highest_scoring_by {|rule| -rule.wildcard_count} #hack or not?
       return @candidates
     end
-    def select_closest_to_label
-      debug "Selecting rules that match the examples closest to the label"
+    def refine_by_label_proximity
+      Log.debug "Selecting rules that match the examples closest to the label"
       @candidates = highest_scoring_by do |rule|
         rule_score=0
         matched_examples=0
         @examples.each do |example|
-          match_index = rule.apply_to(example)
+          match_index = rule.closest_match(example)
           if match_index.nil?
             next
           else
@@ -79,16 +54,42 @@ module Ariel
       return @candidates
     end
-    def select_with_longer_end_landmarks
-      debug "Selecting rules that have longer end landmarks"
+    def refine_by_longer_end_landmarks
+      Log.debug "Selecting rules that have longer end landmarks"
       @candidates = highest_scoring_by {|rule| rule.landmarks.last.size unless rule.landmarks.last.nil?}
     end
     # Returns a random candidate. Meant for making the final choice in case
     # previous selections have still left multiple candidates.
     def random_from_remaining
-      debug "Selecting random from last #{candidates.size} candidate rules"
+      Log.debug "Selecting random from last #{candidates.size} candidate rules"
       @candidates.sort_by {rand}.first
     end
+    private
+    # All scoring functions use this indirectly. It iterates over each
+    # Rule candidate, and assigns it a score in a hash of index:score pairs.
+    # Each rule is yielded to the given block, which is expected to return that
+    # rule's score.
+    def score_by
+      score_hash={}
+      @candidates.each_with_index do |rule, index|
+        score_hash[index]= yield rule
+      end
+      return score_hash
+    end
+    # Takes a scoring function as a block, and yields each rule to it. Returns
+    # an array of the Rule candidates that have the highest score.
+    def highest_scoring_by(&scorer)
+      score_hash = score_by &scorer
+      best_score = score_hash.values.sort.last
+      highest_scorers=[]
+      score_hash.each do |candidate_index, score|
+        highest_scorers << @candidates[candidate_index] if score==best_score
+      end
+      Log.debug "#{highest_scorers.size} highest_scorers were found, with a score of #{best_score}"
+      return highest_scorers
+    end
   end
 end

data/lib/ariel/label_utils.rb CHANGED

@@ -1,6 +1,6 @@
 module Ariel
-  # A set of methods for use when dealing with strings from labeled documents
+  # A set of methods for use when dealing with strings from labeled documents.
   module LabelUtils
     S_LABEL="<"
     E_LABEL=">"
@@ -15,7 +15,7 @@ module Ariel
       /#{S_LABEL}\/#{namespace}:#{tag_contents}#{E_LABEL}/i]
     end
-    # Helper function that returns a regex that will return any open or closing
+    # Helper function that returns a regex that will match any open or closing
     # label tags.
     def self.any_label_regex()
       Regexp.union(*self.label_regex)
@@ -28,17 +28,42 @@ module Ariel
     end
     # Extracts the labeled region representing the given structure node from the
-    # parent_extracted_node. A new ExtractedNode is returned to be added as a
+    # parent_extracted_node. A new Node::Extracted is returned to be added as a
     # child to the parent_extracted_node. Used when loading labeled documents.
     def self.extract_labeled_region(structure, parent_extracted_node)
       tokenstream=parent_extracted_node.tokenstream
-      start_idx=self.skip_to_label_tag(tokenstream, structure.meta.name, :open)
-      end_idx=self.skip_to_label_tag(tokenstream.reverse, structure.meta.name, :closed)
-      end_idx=tokenstream.reverse_pos end_idx
-      newstream=tokenstream.slice_by_token_index(start_idx, end_idx)
-      child_node=ExtractedNode.new(structure.meta.name, newstream, structure)
-      parent_extracted_node.add_child child_node
-      return child_node
+      start_idxs=[]
+      end_idxs=[]
+      tokenstream.rewind
+      while start_idx = self.skip_to_label_tag(tokenstream, structure.node_name, :open)
+        start_idxs << start_idx
+        break unless structure.node_type==:list_item
+      end
+      tokenstream.rewind
+      while end_idx=self.skip_to_label_tag(tokenstream, structure.node_name, :closed)
+        end_idxs << (end_idx -2) #rewind to token before the label tag token
+        break unless structure.node_type==:list_item
+      end
+      result=[]
+      i=0
+      start_idxs.zip(end_idxs) do |start_idx, end_idx|
+        if start_idx && end_idx && (start_idx <= end_idx)
+          newstream=tokenstream.slice_by_token_index(start_idx, end_idx)
+          if structure.node_type==:list_item
+            new_name="#{structure.node_name}_#{i}"
+            i+=1
+          else
+            new_name = structure.node_name
+          end
+          child_node = Node::Extracted.new(new_name, newstream, structure)
+          result << child_node
+          parent_extracted_node.add_child child_node
+          yield child_node if block_given?
+        else
+          break
+        end
+      end
+      return result
     end
     private
@@ -50,22 +75,25 @@ module Ariel
       when :closed
         re_index=1
       end
-      tokenstream.rewind
       regex = self.label_regex(name.to_s)[re_index]
-      debug "Seeking #{name.to_s} of type #{type}"
+      Log.debug "Seeking #{name.to_s} of type #{type}"
       nesting_level=0
       tokenstream.each do |token|
-        if token.matches?(regex)
-          return tokenstream.cur_pos if nesting_level==0
+        if token.matches?(regex) && nesting_level==0
+          Log.debug "Found a match"
+          return tokenstream.cur_pos
         end
         if token.matches?(self.label_regex[0])
-          nesting_level+=1
-          debug "Encountered token \"#{token.text}\", nesting level=#{nesting_level}"
+          # Don't increase nesting if we encounter the unnested start tag that
+          # pairs with the end tag we're searching for.
+          nesting_level+=1 unless nesting_level==0 && token.matches?(self.label_regex(name.to_s)[0])
+          Log.debug "Encountered token \"#{token.text}\", nesting level=#{nesting_level}"
         elsif token.matches?(self.label_regex[1])
-          nesting_level-=1
-          debug "Encountered token \"#{token.text}\", nesting level=#{nesting_level}"
+          nesting_level-=1 unless nesting_level==0 && token.matches?(self.label_regex(name.to_s)[1])
+          Log.debug "Encountered token \"#{token.text}\", nesting level=#{nesting_level}"
         end
       end
+      return nil
     end
   end
 end

data/lib/ariel/labeled_document_loader.rb ADDED

@@ -0,0 +1,77 @@
+module Ariel
+  # Provides methods that read an example document, using a Node::Structure tree
+  # to populate a tree of Nodes with each labeled example.
+  class LabeledDocumentLoader
+    class << self
+      # As its first argument it takes a root Node::Structure to which any
+      # learnt rules will be added. The following arguments are strings
+      # containing labeled examples for members of the passed Node::Structure
+      # tree. Ariel#learn is the preferred interface for rule-learning - this
+      # one may change.
+      def supervise_learning(structure, *labeled_strings)
+        raise ArgumentError, "No labeled strings were given" if labeled_strings.size==0
+        loaded_example_hash=process_labeled_strings(structure, *labeled_strings)
+        loaded_example_hash.each_pair do |structure_node, example_nodes|
+          if structure_node.node_type==:list_item
+            exhaustive=true
+          else
+            exhaustive=false
+          end
+          examples = collect_labeled_tokenstreams(example_nodes, :start)
+          Log.info "Learning #{"exhaustive " if exhaustive}rules for node #{structure_node.node_name} with #{example_nodes.size} examples"
+          learner = Learner.new(*examples)
+          start_rules = learner.learn_rule :forward, exhaustive
+          Log.info "Learnt start rules #{start_rules.inspect}"
+          examples = collect_labeled_tokenstreams(example_nodes, :end)
+          learner = Learner.new(*examples)
+          end_rules = learner.learn_rule :back, exhaustive
+          Log.info "Learnt end rules, #{end_rules.inspect}"
+          structure_node.ruleset=RuleSet.new(start_rules, end_rules)
+        end
+        return structure
+      end
+      private
+      # Processes the given labeled strings by creating a Node::Extracted tree.
+      # A hash is returned with each child of the passed Node::Structure as a key,
+      # and an array of the relevant extracted examples (as Node::Extracted
+      # objects).
+      def process_labeled_strings(structure, *labeled_strings)
+        loaded_example_hash = Hash.new {|h, k| h[k]=[]}
+        labeled_strings.each do |string|
+          tokenstream = TokenStream.new
+          tokenstream.tokenize(string, true)
+          root = Node::Extracted.new(:root, tokenstream, structure)
+          structure.apply_extraction_tree_on(root, true)
+          root.each_descendant(true) do |extracted_node|
+            if extracted_node.parent
+              loaded_example_hash[extracted_node.structure_node] << extracted_node
+            end
+            extracted_node.tokenstream.remove_label_tags
+          end
+        end
+        return loaded_example_hash
+      end
+      # Given an array of example nodes, will return an array of tokenstreams
+      # labeled for learning, at either the start or end. The example nodes
+      # passed are actually the nodes to be extracted. This method then looks up
+      # the parent, and labels their position in the parent so rules to extract
+      # the given node can be learnt. Type is either :start or :end
+			def collect_labeled_tokenstreams(example_nodes, type)
+        example_nodes.collect do |node|
+          tokenstream=node.parent.tokenstream #Rules are based on extracting from the parent
+          if type==:start
+            tokenstream.set_label_at(node.tokenstream.tokens.first.start_loc)
+          elsif type==:end
+            tokenstream.set_label_at(node.tokenstream.tokens.last.start_loc)
+          end
+          tokenstream
+        end
+      end
+    end
+  end
+end

data/lib/ariel/learner.rb CHANGED

@@ -15,7 +15,7 @@ module Ariel
       if examples.any? {|example| example.label_index.nil?}
         raise ArgumentError, "Passed a TokenStream with no label"
       end
-      debug "ATTENTION: New Learner instantiated with #{examples.size} labeled examples"
+      Log.debug "ATTENTION: New Learner instantiated with #{examples.size} labeled examples"
       @examples=examples
       @candidates=[]
       set_seed
@@ -25,22 +25,30 @@ module Ariel
     # to use as its seed example, then finds a rule that matches the maximum
     # number of examples correctly and fails on all others. All matched examples
     # are then removed and the process is repeated considering all examples that
-    # remain. Returns an array of the rules found (in order).
-    def learn_rule(direction)
-      debug "Searching for a #{direction} rule"
+    # remain. Returns an array of the rules found (in order). learn_rule will
+    # take care of reversing the given examples if necessary.
+    def learn_rule(direction, exhaustive=false)
+      Log.debug "Searching for a #{direction} rule"
+      @examples=@examples.collect {|tokenstream| Rule.prepare_tokenstream(tokenstream, direction)}
       @direction=direction
-      @current_rule=Rule.new(direction)
+      @exhaustive=exhaustive
+      if exhaustive
+        @examples.delete_if {|example| example_is_unsuitable?(example)}
+        raise StandardError, "No examples are suitable for exhaustive rule learning" if @examples.empty?
+      end
+      @current_rule=Rule.new([], direction, exhaustive)
       combined_rules=[]
       while not @examples.empty?
         set_seed unless @examples.include? @current_seed
         rule = find_best_rule() # Find the rule that matches the most examples and fails on the others
         prev_size = @examples.size
         @examples.delete_if {|example| rule.apply_to(example)} #separate and conquer!
-        debug "Removing #{prev_size - @examples.size} examples matched by the generated rule, #{@examples.size} remain"
+        Log.debug "Removing #{prev_size - @examples.size} examples matched by the generated rule, #{@examples.size} remain"
         combined_rules << rule
       end
 #      rule = order_rule(rule) #STALKER paper suggests that the generated rules should be ordered. This doesn't make sense, seeing as they are all generated based only on examples not matched by previous rules
-      debug "Generated rules: #{combined_rules.inspect}"
+      Log.debug "Generated rules: #{combined_rules.inspect}"
+      Rule.clear_cache
       return combined_rules
     end
@@ -49,7 +57,7 @@ module Ariel
     def set_seed
       sorted = @examples.sort_by {|example| example.label_index}
       self.current_seed=sorted.first
-      debug "current_seed=#{current_seed.text}"
+      Log.debug "current_seed=#{current_seed.text}"
       return current_seed
     end
@@ -59,13 +67,13 @@ module Ariel
     # token's text or any of its matching wildcards.
     def generate_initial_candidates
       if current_seed.label_index==0
-        @candidates << Rule.new(@direction)
+        @candidates << Rule.new([], @direction, @exhaustive)
       else
         end_token=current_seed.tokens[current_seed.label_index-1]
-        debug "Creating initial candidates based on #{end_token.text}"
-        @candidates<< Rule.new(@direction, [[end_token.text]])
+        Log.debug "Creating initial candidates based on #{end_token.text}"
+        @candidates<< Rule.new([[end_token.text]], @direction, @exhaustive)
         @candidates.concat(@candidates[0].generalise_feature(0))
-        debug "Initial candidates: #{@candidates.inspect} created"
+        Log.debug "Initial candidates: #{@candidates.inspect} created"
       end
       return @candidates.size
     end
@@ -83,7 +91,7 @@ module Ariel
         refine
       end
 #     return post_process(best_solution)
-      debug "Rule found: #{best_solution.inspect}"
+      Log.debug "Rule found: #{best_solution.inspect}"
       return best_solution
     end
@@ -95,16 +103,14 @@ module Ariel
       @examples.each do |example|
         if rule.matches(example, :perfect)
           perfect_count+=1
-          debug "#{rule.inspect} matches #{example.text} perfectly"
         elsif rule.matches(example, :fail)
           fail_count+=1
-          debug "#{rule.inspect} fails to match #{example.text}"
         end
       end
       if (perfect_count >= 1) && (fail_count == (@examples.size - perfect_count))
         return true
       else
-        debug "Rule was not perfect, perfect_count=#{perfect_count}, fail_count=#{fail_count}"
+        Log.debug "Rule was not perfect, perfect_count=#{perfect_count}, fail_count=#{fail_count}"
         return false
       end
     end
@@ -121,15 +127,15 @@ module Ariel
     #   document structure.
     # * longer end landmarks - prefer "local context" landmarks.
     def get_best_refiner
-      selector = CandidateSelector.new(@candidates, @examples)
-      selector.select_best_by_match_type :early, :perfect #Discriminate on coverage
-      selector.select_best_by_match_type :early
-      selector.select_best_by_match_type :fail
-      selector.select_with_fewer_wildcards
-      selector.select_closest_to_label
-      selector.select_with_longer_end_landmarks
-      best_refiner = selector.random_from_remaining #just pick a random one for now if still multiple
-      debug "best_refiner found => #{best_refiner.inspect}"
+      r = CandidateRefiner.new(@candidates, @examples)
+      r.refine_by_match_type :early, :perfect #Discriminate on coverage
+      r.refine_by_match_type :early
+      r.refine_by_match_type :fail
+      r.refine_by_fewer_wildcards
+      r.refine_by_label_proximity
+      r.refine_by_longer_end_landmarks
+      best_refiner = r.random_from_remaining #just pick a random one for now if still multiple
+      Log.debug "best_refiner found => #{best_refiner.inspect}"
       return best_refiner
     end
@@ -141,14 +147,14 @@ module Ariel
     # * longer end landmarks
     # * shorter unconsumed prefixes
     def get_best_solution
-      selector = CandidateSelector.new(@candidates, @examples)
-      selector.select_best_by_match_type :perfect
-      selector.select_best_by_match_type :fail
-      selector.select_with_fewer_wildcards
-      selector.select_closest_to_label
-      selector.select_with_longer_end_landmarks
-      best_solution = selector.random_from_remaining
-      debug "best_solution found => #{best_solution.inspect}"
+      r = CandidateRefiner.new(@candidates, @examples)
+      r.refine_by_match_type :perfect
+      r.refine_by_match_type :fail
+      r.refine_by_fewer_wildcards
+      r.refine_by_label_proximity
+      r.refine_by_longer_end_landmarks
+      best_solution = r.random_from_remaining
+      Log.debug "best_solution found => #{best_solution.inspect}"
       return best_solution
     end
@@ -180,7 +186,7 @@ module Ariel
     #   alternative landmark extensions that use relevant wildcards.
     def lengthen_landmark(landmark, index)
       current_seed.rewind #In case apply_rule isn't called as index=0
-      result = @current_rule.partial(0..(index-1)).apply_to current_seed if index > 0 #Don't care about already matched tokens
+      result = @current_rule.partial(0..(index-1)).closest_match current_seed if index > 0 #Don't care about already matched tokens
       return 0 unless result # Rule doesn't match, no point refining
       refined_rules=[]
       width = landmark.size
@@ -202,7 +208,7 @@ module Ariel
         refined_rules.concat b.generalise_feature(index, -1)
       end
       @candidates.concat refined_rules
-      debug "#{refined_rules.size} landmark refinements generated"
+      Log.debug "#{refined_rules.size} landmark refinements generated"
       return refined_rules.size
     end
@@ -219,7 +225,7 @@ module Ariel
     #   is also done for each of that token's matching wildcards.
     def add_new_landmarks(landmark, index)
       topology_refs=[]
-      start_pos = current_rule.partial(0..index).apply_to(current_seed)
+      start_pos = current_rule.partial(0..index).closest_match(current_seed, :early)
       end_pos = current_seed.label_index #No point adding tokens that occur after the label_index
       current_seed.tokens[start_pos...end_pos].each do |token|
           r=current_rule.deep_clone
@@ -227,11 +233,27 @@ module Ariel
           topology_refs << r
           topology_refs.concat r.generalise_feature(index+1)
       end
-    debug "Topology refinements before uniq! #{topology_refs.size}"
+    Log.debug "Topology refinements before uniq! #{topology_refs.size}"
     topology_refs.uniq!
     @candidates.concat topology_refs
-    debug "#{topology_refs.size} topology refinements generated"
+    Log.debug "#{topology_refs.size} topology refinements generated"
     return topology_refs.size
     end
+    # When learning list iteration rules, some examples may be unsuitable. For
+    # instance if there is a list item at the start of an example with no tokens
+    # before it, a skip_to(nil) start rule would be generated that wouldn't make
+    # sense for exhaustive rules. The example should be caught by the
+    # corresponding end rule. This should only be run after tokenstreams have
+    # been prepared (reversed based on whether a :forward or :back rule is being
+    # searched for). Only returns a valid conclusion if the examples are
+    # intended to be used for exhaustive rule learning
+    def example_is_unsuitable?(tokenstream)
+      if tokenstream.label_index==0
+        return true
+      else
+        return false
+      end
+    end
   end
 end