RubyGems - ariel - Versions diffs - 0.0.1 - Mend

ariel 0.0.1

Files changed (43) hide show

data/LICENSE +21 -0
data/README +98 -0
data/bin/ariel +56 -0
data/examples/google_calculator/labeled/1 +43 -0
data/examples/google_calculator/labeled/2 +41 -0
data/examples/google_calculator/labeled/3 +41 -0
data/examples/google_calculator/structure.rb +12 -0
data/examples/google_calculator/structure.yaml +46 -0
data/examples/google_calculator/unlabeled/1 +43 -0
data/examples/google_calculator/unlabeled/2 +43 -0
data/examples/raa/labeled/highline.html +135 -0
data/examples/raa/labeled/mongrel.html +168 -0
data/examples/raa/structure.rb +17 -0
data/examples/raa/structure.yaml +183 -0
data/examples/raa/unlabeled/pdf-writer.html +175 -0
data/lib/ariel/candidate_selector.rb +94 -0
data/lib/ariel/example_document_loader.rb +59 -0
data/lib/ariel/extracted_node.rb +20 -0
data/lib/ariel/label_utils.rb +71 -0
data/lib/ariel/learner.rb +237 -0
data/lib/ariel/node_like.rb +26 -0
data/lib/ariel/rule.rb +112 -0
data/lib/ariel/rule_set.rb +34 -0
data/lib/ariel/structure_node.rb +75 -0
data/lib/ariel/token.rb +68 -0
data/lib/ariel/token_stream.rb +240 -0
data/lib/ariel/wildcards.rb +33 -0
data/lib/ariel.rb +69 -0
data/test/ariel_test_case.rb +15 -0
data/test/fixtures.rb +43 -0
data/test/specs/token_spec.rb +65 -0
data/test/specs/token_stream_spec.rb +43 -0
data/test/specs/wildcards_spec.rb +26 -0
data/test/test_candidate_selector.rb +58 -0
data/test/test_example_document_loader.rb +7 -0
data/test/test_label_utils.rb +15 -0
data/test/test_learner.rb +38 -0
data/test/test_rule.rb +38 -0
data/test/test_structure_node.rb +81 -0
data/test/test_token.rb +16 -0
data/test/test_token_stream.rb +82 -0
data/test/test_wildcards.rb +18 -0
metadata +103 -0

data/lib/ariel/token_stream.rb ADDED Viewed

@@ -0,0 +1,240 @@
+module Ariel
+  require 'enumerator'
+  # A TokenStream instance stores a stream of Tokens once it has used its tokenization
+  # rules to extract them from a string. A TokenStream knows its current
+  # position (TokenStream#cur_pos), which is incremented when any of the
+  # Enumerable methods are used (due to the redefinition of TokenStream#each).
+  # As you advance through the stream, the current token is always returned and
+  # then consumed. A TokenStream also provides methods for finding patterns in a
+  # given stream much like StringScanner but for an array of tokens. For rule
+  # generation, a certain token can be marked as being the start point of a label.
+  # Finally, a TokenStream will record whether it is in a reversed or unreversed
+  # state so that when rules are applied, they are always applied from the front
+  # or end of the stream as required, whether it is reversed or not.
+  class TokenStream
+    include Enumerable
+    attr_accessor :tokens, :cur_pos, :label_index, :original_text
+    def initialize()
+      @tokens=[]
+      @cur_pos=0
+      @original_text = ""
+      @token_regexen = [
+      Wildcards.list[:html_tag], # Match html tags that don't have attributes
+      /\d+/, # Match any numbers, probably good to make a split
+      /\b\w+\b/, # Pick up words, will split at punctuation
+      /\S/ # Grab any characters left over that aren't whitespace
+      ]
+      @label_tag_regexen = [LabelUtils.any_label_regex]
+      @reversed=false
+    end
+    # The tokenizer operates on a string by splitting it at every point it
+    # finds a match to a regular expression. Each match is added as a token, and
+    # the strings between each match are stored along with their original
+    # offsets. The same is then done with the next regular expression on each of
+    # these split strings, and new tokens are created with the correct offset in
+    # the original text. Any characters left unmatched by any of the regular
+    # expressions in @token_regexen are discarded. This approach allows a
+    # hierarchy of regular expressions to work simply and easily. A simple
+    # regular expression to match html tags might operate first, and then later
+    # expressions that pick up runs of word characters can operate on what's
+    # left. If contains_labels is set to true when calling tokenize, the
+    # tokenizer will first remove and discard any occurrences of label_tags (as
+    # defined by the Regex set in LabelUtils) before matching and adding tokens.
+    # Any label_tag tokens will be marked as such upon creation.
+    def tokenize(input, contains_labels=false)
+      string_array=[[input, 0]]
+      @original_text = input
+      @original_text_contains_labels=contains_labels
+      @label_tag_regexen.each {|regex| split_string_array_by_regex(string_array, regex, false)} if contains_labels
+      @token_regexen.each {|regex| split_string_array_by_regex(string_array, regex)}
+      @tokens.sort!
+      @tokens.size
+    end
+    # Goes through all stored Token instances, removing them if
+    # Token#is_label_tag? Called after a labeled document has been extracted to
+    # a tree ready for the rule learning process.
+    def remove_label_tags
+      @tokens.delete_if {|token| token.is_label_tag?}
+    end
+    # Returns the slice of the current instance containing all the tokens
+    # between the token where the start_loc == the left parameter and the token
+    # where the end_loc == the right parameter.
+    def slice_by_string_pos(left, right)
+      l_index=nil
+      r_index=nil
+      @tokens.each_index {|i| l_index = i if @tokens[i].start_loc == left}
+      @tokens.each_index {|i| r_index = i if @tokens[i].end_loc == right}
+      if l_index.nil? or r_index.nil?
+        raise ArgumentError, "Cannot slice between those locations"
+      else
+        return slice_by_token_index(l_index, r_index)
+      end
+    end
+    # Slices tokens between the l_index and the r_index inclusive.
+    def slice_by_token_index(l_index, r_index)
+      sliced = self.dup
+      sliced.tokens=@tokens[l_index..r_index]
+      return sliced
+    end
+    # Used to ensure operations such as @tokens.reverse! in one instance won't
+    # inadvertently affect another.
+    def deep_clone
+      Marshal::load(Marshal.dump(self))
+    end
+    # Set a label at a given offset in the original text. Searches for a token
+    # with a start_loc equal to the position passed as an argument, and raises
+    # an error if one is not found.
+    def set_label_at(pos)
+      token_pos=nil
+      @tokens.each_index {|i| token_pos = i if @tokens[i].start_loc == pos}
+      if token_pos.nil?
+        raise ArgumentError, "Given string position does not match the start of any token"
+      else
+        @label_index = token_pos
+        debug "Token ##{label_index} - \"#{@tokens[label_index].text}\" labeled."
+        return @label_index
+      end
+    end
+    # Returns all text represented by the instance's stored tokens, stripping any
+    # label tags if the stream was declared to contain them when it was
+    # tokenized (this would only happen during the process of loading labeled
+    # examples). See also TokenStream#raw_text
+    def text(l_index=0, r_index=-1)
+      out=raw_text(l_index, r_index)
+      if @original_text_contains_labels
+        LabelUtils.clean_string(out)
+      else
+        out
+      end
+    end
+    # Returns all text represented by the instance's stored tokens. It will not
+    # strip label tags even if the stream is marked to contain them. However,
+    # you should not expect to get the raw_text once any label_tags have been
+    # filtered (TokenStream#remove_label_tags).
+    def raw_text(l_index=0, r_index=-1)
+      return "" if @tokens.size==0
+      if reversed?
+        l_index, r_index = r_index, l_index
+      end
+      @original_text[@tokens[l_index].start_loc...@tokens[r_index].end_loc]
+    end
+    # Returns the current Token and consumes it.
+		def advance
+      return nil if @cur_pos > @tokens.size
+      while true
+        @cur_pos+=1
+        current_token = @tokens[@cur_pos-1]
+        return nil if current_token.nil?
+        return current_token
+      end
+    end
+    # Return to the beginning of the TokenStream.
+    def rewind
+      @cur_pos=0
+      self
+    end
+    # Returns a copy of the current instance with a reversed set of tokens. If
+    # it is set, the label_index is adjusted accordingly to point to the correct
+    # token.
+    def reverse
+      self.deep_clone.reverse!
+    end
+    # Converts the given position so it points to the same token once the stream
+    # is reversed. Result invalid for when @tokens.size==0
+    def reverse_pos(pos)
+      @tokens.size-(pos + 1)
+    end
+    # Same as TokenStream#reverse, but changes are made in place.
+    def reverse!
+      @tokens.reverse!
+      if label_index
+        @label_index = reverse_pos(@label_index)
+      end
+      @cur_pos = reverse_pos(@cur_pos)
+      @reversed=!@reversed
+      return self
+    end
+    # Returns true or false depending on whether the given tokenstream is in a
+    # reversed state
+    def reversed?
+      @reversed
+    end
+    # Takes a list of Strings and Symbols as its arguments representing text to be matched in
+    # individual tokens and Wildcards. For a match to be a
+    # success, all wildcards and strings must match a consecutive sequence
+    # of Tokens in the TokenStream. All matched Tokens are consumed, and the
+    # TokenStream's current position is returned on success. On failure, the
+    # TokenStream is returned to its original state and returns nil.
+    def skip_to(*features)
+      original_pos=@cur_pos
+      self.each_cons(features.size) do |tokens|
+        i=0
+        return @cur_pos if tokens.all? {|token| i+=1; token.matches?(features[i-1])}
+      end
+      @cur_pos=original_pos #No match, return TokenStream to original state
+      return nil
+    end
+    # Iterates over and consumes every Token from the cur_pos.
+    def each
+      while (token = self.advance)
+        yield token
+      end
+    end
+    # Returns the current Token.
+    def current_token
+      @tokens[@cur_pos]
+    end
+    private
+    # Uses split_by_regex to split each member of a given array of string and
+    # offset pairs in to new arrays of string and offset pairs.
+    def split_string_array_by_regex(string_array, regex, add_matches=true)
+      new_string_array = []
+      string_array.each do |arr|
+        result = split_by_regex(arr[0], arr[1], regex, add_matches)
+        new_string_array.concat result
+      end
+      string_array.replace new_string_array
+    end
+    # For tokenization, removes regex matches and creates new strings to
+    # represent the gaps between each match.
+    def split_by_regex(string, offset, regex, add_matches=true)
+      split_points=[0]
+      string_holder = []
+      string.scan(regex) do |s|
+        match = Regexp.last_match
+        split_points << match.begin(0)
+        split_points << match.end(0)
+        @tokens << Token.new(match[0], match.begin(0)+offset, match.end(0)+offset, !add_matches)
+      end
+      split_points << string.size
+      split_points.each_slice(2) do |s_pos, e_pos|
+        split_string = string[s_pos...e_pos]
+        string_holder << [split_string, s_pos+offset] unless split_string.empty?
+      end
+      return string_holder
+    end
+  end
+end

data/lib/ariel/wildcards.rb ADDED Viewed

@@ -0,0 +1,33 @@
+module Ariel
+  # Contains all wildcards to be used in rule generation.
+  class Wildcards
+    private_class_method :new
+    @@list = {
+        :anything=>/.+/,
+        :numeric=>/\d+/,
+        :alpha_numeric=>/\w+/,
+        :alpha=>/[[:alpha:]]+/,
+        :capitalized=>/[[:upper:]]+\w+/,
+        :all_caps=>/[[:upper:]]+/,
+        :html_tag=>/<\/?\w+>|<\w+\s+\/>/,
+        :punctuation=>/[[:punct:]]+/
+      }
+    # Returns the hash of wildcard name (symbol) and regular expression pairs.
+    def self.list
+      @@list
+    end
+    # Given a string, will return an array of symbols from Wildcards::list that
+    # match it.
+    def self.matching(string)
+      matches=[]
+      @@list.each do |name, regex|
+        if string[regex]==string
+          yield name if block_given?
+          matches << name
+        end
+      end
+      matches
+    end
+  end
+end

data/lib/ariel.rb ADDED Viewed

@@ -0,0 +1,69 @@
+require 'ariel/token'
+require 'ariel/token_stream'
+require 'ariel/learner'
+require 'ariel/node_like'
+require 'ariel/extracted_node'
+require 'ariel/structure_node'
+require 'ariel/rule'
+require 'ariel/wildcards'
+require 'ariel/candidate_selector'
+require 'ariel/label_utils'
+require 'ariel/example_document_loader'
+require 'ariel/rule_set'
+if $DEBUG
+#  require 'logger'
+#  DEBUGLOG = Logger.new(File.open('debug.log', 'wb'))
+#  DEBUGLOG.datetime_format = " \010"
+#  DEBUGLOG.progname = "\010\010\010"
+  def debug(message)
+     p message
+    #DEBUGLOG.debug message
+  end
+else
+  def debug(message)
+  end
+end
+# = Ariel - A Ruby Information Extraction Library
+# Ariel intends to assist in extracting information from semi-structured
+# documents including (but not in any way limited to) web pages. Although you
+# may use libraries such as Hpricot or Rubyful Soup, or even plain Regular
+# Expressions to achieve the same goal, Ariel approaches the problem very
+# differently. Ariel relies on the user labeling examples of the data they
+# want to extract, and then finds patterns across several such labeled
+# examples in order to produce a set of general rules for extracting this
+# information from any similar document.
+#
+# When working with Ariel, your workflow might look something like this:
+# 1. Define a structure for the data you wish to extract. For example:
+#
+#     @structure = Ariel::StructureNode.new do |r|
+#       r.article do |a|
+#         a.title
+#         a.author
+#         a.date
+#         a.body
+#       end
+#       r.comment_list do |c|
+#         c.author
+#         c.date
+#         c.body
+#       end
+#     end
+# 2. Label these fields in a few example documents (normally at least 3).
+#    Labels are in the form of <tt><l:label_name>...</l:label_name></tt>
+# 3. Ariel will read these examples, and try to generate suitable rules that can
+#    be used to extract this data from other similarly structured documents.
+# 4. A wrapper has been generated - we can now happily load documents with the
+#    same structure (normally documents generated by the same rules, so
+#    different pages from a single site perhaps) and query the extracted data.
+module Ariel
+end

data/test/ariel_test_case.rb ADDED Viewed

@@ -0,0 +1,15 @@
+require 'test/unit'
+require 'fixtures'
+module Ariel
+  include Fixtures
+  class TestCase < Test::Unit::TestCase
+    def run(result)
+      debug "Running #{self.class.name}##{method_name}" unless method_name.to_s=="default_test"
+      super
+    end
+    def default_test
+    end
+  end
+end

data/test/fixtures.rb ADDED Viewed

@@ -0,0 +1,43 @@
+module Fixtures
+  @@labeled_document = <<EOS
+Title: <l:title>The test of the Century</l:title>
+<l:content><b>Excerpt</b>: <i><l:excerpt>A look back at what could be considered the greatest ever test.</l:excerpt></i>
+<l:body>There was once a test designed to assess whether apply_extraction_tree_on worked.</l:body></l:content>
+EOS
+  @@labeled_document_structure = Ariel::StructureNode.new do |r|
+    r.item :title
+    r.item :content do |c|
+      c.item :excerpt
+      c.item :body
+    end
+  end
+  @@unlabeled_document=<<EOS
+Title: The test of the Century
+<b>Excerpt</b>: <i>A look back at what could be considered the greatest ever test.</i>
+There was once a test designed to assess whether apply_extraction_tree_on worked.
+EOS
+  # Document with nested labels with clashing names. i.e. a label at the top
+  # level as well as a label lower down in the tree that has the same label
+  # name.
+  @@labeled_document_with_list=<<EOS
+Title: <l:title>Another example</l:title>
+<l:body>I love to write examples, you love to read them, ruby loves to process them.
+In conclusion, we're has happy as can be.</l:body>
+<l:comment_list>Comments:
+<l:comment>Title:<l:title>Great example</l:title>
+<l:author>Adoring fan</l:author>
+<l:body>Always love reading your examples, keep up the great work.</l:body>
+</l:comment></l:comment_list>
+EOS
+  @@labeled_addresses=Array.new(4) {Ariel::TokenStream.new}
+  @@labeled_addresses[0].tokenize("513 Pico <b>Venice</b>, Phone: 1-<b>800</b>-555-1515")
+  @@labeled_addresses[0].set_label_at 36
+  @@labeled_addresses[1].tokenize("90 Colfax, <b> Palms </b>, Phone: (818) 508-1570")
+  @@labeled_addresses[1].set_label_at 35
+  @@labeled_addresses[2].tokenize("523 1st St., <b> LA </b>, Phone: 1-<b>888</b>-578-2293")
+  @@labeled_addresses[2].set_label_at 38
+  @@labeled_addresses[3].tokenize("403 La Tijera, <b> Watts </b>, Phone: (310) 798-0008")
+  @@labeled_addresses[3].set_label_at 39
+end

data/test/specs/token_spec.rb ADDED Viewed

@@ -0,0 +1,65 @@
+require 'ariel'
+context "An average token" do
+  # A plain four-character token created without the label-tag flag.
+  setup do
+    @token = Ariel::Token.new("Test", 0, 4)
+  end
+  specify "Should return the string it holds when text is called" do
+    @token.text.should_equal("Test")
+  end
+  specify "Should not be a label tag" do
+    @token.is_label_tag?.should_be(false)
+  end
+  specify "Should return true if if the token string matches a given wildcard or equals a given string" do
+    @token.matches?("Test").should_be(true)
+    @token.matches?(:alpha_numeric).should_be(true)
+  end
+  specify "Should return false if the token string doesn't match the given wildcard or string" do
+    @token.matches?("Tes").should_be(false)
+    @token.matches?(:html_tag).should_be(false)
+  end
+  specify "Should raise an error if an invalid wildcard is given" do
+    lambda { @token.matches?(:not_a_wildcard) }.should_raise(ArgumentError)
+  end
+  specify "Should be able to list all wildcard symbols that match its text" do
+    @token.matching_wildcards.should_be_an_instance_of(Array)
+    @token.matching_wildcards.each do |wildcard|
+      wildcard.should_be_an_instance_of(Symbol)
+    end
+  end
+end
+context "Comparing two Tokens" do
+  setup do
+    @token1 = Ariel::Token.new("Alice", 0, 5)
+    @token2 = Ariel::Token.new("Bob", 5, 8)
+    @token1_clone = Ariel::Token.new("Alice", 0, 5)
+    @token1_almost_clone = Ariel::Token.new("Alice", 0, 4)
+  end
+  specify "Should be equal if and only if text, start location and end location are equal" do
+    @token1.should_equal @token1_clone
+    @token1.should_not_equal @token2
+    @token1.should_not_equal @token1_almost_clone
+  end
+  specify "Should define a way of comparing itself to other tokens" do
+    @token1.should_respond_to :<=>
+  end
+  specify "Should make comparisons based on the start location of the token" do
+    (@token1<=>@token1_almost_clone).should_equal 0
+    (@token1<=>@token2).should_equal -1
+  end
+end
+context "Initializing a label tag token" do
+  specify "Should be ignored if passed true as the final argument to Token#new" do
+    Ariel::Token.new("Test", 0, 4, true).is_label_tag?.should_be true
+  end
+end

data/test/specs/token_stream_spec.rb ADDED Viewed

@@ -0,0 +1,43 @@
+require 'ariel'
+require 'fixtures'
+include Fixtures
+context "A new TokenStream" do
+  setup do
+    @tokenstream = Ariel::TokenStream.new
+  end
+  specify "Should return 0 when cur_pos is called" do
+    @tokenstream.cur_pos.should_equal 0
+  end
+  specify "Should return an empty Array when tokens is called" do
+    @tokenstream.tokens.should_be_a_kind_of Array
+    @tokenstream.tokens.should_be_empty
+  end
+  specify "Should not contain any tokens" do
+    @tokenstream.tokens.size.should_equal 0
+  end
+  specify "Should return an empty string went sent the message raw_text" do
+    @tokenstream.raw_text.should_equal ""
+  end
+  specify "Should return nil when asked to advance" do
+    @tokenstream.advance.should_be_nil
+  end
+  specify "cur_pos should increase to 1 when asked to advance and no further" do
+    @tokenstream.advance
+    @tokenstream.cur_pos.should_equal 1
+    @tokenstream.advance
+    @tokenstream.cur_pos.should_equal 1
+  end
+  specify "Should not be reversed" do
+    @tokenstream.should_not_be_reversed
+  end
+end

data/test/specs/wildcards_spec.rb ADDED Viewed

@@ -0,0 +1,26 @@
+require 'ariel'
+context "When querying the Wildcards class" do
+  specify "Should not be possible to create a Wildcards instance" do
+    lambda {Ariel::Wildcards.new}.should_raise
+  end
+  specify "Should return a hash of Symbol to Regexp pairs when sent the list message" do
+    wildcards=Ariel::Wildcards.list
+    wildcards.should_be_a_kind_of Hash
+    wildcards.keys.each {|key| key.should_be_a_kind_of Symbol}
+    wildcards.values.each {|value| value.should_be_a_kind_of Regexp}
+  end
+  specify "When Wildcards.matching is called with a String, should return an array of the symbols of all matching wildcards" do
+    Ariel::Wildcards.matching("Test").should_be_a_kind_of Array
+    Ariel::Wildcards.matching("<a>").should_include :html_tag
+  end
+  specify "Should yield a symbol for every wildcard the string matches when Wildcards.matching is called" do
+    list=[]
+    Ariel::Wildcards.matching("<a>") {|wildcard| list << wildcard}
+    list.should_not_be_empty
+  end
+end

data/test/test_candidate_selector.rb ADDED Viewed

@@ -0,0 +1,58 @@
+require 'ariel'
+require 'ariel_test_case'
+class TestCandidateSelector < Ariel::TestCase
+  include Fixtures
+  def setup
+    # Must get rid of this repetition, should be available to all tests
+    @e=@@labeled_addresses
+    @candidates=[]
+    @candidates << Ariel::Rule.new(:forward, [[:anything]])
+    @candidates << Ariel::Rule.new(:forward, [[:numeric], [:numeric], [:numeric]])
+    @candidates << Ariel::Rule.new(:forward, [["("]])
+    @candidates << Ariel::Rule.new(:forward, [[:numeric, :alpha_numeric]])
+    @selector=Ariel::CandidateSelector.new(@candidates, @e)
+  end
+  def test_score_by
+    score_hash = @selector.score_by {|rule| rule.landmarks.size}
+    assert_equal @candidates.size, score_hash.size
+    assert_equal 1, score_hash.values.sort.first
+  end
+  def test_highest_scoring_by
+    t1 = @selector.highest_scoring_by {|rule| 1}
+    assert (t1.all? {|rule| rule.kind_of? Ariel::Rule})
+    assert_equal @candidates.size, t1.size
+    t2 = @selector.highest_scoring_by {|rule| rule.landmarks.size}
+    assert_equal 1, t2.size
+  end
+  def test_select_best_by_match_type
+    @selector.select_best_by_match_type :fail, :early, :late, :perfect
+    assert_equal @candidates, @selector.candidates
+    @selector.select_best_by_match_type :late
+    assert_equal 1, @selector.candidates.size
+    assert_equal @candidates[1], @selector.candidates[0]
+  end
+  def test_select_with_fewer_wildcards
+    assert_equal @selector.select_with_fewer_wildcards[0], @candidates[2]
+    assert_equal 1, @selector.candidates.size
+  end
+  def test_select_closest_to_label
+    assert_equal @candidates[2], @selector.select_closest_to_label[0]
+    assert_equal 1, @selector.candidates.size
+  end
+  def test_select_with_longer_landmarks
+    assert_equal @candidates[3], @selector.select_with_longer_end_landmarks[0]
+    assert_equal 1, @selector.candidates.size
+  end
+  def test_random_from_remaining
+    assert(@candidates.include?(@selector.random_from_remaining))
+  end
+end

data/test/test_example_document_loader.rb ADDED Viewed

@@ -0,0 +1,7 @@
+require 'ariel'
+require 'ariel_test_case'
+class TestExampleDocumentLoader < Ariel::TestCase
+  include Fixtures
+end

data/test/test_label_utils.rb ADDED Viewed

@@ -0,0 +1,15 @@
+require 'ariel'
+require 'ariel_test_case'
+class TestLabelUtils < Ariel::TestCase
+  include Fixtures
+  def test_label_regex
+    assert_equal 2, Ariel::LabelUtils.label_regex.uniq.size
+    assert_kind_of Regexp, Ariel::LabelUtils.label_regex[0]
+  end
+  def test_clean_string
+    assert_equal @@unlabeled_document, Ariel::LabelUtils.clean_string(@@labeled_document)
+  end
+end

data/test/test_learner.rb ADDED Viewed

@@ -0,0 +1,38 @@
+require 'ariel'
+require 'ariel_test_case'
+class TestLearner < Ariel::TestCase
+  include Fixtures
+  def setup
+    #Examples stolen from the STALKER paper. Target to extract is the area
+    #codes.
+    @e=@@labeled_addresses
+    @learner=Ariel::Learner.new(*@e)
+  end
+  def test_set_seed
+    assert_equal @e[1], @learner.current_seed # LabeledStream with smallest label_index
+  end
+  def test_generate_initial_candidates
+    @learner.direction=:forward
+    @learner.generate_initial_candidates
+    c=@learner.candidates
+    assert (c.include? Ariel::Rule.new(:forward, [["("]]))
+    assert (c.include? Ariel::Rule.new(:forward, [[:anything]]))
+    assert (c.include? Ariel::Rule.new(:forward, [[:punctuation]]))
+  end
+  def test_refine
+    @learner.current_rule=Ariel::Rule.new(:forward, [["<b>"]])
+    assert @learner.refine
+    @learner.current_rule=Ariel::Rule.new(:forward, [["<b>", "Palms"], ["Phone"]])
+    assert @learner.refine
+  end
+  def test_learn_rule
+    rule=@learner.learn_rule :forward
+    p rule
+  end
+end