RubyGems - ariel - Versions diffs - 0.0.1 → 0.1.0 - Mend

ariel 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

data/README +49 -83
data/bin/ariel +29 -20
data/examples/google_calculator/structure.rb +2 -2
data/examples/google_calculator/structure.yaml +13 -15
data/examples/raa/labeled/highline.html +5 -4
data/examples/raa/labeled/mongrel.html +9 -8
data/examples/raa/structure.rb +4 -2
data/examples/raa/structure.yaml +94 -78
data/lib/ariel.rb +71 -33
data/lib/ariel/{candidate_selector.rb → candidate_refiner.rb} +39 -38
data/lib/ariel/label_utils.rb +46 -18
data/lib/ariel/labeled_document_loader.rb +77 -0
data/lib/ariel/learner.rb +60 -38
data/lib/ariel/log.rb +67 -0
data/lib/ariel/node.rb +52 -0
data/lib/ariel/node/extracted.rb +90 -0
data/lib/ariel/node/structure.rb +91 -0
data/lib/ariel/rule.rb +114 -32
data/lib/ariel/rule_set.rb +34 -15
data/lib/ariel/token.rb +9 -3
data/lib/ariel/token_stream.rb +32 -17
data/lib/ariel/wildcards.rb +19 -15
data/test/fixtures.rb +45 -3
data/test/specs/candidate_refiner_spec.rb +48 -0
data/test/specs/label_utils_spec.rb +97 -0
data/test/specs/learner_spec.rb +39 -0
data/test/specs/node_extracted_spec.rb +90 -0
data/test/specs/node_spec.rb +76 -0
data/test/specs/node_structure_spec.rb +74 -0
data/test/specs/rule_set_spec.rb +85 -0
data/test/specs/rule_spec.rb +110 -0
data/test/specs/token_stream_spec.rb +100 -7
metadata +21 -28
data/lib/ariel/example_document_loader.rb +0 -59
data/lib/ariel/extracted_node.rb +0 -20
data/lib/ariel/node_like.rb +0 -26
data/lib/ariel/structure_node.rb +0 -75
data/test/ariel_test_case.rb +0 -15
data/test/test_candidate_selector.rb +0 -58
data/test/test_example_document_loader.rb +0 -7
data/test/test_label_utils.rb +0 -15
data/test/test_learner.rb +0 -38
data/test/test_rule.rb +0 -38
data/test/test_structure_node.rb +0 -81
data/test/test_token.rb +0 -16
data/test/test_token_stream.rb +0 -82
data/test/test_wildcards.rb +0 -18

data/lib/ariel/rule_set.rb CHANGED

@@ -1,34 +1,53 @@
 module Ariel
-  # A RuleSet acts as a container for a StructureNode's start and end rules.
+  # A RuleSet acts as a container for a Node::Structure's start and end rules.
   # These are stored as an ordered array and are applied in turn until there is
   # a successful match. A RuleSet takes responsibility for applying start and
-  # end rules to extract an ExtractedNode.
+  # end rules to extract a Node::Extracted.
   class RuleSet
     def initialize(start_rules, end_rules)
       @start_rules=start_rules
       @end_rules=end_rules
     end
+    # Returns an array of the extracted tokenstreams. An empty array is returned
+    # if the rules cannot be applied.
+    # TODO: Think more about the way list iteration rules are applied
     def apply_to(tokenstream)
-      start_idx=nil
-      end_idx=nil
+      start_idxs=nil
+      end_idxs=nil
       @start_rules.each do |rule|
-        start_idx=rule.apply_to tokenstream
-        break if start_idx
+      start_idxs=rule.apply_to tokenstream
+        break if !start_idxs.empty?
       end
       @end_rules.each do |rule|
-        end_idx=rule.apply_to tokenstream
-        break if end_idx
+        end_idxs=rule.apply_to tokenstream
+        end_idxs.reverse! #So the start_idxs and end_idxs match up
+        break if !end_idxs.empty?
       end
-      if start_idx && end_idx
-        debug "RuleSet matched with start_idx=#{start_idx} and end_idx=#{end_idx}"
-        return nil if end_idx < start_idx
-        return tokenstream.slice_by_token_index(start_idx, end_idx)
-      else
-        debug "No valid match was found"
-        return nil
+      result=[]
+      unless start_idxs.empty? && end_idxs.empty?
+        # Following expression deals with the case where the first start rule
+        # matches after the first end rule, indicating that all tokens up to the
+        # end rule match should be a list item
+        if start_idxs.first > end_idxs.first
+          start_idxs.insert(0, 0)
+        end
+        if end_idxs.last < start_idxs.last
+          end_idxs << (tokenstream.size - 1)
+        end
+        Log.debug "RuleSet matched with start_idxs=#{start_idxs.inspect} and end_idxs=#{end_idxs.inspect}"
+        start_idxs.zip(end_idxs) do |start_idx, end_idx|
+          if start_idx && end_idx
+            next if start_idx > end_idx
+            result << tokenstream.slice_by_token_index(start_idx, end_idx)
+            yield result.last if block_given?
+          else
+            break
+          end
+        end
       end
+      return result
     end
   end
 end

data/lib/ariel/token.rb CHANGED

@@ -36,9 +36,10 @@ module Ariel
       @start_loc <=> t.start_loc
     end
-    # Accepts either a string or symbol representing a wildcard in
-    # Wildcards#list. Returns true if the whole Token is consumed by the wildcard or the
-    # string is equal to Token#text, and false if the match fails. Raises an
+    # Accepts either a string, a symbol representing a wildcard in
+    # Wildcards#list, or an arbitrary regex. Returns true if the
+    # whole Token is consumed by the wildcard or the string is equal
+    # to Token#text, and false if the match fails. Raises an
     # error if the passed symbol is not a member of Wildcards#list.
     def matches?(landmark)
       if landmark.kind_of? Symbol or landmark.kind_of? Regexp
@@ -64,5 +65,10 @@ module Ariel
     def matching_wildcards
       return Wildcards.matching(self.text)
     end
+    # Redefined for caching purposes. This proved to be too slow.
+#    def hash
+#      [@text, @start_loc, @end_loc, @label_tag].hash
+#    end
   end
 end

data/lib/ariel/token_stream.rb CHANGED

@@ -16,19 +16,21 @@ module Ariel
   class TokenStream
     include Enumerable
     attr_accessor :tokens, :cur_pos, :label_index, :original_text
-    def initialize()
-      @tokens=[]
-      @cur_pos=0
-      @original_text = ""
-      @token_regexen = [
+    TOKEN_REGEXEN = [
       Wildcards.list[:html_tag], # Match html tags that don't have attributes
       /\d+/, # Match any numbers, probably good to make a split
       /\b\w+\b/, # Pick up words, will split at punctuation
       /\S/ # Grab any characters left over that aren't whitespace
       ]
-      @label_tag_regexen = [LabelUtils.any_label_regex]
+    LABEL_TAG_REGEXEN = [LabelUtils.any_label_regex]
+    def initialize()
+      @tokens=[]
+      @cur_pos=0
+      @original_text = ""
       @reversed=false
+      @contains_label_tags=false
     end
     # The tokenizer operates on a string by splitting it at every point it
@@ -37,7 +39,7 @@ module Ariel
     # offsets. The same is then done with the next regular expression on each of
     # these split strings, and new tokens are created with the correct offset in
     # the original text. Any characters left unmatched by any of the regular
-    # expressions in @token_regexen are discarded. This approach allows a
+    # expressions in TokenStream::TOKEN_REGEXEN are discarded. This approach allows a
     # hierarchy of regular expressions to work simply and easily. A simple
     # regular expression to match html tags might operate first, and then later
     # expressions that pick up runs of word characters can operate on what's
@@ -45,16 +47,25 @@ module Ariel
     # tokenizer will first remove and discard any occurrences of label_tags (as
     # defined by the Regex set in LabelUtils) before matching and adding tokens.
     # Any label_tag tokens will be marked as such upon creation.
-    def tokenize(input, contains_labels=false)
+    def tokenize(input, contains_label_tags=false)
       string_array=[[input, 0]]
       @original_text = input
-      @original_text_contains_labels=contains_labels
-      @label_tag_regexen.each {|regex| split_string_array_by_regex(string_array, regex, false)} if contains_labels
-      @token_regexen.each {|regex| split_string_array_by_regex(string_array, regex)}
+      @contains_label_tags=contains_label_tags
+      LABEL_TAG_REGEXEN.each {|regex| split_string_array_by_regex(string_array, regex, false)} if contains_label_tags
+      TOKEN_REGEXEN.each {|regex| split_string_array_by_regex(string_array, regex)}
       @tokens.sort!
       @tokens.size
     end
+    # Note, token.cache_hash!=token.reverse.reverse.cache_hash.
+    def cache_hash
+      [@tokens, @reversed].hash
+    end
+    def contains_label_tags?
+      @contains_label_tags
+    end
     # Goes through all stored Token instances, removing them if
     # Token#is_label_tag? Called after a labeled document has been extracted to
     # a tree ready for the rule learning process.
@@ -100,7 +111,7 @@ module Ariel
         raise ArgumentError, "Given string position does not match the start of any token"
       else
         @label_index = token_pos
-        debug "Token ##{label_index} - \"#{@tokens[label_index].text}\" labeled."
+        Log.debug "Token ##{label_index} - \"#{@tokens[label_index].text}\" labeled."
         return @label_index
       end
     end
@@ -111,14 +122,14 @@ module Ariel
     # examples). See also TokenStream#raw_text
     def text(l_index=0, r_index=-1)
       out=raw_text(l_index, r_index)
-      if @original_text_contains_labels
+      if contains_label_tags?
         LabelUtils.clean_string(out)
       else
         out
       end
     end
-    # Returns all text represented by the instance's stored tokens it will not
+    # Returns all text represented by the instance's stored tokens. It will not
     # strip label tags even if the stream is marked to contain them. However,
     # you should not expect to get the raw_text once any label_tags have been
     # filtered (TokenStream#remove_label_tags).
@@ -141,7 +152,7 @@ module Ariel
       end
     end
-    # Return to the beginning of the TokenStream.
+    # Return to the beginning of the TokenStream. Returns self.
     def rewind
       @cur_pos=0
       self
@@ -166,7 +177,6 @@ module Ariel
       if label_index
         @label_index = reverse_pos(@label_index)
       end
-      @cur_pos = reverse_pos(@cur_pos)
       @reversed=!@reversed
       return self
     end
@@ -176,6 +186,11 @@ module Ariel
     def reversed?
       @reversed
     end
+    # Returns the number of tokens in the TokenStream
+    def size
+      @tokens.size
+    end
     # Takes a list of Strings and Symbols as its arguments representing text to be matched in
     # individual tokens and Wildcards. For a match to be a

data/lib/ariel/wildcards.rb CHANGED

@@ -1,8 +1,7 @@
 module Ariel
   # Contains all wildcards to be used in rule generation.
   class Wildcards
-    private_class_method :new
-    @@list = {
+    @list = {
         :anything=>/.+/,
         :numeric=>/\d+/,
         :alpha_numeric=>/\w+/,
@@ -12,22 +11,27 @@ module Ariel
         :html_tag=>/<\/?\w+>|<\w+\s+\/>/,
         :punctuation=>/[[:punct:]]+/
       }
-    # Returns the hash of wildcard name (symbol) and regular expression pairs.
-    def self.list
-      @@list
-    end
-    # Given a string, will return an array of symbols from Wildcards::list that
-    # match it.
-    def self.matching(string)
-      matches=[]
-      @@list.each do |name, regex|
-        if string[regex]==string
-          yield name if block_given?
-          matches << name
+    class << self
+      private :new
+      # Returns the hash of wildcard name (symbol) and regular expression pairs.
+      def list
+        @list
+      end
+      # Given a string, will return an array of symbols from Wildcards::list that
+      # match it.
+      def matching(string)
+        matches=[]
+        @list.each do |name, regex|
+          if string[regex]==string
+            yield name if block_given?
+            matches << name
+          end
         end
+        matches
       end
-      matches
     end
   end
 end

data/test/fixtures.rb CHANGED

@@ -4,7 +4,7 @@ Title: <l:title>The test of the Century</l:title>
 <l:content><b>Excerpt</b>: <i><l:excerpt>A look back at what could be considered the greatest ever test.</l:excerpt></i>
 <l:body>There was once a test designed to assess whether apply_extraction_tree_on worked.</l:body></l:content>
 EOS
-  @@labeled_document_structure = Ariel::StructureNode.new do |r|
+  @@labeled_document_structure = Ariel::Node::Structure.new do |r|
     r.item :title
     r.item :content do |c|
       c.item :excerpt
@@ -24,12 +24,39 @@ Title: <l:title>Another example</l:title>
 <l:body>I love to write examples, you love to read them, ruby loves to process them.
 In conclusion, we're has happy as can be.</l:body>
 <l:comment_list>Comments:
-<l:comment>Title:<l:title>Great example</l:title>
+<ol>
+<li><l:comment>Title:<l:title>Great example</l:title>
 <l:author>Adoring fan</l:author>
 <l:body>Always love reading your examples, keep up the great work.</l:body>
-</l:comment></l:comment_list>
+</l:comment></li>
+<li><l:comment>Title: <l:title>Some advice</l:title>
+<l:author>Wise old man</l:author>
+<l:body>Keep your friends close and your enemies closer.</l:body>
+</l:comment></li></l:comment_list>
 EOS
+  @@labeled_document_with_list_structure = Ariel::Node::Structure.new do |r|
+    r.item :title
+    r.item :body
+    r.item :comment_list do |c|
+      c.list_item :comment do |d|
+        d.item :author
+        d.item :body
+      end
+    end
+  end
+  title_ruleset=Ariel::RuleSet.new [Ariel::Rule.new([[":"]], :forward)], [Ariel::Rule.new([["love", "I"]], :back)]
+  body_ruleset=Ariel::RuleSet.new [Ariel::Rule.new([["example"]], :forward)], [Ariel::Rule.new([["Comments"]], :back)]
+  c_list_ruleset=Ariel::RuleSet.new [Ariel::Rule.new([["be", "."]], :forward)], [Ariel::Rule.new([], :back)]
+  comment_ruleset=Ariel::RuleSet.new [Ariel::Rule.new([["<li>"]], :forward, true)], [Ariel::Rule.new([["</li>"]], :back, true)]
+  s=@@labeled_document_with_list_structure
+  s.title.ruleset=title_ruleset
+  s.body.ruleset=body_ruleset
+  s.comment_list.ruleset=c_list_ruleset
+  s.comment_list.comment.ruleset=comment_ruleset
   @@labeled_addresses=Array.new(4) {Ariel::TokenStream.new}
   @@labeled_addresses[0].tokenize("513 Pico <b>Venice</b>, Phone: 1-<b>800</b>-555-1515")
   @@labeled_addresses[0].set_label_at 36
@@ -40,4 +67,19 @@ EOS
   @@labeled_addresses[3].tokenize("403 La Tijera, <b> Watts </b>, Phone: (310) 798-0008")
   @@labeled_addresses[3].set_label_at 39
+  # This example is from the STALKER paper, it suggests that SkipTo('<p><i>')
+  # would extract the start of the list, and the rules SkipTo '<i>' and SkipTo
+  # '</i>' would locate the start and end of each list item. If the first found
+  # end_loc and before the first start_loc, it should be assumed all tokens from
+  # 0...end_loc are one item.
+  @@unlabeled_restaurant_example=<<EOS
+<p> Name: <b> Yala </b><p> Cuisine: Thai <p><i>
+4000 Colfax, Phoenix, AZ 85258 (602) 508-1570
+</i><br><i>
+523 Vernon, Las Vegas, NV 89104 (702) 578-2293
+</i><br><i>
+403 Pico, LA, CA 90007 (213) 798-0008
+</i>
+EOS
 end

data/test/specs/candidate_refiner_spec.rb ADDED

@@ -0,0 +1,48 @@
+require 'ariel'
+require 'fixtures'
+include Fixtures
+context "Refining non exhaustive rule candidates" do
+  setup do
+    @candidates=[]
+    @candidates << Ariel::Rule.new([[:anything]], :forward)
+    @candidates << Ariel::Rule.new([[:numeric], [:numeric], [:numeric]], :forward) #late
+    @candidates << Ariel::Rule.new([["("]], :forward)
+    @candidates << Ariel::Rule.new([[:numeric, :alpha_numeric]], :forward)
+    @refiner=Ariel::CandidateRefiner.new(@candidates, @@labeled_addresses)
+  end
+  specify "refine_by_match_type should not change the list of candidates if all rules match one of the given types" do
+    @refiner.refine_by_match_type :fail, :early, :late, :perfect
+    @refiner.candidates.should_equal @candidates
+  end
+  specify "refine_by_match_type should remove all candidates that don't match the given type from the candidates list" do
+    @refiner.refine_by_match_type :late
+    @refiner.candidates.size.should_equal 1
+    @candidates[1].should_equal @refiner.candidates[0]
+  end
+  specify "refine_by_fewer wildcards should leave only those rules with the lowest number of wildcards" do
+    @refiner.refine_by_fewer_wildcards
+    @refiner.candidates.size.should_equal 1
+    @refiner.candidates[0].should_equal @candidates[2]
+  end
+  specify "refine_by_label_proximity should leave only those candidates that match closest to the label" do
+    @refiner.refine_by_label_proximity
+    @refiner.candidates.size.should_equal 1
+    @refiner.candidates[0].should_equal @candidates[2]
+  end
+  specify "refine_by_longer_end_landmarks should leave only those candidates with the longest end landmark" do
+    @refiner.refine_by_longer_end_landmarks
+    @refiner.candidates.size.should_equal 1
+    @refiner.candidates[0].should_equal @candidates[3]
+  end
+  specify "random_from_remaining should return a random candidate from those remaining in the candidate list" do
+    @candidates.should_include(@refiner.random_from_remaining)
+  end
+end

data/test/specs/label_utils_spec.rb ADDED

@@ -0,0 +1,97 @@
+require 'ariel'
+require 'fixtures'
+include Fixtures
+context "Querying LabelUtils for label tag locating Regular Expressions" do
+  specify "label_regex should return an array of two Regexp to locate a start tag or an end tag with the given tag contents" do
+    s_regex, e_regex = Ariel::LabelUtils.label_regex('example')
+    s_tag="<l:example>"
+    e_tag="</l:example>"
+    s_tag.should_match s_regex
+    e_tag.should_not_match s_regex
+    s_tag.should_not_match e_regex
+    e_tag.should_match e_regex
+    "<l:fail>".should_not_match s_regex
+  end
+  specify "label_regex should by default return a pair of labels that will match any valid label tags" do
+    s_regex, e_regex = Ariel::LabelUtils.label_regex
+    "<l:randomexample>".should_match s_regex
+    "</l:unrandomexample>".should_match e_regex
+    "<l:foo>".should_not_match e_regex
+  end
+  specify "any_label_regex should return a regex that will match any valid open or closing label tags" do
+    regex=Ariel::LabelUtils.any_label_regex
+    regex.should_be_a_kind_of Regexp
+    %w[<l:foo> <l:bar> </l:foo> </l:bar>].each {|tag| tag.should_match regex}
+    %w[<l:foo <l/trunk> </l:** <a> </b>].each {|tag| tag.should_not_match regex}
+  end
+end
+context "Extracting a labeled region from a node" do
+  setup do
+    @tokenstream_with_label_tags = Ariel::TokenStream.new
+    @tokenstream_with_label_tags.tokenize @@labeled_document, true
+    @parent_extracted_node=Ariel::Node::Extracted.new(:root, @tokenstream_with_label_tags, @@labeled_document_structure)
+    @title_result=Ariel::LabelUtils.extract_labeled_region(@@labeled_document_structure.title, @parent_extracted_node)
+  end
+  specify "extract_labeled_region should return an array containing the region corresponding to the given structure node as a Node::Extracted" do
+    @title_result.should_be_a_kind_of Array
+    @title_result[0].should_be_an_instance_of Ariel::Node::Extracted
+    @title_result.size.should_equal 1
+    @title_result[0].tokenstream.tokens.should_equal @tokenstream_with_label_tags.tokens[3..7]
+  end
+  specify "Should return an empty array if the match fails" do
+    Ariel::LabelUtils.extract_labeled_region(Ariel::Node::Structure.new(:non_existent), @parent_extracted_node).should_equal []
+  end
+  specify "Extracted node should have the correct node_name" do
+    @title_result[0].node_name.should_equal :title
+  end
+  specify "Extracted node should be added as a child to the parent extracted node" do
+    @title_result.should_equal @parent_extracted_node.children.values
+  end
+end
+context "Extracting labeled list items from a node" do
+  setup do
+    @structure=@@labeled_document_with_list_structure
+    @tokenstream=Ariel::TokenStream.new
+    @tokenstream.tokenize @@labeled_document_with_list, true
+    @tokenstream = @tokenstream.slice_by_token_index 39, 95
+    @parent_extracted_node=Ariel::Node::Extracted.new(:comment_list, @tokenstream, @@labeled_document_with_list_structure.comment_list)
+    @result = Ariel::LabelUtils.extract_labeled_region(@structure.comment_list.comment, @parent_extracted_node)
+  end
+  specify "Should return an array containing each list_item" do
+    @result.size.should_equal 2
+    @result.each {|extracted_node| extracted_node.should_be_an_instance_of Ariel::Node::Extracted}
+    @tokenstream.tokens[5..28].should_equal @result[0].tokenstream.tokens
+    @tokenstream.tokens[33..54].should_equal @result[1].tokenstream.tokens
+  end
+  specify "Should name each list item itemname_num" do
+    @result[0].node_name.should_equal :comment_0
+    @result[1].node_name.should_equal :comment_1
+  end
+  specify "Should add each list_item as as a child of the parent extracted node" do
+    children=@parent_extracted_node.children.values
+    children.size.should_equal 2
+    children.each {|child| @result.should_include child}
+  end
+  specify "Should return an empty array if no list items are extracted" do
+    stream=Ariel::TokenStream.new
+    stream.tokenize "No labels here", true
+    @parent_extracted_node.tokenstream=stream
+    result = Ariel::LabelUtils.extract_labeled_region(@structure.comment_list.comment, @parent_extracted_node)
+    result.should_equal []
+  end
+end