RubyGems - ariel - Versions diffs - 0.0.1 → 0.1.0 - Mend

ariel 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

data/README +49 -83
data/bin/ariel +29 -20
data/examples/google_calculator/structure.rb +2 -2
data/examples/google_calculator/structure.yaml +13 -15
data/examples/raa/labeled/highline.html +5 -4
data/examples/raa/labeled/mongrel.html +9 -8
data/examples/raa/structure.rb +4 -2
data/examples/raa/structure.yaml +94 -78
data/lib/ariel.rb +71 -33
data/lib/ariel/{candidate_selector.rb → candidate_refiner.rb} +39 -38
data/lib/ariel/label_utils.rb +46 -18
data/lib/ariel/labeled_document_loader.rb +77 -0
data/lib/ariel/learner.rb +60 -38
data/lib/ariel/log.rb +67 -0
data/lib/ariel/node.rb +52 -0
data/lib/ariel/node/extracted.rb +90 -0
data/lib/ariel/node/structure.rb +91 -0
data/lib/ariel/rule.rb +114 -32
data/lib/ariel/rule_set.rb +34 -15
data/lib/ariel/token.rb +9 -3
data/lib/ariel/token_stream.rb +32 -17
data/lib/ariel/wildcards.rb +19 -15
data/test/fixtures.rb +45 -3
data/test/specs/candidate_refiner_spec.rb +48 -0
data/test/specs/label_utils_spec.rb +97 -0
data/test/specs/learner_spec.rb +39 -0
data/test/specs/node_extracted_spec.rb +90 -0
data/test/specs/node_spec.rb +76 -0
data/test/specs/node_structure_spec.rb +74 -0
data/test/specs/rule_set_spec.rb +85 -0
data/test/specs/rule_spec.rb +110 -0
data/test/specs/token_stream_spec.rb +100 -7
metadata +21 -28
data/lib/ariel/example_document_loader.rb +0 -59
data/lib/ariel/extracted_node.rb +0 -20
data/lib/ariel/node_like.rb +0 -26
data/lib/ariel/structure_node.rb +0 -75
data/test/ariel_test_case.rb +0 -15
data/test/test_candidate_selector.rb +0 -58
data/test/test_example_document_loader.rb +0 -7
data/test/test_label_utils.rb +0 -15
data/test/test_learner.rb +0 -38
data/test/test_rule.rb +0 -38
data/test/test_structure_node.rb +0 -81
data/test/test_token.rb +0 -16
data/test/test_token_stream.rb +0 -82
data/test/test_wildcards.rb +0 -18

data/lib/ariel/log.rb ADDED

@@ -0,0 +1,67 @@
require 'singleton'
module Ariel
  # Very simple Log class. By default it outputs to stdout and ignores messages
  # below the :info level (or below :debug when $DEBUG is set). Should probably
  # get rid of the usage of Singleton as it's used very little, with the class's
  # eigenclass/singleton class used mostly for the same purpose. Use
  # Log.set_level to lower/raise the logging level.
  #
  #   Ariel::Log.set_level :warn
  #   Ariel::Log.info "ignored"     # => nil
  #   Ariel::Log.warn "look out"    # => "warn: look out"
  class Log
    include Singleton
    # Known severities mapped to their rank; a message is emitted when its
    # rank is at least the rank of the current level.
    SEVERITY={:debug=>0, :info=>1, :warn=>2, :error=>3}

    # Applies the defaults: output goes to stdout, and the level is :debug if
    # $DEBUG is set and :info if not. The instance is created lazily on the
    # first log call, so settings already chosen through Log.set_level,
    # Log.output_to_stdout or Log.output_to_file are left untouched.
    def initialize
      self.class.output_to_stdout if self.class.current_output.nil?
      if self.class.current_level.nil?
        self.class.set_level($DEBUG ? :debug : :info)
      end
    end

    class << self
      # Defines Log.debug, Log.info, Log.warn and Log.error. Each takes a
      # message and returns whatever Log.log returns for it.
      SEVERITY.keys.each do |level|
        define_method(level) {|message| log message, level}
      end

      # Set the log level to the given key from the SEVERITY constant. Raises
      # ArgumentError if the key is unknown.
      def set_level(level)
        unless SEVERITY.has_key? level
          raise ArgumentError, "Invalid log level given"
        end
        @log_level=level
      end

      # Returns the current level, or nil if none has been set yet.
      def current_level
        @log_level
      end

      # Returns :stdout or :file, or nil if no destination has been set yet.
      def current_output
        @output
      end

      # Sends all output to stdout.
      def output_to_stdout
        @output=:stdout
      end

      # Sends all output to a file called debug.log in the current directory.
      def output_to_file
        @output=:file
      end

      # Not intended to be used directly, preferred to use the methods
      # corresponding to the different severity levels. Returns the formatted
      # message ("level: message") if it was emitted, and nil if it fell below
      # the current level.
      def log(message, level)
        instance # creating the singleton applies any defaults still missing
        return nil unless SEVERITY[@log_level] <= SEVERITY[level]
        message = "#{level}: #{message}"
        if @output==:file
          File.open('debug.log', 'ab') {|f| f.puts message }
        elsif @output==:stdout
          puts message
        end
        message
      end
    end
  end
end

data/lib/ariel/node.rb ADDED

@@ -0,0 +1,52 @@
module Ariel
  # A generic Node object. As an end user, you have no need to use this. All
  # children are stored in a hash keyed by node name. #id and #type are
  # undefined so they can be used freely as part of a Node::Structure.
  class Node
    # Only undefine what the running Ruby actually provides: undef_method
    # raises NameError for a method that does not exist, which would stop this
    # class from loading on interpreters without Object#id / Object#type.
    [:id, :type].each do |meth|
      undef_method meth if method_defined? meth
    end

    attr_accessor :parent, :children, :node_name

    # If the name is a string, it's converted to a symbol. If not it's just
    # stored as is.
    def initialize(name)
      @children={}
      @node_name = name.kind_of?(String) ? name.to_sym : name
    end

    # Given a Node object, stores it in the hash of children under its
    # node_name, sets its parent to the current node, and defines an accessor
    # method on this node (and only this node) named after the child.
    def add_child(node)
      @children[node.node_name]=node
      node.parent = self
      # Trick stolen from OpenStruct: define the accessor on the singleton
      # class so that other nodes are not affected.
      meta = class << self; self; end
      meta.send(:define_method, node.node_name.to_s.to_sym) {@children[node.node_name]}
    end

    # Yields each descendant node, level by level (a node's children are
    # queued behind everything already waiting). If passed true will also
    # yield itself first. Returns an enumerator when no block is given.
    def each_descendant(include_self=false)
      return to_enum(:each_descendant, include_self) unless block_given?
      node_queue = include_self ? [self] : self.children.values
      until node_queue.empty?
        node_queue.concat node_queue.first.children.values
        yield node_queue.shift
      end
    end

    # Short summary: class name, node name, the parent's name (or nil) and the
    # names of the children.
    def inspect
      parent_name = self.parent ? self.parent.node_name.inspect : nil.inspect
      ["#{self.class.name} - node_name=#{self.node_name.inspect};",
       "parent=#{parent_name};",
       "children=#{self.children.keys.inspect};"].join ' '
    end
  end
end

data/lib/ariel/node/extracted.rb ADDED

@@ -0,0 +1,90 @@
require 'ariel/node'
module Ariel
  # Each Node::Extracted has a name, a TokenStream and a structure which points to
  # the relevant Node::Structure. Skip straight to #search, #/ and #at for the
  # query interface. This is strongly recommended over using the built in method
  # accessors (a method isn't defined if a given field isn't extracted, so
  # you're going to have to catch a lot of potential errors).
  class Node::Extracted < Node
    attr_accessor :tokenstream, :structure_node

    # name is handled as in Node#initialize; tokenstream holds the extracted
    # tokens and structure is the node whose rules produced this extraction.
    def initialize(name, tokenstream, structure)
      super(name)
      @structure_node=structure
      @tokenstream=tokenstream
    end

    # Returns the text contained in the TokenStream.
    def extracted_text
      tokenstream.text
    end

    # Index based accessor for the Node::Extracted's children. Supports Range objects.
    # Aims to provide behaviour that makes sense, especially when a Node has
    # list children. Node::Extracted#[0..0] will return an array, while
    # Node::Extracted#[0] will not. This behaviour is the same as Ruby's standard
    # Array class. Returns nil if none of the requested children exist.
    def [](*args)
      keys = args.collect {|arg| arg.kind_of?(Range) ? arg.to_a : arg}.flatten
      # A Range or several keys ask for a collection, so a lone hit stays
      # wrapped in an array; a single plain key asks for the child itself.
      wants_array = keys.size > 1 || args.any? {|arg| arg.kind_of? Range}
      result = @children.values_at(*keys).compact
      # The three outcomes are spelled out rather than written as
      # "return *result", whose unwrapping of empty and one-element arrays is
      # not the same on every Ruby version.
      return nil if result.empty?
      return result.first if result.size==1 && !wants_array
      result
    end

    # The preferred way of querying extracted information. If nothing was
    # extracted, an empty array is returned. This is much safer than using
    # Node::Extracted accessors. Consider if your code is reading
    # doc.address.phone_number.area_code - this will raise an error if any one of
    # these were not extracted. (doc/'address/phone_number/area_code') is
    # preferred. Numbered list_items can be queried e.g. (doc/'comment_list/2'),
    # and basic globbing is supported: (doc/'*/*/title').
    def search(search_string)
      current_term, *remaining = search_string.split('/')
      return [self] if current_term.nil? #If for some reason nothing is given in the search string
      if current_term=='*'
        matches=sorted_children
      elsif current_term =~ /\A\d+\z/
        # A purely numeric term addresses a list item by its integer name.
        matches=[@children[current_term.to_i]]
      else
        matches=[@children[current_term.to_sym]]
      end
      # Drop children that were not extracted before descending any further,
      # so a missing intermediate node yields no matches instead of an error.
      matches=matches.flatten.compact
      return matches if remaining.empty?
      rest=remaining.join('/')
      matches.collect {|match| match.search(rest)}.flatten.compact
    end
    alias :/ :search

    # Acts exactly like #search, but returns only the first match or nil if
    # there are no matches.
    def at(search_string)
      self.search(search_string).first
    end

    # Extends Node#inspect with the name of the structure node and the
    # extracted text (cut to its first 101 characters plus '...' when it is
    # longer than 100 characters).
    def inspect
      text=self.extracted_text
      preview = text.size > 100 ? text[0..100]+'...' : text
      [super,
       "structure_node=#{self.structure_node.node_name.inspect};",
       "extracted_text=\"#{preview}\";"
      ].join ' '
    end

    private

    # Children ordered by node_name. Names that cannot be compared with each
    # other (for example a mix of symbols and integers) leave the children in
    # their stored order.
    def sorted_children
      nodes=self.children.values
      begin
        nodes.sort {|a, b| a.node_name <=> b.node_name}
      rescue ArgumentError, NoMethodError
        nodes
      end
    end
  end
end

data/lib/ariel/node/structure.rb ADDED

@@ -0,0 +1,91 @@
require 'ariel/node'
module Ariel
  # A Node describing one position in the document tree. Each structure node
  # holds the ruleset used to pull its content out of the parent's extracted
  # node, so it can be thought of as a rule-storing object.
  class Node::Structure < Node
    attr_accessor :ruleset, :node_type

    # name defaults to :root and type to :not_list. When a block is given the
    # new node is yielded to it so that children can be declared inline.
    def initialize(name=:root, type=:not_list, &block)
      super(name)
      @node_type=type
      block.call(self) unless block.nil?
    end

    # Re-opens an existing node so more children can be declared, e.g.
    #  node.extend_structure do |r|
    #    r.item :new_field1
    #    r.item :new_field2
    #  end
    def extend_structure(&block)
      block ? block.call(self) : nil
    end

    # Applies this node's ruleset to the tokenstream of the given node. Every
    # stream the ruleset yields becomes a new Node::Extracted that is added as
    # a child of the given node. List items are named by their position
    # (0, 1, 2, ...); anything else takes this node's name. Returns the array
    # of nodes created, which is empty when no ruleset has been learnt.
    def extract_from(node)
      created=[]
      return created if @ruleset.nil?
      @ruleset.apply_to(node.tokenstream) do |stream|
        # One node is appended per yielded stream, so the number created so
        # far is the position of the next list item.
        child_name = (node_type==:list_item) ? created.size : @node_name
        child = Node::Extracted.new(child_name, stream, self)
        node.add_child child
        created << child
      end
      created
    end

    # Walks the structure tree starting from the structure node of root_node,
    # extracting every child structure from each extracted node in turn. When
    # extract_labels is true the regions come from
    # LabelUtils.extract_labeled_region instead of the learnt rules. Returns
    # root_node.
    def apply_extraction_tree_on(root_node, extract_labels=false)
      pending=[root_node]
      until pending.empty?
        parent=pending.shift
        parent.structure_node.children.values.each do |child_structure|
          extracted = extract_labels ?
            LabelUtils.extract_labeled_region(child_structure, parent) :
            child_structure.extract_from(parent)
          extracted.each {|extracted_node| pending << extracted_node}
        end
      end
      root_node
    end

    # Declares a child that occurs once within its parent. #list is a synonym,
    # recommended when the child is a container for list_items. The children
    # of a list_item are plain items again, e.g.
    #  structure = Ariel::Node::Structure.new do |r|
    #    r.list :comments do |c|  # r.item :comments would be equivalent, but less readable
    #      c.list_item :comment do |i|
    #        i.item :author  # normal items: extracted once from their parent
    #        i.item :date
    #        i.item :body
    #      end
    #    end
    #  end
    def item(name, &block)
      add_child Node::Structure.new(name, &block)
    end

    # A list is extracted exactly like a normal item; the separate name exists
    # only because it reads better.
    alias :list :item

    # Declares a child that may occur many times within its parent. See #item
    # for when to use #item and when to use #list_item.
    def list_item(name, &block)
      add_child Node::Structure.new(name, :list_item, &block)
    end
  end
end

data/lib/ariel/rule.rb CHANGED

@@ -5,17 +5,23 @@ module Ariel
   # Rule#landmarks. A Rule also has a direction :forward or :back, which
   # determines whether it is applied from the end or beginning of a tokenstream.
   class Rule
-    attr_accessor :landmarks, :direction
+    attr_accessor :landmarks, :direction, :exhaustive
     @@RuleMatchData=Struct.new(:token_loc, :type)
+    @@cache={}
     # A rule's direction can be :back or :forward, which determines whether it
     # is applied from the start or end of the TokenStream. The landmark array
     # contains an array for each landmark, which consists of one or more
-    # features. e.g. Rule.new(:forward, [[:anything, "Example"], ["Test"]]).
-    def initialize(direction, landmarks=[])
+    # features. e.g. Rule.new([[:anything, "Example"], ["Test"]], :forward).
+    def initialize(landmarks, direction, exhaustive=false)
       @landmarks=landmarks
       raise(ArgumentError, "Not a valid direction") unless [:forward, :back].include?(direction)
       @direction=direction
+      @exhaustive=exhaustive
+    end
+    def exhaustive?
+      @exhaustive
     end
     # Two rules are equal if they have the same list of landmarks and the same
@@ -26,12 +32,12 @@ module Ariel
     alias :eql? :==
     def hash
-      [@landmarks, @direction].hash
+      [@landmarks, @direction, @exhaustive].hash
     end
     # Returns a rule that contains a given range of this rule's landmarks.
     def partial(range)
-      return Rule.new(@direction, @landmarks[range])
+      return Rule.new(@landmarks[range], @direction)
     end
     def deep_clone
@@ -57,7 +63,7 @@ module Ariel
     end
     # Given a TokenStream and a rule, applies the rule on the stream and
-    # returns nil if the match fails and the token_loc if the match succeeds.
+    # returns an empty array if the match fails and an array of token_locs if the match succeeds.
     # Yields a RuleMatchData Struct with accessors token_loc (the position of the match in the stream)
     # and type if a block is given. type is nil if the TokenStream has no label,
     # :perfect if all tokens up to the labeled token are consumed, :early if the rule's final position
@@ -65,33 +71,23 @@ module Ariel
     # token_loc is the position in the stream as it was passed in. That is, the
     # token_loc is always from the left of the given stream whether it is in a
     # reversed state or not.
-    def apply_to(tokenstream)
-      if tokenstream.reversed?
-        target=tokenstream if @direction==:back
-        target=tokenstream.reverse if @direction==:forward
-      elsif not tokenstream.reversed?
-        target=tokenstream if @direction==:forward
-        target=tokenstream.reverse if @direction==:back
-      end
-      target.rewind #rules are applied from the beginning of the stream
-      @landmarks.each do |landmark|
-        unless target.skip_to(*landmark)
-          return nil
+    def apply_to(tokenstream)
+      target=self.class.prepare_tokenstream(tokenstream, @direction)
+      cache_check=@@cache[[tokenstream.cache_hash, self.hash]]
+      if cache_check
+        token_locs=cache_check
+      else
+        token_locs=[]
+        while result=seek_landmarks(target)
+          token_locs << correct_match_location(tokenstream, result)
+          break unless exhaustive?
         end
+        @@cache[[tokenstream.cache_hash, self.hash]]=token_locs
       end
-      token_loc=target.cur_pos
-      if @direction==:back && !tokenstream.reversed?
-        token_loc = tokenstream.reverse_pos(token_loc) #Return position from left of given stream
-      end
-      md = @@RuleMatchData.new(token_loc)
-      if target.label_index
-        idx = target.label_index
-        md.type = :perfect if token_loc == idx
-        md.type = :early if token_loc < idx
-        md.type = :late if token_loc > idx
+      if block_given?
+        generate_match_data(target, token_locs).each {|md| yield md}
       end
-      yield md if block_given?
-      return token_loc
+      return token_locs
     end
     # Returns true or false depending on if the match of this rule on the given
@@ -99,8 +95,9 @@ module Ariel
     # :perfect, :early, :fail and :late). Only valid on streams with labels
     def matches(tokenstream, *types)
       raise ArgumentError, "No match types given" if types.empty?
+      raise ArgumentError, "Only applicable to tokenstreams containing a label" if tokenstream.label_index.nil?
       match = nil
-      apply_to(tokenstream) {|md| match=md.type}
+      apply_to(tokenstream) {|md| match=md.type if md.type;}
       match = :fail if match.nil?
       if types.include? match
         return true
@@ -108,5 +105,90 @@ module Ariel
         return false
       end
     end
+    # Only used in rule learning on labeled tokenstreams. Needed to provide the
+    # match index most relevant to the currently labeled list item. A preference
+    # of :early or :late can be passed, which will only return a
+    # token_loc before the stream's label_index or after the label_index.
+    def closest_match(tokenstream, preference=:none)
+      token_locs=self.apply_to(tokenstream)
+      return find_closest_match(token_locs, tokenstream.label_index)
+    end
+    # Reverses the given tokenstream if necessary based on its current direction, and
+    # the direction given (corresponding to the sort of rule you hope to apply
+    # to it).
+    def self.prepare_tokenstream(tokenstream, direction)
+      if tokenstream.reversed?
+        target=tokenstream if direction==:back
+        target=tokenstream.reverse if direction==:forward
+      elsif not tokenstream.reversed?
+        target=tokenstream if direction==:forward
+        target=tokenstream.reverse if direction==:back
+      end
+      target.rewind #rules are applied from the beginning of the stream
+      return target
+    end
+    private
+    # Finds the sequence of landmarks contained in the Rule instance in the
+    # given tokenstream. The logic of reversing or rewinding the stream if necessary
+    # is left to the method that uses it. Returns the match location from the
+    # beginning of whatever tokenstream it was passed. This location should be
+    # corrected by correct_match_location
+    def seek_landmarks(tokenstream)
+      @landmarks.each do |landmark|
+        unless tokenstream.skip_to(*landmark)
+          return nil
+        end
+      end
+      return tokenstream.cur_pos
+    end
+    # Takes the original tokenstream passed to apply_to and reverses the match
+    # location is required, so the match location returned to the user will be
+    # the index from the left of the passed tokenstream.
+    def correct_match_location(tokenstream, match_loc)
+      if tokenstream.reversed?
+        result=match_loc if @direction==:back
+        result=tokenstream.reverse_pos(match_loc) if @direction==:forward
+      elsif not tokenstream.reversed?
+        result=match_loc if @direction==:forward
+        result=tokenstream.reverse_pos(match_loc) if @direction==:back
+      end
+      return result
+    end
+    def generate_match_data(tokenstream, token_locs)
+      result=[]
+      if tokenstream.label_index
+        closest_match=find_closest_match(token_locs, tokenstream.label_index)
+      end
+      token_locs.each do |token_loc|
+        md = @@RuleMatchData.new(token_loc)
+        if tokenstream.label_index && token_loc==closest_match
+          idx = tokenstream.label_index
+          md.type = :perfect if token_loc == idx
+          md.type = :early if token_loc < idx
+          md.type = :late if token_loc > idx
+        end
+        result << md
+      end
+      return result
+    end
+    def find_closest_match(token_locs, label_index, preference=:none)
+      if preference==:early
+        token_locs = token_locs.reject {|token_loc| token_loc > label_index}
+      elsif preference==:late
+        token_locs = token_locs.reject {|token_loc| token_loc | label_index}
+      end
+      token_locs.sort_by {|token_loc| (label_index-token_loc).abs}.first
+    end
+    def self.clear_cache
+      @@cache.clear
+    end
   end
 end