RubyGems - ariel - Versions diffs - 0.0.1 → 0.1.0 - Mend

ariel 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)

data/README +49 -83
data/bin/ariel +29 -20
data/examples/google_calculator/structure.rb +2 -2
data/examples/google_calculator/structure.yaml +13 -15
data/examples/raa/labeled/highline.html +5 -4
data/examples/raa/labeled/mongrel.html +9 -8
data/examples/raa/structure.rb +4 -2
data/examples/raa/structure.yaml +94 -78
data/lib/ariel.rb +71 -33
data/lib/ariel/{candidate_selector.rb → candidate_refiner.rb} +39 -38
data/lib/ariel/label_utils.rb +46 -18
data/lib/ariel/labeled_document_loader.rb +77 -0
data/lib/ariel/learner.rb +60 -38
data/lib/ariel/log.rb +67 -0
data/lib/ariel/node.rb +52 -0
data/lib/ariel/node/extracted.rb +90 -0
data/lib/ariel/node/structure.rb +91 -0
data/lib/ariel/rule.rb +114 -32
data/lib/ariel/rule_set.rb +34 -15
data/lib/ariel/token.rb +9 -3
data/lib/ariel/token_stream.rb +32 -17
data/lib/ariel/wildcards.rb +19 -15
data/test/fixtures.rb +45 -3
data/test/specs/candidate_refiner_spec.rb +48 -0
data/test/specs/label_utils_spec.rb +97 -0
data/test/specs/learner_spec.rb +39 -0
data/test/specs/node_extracted_spec.rb +90 -0
data/test/specs/node_spec.rb +76 -0
data/test/specs/node_structure_spec.rb +74 -0
data/test/specs/rule_set_spec.rb +85 -0
data/test/specs/rule_spec.rb +110 -0
data/test/specs/token_stream_spec.rb +100 -7
metadata +21 -28
data/lib/ariel/example_document_loader.rb +0 -59
data/lib/ariel/extracted_node.rb +0 -20
data/lib/ariel/node_like.rb +0 -26
data/lib/ariel/structure_node.rb +0 -75
data/test/ariel_test_case.rb +0 -15
data/test/test_candidate_selector.rb +0 -58
data/test/test_example_document_loader.rb +0 -7
data/test/test_label_utils.rb +0 -15
data/test/test_learner.rb +0 -38
data/test/test_rule.rb +0 -38
data/test/test_structure_node.rb +0 -81
data/test/test_token.rb +0 -16
data/test/test_token_stream.rb +0 -82
data/test/test_wildcards.rb +0 -18

data/test/specs/token_stream_spec.rb CHANGED

@@ -2,7 +2,7 @@ require 'ariel'
 require 'fixtures'
 include Fixtures
-context "A new TokenStream" do
+context "A new, unlabeled TokenStream" do
   setup do
     @tokenstream = Ariel::TokenStream.new
   end
@@ -11,16 +11,11 @@ context "A new TokenStream" do
     @tokenstream.cur_pos.should_equal 0
   end
-  specify "Should return an empty Array when tokens is called" do
-    @tokenstream.tokens.should_be_a_kind_of Array
-    @tokenstream.tokens.should_be_empty
-  end
   specify "Should not contain any tokens" do
     @tokenstream.tokens.size.should_equal 0
   end
-  specify "Should return an empty string went sent the message raw_text" do
+  specify "Should have an empty string as raw_text" do
     @tokenstream.raw_text.should_equal ""
   end
@@ -38,6 +33,104 @@ context "A new TokenStream" do
   specify "Should not be reversed" do
     @tokenstream.should_not_be_reversed
   end
+  specify "Should have a label_index of nil" do
+    @tokenstream.label_index.should_be_nil
+  end
+  specify "Should accept a string to be tokenized" do
+    lambda {@tokenstream.tokenize "This is a test"}.should_not_raise
+  end
+  specify "Should provide a skip_to method" do
+    @tokenstream.should_respond_to :skip_to
+  end
+  specify "Should not contain label tags" do
+    @tokenstream.contains_label_tags?.should_equal false
+  end
 end
+context "A TokenStream instance which has tokenized unlabeled text" do
+  setup do
+    @tokenstream = Ariel::TokenStream.new
+    @tokenstream.tokenize "This is a test you know"
+  end
+  specify "Should not contain label tags" do
+    @tokenstream.contains_label_tags?.should_equal false
+  end
+  specify "Should return its original text" do
+    @tokenstream.raw_text.should_equal @tokenstream.original_text
+    @tokenstream.original_text.should_equal @tokenstream.text
+  end
+  specify "Should contain tokens that correctly identify their position in the original text" do
+    @tokenstream.each do |token|
+      token.text.should_equal @tokenstream.original_text[token.start_loc...token.end_loc]
+    end
+  end
+  specify "Should have its tokens in order" do
+    sorted=@tokenstream.tokens.sort_by {|token| token.start_loc}
+    sorted.should_equal @tokenstream.tokens
+  end
+  specify "Should advance its position when Enumerable methods are called" do
+    pos=0
+    @tokenstream.each do |token|
+      pos+=1
+      @tokenstream.cur_pos.should_equal pos
+    end
+  end
+  specify "Should make no changes when remove_label_tags is called" do
+    token_n = @tokenstream.tokens.size
+    @tokenstream.remove_label_tags
+    @tokenstream.tokens.size.should_equal token_n
+  end
+  specify "Should return its current_token" do
+    @tokenstream.cur_pos=2
+    @tokenstream.current_token.should_equal @tokenstream.tokens[2]
+  end
+  specify "Should return its current token and increment cur_pos by one when asked to advance" do
+    @tokenstream.cur_pos=3
+    @tokenstream.advance.should_equal @tokenstream.tokens[3]
+    @tokenstream.cur_pos.should_equal 4
+  end
+  specify "reverse should not modify the receiver" do
+    @tokenstream.reverse.should_not_equal @tokenstream
+    @tokenstream.reverse.tokens.should_not_equal @tokenstream.tokens
+  end
+  specify "reversed? should reflect whether the tokenstream is in a reversed state or not" do
+    @tokenstream.reverse.reversed?.should_equal true
+    @tokenstream.reverse!
+    @tokenstream.reversed?.should_equal true
+    @tokenstream.reverse!
+    @tokenstream.reversed?.should_equal false
+  end
+  specify "Should provide a method that will convert a given token index so it will refer to the same token if the stream were reversed" do
+    idx=@tokenstream.reverse_pos(2)
+    @tokenstream.reverse.tokens[idx].should_equal @tokenstream.tokens[2]
+  end
+end
+context "A TokenStream with multibyte characters" do
+  setup do
+    @e=0xc3.chr + 0xa9.chr
+    @tokenstream=Ariel::TokenStream.new
+    @tokenstream.tokenize "Would you like my r#{@e}sum#{@e}? Just wondering"
+  end
+  specify "Each token's start and end_loc should refer to the relevant slice of the original text" do
+    @tokenstream.tokens.each do |token|
+      @tokenstream.original_text[token.start_loc...token.end_loc].should_equal token.text
+    end
+  end
+end

metadata CHANGED

@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
 specification_version: 1
 name: ariel
 version: !ruby/object:Gem::Version
-  version: 0.0.1
-date: 2006-08-09 00:00:00 +01:00
+  version: 0.1.0
+date: 2006-08-22 00:00:00 +01:00
 summary: A Ruby Information Extraction Library
 require_paths:
 - lib
@@ -30,33 +30,33 @@ authors:
 files:
 - lib/ariel
 - lib/ariel.rb
-- lib/ariel/extracted_node.rb
+- lib/ariel/node
 - lib/ariel/learner.rb
-- lib/ariel/example_document_loader.rb
 - lib/ariel/rule_set.rb
+- lib/ariel/labeled_document_loader.rb
 - lib/ariel/rule.rb
 - lib/ariel/wildcards.rb
 - lib/ariel/token_stream.rb
+- lib/ariel/log.rb
+- lib/ariel/node.rb
 - lib/ariel/label_utils.rb
-- lib/ariel/structure_node.rb
 - lib/ariel/token.rb
-- lib/ariel/candidate_selector.rb
-- lib/ariel/node_like.rb
-- test/test_learner.rb
+- lib/ariel/candidate_refiner.rb
+- lib/ariel/node/structure.rb
+- lib/ariel/node/extracted.rb
 - test/specs
-- test/test_rule.rb
-- test/ariel_test_case.rb
 - test/fixtures.rb
-- test/test_token_stream.rb
-- test/test_example_document_loader.rb
-- test/test_token.rb
-- test/test_structure_node.rb
-- test/test_label_utils.rb
-- test/test_candidate_selector.rb
-- test/test_wildcards.rb
+- test/specs/learner_spec.rb
+- test/specs/node_spec.rb
+- test/specs/candidate_refiner_spec.rb
+- test/specs/rule_spec.rb
 - test/specs/token_stream_spec.rb
 - test/specs/wildcards_spec.rb
+- test/specs/node_structure_spec.rb
+- test/specs/label_utils_spec.rb
+- test/specs/rule_set_spec.rb
 - test/specs/token_spec.rb
+- test/specs/node_extracted_spec.rb
 - README
 - LICENSE
 - examples/raa
@@ -78,18 +78,11 @@ files:
 - examples/google_calculator/unlabeled/1
 - examples/google_calculator/unlabeled/2
 - bin/ariel
-test_files:
-- test/test_learner.rb
-- test/test_rule.rb
-- test/test_token_stream.rb
-- test/test_example_document_loader.rb
-- test/test_token.rb
-- test/test_structure_node.rb
-- test/test_label_utils.rb
-- test/test_candidate_selector.rb
-- test/test_wildcards.rb
-rdoc_options: []
+test_files: []
+rdoc_options:
+- --main
+- README
 extra_rdoc_files:
 - README
 - LICENSE

data/lib/ariel/example_document_loader.rb DELETED

@@ -1,59 +0,0 @@
-module Ariel
-  # Provides methods that read an example document, using a StructureNode tree
-  # to populate a tree of Nodes with each labeled example.
-  # TODO: Fix the UTF issues this implementation is bound to create.
-  class ExampleDocumentLoader
-    # Assumes it is passed a root parent
-    def self.load_labeled_example(file, structure, loaded_example_hash)
-      raise ArgumentError, "Passed structure is not root parent" if structure.parent
-      string = file.respond_to?(:read) ? file.read : file
-      tokenstream = TokenStream.new
-      tokenstream.tokenize(string, true)
-      root = ExtractedNode.new(:root, tokenstream, structure)
-      structure.apply_extraction_tree_on(root, true)
-      root.each_descendant(true) do |extracted_node|
-        if extracted_node.parent
-          loaded_example_hash[extracted_node.meta.structure] << extracted_node
-        end
-        extracted_node.tokenstream.remove_label_tags
-      end
-      return loaded_example_hash
-    end
-    def self.supervise_learning(structure, loaded_example_hash)
-      loaded_example_hash.each_pair do |structure_node, example_nodes|
-        start_examples=[]
-        end_examples=[]
-        example_nodes.each do |node|
-          start_tstream=node.parent.tokenstream #Rules are based on extracting from the parent
-          start_tstream.set_label_at(node.tokenstream.tokens.first.start_loc)
-          start_examples << start_tstream
-          end_tstream=node.parent.tokenstream.reverse
-          end_tstream.set_label_at(node.tokenstream.tokens.last.start_loc)
-          end_examples << end_tstream
-        end
-        learner = Learner.new(*start_examples)
-        start_rules = learner.learn_rule :forward
-        learner = Learner.new(*end_examples)
-        end_rules = learner.learn_rule :back
-        structure_node.ruleset=RuleSet.new(start_rules, end_rules)
-      end
-    end
-    def self.load_directory(dir, structure)
-      loaded_example_hash = Hash.new {|h, k| h[k]=[]}
-      Dir.glob("#{dir}/*") do |doc|
-        next if doc=~ /structure\.rb\z/
-        File.open(doc) do |file|
-          self.load_labeled_example(file, structure, loaded_example_hash)
-        end
-      end
-      self.supervise_learning structure, loaded_example_hash
-      return structure
-    end
-  end
-end

data/lib/ariel/extracted_node.rb DELETED

@@ -1,20 +0,0 @@
-module Ariel
-  require 'ostruct'
-  # Each ExtractedNode has a name, a tokenstream and a structure which points to
-  # the relevant StructureNode.
-  class ExtractedNode
-    include NodeLike
-    attr_accessor :tokenstream
-    def initialize(name, tokenstream, structure)
-      @children={}
-      @meta = OpenStruct.new({:name=>name, :structure=>structure})
-      @tokenstream=tokenstream
-    end
-    def extracted_text
-      tokenstream.text
-    end
-  end
-end

data/lib/ariel/node_like.rb DELETED

@@ -1,26 +0,0 @@
-module Ariel
-  module NodeLike
-    attr_accessor :parent, :children, :meta
-    # Given a Node object and a name, adds a child to the array of children,
-    # setting its parent as the current node, as well as creating an accessor
-    # method matching that name.
-    def add_child(node)
-      @children[node.meta.name]=node
-      node.parent = self
-    end
-    def each_descendant(include_self=false)
-      if include_self
-        node_queue=[self]
-      else
-        node_queue=self.children.values
-      end
-      until node_queue.empty? do
-        node_queue.concat node_queue.first.children.values
-        yield node_queue.shift
-      end
-    end
-  end
-end

data/lib/ariel/structure_node.rb DELETED

@@ -1,75 +0,0 @@
-module Ariel
-  require 'ostruct'
-  # Implements a Node object used to represent the structure of the document
-  # tree. Each node stores start and end rules to extract the desired content
-  # from its parent node. Could be viewed as a rule-storing object.
-  class StructureNode
-    include NodeLike
-    attr_accessor :ruleset
-    def initialize(name=:root, type=:not_list, &block)
-      @children={}
-      @meta = OpenStruct.new({:name=>name, :node_type=>type})
-      yield self if block_given?
-    end
-    # Used to extend an already created Node. e.g.
-    #  node.extend_structure do |r|
-    #    r.new_field1
-    #    r.new_field2
-    #  end
-    def extend_structure(&block)
-      yield self if block_given?
-    end
-    # Given a Node to apply it's rules to, this function will create a new node
-    # and add it as a child of the given node. For StructureNodes of :list type,
-    # the list is extracted and so are each of the list items. In this case,
-    # only the list items are yielded.
-    def extract_from(node)
-      # Will be reimplemented to return an array of extracted items
-      newstream = @ruleset.apply_to(node.tokenstream)
-      extracted_node = ExtractedNode.new(meta.name, newstream, self)
-      node.add_child extracted_node if newstream
-      if self.meta.node_type == :list
-        #Do stuff
-      end
-      return extracted_node
-    end
-    # Applies the extraction rules stored in the current StructureNode and all its
-    # descendant children.
-    def apply_extraction_tree_on(root_node, extract_labels=false)
-      extraction_queue = [root_node]
-      until extraction_queue.empty? do
-        new_parent = extraction_queue.shift
-        new_parent.meta.structure.children.values.each do |child|
-          if extract_labels
-            extracted_node=LabelUtils.extract_labeled_region(child, new_parent)
-          else
-            extracted_node=child.extract_from(new_parent)
-          end
-          extraction_queue.push(extracted_node) if extracted_node
-        end
-      end
-      return root_node
-    end
-    def item(name, &block)
-      self.add_child(StructureNode.new(name, &block))
-    end
-    def list_item(name, &block)
-      self.add_child(StructureNode.new(name, :list, &block))
-    end
-    def method_missing(method, *args, &block)
-      if @children.has_key? method
-        @children[method]
-      else
-        super
-      end
-    end
-  end
-end

data/test/ariel_test_case.rb DELETED

@@ -1,15 +0,0 @@
-require 'test/unit'
-require 'fixtures'
-module Ariel
-  include Fixtures
-  class TestCase < Test::Unit::TestCase
-    def run(result)
-      debug "Running #{self.class.name}##{method_name}" unless method_name.to_s=="default_test"
-      super
-    end
-    def default_test
-    end
-  end
-end

data/test/test_candidate_selector.rb DELETED

@@ -1,58 +0,0 @@
-require 'ariel'
-require 'ariel_test_case'
-class TestCandidateSelector < Ariel::TestCase
-  include Fixtures
-  def setup
-    # Must get rid of this repetition, should be available to all tests
-    @e=@@labeled_addresses
-    @candidates=[]
-    @candidates << Ariel::Rule.new(:forward, [[:anything]])
-    @candidates << Ariel::Rule.new(:forward, [[:numeric], [:numeric], [:numeric]])
-    @candidates << Ariel::Rule.new(:forward, [["("]])
-    @candidates << Ariel::Rule.new(:forward, [[:numeric, :alpha_numeric]])
-    @selector=Ariel::CandidateSelector.new(@candidates, @e)
-  end
-  def test_score_by
-    score_hash = @selector.score_by {|rule| rule.landmarks.size}
-    assert_equal @candidates.size, score_hash.size
-    assert_equal 1, score_hash.values.sort.first
-  end
-  def test_highest_scoring_by
-    t1 = @selector.highest_scoring_by {|rule| 1}
-    assert (t1.all? {|rule| rule.kind_of? Ariel::Rule})
-    assert_equal @candidates.size, t1.size
-    t2 = @selector.highest_scoring_by {|rule| rule.landmarks.size}
-    assert_equal 1, t2.size
-  end
-  def test_select_best_by_match_type
-    @selector.select_best_by_match_type :fail, :early, :late, :perfect
-    assert_equal @candidates, @selector.candidates
-    @selector.select_best_by_match_type :late
-    assert_equal 1, @selector.candidates.size
-    assert_equal @candidates[1], @selector.candidates[0]
-  end
-  def test_select_with_fewer_wildcards
-    assert_equal @selector.select_with_fewer_wildcards[0], @candidates[2]
-    assert_equal 1, @selector.candidates.size
-  end
-  def test_select_closest_to_label
-    assert_equal @candidates[2], @selector.select_closest_to_label[0]
-    assert_equal 1, @selector.candidates.size
-  end
-  def test_select_with_longer_landmarks
-    assert_equal @candidates[3], @selector.select_with_longer_end_landmarks[0]
-    assert_equal 1, @selector.candidates.size
-  end
-  def test_random_from_remaining
-    assert(@candidates.include?(@selector.random_from_remaining))
-  end
-end