ariel 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. data/README +49 -83
  2. data/bin/ariel +29 -20
  3. data/examples/google_calculator/structure.rb +2 -2
  4. data/examples/google_calculator/structure.yaml +13 -15
  5. data/examples/raa/labeled/highline.html +5 -4
  6. data/examples/raa/labeled/mongrel.html +9 -8
  7. data/examples/raa/structure.rb +4 -2
  8. data/examples/raa/structure.yaml +94 -78
  9. data/lib/ariel.rb +71 -33
  10. data/lib/ariel/{candidate_selector.rb → candidate_refiner.rb} +39 -38
  11. data/lib/ariel/label_utils.rb +46 -18
  12. data/lib/ariel/labeled_document_loader.rb +77 -0
  13. data/lib/ariel/learner.rb +60 -38
  14. data/lib/ariel/log.rb +67 -0
  15. data/lib/ariel/node.rb +52 -0
  16. data/lib/ariel/node/extracted.rb +90 -0
  17. data/lib/ariel/node/structure.rb +91 -0
  18. data/lib/ariel/rule.rb +114 -32
  19. data/lib/ariel/rule_set.rb +34 -15
  20. data/lib/ariel/token.rb +9 -3
  21. data/lib/ariel/token_stream.rb +32 -17
  22. data/lib/ariel/wildcards.rb +19 -15
  23. data/test/fixtures.rb +45 -3
  24. data/test/specs/candidate_refiner_spec.rb +48 -0
  25. data/test/specs/label_utils_spec.rb +97 -0
  26. data/test/specs/learner_spec.rb +39 -0
  27. data/test/specs/node_extracted_spec.rb +90 -0
  28. data/test/specs/node_spec.rb +76 -0
  29. data/test/specs/node_structure_spec.rb +74 -0
  30. data/test/specs/rule_set_spec.rb +85 -0
  31. data/test/specs/rule_spec.rb +110 -0
  32. data/test/specs/token_stream_spec.rb +100 -7
  33. metadata +21 -28
  34. data/lib/ariel/example_document_loader.rb +0 -59
  35. data/lib/ariel/extracted_node.rb +0 -20
  36. data/lib/ariel/node_like.rb +0 -26
  37. data/lib/ariel/structure_node.rb +0 -75
  38. data/test/ariel_test_case.rb +0 -15
  39. data/test/test_candidate_selector.rb +0 -58
  40. data/test/test_example_document_loader.rb +0 -7
  41. data/test/test_label_utils.rb +0 -15
  42. data/test/test_learner.rb +0 -38
  43. data/test/test_rule.rb +0 -38
  44. data/test/test_structure_node.rb +0 -81
  45. data/test/test_token.rb +0 -16
  46. data/test/test_token_stream.rb +0 -82
  47. data/test/test_wildcards.rb +0 -18
@@ -2,7 +2,7 @@ require 'ariel'
2
2
  require 'fixtures'
3
3
  include Fixtures
4
4
 
5
- context "A new TokenStream" do
5
+ context "A new, unlabeled TokenStream" do
6
6
  setup do
7
7
  @tokenstream = Ariel::TokenStream.new
8
8
  end
@@ -11,16 +11,11 @@ context "A new TokenStream" do
11
11
  @tokenstream.cur_pos.should_equal 0
12
12
  end
13
13
 
14
- specify "Should return an empty Array when tokens is called" do
15
- @tokenstream.tokens.should_be_a_kind_of Array
16
- @tokenstream.tokens.should_be_empty
17
- end
18
-
19
14
  specify "Should not contain any tokens" do
20
15
  @tokenstream.tokens.size.should_equal 0
21
16
  end
22
17
 
23
- specify "Should return an empty string went sent the message raw_text" do
18
+ specify "Should have an empty string as raw_text" do
24
19
  @tokenstream.raw_text.should_equal ""
25
20
  end
26
21
 
@@ -38,6 +33,104 @@ context "A new TokenStream" do
38
33
  specify "Should not be reversed" do
39
34
  @tokenstream.should_not_be_reversed
40
35
  end
36
+
37
+ specify "Should have a label_index of nil" do
38
+ @tokenstream.label_index.should_be_nil
39
+ end
40
+
41
+ specify "Should accept a string to be tokenized" do
42
+ lambda {@tokenstream.tokenize "This is a test"}.should_not_raise
43
+ end
44
+
45
+ specify "Should provide a skip_to method" do
46
+ @tokenstream.should_respond_to :skip_to
47
+ end
48
+
49
+ specify "Should not contain label tags" do
50
+ @tokenstream.contains_label_tags?.should_equal false
51
+ end
41
52
  end
42
53
 
54
+ context "A TokenStream instance which has tokenized unlabeled text" do
55
+ setup do
56
+ @tokenstream = Ariel::TokenStream.new
57
+ @tokenstream.tokenize "This is a test you know"
58
+ end
59
+
60
+ specify "Should not contain label tags" do
61
+ @tokenstream.contains_label_tags?.should_equal false
62
+ end
63
+
64
+ specify "Should return its original text" do
65
+ @tokenstream.raw_text.should_equal @tokenstream.original_text
66
+ @tokenstream.original_text.should_equal @tokenstream.text
67
+ end
68
+
69
+ specify "Should contain tokens that correctly identify their position in the original text" do
70
+ @tokenstream.each do |token|
71
+ token.text.should_equal @tokenstream.original_text[token.start_loc...token.end_loc]
72
+ end
73
+ end
74
+
75
+ specify "Should have its tokens in order" do
76
+ sorted=@tokenstream.tokens.sort_by {|token| token.start_loc}
77
+ sorted.should_equal @tokenstream.tokens
78
+ end
43
79
 
80
+ specify "Should advance its position when Enumerable methods are called" do
81
+ pos=0
82
+ @tokenstream.each do |token|
83
+ pos+=1
84
+ @tokenstream.cur_pos.should_equal pos
85
+ end
86
+ end
87
+
88
+ specify "Should make no changes when remove_label_tags is called" do
89
+ token_n = @tokenstream.tokens.size
90
+ @tokenstream.remove_label_tags
91
+ @tokenstream.tokens.size.should_equal token_n
92
+ end
93
+
94
+ specify "Should return its current_token" do
95
+ @tokenstream.cur_pos=2
96
+ @tokenstream.current_token.should_equal @tokenstream.tokens[2]
97
+ end
98
+
99
+ specify "Should return its current token and increment cur_pos by one when asked to advance" do
100
+ @tokenstream.cur_pos=3
101
+ @tokenstream.advance.should_equal @tokenstream.tokens[3]
102
+ @tokenstream.cur_pos.should_equal 4
103
+ end
104
+
105
+ specify "reverse should not modify the receiver" do
106
+ @tokenstream.reverse.should_not_equal @tokenstream
107
+ @tokenstream.reverse.tokens.should_not_equal @tokenstream.tokens
108
+ end
109
+
110
+ specify "reversed? should reflect whether the tokenstream is in a reversed state or not" do
111
+ @tokenstream.reverse.reversed?.should_equal true
112
+ @tokenstream.reverse!
113
+ @tokenstream.reversed?.should_equal true
114
+ @tokenstream.reverse!
115
+ @tokenstream.reversed?.should_equal false
116
+ end
117
+
118
+ specify "Should provide a method that will convert a given token index so it will refer to the same token if the stream were reversed" do
119
+ idx=@tokenstream.reverse_pos(2)
120
+ @tokenstream.reverse.tokens[idx].should_equal @tokenstream.tokens[2]
121
+ end
122
+ end
123
+
124
+ context "A TokenStream with multibyte characters" do
125
+ setup do
126
+ @e=0xc3.chr + 0xa9.chr
127
+ @tokenstream=Ariel::TokenStream.new
128
+ @tokenstream.tokenize "Would you like my r#{@e}sum#{@e}? Just wondering"
129
+ end
130
+
131
+ specify "Each token's start and end_loc should refer to the relevant slice of the original text" do
132
+ @tokenstream.tokens.each do |token|
133
+ @tokenstream.original_text[token.start_loc...token.end_loc].should_equal token.text
134
+ end
135
+ end
136
+ end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
3
3
  specification_version: 1
4
4
  name: ariel
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.0.1
7
- date: 2006-08-09 00:00:00 +01:00
6
+ version: 0.1.0
7
+ date: 2006-08-22 00:00:00 +01:00
8
8
  summary: A Ruby Information Extraction Library
9
9
  require_paths:
10
10
  - lib
@@ -30,33 +30,33 @@ authors:
30
30
  files:
31
31
  - lib/ariel
32
32
  - lib/ariel.rb
33
- - lib/ariel/extracted_node.rb
33
+ - lib/ariel/node
34
34
  - lib/ariel/learner.rb
35
- - lib/ariel/example_document_loader.rb
36
35
  - lib/ariel/rule_set.rb
36
+ - lib/ariel/labeled_document_loader.rb
37
37
  - lib/ariel/rule.rb
38
38
  - lib/ariel/wildcards.rb
39
39
  - lib/ariel/token_stream.rb
40
+ - lib/ariel/log.rb
41
+ - lib/ariel/node.rb
40
42
  - lib/ariel/label_utils.rb
41
- - lib/ariel/structure_node.rb
42
43
  - lib/ariel/token.rb
43
- - lib/ariel/candidate_selector.rb
44
- - lib/ariel/node_like.rb
45
- - test/test_learner.rb
44
+ - lib/ariel/candidate_refiner.rb
45
+ - lib/ariel/node/structure.rb
46
+ - lib/ariel/node/extracted.rb
46
47
  - test/specs
47
- - test/test_rule.rb
48
- - test/ariel_test_case.rb
49
48
  - test/fixtures.rb
50
- - test/test_token_stream.rb
51
- - test/test_example_document_loader.rb
52
- - test/test_token.rb
53
- - test/test_structure_node.rb
54
- - test/test_label_utils.rb
55
- - test/test_candidate_selector.rb
56
- - test/test_wildcards.rb
49
+ - test/specs/learner_spec.rb
50
+ - test/specs/node_spec.rb
51
+ - test/specs/candidate_refiner_spec.rb
52
+ - test/specs/rule_spec.rb
57
53
  - test/specs/token_stream_spec.rb
58
54
  - test/specs/wildcards_spec.rb
55
+ - test/specs/node_structure_spec.rb
56
+ - test/specs/label_utils_spec.rb
57
+ - test/specs/rule_set_spec.rb
59
58
  - test/specs/token_spec.rb
59
+ - test/specs/node_extracted_spec.rb
60
60
  - README
61
61
  - LICENSE
62
62
  - examples/raa
@@ -78,18 +78,11 @@ files:
78
78
  - examples/google_calculator/unlabeled/1
79
79
  - examples/google_calculator/unlabeled/2
80
80
  - bin/ariel
81
- test_files:
82
- - test/test_learner.rb
83
- - test/test_rule.rb
84
- - test/test_token_stream.rb
85
- - test/test_example_document_loader.rb
86
- - test/test_token.rb
87
- - test/test_structure_node.rb
88
- - test/test_label_utils.rb
89
- - test/test_candidate_selector.rb
90
- - test/test_wildcards.rb
91
- rdoc_options: []
81
+ test_files: []
92
82
 
83
+ rdoc_options:
84
+ - --main
85
+ - README
93
86
  extra_rdoc_files:
94
87
  - README
95
88
  - LICENSE
@@ -1,59 +0,0 @@
1
- module Ariel
2
-
3
- # Provides methods that read an example document, using a StructureNode tree
4
- # to populate a tree of Nodes with each labeled example.
5
- # TODO: Fix the UTF issues this implementation is bound to create.
6
- class ExampleDocumentLoader
7
-
8
- # Assumes it is passed a root parent
9
- def self.load_labeled_example(file, structure, loaded_example_hash)
10
- raise ArgumentError, "Passed structure is not root parent" if structure.parent
11
- string = file.respond_to?(:read) ? file.read : file
12
- tokenstream = TokenStream.new
13
- tokenstream.tokenize(string, true)
14
- root = ExtractedNode.new(:root, tokenstream, structure)
15
- structure.apply_extraction_tree_on(root, true)
16
- root.each_descendant(true) do |extracted_node|
17
- if extracted_node.parent
18
- loaded_example_hash[extracted_node.meta.structure] << extracted_node
19
- end
20
- extracted_node.tokenstream.remove_label_tags
21
- end
22
- return loaded_example_hash
23
- end
24
-
25
- def self.supervise_learning(structure, loaded_example_hash)
26
- loaded_example_hash.each_pair do |structure_node, example_nodes|
27
- start_examples=[]
28
- end_examples=[]
29
- example_nodes.each do |node|
30
- start_tstream=node.parent.tokenstream #Rules are based on extracting from the parent
31
- start_tstream.set_label_at(node.tokenstream.tokens.first.start_loc)
32
- start_examples << start_tstream
33
- end_tstream=node.parent.tokenstream.reverse
34
- end_tstream.set_label_at(node.tokenstream.tokens.last.start_loc)
35
- end_examples << end_tstream
36
- end
37
- learner = Learner.new(*start_examples)
38
- start_rules = learner.learn_rule :forward
39
- learner = Learner.new(*end_examples)
40
- end_rules = learner.learn_rule :back
41
- structure_node.ruleset=RuleSet.new(start_rules, end_rules)
42
- end
43
- end
44
-
45
- def self.load_directory(dir, structure)
46
- loaded_example_hash = Hash.new {|h, k| h[k]=[]}
47
- Dir.glob("#{dir}/*") do |doc|
48
- next if doc=~ /structure\.rb\z/
49
- File.open(doc) do |file|
50
- self.load_labeled_example(file, structure, loaded_example_hash)
51
- end
52
- end
53
- self.supervise_learning structure, loaded_example_hash
54
- return structure
55
- end
56
-
57
-
58
- end
59
- end
@@ -1,20 +0,0 @@
1
- module Ariel
2
- require 'ostruct'
3
-
4
- # Each ExtractedNode has a name, a tokenstream and a structure which points to
5
- # the relevant StructureNode.
6
- class ExtractedNode
7
- include NodeLike
8
- attr_accessor :tokenstream
9
-
10
- def initialize(name, tokenstream, structure)
11
- @children={}
12
- @meta = OpenStruct.new({:name=>name, :structure=>structure})
13
- @tokenstream=tokenstream
14
- end
15
-
16
- def extracted_text
17
- tokenstream.text
18
- end
19
- end
20
- end
@@ -1,26 +0,0 @@
1
- module Ariel
2
-
3
- module NodeLike
4
- attr_accessor :parent, :children, :meta
5
-
6
- # Given a Node object and a name, adds a child to the array of children,
7
- # setting its parent as the current node, as well as creating an accessor
8
- # method matching that name.
9
- def add_child(node)
10
- @children[node.meta.name]=node
11
- node.parent = self
12
- end
13
-
14
- def each_descendant(include_self=false)
15
- if include_self
16
- node_queue=[self]
17
- else
18
- node_queue=self.children.values
19
- end
20
- until node_queue.empty? do
21
- node_queue.concat node_queue.first.children.values
22
- yield node_queue.shift
23
- end
24
- end
25
- end
26
- end
@@ -1,75 +0,0 @@
1
- module Ariel
2
- require 'ostruct'
3
-
4
- # Implements a Node object used to represent the structure of the document
5
- # tree. Each node stores start and end rules to extract the desired content
6
- # from its parent node. Could be viewed as a rule-storing object.
7
- class StructureNode
8
- include NodeLike
9
- attr_accessor :ruleset
10
- def initialize(name=:root, type=:not_list, &block)
11
- @children={}
12
- @meta = OpenStruct.new({:name=>name, :node_type=>type})
13
- yield self if block_given?
14
- end
15
-
16
- # Used to extend an already created Node. e.g.
17
- # node.extend_structure do |r|
18
- # r.new_field1
19
- # r.new_field2
20
- # end
21
- def extend_structure(&block)
22
- yield self if block_given?
23
- end
24
-
25
- # Given a Node to apply it's rules to, this function will create a new node
26
- # and add it as a child of the given node. For StructureNodes of :list type,
27
- # the list is extracted and so are each of the list items. In this case,
28
- # only the list items are yielded.
29
- def extract_from(node)
30
- # Will be reimplemented to return an array of extracted items
31
- newstream = @ruleset.apply_to(node.tokenstream)
32
- extracted_node = ExtractedNode.new(meta.name, newstream, self)
33
- node.add_child extracted_node if newstream
34
- if self.meta.node_type == :list
35
- #Do stuff
36
- end
37
- return extracted_node
38
- end
39
-
40
- # Applies the extraction rules stored in the current StructureNode and all its
41
- # descendant children.
42
- def apply_extraction_tree_on(root_node, extract_labels=false)
43
- extraction_queue = [root_node]
44
- until extraction_queue.empty? do
45
- new_parent = extraction_queue.shift
46
- new_parent.meta.structure.children.values.each do |child|
47
- if extract_labels
48
- extracted_node=LabelUtils.extract_labeled_region(child, new_parent)
49
- else
50
- extracted_node=child.extract_from(new_parent)
51
- end
52
- extraction_queue.push(extracted_node) if extracted_node
53
- end
54
- end
55
- return root_node
56
- end
57
-
58
- def item(name, &block)
59
- self.add_child(StructureNode.new(name, &block))
60
- end
61
-
62
- def list_item(name, &block)
63
- self.add_child(StructureNode.new(name, :list, &block))
64
- end
65
-
66
- def method_missing(method, *args, &block)
67
- if @children.has_key? method
68
- @children[method]
69
- else
70
- super
71
- end
72
- end
73
- end
74
- end
75
-
@@ -1,15 +0,0 @@
1
- require 'test/unit'
2
- require 'fixtures'
3
-
4
- module Ariel
5
- include Fixtures
6
- class TestCase < Test::Unit::TestCase
7
- def run(result)
8
- debug "Running #{self.class.name}##{method_name}" unless method_name.to_s=="default_test"
9
- super
10
- end
11
-
12
- def default_test
13
- end
14
- end
15
- end
@@ -1,58 +0,0 @@
1
- require 'ariel'
2
- require 'ariel_test_case'
3
-
4
-
5
- class TestCandidateSelector < Ariel::TestCase
6
- include Fixtures
7
- def setup
8
- # Must get rid of this repetition, should be available to all tests
9
- @e=@@labeled_addresses
10
- @candidates=[]
11
- @candidates << Ariel::Rule.new(:forward, [[:anything]])
12
- @candidates << Ariel::Rule.new(:forward, [[:numeric], [:numeric], [:numeric]])
13
- @candidates << Ariel::Rule.new(:forward, [["("]])
14
- @candidates << Ariel::Rule.new(:forward, [[:numeric, :alpha_numeric]])
15
- @selector=Ariel::CandidateSelector.new(@candidates, @e)
16
- end
17
-
18
- def test_score_by
19
- score_hash = @selector.score_by {|rule| rule.landmarks.size}
20
- assert_equal @candidates.size, score_hash.size
21
- assert_equal 1, score_hash.values.sort.first
22
- end
23
-
24
- def test_highest_scoring_by
25
- t1 = @selector.highest_scoring_by {|rule| 1}
26
- assert (t1.all? {|rule| rule.kind_of? Ariel::Rule})
27
- assert_equal @candidates.size, t1.size
28
- t2 = @selector.highest_scoring_by {|rule| rule.landmarks.size}
29
- assert_equal 1, t2.size
30
- end
31
-
32
- def test_select_best_by_match_type
33
- @selector.select_best_by_match_type :fail, :early, :late, :perfect
34
- assert_equal @candidates, @selector.candidates
35
- @selector.select_best_by_match_type :late
36
- assert_equal 1, @selector.candidates.size
37
- assert_equal @candidates[1], @selector.candidates[0]
38
- end
39
-
40
- def test_select_with_fewer_wildcards
41
- assert_equal @selector.select_with_fewer_wildcards[0], @candidates[2]
42
- assert_equal 1, @selector.candidates.size
43
- end
44
-
45
- def test_select_closest_to_label
46
- assert_equal @candidates[2], @selector.select_closest_to_label[0]
47
- assert_equal 1, @selector.candidates.size
48
- end
49
-
50
- def test_select_with_longer_landmarks
51
- assert_equal @candidates[3], @selector.select_with_longer_end_landmarks[0]
52
- assert_equal 1, @selector.candidates.size
53
- end
54
-
55
- def test_random_from_remaining
56
- assert(@candidates.include?(@selector.random_from_remaining))
57
- end
58
- end