ariel 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. data/README +49 -83
  2. data/bin/ariel +29 -20
  3. data/examples/google_calculator/structure.rb +2 -2
  4. data/examples/google_calculator/structure.yaml +13 -15
  5. data/examples/raa/labeled/highline.html +5 -4
  6. data/examples/raa/labeled/mongrel.html +9 -8
  7. data/examples/raa/structure.rb +4 -2
  8. data/examples/raa/structure.yaml +94 -78
  9. data/lib/ariel.rb +71 -33
  10. data/lib/ariel/{candidate_selector.rb → candidate_refiner.rb} +39 -38
  11. data/lib/ariel/label_utils.rb +46 -18
  12. data/lib/ariel/labeled_document_loader.rb +77 -0
  13. data/lib/ariel/learner.rb +60 -38
  14. data/lib/ariel/log.rb +67 -0
  15. data/lib/ariel/node.rb +52 -0
  16. data/lib/ariel/node/extracted.rb +90 -0
  17. data/lib/ariel/node/structure.rb +91 -0
  18. data/lib/ariel/rule.rb +114 -32
  19. data/lib/ariel/rule_set.rb +34 -15
  20. data/lib/ariel/token.rb +9 -3
  21. data/lib/ariel/token_stream.rb +32 -17
  22. data/lib/ariel/wildcards.rb +19 -15
  23. data/test/fixtures.rb +45 -3
  24. data/test/specs/candidate_refiner_spec.rb +48 -0
  25. data/test/specs/label_utils_spec.rb +97 -0
  26. data/test/specs/learner_spec.rb +39 -0
  27. data/test/specs/node_extracted_spec.rb +90 -0
  28. data/test/specs/node_spec.rb +76 -0
  29. data/test/specs/node_structure_spec.rb +74 -0
  30. data/test/specs/rule_set_spec.rb +85 -0
  31. data/test/specs/rule_spec.rb +110 -0
  32. data/test/specs/token_stream_spec.rb +100 -7
  33. metadata +21 -28
  34. data/lib/ariel/example_document_loader.rb +0 -59
  35. data/lib/ariel/extracted_node.rb +0 -20
  36. data/lib/ariel/node_like.rb +0 -26
  37. data/lib/ariel/structure_node.rb +0 -75
  38. data/test/ariel_test_case.rb +0 -15
  39. data/test/test_candidate_selector.rb +0 -58
  40. data/test/test_example_document_loader.rb +0 -7
  41. data/test/test_label_utils.rb +0 -15
  42. data/test/test_learner.rb +0 -38
  43. data/test/test_rule.rb +0 -38
  44. data/test/test_structure_node.rb +0 -81
  45. data/test/test_token.rb +0 -16
  46. data/test/test_token_stream.rb +0 -82
  47. data/test/test_wildcards.rb +0 -18
@@ -0,0 +1,39 @@
1
+ require 'ariel'
2
+ require 'fixtures'
3
+
4
+ include Fixtures
5
+
6
+ context "A non-exhaustive forward rule learner" do
7
+ setup do
8
+ @learner=Ariel::Learner.new(*@@labeled_addresses)
9
+ end
10
+
11
+ specify "set_seed should choose the example with the smallest number of tokens before the label" do
12
+ @learner.set_seed.should_equal @@labeled_addresses[1]
13
+ end
14
+
15
+ specify "A seed should be set when a Learner instance is initialized and made accessible through #current_seed" do
16
+ @learner.current_seed.should_equal @@labeled_addresses[1]
17
+ end
18
+
19
+ specify "generate_initial_candidates should generate rule candidates based on the token before the label in the current_seed" do
20
+ @learner.direction=:forward
21
+ @learner.generate_initial_candidates
22
+ c=@learner.candidates
23
+ c.should_include Ariel::Rule.new([["("]], :forward)
24
+ c.should_include Ariel::Rule.new([[:anything]], :forward)
25
+ c.should_include Ariel::Rule.new([[:punctuation]], :forward)
26
+ end
27
+
28
+ specify "learn_rule should return an array of the Rule's learnt" do
29
+ rules=@learner.learn_rule :forward
30
+ rules.should_be_a_kind_of Array
31
+ rules.should_not_be_empty
32
+ end
33
+
34
+ specify "generated rules should be of a :forward type and non-exhaustive" do
35
+ rule=@learner.learn_rule(:forward).first
36
+ rule.direction.should_equal :forward
37
+ rule.should_not_be_exhaustive
38
+ end
39
+ end
@@ -0,0 +1,90 @@
1
+ require 'ariel'
2
+
3
+ context "A new Node::Extracted" do
4
+ setup do
5
+ @tokenstream=Ariel::TokenStream.new
6
+ @tokenstream.tokenize "This is a test"
7
+ @structure=Ariel::Node::Structure.new :test
8
+ @node=Ariel::Node::Extracted.new :test, @tokenstream, @structure
9
+ end
10
+
11
+ specify "Should provide an accessor for its tokenstream" do
12
+ @node.tokenstream.should_equal @tokenstream
13
+ end
14
+
15
+ specify "Should provide an accessor, structure_node for its structure node" do
16
+ @node.structure_node.should_equal @structure
17
+ end
18
+
19
+ specify "Should provide a method extracted_text to show the content of the tokenstream" do
20
+ @node.extracted_text.should_equal @tokenstream.text
21
+ end
22
+ end
23
+
24
+ context "A Node::Extracted with children" do
25
+ setup do
26
+ @tokenstream=Ariel::TokenStream.new
27
+ @tokenstream.tokenize "This is a dummy"
28
+ @structure=@@labeled_document_with_list_structure
29
+ @root=Ariel::Node::Extracted.new :root, @tokenstream, @structure
30
+ [:title, :comment_list].each do |name|
31
+ @root.add_child(Ariel::Node::Extracted.new(name, @tokenstream, @structure.children[name]))
32
+ end
33
+ 0.upto 10 do |i|
34
+ @root.comment_list.add_child Ariel::Node::Extracted.new(i, @tokenstream, @structure.comment_list.comment)
35
+ end
36
+ end
37
+
38
+ specify "should provide access to the node's children through" do #[] can't be used in a spec name due to a bug
39
+ @root[:comment_list].node_name.should_equal :comment_list
40
+ @root.comment_list[2].node_name.should_equal 2
41
+ end
42
+
43
+ specify "square bracket method should return an array when a Range with one member is given" do
44
+ @root.comment_list[0..0].should_be_a_kind_of Array
45
+ end
46
+
47
+ specify "square bracket should return nil if no matches exist" do
48
+ @root[:monkey].should_be_nil
49
+ @root[:monkey, :heaven].should_be_nil
50
+ end
51
+
52
+ specify "square bracket should return those matches that do exist in an array even if some don't" do
53
+ result=@root[:monkey, :title]
54
+ result.size.should_equal 1
55
+ result[0].node_name.should_equal :title
56
+ end
57
+
58
+ specify "#search should return an array of any and all matches to the given query" do
59
+ result=(@root/'comment_list')
60
+ result.should_be_a_kind_of Array
61
+ result.size.should_equal 1
62
+ result[0].node_name.should_equal :comment_list
63
+ end
64
+
65
+ specify "#search should return an empty array on match failure" do
66
+ (@root/'monkey').should_equal []
67
+ end
68
+
69
+ specify "#search should accept * as a wildcard" do
70
+ @root.title.add_child Ariel::Node::Extracted.new(:test, @tokenstream, @structure)
71
+ result=(@root/'*/*').collect {|r| r.node_name}
72
+ (((0..10).to_a << :test) - result).should_equal []
73
+ end
74
+
75
+ specify "#search should return numbered children" do
76
+ result=(@root.comment_list/'0')
77
+ result.size.should_equal 1
78
+ result[0].node_name.should_equal 0
79
+ end
80
+
81
+ specify "#search should return sorted results when a wildcard is used" do
82
+ result=(@root/'comment_list/*').collect {|node| node.node_name}
83
+ result.should_equal ((0..10).to_a)
84
+ end
85
+
86
+ specify "#at should act like #search, but return only the first result" do
87
+ @root.at('comment_list/*').node_name.should_equal 0
88
+ end
89
+
90
+ end
@@ -0,0 +1,76 @@
1
+ require 'ariel'
2
+
3
+ context "A new Node" do
4
+ setup do
5
+ @node=Ariel::Node.new(:root)
6
+ end
7
+
8
+ specify "Should have an empty hash of children" do
9
+ @node.children.should_equal Hash.new
10
+ end
11
+
12
+ specify "Should give its node_name as a symbol" do
13
+ @node.node_name.should_equal :root
14
+ end
15
+
16
+ specify "Should not have a parent" do
17
+ @node.parent.should_be_nil
18
+ end
19
+
20
+ specify "Should yield nothing if each_descendant is called with no arguments" do
21
+ results=[]
22
+ @node.each_descendant {|child| results << child}
23
+ results.should_equal []
24
+ end
25
+
26
+ specify "Should yield only itself if each_descendant is called with true as its argument" do
27
+ results=[]
28
+ @node.each_descendant(true) {|child| results << child}
29
+ results.size.should_equal 1
30
+ results[0].should_equal @node
31
+ end
32
+
33
+ specify "Should not respond to :id or :type" do
34
+ @node.should_not_respond_to :id
35
+ @node.should_not_respond_to :type
36
+ end
37
+ end
38
+
39
+ context "Building a tree of Node objects" do
40
+ setup do
41
+ @root_parent=Ariel::Node.new :root
42
+ @child1=Ariel::Node.new :child1
43
+ @child2=Ariel::Node.new :child2
44
+ @child1_1=Ariel::Node.new :child1_1
45
+ end
46
+
47
+ specify "Adding a child should add an entry to the parent's children hash with the child's node_name as the key" do
48
+ @root_parent.add_child @child1
49
+ @root_parent.children[:child1].should_equal @child1
50
+ end
51
+
52
+ specify "When adding a node as a child, its parent should be set appropriately" do
53
+ @root_parent.add_child @child1
54
+ @child1.parent.should_equal @root_parent
55
+ @child1.add_child @child1_1
56
+ @child1_1.parent.should_equal @child1
57
+ end
58
+
59
+ specify "When adding a node as a child, an accessor method should be created in the parent with name corresponding to the child's node_name" do
60
+ @root_parent.add_child @child1
61
+ @root_parent.should_respond_to :child1
62
+ @root_parent.child1.should_equal @child1
63
+ end
64
+
65
+ specify "Should yield all children when iterating over each_descendant" do
66
+ @root_parent.add_child @child1
67
+ @root_parent.add_child @child2
68
+ @child1.add_child @child1_1
69
+ results=[]
70
+ @root_parent.each_descendant {|child| results << child}
71
+ results.size.should_equal 3
72
+ results.should_include @child1
73
+ results.should_include @child2
74
+ results.should_include @child1_1
75
+ end
76
+ end
@@ -0,0 +1,74 @@
1
+ require 'ariel'
2
+ require 'fixtures'
3
+
4
+ include Fixtures
5
+
6
+ context "Creating a new Node::Structure tree" do
7
+ setup do
8
+ @node=Ariel::Node::Structure.new
9
+ end
10
+
11
+ specify "Should default to :root as the node_name" do
12
+ @node.node_name=:root
13
+ end
14
+
15
+ specify "Should be possible to create a node with node_type :list_item" do
16
+ list_node=Ariel::Node::Structure.new :comments, :list_item
17
+ list_node.node_type.should_equal :list_item
18
+ end
19
+
20
+ specify "Node::Structure#new should yield itself is a block is given" do
21
+ result=[]
22
+ new_node=Ariel::Node::Structure.new {|r| result << r}
23
+ result.should_equal [new_node]
24
+ end
25
+
26
+ specify "Node::Structure#item should create a new child with the given name and of a non-list node_type" do
27
+ @node.item :test
28
+ @node.children.keys.should_include :test
29
+ @node.test.should_be_an_instance_of Ariel::Node::Structure
30
+ @node.test.node_type.should_not_equal :list_item
31
+ end
32
+
33
+ specify "Node::Structure#list_item should create a new child with the given name and node_type :list_item" do
34
+ @node.list_item :list
35
+ @node.children.keys.should_include :list
36
+ @node.list.should_be_an_instance_of Ariel::Node::Structure
37
+ @node.list.node_type.should_equal :list_item
38
+ end
39
+
40
+ specify "Should be possible to define a tree by passing blocks to item and list_item" do
41
+ root=@@labeled_document_with_list_structure
42
+ root.title.parent.should_equal root
43
+ root.comment_list.comment.should_respond_to :author
44
+ end
45
+
46
+ specify "#extend_structure should allow new children to be added to an existing Node::Structure" do
47
+ @node.extend_structure {|r| r.item :test1}
48
+ @node.extend_structure {|r| r.list_item :test2}
49
+ @node.children.keys.should_include :test1
50
+ @node.children.keys.should_include :test2
51
+ end
52
+ end
53
+
54
+ context "Applying a tree of Node::Structure objects to extract a tree of Node::Extracted objects, some children don't have rulesets defined" do
55
+ setup do
56
+ @structure_root=@@labeled_document_with_list_structure
57
+ @tokenstream=Ariel::TokenStream.new
58
+ @tokenstream.tokenize Ariel::LabelUtils.clean_string(@@labeled_document_with_list)
59
+ @extracted_root=Ariel::Node::Extracted.new :root, @tokenstream, @structure_root
60
+ end
61
+
62
+ specify "#extract_from should apply the ruleset in the given Node::Structure to extract and return an array new Node::Extracted" do
63
+ extractions=@structure_root.title.extract_from @extracted_root
64
+ extractions.size.should_equal 1
65
+ extractions[0].extracted_text.should_equal "Another example"
66
+ end
67
+
68
+ specify "#apply_extraction_tree_on should apply the RuleSet in every Node::Structure child and add all extracted children to the given Node::Extracted" do
69
+ @structure_root.apply_extraction_tree_on @extracted_root
70
+ [:title, :body, :comment_list].each {|node| @extracted_root.children.keys.should_include node}
71
+ @extracted_root.comment_list.children.size.should_equal 2
72
+ @extracted_root.comment_list.children[0].children.should_equal({})
73
+ end
74
+ end
@@ -0,0 +1,85 @@
1
+ require 'ariel'
2
+ require 'fixtures'
3
+
4
+ include Fixtures
5
+ context "A RuleSet of non-exhaustive rules" do
6
+ setup do
7
+ @frule1=Ariel::Rule.new [["1"]], :forward
8
+ @frule2=Ariel::Rule.new [["2"]], :forward
9
+ @brule1=Ariel::Rule.new [["a"]], :back
10
+ @brule2=Ariel::Rule.new [["b"]], :back
11
+ @ruleset=Ariel::RuleSet.new [@frule1, @frule2], [@brule1, @brule2]
12
+ @tokenstream=Ariel::TokenStream.new
13
+ @tokenstream.tokenize "This is a test. 1 Let's see 2. You know a? what about b"
14
+ end
15
+
16
+ specify "Should return a tokenstream in an array split at the position where the first forward and back rules match" do
17
+ result=@ruleset.apply_to @tokenstream
18
+ result.should_be_a_kind_of Array
19
+ result.size.should_equal 1
20
+ result[0].should_be_a_kind_of Ariel::TokenStream
21
+ result[0].tokens.first.text.should_equal "Let"
22
+ result[0].tokens.last.text.should_equal "know"
23
+ end
24
+
25
+ specify "Should use the next forward or back rule if the first doesn't match" do
26
+ @tokenstream2=Ariel::TokenStream.new
27
+ @tokenstream2.tokenize "Only 2 and b in here"
28
+ result=@ruleset.apply_to @tokenstream2
29
+ result.size.should_equal 1
30
+ result[0].tokens.first.text.should_equal "and"
31
+ result[0].tokens.last.text.should_equal "and"
32
+ end
33
+
34
+ specify "Should return an empty array if there are no matches" do
35
+ stream=Ariel::TokenStream.new
36
+ stream.tokenize "Will not match"
37
+ (@ruleset.apply_to stream).should_equal []
38
+ end
39
+ end
40
+
41
+ context "A RuleSet of exhaustive rules" do
42
+ setup do
43
+ @tokenstream=Ariel::TokenStream.new
44
+ @tokenstream.tokenize <<EOS
45
+ <li>Item number one</li>
46
+ <li>Item number two</li>
47
+ <li>Item number three</li>
48
+ EOS
49
+ @frule=Ariel::Rule.new [["<li>"]], :forward, true
50
+ @brule = Ariel::Rule.new [["</li>"]], :back, true
51
+ @ruleset=Ariel::RuleSet.new [@frule], [@brule]
52
+ end
53
+
54
+ specify "Should return an array of all matches found by applying the rules exhaustively" do
55
+ result=@ruleset.apply_to @tokenstream
56
+ result.size.should_equal 3
57
+ result.each {|tokenstream| tokenstream.tokens.first.text.should_equal "Item"}
58
+ end
59
+ end
60
+
61
+ context "A RuleSet of exhaustive rules" do
62
+ setup do
63
+ @tokenstream=Ariel::TokenStream.new
64
+ @tokenstream.tokenize @@unlabeled_restaurant_example
65
+ @tokenstream=@tokenstream.slice_by_token_index(12, (@tokenstream.tokens.size - 2))
66
+ @frule=Ariel::Rule.new [["<i>"]], :forward, true
67
+ @brule=Ariel::Rule.new [["</i>"]], :back, true
68
+ @ruleset=Ariel::RuleSet.new [@frule], [@brule]
69
+ end
70
+
71
+ specify "Should extract a tokenstream when the first end match is before the first start match" do
72
+ result=@ruleset.apply_to @tokenstream
73
+ result.size.should_equal 3
74
+ result[0].tokens.first.text.should_equal "4000"
75
+ result[1].tokens.first.text.should_equal "523"
76
+ result[2].tokens.first.text.should_equal "403"
77
+ end
78
+
79
+ specify "Should return an empty array if there are no matches" do
80
+ stream=Ariel::TokenStream.new
81
+ stream.tokenize "Will not match"
82
+ (@ruleset.apply_to stream).should_equal []
83
+ end
84
+ end
85
+
@@ -0,0 +1,110 @@
1
+ require 'ariel'
2
+ require 'fixtures'
3
+ include Fixtures
4
+
5
+ tokenstream=Ariel::TokenStream.new
6
+ tokenstream.tokenize @@unlabeled_document
7
+
8
+ labeled_tokenstream=Ariel::TokenStream.new
9
+ labeled_tokenstream.tokenize @@unlabeled_document
10
+ labeled_tokenstream.label_index=4
11
+
12
+ context "A forward rule with no landmarks" do
13
+ setup do
14
+ @rule=Ariel::Rule.new([], :forward)
15
+ end
16
+
17
+ specify "Should return its direction correctly" do
18
+ @rule.direction.should_equal :forward
19
+ end
20
+
21
+ specify "Should contain no wildcards" do
22
+ @rule.wildcard_count.should_equal 0
23
+ end
24
+
25
+ specify "Should match any tokenstream at index 0" do
26
+ @rule.apply_to(tokenstream).should_equal [0]
27
+ end
28
+
29
+ specify "Should not be exhaustive" do
30
+ @rule.should_not_be_exhaustive
31
+ end
32
+ end
33
+
34
+ context "A back rule with no landmarks" do
35
+ setup do
36
+ @rule=Ariel::Rule.new([], :back)
37
+ end
38
+
39
+ specify "Should match any tokenstream at its last token" do
40
+ match_loc=@rule.apply_to(tokenstream)
41
+ p match_loc
42
+ tokenstream.tokens[*@rule.apply_to(tokenstream)].should_equal tokenstream.tokens.last
43
+ end
44
+ end
45
+
46
+ context "Creating a new rule" do
47
+ specify "Should not be possible to create a rule with an invalid direction" do
48
+ lambda {Ariel::Rule.new([[:anything]], :upward)}.should_raise
49
+ end
50
+ end
51
+
52
+ context "Applying a non-exhaustive forward rule" do
53
+ setup do
54
+ @rule=Ariel::Rule.new [[:anything]], :forward
55
+ end
56
+ specify "apply_to should return an array of match locations" do
57
+ locs=@rule.apply_to(tokenstream)
58
+ locs.should_equal [1]
59
+ end
60
+
61
+ specify "apply_to should yield matchdata with type nil for tokenstreams with no label_index" do
62
+ @rule.apply_to(tokenstream) do |md|
63
+ md.type.should_be_nil
64
+ md.token_loc.should_equal 1
65
+ end
66
+ end
67
+
68
+ specify "apply_to should yield match data with type :early, :late, or :perfect for a labeled tokenstream" do
69
+ @rule.apply_to(labeled_tokenstream) do |md|
70
+ md.type.should_equal :early
71
+ end
72
+ late_rule=Ariel::Rule.new [["assess"]], :forward
73
+ late_rule.apply_to(labeled_tokenstream) do |md|
74
+ md.type.should_equal :late
75
+ end
76
+ perfect_rule = Ariel::Rule.new [["test"]], :forward
77
+ perfect_rule.apply_to(labeled_tokenstream) do |md|
78
+ md.type.should_equal :perfect
79
+ md.token_loc.should_equal 4
80
+ end
81
+ end
82
+
83
+ specify "matches should return true or false if applying the rule to a labeled tokenstream results in a match of one of the given types" do
84
+ @rule.matches(labeled_tokenstream, :late, :perfect, :fail).should_equal false
85
+ @rule.matches(labeled_tokenstream, :early, :late, :perfect, :fail).should_equal true
86
+ failed_rule=Ariel::Rule.new([["bacon"]], :forward)
87
+ failed_rule.matches(labeled_tokenstream, :fail).should_equal true
88
+ end
89
+
90
+ specify "apply_to should apply the rule in the right direction whether the tokenstream is reversed or not. The returned match should be from the left" do
91
+ @rule.apply_to(labeled_tokenstream).should_equal [1]
92
+ @rule.apply_to(labeled_tokenstream.reverse).should_equal [labeled_tokenstream.tokens.size-2]
93
+
94
+ end
95
+
96
+ end
97
+
98
+ context "Applying a non-exhaustive back rule" do
99
+
100
+ end
101
+
102
+ context "Applying an exhaustive rule" do
103
+ setup do
104
+ @rule = Ariel::Rule.new [[:html_tag]], :forward, true
105
+ end
106
+
107
+ specify "apply_to should return an array of multiple matches" do
108
+ @rule.apply_to(tokenstream).size.should_equal 4
109
+ end
110
+ end