ariel 0.0.1 → 0.1.0

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (47)
  1. data/README +49 -83
  2. data/bin/ariel +29 -20
  3. data/examples/google_calculator/structure.rb +2 -2
  4. data/examples/google_calculator/structure.yaml +13 -15
  5. data/examples/raa/labeled/highline.html +5 -4
  6. data/examples/raa/labeled/mongrel.html +9 -8
  7. data/examples/raa/structure.rb +4 -2
  8. data/examples/raa/structure.yaml +94 -78
  9. data/lib/ariel.rb +71 -33
  10. data/lib/ariel/{candidate_selector.rb → candidate_refiner.rb} +39 -38
  11. data/lib/ariel/label_utils.rb +46 -18
  12. data/lib/ariel/labeled_document_loader.rb +77 -0
  13. data/lib/ariel/learner.rb +60 -38
  14. data/lib/ariel/log.rb +67 -0
  15. data/lib/ariel/node.rb +52 -0
  16. data/lib/ariel/node/extracted.rb +90 -0
  17. data/lib/ariel/node/structure.rb +91 -0
  18. data/lib/ariel/rule.rb +114 -32
  19. data/lib/ariel/rule_set.rb +34 -15
  20. data/lib/ariel/token.rb +9 -3
  21. data/lib/ariel/token_stream.rb +32 -17
  22. data/lib/ariel/wildcards.rb +19 -15
  23. data/test/fixtures.rb +45 -3
  24. data/test/specs/candidate_refiner_spec.rb +48 -0
  25. data/test/specs/label_utils_spec.rb +97 -0
  26. data/test/specs/learner_spec.rb +39 -0
  27. data/test/specs/node_extracted_spec.rb +90 -0
  28. data/test/specs/node_spec.rb +76 -0
  29. data/test/specs/node_structure_spec.rb +74 -0
  30. data/test/specs/rule_set_spec.rb +85 -0
  31. data/test/specs/rule_spec.rb +110 -0
  32. data/test/specs/token_stream_spec.rb +100 -7
  33. metadata +21 -28
  34. data/lib/ariel/example_document_loader.rb +0 -59
  35. data/lib/ariel/extracted_node.rb +0 -20
  36. data/lib/ariel/node_like.rb +0 -26
  37. data/lib/ariel/structure_node.rb +0 -75
  38. data/test/ariel_test_case.rb +0 -15
  39. data/test/test_candidate_selector.rb +0 -58
  40. data/test/test_example_document_loader.rb +0 -7
  41. data/test/test_label_utils.rb +0 -15
  42. data/test/test_learner.rb +0 -38
  43. data/test/test_rule.rb +0 -38
  44. data/test/test_structure_node.rb +0 -81
  45. data/test/test_token.rb +0 -16
  46. data/test/test_token_stream.rb +0 -82
  47. data/test/test_wildcards.rb +0 -18
@@ -1,7 +0,0 @@
1
- require 'ariel'
2
- require 'ariel_test_case'
3
-
4
- class TestExampleDocumentLoader < Ariel::TestCase
5
- include Fixtures
6
-
7
- end
@@ -1,15 +0,0 @@
1
- require 'ariel'
2
- require 'ariel_test_case'
3
-
4
- class TestLabelUtils < Ariel::TestCase
5
- include Fixtures
6
-
7
- def test_label_regex
8
- assert_equal 2, Ariel::LabelUtils.label_regex.uniq.size
9
- assert_kind_of Regexp, Ariel::LabelUtils.label_regex[0]
10
- end
11
-
12
- def test_clean_string
13
- assert_equal @@unlabeled_document, Ariel::LabelUtils.clean_string(@@labeled_document)
14
- end
15
- end
@@ -1,38 +0,0 @@
1
- require 'ariel'
2
- require 'ariel_test_case'
3
-
4
- class TestLearner < Ariel::TestCase
5
- include Fixtures
6
-
7
- def setup
8
- #Examples stolen from the STALKER paper. Target to extract is the area
9
- #codes.
10
- @e=@@labeled_addresses
11
- @learner=Ariel::Learner.new(*@e)
12
- end
13
-
14
- def test_set_seed
15
- assert_equal @e[1], @learner.current_seed # LabeledStream with smallest label_index
16
- end
17
-
18
- def test_generate_initial_candidates
19
- @learner.direction=:forward
20
- @learner.generate_initial_candidates
21
- c=@learner.candidates
22
- assert (c.include? Ariel::Rule.new(:forward, [["("]]))
23
- assert (c.include? Ariel::Rule.new(:forward, [[:anything]]))
24
- assert (c.include? Ariel::Rule.new(:forward, [[:punctuation]]))
25
- end
26
-
27
- def test_refine
28
- @learner.current_rule=Ariel::Rule.new(:forward, [["<b>"]])
29
- assert @learner.refine
30
- @learner.current_rule=Ariel::Rule.new(:forward, [["<b>", "Palms"], ["Phone"]])
31
- assert @learner.refine
32
- end
33
-
34
- def test_learn_rule
35
- rule=@learner.learn_rule :forward
36
- p rule
37
- end
38
- end
@@ -1,38 +0,0 @@
1
- require 'ariel'
2
- require 'ariel_test_case'
3
-
4
- class TestRule < Ariel::TestCase
5
- def setup
6
- @labeled=Ariel::TokenStream.new
7
- @labeled.tokenize("90 Colfax, <b> Palms </b>, Phone: (818) 508-1570")
8
- @labeled.set_label_at 35
9
- @perfect_rule=Ariel::Rule.new(:forward, [["Phone"], ["("]])
10
- @early_rule=Ariel::Rule.new(:forward, [[:anything]])
11
- @late_rule=Ariel::Rule.new(:forward, [["508"]])
12
- @unlabeled=Ariel::TokenStream.new
13
- @unlabeled.tokenize("Robot 9753 reporting for duty. BEEP BEEP")
14
- end
15
-
16
- def test_apply_to
17
- md=nil
18
- @perfect_rule.apply_to(@labeled) {|md|}
19
- assert_equal :perfect, md.type
20
- @early_rule.apply_to(@labeled) {|md|}
21
- assert_equal :early, md.type
22
- assert_equal 1, md.token_loc
23
- @late_rule.apply_to(@labeled) {|md|}
24
- assert_equal :late, md.type
25
- assert_nil (@perfect_rule.apply_to(@unlabeled))
26
- end
27
-
28
- def test_matches
29
- assert(@early_rule.matches(@labeled, :early))
30
- assert(@late_rule.matches(@labeled, :early, :late))
31
- assert(@perfect_rule.matches(@unlabeled, :fail))
32
- end
33
-
34
- def test_wildcard_count
35
- assert_equal 0, @perfect_rule.wildcard_count
36
- assert_equal 1, @early_rule.wildcard_count
37
- end
38
- end
@@ -1,81 +0,0 @@
1
- require 'ariel'
2
- require 'ariel_test_case'
3
-
4
- class TestStructureNode < Ariel::TestCase
5
- def setup
6
- @tree=Ariel::StructureNode.new do |r|
7
- r.item :item_info do |i|
8
- i.item :title
9
- i.item :price
10
- i.item :stock_level
11
- end
12
- end
13
- end
14
- def test_unnested
15
- t=Ariel::StructureNode.new {|r| r.item :picture; r.item :title; r.item :description; r.item :url}
16
- assert t
17
- assert_equal Ariel::StructureNode, t.picture.class
18
- assert_equal :root, t.meta.name
19
- end
20
-
21
- def test_nested
22
- assert @tree.item_info.children.has_key?(:title)
23
- end
24
-
25
- def test_nested_with_list
26
- doc_tree=Ariel::StructureNode.new do |r|
27
- r.item :restaurant_list do |r|
28
- r.list_item :restaurant do |r|
29
- r.item :name
30
- r.item :address
31
- r.item :phone
32
- r.item :review
33
- r.item :credit_card_list do |c|
34
- c.item :credit_card
35
- end
36
- end
37
- end
38
- end
39
- assert doc_tree
40
- assert_equal :list, doc_tree.restaurant_list.restaurant.meta.node_type
41
- end
42
-
43
- def test_extend_structure
44
- assert (@tree.extend_structure {|r| r.item :site_copyright; r.item :logo;})
45
- assert @tree.children.has_key?(:site_copyright)
46
- assert @tree.children.has_key?(:logo)
47
- assert (@tree.item_info.extend_structure {|i| i.item :picture})
48
- assert @tree.item_info.children.has_key?(:picture)
49
- end
50
-
51
- # def test_apply_extraction_tree_on
52
- # # t = Ariel::StructureNode.new do |r|
53
- # # # r.title
54
- # # # # r.content do |c|
55
- # # # # # c.excerpt
56
- # # # # # # c.body
57
- # # # # # # # end
58
- # # # # # # # # end
59
- # # # # # # # # # str = %q{Title: The test of the Century
60
- # # # # # # # # # # <b>Excerpt</b>: <i>A look back at what could be considered the greatest ever test.</i>
61
- # # # # # # # # # # # There was once a test designed to assess whether apply_extraction_tree_on worked.}
62
- # # # # # # # # # # # # tokenstream = Ariel::TokenStream.new
63
- # # # # # # # # # # # # # tokenstream.tokenize(str)
64
- # # # # # # # # # # # # # # root = Ariel::ExtractedNode.new(tokenstream, :structure=>t, :name=>:root)
65
- # # # # # # # # # # # # # # # t.title.meta.start_rule = Ariel::Rule.new(["Title", ":"])
66
- # # # # # # # # # # # # # # # # t.title.meta.end_rule = Ariel::Rule.new(["<b>"])
67
- # # # # # # # # # # # # # # # # # t.title.meta.end_rule.direction = :back
68
- # # # # # # # # # # # # # # # # # # t.content.meta.start_rule = Ariel::Rule.new(["Century"]) #later implementation might use skip_until("<b>")
69
- # # # # # # # # # # # # # # # # # # # t.content.meta.end_rule = Ariel::Rule.new()
70
- # # # # # # # # # # # # # # # # # # # # t.content.meta.end_rule.direction = :back
71
- # # # # # # # # # # # # # # # # # # # # # t.content.excerpt.meta.start_rule = Ariel::Rule.new(["<i>"])
72
- # # # # # # # # # # # # # # # # # # # # # # t.content.excerpt.meta.end_rule = Ariel::Rule.new([".</"])
73
- # # # # # # # # # # # # # # # # # # # # # # # t.content.excerpt.meta.end_rule.direction = :back
74
- # # # # # # # # # # # # # # # # # # # # # # # # t.content.body.meta.start_rule = Ariel::Rule.new(["i", ">"])
75
- # # # # # # # # # # # # # # # # # # # # # # # # # t.content.body.meta.end_rule = Ariel::Rule.new()
76
- # # # # # # # # # # # # # # # # # # # # # # # # # # t.content.body.meta.end_rule.direction = :back
77
- # # # # # # # # # # # # # # # # # # # # # # # # # # # t.apply_extraction_tree_on root
78
- # end
79
-
80
-
81
- end
@@ -1,16 +0,0 @@
1
- require 'ariel'
2
- require 'ariel_test_case'
3
-
4
- class TestToken < Ariel::TestCase
5
- def setup
6
- @t=Ariel::Token.new('Test', 0, 4)
7
- end
8
-
9
- def test_matches?
10
- assert @t.matches?('Test')
11
- assert_equal false, @t.matches?('test')
12
- assert_equal false, @t.matches?('te')
13
- assert @t.matches?(:alpha)
14
- assert_equal false, @t.matches?(:html_tag)
15
- end
16
- end
@@ -1,82 +0,0 @@
1
- require 'ariel'
2
- require 'ariel_test_case'
3
-
4
- class TestTokenStream < Ariel::TestCase
5
- include Fixtures
6
-
7
- def setup
8
- @stream=Ariel::TokenStream.new
9
- @text = "This is test101. See below:"
10
- @stream.tokenize(@text)
11
-
12
- @labeled_stream = Ariel::TokenStream.new
13
- @labeled_stream.tokenize(@@labeled_document, true)
14
- end
15
-
16
- def test_advance
17
- assert_equal Ariel::Token.new("This", 0, 4), @stream.advance
18
- end
19
-
20
- def test_cur_pos
21
- assert_equal 0, @stream.cur_pos
22
- @stream.advance
23
- assert_equal 1, @stream.cur_pos
24
- end
25
-
26
- def test_each
27
- i=0
28
- @stream.each {i=i+1}
29
- assert_equal 8, i
30
- assert_equal 9, @stream.cur_pos
31
- end
32
-
33
- def test_rewind
34
- @stream.each {}
35
- @stream.rewind
36
- assert_equal 0, @stream.cur_pos
37
- end
38
-
39
- def test_skip_to
40
- assert @stream.skip_to("This")
41
- assert_equal 1, @stream.cur_pos #Test the matched token has been consumed
42
- assert_nil @stream.skip_to("Ruby")
43
- assert_equal 1, @stream.cur_pos #Stream's position remains unchanged by a failed match
44
- assert @stream.skip_to("See", "below")
45
- assert_equal 7, @stream.cur_pos
46
- @stream.rewind
47
- @stream.skip_to(:anything, "below")
48
- assert_equal 7, @stream.cur_pos
49
- end
50
-
51
- def test_tokenize
52
- assert_equal 8, @stream.tokens.length
53
- @stream.each do |token|
54
- assert_equal @text[token.start_loc...token.end_loc], token.text
55
- end
56
- @labeled_stream.each do |token|
57
- assert_equal @@labeled_document[token.start_loc...token.end_loc], token.text
58
- end
59
- end
60
-
61
- def test_set_label_at
62
- assert_raise(ArgumentError) {@stream.set_label_at 1}
63
- assert_nil @stream.label_index
64
- assert(@labeled_stream.set_label_at(16))
65
- assert_equal("The", @labeled_stream.tokens[@labeled_stream.label_index].text)
66
- end
67
-
68
- def test_raw_text
69
- assert_equal @text, @stream.raw_text
70
- assert_equal @@labeled_document.chomp, @labeled_stream.raw_text
71
- end
72
-
73
- def test_text
74
- assert_equal @text, @stream.text
75
- assert_equal @@unlabeled_document.chomp, @labeled_stream.text
76
- end
77
-
78
- def test_slice_by_token_index
79
- assert sliced=@stream.slice_by_token_index(1,3)
80
- assert_equal @text[sliced.tokens.first.start_loc...sliced.tokens.last.end_loc], sliced.text
81
- end
82
- end
@@ -1,18 +0,0 @@
1
- require 'ariel'
2
- require 'ariel_test_case'
3
-
4
- class TestWildcards < Ariel::TestCase
5
-
6
- def test_list
7
- assert (wildcards=Ariel::Wildcards.list)
8
- assert (wildcards.kind_of? Hash)
9
- end
10
-
11
- def test_matching
12
- assert matches=Ariel::Wildcards.matching("123")
13
- assert (matches.include? :alpha_numeric)
14
- assert (matches.include? :numeric)
15
- assert (matches.include? :anything)
16
- assert_equal 3, matches.size
17
- end
18
- end