ariel 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +49 -83
- data/bin/ariel +29 -20
- data/examples/google_calculator/structure.rb +2 -2
- data/examples/google_calculator/structure.yaml +13 -15
- data/examples/raa/labeled/highline.html +5 -4
- data/examples/raa/labeled/mongrel.html +9 -8
- data/examples/raa/structure.rb +4 -2
- data/examples/raa/structure.yaml +94 -78
- data/lib/ariel.rb +71 -33
- data/lib/ariel/{candidate_selector.rb → candidate_refiner.rb} +39 -38
- data/lib/ariel/label_utils.rb +46 -18
- data/lib/ariel/labeled_document_loader.rb +77 -0
- data/lib/ariel/learner.rb +60 -38
- data/lib/ariel/log.rb +67 -0
- data/lib/ariel/node.rb +52 -0
- data/lib/ariel/node/extracted.rb +90 -0
- data/lib/ariel/node/structure.rb +91 -0
- data/lib/ariel/rule.rb +114 -32
- data/lib/ariel/rule_set.rb +34 -15
- data/lib/ariel/token.rb +9 -3
- data/lib/ariel/token_stream.rb +32 -17
- data/lib/ariel/wildcards.rb +19 -15
- data/test/fixtures.rb +45 -3
- data/test/specs/candidate_refiner_spec.rb +48 -0
- data/test/specs/label_utils_spec.rb +97 -0
- data/test/specs/learner_spec.rb +39 -0
- data/test/specs/node_extracted_spec.rb +90 -0
- data/test/specs/node_spec.rb +76 -0
- data/test/specs/node_structure_spec.rb +74 -0
- data/test/specs/rule_set_spec.rb +85 -0
- data/test/specs/rule_spec.rb +110 -0
- data/test/specs/token_stream_spec.rb +100 -7
- metadata +21 -28
- data/lib/ariel/example_document_loader.rb +0 -59
- data/lib/ariel/extracted_node.rb +0 -20
- data/lib/ariel/node_like.rb +0 -26
- data/lib/ariel/structure_node.rb +0 -75
- data/test/ariel_test_case.rb +0 -15
- data/test/test_candidate_selector.rb +0 -58
- data/test/test_example_document_loader.rb +0 -7
- data/test/test_label_utils.rb +0 -15
- data/test/test_learner.rb +0 -38
- data/test/test_rule.rb +0 -38
- data/test/test_structure_node.rb +0 -81
- data/test/test_token.rb +0 -16
- data/test/test_token_stream.rb +0 -82
- data/test/test_wildcards.rb +0 -18
data/test/test_label_utils.rb
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
require 'ariel'
|
2
|
-
require 'ariel_test_case'
|
3
|
-
|
4
|
-
class TestLabelUtils < Ariel::TestCase
|
5
|
-
include Fixtures
|
6
|
-
|
7
|
-
def test_label_regex
|
8
|
-
assert_equal 2, Ariel::LabelUtils.label_regex.uniq.size
|
9
|
-
assert_kind_of Regexp, Ariel::LabelUtils.label_regex[0]
|
10
|
-
end
|
11
|
-
|
12
|
-
def test_clean_string
|
13
|
-
assert_equal @@unlabeled_document, Ariel::LabelUtils.clean_string(@@labeled_document)
|
14
|
-
end
|
15
|
-
end
|
data/test/test_learner.rb
DELETED
@@ -1,38 +0,0 @@
|
|
1
|
-
require 'ariel'
|
2
|
-
require 'ariel_test_case'
|
3
|
-
|
4
|
-
class TestLearner < Ariel::TestCase
|
5
|
-
include Fixtures
|
6
|
-
|
7
|
-
def setup
|
8
|
-
#Examples stolen from the STALKER paper. Target to extract is the area
|
9
|
-
#codes.
|
10
|
-
@e=@@labeled_addresses
|
11
|
-
@learner=Ariel::Learner.new(*@e)
|
12
|
-
end
|
13
|
-
|
14
|
-
def test_set_seed
|
15
|
-
assert_equal @e[1], @learner.current_seed # LabeledStream with smallest label_index
|
16
|
-
end
|
17
|
-
|
18
|
-
def test_generate_initial_candidates
|
19
|
-
@learner.direction=:forward
|
20
|
-
@learner.generate_initial_candidates
|
21
|
-
c=@learner.candidates
|
22
|
-
assert (c.include? Ariel::Rule.new(:forward, [["("]]))
|
23
|
-
assert (c.include? Ariel::Rule.new(:forward, [[:anything]]))
|
24
|
-
assert (c.include? Ariel::Rule.new(:forward, [[:punctuation]]))
|
25
|
-
end
|
26
|
-
|
27
|
-
def test_refine
|
28
|
-
@learner.current_rule=Ariel::Rule.new(:forward, [["<b>"]])
|
29
|
-
assert @learner.refine
|
30
|
-
@learner.current_rule=Ariel::Rule.new(:forward, [["<b>", "Palms"], ["Phone"]])
|
31
|
-
assert @learner.refine
|
32
|
-
end
|
33
|
-
|
34
|
-
def test_learn_rule
|
35
|
-
rule=@learner.learn_rule :forward
|
36
|
-
p rule
|
37
|
-
end
|
38
|
-
end
|
data/test/test_rule.rb
DELETED
@@ -1,38 +0,0 @@
|
|
1
|
-
require 'ariel'
|
2
|
-
require 'ariel_test_case'
|
3
|
-
|
4
|
-
class TestRule < Ariel::TestCase
|
5
|
-
def setup
|
6
|
-
@labeled=Ariel::TokenStream.new
|
7
|
-
@labeled.tokenize("90 Colfax, <b> Palms </b>, Phone: (818) 508-1570")
|
8
|
-
@labeled.set_label_at 35
|
9
|
-
@perfect_rule=Ariel::Rule.new(:forward, [["Phone"], ["("]])
|
10
|
-
@early_rule=Ariel::Rule.new(:forward, [[:anything]])
|
11
|
-
@late_rule=Ariel::Rule.new(:forward, [["508"]])
|
12
|
-
@unlabeled=Ariel::TokenStream.new
|
13
|
-
@unlabeled.tokenize("Robot 9753 reporting for duty. BEEP BEEP")
|
14
|
-
end
|
15
|
-
|
16
|
-
def test_apply_to
|
17
|
-
md=nil
|
18
|
-
@perfect_rule.apply_to(@labeled) {|md|}
|
19
|
-
assert_equal :perfect, md.type
|
20
|
-
@early_rule.apply_to(@labeled) {|md|}
|
21
|
-
assert_equal :early, md.type
|
22
|
-
assert_equal 1, md.token_loc
|
23
|
-
@late_rule.apply_to(@labeled) {|md|}
|
24
|
-
assert_equal :late, md.type
|
25
|
-
assert_nil (@perfect_rule.apply_to(@unlabeled))
|
26
|
-
end
|
27
|
-
|
28
|
-
def test_matches
|
29
|
-
assert(@early_rule.matches(@labeled, :early))
|
30
|
-
assert(@late_rule.matches(@labeled, :early, :late))
|
31
|
-
assert(@perfect_rule.matches(@unlabeled, :fail))
|
32
|
-
end
|
33
|
-
|
34
|
-
def test_wildcard_count
|
35
|
-
assert_equal 0, @perfect_rule.wildcard_count
|
36
|
-
assert_equal 1, @early_rule.wildcard_count
|
37
|
-
end
|
38
|
-
end
|
data/test/test_structure_node.rb
DELETED
@@ -1,81 +0,0 @@
|
|
1
|
-
require 'ariel'
|
2
|
-
require 'ariel_test_case'
|
3
|
-
|
4
|
-
class TestStructureNode < Ariel::TestCase
|
5
|
-
def setup
|
6
|
-
@tree=Ariel::StructureNode.new do |r|
|
7
|
-
r.item :item_info do |i|
|
8
|
-
i.item :title
|
9
|
-
i.item :price
|
10
|
-
i.item :stock_level
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
14
|
-
def test_unnested
|
15
|
-
t=Ariel::StructureNode.new {|r| r.item :picture; r.item :title; r.item :description; r.item :url}
|
16
|
-
assert t
|
17
|
-
assert_equal Ariel::StructureNode, t.picture.class
|
18
|
-
assert_equal :root, t.meta.name
|
19
|
-
end
|
20
|
-
|
21
|
-
def test_nested
|
22
|
-
assert @tree.item_info.children.has_key?(:title)
|
23
|
-
end
|
24
|
-
|
25
|
-
def test_nested_with_list
|
26
|
-
doc_tree=Ariel::StructureNode.new do |r|
|
27
|
-
r.item :restaurant_list do |r|
|
28
|
-
r.list_item :restaurant do |r|
|
29
|
-
r.item :name
|
30
|
-
r.item :address
|
31
|
-
r.item :phone
|
32
|
-
r.item :review
|
33
|
-
r.item :credit_card_list do |c|
|
34
|
-
c.item :credit_card
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
39
|
-
assert doc_tree
|
40
|
-
assert_equal :list, doc_tree.restaurant_list.restaurant.meta.node_type
|
41
|
-
end
|
42
|
-
|
43
|
-
def test_extend_structure
|
44
|
-
assert (@tree.extend_structure {|r| r.item :site_copyright; r.item :logo;})
|
45
|
-
assert @tree.children.has_key?(:site_copyright)
|
46
|
-
assert @tree.children.has_key?(:logo)
|
47
|
-
assert (@tree.item_info.extend_structure {|i| i.item :picture})
|
48
|
-
assert @tree.item_info.children.has_key?(:picture)
|
49
|
-
end
|
50
|
-
|
51
|
-
# def test_apply_extraction_tree_on
|
52
|
-
# # t = Ariel::StructureNode.new do |r|
|
53
|
-
# # # r.title
|
54
|
-
# # # # r.content do |c|
|
55
|
-
# # # # # c.excerpt
|
56
|
-
# # # # # # c.body
|
57
|
-
# # # # # # # end
|
58
|
-
# # # # # # # # end
|
59
|
-
# # # # # # # # # str = %q{Title: The test of the Century
|
60
|
-
# # # # # # # # # # <b>Excerpt</b>: <i>A look back at what could be considered the greatest ever test.</i>
|
61
|
-
# # # # # # # # # # # There was once a test designed to assess whether apply_extraction_tree_on worked.}
|
62
|
-
# # # # # # # # # # # # tokenstream = Ariel::TokenStream.new
|
63
|
-
# # # # # # # # # # # # # tokenstream.tokenize(str)
|
64
|
-
# # # # # # # # # # # # # # root = Ariel::ExtractedNode.new(tokenstream, :structure=>t, :name=>:root)
|
65
|
-
# # # # # # # # # # # # # # # t.title.meta.start_rule = Ariel::Rule.new(["Title", ":"])
|
66
|
-
# # # # # # # # # # # # # # # # t.title.meta.end_rule = Ariel::Rule.new(["<b>"])
|
67
|
-
# # # # # # # # # # # # # # # # # t.title.meta.end_rule.direction = :back
|
68
|
-
# # # # # # # # # # # # # # # # # # t.content.meta.start_rule = Ariel::Rule.new(["Century"]) #later implementation might use skip_until("<b>")
|
69
|
-
# # # # # # # # # # # # # # # # # # # t.content.meta.end_rule = Ariel::Rule.new()
|
70
|
-
# # # # # # # # # # # # # # # # # # # # t.content.meta.end_rule.direction = :back
|
71
|
-
# # # # # # # # # # # # # # # # # # # # # t.content.excerpt.meta.start_rule = Ariel::Rule.new(["<i>"])
|
72
|
-
# # # # # # # # # # # # # # # # # # # # # # t.content.excerpt.meta.end_rule = Ariel::Rule.new([".</"])
|
73
|
-
# # # # # # # # # # # # # # # # # # # # # # # t.content.excerpt.meta.end_rule.direction = :back
|
74
|
-
# # # # # # # # # # # # # # # # # # # # # # # # t.content.body.meta.start_rule = Ariel::Rule.new(["i", ">"])
|
75
|
-
# # # # # # # # # # # # # # # # # # # # # # # # # t.content.body.meta.end_rule = Ariel::Rule.new()
|
76
|
-
# # # # # # # # # # # # # # # # # # # # # # # # # # t.content.body.meta.end_rule.direction = :back
|
77
|
-
# # # # # # # # # # # # # # # # # # # # # # # # # # # t.apply_extraction_tree_on root
|
78
|
-
# end
|
79
|
-
|
80
|
-
|
81
|
-
end
|
data/test/test_token.rb
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
require 'ariel'
|
2
|
-
require 'ariel_test_case'
|
3
|
-
|
4
|
-
class TestToken < Ariel::TestCase
|
5
|
-
def setup
|
6
|
-
@t=Ariel::Token.new('Test', 0, 4)
|
7
|
-
end
|
8
|
-
|
9
|
-
def test_matches?
|
10
|
-
assert @t.matches?('Test')
|
11
|
-
assert_equal false, @t.matches?('test')
|
12
|
-
assert_equal false, @t.matches?('te')
|
13
|
-
assert @t.matches?(:alpha)
|
14
|
-
assert_equal false, @t.matches?(:html_tag)
|
15
|
-
end
|
16
|
-
end
|
data/test/test_token_stream.rb
DELETED
@@ -1,82 +0,0 @@
|
|
1
|
-
require 'ariel'
|
2
|
-
require 'ariel_test_case'
|
3
|
-
|
4
|
-
class TestTokenStream < Ariel::TestCase
|
5
|
-
include Fixtures
|
6
|
-
|
7
|
-
def setup
|
8
|
-
@stream=Ariel::TokenStream.new
|
9
|
-
@text = "This is test101. See below:"
|
10
|
-
@stream.tokenize(@text)
|
11
|
-
|
12
|
-
@labeled_stream = Ariel::TokenStream.new
|
13
|
-
@labeled_stream.tokenize(@@labeled_document, true)
|
14
|
-
end
|
15
|
-
|
16
|
-
def test_advance
|
17
|
-
assert_equal Ariel::Token.new("This", 0, 4), @stream.advance
|
18
|
-
end
|
19
|
-
|
20
|
-
def test_cur_pos
|
21
|
-
assert_equal 0, @stream.cur_pos
|
22
|
-
@stream.advance
|
23
|
-
assert_equal 1, @stream.cur_pos
|
24
|
-
end
|
25
|
-
|
26
|
-
def test_each
|
27
|
-
i=0
|
28
|
-
@stream.each {i=i+1}
|
29
|
-
assert_equal 8, i
|
30
|
-
assert_equal 9, @stream.cur_pos
|
31
|
-
end
|
32
|
-
|
33
|
-
def test_rewind
|
34
|
-
@stream.each {}
|
35
|
-
@stream.rewind
|
36
|
-
assert_equal 0, @stream.cur_pos
|
37
|
-
end
|
38
|
-
|
39
|
-
def test_skip_to
|
40
|
-
assert @stream.skip_to("This")
|
41
|
-
assert_equal 1, @stream.cur_pos #Test the matched token has been consumed
|
42
|
-
assert_nil @stream.skip_to("Ruby")
|
43
|
-
assert_equal 1, @stream.cur_pos #Stream's position remains unchanged by a failed match
|
44
|
-
assert @stream.skip_to("See", "below")
|
45
|
-
assert_equal 7, @stream.cur_pos
|
46
|
-
@stream.rewind
|
47
|
-
@stream.skip_to(:anything, "below")
|
48
|
-
assert_equal 7, @stream.cur_pos
|
49
|
-
end
|
50
|
-
|
51
|
-
def test_tokenize
|
52
|
-
assert_equal 8, @stream.tokens.length
|
53
|
-
@stream.each do |token|
|
54
|
-
assert_equal @text[token.start_loc...token.end_loc], token.text
|
55
|
-
end
|
56
|
-
@labeled_stream.each do |token|
|
57
|
-
assert_equal @@labeled_document[token.start_loc...token.end_loc], token.text
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
def test_set_label_at
|
62
|
-
assert_raise(ArgumentError) {@stream.set_label_at 1}
|
63
|
-
assert_nil @stream.label_index
|
64
|
-
assert(@labeled_stream.set_label_at(16))
|
65
|
-
assert_equal("The", @labeled_stream.tokens[@labeled_stream.label_index].text)
|
66
|
-
end
|
67
|
-
|
68
|
-
def test_raw_text
|
69
|
-
assert_equal @text, @stream.raw_text
|
70
|
-
assert_equal @@labeled_document.chomp, @labeled_stream.raw_text
|
71
|
-
end
|
72
|
-
|
73
|
-
def test_text
|
74
|
-
assert_equal @text, @stream.text
|
75
|
-
assert_equal @@unlabeled_document.chomp, @labeled_stream.text
|
76
|
-
end
|
77
|
-
|
78
|
-
def test_slice_by_token_index
|
79
|
-
assert sliced=@stream.slice_by_token_index(1,3)
|
80
|
-
assert_equal @text[sliced.tokens.first.start_loc...sliced.tokens.last.end_loc], sliced.text
|
81
|
-
end
|
82
|
-
end
|
data/test/test_wildcards.rb
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
require 'ariel'
|
2
|
-
require 'ariel_test_case'
|
3
|
-
|
4
|
-
class TestWildcards < Ariel::TestCase
|
5
|
-
|
6
|
-
def test_list
|
7
|
-
assert (wildcards=Ariel::Wildcards.list)
|
8
|
-
assert (wildcards.kind_of? Hash)
|
9
|
-
end
|
10
|
-
|
11
|
-
def test_matching
|
12
|
-
assert matches=Ariel::Wildcards.matching("123")
|
13
|
-
assert (matches.include? :alpha_numeric)
|
14
|
-
assert (matches.include? :numeric)
|
15
|
-
assert (matches.include? :anything)
|
16
|
-
assert_equal 3, matches.size
|
17
|
-
end
|
18
|
-
end
|