ariel 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/LICENSE +21 -0
  2. data/README +98 -0
  3. data/bin/ariel +56 -0
  4. data/examples/google_calculator/labeled/1 +43 -0
  5. data/examples/google_calculator/labeled/2 +41 -0
  6. data/examples/google_calculator/labeled/3 +41 -0
  7. data/examples/google_calculator/structure.rb +12 -0
  8. data/examples/google_calculator/structure.yaml +46 -0
  9. data/examples/google_calculator/unlabeled/1 +43 -0
  10. data/examples/google_calculator/unlabeled/2 +43 -0
  11. data/examples/raa/labeled/highline.html +135 -0
  12. data/examples/raa/labeled/mongrel.html +168 -0
  13. data/examples/raa/structure.rb +17 -0
  14. data/examples/raa/structure.yaml +183 -0
  15. data/examples/raa/unlabeled/pdf-writer.html +175 -0
  16. data/lib/ariel/candidate_selector.rb +94 -0
  17. data/lib/ariel/example_document_loader.rb +59 -0
  18. data/lib/ariel/extracted_node.rb +20 -0
  19. data/lib/ariel/label_utils.rb +71 -0
  20. data/lib/ariel/learner.rb +237 -0
  21. data/lib/ariel/node_like.rb +26 -0
  22. data/lib/ariel/rule.rb +112 -0
  23. data/lib/ariel/rule_set.rb +34 -0
  24. data/lib/ariel/structure_node.rb +75 -0
  25. data/lib/ariel/token.rb +68 -0
  26. data/lib/ariel/token_stream.rb +240 -0
  27. data/lib/ariel/wildcards.rb +33 -0
  28. data/lib/ariel.rb +69 -0
  29. data/test/ariel_test_case.rb +15 -0
  30. data/test/fixtures.rb +43 -0
  31. data/test/specs/token_spec.rb +65 -0
  32. data/test/specs/token_stream_spec.rb +43 -0
  33. data/test/specs/wildcards_spec.rb +26 -0
  34. data/test/test_candidate_selector.rb +58 -0
  35. data/test/test_example_document_loader.rb +7 -0
  36. data/test/test_label_utils.rb +15 -0
  37. data/test/test_learner.rb +38 -0
  38. data/test/test_rule.rb +38 -0
  39. data/test/test_structure_node.rb +81 -0
  40. data/test/test_token.rb +16 -0
  41. data/test/test_token_stream.rb +82 -0
  42. data/test/test_wildcards.rb +18 -0
  43. metadata +103 -0
data/test/test_rule.rb ADDED
@@ -0,0 +1,38 @@
1
+ require 'ariel'
2
+ require 'ariel_test_case'
3
+
4
+ class TestRule < Ariel::TestCase
5
+ def setup
6
+ @labeled=Ariel::TokenStream.new
7
+ @labeled.tokenize("90 Colfax, <b> Palms </b>, Phone: (818) 508-1570")
8
+ @labeled.set_label_at 35
9
+ @perfect_rule=Ariel::Rule.new(:forward, [["Phone"], ["("]])
10
+ @early_rule=Ariel::Rule.new(:forward, [[:anything]])
11
+ @late_rule=Ariel::Rule.new(:forward, [["508"]])
12
+ @unlabeled=Ariel::TokenStream.new
13
+ @unlabeled.tokenize("Robot 9753 reporting for duty. BEEP BEEP")
14
+ end
15
+
16
+ def test_apply_to
17
+ md=nil
18
+ @perfect_rule.apply_to(@labeled) {|md|}
19
+ assert_equal :perfect, md.type
20
+ @early_rule.apply_to(@labeled) {|md|}
21
+ assert_equal :early, md.type
22
+ assert_equal 1, md.token_loc
23
+ @late_rule.apply_to(@labeled) {|md|}
24
+ assert_equal :late, md.type
25
+ assert_nil (@perfect_rule.apply_to(@unlabeled))
26
+ end
27
+
28
+ def test_matches
29
+ assert(@early_rule.matches(@labeled, :early))
30
+ assert(@late_rule.matches(@labeled, :early, :late))
31
+ assert(@perfect_rule.matches(@unlabeled, :fail))
32
+ end
33
+
34
+ def test_wildcard_count
35
+ assert_equal 0, @perfect_rule.wildcard_count
36
+ assert_equal 1, @early_rule.wildcard_count
37
+ end
38
+ end
data/test/test_structure_node.rb ADDED
@@ -0,0 +1,81 @@
1
+ require 'ariel'
2
+ require 'ariel_test_case'
3
+
4
+ class TestStructureNode < Ariel::TestCase
5
+ def setup
6
+ @tree=Ariel::StructureNode.new do |r|
7
+ r.item :item_info do |i|
8
+ i.item :title
9
+ i.item :price
10
+ i.item :stock_level
11
+ end
12
+ end
13
+ end
14
+ def test_unnested
15
+ t=Ariel::StructureNode.new {|r| r.item :picture; r.item :title; r.item :description; r.item :url}
16
+ assert t
17
+ assert_equal Ariel::StructureNode, t.picture.class
18
+ assert_equal :root, t.meta.name
19
+ end
20
+
21
+ def test_nested
22
+ assert @tree.item_info.children.has_key?(:title)
23
+ end
24
+
25
+ def test_nested_with_list
26
+ doc_tree=Ariel::StructureNode.new do |r|
27
+ r.item :restaurant_list do |r|
28
+ r.list_item :restaurant do |r|
29
+ r.item :name
30
+ r.item :address
31
+ r.item :phone
32
+ r.item :review
33
+ r.item :credit_card_list do |c|
34
+ c.item :credit_card
35
+ end
36
+ end
37
+ end
38
+ end
39
+ assert doc_tree
40
+ assert_equal :list, doc_tree.restaurant_list.restaurant.meta.node_type
41
+ end
42
+
43
+ def test_extend_structure
44
+ assert (@tree.extend_structure {|r| r.item :site_copyright; r.item :logo;})
45
+ assert @tree.children.has_key?(:site_copyright)
46
+ assert @tree.children.has_key?(:logo)
47
+ assert (@tree.item_info.extend_structure {|i| i.item :picture})
48
+ assert @tree.item_info.children.has_key?(:picture)
49
+ end
50
+
51
+ # def test_apply_extraction_tree_on
52
+ # t = Ariel::StructureNode.new do |r|
53
+ # r.title
54
+ # r.content do |c|
55
+ # c.excerpt
56
+ # c.body
57
+ # end
58
+ # end
59
+ # str = %q{Title: The test of the Century
60
+ # <b>Excerpt</b>: <i>A look back at what could be considered the greatest ever test.</i>
61
+ # There was once a test designed to assess whether apply_extraction_tree_on worked.}
62
+ # tokenstream = Ariel::TokenStream.new
63
+ # tokenstream.tokenize(str)
64
+ # root = Ariel::ExtractedNode.new(tokenstream, :structure=>t, :name=>:root)
65
+ # t.title.meta.start_rule = Ariel::Rule.new(["Title", ":"])
66
+ # t.title.meta.end_rule = Ariel::Rule.new(["<b>"])
67
+ # t.title.meta.end_rule.direction = :back
68
+ # t.content.meta.start_rule = Ariel::Rule.new(["Century"]) #later implementation might use skip_until("<b>")
69
+ # t.content.meta.end_rule = Ariel::Rule.new()
70
+ # t.content.meta.end_rule.direction = :back
71
+ # t.content.excerpt.meta.start_rule = Ariel::Rule.new(["<i>"])
72
+ # t.content.excerpt.meta.end_rule = Ariel::Rule.new([".</"])
73
+ # t.content.excerpt.meta.end_rule.direction = :back
74
+ # t.content.body.meta.start_rule = Ariel::Rule.new(["i", ">"])
75
+ # t.content.body.meta.end_rule = Ariel::Rule.new()
76
+ # t.content.body.meta.end_rule.direction = :back
77
+ # t.apply_extraction_tree_on root
78
+ # end
79
+
80
+
81
+ end
data/test/test_token.rb ADDED
@@ -0,0 +1,16 @@
1
+ require 'ariel'
2
+ require 'ariel_test_case'
3
+
4
+ class TestToken < Ariel::TestCase
5
+ def setup
6
+ @t=Ariel::Token.new('Test', 0, 4)
7
+ end
8
+
9
+ def test_matches?
10
+ assert @t.matches?('Test')
11
+ assert_equal false, @t.matches?('test')
12
+ assert_equal false, @t.matches?('te')
13
+ assert @t.matches?(:alpha)
14
+ assert_equal false, @t.matches?(:html_tag)
15
+ end
16
+ end
data/test/test_token_stream.rb ADDED
@@ -0,0 +1,82 @@
1
+ require 'ariel'
2
+ require 'ariel_test_case'
3
+
4
+ class TestTokenStream < Ariel::TestCase
5
+ include Fixtures
6
+
7
+ def setup
8
+ @stream=Ariel::TokenStream.new
9
+ @text = "This is test101. See below:"
10
+ @stream.tokenize(@text)
11
+
12
+ @labeled_stream = Ariel::TokenStream.new
13
+ @labeled_stream.tokenize(@@labeled_document, true)
14
+ end
15
+
16
+ def test_advance
17
+ assert_equal Ariel::Token.new("This", 0, 4), @stream.advance
18
+ end
19
+
20
+ def test_cur_pos
21
+ assert_equal 0, @stream.cur_pos
22
+ @stream.advance
23
+ assert_equal 1, @stream.cur_pos
24
+ end
25
+
26
+ def test_each
27
+ i=0
28
+ @stream.each {i=i+1}
29
+ assert_equal 8, i
30
+ assert_equal 9, @stream.cur_pos
31
+ end
32
+
33
+ def test_rewind
34
+ @stream.each {}
35
+ @stream.rewind
36
+ assert_equal 0, @stream.cur_pos
37
+ end
38
+
39
+ def test_skip_to
40
+ assert @stream.skip_to("This")
41
+ assert_equal 1, @stream.cur_pos #Test the matched token has been consumed
42
+ assert_nil @stream.skip_to("Ruby")
43
+ assert_equal 1, @stream.cur_pos #Stream's position remains unchanged by a failed match
44
+ assert @stream.skip_to("See", "below")
45
+ assert_equal 7, @stream.cur_pos
46
+ @stream.rewind
47
+ @stream.skip_to(:anything, "below")
48
+ assert_equal 7, @stream.cur_pos
49
+ end
50
+
51
+ def test_tokenize
52
+ assert_equal 8, @stream.tokens.length
53
+ @stream.each do |token|
54
+ assert_equal @text[token.start_loc...token.end_loc], token.text
55
+ end
56
+ @labeled_stream.each do |token|
57
+ assert_equal @@labeled_document[token.start_loc...token.end_loc], token.text
58
+ end
59
+ end
60
+
61
+ def test_set_label_at
62
+ assert_raise(ArgumentError) {@stream.set_label_at 1}
63
+ assert_nil @stream.label_index
64
+ assert(@labeled_stream.set_label_at(16))
65
+ assert_equal("The", @labeled_stream.tokens[@labeled_stream.label_index].text)
66
+ end
67
+
68
+ def test_raw_text
69
+ assert_equal @text, @stream.raw_text
70
+ assert_equal @@labeled_document.chomp, @labeled_stream.raw_text
71
+ end
72
+
73
+ def test_text
74
+ assert_equal @text, @stream.text
75
+ assert_equal @@unlabeled_document.chomp, @labeled_stream.text
76
+ end
77
+
78
+ def test_slice_by_token_index
79
+ assert sliced=@stream.slice_by_token_index(1,3)
80
+ assert_equal @text[sliced.tokens.first.start_loc...sliced.tokens.last.end_loc], sliced.text
81
+ end
82
+ end
data/test/test_wildcards.rb ADDED
@@ -0,0 +1,18 @@
1
+ require 'ariel'
2
+ require 'ariel_test_case'
3
+
4
+ class TestWildcards < Ariel::TestCase
5
+
6
+ def test_list
7
+ assert (wildcards=Ariel::Wildcards.list)
8
+ assert (wildcards.kind_of? Hash)
9
+ end
10
+
11
+ def test_matching
12
+ assert matches=Ariel::Wildcards.matching("123")
13
+ assert (matches.include? :alpha_numeric)
14
+ assert (matches.include? :numeric)
15
+ assert (matches.include? :anything)
16
+ assert_equal 3, matches.size
17
+ end
18
+ end
metadata ADDED
@@ -0,0 +1,103 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.11
3
+ specification_version: 1
4
+ name: ariel
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.0.1
7
+ date: 2006-08-09 00:00:00 +01:00
8
+ summary: A Ruby Information Extraction Library
9
+ require_paths:
10
+ - lib
11
+ email: asbradbury@gmail.com
12
+ homepage: http://ariel.rubyforge.org
13
+ rubyforge_project: ariel
14
+ description: Ariel uses machine learning to assist in extracting information from semi-structured documents including (but not in any way limited to) web pages
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ authors:
29
+ - A. S. Bradbury
30
+ files:
31
+ - lib/ariel
32
+ - lib/ariel.rb
33
+ - lib/ariel/extracted_node.rb
34
+ - lib/ariel/learner.rb
35
+ - lib/ariel/example_document_loader.rb
36
+ - lib/ariel/rule_set.rb
37
+ - lib/ariel/rule.rb
38
+ - lib/ariel/wildcards.rb
39
+ - lib/ariel/token_stream.rb
40
+ - lib/ariel/label_utils.rb
41
+ - lib/ariel/structure_node.rb
42
+ - lib/ariel/token.rb
43
+ - lib/ariel/candidate_selector.rb
44
+ - lib/ariel/node_like.rb
45
+ - test/test_learner.rb
46
+ - test/specs
47
+ - test/test_rule.rb
48
+ - test/ariel_test_case.rb
49
+ - test/fixtures.rb
50
+ - test/test_token_stream.rb
51
+ - test/test_example_document_loader.rb
52
+ - test/test_token.rb
53
+ - test/test_structure_node.rb
54
+ - test/test_label_utils.rb
55
+ - test/test_candidate_selector.rb
56
+ - test/test_wildcards.rb
57
+ - test/specs/token_stream_spec.rb
58
+ - test/specs/wildcards_spec.rb
59
+ - test/specs/token_spec.rb
60
+ - README
61
+ - LICENSE
62
+ - examples/raa
63
+ - examples/google_calculator
64
+ - examples/raa/structure.rb
65
+ - examples/raa/labeled
66
+ - examples/raa/unlabeled
67
+ - examples/raa/structure.yaml
68
+ - examples/raa/labeled/mongrel.html
69
+ - examples/raa/labeled/highline.html
70
+ - examples/raa/unlabeled/pdf-writer.html
71
+ - examples/google_calculator/structure.rb
72
+ - examples/google_calculator/labeled
73
+ - examples/google_calculator/unlabeled
74
+ - examples/google_calculator/structure.yaml
75
+ - examples/google_calculator/labeled/1
76
+ - examples/google_calculator/labeled/2
77
+ - examples/google_calculator/labeled/3
78
+ - examples/google_calculator/unlabeled/1
79
+ - examples/google_calculator/unlabeled/2
80
+ - bin/ariel
81
+ test_files:
82
+ - test/test_learner.rb
83
+ - test/test_rule.rb
84
+ - test/test_token_stream.rb
85
+ - test/test_example_document_loader.rb
86
+ - test/test_token.rb
87
+ - test/test_structure_node.rb
88
+ - test/test_label_utils.rb
89
+ - test/test_candidate_selector.rb
90
+ - test/test_wildcards.rb
91
+ rdoc_options: []
92
+
93
+ extra_rdoc_files:
94
+ - README
95
+ - LICENSE
96
+ executables:
97
+ - ariel
98
+ extensions: []
99
+
100
+ requirements: []
101
+
102
+ dependencies: []
103
+