ariel 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +21 -0
- data/README +98 -0
- data/bin/ariel +56 -0
- data/examples/google_calculator/labeled/1 +43 -0
- data/examples/google_calculator/labeled/2 +41 -0
- data/examples/google_calculator/labeled/3 +41 -0
- data/examples/google_calculator/structure.rb +12 -0
- data/examples/google_calculator/structure.yaml +46 -0
- data/examples/google_calculator/unlabeled/1 +43 -0
- data/examples/google_calculator/unlabeled/2 +43 -0
- data/examples/raa/labeled/highline.html +135 -0
- data/examples/raa/labeled/mongrel.html +168 -0
- data/examples/raa/structure.rb +17 -0
- data/examples/raa/structure.yaml +183 -0
- data/examples/raa/unlabeled/pdf-writer.html +175 -0
- data/lib/ariel/candidate_selector.rb +94 -0
- data/lib/ariel/example_document_loader.rb +59 -0
- data/lib/ariel/extracted_node.rb +20 -0
- data/lib/ariel/label_utils.rb +71 -0
- data/lib/ariel/learner.rb +237 -0
- data/lib/ariel/node_like.rb +26 -0
- data/lib/ariel/rule.rb +112 -0
- data/lib/ariel/rule_set.rb +34 -0
- data/lib/ariel/structure_node.rb +75 -0
- data/lib/ariel/token.rb +68 -0
- data/lib/ariel/token_stream.rb +240 -0
- data/lib/ariel/wildcards.rb +33 -0
- data/lib/ariel.rb +69 -0
- data/test/ariel_test_case.rb +15 -0
- data/test/fixtures.rb +43 -0
- data/test/specs/token_spec.rb +65 -0
- data/test/specs/token_stream_spec.rb +43 -0
- data/test/specs/wildcards_spec.rb +26 -0
- data/test/test_candidate_selector.rb +58 -0
- data/test/test_example_document_loader.rb +7 -0
- data/test/test_label_utils.rb +15 -0
- data/test/test_learner.rb +38 -0
- data/test/test_rule.rb +38 -0
- data/test/test_structure_node.rb +81 -0
- data/test/test_token.rb +16 -0
- data/test/test_token_stream.rb +82 -0
- data/test/test_wildcards.rb +18 -0
- metadata +103 -0
data/test/test_rule.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'ariel_test_case'
|
3
|
+
|
4
|
+
# Tests for Ariel::Rule: applying rules to token streams, classifying the
# resulting matches and counting wildcards.
class TestRule < Ariel::TestCase
  # Builds one labeled stream, three rules to run against it, and one stream
  # on which no label has been set.
  def setup
    @labeled=Ariel::TokenStream.new
    @labeled.tokenize("90 Colfax, <b> Palms </b>, Phone: (818) 508-1570")
    # NOTE(review): 35 is presumably a character offset (the "8" that starts
    # "818" in the string above) -- confirm against TokenStream#set_label_at.
    @labeled.set_label_at 35
    @perfect_rule=Ariel::Rule.new(:forward, [["Phone"], ["("]])
    @early_rule=Ariel::Rule.new(:forward, [[:anything]])
    @late_rule=Ariel::Rule.new(:forward, [["508"]])
    @unlabeled=Ariel::TokenStream.new
    @unlabeled.tokenize("Robot 9753 reporting for duty. BEEP BEEP")
  end

  # Each rule is expected to yield a match object whose type is :perfect,
  # :early or :late; applying a rule to the second stream is expected to
  # return nil.
  def test_apply_to
    md=nil
    # The yielded value is copied out through a differently named block
    # parameter. Writing the block as {|md|} only assigns to the outer local
    # on Ruby 1.8; from Ruby 1.9 on, block parameters are local to the block
    # and md would stay nil.
    @perfect_rule.apply_to(@labeled) {|result| md=result}
    assert_equal :perfect, md.type
    @early_rule.apply_to(@labeled) {|result| md=result}
    assert_equal :early, md.type
    assert_equal 1, md.token_loc
    @late_rule.apply_to(@labeled) {|result| md=result}
    assert_equal :late, md.type
    assert_nil(@perfect_rule.apply_to(@unlabeled))
  end

  # Rule#matches is given a stream followed by one or more match types and is
  # expected to return a true value in each of these cases.
  def test_matches
    assert(@early_rule.matches(@labeled, :early))
    assert(@late_rule.matches(@labeled, :early, :late))
    assert(@perfect_rule.matches(@unlabeled, :fail))
  end

  # A rule made only of literal strings reports 0 wildcards; the rule built
  # from :anything reports 1.
  def test_wildcard_count
    assert_equal 0, @perfect_rule.wildcard_count
    assert_equal 1, @early_rule.wildcard_count
  end
end
|
data/test/test_structure_node.rb
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'ariel_test_case'
|
3
|
+
|
4
|
+
# Tests for Ariel::StructureNode: building structure trees with the block
# DSL (item / list_item) and extending an existing tree.
class TestStructureNode < Ariel::TestCase
  # Builds a tree with one :item_info child that itself has three children.
  def setup
    @tree=Ariel::StructureNode.new do |r|
      r.item :item_info do |i|
        i.item :title
        i.item :price
        i.item :stock_level
      end
    end
  end

  # A flat tree exposes each declared item as a StructureNode reachable by
  # name, and the top node is named :root.
  def test_unnested
    t=Ariel::StructureNode.new {|r| r.item :picture; r.item :title; r.item :description; r.item :url}
    assert t
    assert_equal Ariel::StructureNode, t.picture.class
    assert_equal :root, t.meta.name
  end

  # Items declared inside a nested block are stored in the parent's children.
  def test_nested
    assert @tree.item_info.children.has_key?(:title)
  end

  # A node declared with list_item reports :list as its node_type.
  # Every nesting level uses its own block parameter name, so no inner block
  # shadows (or, on Ruby 1.8, overwrites) the parameter of an enclosing block.
  def test_nested_with_list
    doc_tree=Ariel::StructureNode.new do |r|
      r.item :restaurant_list do |rl|
        rl.list_item :restaurant do |rest|
          rest.item :name
          rest.item :address
          rest.item :phone
          rest.item :review
          rest.item :credit_card_list do |c|
            c.item :credit_card
          end
        end
      end
    end
    assert doc_tree
    assert_equal :list, doc_tree.restaurant_list.restaurant.meta.node_type
  end

  # extend_structure adds children to the root node and to a nested node, and
  # returns a true value in both cases.
  def test_extend_structure
    assert(@tree.extend_structure {|r| r.item :site_copyright; r.item :logo})
    assert @tree.children.has_key?(:site_copyright)
    assert @tree.children.has_key?(:logo)
    assert(@tree.item_info.extend_structure {|i| i.item :picture})
    assert @tree.item_info.children.has_key?(:picture)
  end

  # Disabled test, kept for reference only.
  # def test_apply_extraction_tree_on
  #   t = Ariel::StructureNode.new do |r|
  #     r.title
  #     r.content do |c|
  #       c.excerpt
  #       c.body
  #     end
  #   end
  #   str = %q{Title: The test of the Century
  #   <b>Excerpt</b>: <i>A look back at what could be considered the greatest ever test.</i>
  #   There was once a test designed to assess whether apply_extraction_tree_on worked.}
  #   tokenstream = Ariel::TokenStream.new
  #   tokenstream.tokenize(str)
  #   root = Ariel::ExtractedNode.new(tokenstream, :structure=>t, :name=>:root)
  #   t.title.meta.start_rule = Ariel::Rule.new(["Title", ":"])
  #   t.title.meta.end_rule = Ariel::Rule.new(["<b>"])
  #   t.title.meta.end_rule.direction = :back
  #   t.content.meta.start_rule = Ariel::Rule.new(["Century"]) #later implementation might use skip_until("<b>")
  #   t.content.meta.end_rule = Ariel::Rule.new()
  #   t.content.meta.end_rule.direction = :back
  #   t.content.excerpt.meta.start_rule = Ariel::Rule.new(["<i>"])
  #   t.content.excerpt.meta.end_rule = Ariel::Rule.new([".</"])
  #   t.content.excerpt.meta.end_rule.direction = :back
  #   t.content.body.meta.start_rule = Ariel::Rule.new(["i", ">"])
  #   t.content.body.meta.end_rule = Ariel::Rule.new()
  #   t.content.body.meta.end_rule.direction = :back
  #   t.apply_extraction_tree_on root
  # end
end
|
data/test/test_token.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'ariel_test_case'
|
3
|
+
|
4
|
+
# Tests for Ariel::Token#matches? against literal strings and wildcard
# symbols.
class TestToken < Ariel::TestCase
  # A single token with text 'Test' and locations 0 and 4.
  def setup
    @t = Ariel::Token.new('Test', 0, 4)
  end

  # The exact text matches; a different case or a partial string must return
  # exactly false. The :alpha wildcard matches, :html_tag must return false.
  def test_matches?
    assert(@t.matches?('Test'))
    ['test', 'te'].each do |candidate|
      assert_equal(false, @t.matches?(candidate))
    end
    assert(@t.matches?(:alpha))
    assert_equal(false, @t.matches?(:html_tag))
  end
end
|
data/test/test_token_stream.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'ariel_test_case'
|
3
|
+
|
4
|
+
# Tests for Ariel::TokenStream: tokenizing, cursor movement, skipping,
# labels, text reconstruction and slicing.
class TestTokenStream < Ariel::TestCase
  include Fixtures

  # Prepares a stream over a short sentence and a second stream tokenized
  # from the labeled fixture document (second tokenize argument true).
  def setup
    @text = "This is test101. See below:"
    @stream = Ariel::TokenStream.new
    @stream.tokenize(@text)

    @labeled_stream = Ariel::TokenStream.new
    @labeled_stream.tokenize(@@labeled_document, true)
  end

  # The first advance returns the token for "This".
  def test_advance
    expected = Ariel::Token.new("This", 0, 4)
    assert_equal(expected, @stream.advance)
  end

  # cur_pos starts at 0 and is 1 after one advance.
  def test_cur_pos
    assert_equal(0, @stream.cur_pos)
    @stream.advance
    assert_equal(1, @stream.cur_pos)
  end

  # each yields 8 times for the sentence and leaves cur_pos at 9.
  def test_each
    count = 0
    @stream.each { count += 1 }
    assert_equal(8, count)
    assert_equal(9, @stream.cur_pos)
  end

  # rewind puts the cursor back at 0 after a full iteration.
  def test_rewind
    @stream.each {}
    @stream.rewind
    assert_equal(0, @stream.cur_pos)
  end

  # skip_to moves past the matched tokens, returns nil and leaves the cursor
  # alone when nothing matches, and accepts wildcards in the sequence.
  def test_skip_to
    assert(@stream.skip_to("This"))
    # The matched token has been consumed.
    assert_equal(1, @stream.cur_pos)
    assert_nil(@stream.skip_to("Ruby"))
    # A failed match leaves the position unchanged.
    assert_equal(1, @stream.cur_pos)
    assert(@stream.skip_to("See", "below"))
    assert_equal(7, @stream.cur_pos)
    @stream.rewind
    @stream.skip_to(:anything, "below")
    assert_equal(7, @stream.cur_pos)
  end

  # The sentence yields 8 tokens, and in both streams every token's text
  # equals the slice of its source string between start_loc and end_loc.
  def test_tokenize
    assert_equal(8, @stream.tokens.length)
    [[@stream, @text], [@labeled_stream, @@labeled_document]].each do |stream, source|
      stream.each do |token|
        assert_equal(source[token.start_loc...token.end_loc], token.text)
      end
    end
  end

  # set_label_at raises ArgumentError on the first stream and leaves its
  # label_index nil; on the labeled stream it marks the token "The".
  def test_set_label_at
    assert_raise(ArgumentError) { @stream.set_label_at 1 }
    assert_nil(@stream.label_index)
    assert(@labeled_stream.set_label_at(16))
    labeled_token = @labeled_stream.tokens[@labeled_stream.label_index]
    assert_equal("The", labeled_token.text)
  end

  # raw_text gives back the tokenized input (the fixture minus its trailing
  # newline).
  def test_raw_text
    assert_equal(@text, @stream.raw_text)
    assert_equal(@@labeled_document.chomp, @labeled_stream.raw_text)
  end

  # text of the labeled stream equals the unlabeled fixture document.
  def test_text
    assert_equal(@text, @stream.text)
    assert_equal(@@unlabeled_document.chomp, @labeled_stream.text)
  end

  # The text of a slice equals the source between its first token's start
  # and its last token's end.
  def test_slice_by_token_index
    sliced = @stream.slice_by_token_index(1, 3)
    assert(sliced)
    from = sliced.tokens.first.start_loc
    to = sliced.tokens.last.end_loc
    assert_equal(@text[from...to], sliced.text)
  end
end
|
data/test/test_wildcards.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'ariel_test_case'
|
3
|
+
|
4
|
+
# Tests for the Ariel::Wildcards lookup helpers.
class TestWildcards < Ariel::TestCase

  # Wildcards.list returns a true value that is a Hash.
  def test_list
    wildcards = Ariel::Wildcards.list
    assert(wildcards)
    assert(wildcards.kind_of?(Hash))
  end

  # "123" matches exactly three wildcards: :alpha_numeric, :numeric and
  # :anything.
  def test_matching
    matches = Ariel::Wildcards.matching("123")
    assert(matches)
    [:alpha_numeric, :numeric, :anything].each do |wildcard|
      assert(matches.include?(wildcard))
    end
    assert_equal(3, matches.size)
  end
end
|
metadata
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.11
|
3
|
+
specification_version: 1
|
4
|
+
name: ariel
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.0.1
|
7
|
+
date: 2006-08-09 00:00:00 +01:00
|
8
|
+
summary: A Ruby Information Extraction Library
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: asbradbury@gmail.com
|
12
|
+
homepage: http://ariel.rubyforge.org
|
13
|
+
rubyforge_project: ariel
|
14
|
+
description: Ariel uses machine learning to assist in extracting information from semi-structured documents including (but not in any way limited to) web pages
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
authors:
|
29
|
+
- A. S. Bradbury
|
30
|
+
files:
|
31
|
+
- lib/ariel
|
32
|
+
- lib/ariel.rb
|
33
|
+
- lib/ariel/extracted_node.rb
|
34
|
+
- lib/ariel/learner.rb
|
35
|
+
- lib/ariel/example_document_loader.rb
|
36
|
+
- lib/ariel/rule_set.rb
|
37
|
+
- lib/ariel/rule.rb
|
38
|
+
- lib/ariel/wildcards.rb
|
39
|
+
- lib/ariel/token_stream.rb
|
40
|
+
- lib/ariel/label_utils.rb
|
41
|
+
- lib/ariel/structure_node.rb
|
42
|
+
- lib/ariel/token.rb
|
43
|
+
- lib/ariel/candidate_selector.rb
|
44
|
+
- lib/ariel/node_like.rb
|
45
|
+
- test/test_learner.rb
|
46
|
+
- test/specs
|
47
|
+
- test/test_rule.rb
|
48
|
+
- test/ariel_test_case.rb
|
49
|
+
- test/fixtures.rb
|
50
|
+
- test/test_token_stream.rb
|
51
|
+
- test/test_example_document_loader.rb
|
52
|
+
- test/test_token.rb
|
53
|
+
- test/test_structure_node.rb
|
54
|
+
- test/test_label_utils.rb
|
55
|
+
- test/test_candidate_selector.rb
|
56
|
+
- test/test_wildcards.rb
|
57
|
+
- test/specs/token_stream_spec.rb
|
58
|
+
- test/specs/wildcards_spec.rb
|
59
|
+
- test/specs/token_spec.rb
|
60
|
+
- README
|
61
|
+
- LICENSE
|
62
|
+
- examples/raa
|
63
|
+
- examples/google_calculator
|
64
|
+
- examples/raa/structure.rb
|
65
|
+
- examples/raa/labeled
|
66
|
+
- examples/raa/unlabeled
|
67
|
+
- examples/raa/structure.yaml
|
68
|
+
- examples/raa/labeled/mongrel.html
|
69
|
+
- examples/raa/labeled/highline.html
|
70
|
+
- examples/raa/unlabeled/pdf-writer.html
|
71
|
+
- examples/google_calculator/structure.rb
|
72
|
+
- examples/google_calculator/labeled
|
73
|
+
- examples/google_calculator/unlabeled
|
74
|
+
- examples/google_calculator/structure.yaml
|
75
|
+
- examples/google_calculator/labeled/1
|
76
|
+
- examples/google_calculator/labeled/2
|
77
|
+
- examples/google_calculator/labeled/3
|
78
|
+
- examples/google_calculator/unlabeled/1
|
79
|
+
- examples/google_calculator/unlabeled/2
|
80
|
+
- bin/ariel
|
81
|
+
test_files:
|
82
|
+
- test/test_learner.rb
|
83
|
+
- test/test_rule.rb
|
84
|
+
- test/test_token_stream.rb
|
85
|
+
- test/test_example_document_loader.rb
|
86
|
+
- test/test_token.rb
|
87
|
+
- test/test_structure_node.rb
|
88
|
+
- test/test_label_utils.rb
|
89
|
+
- test/test_candidate_selector.rb
|
90
|
+
- test/test_wildcards.rb
|
91
|
+
rdoc_options: []
|
92
|
+
|
93
|
+
extra_rdoc_files:
|
94
|
+
- README
|
95
|
+
- LICENSE
|
96
|
+
executables:
|
97
|
+
- ariel
|
98
|
+
extensions: []
|
99
|
+
|
100
|
+
requirements: []
|
101
|
+
|
102
|
+
dependencies: []
|
103
|
+
|