ariel 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +21 -0
- data/README +98 -0
- data/bin/ariel +56 -0
- data/examples/google_calculator/labeled/1 +43 -0
- data/examples/google_calculator/labeled/2 +41 -0
- data/examples/google_calculator/labeled/3 +41 -0
- data/examples/google_calculator/structure.rb +12 -0
- data/examples/google_calculator/structure.yaml +46 -0
- data/examples/google_calculator/unlabeled/1 +43 -0
- data/examples/google_calculator/unlabeled/2 +43 -0
- data/examples/raa/labeled/highline.html +135 -0
- data/examples/raa/labeled/mongrel.html +168 -0
- data/examples/raa/structure.rb +17 -0
- data/examples/raa/structure.yaml +183 -0
- data/examples/raa/unlabeled/pdf-writer.html +175 -0
- data/lib/ariel/candidate_selector.rb +94 -0
- data/lib/ariel/example_document_loader.rb +59 -0
- data/lib/ariel/extracted_node.rb +20 -0
- data/lib/ariel/label_utils.rb +71 -0
- data/lib/ariel/learner.rb +237 -0
- data/lib/ariel/node_like.rb +26 -0
- data/lib/ariel/rule.rb +112 -0
- data/lib/ariel/rule_set.rb +34 -0
- data/lib/ariel/structure_node.rb +75 -0
- data/lib/ariel/token.rb +68 -0
- data/lib/ariel/token_stream.rb +240 -0
- data/lib/ariel/wildcards.rb +33 -0
- data/lib/ariel.rb +69 -0
- data/test/ariel_test_case.rb +15 -0
- data/test/fixtures.rb +43 -0
- data/test/specs/token_spec.rb +65 -0
- data/test/specs/token_stream_spec.rb +43 -0
- data/test/specs/wildcards_spec.rb +26 -0
- data/test/test_candidate_selector.rb +58 -0
- data/test/test_example_document_loader.rb +7 -0
- data/test/test_label_utils.rb +15 -0
- data/test/test_learner.rb +38 -0
- data/test/test_rule.rb +38 -0
- data/test/test_structure_node.rb +81 -0
- data/test/test_token.rb +16 -0
- data/test/test_token_stream.rb +82 -0
- data/test/test_wildcards.rb +18 -0
- metadata +103 -0
data/test/test_rule.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'ariel_test_case'
|
3
|
+
|
4
|
+
# Exercises Ariel::Rule#apply_to, #matches and #wildcard_count against one
# labeled and one unlabeled token stream.
class TestRule < Ariel::TestCase
  # Builds the fixtures shared by every test: a tokenized stream with a label
  # set via set_label_at(35), three forward rules that the tests below expect
  # to land on, before, and after that label, and an unlabeled stream.
  def setup
    @labeled = Ariel::TokenStream.new
    @labeled.tokenize("90 Colfax, <b> Palms </b>, Phone: (818) 508-1570")
    @labeled.set_label_at 35
    @perfect_rule = Ariel::Rule.new(:forward, [["Phone"], ["("]])
    @early_rule = Ariel::Rule.new(:forward, [[:anything]])
    @late_rule = Ariel::Rule.new(:forward, [["508"]])
    @unlabeled = Ariel::TokenStream.new
    @unlabeled.tokenize("Robot 9753 reporting for duty. BEEP BEEP")
  end

  # Checks the value yielded by apply_to for each rule, and that applying a
  # rule to the unlabeled stream returns nil.
  def test_apply_to
    md = nil
    # Assign inside the block body rather than through the block parameter:
    # since Ruby 1.9 block parameters are always block-local, so `{ |md| }`
    # would leave the outer `md` nil and the assertions would hit nil.
    @perfect_rule.apply_to(@labeled) { |yielded| md = yielded }
    assert_equal :perfect, md.type
    @early_rule.apply_to(@labeled) { |yielded| md = yielded }
    assert_equal :early, md.type
    assert_equal 1, md.token_loc
    @late_rule.apply_to(@labeled) { |yielded| md = yielded }
    assert_equal :late, md.type
    assert_nil(@perfect_rule.apply_to(@unlabeled))
  end

  # Checks matches with one or several acceptable match types.
  def test_matches
    assert(@early_rule.matches(@labeled, :early))
    assert(@late_rule.matches(@labeled, :early, :late))
    assert(@perfect_rule.matches(@unlabeled, :fail))
  end

  # The literal-only rule reports 0 wildcards; the [:anything] rule reports 1.
  def test_wildcard_count
    assert_equal 0, @perfect_rule.wildcard_count
    assert_equal 1, @early_rule.wildcard_count
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'ariel_test_case'
|
3
|
+
|
4
|
+
# Tests building and extending Ariel::StructureNode trees through the block DSL.
class TestStructureNode < Ariel::TestCase
  # Creates a tree with a single :item_info node holding three child items.
  def setup
    @tree = Ariel::StructureNode.new do |root|
      root.item :item_info do |info|
        info.item :title
        info.item :price
        info.item :stock_level
      end
    end
  end

  # A flat structure: children are reachable by name and the top node is :root.
  def test_unnested
    flat = Ariel::StructureNode.new do |root|
      root.item :picture
      root.item :title
      root.item :description
      root.item :url
    end
    assert flat
    assert_equal Ariel::StructureNode, flat.picture.class
    assert_equal :root, flat.meta.name
  end

  # Children declared in a nested block are registered on the nested node.
  def test_nested
    assert @tree.item_info.children.has_key?(:title)
  end

  # A node declared with list_item reports :list as its node_type.
  def test_nested_with_list
    doc_tree = Ariel::StructureNode.new do |root|
      root.item :restaurant_list do |list|
        list.list_item :restaurant do |restaurant|
          restaurant.item :name
          restaurant.item :address
          restaurant.item :phone
          restaurant.item :review
          restaurant.item :credit_card_list do |cards|
            cards.item :credit_card
          end
        end
      end
    end
    assert doc_tree
    assert_equal :list, doc_tree.restaurant_list.restaurant.meta.node_type
  end

  # extend_structure adds new children to the root and to a nested node.
  def test_extend_structure
    root_result = @tree.extend_structure do |root|
      root.item :site_copyright
      root.item :logo
    end
    assert root_result
    assert @tree.children.has_key?(:site_copyright)
    assert @tree.children.has_key?(:logo)

    nested_result = @tree.item_info.extend_structure { |info| info.item :picture }
    assert nested_result
    assert @tree.item_info.children.has_key?(:picture)
  end

  # Disabled test, kept commented out for reference.
  # def test_apply_extraction_tree_on
  #   t = Ariel::StructureNode.new do |r|
  #     r.title
  #     r.content do |c|
  #       c.excerpt
  #       c.body
  #     end
  #   end
  #   str = %q{Title: The test of the Century
  #   <b>Excerpt</b>: <i>A look back at what could be considered the greatest ever test.</i>
  #   There was once a test designed to assess whether apply_extraction_tree_on worked.}
  #   tokenstream = Ariel::TokenStream.new
  #   tokenstream.tokenize(str)
  #   root = Ariel::ExtractedNode.new(tokenstream, :structure=>t, :name=>:root)
  #   t.title.meta.start_rule = Ariel::Rule.new(["Title", ":"])
  #   t.title.meta.end_rule = Ariel::Rule.new(["<b>"])
  #   t.title.meta.end_rule.direction = :back
  #   t.content.meta.start_rule = Ariel::Rule.new(["Century"]) #later implementation might use skip_until("<b>")
  #   t.content.meta.end_rule = Ariel::Rule.new()
  #   t.content.meta.end_rule.direction = :back
  #   t.content.excerpt.meta.start_rule = Ariel::Rule.new(["<i>"])
  #   t.content.excerpt.meta.end_rule = Ariel::Rule.new([".</"])
  #   t.content.excerpt.meta.end_rule.direction = :back
  #   t.content.body.meta.start_rule = Ariel::Rule.new(["i", ">"])
  #   t.content.body.meta.end_rule = Ariel::Rule.new()
  #   t.content.body.meta.end_rule.direction = :back
  #   t.apply_extraction_tree_on root
  # end
end
|
data/test/test_token.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'ariel_test_case'
|
3
|
+
|
4
|
+
# Tests Ariel::Token#matches? with literal strings and with symbol arguments.
class TestToken < Ariel::TestCase
  # A single token with text 'Test', constructed with the arguments 0 and 4.
  def setup
    @token = Ariel::Token.new('Test', 0, 4)
  end

  # String arguments must equal the token text exactly: a different case or a
  # partial string yields false. Of the symbols, :alpha matches and :html_tag
  # does not.
  def test_matches?
    assert @token.matches?('Test')
    ['test', 'te'].each do |non_matching|
      assert_equal false, @token.matches?(non_matching)
    end
    assert @token.matches?(:alpha)
    assert_equal false, @token.matches?(:html_tag)
  end
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'ariel_test_case'
|
3
|
+
|
4
|
+
# Tests Ariel::TokenStream: tokenizing, cursor movement, skipping, labels,
# text reconstruction and slicing. Document fixtures come from Fixtures.
class TestTokenStream < Ariel::TestCase
  include Fixtures

  # Prepares a plain stream over a short sentence and a second stream built
  # from the labeled fixture document (tokenize is passed `true` for it).
  def setup
    @text = "This is test101. See below:"
    @stream = Ariel::TokenStream.new
    @stream.tokenize(@text)

    @labeled_stream = Ariel::TokenStream.new
    @labeled_stream.tokenize(@@labeled_document, true)
  end

  # The first advance returns the token for "This".
  def test_advance
    expected = Ariel::Token.new("This", 0, 4)
    assert_equal expected, @stream.advance
  end

  # cur_pos starts at 0 and is 1 after one advance.
  def test_cur_pos
    assert_equal 0, @stream.cur_pos
    @stream.advance
    assert_equal 1, @stream.cur_pos
  end

  # each yields 8 times for the sample text and leaves cur_pos at 9.
  def test_each
    yielded = 0
    @stream.each { yielded += 1 }
    assert_equal 8, yielded
    assert_equal 9, @stream.cur_pos
  end

  # rewind resets the cursor after a full iteration.
  def test_rewind
    @stream.each {}
    @stream.rewind
    assert_equal 0, @stream.cur_pos
  end

  def test_skip_to
    assert @stream.skip_to("This")
    # The matched token has been consumed
    assert_equal 1, @stream.cur_pos
    assert_nil @stream.skip_to("Ruby")
    # The stream's position remains unchanged by a failed match
    assert_equal 1, @stream.cur_pos
    assert @stream.skip_to("See", "below")
    assert_equal 7, @stream.cur_pos
    @stream.rewind
    @stream.skip_to(:anything, "below")
    assert_equal 7, @stream.cur_pos
  end

  # Each token's start_loc...end_loc range must slice its text back out of the
  # document that was tokenized.
  def test_tokenize
    assert_equal 8, @stream.tokens.length
    @stream.each do |tok|
      span = tok.start_loc...tok.end_loc
      assert_equal @text[span], tok.text
    end
    @labeled_stream.each do |tok|
      span = tok.start_loc...tok.end_loc
      assert_equal @@labeled_document[span], tok.text
    end
  end

  # set_label_at raises ArgumentError on the plain stream and leaves
  # label_index nil; on the labeled stream it succeeds and label_index then
  # points at the token "The".
  def test_set_label_at
    assert_raise(ArgumentError) { @stream.set_label_at 1 }
    assert_nil @stream.label_index
    assert(@labeled_stream.set_label_at(16))
    labeled_token = @labeled_stream.tokens[@labeled_stream.label_index]
    assert_equal("The", labeled_token.text)
  end

  # raw_text returns the input as tokenized; for the fixture it equals the
  # labeled document with its trailing newline chomped.
  def test_raw_text
    assert_equal @text, @stream.raw_text
    assert_equal @@labeled_document.chomp, @labeled_stream.raw_text
  end

  # text of the labeled stream equals the unlabeled fixture document (chomped).
  def test_text
    assert_equal @text, @stream.text
    assert_equal @@unlabeled_document.chomp, @labeled_stream.text
  end

  # The text of a slice equals the source text spanning its first to last token.
  def test_slice_by_token_index
    sliced = @stream.slice_by_token_index(1, 3)
    assert sliced
    from = sliced.tokens.first.start_loc
    upto = sliced.tokens.last.end_loc
    assert_equal @text[from...upto], sliced.text
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'ariel_test_case'
|
3
|
+
|
4
|
+
# Tests the Ariel::Wildcards registry: listing and matching against a string.
class TestWildcards < Ariel::TestCase
  # list returns a truthy value that is a Hash.
  def test_list
    wildcards = Ariel::Wildcards.list
    assert wildcards
    assert wildcards.kind_of?(Hash)
  end

  # "123" matches exactly three wildcards: :alpha_numeric, :numeric, :anything.
  def test_matching
    matches = Ariel::Wildcards.matching("123")
    assert matches
    [:alpha_numeric, :numeric, :anything].each do |expected|
      assert matches.include?(expected)
    end
    assert_equal 3, matches.size
  end
end
|
metadata
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.11
|
3
|
+
specification_version: 1
|
4
|
+
name: ariel
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.0.1
|
7
|
+
date: 2006-08-09 00:00:00 +01:00
|
8
|
+
summary: A Ruby Information Extraction Library
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: asbradbury@gmail.com
|
12
|
+
homepage: http://ariel.rubyforge.org
|
13
|
+
rubyforge_project: ariel
|
14
|
+
description: Ariel uses machine learning to assist in extracting information from semi-structured documents including (but not in any way limited to) web pages
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
authors:
|
29
|
+
- A. S. Bradbury
|
30
|
+
files:
|
31
|
+
- lib/ariel
|
32
|
+
- lib/ariel.rb
|
33
|
+
- lib/ariel/extracted_node.rb
|
34
|
+
- lib/ariel/learner.rb
|
35
|
+
- lib/ariel/example_document_loader.rb
|
36
|
+
- lib/ariel/rule_set.rb
|
37
|
+
- lib/ariel/rule.rb
|
38
|
+
- lib/ariel/wildcards.rb
|
39
|
+
- lib/ariel/token_stream.rb
|
40
|
+
- lib/ariel/label_utils.rb
|
41
|
+
- lib/ariel/structure_node.rb
|
42
|
+
- lib/ariel/token.rb
|
43
|
+
- lib/ariel/candidate_selector.rb
|
44
|
+
- lib/ariel/node_like.rb
|
45
|
+
- test/test_learner.rb
|
46
|
+
- test/specs
|
47
|
+
- test/test_rule.rb
|
48
|
+
- test/ariel_test_case.rb
|
49
|
+
- test/fixtures.rb
|
50
|
+
- test/test_token_stream.rb
|
51
|
+
- test/test_example_document_loader.rb
|
52
|
+
- test/test_token.rb
|
53
|
+
- test/test_structure_node.rb
|
54
|
+
- test/test_label_utils.rb
|
55
|
+
- test/test_candidate_selector.rb
|
56
|
+
- test/test_wildcards.rb
|
57
|
+
- test/specs/token_stream_spec.rb
|
58
|
+
- test/specs/wildcards_spec.rb
|
59
|
+
- test/specs/token_spec.rb
|
60
|
+
- README
|
61
|
+
- LICENSE
|
62
|
+
- examples/raa
|
63
|
+
- examples/google_calculator
|
64
|
+
- examples/raa/structure.rb
|
65
|
+
- examples/raa/labeled
|
66
|
+
- examples/raa/unlabeled
|
67
|
+
- examples/raa/structure.yaml
|
68
|
+
- examples/raa/labeled/mongrel.html
|
69
|
+
- examples/raa/labeled/highline.html
|
70
|
+
- examples/raa/unlabeled/pdf-writer.html
|
71
|
+
- examples/google_calculator/structure.rb
|
72
|
+
- examples/google_calculator/labeled
|
73
|
+
- examples/google_calculator/unlabeled
|
74
|
+
- examples/google_calculator/structure.yaml
|
75
|
+
- examples/google_calculator/labeled/1
|
76
|
+
- examples/google_calculator/labeled/2
|
77
|
+
- examples/google_calculator/labeled/3
|
78
|
+
- examples/google_calculator/unlabeled/1
|
79
|
+
- examples/google_calculator/unlabeled/2
|
80
|
+
- bin/ariel
|
81
|
+
test_files:
|
82
|
+
- test/test_learner.rb
|
83
|
+
- test/test_rule.rb
|
84
|
+
- test/test_token_stream.rb
|
85
|
+
- test/test_example_document_loader.rb
|
86
|
+
- test/test_token.rb
|
87
|
+
- test/test_structure_node.rb
|
88
|
+
- test/test_label_utils.rb
|
89
|
+
- test/test_candidate_selector.rb
|
90
|
+
- test/test_wildcards.rb
|
91
|
+
rdoc_options: []
|
92
|
+
|
93
|
+
extra_rdoc_files:
|
94
|
+
- README
|
95
|
+
- LICENSE
|
96
|
+
executables:
|
97
|
+
- ariel
|
98
|
+
extensions: []
|
99
|
+
|
100
|
+
requirements: []
|
101
|
+
|
102
|
+
dependencies: []
|
103
|
+
|