ariel 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/LICENSE +21 -0
  2. data/README +98 -0
  3. data/bin/ariel +56 -0
  4. data/examples/google_calculator/labeled/1 +43 -0
  5. data/examples/google_calculator/labeled/2 +41 -0
  6. data/examples/google_calculator/labeled/3 +41 -0
  7. data/examples/google_calculator/structure.rb +12 -0
  8. data/examples/google_calculator/structure.yaml +46 -0
  9. data/examples/google_calculator/unlabeled/1 +43 -0
  10. data/examples/google_calculator/unlabeled/2 +43 -0
  11. data/examples/raa/labeled/highline.html +135 -0
  12. data/examples/raa/labeled/mongrel.html +168 -0
  13. data/examples/raa/structure.rb +17 -0
  14. data/examples/raa/structure.yaml +183 -0
  15. data/examples/raa/unlabeled/pdf-writer.html +175 -0
  16. data/lib/ariel/candidate_selector.rb +94 -0
  17. data/lib/ariel/example_document_loader.rb +59 -0
  18. data/lib/ariel/extracted_node.rb +20 -0
  19. data/lib/ariel/label_utils.rb +71 -0
  20. data/lib/ariel/learner.rb +237 -0
  21. data/lib/ariel/node_like.rb +26 -0
  22. data/lib/ariel/rule.rb +112 -0
  23. data/lib/ariel/rule_set.rb +34 -0
  24. data/lib/ariel/structure_node.rb +75 -0
  25. data/lib/ariel/token.rb +68 -0
  26. data/lib/ariel/token_stream.rb +240 -0
  27. data/lib/ariel/wildcards.rb +33 -0
  28. data/lib/ariel.rb +69 -0
  29. data/test/ariel_test_case.rb +15 -0
  30. data/test/fixtures.rb +43 -0
  31. data/test/specs/token_spec.rb +65 -0
  32. data/test/specs/token_stream_spec.rb +43 -0
  33. data/test/specs/wildcards_spec.rb +26 -0
  34. data/test/test_candidate_selector.rb +58 -0
  35. data/test/test_example_document_loader.rb +7 -0
  36. data/test/test_label_utils.rb +15 -0
  37. data/test/test_learner.rb +38 -0
  38. data/test/test_rule.rb +38 -0
  39. data/test/test_structure_node.rb +81 -0
  40. data/test/test_token.rb +16 -0
  41. data/test/test_token_stream.rb +82 -0
  42. data/test/test_wildcards.rb +18 -0
  43. metadata +103 -0
@@ -0,0 +1,240 @@
1
module Ariel

  require 'enumerator'

  # A TokenStream instance stores a stream of Tokens once it has used its
  # tokenization rules to extract them from a string. A TokenStream knows its
  # current position (TokenStream#cur_pos), which is incremented when any of
  # the Enumerable methods are used (due to the redefinition of
  # TokenStream#each). As you advance through the stream, the current token
  # is always returned and then consumed. A TokenStream also provides methods
  # for finding patterns in a given stream much like StringScanner but for an
  # array of tokens. For rule generation, a certain token can be marked as
  # being the start point of a label. Finally, a TokenStream will record
  # whether it is in a reversed or unreversed state so that when rules are
  # applied, they are always applied from the front or end of the stream as
  # required, whether it is reversed or not.
  class TokenStream
    include Enumerable
    attr_accessor :tokens, :cur_pos, :label_index, :original_text

    def initialize()
      @tokens=[]
      @cur_pos=0
      @original_text = ""
      @token_regexen = [
        Wildcards.list[:html_tag], # Match html tags that don't have attributes
        /\d+/,     # Match any numbers, probably good to make a split
        /\b\w+\b/, # Pick up words, will split at punctuation
        /\S/       # Grab any characters left over that aren't whitespace
      ]
      @label_tag_regexen = [LabelUtils.any_label_regex]
      @reversed=false
    end

    # The tokenizer operates on a string by splitting it at every point it
    # finds a match to a regular expression. Each match is added as a token,
    # and the strings between each match are stored along with their original
    # offsets. The same is then done with the next regular expression on each
    # of these split strings, and new tokens are created with the correct
    # offset in the original text. Any characters left unmatched by any of
    # the regular expressions in @token_regexen are discarded. This approach
    # allows a hierarchy of regular expressions to work simply and easily. A
    # simple regular expression to match html tags might operate first, and
    # then later expressions that pick up runs of word characters can operate
    # on what's left. If contains_labels is set to true when calling
    # tokenize, the tokenizer will first remove and discard any occurences of
    # label_tags (as defined by the Regex set in LabelUtils) before matching
    # and adding tokens. Any label_tag tokens will be marked as such upon
    # creation. Returns the number of tokens now stored.
    #
    # NOTE(review): @tokens is not cleared here, so calling tokenize a second
    # time on the same instance accumulates tokens from both calls — confirm
    # streams are never reused before relying on that.
    def tokenize(input, contains_labels=false)
      string_array=[[input, 0]]
      @original_text = input
      @original_text_contains_labels=contains_labels
      @label_tag_regexen.each {|regex| split_string_array_by_regex(string_array, regex, false)} if contains_labels
      @token_regexen.each {|regex| split_string_array_by_regex(string_array, regex)}
      @tokens.sort!
      @tokens.size
    end

    # Goes through all stored Token instances, removing them if
    # Token#is_label_tag? Called after a labeled document has been extracted
    # to a tree ready for the rule learning process.
    def remove_label_tags
      @tokens.delete_if {|token| token.is_label_tag?}
    end

    # Returns the slice of the current instance containing all the tokens
    # between the token where the start_loc == the left parameter and the
    # token where the end_loc == the right parameter. Raises an ArgumentError
    # if no token starts at left or none ends at right.
    def slice_by_string_pos(left, right)
      # rindex preserves the original scan's last-match-wins semantics in the
      # (unlikely) case that several tokens share a boundary location.
      l_index = @tokens.rindex {|token| token.start_loc == left}
      r_index = @tokens.rindex {|token| token.end_loc == right}
      if l_index.nil? or r_index.nil?
        raise ArgumentError, "Cannot slice between those locations"
      else
        return slice_by_token_index(l_index, r_index)
      end
    end

    # Slices tokens between the l_index and the r_index inclusive. Returns a
    # shallow copy of the stream whose tokens array holds just that slice.
    def slice_by_token_index(l_index, r_index)
      sliced = self.dup
      sliced.tokens=@tokens[l_index..r_index]
      return sliced
    end

    # Used to ensure operations such as @tokens.reverse! in one instance
    # won't inadvertently affect another.
    def deep_clone
      Marshal::load(Marshal.dump(self))
    end

    # Set a label at a given offset in the original text. Searches for a
    # token with a start_loc equal to the position passed as an argument, and
    # raises an ArgumentError if one is not found. Returns the new
    # label_index.
    def set_label_at(pos)
      token_pos = @tokens.rindex {|token| token.start_loc == pos}
      if token_pos.nil?
        raise ArgumentError, "Given string position does not match the start of any token"
      else
        @label_index = token_pos
        debug "Token ##{label_index} - \"#{@tokens[label_index].text}\" labeled."
        return @label_index
      end
    end

    # Returns all text represented by the instance's stored tokens, stripping
    # any label tags if the stream was declared to be containing them when it
    # was initialized (this would only happen during the process of loading
    # labeled examples). See also TokenStream#raw_text
    def text(l_index=0, r_index=-1)
      out=raw_text(l_index, r_index)
      if @original_text_contains_labels
        LabelUtils.clean_string(out)
      else
        out
      end
    end

    # Returns all text represented by the instance's stored tokens it will
    # not strip label tags even if the stream is marked to contain them.
    # However, you should not expect to get the raw_text once any label_tags
    # have been filtered (TokenStream#remove_label_tags).
    def raw_text(l_index=0, r_index=-1)
      return "" if @tokens.size==0
      if reversed?
        l_index, r_index = r_index, l_index
      end
      @original_text[@tokens[l_index].start_loc...@tokens[r_index].end_loc]
    end

    # Returns the current Token and consumes it, or nil once the stream is
    # exhausted. (The previous implementation wrapped the increment in a
    # one-iteration `while true` loop; that dead control flow is removed.)
    def advance
      return nil if @cur_pos > @tokens.size
      @cur_pos+=1
      @tokens[@cur_pos-1]
    end

    # Return to the beginning of the TokenStream. Returns self.
    def rewind
      @cur_pos=0
      self
    end

    # Returns a copy of the current instance with a reversed set of tokens.
    # If it is set, the label_index is adjusted accordingly to point to the
    # correct token.
    def reverse
      self.deep_clone.reverse!
    end

    # Converts the given position so it points to the same token once the
    # stream is reversed. Result invalid for when @tokens.size==0
    def reverse_pos(pos)
      @tokens.size-(pos + 1)
    end

    # Same as TokenStream#reverse, but changes are made in place.
    def reverse!
      @tokens.reverse!
      if label_index
        @label_index = reverse_pos(@label_index)
      end
      @cur_pos = reverse_pos(@cur_pos)
      @reversed=!@reversed
      return self
    end

    # Returns true or false depending on whether the given tokenstream is in
    # a reversed state
    def reversed?
      @reversed
    end

    # Takes a list of Strings and Symbols as its arguments representing text
    # to be matched in individual tokens and Wildcards. For a match to be a
    # success, all wildcards and strings must match a consecutive sequence of
    # Tokens in the TokenStream. All matched Tokens are consumed, and the
    # TokenStream's current position is returned on success. On failure, the
    # TokenStream is returned to its original state and returns nil.
    def skip_to(*features)
      original_pos=@cur_pos
      self.each_cons(features.size) do |tokens|
        i=0
        return @cur_pos if tokens.all? {|token| i+=1; token.matches?(features[i-1])}
      end
      @cur_pos=original_pos #No match, return TokenStream to original state
      return nil
    end

    # Iterates over and consumes every Token from the cur_pos.
    def each
      while (token = self.advance)
        yield token
      end
    end

    # Returns the current Token without consuming it.
    def current_token
      @tokens[@cur_pos]
    end

    private

    # Uses split_by_regex to split each member of a given array of string and
    # offset pairs in to new arrays of string and offset pairs. Mutates
    # string_array in place via Array#replace.
    def split_string_array_by_regex(string_array, regex, add_matches=true)
      new_string_array = []
      string_array.each do |arr|
        result = split_by_regex(arr[0], arr[1], regex, add_matches)
        new_string_array.concat result
      end
      string_array.replace new_string_array
    end

    # For tokenization, removes regex matches and creates new strings to
    # represent the gaps between each match. Each match is appended to
    # @tokens (flagged as a label tag when add_matches is false).
    def split_by_regex(string, offset, regex, add_matches=true)
      split_points=[0]
      string_holder = []
      string.scan(regex) do |s|
        match = Regexp.last_match
        split_points << match.begin(0)
        split_points << match.end(0)
        @tokens << Token.new(match[0], match.begin(0)+offset, match.end(0)+offset, !add_matches)
      end
      split_points << string.size
      split_points.each_slice(2) do |s_pos, e_pos|
        split_string = string[s_pos...e_pos]
        string_holder << [split_string, s_pos+offset] unless split_string.empty?
      end
      return string_holder
    end
  end
end
@@ -0,0 +1,33 @@
1
module Ariel
  # Holds the named wildcard patterns available during rule generation.
  # Instances are never created (new is private); all access goes through
  # the class methods below.
  class Wildcards
    private_class_method :new

    @@list = {
      anything: /.+/,
      numeric: /\d+/,
      alpha_numeric: /\w+/,
      alpha: /[[:alpha:]]+/,
      capitalized: /[[:upper:]]+\w+/,
      all_caps: /[[:upper:]]+/,
      html_tag: /<\/?\w+>|<\w+\s+\/>/,
      punctuation: /[[:punct:]]+/
    }

    # Returns the hash mapping wildcard names (Symbols) to their Regexps.
    def self.list
      @@list
    end

    # Returns an array of the names (Symbols) of every wildcard whose regex
    # matches the whole of the given string. When a block is given, each
    # matching name is also yielded as it is found.
    def self.matching(string)
      @@list.each_with_object([]) do |(name, regex), matches|
        next unless string[regex] == string
        yield name if block_given?
        matches << name
      end
    end
  end
end
data/lib/ariel.rb ADDED
@@ -0,0 +1,69 @@
1
+ require 'ariel/token'
2
+ require 'ariel/token_stream'
3
+ require 'ariel/learner'
4
+ require 'ariel/node_like'
5
+ require 'ariel/extracted_node'
6
+ require 'ariel/structure_node'
7
+ require 'ariel/rule'
8
+ require 'ariel/wildcards'
9
+ require 'ariel/candidate_selector'
10
+ require 'ariel/label_utils'
11
+ require 'ariel/example_document_loader'
12
+ require 'ariel/rule_set'
13
+
14
# Define a top-level debug helper. The branch is taken once at load time:
# when ruby is started with -d ($DEBUG set), debug prints its message with
# Kernel#p; otherwise it is a no-op stub so callers can invoke debug
# unconditionally.
if $DEBUG
  # require 'logger'

  # DEBUGLOG = Logger.new(File.open('debug.log', 'wb'))
  # DEBUGLOG.datetime_format = " \010"
  # DEBUGLOG.progname = "\010\010\010"

  def debug(message)
    p message
    #DEBUGLOG.debug message
  end
else
  # No-op when debugging is disabled.
  def debug(message)
  end
end

# = Ariel - A Ruby Information Extraction Library
# Ariel intends to assist in extracting information from semi-structured
# documents including (but not in any way limited to) web pages. Although you
# may use libraries such as Hpricot or Rubyful Soup, or even plain Regular
# Expressions to achieve the same goal, Ariel approaches the problem very
# differently. Ariel relies on the user labeling examples of the data they
# want to extract, and then finds patterns across several such labeled
# examples in order to produce a set of general rules for extracting this
# information from any similar document.
#
# When working with Ariel, your workflow might look something like this:
# 1. Define a structure for the data you wish to extract. For example:
#
#      @structure = Ariel::StructureNode.new do |r|
#        r.article do |a|
#          a.title
#          a.author
#          a.date
#          a.body
#        end
#        r.comment_list do |c|
#          c.author
#          c.date
#          c.body
#        end
#      end
# 2. Label these fields in a few example documents (normally at least 3).
#    Labels are in the form of <tt><l:label_name>...</l:label_name></tt>
# 3. Ariel will read these examples, and try to generate suitable rules that can
#    be used to extract this data from other similarly structured documents.
# 4. A wrapper has been generated - we can now happily load documents with the
#    same structure (normally documents generated by the same rules, so
#    different pages from a single site perhaps) and query the extracted data.
module Ariel


end
68
+
69
+
@@ -0,0 +1,15 @@
1
require 'test/unit'
require 'fixtures'

module Ariel
  include Fixtures
  # Base class for Ariel's Test::Unit test cases. Overrides run to log the
  # name of each test method as it executes (via the top-level debug helper),
  # and supplies an empty default_test so the class itself can be loaded
  # without Test::Unit complaining that it defines no tests.
  class TestCase < Test::Unit::TestCase
    def run(result)
      # Skip logging for the placeholder default_test.
      debug "Running #{self.class.name}##{method_name}" unless method_name.to_s=="default_test"
      super
    end

    def default_test
    end
  end
end
data/test/fixtures.rb ADDED
@@ -0,0 +1,43 @@
1
# Shared fixture data for Ariel's test suite. Class variables are used so
# that test classes which include Fixtures can read them directly.
module Fixtures
  # A small labeled document: <l:name>...</l:name> tags mark the regions the
  # extraction rules should learn to find.
  @@labeled_document = <<EOS
Title: <l:title>The test of the Century</l:title>
<l:content><b>Excerpt</b>: <i><l:excerpt>A look back at what could be considered the greatest ever test.</l:excerpt></i>
<l:body>There was once a test designed to assess whether apply_extraction_tree_on worked.</l:body></l:content>
EOS
  # Structure tree matching @@labeled_document: a title plus a content node
  # containing an excerpt and a body.
  @@labeled_document_structure = Ariel::StructureNode.new do |r|
    r.item :title
    r.item :content do |c|
      c.item :excerpt
      c.item :body
    end
  end
  # The same document with every label tag stripped — used to check
  # label-removal round-trips.
  @@unlabeled_document=<<EOS
Title: The test of the Century
<b>Excerpt</b>: <i>A look back at what could be considered the greatest ever test.</i>
There was once a test designed to assess whether apply_extraction_tree_on worked.
EOS
  # Document with nested labels with clashing names. i.e. a label at the top
  # level as well as a label lower down in the tree that has the same label
  # name.
  @@labeled_document_with_list=<<EOS
Title: <l:title>Another example</l:title>
<l:body>I love to write examples, you love to read them, ruby loves to process them.
In conclusion, we're has happy as can be.</l:body>
<l:comment_list>Comments:
<l:comment>Title:<l:title>Great example</l:title>
<l:author>Adoring fan</l:author>
<l:body>Always love reading your examples, keep up the great work.</l:body>
</l:comment></l:comment_list>
EOS

  # Four tokenized address strings with a label set on each (the offsets
  # below are character positions into each string; presumably they mark the
  # area-code token targeted by the learner tests — TODO confirm against
  # test_learner.rb).
  @@labeled_addresses=Array.new(4) {Ariel::TokenStream.new}
  @@labeled_addresses[0].tokenize("513 Pico <b>Venice</b>, Phone: 1-<b>800</b>-555-1515")
  @@labeled_addresses[0].set_label_at 36
  @@labeled_addresses[1].tokenize("90 Colfax, <b> Palms </b>, Phone: (818) 508-1570")
  @@labeled_addresses[1].set_label_at 35
  @@labeled_addresses[2].tokenize("523 1st St., <b> LA </b>, Phone: 1-<b>888</b>-578-2293")
  @@labeled_addresses[2].set_label_at 38
  @@labeled_addresses[3].tokenize("403 La Tijera, <b> Watts </b>, Phone: (310) 798-0008")
  @@labeled_addresses[3].set_label_at 39

end
@@ -0,0 +1,65 @@
1
require 'ariel'

# Specs for Ariel::Token, written in the test/spec BDD syntax.
context "An average token" do
  setup do
    @token = Ariel::Token.new("Test", 0, 4)
  end
  specify "Should return the string it holds when text is called" do
    @token.text.should_equal "Test"
  end

  specify "Should not be a label tag" do
    @token.is_label_tag?.should_be false
  end

  # Description previously read "...true if if the token..." (doubled "if").
  specify "Should return true if the token string matches a given wildcard or equals a given string" do
    @token.matches?("Test").should_be true
    @token.matches?(:alpha_numeric).should_be true
  end

  specify "Should return false if the token string doesn't match the given wildcard or string" do
    @token.matches?("Tes").should_be false
    @token.matches?(:html_tag).should_be false
  end

  specify "Should raise an error if an invalid wildcard is given" do
    lambda {@token.matches? :not_a_wildcard}.should_raise ArgumentError
  end

  specify "Should be able to list all wildcard symbols that match its text" do
    @token.matching_wildcards.should_be_an_instance_of Array
    @token.matching_wildcards.each {|wildcard| wildcard.should_be_an_instance_of Symbol}
  end
end

# Equality and ordering semantics of Token.
context "Comparing two Tokens" do
  setup do
    @token1 = Ariel::Token.new("Alice", 0, 5)
    @token2 = Ariel::Token.new("Bob", 5, 8)
    @token1_clone = Ariel::Token.new("Alice", 0, 5)
    @token1_almost_clone = Ariel::Token.new("Alice", 0, 4)
  end

  specify "Should be equal if and only if text, start location and end location are equal" do
    @token1.should_equal @token1_clone
    @token1.should_not_equal @token2
    @token1.should_not_equal @token1_almost_clone
  end

  specify "Should define a way of comparing itself to other tokens" do
    @token1.should_respond_to :<=>
  end

  specify "Should make comparisons based on the start location of the token" do
    (@token1<=>@token1_almost_clone).should_equal 0
    (@token1<=>@token2).should_equal -1
  end
end

context "Initializing a label tag token" do
  # The old description claimed the token "Should be ignored", contradicting
  # the assertion below, which checks that the label-tag flag IS set.
  specify "Should be marked as a label tag if passed true as the final argument to Token#new" do
    Ariel::Token.new("Test", 0, 4, true).is_label_tag?.should_be true
  end
end
64
+
65
+
@@ -0,0 +1,43 @@
1
require 'ariel'
require 'fixtures'
include Fixtures

# Specs for the initial (empty) state of an Ariel::TokenStream, written in
# the test/spec BDD syntax.
context "A new TokenStream" do
  setup do
    @tokenstream = Ariel::TokenStream.new
  end

  specify "Should return 0 when cur_pos is called" do
    @tokenstream.cur_pos.should_equal 0
  end

  specify "Should return an empty Array when tokens is called" do
    @tokenstream.tokens.should_be_a_kind_of Array
    @tokenstream.tokens.should_be_empty
  end

  specify "Should not contain any tokens" do
    @tokenstream.tokens.size.should_equal 0
  end

  # Description previously read "went sent the message".
  specify "Should return an empty string when sent the message raw_text" do
    @tokenstream.raw_text.should_equal ""
  end

  specify "Should return nil when asked to advance" do
    @tokenstream.advance.should_be_nil
  end

  specify "cur_pos should increase to 1 when asked to advance and no further" do
    @tokenstream.advance
    @tokenstream.cur_pos.should_equal 1
    @tokenstream.advance
    @tokenstream.cur_pos.should_equal 1
  end

  specify "Should not be reversed" do
    @tokenstream.should_not_be_reversed
  end
end
42
+
43
+
@@ -0,0 +1,26 @@
1
require 'ariel'

# Specs for the Ariel::Wildcards class-level API, written in the test/spec
# BDD syntax.
context "When querying the Wildcards class" do

  # new is declared private_class_method, so instantiation must fail.
  specify "Should not be possible to create a Wildcards instance" do
    lambda {Ariel::Wildcards.new}.should_raise
  end

  specify "Should return a hash of Symbol to Regexp pairs when sent the list message" do
    wildcards=Ariel::Wildcards.list
    wildcards.should_be_a_kind_of Hash
    wildcards.keys.each {|key| key.should_be_a_kind_of Symbol}
    wildcards.values.each {|value| value.should_be_a_kind_of Regexp}
  end

  specify "When Wildcards.matching is called with a String, should return an array of the symbols of all matching wildcards" do
    Ariel::Wildcards.matching("Test").should_be_a_kind_of Array
    Ariel::Wildcards.matching("<a>").should_include :html_tag
  end

  specify "Should yield a symbol for every wildcard the string matches when Wildcards.matching is called" do
    list=[]
    Ariel::Wildcards.matching("<a>") {|wildcard| list << wildcard}
    list.should_not_be_empty
  end
end
@@ -0,0 +1,58 @@
1
require 'ariel'
require 'ariel_test_case'


# Tests for Ariel::CandidateSelector, which scores and filters a list of
# candidate extraction rules against a set of labeled example streams.
class TestCandidateSelector < Ariel::TestCase
  include Fixtures
  def setup
    # Must get rid of this repetition, should be available to all tests
    @e=@@labeled_addresses
    @candidates=[]
    @candidates << Ariel::Rule.new(:forward, [[:anything]])
    @candidates << Ariel::Rule.new(:forward, [[:numeric], [:numeric], [:numeric]])
    @candidates << Ariel::Rule.new(:forward, [["("]])
    @candidates << Ariel::Rule.new(:forward, [[:numeric, :alpha_numeric]])
    @selector=Ariel::CandidateSelector.new(@candidates, @e)
  end

  # score_by should produce one score per candidate rule.
  def test_score_by
    score_hash = @selector.score_by {|rule| rule.landmarks.size}
    assert_equal @candidates.size, score_hash.size
    assert_equal 1, score_hash.values.sort.first
  end

  # A constant scorer keeps every candidate; scoring by landmark count
  # should single out the rule with the most landmarks.
  def test_highest_scoring_by
    t1 = @selector.highest_scoring_by {|rule| 1}
    assert (t1.all? {|rule| rule.kind_of? Ariel::Rule})
    assert_equal @candidates.size, t1.size
    t2 = @selector.highest_scoring_by {|rule| rule.landmarks.size}
    assert_equal 1, t2.size
  end

  # Passing every match type keeps all candidates; a single type narrows
  # the selection down.
  def test_select_best_by_match_type
    @selector.select_best_by_match_type :fail, :early, :late, :perfect
    assert_equal @candidates, @selector.candidates
    @selector.select_best_by_match_type :late
    assert_equal 1, @selector.candidates.size
    assert_equal @candidates[1], @selector.candidates[0]
  end

  def test_select_with_fewer_wildcards
    assert_equal @selector.select_with_fewer_wildcards[0], @candidates[2]
    assert_equal 1, @selector.candidates.size
  end

  def test_select_closest_to_label
    assert_equal @candidates[2], @selector.select_closest_to_label[0]
    assert_equal 1, @selector.candidates.size
  end

  def test_select_with_longer_landmarks
    assert_equal @candidates[3], @selector.select_with_longer_end_landmarks[0]
    assert_equal 1, @selector.candidates.size
  end

  # random_from_remaining must return one of the original candidates.
  def test_random_from_remaining
    assert(@candidates.include?(@selector.random_from_remaining))
  end
end
@@ -0,0 +1,7 @@
1
require 'ariel'
require 'ariel_test_case'

# Placeholder for Ariel::ExampleDocumentLoader tests — no test methods have
# been written yet, so only the inherited default_test will run.
class TestExampleDocumentLoader < Ariel::TestCase
  include Fixtures

end
@@ -0,0 +1,15 @@
1
require 'ariel'
require 'ariel_test_case'

# Tests for Ariel::LabelUtils — the helpers that build label-tag regular
# expressions and strip label tags from strings.
class TestLabelUtils < Ariel::TestCase
  include Fixtures

  def test_label_regex
    # Expect a pair of distinct Regexps (presumably opening and closing tag
    # patterns — confirm against label_utils.rb).
    assert_equal 2, Ariel::LabelUtils.label_regex.uniq.size
    assert_kind_of Regexp, Ariel::LabelUtils.label_regex[0]
  end

  def test_clean_string
    # Stripping the label tags from the labeled fixture must reproduce the
    # unlabeled fixture exactly.
    assert_equal @@unlabeled_document, Ariel::LabelUtils.clean_string(@@labeled_document)
  end
end
@@ -0,0 +1,38 @@
1
require 'ariel'
require 'ariel_test_case'

# Tests for Ariel::Learner, the STALKER-style rule induction engine.
class TestLearner < Ariel::TestCase
  include Fixtures

  def setup
    # Examples stolen from the STALKER paper. Target to extract is the area
    # codes.
    @e=@@labeled_addresses
    @learner=Ariel::Learner.new(*@e)
  end

  def test_set_seed
    assert_equal @e[1], @learner.current_seed # LabeledStream with smallest label_index
  end

  def test_generate_initial_candidates
    @learner.direction=:forward
    @learner.generate_initial_candidates
    c=@learner.candidates
    assert (c.include? Ariel::Rule.new(:forward, [["("]]))
    assert (c.include? Ariel::Rule.new(:forward, [[:anything]]))
    assert (c.include? Ariel::Rule.new(:forward, [[:punctuation]]))
  end

  def test_refine
    @learner.current_rule=Ariel::Rule.new(:forward, [["<b>"]])
    assert @learner.refine
    @learner.current_rule=Ariel::Rule.new(:forward, [["<b>", "Palms"], ["Phone"]])
    assert @learner.refine
  end

  # Previously this just printed the learned rule (`p rule`) with no
  # assertion at all; now it asserts that learn_rule returns a Rule.
  def test_learn_rule
    rule=@learner.learn_rule :forward
    assert_kind_of Ariel::Rule, rule
  end
end