ariel 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. data/LICENSE +21 -0
  2. data/README +98 -0
  3. data/bin/ariel +56 -0
  4. data/examples/google_calculator/labeled/1 +43 -0
  5. data/examples/google_calculator/labeled/2 +41 -0
  6. data/examples/google_calculator/labeled/3 +41 -0
  7. data/examples/google_calculator/structure.rb +12 -0
  8. data/examples/google_calculator/structure.yaml +46 -0
  9. data/examples/google_calculator/unlabeled/1 +43 -0
  10. data/examples/google_calculator/unlabeled/2 +43 -0
  11. data/examples/raa/labeled/highline.html +135 -0
  12. data/examples/raa/labeled/mongrel.html +168 -0
  13. data/examples/raa/structure.rb +17 -0
  14. data/examples/raa/structure.yaml +183 -0
  15. data/examples/raa/unlabeled/pdf-writer.html +175 -0
  16. data/lib/ariel/candidate_selector.rb +94 -0
  17. data/lib/ariel/example_document_loader.rb +59 -0
  18. data/lib/ariel/extracted_node.rb +20 -0
  19. data/lib/ariel/label_utils.rb +71 -0
  20. data/lib/ariel/learner.rb +237 -0
  21. data/lib/ariel/node_like.rb +26 -0
  22. data/lib/ariel/rule.rb +112 -0
  23. data/lib/ariel/rule_set.rb +34 -0
  24. data/lib/ariel/structure_node.rb +75 -0
  25. data/lib/ariel/token.rb +68 -0
  26. data/lib/ariel/token_stream.rb +240 -0
  27. data/lib/ariel/wildcards.rb +33 -0
  28. data/lib/ariel.rb +69 -0
  29. data/test/ariel_test_case.rb +15 -0
  30. data/test/fixtures.rb +43 -0
  31. data/test/specs/token_spec.rb +65 -0
  32. data/test/specs/token_stream_spec.rb +43 -0
  33. data/test/specs/wildcards_spec.rb +26 -0
  34. data/test/test_candidate_selector.rb +58 -0
  35. data/test/test_example_document_loader.rb +7 -0
  36. data/test/test_label_utils.rb +15 -0
  37. data/test/test_learner.rb +38 -0
  38. data/test/test_rule.rb +38 -0
  39. data/test/test_structure_node.rb +81 -0
  40. data/test/test_token.rb +16 -0
  41. data/test/test_token_stream.rb +82 -0
  42. data/test/test_wildcards.rb +18 -0
  43. metadata +103 -0
data/lib/ariel/token_stream.rb ADDED
@@ -0,0 +1,240 @@
+ module Ariel
+
+   require 'enumerator'
+
+   # A TokenStream instance stores a stream of Tokens once it has used its tokenization
+   # rules to extract them from a string. A TokenStream knows its current
+   # position (TokenStream#cur_pos), which is incremented when any of the
+   # Enumerable methods are used (due to the redefinition of TokenStream#each).
+   # As you advance through the stream, the current token is always returned and
+   # then consumed. A TokenStream also provides methods for finding patterns in a
+   # given stream, much like StringScanner but for an array of tokens. For rule
+   # generation, a certain token can be marked as being the start point of a label.
+   # Finally, a TokenStream will record whether it is in a reversed or unreversed
+   # state so that when rules are applied, they are always applied from the front
+   # or end of the stream as required, whether it is reversed or not.
+   class TokenStream
+     include Enumerable
+     attr_accessor :tokens, :cur_pos, :label_index, :original_text
+
+     def initialize()
+       @tokens=[]
+       @cur_pos=0
+       @original_text = ""
+       @token_regexen = [
+         Wildcards.list[:html_tag], # Match html tags that don't have attributes
+         /\d+/, # Match any numbers, probably good to make a split
+         /\b\w+\b/, # Pick up words, will split at punctuation
+         /\S/ # Grab any characters left over that aren't whitespace
+       ]
+       @label_tag_regexen = [LabelUtils.any_label_regex]
+       @reversed=false
+     end
+
+     # The tokenizer operates on a string by splitting it at every point it
+     # finds a match to a regular expression. Each match is added as a token, and
+     # the strings between each match are stored along with their original
+     # offsets. The same is then done with the next regular expression on each of
+     # these split strings, and new tokens are created with the correct offset in
+     # the original text. Any characters left unmatched by any of the regular
+     # expressions in @token_regexen are discarded. This approach allows a
+     # hierarchy of regular expressions to work simply and easily. A simple
+     # regular expression to match html tags might operate first, and then later
+     # expressions that pick up runs of word characters can operate on what's
+     # left. If contains_labels is set to true when calling tokenize, the
+     # tokenizer will first remove and discard any occurrences of label_tags (as
+     # defined by the regexes in LabelUtils) before matching and adding tokens.
+     # Any label_tag tokens will be marked as such upon creation.
+     def tokenize(input, contains_labels=false)
+       string_array=[[input, 0]]
+       @original_text = input
+       @original_text_contains_labels=contains_labels
+       @label_tag_regexen.each {|regex| split_string_array_by_regex(string_array, regex, false)} if contains_labels
+       @token_regexen.each {|regex| split_string_array_by_regex(string_array, regex)}
+       @tokens.sort!
+       @tokens.size
+     end
+
+     # Goes through all stored Token instances, removing them if
+     # Token#is_label_tag? is true. Called after a labeled document has been
+     # extracted to a tree ready for the rule learning process.
+     def remove_label_tags
+       @tokens.delete_if {|token| token.is_label_tag?}
+     end
+
+     # Returns the slice of the current instance containing all the tokens
+     # between the token where the start_loc == the left parameter and the token
+     # where the end_loc == the right parameter.
+     def slice_by_string_pos(left, right)
+       l_index=nil
+       r_index=nil
+       @tokens.each_index {|i| l_index = i if @tokens[i].start_loc == left}
+       @tokens.each_index {|i| r_index = i if @tokens[i].end_loc == right}
+       if l_index.nil? or r_index.nil?
+         raise ArgumentError, "Cannot slice between those locations"
+       else
+         return slice_by_token_index(l_index, r_index)
+       end
+     end
+
+     # Slices tokens between the l_index and the r_index inclusive.
+     def slice_by_token_index(l_index, r_index)
+       sliced = self.dup
+       sliced.tokens=@tokens[l_index..r_index]
+       return sliced
+     end
+
+     # Used to ensure operations such as @tokens.reverse! in one instance won't
+     # inadvertently affect another.
+     def deep_clone
+       Marshal::load(Marshal.dump(self))
+     end
+
+     # Set a label at a given offset in the original text. Searches for a token
+     # with a start_loc equal to the position passed as an argument, and raises
+     # an error if one is not found.
+     def set_label_at(pos)
+       token_pos=nil
+       @tokens.each_index {|i| token_pos = i if @tokens[i].start_loc == pos}
+       if token_pos.nil?
+         raise ArgumentError, "Given string position does not match the start of any token"
+       else
+         @label_index = token_pos
+         debug "Token ##{label_index} - \"#{@tokens[label_index].text}\" labeled."
+         return @label_index
+       end
+     end
+
+     # Returns all text represented by the instance's stored tokens, stripping any
+     # label tags if the stream was declared to contain them when it was
+     # tokenized (this would only happen during the process of loading labeled
+     # examples). See also TokenStream#raw_text.
+     def text(l_index=0, r_index=-1)
+       out=raw_text(l_index, r_index)
+       if @original_text_contains_labels
+         LabelUtils.clean_string(out)
+       else
+         out
+       end
+     end
+
+     # Returns all text represented by the instance's stored tokens; it will not
+     # strip label tags even if the stream is marked to contain them. However,
+     # you should not expect to get the raw_text once any label_tags have been
+     # filtered (TokenStream#remove_label_tags).
+     def raw_text(l_index=0, r_index=-1)
+       return "" if @tokens.size==0
+       if reversed?
+         l_index, r_index = r_index, l_index
+       end
+       @original_text[@tokens[l_index].start_loc...@tokens[r_index].end_loc]
+     end
+
+     # Returns the current Token and consumes it.
+     def advance
+       return nil if @cur_pos > @tokens.size
+       while true
+         @cur_pos+=1
+         current_token = @tokens[@cur_pos-1]
+         return nil if current_token.nil?
+         return current_token
+       end
+     end
+
+     # Return to the beginning of the TokenStream.
+     def rewind
+       @cur_pos=0
+       self
+     end
+
+     # Returns a copy of the current instance with a reversed set of tokens. If
+     # it is set, the label_index is adjusted accordingly to point to the correct
+     # token.
+     def reverse
+       self.deep_clone.reverse!
+     end
+
+     # Converts the given position so it points to the same token once the stream
+     # is reversed. The result is invalid when @tokens.size==0.
+     def reverse_pos(pos)
+       @tokens.size-(pos + 1)
+     end
+
+     # Same as TokenStream#reverse, but changes are made in place.
+     def reverse!
+       @tokens.reverse!
+       if label_index
+         @label_index = reverse_pos(@label_index)
+       end
+       @cur_pos = reverse_pos(@cur_pos)
+       @reversed=!@reversed
+       return self
+     end
+
+     # Returns true or false depending on whether the TokenStream is in a
+     # reversed state.
+     def reversed?
+       @reversed
+     end
+
+     # Takes a list of Strings and Symbols as its arguments; Strings represent text to be matched in
+     # individual tokens and Symbols represent Wildcards. For a match to be a
+     # success, all wildcards and strings must match a consecutive sequence
+     # of Tokens in the TokenStream. All matched Tokens are consumed, and the
+     # TokenStream's current position is returned on success. On failure, the
+     # TokenStream is returned to its original state and returns nil.
+     def skip_to(*features)
+       original_pos=@cur_pos
+       self.each_cons(features.size) do |tokens|
+         i=0
+         return @cur_pos if tokens.all? {|token| i+=1; token.matches?(features[i-1])}
+       end
+       @cur_pos=original_pos #No match, return TokenStream to original state
+       return nil
+     end
+
+     # Iterates over and consumes every Token from the cur_pos.
+     def each
+       while (token = self.advance)
+         yield token
+       end
+     end
+
+     # Returns the current Token.
+     def current_token
+       @tokens[@cur_pos]
+     end
+
+     private
+
+     # Uses split_by_regex to split each member of a given array of string and
+     # offset pairs into new arrays of string and offset pairs.
+     def split_string_array_by_regex(string_array, regex, add_matches=true)
+       new_string_array = []
+       string_array.each do |arr|
+         result = split_by_regex(arr[0], arr[1], regex, add_matches)
+         new_string_array.concat result
+       end
+       string_array.replace new_string_array
+     end
+
+     # For tokenization, removes regex matches and creates new strings to
+     # represent the gaps between each match.
+     def split_by_regex(string, offset, regex, add_matches=true)
+       split_points=[0]
+       string_holder = []
+       string.scan(regex) do |s|
+         match = Regexp.last_match
+         split_points << match.begin(0)
+         split_points << match.end(0)
+         @tokens << Token.new(match[0], match.begin(0)+offset, match.end(0)+offset, !add_matches)
+       end
+       split_points << string.size
+       split_points.each_slice(2) do |s_pos, e_pos|
+         split_string = string[s_pos...e_pos]
+         string_holder << [split_string, s_pos+offset] unless split_string.empty?
+       end
+       return string_holder
+     end
+   end
+ end
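
To make the tokenization and matching behaviour above concrete, here is a minimal usage sketch based only on methods defined in this file (an illustration, not part of the gem; it assumes ariel is installed so that `require 'ariel'` resolves):

    require 'ariel'

    stream = Ariel::TokenStream.new
    stream.tokenize("Phone: 1-<b>800</b>-555-1515")  # => 11 (tokens created)

    # The html_tag regex runs first, so "<b>" survives as a single token; the
    # number, word and catch-all regexen then split whatever text is left.
    stream.advance.text        # => "Phone"
    stream.skip_to("<b>")      # consumes tokens up to and including "<b>"
    stream.current_token.text  # => "800"

    # A reversed copy lets the same rule machinery run from the end of the
    # stream; rewind resets the copied position before iterating again.
    reversed = stream.reverse.rewind
    reversed.advance.text      # => "1515"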
data/lib/ariel/wildcards.rb ADDED
@@ -0,0 +1,33 @@
+ module Ariel
+   # Contains all wildcards to be used in rule generation.
+   class Wildcards
+     private_class_method :new
+     @@list = {
+       :anything=>/.+/,
+       :numeric=>/\d+/,
+       :alpha_numeric=>/\w+/,
+       :alpha=>/[[:alpha:]]+/,
+       :capitalized=>/[[:upper:]]+\w+/,
+       :all_caps=>/[[:upper:]]+/,
+       :html_tag=>/<\/?\w+>|<\w+\s+\/>/,
+       :punctuation=>/[[:punct:]]+/
+     }
+     # Returns the hash of wildcard name (symbol) and regular expression pairs.
+     def self.list
+       @@list
+     end
+
+     # Given a string, will return an array of symbols from Wildcards::list that
+     # match it.
+     def self.matching(string)
+       matches=[]
+       @@list.each do |name, regex|
+         if string[regex]==string
+           yield name if block_given?
+           matches << name
+         end
+       end
+       matches
+     end
+   end
+ end
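
A short illustration of the two class methods above (again an aside, not part of the gem; note that :anything matches every non-empty string, so it accompanies any more specific wildcard):

    require 'ariel'

    Ariel::Wildcards.matching("800")  # includes :anything, :numeric and :alpha_numeric
    Ariel::Wildcards.matching("<a>")  # includes :anything and :html_tag

    # The block form yields each matching wildcard name as it is found.
    Ariel::Wildcards.matching("<a>") {|name| puts name}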
data/lib/ariel.rb ADDED
@@ -0,0 +1,69 @@
+ require 'ariel/token'
+ require 'ariel/token_stream'
+ require 'ariel/learner'
+ require 'ariel/node_like'
+ require 'ariel/extracted_node'
+ require 'ariel/structure_node'
+ require 'ariel/rule'
+ require 'ariel/wildcards'
+ require 'ariel/candidate_selector'
+ require 'ariel/label_utils'
+ require 'ariel/example_document_loader'
+ require 'ariel/rule_set'
+
+ if $DEBUG
+   # require 'logger'
+
+   # DEBUGLOG = Logger.new(File.open('debug.log', 'wb'))
+   # DEBUGLOG.datetime_format = " \010"
+   # DEBUGLOG.progname = "\010\010\010"
+
+   def debug(message)
+     p message
+     #DEBUGLOG.debug message
+   end
+ else
+   def debug(message)
+   end
+ end
+
+ # = Ariel - A Ruby Information Extraction Library
+ # Ariel intends to assist in extracting information from semi-structured
+ # documents including (but not in any way limited to) web pages. Although you
+ # may use libraries such as Hpricot or Rubyful Soup, or even plain Regular
+ # Expressions to achieve the same goal, Ariel approaches the problem very
+ # differently. Ariel relies on the user labeling examples of the data they
+ # want to extract, and then finds patterns across several such labeled
+ # examples in order to produce a set of general rules for extracting this
+ # information from any similar document.
+ #
+ # When working with Ariel, your workflow might look something like this:
+ # 1. Define a structure for the data you wish to extract. For example:
+ #
+ #    @structure = Ariel::StructureNode.new do |r|
+ #      r.article do |a|
+ #        a.title
+ #        a.author
+ #        a.date
+ #        a.body
+ #      end
+ #      r.comment_list do |c|
+ #        c.author
+ #        c.date
+ #        c.body
+ #      end
+ #    end
+ # 2. Label these fields in a few example documents (normally at least 3).
+ #    Labels are in the form of <tt><l:label_name>...</l:label_name></tt>
+ # 3. Ariel will read these examples, and try to generate suitable rules that can
+ #    be used to extract this data from other similarly structured documents.
+ # 4. A wrapper has been generated - we can now happily load documents with the
+ #    same structure (normally documents generated by the same rules, so
+ #    different pages from a single site perhaps) and query the extracted data.
+ module Ariel
+
+
+
+ end
+
+
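
To make step 2 of the workflow concrete, a labeled fragment (hypothetical, but using the <l:label_name>...</l:label_name> syntax that the fixtures below rely on) might look like:

    Title: <l:title>An example article</l:title>
    <l:body>The label tags wrap exactly the text that should be extracted.</l:body>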
data/test/ariel_test_case.rb ADDED
@@ -0,0 +1,15 @@
+ require 'test/unit'
+ require 'fixtures'
+
+ module Ariel
+   include Fixtures
+   class TestCase < Test::Unit::TestCase
+     def run(result)
+       debug "Running #{self.class.name}##{method_name}" unless method_name.to_s=="default_test"
+       super
+     end
+
+     def default_test
+     end
+   end
+ end
data/test/fixtures.rb ADDED
@@ -0,0 +1,43 @@
+ module Fixtures
+   @@labeled_document = <<EOS
+ Title: <l:title>The test of the Century</l:title>
+ <l:content><b>Excerpt</b>: <i><l:excerpt>A look back at what could be considered the greatest ever test.</l:excerpt></i>
+ <l:body>There was once a test designed to assess whether apply_extraction_tree_on worked.</l:body></l:content>
+ EOS
+   @@labeled_document_structure = Ariel::StructureNode.new do |r|
+     r.item :title
+     r.item :content do |c|
+       c.item :excerpt
+       c.item :body
+     end
+   end
+   @@unlabeled_document=<<EOS
+ Title: The test of the Century
+ <b>Excerpt</b>: <i>A look back at what could be considered the greatest ever test.</i>
+ There was once a test designed to assess whether apply_extraction_tree_on worked.
+ EOS
+   # Document with nested labels with clashing names, i.e. a label at the top
+   # level as well as a label lower down in the tree that has the same label
+   # name.
+   @@labeled_document_with_list=<<EOS
+ Title: <l:title>Another example</l:title>
+ <l:body>I love to write examples, you love to read them, ruby loves to process them.
+ In conclusion, we're as happy as can be.</l:body>
+ <l:comment_list>Comments:
+ <l:comment>Title:<l:title>Great example</l:title>
+ <l:author>Adoring fan</l:author>
+ <l:body>Always love reading your examples, keep up the great work.</l:body>
+ </l:comment></l:comment_list>
+ EOS
+
+   @@labeled_addresses=Array.new(4) {Ariel::TokenStream.new}
+   @@labeled_addresses[0].tokenize("513 Pico <b>Venice</b>, Phone: 1-<b>800</b>-555-1515")
+   @@labeled_addresses[0].set_label_at 36
+   @@labeled_addresses[1].tokenize("90 Colfax, <b> Palms </b>, Phone: (818) 508-1570")
+   @@labeled_addresses[1].set_label_at 35
+   @@labeled_addresses[2].tokenize("523 1st St., <b> LA </b>, Phone: 1-<b>888</b>-578-2293")
+   @@labeled_addresses[2].set_label_at 38
+   @@labeled_addresses[3].tokenize("403 La Tijera, <b> Watts </b>, Phone: (310) 798-0008")
+   @@labeled_addresses[3].set_label_at 39
+
+ end
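
As a sanity check on the hard-coded offsets above (an illustration, not part of the fixtures): each set_label_at position is the string offset where the area code begins, which is the extraction target used by the learner tests.

    "513 Pico <b>Venice</b>, Phone: 1-<b>800</b>-555-1515".index("800")  # => 36
    "90 Colfax, <b> Palms </b>, Phone: (818) 508-1570".index("818")      # => 35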
data/test/specs/token_spec.rb ADDED
@@ -0,0 +1,65 @@
+ require 'ariel'
+
+ context "An average token" do
+   setup do
+     @token = Ariel::Token.new("Test", 0, 4)
+   end
+   specify "Should return the string it holds when text is called" do
+     @token.text.should_equal "Test"
+   end
+
+   specify "Should not be a label tag" do
+     @token.is_label_tag?.should_be false
+   end
+
+   specify "Should return true if the token string matches a given wildcard or equals a given string" do
+     @token.matches?("Test").should_be true
+     @token.matches?(:alpha_numeric).should_be true
+   end
+
+   specify "Should return false if the token string doesn't match the given wildcard or string" do
+     @token.matches?("Tes").should_be false
+     @token.matches?(:html_tag).should_be false
+   end
+
+   specify "Should raise an error if an invalid wildcard is given" do
+     lambda {@token.matches? :not_a_wildcard}.should_raise ArgumentError
+   end
+
+   specify "Should be able to list all wildcard symbols that match its text" do
+     @token.matching_wildcards.should_be_an_instance_of Array
+     @token.matching_wildcards.each {|wildcard| wildcard.should_be_an_instance_of Symbol}
+   end
+ end
+
+ context "Comparing two Tokens" do
+   setup do
+     @token1 = Ariel::Token.new("Alice", 0, 5)
+     @token2 = Ariel::Token.new("Bob", 5, 8)
+     @token1_clone = Ariel::Token.new("Alice", 0, 5)
+     @token1_almost_clone = Ariel::Token.new("Alice", 0, 4)
+   end
+
+   specify "Should be equal if and only if text, start location and end location are equal" do
+     @token1.should_equal @token1_clone
+     @token1.should_not_equal @token2
+     @token1.should_not_equal @token1_almost_clone
+   end
+
+   specify "Should define a way of comparing itself to other tokens" do
+     @token1.should_respond_to :<=>
+   end
+
+   specify "Should make comparisons based on the start location of the token" do
+     (@token1<=>@token1_almost_clone).should_equal 0
+     (@token1<=>@token2).should_equal -1
+   end
+ end
+
+ context "Initializing a label tag token" do
+   specify "Should be marked as a label tag if passed true as the final argument to Token#new" do
+     Ariel::Token.new("Test", 0, 4, true).is_label_tag?.should_be true
+   end
+ end
+
+
data/test/specs/token_stream_spec.rb ADDED
@@ -0,0 +1,43 @@
+ require 'ariel'
+ require 'fixtures'
+ include Fixtures
+
+ context "A new TokenStream" do
+   setup do
+     @tokenstream = Ariel::TokenStream.new
+   end
+
+   specify "Should return 0 when cur_pos is called" do
+     @tokenstream.cur_pos.should_equal 0
+   end
+
+   specify "Should return an empty Array when tokens is called" do
+     @tokenstream.tokens.should_be_a_kind_of Array
+     @tokenstream.tokens.should_be_empty
+   end
+
+   specify "Should not contain any tokens" do
+     @tokenstream.tokens.size.should_equal 0
+   end
+
+   specify "Should return an empty string when sent the message raw_text" do
+     @tokenstream.raw_text.should_equal ""
+   end
+
+   specify "Should return nil when asked to advance" do
+     @tokenstream.advance.should_be_nil
+   end
+
+   specify "cur_pos should increase to 1 when asked to advance and no further" do
+     @tokenstream.advance
+     @tokenstream.cur_pos.should_equal 1
+     @tokenstream.advance
+     @tokenstream.cur_pos.should_equal 1
+   end
+
+   specify "Should not be reversed" do
+     @tokenstream.should_not_be_reversed
+   end
+ end
+
+
data/test/specs/wildcards_spec.rb ADDED
@@ -0,0 +1,26 @@
+ require 'ariel'
+
+ context "When querying the Wildcards class" do
+
+   specify "Should not be possible to create a Wildcards instance" do
+     lambda {Ariel::Wildcards.new}.should_raise
+   end
+
+   specify "Should return a hash of Symbol to Regexp pairs when sent the list message" do
+     wildcards=Ariel::Wildcards.list
+     wildcards.should_be_a_kind_of Hash
+     wildcards.keys.each {|key| key.should_be_a_kind_of Symbol}
+     wildcards.values.each {|value| value.should_be_a_kind_of Regexp}
+   end
+
+   specify "When Wildcards.matching is called with a String, should return an array of the symbols of all matching wildcards" do
+     Ariel::Wildcards.matching("Test").should_be_a_kind_of Array
+     Ariel::Wildcards.matching("<a>").should_include :html_tag
+   end
+
+   specify "Should yield a symbol for every wildcard the string matches when Wildcards.matching is called" do
+     list=[]
+     Ariel::Wildcards.matching("<a>") {|wildcard| list << wildcard}
+     list.should_not_be_empty
+   end
+ end
data/test/test_candidate_selector.rb ADDED
@@ -0,0 +1,58 @@
+ require 'ariel'
+ require 'ariel_test_case'
+
+
+ class TestCandidateSelector < Ariel::TestCase
+   include Fixtures
+   def setup
+     # Must get rid of this repetition, should be available to all tests
+     @e=@@labeled_addresses
+     @candidates=[]
+     @candidates << Ariel::Rule.new(:forward, [[:anything]])
+     @candidates << Ariel::Rule.new(:forward, [[:numeric], [:numeric], [:numeric]])
+     @candidates << Ariel::Rule.new(:forward, [["("]])
+     @candidates << Ariel::Rule.new(:forward, [[:numeric, :alpha_numeric]])
+     @selector=Ariel::CandidateSelector.new(@candidates, @e)
+   end
+
+   def test_score_by
+     score_hash = @selector.score_by {|rule| rule.landmarks.size}
+     assert_equal @candidates.size, score_hash.size
+     assert_equal 1, score_hash.values.sort.first
+   end
+
+   def test_highest_scoring_by
+     t1 = @selector.highest_scoring_by {|rule| 1}
+     assert (t1.all? {|rule| rule.kind_of? Ariel::Rule})
+     assert_equal @candidates.size, t1.size
+     t2 = @selector.highest_scoring_by {|rule| rule.landmarks.size}
+     assert_equal 1, t2.size
+   end
+
+   def test_select_best_by_match_type
+     @selector.select_best_by_match_type :fail, :early, :late, :perfect
+     assert_equal @candidates, @selector.candidates
+     @selector.select_best_by_match_type :late
+     assert_equal 1, @selector.candidates.size
+     assert_equal @candidates[1], @selector.candidates[0]
+   end
+
+   def test_select_with_fewer_wildcards
+     assert_equal @selector.select_with_fewer_wildcards[0], @candidates[2]
+     assert_equal 1, @selector.candidates.size
+   end
+
+   def test_select_closest_to_label
+     assert_equal @candidates[2], @selector.select_closest_to_label[0]
+     assert_equal 1, @selector.candidates.size
+   end
+
+   def test_select_with_longer_landmarks
+     assert_equal @candidates[3], @selector.select_with_longer_end_landmarks[0]
+     assert_equal 1, @selector.candidates.size
+   end
+
+   def test_random_from_remaining
+     assert(@candidates.include?(@selector.random_from_remaining))
+   end
+ end
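
The selection methods exercised above act as successive tie-breakers over a shrinking candidate pool. A hypothetical driver (illustrative only, reusing the shapes from this test's setup; the selector's internals are not shown in this diff) might chain them like this:

    require 'ariel'

    stream = Ariel::TokenStream.new
    stream.tokenize("Phone: (818) 508-1570")
    stream.set_label_at 8  # "818" starts at string offset 8

    rules = [Ariel::Rule.new(:forward, [[:anything]]),
             Ariel::Rule.new(:forward, [["("]])]
    selector = Ariel::CandidateSelector.new(rules, [stream])
    selector.select_with_fewer_wildcards   # keeps the wildcard-free [["("]] rule
    best = selector.random_from_remaining  # random pick among whatever remains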
data/test/test_example_document_loader.rb ADDED
@@ -0,0 +1,7 @@
+ require 'ariel'
+ require 'ariel_test_case'
+
+ class TestExampleDocumentLoader < Ariel::TestCase
+   include Fixtures
+
+ end
data/test/test_label_utils.rb ADDED
@@ -0,0 +1,15 @@
+ require 'ariel'
+ require 'ariel_test_case'
+
+ class TestLabelUtils < Ariel::TestCase
+   include Fixtures
+
+   def test_label_regex
+     assert_equal 2, Ariel::LabelUtils.label_regex.uniq.size
+     assert_kind_of Regexp, Ariel::LabelUtils.label_regex[0]
+   end
+
+   def test_clean_string
+     assert_equal @@unlabeled_document, Ariel::LabelUtils.clean_string(@@labeled_document)
+   end
+ end
data/test/test_learner.rb ADDED
@@ -0,0 +1,38 @@
+ require 'ariel'
+ require 'ariel_test_case'
+
+ class TestLearner < Ariel::TestCase
+   include Fixtures
+
+   def setup
+     # Examples stolen from the STALKER paper. The target to extract is the
+     # area code.
+     @e=@@labeled_addresses
+     @learner=Ariel::Learner.new(*@e)
+   end
+
+   def test_set_seed
+     assert_equal @e[1], @learner.current_seed # TokenStream with the smallest label_index
+   end
+
+   def test_generate_initial_candidates
+     @learner.direction=:forward
+     @learner.generate_initial_candidates
+     c=@learner.candidates
+     assert (c.include? Ariel::Rule.new(:forward, [["("]]))
+     assert (c.include? Ariel::Rule.new(:forward, [[:anything]]))
+     assert (c.include? Ariel::Rule.new(:forward, [[:punctuation]]))
+   end
+
+   def test_refine
+     @learner.current_rule=Ariel::Rule.new(:forward, [["<b>"]])
+     assert @learner.refine
+     @learner.current_rule=Ariel::Rule.new(:forward, [["<b>", "Palms"], ["Phone"]])
+     assert @learner.refine
+   end
+
+   def test_learn_rule
+     rule=@learner.learn_rule :forward
+     p rule
+   end
+ end
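
Finally, a self-contained sketch of the entry point these tests drive (illustrative; with this few examples the learned rule also depends on CandidateSelector's random tie-breaking):

    require 'ariel'

    streams = Array.new(2) {Ariel::TokenStream.new}
    streams[0].tokenize("Phone: 1-<b>800</b>-555-1515")
    streams[0].set_label_at 12  # start of "800"
    streams[1].tokenize("Phone: (818) 508-1570")
    streams[1].set_label_at 8   # start of "818"

    learner = Ariel::Learner.new(*streams)
    rule = learner.learn_rule :forward  # a Rule whose landmarks lead up to the label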