ariel 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +21 -0
- data/README +98 -0
- data/bin/ariel +56 -0
- data/examples/google_calculator/labeled/1 +43 -0
- data/examples/google_calculator/labeled/2 +41 -0
- data/examples/google_calculator/labeled/3 +41 -0
- data/examples/google_calculator/structure.rb +12 -0
- data/examples/google_calculator/structure.yaml +46 -0
- data/examples/google_calculator/unlabeled/1 +43 -0
- data/examples/google_calculator/unlabeled/2 +43 -0
- data/examples/raa/labeled/highline.html +135 -0
- data/examples/raa/labeled/mongrel.html +168 -0
- data/examples/raa/structure.rb +17 -0
- data/examples/raa/structure.yaml +183 -0
- data/examples/raa/unlabeled/pdf-writer.html +175 -0
- data/lib/ariel/candidate_selector.rb +94 -0
- data/lib/ariel/example_document_loader.rb +59 -0
- data/lib/ariel/extracted_node.rb +20 -0
- data/lib/ariel/label_utils.rb +71 -0
- data/lib/ariel/learner.rb +237 -0
- data/lib/ariel/node_like.rb +26 -0
- data/lib/ariel/rule.rb +112 -0
- data/lib/ariel/rule_set.rb +34 -0
- data/lib/ariel/structure_node.rb +75 -0
- data/lib/ariel/token.rb +68 -0
- data/lib/ariel/token_stream.rb +240 -0
- data/lib/ariel/wildcards.rb +33 -0
- data/lib/ariel.rb +69 -0
- data/test/ariel_test_case.rb +15 -0
- data/test/fixtures.rb +43 -0
- data/test/specs/token_spec.rb +65 -0
- data/test/specs/token_stream_spec.rb +43 -0
- data/test/specs/wildcards_spec.rb +26 -0
- data/test/test_candidate_selector.rb +58 -0
- data/test/test_example_document_loader.rb +7 -0
- data/test/test_label_utils.rb +15 -0
- data/test/test_learner.rb +38 -0
- data/test/test_rule.rb +38 -0
- data/test/test_structure_node.rb +81 -0
- data/test/test_token.rb +16 -0
- data/test/test_token_stream.rb +82 -0
- data/test/test_wildcards.rb +18 -0
- metadata +103 -0
@@ -0,0 +1,240 @@
|
|
1
|
+
module Ariel

  require 'enumerator'

  # A TokenStream instance stores a stream of Tokens once it has used its
  # tokenization rules to extract them from a string. A TokenStream knows its
  # current position (TokenStream#cur_pos), which is incremented when any of
  # the Enumerable methods are used (due to the redefinition of
  # TokenStream#each). As you advance through the stream, the current token is
  # always returned and then consumed. A TokenStream also provides methods for
  # finding patterns in a given stream much like StringScanner but for an
  # array of tokens. For rule generation, a certain token can be marked as
  # being the start point of a label. Finally, a TokenStream records whether
  # it is in a reversed or unreversed state so that when rules are applied,
  # they are always applied from the front or end of the stream as required.
  class TokenStream
    include Enumerable
    attr_accessor :tokens, :cur_pos, :label_index, :original_text

    def initialize()
      @tokens = []
      @cur_pos = 0
      @original_text = ""
      # Regexen are applied in order; earlier expressions claim text first and
      # later ones only see the gaps left between previous matches.
      @token_regexen = [
        Wildcards.list[:html_tag], # Match html tags that don't have attributes
        /\d+/,     # Match any numbers, probably good to make a split
        /\b\w+\b/, # Pick up words, will split at punctuation
        /\S/       # Grab any characters left over that aren't whitespace
      ]
      @label_tag_regexen = [LabelUtils.any_label_regex]
      @reversed = false
    end

    # The tokenizer operates on a string by splitting it at every point it
    # finds a match to a regular expression. Each match is added as a token,
    # and the strings between each match are stored along with their original
    # offsets. The same is then done with the next regular expression on each
    # of these split strings, so new tokens are created with the correct
    # offset in the original text. Any characters left unmatched by all of the
    # regular expressions in @token_regexen are discarded. If contains_labels
    # is true, any occurrences of label tags (as defined by the regexen in
    # LabelUtils) are removed first; those tokens are marked as label tags on
    # creation. Returns the number of tokens stored.
    def tokenize(input, contains_labels=false)
      string_array = [[input, 0]]
      @original_text = input
      @original_text_contains_labels = contains_labels
      @label_tag_regexen.each {|regex| split_string_array_by_regex(string_array, regex, false)} if contains_labels
      @token_regexen.each {|regex| split_string_array_by_regex(string_array, regex)}
      # Tokens were appended per-regex, so sort them back into text order.
      @tokens.sort!
      @tokens.size
    end

    # Goes through all stored Token instances, removing them if
    # Token#is_label_tag? Called after a labeled document has been extracted
    # to a tree ready for the rule learning process.
    def remove_label_tags
      @tokens.delete_if {|token| token.is_label_tag?}
    end

    # Returns the slice of the current instance containing all the tokens
    # between the token where start_loc == left and the token where
    # end_loc == right. Raises ArgumentError if either location does not
    # correspond to a token boundary.
    def slice_by_string_pos(left, right)
      l_index = nil
      r_index = nil
      @tokens.each_index {|i| l_index = i if @tokens[i].start_loc == left}
      @tokens.each_index {|i| r_index = i if @tokens[i].end_loc == right}
      if l_index.nil? || r_index.nil?
        raise ArgumentError, "Cannot slice between those locations"
      else
        return slice_by_token_index(l_index, r_index)
      end
    end

    # Slices tokens between the l_index and the r_index inclusive.
    # Returns a shallow copy of self holding only that token range.
    def slice_by_token_index(l_index, r_index)
      sliced = self.dup
      sliced.tokens = @tokens[l_index..r_index]
      return sliced
    end

    # Used to ensure operations such as @tokens.reverse! in one instance
    # won't inadvertently affect another.
    def deep_clone
      Marshal.load(Marshal.dump(self))
    end

    # Set a label at a given offset in the original text. Searches for a
    # token with a start_loc equal to the position passed as an argument, and
    # raises an ArgumentError if one is not found. Returns the label index.
    def set_label_at(pos)
      token_pos = nil
      @tokens.each_index {|i| token_pos = i if @tokens[i].start_loc == pos}
      if token_pos.nil?
        raise ArgumentError, "Given string position does not match the start of any token"
      else
        @label_index = token_pos
        debug "Token ##{label_index} - \"#{@tokens[label_index].text}\" labeled."
        return @label_index
      end
    end

    # Returns all text represented by the instance's stored tokens, stripping
    # any label tags if the stream was declared to contain them when it was
    # tokenized (this would only happen during the process of loading labeled
    # examples). See also TokenStream#raw_text.
    def text(l_index=0, r_index=-1)
      out = raw_text(l_index, r_index)
      if @original_text_contains_labels
        LabelUtils.clean_string(out)
      else
        out
      end
    end

    # Returns all text represented by the instance's stored tokens. It will
    # not strip label tags even if the stream is marked to contain them.
    # However, you should not expect to get the raw_text once any label_tags
    # have been filtered (TokenStream#remove_label_tags).
    def raw_text(l_index=0, r_index=-1)
      return "" if @tokens.empty?
      if reversed?
        # Token order is reversed but @original_text is not, so swap the
        # indices to keep the slice oriented with the underlying string.
        l_index, r_index = r_index, l_index
      end
      @original_text[@tokens[l_index].start_loc...@tokens[r_index].end_loc]
    end

    # Returns the current Token and consumes it, or nil once the stream is
    # exhausted. The position is advanced at most one step past the end.
    def advance
      return nil if @cur_pos > @tokens.size
      @cur_pos += 1
      # nil when we have just stepped past the final token.
      @tokens[@cur_pos - 1]
    end

    # Return to the beginning of the TokenStream. Returns self.
    def rewind
      @cur_pos = 0
      self
    end

    # Returns a copy of the current instance with a reversed set of tokens.
    # If it is set, the label_index is adjusted accordingly to point to the
    # correct token.
    def reverse
      self.deep_clone.reverse!
    end

    # Converts the given position so it points to the same token once the
    # stream is reversed. Result invalid for when @tokens.size==0.
    def reverse_pos(pos)
      @tokens.size - (pos + 1)
    end

    # Same as TokenStream#reverse, but changes are made in place.
    def reverse!
      @tokens.reverse!
      if label_index
        @label_index = reverse_pos(@label_index)
      end
      # NOTE(review): @cur_pos is a consumption counter rather than a token
      # index; mapping it through reverse_pos mirrors the original behavior —
      # confirm this is the intended semantics for partially-consumed streams.
      @cur_pos = reverse_pos(@cur_pos)
      @reversed = !@reversed
      return self
    end

    # Returns true or false depending on whether the given tokenstream is in
    # a reversed state.
    def reversed?
      @reversed
    end

    # Takes a list of Strings and Symbols as its arguments representing text
    # to be matched in individual tokens and Wildcards. For a match to be a
    # success, all wildcards and strings must match a consecutive sequence of
    # Tokens in the TokenStream. All matched Tokens are consumed, and the
    # TokenStream's current position is returned on success. On failure, the
    # TokenStream is returned to its original state and nil is returned.
    def skip_to(*features)
      original_pos = @cur_pos
      self.each_cons(features.size) do |tokens|
        i = 0
        return @cur_pos if tokens.all? {|token| i += 1; token.matches?(features[i - 1])}
      end
      @cur_pos = original_pos # No match, return TokenStream to original state
      return nil
    end

    # Iterates over and consumes every Token from the cur_pos.
    def each
      while (token = self.advance)
        yield token
      end
    end

    # Returns the current Token without consuming it.
    def current_token
      @tokens[@cur_pos]
    end

    private

    # Uses split_by_regex to split each member of a given array of string and
    # offset pairs into new arrays of string and offset pairs. Mutates
    # string_array in place.
    def split_string_array_by_regex(string_array, regex, add_matches=true)
      new_string_array = []
      string_array.each do |arr|
        result = split_by_regex(arr[0], arr[1], regex, add_matches)
        new_string_array.concat result
      end
      string_array.replace new_string_array
    end

    # For tokenization: appends a Token for every regex match (flagged as a
    # label tag when add_matches is false) and returns [string, offset] pairs
    # representing the unmatched gaps between matches.
    def split_by_regex(string, offset, regex, add_matches=true)
      split_points = [0]
      string_holder = []
      string.scan(regex) do |s|
        match = Regexp.last_match
        split_points << match.begin(0)
        split_points << match.end(0)
        @tokens << Token.new(match[0], match.begin(0) + offset, match.end(0) + offset, !add_matches)
      end
      split_points << string.size
      split_points.each_slice(2) do |s_pos, e_pos|
        split_string = string[s_pos...e_pos]
        string_holder << [split_string, s_pos + offset] unless split_string.empty?
      end
      return string_holder
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Ariel
  # Contains all wildcards to be used in rule generation.
  class Wildcards
    # Stateless lookup class; instances are never needed.
    private_class_method :new

    # Wildcard name (Symbol) => matching Regexp. Frozen so the shared table
    # returned by Wildcards.list cannot be mutated by callers.
    LIST = {
      :anything=>/.+/,
      :numeric=>/\d+/,
      :alpha_numeric=>/\w+/,
      :alpha=>/[[:alpha:]]+/,
      :capitalized=>/[[:upper:]]+\w+/,
      :all_caps=>/[[:upper:]]+/,
      :html_tag=>/<\/?\w+>|<\w+\s+\/>/, # tags without attributes only
      :punctuation=>/[[:punct:]]+/
    }.freeze

    # Returns the hash of wildcard name (symbol) and regular expression pairs.
    def self.list
      LIST
    end

    # Given a string, will return an array of symbols from Wildcards::list
    # that match it. Each matching symbol is also yielded as it is found when
    # a block is given.
    def self.matching(string)
      matches = []
      LIST.each do |name, regex|
        # string[regex]==string ensures the wildcard matches the whole
        # string, not merely a substring of it.
        if string[regex] == string
          yield name if block_given?
          matches << name
        end
      end
      matches
    end
  end
end
|
data/lib/ariel.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'ariel/token'
|
2
|
+
require 'ariel/token_stream'
|
3
|
+
require 'ariel/learner'
|
4
|
+
require 'ariel/node_like'
|
5
|
+
require 'ariel/extracted_node'
|
6
|
+
require 'ariel/structure_node'
|
7
|
+
require 'ariel/rule'
|
8
|
+
require 'ariel/wildcards'
|
9
|
+
require 'ariel/candidate_selector'
|
10
|
+
require 'ariel/label_utils'
|
11
|
+
require 'ariel/example_document_loader'
|
12
|
+
require 'ariel/rule_set'
|
13
|
+
|
14
|
+
if $DEBUG
  # require 'logger'

  # DEBUGLOG = Logger.new(File.open('debug.log', 'wb'))
  # DEBUGLOG.datetime_format = " \010"
  # DEBUGLOG.progname = "\010\010\010"

  # In debug mode ($DEBUG / ruby -d) every message is echoed to stdout via
  # Kernel#p. The commented Logger wiring above can be revived to send the
  # messages to debug.log instead.
  def debug(message)
    p message
    #DEBUGLOG.debug message
  end
else
  # Outside debug mode the logging hook is a no-op, so library code may call
  # debug() unconditionally without cost.
  def debug(message)
  end
end
|
29
|
+
|
30
|
+
# = Ariel - A Ruby Information Extraction Library
# Ariel assists in extracting information from semi-structured documents —
# including, but in no way limited to, web pages. Libraries such as Hpricot or
# Rubyful Soup (or plain regular expressions) can reach the same goal, but
# Ariel approaches the problem very differently: the user labels examples of
# the data they want, and Ariel finds patterns across several such labeled
# examples to produce a set of general rules for extracting that information
# from any similarly structured document.
#
# A typical Ariel workflow:
# 1. Define a structure for the data you wish to extract. For example:
#
#     @structure = Ariel::StructureNode.new do |r|
#       r.article do |a|
#         a.title
#         a.author
#         a.date
#         a.body
#       end
#       r.comment_list do |c|
#         c.author
#         c.date
#         c.body
#       end
#     end
# 2. Label these fields in a few example documents (normally at least 3).
#    Labels are in the form of <tt><l:label_name>...</l:label_name></tt>
# 3. Ariel reads these examples and tries to generate suitable rules that can
#    be used to extract this data from other similarly structured documents.
# 4. A wrapper has been generated - documents with the same structure
#    (normally pages generated by the same rules, so different pages from a
#    single site perhaps) can now be loaded and their extracted data queried.
module Ariel



end
|
68
|
+
|
69
|
+
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'fixtures'
|
3
|
+
|
4
|
+
module Ariel
  include Fixtures
  # Shared Test::Unit base class for the Ariel suite. Announces each test
  # method through the global debug hook before handing off to the framework.
  class TestCase < Test::Unit::TestCase
    def run(result)
      unless method_name.to_s == "default_test"
        debug "Running #{self.class.name}##{method_name}"
      end
      super
    end

    # Present so Test::Unit does not complain that this base class defines
    # no test methods of its own.
    def default_test
    end
  end
end
|
data/test/fixtures.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# Shared example documents and pre-labeled token streams used across the
# test suite.
module Fixtures
  # A labeled document with nested content labels.
  @@labeled_document = <<EOS
Title: <l:title>The test of the Century</l:title>
<l:content><b>Excerpt</b>: <i><l:excerpt>A look back at what could be considered the greatest ever test.</l:excerpt></i>
<l:body>There was once a test designed to assess whether apply_extraction_tree_on worked.</l:body></l:content>
EOS

  # Structure describing @@labeled_document above.
  @@labeled_document_structure = Ariel::StructureNode.new do |r|
    r.item :title
    r.item :content do |c|
      c.item :excerpt
      c.item :body
    end
  end

  # The same document as @@labeled_document with every label tag stripped.
  @@unlabeled_document = <<EOS
Title: The test of the Century
<b>Excerpt</b>: <i>A look back at what could be considered the greatest ever test.</i>
There was once a test designed to assess whether apply_extraction_tree_on worked.
EOS

  # Document with nested labels with clashing names, i.e. a label at the top
  # level as well as a label lower down in the tree that has the same label
  # name.
  @@labeled_document_with_list = <<EOS
Title: <l:title>Another example</l:title>
<l:body>I love to write examples, you love to read them, ruby loves to process them.
In conclusion, we're has happy as can be.</l:body>
<l:comment_list>Comments:
<l:comment>Title:<l:title>Great example</l:title>
<l:author>Adoring fan</l:author>
<l:body>Always love reading your examples, keep up the great work.</l:body>
</l:comment></l:comment_list>
EOS

  # Tokenized address examples, each labeled at the offset of its area code.
  address_data = [
    ["513 Pico <b>Venice</b>, Phone: 1-<b>800</b>-555-1515", 36],
    ["90 Colfax, <b> Palms </b>, Phone: (818) 508-1570", 35],
    ["523 1st St., <b> LA </b>, Phone: 1-<b>888</b>-578-2293", 38],
    ["403 La Tijera, <b> Watts </b>, Phone: (310) 798-0008", 39]
  ]
  @@labeled_addresses = address_data.map do |text, label_loc|
    stream = Ariel::TokenStream.new
    stream.tokenize(text)
    stream.set_label_at label_loc
    stream
  end

end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
|
3
|
+
context "An average token" do
  setup do
    @token = Ariel::Token.new("Test", 0, 4)
  end

  specify "Should return the string it holds when text is called" do
    @token.text.should_equal "Test"
  end

  specify "Should not be a label tag" do
    @token.is_label_tag?.should_be false
  end

  specify "Should return true if if the token string matches a given wildcard or equals a given string" do
    @token.matches?("Test").should_be true
    @token.matches?(:alpha_numeric).should_be true
  end

  specify "Should return false if the token string doesn't match the given wildcard or string" do
    @token.matches?("Tes").should_be false
    @token.matches?(:html_tag).should_be false
  end

  specify "Should raise an error if an invalid wildcard is given" do
    lambda {@token.matches? :not_a_wildcard}.should_raise ArgumentError
  end

  specify "Should be able to list all wildcard symbols that match its text" do
    @token.matching_wildcards.should_be_an_instance_of Array
    @token.matching_wildcards.each {|wildcard| wildcard.should_be_an_instance_of Symbol}
  end
end

context "Comparing two Tokens" do
  setup do
    @token1 = Ariel::Token.new("Alice", 0, 5)
    @token2 = Ariel::Token.new("Bob", 5, 8)
    @token1_clone = Ariel::Token.new("Alice", 0, 5)
    @token1_almost_clone = Ariel::Token.new("Alice", 0, 4)
  end

  specify "Should be equal if and only if text, start location and end location are equal" do
    @token1.should_equal @token1_clone
    @token1.should_not_equal @token2
    @token1.should_not_equal @token1_almost_clone
  end

  specify "Should define a way of comparing itself to other tokens" do
    @token1.should_respond_to :<=>
  end

  specify "Should make comparisons based on the start location of the token" do
    # Parenthesized to avoid Ruby's "ambiguous first argument" warning on
    # the bare negative literal.
    (@token1<=>@token1_almost_clone).should_equal(0)
    (@token1<=>@token2).should_equal(-1)
  end
end

context "Initializing a label tag token" do
  # Description fixed: the token is *marked* as a label tag, not ignored.
  specify "Should be marked as a label tag if passed true as the final argument to Token#new" do
    Ariel::Token.new("Test", 0, 4, true).is_label_tag?.should_be true
  end
end
|
64
|
+
|
65
|
+
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'fixtures'
|
3
|
+
include Fixtures
|
4
|
+
|
5
|
+
context "A new TokenStream" do
  setup do
    @stream = Ariel::TokenStream.new
  end

  specify "Should return 0 when cur_pos is called" do
    @stream.cur_pos.should_equal 0
  end

  specify "Should return an empty Array when tokens is called" do
    @stream.tokens.should_be_a_kind_of Array
    @stream.tokens.should_be_empty
  end

  specify "Should not contain any tokens" do
    @stream.tokens.size.should_equal 0
  end

  specify "Should return an empty string went sent the message raw_text" do
    @stream.raw_text.should_equal ""
  end

  specify "Should return nil when asked to advance" do
    @stream.advance.should_be_nil
  end

  specify "cur_pos should increase to 1 when asked to advance and no further" do
    @stream.advance
    @stream.cur_pos.should_equal 1
    @stream.advance
    @stream.cur_pos.should_equal 1
  end

  specify "Should not be reversed" do
    @stream.should_not_be_reversed
  end
end
|
42
|
+
|
43
|
+
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
|
3
|
+
context "When querying the Wildcards class" do

  specify "Should not be possible to create a Wildcards instance" do
    lambda {Ariel::Wildcards.new}.should_raise
  end

  specify "Should return a hash of Symbol to Regexp pairs when sent the list message" do
    list_hash = Ariel::Wildcards.list
    list_hash.should_be_a_kind_of Hash
    list_hash.keys.each {|key| key.should_be_a_kind_of Symbol}
    list_hash.values.each {|value| value.should_be_a_kind_of Regexp}
  end

  specify "When Wildcards.matching is called with a String, should return an array of the symbols of all matching wildcards" do
    Ariel::Wildcards.matching("Test").should_be_a_kind_of Array
    Ariel::Wildcards.matching("<a>").should_include :html_tag
  end

  specify "Should yield a symbol for every wildcard the string matches when Wildcards.matching is called" do
    yielded = []
    Ariel::Wildcards.matching("<a>") {|wildcard| yielded << wildcard}
    yielded.should_not_be_empty
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'ariel_test_case'
|
3
|
+
|
4
|
+
|
5
|
+
# Exercises CandidateSelector's scoring and filtering heuristics against a
# small set of hand-built candidate rules.
class TestCandidateSelector < Ariel::TestCase
  include Fixtures
  def setup
    # Must get rid of this repetition, should be available to all tests
    @examples = @@labeled_addresses
    @candidates = [
      Ariel::Rule.new(:forward, [[:anything]]),
      Ariel::Rule.new(:forward, [[:numeric], [:numeric], [:numeric]]),
      Ariel::Rule.new(:forward, [["("]]),
      Ariel::Rule.new(:forward, [[:numeric, :alpha_numeric]])
    ]
    @selector = Ariel::CandidateSelector.new(@candidates, @examples)
  end

  def test_score_by
    score_hash = @selector.score_by {|rule| rule.landmarks.size}
    assert_equal @candidates.size, score_hash.size
    assert_equal 1, score_hash.values.sort.first
  end

  def test_highest_scoring_by
    t1 = @selector.highest_scoring_by {|rule| 1}
    assert (t1.all? {|rule| rule.kind_of? Ariel::Rule})
    assert_equal @candidates.size, t1.size
    t2 = @selector.highest_scoring_by {|rule| rule.landmarks.size}
    assert_equal 1, t2.size
  end

  def test_select_best_by_match_type
    @selector.select_best_by_match_type :fail, :early, :late, :perfect
    assert_equal @candidates, @selector.candidates
    @selector.select_best_by_match_type :late
    assert_equal 1, @selector.candidates.size
    assert_equal @candidates[1], @selector.candidates[0]
  end

  def test_select_with_fewer_wildcards
    assert_equal @selector.select_with_fewer_wildcards[0], @candidates[2]
    assert_equal 1, @selector.candidates.size
  end

  def test_select_closest_to_label
    assert_equal @candidates[2], @selector.select_closest_to_label[0]
    assert_equal 1, @selector.candidates.size
  end

  def test_select_with_longer_landmarks
    assert_equal @candidates[3], @selector.select_with_longer_end_landmarks[0]
    assert_equal 1, @selector.candidates.size
  end

  def test_random_from_remaining
    assert(@candidates.include?(@selector.random_from_remaining))
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'ariel_test_case'
|
3
|
+
|
4
|
+
# Covers LabelUtils' label-tag regex generation and label stripping.
class TestLabelUtils < Ariel::TestCase
  include Fixtures

  def test_label_regex
    # Expect exactly two distinct patterns: one opening, one closing tag.
    assert_equal 2, Ariel::LabelUtils.label_regex.uniq.size
    assert_kind_of Regexp, Ariel::LabelUtils.label_regex[0]
  end

  def test_clean_string
    # Stripping labels from the labeled fixture must recreate its
    # unlabeled counterpart exactly.
    assert_equal @@unlabeled_document, Ariel::LabelUtils.clean_string(@@labeled_document)
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'ariel_test_case'
|
3
|
+
|
4
|
+
# Exercises the STALKER-style rule learner against the labeled address
# fixtures.
class TestLearner < Ariel::TestCase
  include Fixtures

  def setup
    # Examples stolen from the STALKER paper. Target to extract is the area
    # codes.
    @e = @@labeled_addresses
    @learner = Ariel::Learner.new(*@e)
  end

  def test_set_seed
    assert_equal @e[1], @learner.current_seed # LabeledStream with smallest label_index
  end

  def test_generate_initial_candidates
    @learner.direction = :forward
    @learner.generate_initial_candidates
    c = @learner.candidates
    assert (c.include? Ariel::Rule.new(:forward, [["("]]))
    assert (c.include? Ariel::Rule.new(:forward, [[:anything]]))
    assert (c.include? Ariel::Rule.new(:forward, [[:punctuation]]))
  end

  def test_refine
    @learner.current_rule = Ariel::Rule.new(:forward, [["<b>"]])
    assert @learner.refine
    @learner.current_rule = Ariel::Rule.new(:forward, [["<b>", "Palms"], ["Phone"]])
    assert @learner.refine
  end

  def test_learn_rule
    rule = @learner.learn_rule :forward
    # Previously this test only printed the rule with `p` and could never
    # fail; assert that learning actually produced a rule.
    assert_not_nil rule
  end
end
|