ariel 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +21 -0
- data/README +98 -0
- data/bin/ariel +56 -0
- data/examples/google_calculator/labeled/1 +43 -0
- data/examples/google_calculator/labeled/2 +41 -0
- data/examples/google_calculator/labeled/3 +41 -0
- data/examples/google_calculator/structure.rb +12 -0
- data/examples/google_calculator/structure.yaml +46 -0
- data/examples/google_calculator/unlabeled/1 +43 -0
- data/examples/google_calculator/unlabeled/2 +43 -0
- data/examples/raa/labeled/highline.html +135 -0
- data/examples/raa/labeled/mongrel.html +168 -0
- data/examples/raa/structure.rb +17 -0
- data/examples/raa/structure.yaml +183 -0
- data/examples/raa/unlabeled/pdf-writer.html +175 -0
- data/lib/ariel/candidate_selector.rb +94 -0
- data/lib/ariel/example_document_loader.rb +59 -0
- data/lib/ariel/extracted_node.rb +20 -0
- data/lib/ariel/label_utils.rb +71 -0
- data/lib/ariel/learner.rb +237 -0
- data/lib/ariel/node_like.rb +26 -0
- data/lib/ariel/rule.rb +112 -0
- data/lib/ariel/rule_set.rb +34 -0
- data/lib/ariel/structure_node.rb +75 -0
- data/lib/ariel/token.rb +68 -0
- data/lib/ariel/token_stream.rb +240 -0
- data/lib/ariel/wildcards.rb +33 -0
- data/lib/ariel.rb +69 -0
- data/test/ariel_test_case.rb +15 -0
- data/test/fixtures.rb +43 -0
- data/test/specs/token_spec.rb +65 -0
- data/test/specs/token_stream_spec.rb +43 -0
- data/test/specs/wildcards_spec.rb +26 -0
- data/test/test_candidate_selector.rb +58 -0
- data/test/test_example_document_loader.rb +7 -0
- data/test/test_label_utils.rb +15 -0
- data/test/test_learner.rb +38 -0
- data/test/test_rule.rb +38 -0
- data/test/test_structure_node.rb +81 -0
- data/test/test_token.rb +16 -0
- data/test/test_token_stream.rb +82 -0
- data/test/test_wildcards.rb +18 -0
- metadata +103 -0
@@ -0,0 +1,240 @@
|
|
1
|
+
module Ariel

  require 'enumerator'

  # A TokenStream stores the Tokens produced by running its tokenization
  # regular expressions over a string. It keeps a current position
  # (TokenStream#cur_pos) that moves forward every time a token is consumed;
  # this includes every Enumerable method, because TokenStream#each is built
  # on TokenStream#advance. It also offers StringScanner-like matching over
  # tokens (TokenStream#skip_to), lets one token be marked as the start of a
  # label (TokenStream#set_label_at), and records whether the token order is
  # currently reversed (TokenStream#reversed?) so that text can still be
  # recovered from the original string.
  class TokenStream
    include Enumerable
    attr_accessor :tokens, :cur_pos, :label_index, :original_text

    # Creates an empty, unreversed stream positioned at 0.
    def initialize
      @tokens = []
      @cur_pos = 0
      @original_text = ""
      # Applied in this order by #tokenize; earlier expressions get first pick.
      @token_regexen = [
        Wildcards.list[:html_tag], # Match html tags that don't have attributes
        /\d+/,                     # Match any numbers, probably good to make a split
        /\b\w+\b/,                 # Pick up words, will split at punctuation
        /\S/                       # Grab any characters left over that aren't whitespace
      ]
      @label_tag_regexen = [LabelUtils.any_label_regex]
      @reversed = false
    end

    # Splits +input+ at every match of each regular expression in turn. Every
    # match becomes a Token carrying its offsets in the original text; the
    # text between matches is handed, with its offset, to the next expression.
    # Characters matched by none of the expressions are discarded. When
    # +contains_labels+ is true the label tag expressions run first and the
    # Tokens they create are flagged as label tags.
    #
    # Tokens left over from an earlier call are dropped first, so the stored
    # offsets always refer to the current #original_text.
    #
    # Returns the number of tokens stored.
    def tokenize(input, contains_labels=false)
      @tokens = []
      string_array = [[input, 0]]
      @original_text = input
      @original_text_contains_labels = contains_labels
      if contains_labels
        @label_tag_regexen.each { |regex| split_string_array_by_regex(string_array, regex, false) }
      end
      @token_regexen.each { |regex| split_string_array_by_regex(string_array, regex) }
      @tokens.sort!
      @tokens.size
    end

    # Deletes every stored Token for which Token#is_label_tag? is true and
    # returns the remaining token array.
    def remove_label_tags
      @tokens.delete_if { |token| token.is_label_tag? }
    end

    # Returns a slice of this stream running from the token whose start_loc
    # equals +left+ to the token whose end_loc equals +right+. If several
    # tokens qualify, the last one in the stream is used for each bound.
    # Raises ArgumentError when either bound matches no token.
    def slice_by_string_pos(left, right)
      l_index = @tokens.rindex { |token| token.start_loc == left }
      r_index = @tokens.rindex { |token| token.end_loc == right }
      if l_index.nil? || r_index.nil?
        raise ArgumentError, "Cannot slice between those locations"
      end
      slice_by_token_index(l_index, r_index)
    end

    # Returns a shallow copy of this stream holding only the tokens from
    # +l_index+ to +r_index+ inclusive. The receiver is left untouched.
    def slice_by_token_index(l_index, r_index)
      sliced = self.dup
      sliced.tokens = @tokens[l_index..r_index]
      sliced
    end

    # Returns a fully independent copy (Marshal round trip) so that in-place
    # operations such as @tokens.reverse! on one instance cannot affect another.
    def deep_clone
      Marshal.load(Marshal.dump(self))
    end

    # Marks as labeled the token whose start_loc equals +pos+ (an offset into
    # the original text) and returns that token's index. Raises ArgumentError
    # if no token starts at +pos+.
    def set_label_at(pos)
      token_pos = @tokens.rindex { |token| token.start_loc == pos }
      if token_pos.nil?
        raise ArgumentError, "Given string position does not match the start of any token"
      end
      @label_index = token_pos
      debug "Token ##{label_index} - \"#{@tokens[label_index].text}\" labeled."
      @label_index
    end

    # Returns the text spanned by the tokens from +l_index+ to +r_index+,
    # passed through LabelUtils.clean_string when the stream was tokenized
    # with contains_labels set. See also TokenStream#raw_text.
    def text(l_index=0, r_index=-1)
      out = raw_text(l_index, r_index)
      if @original_text_contains_labels
        LabelUtils.clean_string(out)
      else
        out
      end
    end

    # Returns the stretch of the original text from the start of the token at
    # +l_index+ to the end of the token at +r_index+, without stripping label
    # tags. An empty stream yields "". Do not expect the raw text back once
    # label tags have been filtered out (TokenStream#remove_label_tags).
    def raw_text(l_index=0, r_index=-1)
      return "" if @tokens.size == 0
      # In a reversed stream the later token sits at the lower index.
      l_index, r_index = r_index, l_index if reversed?
      @original_text[@tokens[l_index].start_loc...@tokens[r_index].end_loc]
    end

    # Returns the current Token and consumes it. Returns nil once the stream
    # is exhausted; cur_pos stops growing at one past the number of tokens.
    def advance
      return nil if @cur_pos > @tokens.size
      @cur_pos += 1
      @tokens[@cur_pos - 1]
    end

    # Returns to the beginning of the stream. Returns self.
    def rewind
      @cur_pos = 0
      self
    end

    # Returns a reversed deep copy of this stream; see TokenStream#reverse!.
    def reverse
      deep_clone.reverse!
    end

    # Converts +pos+ so that it points to the same token once the stream is
    # reversed. The result is invalid when the stream holds no tokens.
    def reverse_pos(pos)
      @tokens.size - (pos + 1)
    end

    # Same as TokenStream#reverse, but changes are made in place: the token
    # order is reversed, label_index (if set) and cur_pos are converted with
    # TokenStream#reverse_pos, and the reversed flag is toggled. Returns self.
    def reverse!
      @tokens.reverse!
      @label_index = reverse_pos(@label_index) if @label_index
      @cur_pos = reverse_pos(@cur_pos)
      @reversed = !@reversed
      self
    end

    # Returns true or false depending on whether the stream is in a reversed
    # state.
    def reversed?
      @reversed
    end

    # Takes Strings and Symbols (text to match exactly and Wildcard names),
    # each of which is handed to Token#matches?. The match succeeds when all
    # features match a consecutive run of tokens at or after the current
    # position. On success every token up to and including the match has
    # been consumed and the new cur_pos is returned. On failure cur_pos is
    # restored and nil is returned.
    def skip_to(*features)
      original_pos = @cur_pos
      each_cons(features.size) do |window|
        return @cur_pos if window.zip(features).all? { |token, feature| token.matches?(feature) }
      end
      @cur_pos = original_pos # No match, return TokenStream to original state
      nil
    end

    # Yields and consumes every Token from cur_pos onwards.
    def each
      token = advance
      while token
        yield token
        token = advance
      end
    end

    # Returns the Token at cur_pos without consuming it.
    def current_token
      @tokens[@cur_pos]
    end

    private

    # Runs split_by_regex over every [string, offset] pair in +string_array+
    # and replaces the array's contents, in place, with the resulting pairs.
    def split_string_array_by_regex(string_array, regex, add_matches=true)
      pieces = string_array.flat_map do |string, offset|
        split_by_regex(string, offset, regex, add_matches)
      end
      string_array.replace(pieces)
    end

    # Stores a Token for every match of +regex+ in +string+ (offsets shifted
    # by +offset+; flagged as a label tag when +add_matches+ is false) and
    # returns the non-empty gaps between matches as [string, offset] pairs.
    def split_by_regex(string, offset, regex, add_matches=true)
      split_points = [0]
      string.scan(regex) do
        match = Regexp.last_match
        first, last = match.offset(0)
        split_points.push(first, last)
        @tokens << Token.new(match[0], first + offset, last + offset, !add_matches)
      end
      split_points << string.size
      # split_points now alternates gap start / gap end.
      gaps = split_points.each_slice(2).map { |s_pos, e_pos| [string[s_pos...e_pos], s_pos + offset] }
      gaps.reject { |split_string, _| split_string.empty? }
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Ariel
  # Registry of the wildcards available to rule generation. The class is
  # never instantiated; everything is reached through class methods.
  class Wildcards
    private_class_method :new

    @@list = {
      :anything      => /.+/,
      :numeric       => /\d+/,
      :alpha_numeric => /\w+/,
      :alpha         => /[[:alpha:]]+/,
      :capitalized   => /[[:upper:]]+\w+/,
      :all_caps      => /[[:upper:]]+/,
      :html_tag      => /<\/?\w+>|<\w+\s+\/>/,
      :punctuation   => /[[:punct:]]+/
    }

    # Returns the hash mapping each wildcard name (Symbol) to its Regexp.
    def self.list
      @@list
    end

    # Returns the names of all wildcards whose first match in +string+ is the
    # whole string, in the order they appear in Wildcards.list. When a block
    # is given, each matching name is also yielded as it is found.
    def self.matching(string)
      @@list.each_with_object([]) do |(name, regex), found|
        next unless string[regex] == string
        yield name if block_given?
        found << name
      end
    end
  end
end
|
data/lib/ariel.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'ariel/token'
|
2
|
+
require 'ariel/token_stream'
|
3
|
+
require 'ariel/learner'
|
4
|
+
require 'ariel/node_like'
|
5
|
+
require 'ariel/extracted_node'
|
6
|
+
require 'ariel/structure_node'
|
7
|
+
require 'ariel/rule'
|
8
|
+
require 'ariel/wildcards'
|
9
|
+
require 'ariel/candidate_selector'
|
10
|
+
require 'ariel/label_utils'
|
11
|
+
require 'ariel/example_document_loader'
|
12
|
+
require 'ariel/rule_set'
|
13
|
+
|
14
|
+
# Defines the global debug helper once, at load time: it prints its argument
# with Kernel#p when $DEBUG is set and is a no-op otherwise.
if $DEBUG
  # Disabled file-based logging, kept for reference:
  # require 'logger'
  #
  # DEBUGLOG = Logger.new(File.open('debug.log', 'wb'))
  # DEBUGLOG.datetime_format = " \010"
  # DEBUGLOG.progname = "\010\010\010"

  def debug(message)
    p(message)
    #DEBUGLOG.debug message
  end
else
  def debug(message); end
end
|
29
|
+
|
30
|
+
# = Ariel - A Ruby Information Extraction Library
|
31
|
+
# Ariel intends to assist in extracting information from semi-structured
|
32
|
+
# documents including (but not in any way limited to) web pages. Although you
|
33
|
+
# may use libraries such as Hpricot or Rubyful Soup, or even plain Regular
|
34
|
+
# Expressions to achieve the same goal, Ariel approaches the problem very
|
35
|
+
# differently. Ariel relies on the user labeling examples of the data they
|
36
|
+
# want to extract, and then finds patterns across several such labeled
|
37
|
+
# examples in order to produce a set of general rules for extracting this
|
38
|
+
# information from any similar document.
|
39
|
+
#
|
40
|
+
# When working with Ariel, your workflow might look something like this:
|
41
|
+
# 1. Define a structure for the data you wish to extract. For example:
|
42
|
+
#
|
43
|
+
# @structure = Ariel::StructureNode.new do |r|
|
44
|
+
# r.article do |a|
|
45
|
+
# a.title
|
46
|
+
# a.author
|
47
|
+
# a.date
|
48
|
+
# a.body
|
49
|
+
# end
|
50
|
+
# r.comment_list do |c|
|
51
|
+
# c.author
|
52
|
+
# c.date
|
53
|
+
# c.body
|
54
|
+
# end
|
55
|
+
# end
|
56
|
+
# 2. Label these fields in a few example documents (normally at least 3).
|
57
|
+
# Labels are in the form of <tt><l:label_name>...</l:label_name></tt>
|
58
|
+
# 3. Ariel will read these examples, and try to generate suitable rules that can
|
59
|
+
# be used to extract this data from other similarly structured documents.
|
60
|
+
# 4. A wrapper has been generated - we can now happily load documents with the
|
61
|
+
# same structure (normally documents generated by the same rules, so
|
62
|
+
# different pages from a single site perhaps) and query the extracted data.
|
63
|
+
module Ariel
  # Intentionally empty here: this file only declares the namespace.
end
|
68
|
+
|
69
|
+
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'fixtures'
|
3
|
+
|
4
|
+
module Ariel
  include Fixtures

  # Base class for the Ariel unit tests. Before delegating to Test::Unit it
  # reports the test about to run through the global debug helper.
  class TestCase < Test::Unit::TestCase
    # Logs "Running Class#method" (except for default_test) and then runs the
    # test as Test::Unit normally would.
    def run(result)
      if method_name.to_s != "default_test"
        debug("Running #{self.class.name}##{method_name}")
      end
      super
    end

    # Empty on purpose.
    # NOTE(review): presumably overrides Test::Unit's default_test so this
    # test-less base class does not register a failure - confirm.
    def default_test; end
  end
end
|
data/test/fixtures.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# Shared test data, exposed as class variables to whatever includes this
# module.
module Fixtures
  # A document with title, content, excerpt and body label tags.
  @@labeled_document = <<LABELED
Title: <l:title>The test of the Century</l:title>
<l:content><b>Excerpt</b>: <i><l:excerpt>A look back at what could be considered the greatest ever test.</l:excerpt></i>
<l:body>There was once a test designed to assess whether apply_extraction_tree_on worked.</l:body></l:content>
LABELED

  # Structure describing the labels used in @@labeled_document.
  @@labeled_document_structure = Ariel::StructureNode.new do |root|
    root.item(:title)
    root.item(:content) do |content|
      content.item(:excerpt)
      content.item(:body)
    end
  end

  # The same document with the label tags removed.
  @@unlabeled_document = <<UNLABELED
Title: The test of the Century
<b>Excerpt</b>: <i>A look back at what could be considered the greatest ever test.</i>
There was once a test designed to assess whether apply_extraction_tree_on worked.
UNLABELED

  # Document with nested labels with clashing names. i.e. a label at the top
  # level as well as a label lower down in the tree that has the same label
  # name.
  @@labeled_document_with_list = <<WITH_LIST
Title: <l:title>Another example</l:title>
<l:body>I love to write examples, you love to read them, ruby loves to process them.
In conclusion, we're has happy as can be.</l:body>
<l:comment_list>Comments:
<l:comment>Title:<l:title>Great example</l:title>
<l:author>Adoring fan</l:author>
<l:body>Always love reading your examples, keep up the great work.</l:body>
</l:comment></l:comment_list>
WITH_LIST

  # Four tokenized address strings, each labeled at the given character
  # offset (the start of the three-digit area code).
  @@labeled_addresses = [
    ["513 Pico <b>Venice</b>, Phone: 1-<b>800</b>-555-1515", 36],
    ["90 Colfax, <b> Palms </b>, Phone: (818) 508-1570", 35],
    ["523 1st St., <b> LA </b>, Phone: 1-<b>888</b>-578-2293", 38],
    ["403 La Tijera, <b> Watts </b>, Phone: (310) 798-0008", 39]
  ].map do |address, label_offset|
    stream = Ariel::TokenStream.new
    stream.tokenize(address)
    stream.set_label_at(label_offset)
    stream
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
|
3
|
+
# Specs for a plain Token built from the text "Test" at offsets 0...4.
context "An average token" do
  setup { @token = Ariel::Token.new("Test", 0, 4) }

  specify "Should return the string it holds when text is called" do
    @token.text.should_equal("Test")
  end

  specify "Should not be a label tag" do
    @token.is_label_tag?.should_be(false)
  end

  specify "Should return true if if the token string matches a given wildcard or equals a given string" do
    @token.matches?("Test").should_be(true)
    @token.matches?(:alpha_numeric).should_be(true)
  end

  specify "Should return false if the token string doesn't match the given wildcard or string" do
    @token.matches?("Tes").should_be(false)
    @token.matches?(:html_tag).should_be(false)
  end

  specify "Should raise an error if an invalid wildcard is given" do
    lambda { @token.matches?(:not_a_wildcard) }.should_raise(ArgumentError)
  end

  specify "Should be able to list all wildcard symbols that match its text" do
    @token.matching_wildcards.should_be_an_instance_of(Array)
    @token.matching_wildcards.each do |wildcard|
      wildcard.should_be_an_instance_of(Symbol)
    end
  end
end

# Specs for Token equality and ordering.
context "Comparing two Tokens" do
  setup do
    @token1              = Ariel::Token.new("Alice", 0, 5)
    @token2              = Ariel::Token.new("Bob", 5, 8)
    @token1_clone        = Ariel::Token.new("Alice", 0, 5)
    @token1_almost_clone = Ariel::Token.new("Alice", 0, 4)
  end

  specify "Should be equal if and only if text, start location and end location are equal" do
    @token1.should_equal(@token1_clone)
    @token1.should_not_equal(@token2)
    @token1.should_not_equal(@token1_almost_clone)
  end

  specify "Should define a way of comparing itself to other tokens" do
    @token1.should_respond_to(:<=>)
  end

  specify "Should make comparisons based on the start location of the token" do
    (@token1 <=> @token1_almost_clone).should_equal(0)
    (@token1 <=> @token2).should_equal(-1)
  end
end

# Spec for the label-tag flag passed as the fourth argument to Token.new.
context "Initializing a label tag token" do
  specify "Should be ignored if passed true as the final argument to Token#new" do
    Ariel::Token.new("Test", 0, 4, true).is_label_tag?.should_be(true)
  end
end
|
64
|
+
|
65
|
+
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'fixtures'
|
3
|
+
include Fixtures
|
4
|
+
|
5
|
+
# Specs for a freshly constructed TokenStream that has tokenized nothing.
context "A new TokenStream" do
  setup { @tokenstream = Ariel::TokenStream.new }

  specify "Should return 0 when cur_pos is called" do
    @tokenstream.cur_pos.should_equal(0)
  end

  specify "Should return an empty Array when tokens is called" do
    @tokenstream.tokens.should_be_a_kind_of(Array)
    @tokenstream.tokens.should_be_empty
  end

  specify "Should not contain any tokens" do
    @tokenstream.tokens.size.should_equal(0)
  end

  specify "Should return an empty string went sent the message raw_text" do
    @tokenstream.raw_text.should_equal("")
  end

  specify "Should return nil when asked to advance" do
    @tokenstream.advance.should_be_nil
  end

  # The position moves one past the (empty) token list and then stays put.
  specify "cur_pos should increase to 1 when asked to advance and no further" do
    2.times do
      @tokenstream.advance
      @tokenstream.cur_pos.should_equal(1)
    end
  end

  specify "Should not be reversed" do
    @tokenstream.should_not_be_reversed
  end
end
|
42
|
+
|
43
|
+
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
|
3
|
+
# Specs for the class-level interface of Ariel::Wildcards.
context "When querying the Wildcards class" do

  specify "Should not be possible to create a Wildcards instance" do
    lambda { Ariel::Wildcards.new }.should_raise
  end

  specify "Should return a hash of Symbol to Regexp pairs when sent the list message" do
    wildcards = Ariel::Wildcards.list
    wildcards.should_be_a_kind_of(Hash)
    wildcards.keys.each do |key|
      key.should_be_a_kind_of(Symbol)
    end
    wildcards.values.each do |value|
      value.should_be_a_kind_of(Regexp)
    end
  end

  specify "When Wildcards.matching is called with a String, should return an array of the symbols of all matching wildcards" do
    Ariel::Wildcards.matching("Test").should_be_a_kind_of(Array)
    Ariel::Wildcards.matching("<a>").should_include(:html_tag)
  end

  specify "Should yield a symbol for every wildcard the string matches when Wildcards.matching is called" do
    list = []
    Ariel::Wildcards.matching("<a>") do |wildcard|
      list << wildcard
    end
    list.should_not_be_empty
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'ariel_test_case'
|
3
|
+
|
4
|
+
|
5
|
+
# Exercises Ariel::CandidateSelector with four hand-built forward rules and
# the labeled address examples from Fixtures.
class TestCandidateSelector < Ariel::TestCase
  include Fixtures

  # Builds a fresh selector over the same four candidate rules for each test.
  def setup
    # Must get rid of this repetition, should be available to all tests
    @e = @@labeled_addresses
    @candidates = []
    @candidates << Ariel::Rule.new(:forward, [[:anything]])
    @candidates << Ariel::Rule.new(:forward, [[:numeric], [:numeric], [:numeric]])
    @candidates << Ariel::Rule.new(:forward, [["("]])
    @candidates << Ariel::Rule.new(:forward, [[:numeric, :alpha_numeric]])
    @selector = Ariel::CandidateSelector.new(@candidates, @e)
  end

  # score_by should produce one score per candidate; the lowest landmark
  # count among the candidates is 1.
  def test_score_by
    score_hash = @selector.score_by { |rule| rule.landmarks.size }
    assert_equal @candidates.size, score_hash.size
    assert_equal 1, score_hash.values.sort.first
  end

  # With a constant score every candidate ties for first place; scoring by
  # landmark count leaves a single winner.
  def test_highest_scoring_by
    t1 = @selector.highest_scoring_by { |rule| 1 }
    assert(t1.all? { |rule| rule.kind_of? Ariel::Rule })
    assert_equal @candidates.size, t1.size
    t2 = @selector.highest_scoring_by { |rule| rule.landmarks.size }
    assert_equal 1, t2.size
  end

  def test_select_best_by_match_type
    @selector.select_best_by_match_type :fail, :early, :late, :perfect
    assert_equal @candidates, @selector.candidates
    @selector.select_best_by_match_type :late
    assert_equal 1, @selector.candidates.size
    assert_equal @candidates[1], @selector.candidates[0]
  end

  def test_select_with_fewer_wildcards
    # Expected value first, as in the other assertions in this class.
    assert_equal @candidates[2], @selector.select_with_fewer_wildcards[0]
    assert_equal 1, @selector.candidates.size
  end

  def test_select_closest_to_label
    assert_equal @candidates[2], @selector.select_closest_to_label[0]
    assert_equal 1, @selector.candidates.size
  end

  # Covers CandidateSelector#select_with_longer_end_landmarks.
  def test_select_with_longer_landmarks
    assert_equal @candidates[3], @selector.select_with_longer_end_landmarks[0]
    assert_equal 1, @selector.candidates.size
  end

  def test_random_from_remaining
    assert(@candidates.include?(@selector.random_from_remaining))
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'ariel_test_case'
|
3
|
+
|
4
|
+
# Tests for the Ariel::LabelUtils helpers, using the Fixtures documents.
class TestLabelUtils < Ariel::TestCase
  include Fixtures

  # label_regex should return two distinct entries, the first a Regexp.
  def test_label_regex
    regexen = Ariel::LabelUtils.label_regex
    assert_equal(2, regexen.uniq.size)
    assert_kind_of(Regexp, Ariel::LabelUtils.label_regex[0])
  end

  # Cleaning the labeled fixture should give exactly the unlabeled fixture.
  def test_clean_string
    cleaned = Ariel::LabelUtils.clean_string(@@labeled_document)
    assert_equal(@@unlabeled_document, cleaned)
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'ariel_test_case'
|
3
|
+
|
4
|
+
# Tests for Ariel::Learner, driven by the labeled address examples in
# Fixtures.
class TestLearner < Ariel::TestCase
  include Fixtures

  def setup
    #Examples stolen from the STALKER paper. Target to extract is the area
    #codes.
    @e = @@labeled_addresses
    @learner = Ariel::Learner.new(*@e)
  end

  def test_set_seed
    # LabeledStream with smallest label_index
    assert_equal(@e[1], @learner.current_seed)
  end

  # The initial forward candidates should include a rule for each of these
  # single landmarks.
  def test_generate_initial_candidates
    @learner.direction = :forward
    @learner.generate_initial_candidates
    generated = @learner.candidates
    [["("], [:anything], [:punctuation]].each do |landmark|
      assert(generated.include?(Ariel::Rule.new(:forward, [landmark])))
    end
  end

  def test_refine
    @learner.current_rule = Ariel::Rule.new(:forward, [["<b>"]])
    assert(@learner.refine)
    @learner.current_rule = Ariel::Rule.new(:forward, [["<b>", "Palms"], ["Phone"]])
    assert(@learner.refine)
  end

  # Only prints the learned rule; makes no assertion about it.
  def test_learn_rule
    p(@learner.learn_rule(:forward))
  end
end
|