ariel 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +49 -83
- data/bin/ariel +29 -20
- data/examples/google_calculator/structure.rb +2 -2
- data/examples/google_calculator/structure.yaml +13 -15
- data/examples/raa/labeled/highline.html +5 -4
- data/examples/raa/labeled/mongrel.html +9 -8
- data/examples/raa/structure.rb +4 -2
- data/examples/raa/structure.yaml +94 -78
- data/lib/ariel.rb +71 -33
- data/lib/ariel/{candidate_selector.rb → candidate_refiner.rb} +39 -38
- data/lib/ariel/label_utils.rb +46 -18
- data/lib/ariel/labeled_document_loader.rb +77 -0
- data/lib/ariel/learner.rb +60 -38
- data/lib/ariel/log.rb +67 -0
- data/lib/ariel/node.rb +52 -0
- data/lib/ariel/node/extracted.rb +90 -0
- data/lib/ariel/node/structure.rb +91 -0
- data/lib/ariel/rule.rb +114 -32
- data/lib/ariel/rule_set.rb +34 -15
- data/lib/ariel/token.rb +9 -3
- data/lib/ariel/token_stream.rb +32 -17
- data/lib/ariel/wildcards.rb +19 -15
- data/test/fixtures.rb +45 -3
- data/test/specs/candidate_refiner_spec.rb +48 -0
- data/test/specs/label_utils_spec.rb +97 -0
- data/test/specs/learner_spec.rb +39 -0
- data/test/specs/node_extracted_spec.rb +90 -0
- data/test/specs/node_spec.rb +76 -0
- data/test/specs/node_structure_spec.rb +74 -0
- data/test/specs/rule_set_spec.rb +85 -0
- data/test/specs/rule_spec.rb +110 -0
- data/test/specs/token_stream_spec.rb +100 -7
- metadata +21 -28
- data/lib/ariel/example_document_loader.rb +0 -59
- data/lib/ariel/extracted_node.rb +0 -20
- data/lib/ariel/node_like.rb +0 -26
- data/lib/ariel/structure_node.rb +0 -75
- data/test/ariel_test_case.rb +0 -15
- data/test/test_candidate_selector.rb +0 -58
- data/test/test_example_document_loader.rb +0 -7
- data/test/test_label_utils.rb +0 -15
- data/test/test_learner.rb +0 -38
- data/test/test_rule.rb +0 -38
- data/test/test_structure_node.rb +0 -81
- data/test/test_token.rb +0 -16
- data/test/test_token_stream.rb +0 -82
- data/test/test_wildcards.rb +0 -18
data/lib/ariel/rule_set.rb
CHANGED
@@ -1,34 +1,53 @@
|
|
1
1
|
module Ariel
|
2
2
|
|
3
|
-
# A RuleSet acts as a container for a
|
3
|
+
# A RuleSet acts as a container for a Node::Structure's start and end rules.
|
4
4
|
# These are stored as an ordered array and are applied in turn until there is
|
5
5
|
# a successful match. A RuleSet takes responsibility for applying start and
|
6
|
-
# end rules to extract an
|
6
|
+
# end rules to extract a Node::Extracted.
|
7
7
|
class RuleSet
|
8
8
|
def initialize(start_rules, end_rules)
|
9
9
|
@start_rules=start_rules
|
10
10
|
@end_rules=end_rules
|
11
11
|
end
|
12
12
|
|
13
|
+
# Returns an array of the extracted tokenstreams. An empty array is returned
|
14
|
+
# if the rules cannot be applied.
|
15
|
+
# TODO: Think more about the way list iteration rules are applied
|
13
16
|
def apply_to(tokenstream)
|
14
|
-
|
15
|
-
|
17
|
+
start_idxs=nil
|
18
|
+
end_idxs=nil
|
16
19
|
@start_rules.each do |rule|
|
17
|
-
|
18
|
-
break if
|
20
|
+
start_idxs=rule.apply_to tokenstream
|
21
|
+
break if !start_idxs.empty?
|
19
22
|
end
|
20
23
|
@end_rules.each do |rule|
|
21
|
-
|
22
|
-
|
24
|
+
end_idxs=rule.apply_to tokenstream
|
25
|
+
end_idxs.reverse! #So the start_idxs and end_idxs match up
|
26
|
+
break if !end_idxs.empty?
|
23
27
|
end
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
28
|
+
result=[]
|
29
|
+
unless start_idxs.empty? && end_idxs.empty?
|
30
|
+
# Following expression deals with the case where the first start rule
|
31
|
+
# matches after the first end rule, indicating that all tokens up to the
|
32
|
+
# end rule match should be a list item
|
33
|
+
if start_idxs.first > end_idxs.first
|
34
|
+
start_idxs.insert(0, 0)
|
35
|
+
end
|
36
|
+
if end_idxs.last < start_idxs.last
|
37
|
+
end_idxs << (tokenstream.size - 1)
|
38
|
+
end
|
39
|
+
Log.debug "RuleSet matched with start_idxs=#{start_idxs.inspect} and end_idxs=#{end_idxs.inspect}"
|
40
|
+
start_idxs.zip(end_idxs) do |start_idx, end_idx|
|
41
|
+
if start_idx && end_idx
|
42
|
+
next if start_idx > end_idx
|
43
|
+
result << tokenstream.slice_by_token_index(start_idx, end_idx)
|
44
|
+
yield result.last if block_given?
|
45
|
+
else
|
46
|
+
break
|
47
|
+
end
|
48
|
+
end
|
31
49
|
end
|
50
|
+
return result
|
32
51
|
end
|
33
52
|
end
|
34
53
|
end
|
data/lib/ariel/token.rb
CHANGED
@@ -36,9 +36,10 @@ module Ariel
|
|
36
36
|
@start_loc <=> t.start_loc
|
37
37
|
end
|
38
38
|
|
39
|
-
# Accepts either a string
|
40
|
-
# Wildcards#list
|
41
|
-
#
|
39
|
+
# Accepts either a string, a symbol representing a wildcard in
|
40
|
+
# Wildcards#list or an arbitrary regex. Returns true if the
|
41
|
+
# whole Token is consumed by the wildcard or the string is equal
|
42
|
+
# to Token#text, and false if the match fails. Raises an
|
42
43
|
# error if the passed symbol is not a member of Wildcards#list.
|
43
44
|
def matches?(landmark)
|
44
45
|
if landmark.kind_of? Symbol or landmark.kind_of? Regexp
|
@@ -64,5 +65,10 @@ module Ariel
|
|
64
65
|
def matching_wildcards
|
65
66
|
return Wildcards.matching(self.text)
|
66
67
|
end
|
68
|
+
|
69
|
+
# Redefined for caching purposes. This proved to be too slow.
|
70
|
+
# def hash
|
71
|
+
# [@text, @start_loc, @end_loc, @label_tag].hash
|
72
|
+
# end
|
67
73
|
end
|
68
74
|
end
|
data/lib/ariel/token_stream.rb
CHANGED
@@ -16,19 +16,21 @@ module Ariel
|
|
16
16
|
class TokenStream
|
17
17
|
include Enumerable
|
18
18
|
attr_accessor :tokens, :cur_pos, :label_index, :original_text
|
19
|
-
|
20
|
-
|
21
|
-
@tokens=[]
|
22
|
-
@cur_pos=0
|
23
|
-
@original_text = ""
|
24
|
-
@token_regexen = [
|
19
|
+
|
20
|
+
TOKEN_REGEXEN = [
|
25
21
|
Wildcards.list[:html_tag], # Match html tags that don't have attributes
|
26
22
|
/\d+/, # Match any numbers, probably good to make a split
|
27
23
|
/\b\w+\b/, # Pick up words, will split at punctuation
|
28
24
|
/\S/ # Grab any characters left over that aren't whitespace
|
29
25
|
]
|
30
|
-
|
26
|
+
LABEL_TAG_REGEXEN = [LabelUtils.any_label_regex]
|
27
|
+
|
28
|
+
def initialize()
|
29
|
+
@tokens=[]
|
30
|
+
@cur_pos=0
|
31
|
+
@original_text = ""
|
31
32
|
@reversed=false
|
33
|
+
@contains_label_tags=false
|
32
34
|
end
|
33
35
|
|
34
36
|
# The tokenizer operates on a string by splitting it at every point it
|
@@ -37,7 +39,7 @@ module Ariel
|
|
37
39
|
# offsets. The same is then done with the next regular expression on each of
|
38
40
|
# these split strings, and new tokens are created with the correct offset in
|
39
41
|
# the original text. Any characters left unmatched by any of the regular
|
40
|
-
# expressions in
|
42
|
+
# expressions in TokenStream::TOKEN_REGEXEN are discarded. This approach allows a
|
41
43
|
# hierarchy of regular expressions to work simply and easily. A simple
|
42
44
|
# regular expression to match html tags might operate first, and then later
|
43
45
|
# expressions that pick up runs of word characters can operate on what's
|
@@ -45,16 +47,25 @@ module Ariel
|
|
45
47
|
# tokenizer will first remove and discard any occurrences of label_tags (as
|
46
48
|
# defined by the Regex set in LabelUtils) before matching and adding tokens.
|
47
49
|
# Any label_tag tokens will be marked as such upon creation.
|
48
|
-
def tokenize(input,
|
50
|
+
def tokenize(input, contains_label_tags=false)
|
49
51
|
string_array=[[input, 0]]
|
50
52
|
@original_text = input
|
51
|
-
@
|
52
|
-
|
53
|
-
|
53
|
+
@contains_label_tags=contains_label_tags
|
54
|
+
LABEL_TAG_REGEXEN.each {|regex| split_string_array_by_regex(string_array, regex, false)} if contains_label_tags
|
55
|
+
TOKEN_REGEXEN.each {|regex| split_string_array_by_regex(string_array, regex)}
|
54
56
|
@tokens.sort!
|
55
57
|
@tokens.size
|
56
58
|
end
|
57
59
|
|
60
|
+
# Note, token.cache_hash!=token.reverse.reverse.cache_hash.
|
61
|
+
def cache_hash
|
62
|
+
[@tokens, @reversed].hash
|
63
|
+
end
|
64
|
+
|
65
|
+
def contains_label_tags?
|
66
|
+
@contains_label_tags
|
67
|
+
end
|
68
|
+
|
58
69
|
# Goes through all stored Token instances, removing them if
|
59
70
|
# Token#is_label_tag? Called after a labeled document has been extracted to
|
60
71
|
# a tree ready for the rule learning process.
|
@@ -100,7 +111,7 @@ module Ariel
|
|
100
111
|
raise ArgumentError, "Given string position does not match the start of any token"
|
101
112
|
else
|
102
113
|
@label_index = token_pos
|
103
|
-
debug "Token ##{label_index} - \"#{@tokens[label_index].text}\" labeled."
|
114
|
+
Log.debug "Token ##{label_index} - \"#{@tokens[label_index].text}\" labeled."
|
104
115
|
return @label_index
|
105
116
|
end
|
106
117
|
end
|
@@ -111,14 +122,14 @@ module Ariel
|
|
111
122
|
# examples). See also TokenStream#raw_text
|
112
123
|
def text(l_index=0, r_index=-1)
|
113
124
|
out=raw_text(l_index, r_index)
|
114
|
-
if
|
125
|
+
if contains_label_tags?
|
115
126
|
LabelUtils.clean_string(out)
|
116
127
|
else
|
117
128
|
out
|
118
129
|
end
|
119
130
|
end
|
120
131
|
|
121
|
-
# Returns all text represented by the instance's stored tokens
|
132
|
+
# Returns all text represented by the instance's stored tokens. It will not
|
122
133
|
# strip label tags even if the stream is marked to contain them. However,
|
123
134
|
# you should not expect to get the raw_text once any label_tags have been
|
124
135
|
# filtered (TokenStream#remove_label_tags).
|
@@ -141,7 +152,7 @@ module Ariel
|
|
141
152
|
end
|
142
153
|
end
|
143
154
|
|
144
|
-
# Return to the beginning of the TokenStream.
|
155
|
+
# Return to the beginning of the TokenStream. Returns self.
|
145
156
|
def rewind
|
146
157
|
@cur_pos=0
|
147
158
|
self
|
@@ -166,7 +177,6 @@ module Ariel
|
|
166
177
|
if label_index
|
167
178
|
@label_index = reverse_pos(@label_index)
|
168
179
|
end
|
169
|
-
@cur_pos = reverse_pos(@cur_pos)
|
170
180
|
@reversed=!@reversed
|
171
181
|
return self
|
172
182
|
end
|
@@ -176,6 +186,11 @@ module Ariel
|
|
176
186
|
def reversed?
|
177
187
|
@reversed
|
178
188
|
end
|
189
|
+
|
190
|
+
# Returns the number of tokens in the TokenStream
|
191
|
+
def size
|
192
|
+
@tokens.size
|
193
|
+
end
|
179
194
|
|
180
195
|
# Takes a list of Strings and Symbols as its arguments representing text to be matched in
|
181
196
|
# individual tokens and Wildcards. For a match to be a
|
data/lib/ariel/wildcards.rb
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
module Ariel
|
2
2
|
# Contains all wildcards to be used in rule generation.
|
3
3
|
class Wildcards
|
4
|
-
|
5
|
-
@@list = {
|
4
|
+
@list = {
|
6
5
|
:anything=>/.+/,
|
7
6
|
:numeric=>/\d+/,
|
8
7
|
:alpha_numeric=>/\w+/,
|
@@ -12,22 +11,27 @@ module Ariel
|
|
12
11
|
:html_tag=>/<\/?\w+>|<\w+\s+\/>/,
|
13
12
|
:punctuation=>/[[:punct:]]+/
|
14
13
|
}
|
15
|
-
# Returns the hash of wildcard name (symbol) and regular expression pairs.
|
16
|
-
def self.list
|
17
|
-
@@list
|
18
|
-
end
|
19
14
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
15
|
+
class << self
|
16
|
+
private :new
|
17
|
+
# Returns the hash of wildcard name (symbol) and regular expression pairs.
|
18
|
+
def list
|
19
|
+
@list
|
20
|
+
end
|
21
|
+
|
22
|
+
# Given a string, will return an array of symbols from Wildcards::list that
|
23
|
+
# match it.
|
24
|
+
def matching(string)
|
25
|
+
matches=[]
|
26
|
+
@list.each do |name, regex|
|
27
|
+
if string[regex]==string
|
28
|
+
yield name if block_given?
|
29
|
+
matches << name
|
30
|
+
end
|
28
31
|
end
|
32
|
+
matches
|
29
33
|
end
|
30
|
-
|
34
|
+
|
31
35
|
end
|
32
36
|
end
|
33
37
|
end
|
data/test/fixtures.rb
CHANGED
@@ -4,7 +4,7 @@ Title: <l:title>The test of the Century</l:title>
|
|
4
4
|
<l:content><b>Excerpt</b>: <i><l:excerpt>A look back at what could be considered the greatest ever test.</l:excerpt></i>
|
5
5
|
<l:body>There was once a test designed to assess whether apply_extraction_tree_on worked.</l:body></l:content>
|
6
6
|
EOS
|
7
|
-
@@labeled_document_structure = Ariel::
|
7
|
+
@@labeled_document_structure = Ariel::Node::Structure.new do |r|
|
8
8
|
r.item :title
|
9
9
|
r.item :content do |c|
|
10
10
|
c.item :excerpt
|
@@ -24,12 +24,39 @@ Title: <l:title>Another example</l:title>
|
|
24
24
|
<l:body>I love to write examples, you love to read them, ruby loves to process them.
|
25
25
|
In conclusion, we're has happy as can be.</l:body>
|
26
26
|
<l:comment_list>Comments:
|
27
|
-
<
|
27
|
+
<ol>
|
28
|
+
<li><l:comment>Title:<l:title>Great example</l:title>
|
28
29
|
<l:author>Adoring fan</l:author>
|
29
30
|
<l:body>Always love reading your examples, keep up the great work.</l:body>
|
30
|
-
</l:comment></
|
31
|
+
</l:comment></li>
|
32
|
+
<li><l:comment>Title: <l:title>Some advice</l:title>
|
33
|
+
<l:author>Wise old man</l:author>
|
34
|
+
<l:body>Keep your friends close and your enemies closer.</l:body>
|
35
|
+
</l:comment></li></l:comment_list>
|
31
36
|
EOS
|
32
37
|
|
38
|
+
@@labeled_document_with_list_structure = Ariel::Node::Structure.new do |r|
|
39
|
+
r.item :title
|
40
|
+
r.item :body
|
41
|
+
r.item :comment_list do |c|
|
42
|
+
c.list_item :comment do |d|
|
43
|
+
d.item :author
|
44
|
+
d.item :body
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
title_ruleset=Ariel::RuleSet.new [Ariel::Rule.new([[":"]], :forward)], [Ariel::Rule.new([["love", "I"]], :back)]
|
50
|
+
body_ruleset=Ariel::RuleSet.new [Ariel::Rule.new([["example"]], :forward)], [Ariel::Rule.new([["Comments"]], :back)]
|
51
|
+
c_list_ruleset=Ariel::RuleSet.new [Ariel::Rule.new([["be", "."]], :forward)], [Ariel::Rule.new([], :back)]
|
52
|
+
comment_ruleset=Ariel::RuleSet.new [Ariel::Rule.new([["<li>"]], :forward, true)], [Ariel::Rule.new([["</li>"]], :back, true)]
|
53
|
+
|
54
|
+
s=@@labeled_document_with_list_structure
|
55
|
+
s.title.ruleset=title_ruleset
|
56
|
+
s.body.ruleset=body_ruleset
|
57
|
+
s.comment_list.ruleset=c_list_ruleset
|
58
|
+
s.comment_list.comment.ruleset=comment_ruleset
|
59
|
+
|
33
60
|
@@labeled_addresses=Array.new(4) {Ariel::TokenStream.new}
|
34
61
|
@@labeled_addresses[0].tokenize("513 Pico <b>Venice</b>, Phone: 1-<b>800</b>-555-1515")
|
35
62
|
@@labeled_addresses[0].set_label_at 36
|
@@ -40,4 +67,19 @@ EOS
|
|
40
67
|
@@labeled_addresses[3].tokenize("403 La Tijera, <b> Watts </b>, Phone: (310) 798-0008")
|
41
68
|
@@labeled_addresses[3].set_label_at 39
|
42
69
|
|
70
|
+
# This example is from the STALKER paper, it suggests that SkipTo('<p><i>')
|
71
|
+
# would extract the start of the list, and the rules SkipTo '<i>' and SkipTo
|
72
|
+
# '</i>' would locate the start and end of each list item. If the first found
|
73
|
+
# end_loc is before the first start_loc, it should be assumed all tokens from
|
74
|
+
# 0...end_loc are one item.
|
75
|
+
@@unlabeled_restaurant_example=<<EOS
|
76
|
+
<p> Name: <b> Yala </b><p> Cuisine: Thai <p><i>
|
77
|
+
4000 Colfax, Phoenix, AZ 85258 (602) 508-1570
|
78
|
+
</i><br><i>
|
79
|
+
523 Vernon, Las Vegas, NV 89104 (702) 578-2293
|
80
|
+
</i><br><i>
|
81
|
+
403 Pico, LA, CA 90007 (213) 798-0008
|
82
|
+
</i>
|
83
|
+
EOS
|
84
|
+
|
43
85
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'fixtures'
|
3
|
+
|
4
|
+
include Fixtures
|
5
|
+
|
6
|
+
context "Refining non exhaustive rule candidates" do
|
7
|
+
setup do
|
8
|
+
@candidates=[]
|
9
|
+
@candidates << Ariel::Rule.new([[:anything]], :forward)
|
10
|
+
@candidates << Ariel::Rule.new([[:numeric], [:numeric], [:numeric]], :forward) #late
|
11
|
+
@candidates << Ariel::Rule.new([["("]], :forward)
|
12
|
+
@candidates << Ariel::Rule.new([[:numeric, :alpha_numeric]], :forward)
|
13
|
+
@refiner=Ariel::CandidateRefiner.new(@candidates, @@labeled_addresses)
|
14
|
+
end
|
15
|
+
|
16
|
+
specify "refine_by_match_type should not change the list of candidates if all rules match one of the given types" do
|
17
|
+
@refiner.refine_by_match_type :fail, :early, :late, :perfect
|
18
|
+
@refiner.candidates.should_equal @candidates
|
19
|
+
end
|
20
|
+
|
21
|
+
specify "refine_by_match_type should remove all candidates that don't match the given type from the candidates list" do
|
22
|
+
@refiner.refine_by_match_type :late
|
23
|
+
@refiner.candidates.size.should_equal 1
|
24
|
+
@candidates[1].should_equal @refiner.candidates[0]
|
25
|
+
end
|
26
|
+
|
27
|
+
specify "refine_by_fewer wildcards should leave only those rules with the lowest number of wildcards" do
|
28
|
+
@refiner.refine_by_fewer_wildcards
|
29
|
+
@refiner.candidates.size.should_equal 1
|
30
|
+
@refiner.candidates[0].should_equal @candidates[2]
|
31
|
+
end
|
32
|
+
|
33
|
+
specify "refine_by_label_proximity should leave only those candidates that match closest to the label" do
|
34
|
+
@refiner.refine_by_label_proximity
|
35
|
+
@refiner.candidates.size.should_equal 1
|
36
|
+
@refiner.candidates[0].should_equal @candidates[2]
|
37
|
+
end
|
38
|
+
|
39
|
+
specify "refine_by_longer_end_landmarks should leave only those candidates with the longest end landmark" do
|
40
|
+
@refiner.refine_by_longer_end_landmarks
|
41
|
+
@refiner.candidates.size.should_equal 1
|
42
|
+
@refiner.candidates[0].should_equal @candidates[3]
|
43
|
+
end
|
44
|
+
|
45
|
+
specify "random_from_remaining should return a random candidate from those remaining in the candidate list" do
|
46
|
+
@candidates.should_include(@refiner.random_from_remaining)
|
47
|
+
end
|
48
|
+
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
require 'ariel'
|
2
|
+
require 'fixtures'
|
3
|
+
include Fixtures
|
4
|
+
|
5
|
+
context "Querying LabelUtils for label tag locating Regular Expressions" do
|
6
|
+
specify "label_regex should return an array of two Regexp to locate a start tag or an end tag with the given tag contents" do
|
7
|
+
s_regex, e_regex = Ariel::LabelUtils.label_regex('example')
|
8
|
+
s_tag="<l:example>"
|
9
|
+
e_tag="</l:example>"
|
10
|
+
s_tag.should_match s_regex
|
11
|
+
e_tag.should_not_match s_regex
|
12
|
+
s_tag.should_not_match e_regex
|
13
|
+
e_tag.should_match e_regex
|
14
|
+
"<l:fail>".should_not_match s_regex
|
15
|
+
end
|
16
|
+
|
17
|
+
specify "label_regex should by default return a pair of labels that will match any valid label tags" do
|
18
|
+
s_regex, e_regex = Ariel::LabelUtils.label_regex
|
19
|
+
"<l:randomexample>".should_match s_regex
|
20
|
+
"</l:unrandomexample>".should_match e_regex
|
21
|
+
"<l:foo>".should_not_match e_regex
|
22
|
+
end
|
23
|
+
|
24
|
+
specify "any_label_regex should return a regex that will match any valid open or closing label tags" do
|
25
|
+
regex=Ariel::LabelUtils.any_label_regex
|
26
|
+
regex.should_be_a_kind_of Regexp
|
27
|
+
%w[<l:foo> <l:bar> </l:foo> </l:bar>].each {|tag| tag.should_match regex}
|
28
|
+
%w[<l:foo <l/trunk> </l:** <a> </b>].each {|tag| tag.should_not_match regex}
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
context "Extracting a labeled region from a node" do
|
33
|
+
setup do
|
34
|
+
@tokenstream_with_label_tags = Ariel::TokenStream.new
|
35
|
+
@tokenstream_with_label_tags.tokenize @@labeled_document, true
|
36
|
+
@parent_extracted_node=Ariel::Node::Extracted.new(:root, @tokenstream_with_label_tags, @@labeled_document_structure)
|
37
|
+
@title_result=Ariel::LabelUtils.extract_labeled_region(@@labeled_document_structure.title, @parent_extracted_node)
|
38
|
+
end
|
39
|
+
|
40
|
+
specify "extract_labeled_region should return an array containing the region corresponding to the given structure node as a Node::Extracted" do
|
41
|
+
@title_result.should_be_a_kind_of Array
|
42
|
+
@title_result[0].should_be_an_instance_of Ariel::Node::Extracted
|
43
|
+
@title_result.size.should_equal 1
|
44
|
+
@title_result[0].tokenstream.tokens.should_equal @tokenstream_with_label_tags.tokens[3..7]
|
45
|
+
end
|
46
|
+
|
47
|
+
specify "Should return an empty array if the match fails" do
|
48
|
+
Ariel::LabelUtils.extract_labeled_region(Ariel::Node::Structure.new(:non_existent), @parent_extracted_node).should_equal []
|
49
|
+
end
|
50
|
+
|
51
|
+
specify "Extracted node should have the correct node_name" do
|
52
|
+
@title_result[0].node_name.should_equal :title
|
53
|
+
end
|
54
|
+
|
55
|
+
specify "Extracted node should be added as a child to the parent extracted node" do
|
56
|
+
@title_result.should_equal @parent_extracted_node.children.values
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
context "Extracting labeled list items from a node" do
|
61
|
+
setup do
|
62
|
+
@structure=@@labeled_document_with_list_structure
|
63
|
+
@tokenstream=Ariel::TokenStream.new
|
64
|
+
@tokenstream.tokenize @@labeled_document_with_list, true
|
65
|
+
@tokenstream = @tokenstream.slice_by_token_index 39, 95
|
66
|
+
@parent_extracted_node=Ariel::Node::Extracted.new(:comment_list, @tokenstream, @@labeled_document_with_list_structure.comment_list)
|
67
|
+
@result = Ariel::LabelUtils.extract_labeled_region(@structure.comment_list.comment, @parent_extracted_node)
|
68
|
+
end
|
69
|
+
|
70
|
+
specify "Should return an array containing each list_item" do
|
71
|
+
@result.size.should_equal 2
|
72
|
+
@result.each {|extracted_node| extracted_node.should_be_an_instance_of Ariel::Node::Extracted}
|
73
|
+
@tokenstream.tokens[5..28].should_equal @result[0].tokenstream.tokens
|
74
|
+
@tokenstream.tokens[33..54].should_equal @result[1].tokenstream.tokens
|
75
|
+
|
76
|
+
end
|
77
|
+
|
78
|
+
specify "Should name each list item itemname_num" do
|
79
|
+
@result[0].node_name.should_equal :comment_0
|
80
|
+
@result[1].node_name.should_equal :comment_1
|
81
|
+
end
|
82
|
+
|
83
|
+
specify "Should add each list_item as as a child of the parent extracted node" do
|
84
|
+
children=@parent_extracted_node.children.values
|
85
|
+
children.size.should_equal 2
|
86
|
+
children.each {|child| @result.should_include child}
|
87
|
+
end
|
88
|
+
|
89
|
+
specify "Should return an empty array if no list items are extracted" do
|
90
|
+
stream=Ariel::TokenStream.new
|
91
|
+
stream.tokenize "No labels here", true
|
92
|
+
@parent_extracted_node.tokenstream=stream
|
93
|
+
result = Ariel::LabelUtils.extract_labeled_region(@structure.comment_list.comment, @parent_extracted_node)
|
94
|
+
result.should_equal []
|
95
|
+
end
|
96
|
+
|
97
|
+
end
|