ariel 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +21 -0
- data/README +98 -0
- data/bin/ariel +56 -0
- data/examples/google_calculator/labeled/1 +43 -0
- data/examples/google_calculator/labeled/2 +41 -0
- data/examples/google_calculator/labeled/3 +41 -0
- data/examples/google_calculator/structure.rb +12 -0
- data/examples/google_calculator/structure.yaml +46 -0
- data/examples/google_calculator/unlabeled/1 +43 -0
- data/examples/google_calculator/unlabeled/2 +43 -0
- data/examples/raa/labeled/highline.html +135 -0
- data/examples/raa/labeled/mongrel.html +168 -0
- data/examples/raa/structure.rb +17 -0
- data/examples/raa/structure.yaml +183 -0
- data/examples/raa/unlabeled/pdf-writer.html +175 -0
- data/lib/ariel/candidate_selector.rb +94 -0
- data/lib/ariel/example_document_loader.rb +59 -0
- data/lib/ariel/extracted_node.rb +20 -0
- data/lib/ariel/label_utils.rb +71 -0
- data/lib/ariel/learner.rb +237 -0
- data/lib/ariel/node_like.rb +26 -0
- data/lib/ariel/rule.rb +112 -0
- data/lib/ariel/rule_set.rb +34 -0
- data/lib/ariel/structure_node.rb +75 -0
- data/lib/ariel/token.rb +68 -0
- data/lib/ariel/token_stream.rb +240 -0
- data/lib/ariel/wildcards.rb +33 -0
- data/lib/ariel.rb +69 -0
- data/test/ariel_test_case.rb +15 -0
- data/test/fixtures.rb +43 -0
- data/test/specs/token_spec.rb +65 -0
- data/test/specs/token_stream_spec.rb +43 -0
- data/test/specs/wildcards_spec.rb +26 -0
- data/test/test_candidate_selector.rb +58 -0
- data/test/test_example_document_loader.rb +7 -0
- data/test/test_label_utils.rb +15 -0
- data/test/test_learner.rb +38 -0
- data/test/test_rule.rb +38 -0
- data/test/test_structure_node.rb +81 -0
- data/test/test_token.rb +16 -0
- data/test/test_token_stream.rb +82 -0
- data/test/test_wildcards.rb +18 -0
- metadata +103 -0
data/lib/ariel/extracted_node.rb
ADDED
@@ -0,0 +1,20 @@
+module Ariel
+  require 'ostruct'
+
+  # Each ExtractedNode has a name, a tokenstream and a structure which points to
+  # the relevant StructureNode.
+  class ExtractedNode
+    include NodeLike
+    attr_accessor :tokenstream
+
+    def initialize(name, tokenstream, structure)
+      @children={}
+      @meta = OpenStruct.new({:name=>name, :structure=>structure})
+      @tokenstream=tokenstream
+    end
+
+    def extracted_text
+      tokenstream.text
+    end
+  end
+end
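
A minimal usage sketch of ExtractedNode (illustrative only, not part of the package source); stream and structure stand in for a real TokenStream and StructureNode:

    # Illustrative sketch: assumes `stream` is a TokenStream and `structure` a StructureNode.
    node = Ariel::ExtractedNode.new(:title, stream, structure)
    node.extracted_text   # delegates to stream.text
    node.meta.name        # => :title, from the OpenStruct built in #initialize
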
data/lib/ariel/label_utils.rb
ADDED
@@ -0,0 +1,71 @@
+module Ariel
+
+  # A set of methods for use when dealing with strings from labeled documents
+  module LabelUtils
+    S_LABEL="<"
+    E_LABEL=">"
+
+    # Returns an array containing a pair of regular expressions to match a start
+    # label tag and an end label tag. If the tag_contents is not modified the
+    # regular expressions will match any properly formatted label tag. The
+    # namespace to search for can also be modified. The returned regular
+    # expressions are case insensitive.
+    def self.label_regex(tag_contents='\w+', namespace='l')
+      [/#{S_LABEL}#{namespace}:#{tag_contents}#{E_LABEL}/i,
+       /#{S_LABEL}\/#{namespace}:#{tag_contents}#{E_LABEL}/i]
+    end
+
+    # Helper function that returns a regex that will match any open or closing
+    # label tags.
+    def self.any_label_regex()
+      Regexp.union(*self.label_regex)
+    end
+
+    # Removes all labels such as <l:title> from the given string and returns the
+    # result.
+    def self.clean_string(string)
+      string.gsub self.any_label_regex, ''
+    end
+
+    # Extracts the labeled region representing the given structure node from the
+    # parent_extracted_node. A new ExtractedNode is returned to be added as a
+    # child to the parent_extracted_node. Used when loading labeled documents.
+    def self.extract_labeled_region(structure, parent_extracted_node)
+      tokenstream=parent_extracted_node.tokenstream
+      start_idx=self.skip_to_label_tag(tokenstream, structure.meta.name, :open)
+      end_idx=self.skip_to_label_tag(tokenstream.reverse, structure.meta.name, :closed)
+      end_idx=tokenstream.reverse_pos end_idx
+      newstream=tokenstream.slice_by_token_index(start_idx, end_idx)
+      child_node=ExtractedNode.new(structure.meta.name, newstream, structure)
+      parent_extracted_node.add_child child_node
+      return child_node
+    end
+
+    private
+    # Locates a given label tag in a tokenstream
+    def self.skip_to_label_tag(tokenstream, name, type)
+      case type
+      when :open
+        re_index=0
+      when :closed
+        re_index=1
+      end
+      tokenstream.rewind
+      regex = self.label_regex(name.to_s)[re_index]
+      debug "Seeking #{name.to_s} of type #{type}"
+      nesting_level=0
+      tokenstream.each do |token|
+        if token.matches?(regex)
+          return tokenstream.cur_pos if nesting_level==0
+        end
+        if token.matches?(self.label_regex[0])
+          nesting_level+=1
+          debug "Encountered token \"#{token.text}\", nesting level=#{nesting_level}"
+        elsif token.matches?(self.label_regex[1])
+          nesting_level-=1
+          debug "Encountered token \"#{token.text}\", nesting level=#{nesting_level}"
+        end
+      end
+    end
+  end
+end
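
The label conventions handled above can be shown with a short sketch (illustrative only, not part of the diff); labels are inline tags in the l: namespace:

    # Illustrative sketch of the label format recognised by LabelUtils.
    Ariel::LabelUtils.label_regex('title')  # => [/<l:title>/i, /<\/l:title>/i]
    Ariel::LabelUtils.clean_string("The <l:title>Ariel README</l:title>")  # => "The Ariel README"
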
data/lib/ariel/learner.rb
ADDED
@@ -0,0 +1,237 @@
+module Ariel
+
+  # Implements a fairly standard separate and conquer rule learning system.
+  # Using a list of labeled examples, candidate rules are generated. A rule is
+  # refined until it covers as many as possible of the labeled examples. This
+  # rule is recorded, the covered examples are removed and the process repeats
+  # on the remaining examples. Once all examples are covered, the disjunct of
+  # all generated rules is returned.
+
+  class Learner
+    attr_accessor :current_rule, :current_seed, :candidates, :direction
+
+    # Takes a list of TokenStreams containing labels.
+    def initialize(*examples)
+      if examples.any? {|example| example.label_index.nil?}
+        raise ArgumentError, "Passed a TokenStream with no label"
+      end
+      debug "ATTENTION: New Learner instantiated with #{examples.size} labeled examples"
+      @examples=examples
+      @candidates=[]
+      set_seed
+    end
+
+    # Initiates and operates the whole rule induction process. Finds an example
+    # to use as its seed example, then finds a rule that matches the maximum
+    # number of examples correctly and fails on all others. All matched examples
+    # are then removed and the process is repeated considering all examples that
+    # remain. Returns an array of the rules found (in order).
+    def learn_rule(direction)
+      debug "Searching for a #{direction} rule"
+      @direction=direction
+      @current_rule=Rule.new(direction)
+      combined_rules=[]
+      while not @examples.empty?
+        set_seed unless @examples.include? @current_seed
+        rule = find_best_rule() # Find the rule that matches the most examples and fails on the others
+        prev_size = @examples.size
+        @examples.delete_if {|example| rule.apply_to(example)} #separate and conquer!
+        debug "Removing #{prev_size - @examples.size} examples matched by the generated rule, #{@examples.size} remain"
+        combined_rules << rule
+      end
+      # rule = order_rule(rule) #STALKER paper suggests that the generated rules should be ordered. This doesn't make sense, seeing as they are all generated based only on examples not matched by previous rules
+      debug "Generated rules: #{combined_rules.inspect}"
+      return combined_rules
+    end
+
+    # The seed example is chosen from the array of remaining examples. The
+    # LabeledStream with the fewest tokens before the labeled token is chosen.
+    def set_seed
+      sorted = @examples.sort_by {|example| example.label_index}
+      self.current_seed=sorted.first
+      debug "current_seed=#{current_seed.text}"
+      return current_seed
+    end
+
+    # Using the seed example passed to it, generates a list of initial rule
+    # candidates for further refinement and evaluation. The Token prior to the
+    # labeled token is considered, and separate rules are generated that skip_to that
+    # token's text or any of its matching wildcards.
+    def generate_initial_candidates
+      if current_seed.label_index==0
+        @candidates << Rule.new(@direction)
+      else
+        end_token=current_seed.tokens[current_seed.label_index-1]
+        debug "Creating initial candidates based on #{end_token.text}"
+        @candidates<< Rule.new(@direction, [[end_token.text]])
+        @candidates.concat(@candidates[0].generalise_feature(0))
+        debug "Initial candidates: #{@candidates.inspect} created"
+      end
+      return @candidates.size
+    end
+
+    # Equivalent of LearnDisjunct in STALKER algorithm. Generates initial
+    # candidate rules, refines, and then returns a single rule.
+    def find_best_rule
+      @candidates=[]
+      generate_initial_candidates
+      while true
+        best_refiner = get_best_refiner
+        best_solution = get_best_solution
+        @current_rule = best_refiner
+        break if perfect?(best_solution)
+        refine
+      end
+      # return post_process(best_solution)
+      debug "Rule found: #{best_solution.inspect}"
+      return best_solution
+    end
+
+    # A given rule is perfect if it successfully matches the label on at least
+    # one example and fails all others.
+    def perfect?(rule)
+      perfect_count=0
+      fail_count=0
+      @examples.each do |example|
+        if rule.matches(example, :perfect)
+          perfect_count+=1
+          debug "#{rule.inspect} matches #{example.text} perfectly"
+        elsif rule.matches(example, :fail)
+          fail_count+=1
+          debug "#{rule.inspect} fails to match #{example.text}"
+        end
+      end
+      if (perfect_count >= 1) && (fail_count == (@examples.size - perfect_count))
+        return true
+      else
+        debug "Rule was not perfect, perfect_count=#{perfect_count}, fail_count=#{fail_count}"
+        return false
+      end
+    end
+
+    # Given a list of candidate rules, uses heuristics to determine a rule
+    # considered to be the best refiner. Prefers candidate rules that have:
+    # * Larger coverage = early + correct matches.
+    # * If equal, prefer more early matches - can be made into fails or perfect matches.
+    #   Intuitively, if there are more equal matches the rule is finding features common to all documents.
+    # * If there is a tie, more failed matches wins - we want matches to fail rather than match incorrectly
+    # * Fewer wildcards - more specific, less likely to match by chance.
+    # * Shorter unconsumed prefixes - closer to matching correctly
+    # * fewer tokens in SkipUntil() - huh? Perhaps because skip_until relies on slot content rather than
+    #   document structure.
+    # * longer end landmarks - prefer "local context" landmarks.
+    def get_best_refiner
+      selector = CandidateSelector.new(@candidates, @examples)
+      selector.select_best_by_match_type :early, :perfect #Discriminate on coverage
+      selector.select_best_by_match_type :early
+      selector.select_best_by_match_type :fail
+      selector.select_with_fewer_wildcards
+      selector.select_closest_to_label
+      selector.select_with_longer_end_landmarks
+      best_refiner = selector.random_from_remaining #just pick a random one for now if still multiple
+      debug "best_refiner found => #{best_refiner.inspect}"
+      return best_refiner
+    end
+
+    # Given a list of candidate rules, use heuristics to determine the best solution. Prefers:
+    # * More correct matches
+    # * More failed matches if a tie - failed matches preferable to incorrect matches.
+    # * Fewer tokens in SkipUntil()
+    # * fewer wildcards
+    # * longer end landmarks
+    # * shorter unconsumed prefixes
+    def get_best_solution
+      selector = CandidateSelector.new(@candidates, @examples)
+      selector.select_best_by_match_type :perfect
+      selector.select_best_by_match_type :fail
+      selector.select_with_fewer_wildcards
+      selector.select_closest_to_label
+      selector.select_with_longer_end_landmarks
+      best_solution = selector.random_from_remaining
+      debug "best_solution found => #{best_solution.inspect}"
+      return best_solution
+    end
+
+    # Oversees both landmark refinements (e.g. changing skip_to("<b>") into
+    # skip_to("Price","<b>")) and topology refinements (skip_to(:html_tag) to a chain of
+    # skip_to() commands). Takes the current rule being generated and the
+    # example against which it is being created (the current seed_rule) as
+    # arguments.
+    def refine
+      @candidates=[]
+      current_rule.landmarks.each_with_index do |landmark, index|
+        add_new_landmarks(landmark, index) #Topology refinements
+        lengthen_landmark(landmark, index) #Landmark refinements
+      end
+      return @candidates.size
+    end
+
+    # Implements landmark refinements. Landmarks are lengthened to make them
+    # more specific.
+    # * Takes a landmark and its index in the current rule.
+    # * Applies the rule consisting of all previous landmarks in the current
+    #   rule, so the landmark can be considered in the context of the point from
+    #   which it shall be applied.
+    # * Every point at which the landmark matches after the cur_loc is considered.
+    # * Two extended landmarks are generated - a landmark that includes the
+    #   token before the match, and a landmark that includes the token after the
+    #   match.
+    # * Rules are generated incorporating these extended landmarks, including
+    #   alternative landmark extensions that use relevant wildcards.
+    def lengthen_landmark(landmark, index)
+      current_seed.rewind #In case apply_rule isn't called as index=0
+      result = @current_rule.partial(0..(index-1)).apply_to current_seed if index > 0 #Don't care about already matched tokens
+      return 0 unless result # Rule doesn't match, no point refining
+      refined_rules=[]
+      width = landmark.size
+      while current_seed.skip_to(*landmark) #Probably should stop when cur_pos > label_index
+        break if current_seed.cur_pos > current_seed.label_index
+        match_start = (current_seed.cur_pos - 1) - width #pos of first matched token
+        match_end = current_seed.cur_pos - 1 #pos of last matched token
+        preceding_token = current_seed.tokens[match_start-1]
+        trailing_token = current_seed.tokens[match_end+1]
+        front_extended_landmark = landmark.clone.insert(0, preceding_token.text) if preceding_token
+        back_extended_landmark = landmark.clone.insert(-1, trailing_token.text) if trailing_token
+        f = current_rule.deep_clone
+        b = current_rule.deep_clone
+        f.landmarks[index] = front_extended_landmark if front_extended_landmark
+        b.landmarks[index] = back_extended_landmark if back_extended_landmark
+        refined_rules << f
+        refined_rules.concat f.generalise_feature(index, 0)
+        refined_rules << b
+        refined_rules.concat b.generalise_feature(index, -1)
+      end
+      @candidates.concat refined_rules
+      debug "#{refined_rules.size} landmark refinements generated"
+      return refined_rules.size
+    end
+
+    # Implements topology refinements - new landmarks are added to the current rule.
+    # * Takes a landmark and its index in the current rule.
+    # * Applies the rule consisting of all landmarks up to and including the
+    #   current landmark to find where it matches.
+    # * Only tokens between the label_index and the position at which the partial rule matches are considered.
+    # * Tokens before the rule match location will have no effect, as adding new
+    #   landmarks before or after the current landmark will not make the rule
+    #   match any earlier.
+    # * For every token in this slice of the TokenStream, a new potential rule
+    #   is created by adding a new landmark consisting of that token. This
+    #   is also done for each of that token's matching wildcards.
+    def add_new_landmarks(landmark, index)
+      topology_refs=[]
+      start_pos = current_rule.partial(0..index).apply_to(current_seed)
+      end_pos = current_seed.label_index #No point adding tokens that occur after the label_index
+      current_seed.tokens[start_pos...end_pos].each do |token|
+        r=current_rule.deep_clone
+        r.landmarks.insert(index+1, [token.text])
+        topology_refs << r
+        topology_refs.concat r.generalise_feature(index+1)
+      end
+      debug "Topology refinements before uniq! #{topology_refs.size}"
+      topology_refs.uniq!
+      @candidates.concat topology_refs
+      debug "#{topology_refs.size} topology refinements generated"
+      return topology_refs.size
+    end
+  end
+end
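
A sketch of how the learner above might be driven (illustrative only; streams stands in for an array of labeled TokenStream objects, e.g. loaded from the documents under data/examples/):

    # Illustrative sketch: each learn_rule call consumes the learner's examples,
    # so a fresh Learner is built per direction.
    start_rules = Ariel::Learner.new(*streams).learn_rule(:forward)
    end_rules   = Ariel::Learner.new(*streams).learn_rule(:back)
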
data/lib/ariel/node_like.rb
ADDED
@@ -0,0 +1,26 @@
+module Ariel
+
+  module NodeLike
+    attr_accessor :parent, :children, :meta
+
+    # Given a Node object, adds it to the hash of children keyed by its name,
+    # setting its parent as the current node, so the child can later be looked
+    # up by that name.
+    def add_child(node)
+      @children[node.meta.name]=node
+      node.parent = self
+    end
+
+    def each_descendant(include_self=false)
+      if include_self
+        node_queue=[self]
+      else
+        node_queue=self.children.values
+      end
+      until node_queue.empty? do
+        node_queue.concat node_queue.first.children.values
+        yield node_queue.shift
+      end
+    end
+  end
+end
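
For example (illustrative, not part of the package), any node including NodeLike can be walked breadth-first:

    # Illustrative sketch: `root` stands in for a StructureNode or ExtractedNode tree.
    root.each_descendant(true) do |node|
      puts node.meta.name
    end
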
data/lib/ariel/rule.rb
ADDED
@@ -0,0 +1,112 @@
+module Ariel
+
+  # A rule contains an array of landmarks (each of which is an array of
+  # individual landmark features). This landmark array is accessible through
+  # Rule#landmarks. A Rule also has a direction :forward or :back, which
+  # determines whether it is applied from the end or beginning of a tokenstream.
+  class Rule
+    attr_accessor :landmarks, :direction
+    @@RuleMatchData=Struct.new(:token_loc, :type)
+
+    # A rule's direction can be :back or :forward, which determines whether it
+    # is applied from the start or end of the TokenStream. The landmark array
+    # contains an array for each landmark, which consists of one or more
+    # features. e.g. Rule.new(:forward, [[:anything, "Example"], ["Test"]]).
+    def initialize(direction, landmarks=[])
+      @landmarks=landmarks
+      raise(ArgumentError, "Not a valid direction") unless [:forward, :back].include?(direction)
+      @direction=direction
+    end
+
+    # Two rules are equal if they have the same list of landmarks and the same
+    # direction.
+    def ==(rule)
+      return ((self.landmarks == rule.landmarks) && self.direction==rule.direction)
+    end
+    alias :eql? :==
+
+    def hash
+      [@landmarks, @direction].hash
+    end
+
+    # Returns a rule that contains a given range of the rule's landmarks.
+    def partial(range)
+      return Rule.new(@direction, @landmarks[range])
+    end
+
+    def deep_clone
+      Marshal::load(Marshal.dump(self))
+    end
+
+    def generalise_feature(landmark_index, feature_index=0)
+      feature=self.landmarks[landmark_index][feature_index]
+      alternates=[]
+      Wildcards.matching(feature) do |wildcard|
+        r=self.deep_clone
+        r.landmarks[landmark_index][feature_index]=wildcard
+        alternates << r
+        yield r if block_given?
+      end
+      return alternates
+    end
+
+    # Returns the number of wildcards included as features in the list of rule
+    # landmarks.
+    def wildcard_count
+      @landmarks.flatten.select {|feature| feature.kind_of? Symbol}.size
+    end
+
+    # Given a TokenStream and a rule, applies the rule on the stream and
+    # returns nil if the match fails and the token_loc if the match succeeds.
+    # Yields a RuleMatchData Struct with accessors token_loc (the position of the match in the stream)
+    # and type if a block is given. type is nil if the TokenStream has no label,
+    # :perfect if all tokens up to the labeled token are consumed, :early if the rule's final position
+    # is before the labeled token, and :late if it is after. The returned
+    # token_loc is the position in the stream as it was passed in. That is, the
+    # token_loc is always from the left of the given stream whether it is in a
+    # reversed state or not.
+    def apply_to(tokenstream)
+      if tokenstream.reversed?
+        target=tokenstream if @direction==:back
+        target=tokenstream.reverse if @direction==:forward
+      elsif not tokenstream.reversed?
+        target=tokenstream if @direction==:forward
+        target=tokenstream.reverse if @direction==:back
+      end
+      target.rewind #rules are applied from the beginning of the stream
+      @landmarks.each do |landmark|
+        unless target.skip_to(*landmark)
+          return nil
+        end
+      end
+      token_loc=target.cur_pos
+      if @direction==:back && !tokenstream.reversed?
+        token_loc = tokenstream.reverse_pos(token_loc) #Return position from left of given stream
+      end
+      md = @@RuleMatchData.new(token_loc)
+      if target.label_index
+        idx = target.label_index
+        md.type = :perfect if token_loc == idx
+        md.type = :early if token_loc < idx
+        md.type = :late if token_loc > idx
+      end
+      yield md if block_given?
+      return token_loc
+    end
+
+    # Returns true or false depending on if the match of this rule on the given
+    # tokenstream is of any of the given types (could be a combination of
+    # :perfect, :early, :fail and :late). Only valid on streams with labels.
+    def matches(tokenstream, *types)
+      raise ArgumentError, "No match types given" if types.empty?
+      match = nil
+      apply_to(tokenstream) {|md| match=md.type}
+      match = :fail if match.nil?
+      if types.include? match
+        return true
+      else
+        return false
+      end
+    end
+  end
+end
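
A brief sketch of constructing and applying a rule (illustrative only; `stream` stands in for a TokenStream, labeled in the case of #matches, and wildcard symbols such as :html_tag are defined in wildcards.rb, not shown in this section):

    # Illustrative sketch: a forward rule with one two-feature landmark.
    rule = Ariel::Rule.new(:forward, [["Price", "<b>"]])
    loc  = rule.apply_to(stream)            # token index where the match ends, or nil on failure
    rule.matches(stream, :perfect, :early)  # true if the match is any of the given types
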
data/lib/ariel/rule_set.rb
ADDED
@@ -0,0 +1,34 @@
+module Ariel
+
+  # A RuleSet acts as a container for a StructureNode's start and end rules.
+  # These are stored as an ordered array and are applied in turn until there is
+  # a successful match. A RuleSet takes responsibility for applying start and
+  # end rules to extract an ExtractedNode.
+  class RuleSet
+    def initialize(start_rules, end_rules)
+      @start_rules=start_rules
+      @end_rules=end_rules
+    end
+
+    def apply_to(tokenstream)
+      start_idx=nil
+      end_idx=nil
+      @start_rules.each do |rule|
+        start_idx=rule.apply_to tokenstream
+        break if start_idx
+      end
+      @end_rules.each do |rule|
+        end_idx=rule.apply_to tokenstream
+        break if end_idx
+      end
+      if start_idx && end_idx
+        debug "RuleSet matched with start_idx=#{start_idx} and end_idx=#{end_idx}"
+        return nil if end_idx < start_idx
+        return tokenstream.slice_by_token_index(start_idx, end_idx)
+      else
+        debug "No valid match was found"
+        return nil
+      end
+    end
+  end
+end
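
Combined with the rules produced by the Learner, usage might look like this (illustrative only; `stream` stands in for an unlabeled TokenStream):

    # Illustrative sketch: start and end rules are tried in order until one matches.
    ruleset   = Ariel::RuleSet.new(start_rules, end_rules)
    substream = ruleset.apply_to(stream)  # a TokenStream slice, or nil if no consistent match
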
data/lib/ariel/structure_node.rb
ADDED
@@ -0,0 +1,75 @@
+module Ariel
+  require 'ostruct'
+
+  # Implements a Node object used to represent the structure of the document
+  # tree. Each node stores start and end rules to extract the desired content
+  # from its parent node. Could be viewed as a rule-storing object.
+  class StructureNode
+    include NodeLike
+    attr_accessor :ruleset
+    def initialize(name=:root, type=:not_list, &block)
+      @children={}
+      @meta = OpenStruct.new({:name=>name, :node_type=>type})
+      yield self if block_given?
+    end
+
+    # Used to extend an already created Node. e.g.
+    # node.extend_structure do |r|
+    #   r.new_field1
+    #   r.new_field2
+    # end
+    def extend_structure(&block)
+      yield self if block_given?
+    end
+
+    # Given a Node to apply its rules to, this function will create a new node
+    # and add it as a child of the given node. For StructureNodes of :list type,
+    # the list is extracted and so are each of the list items. In this case,
+    # only the list items are yielded.
+    def extract_from(node)
+      # Will be reimplemented to return an array of extracted items
+      newstream = @ruleset.apply_to(node.tokenstream)
+      extracted_node = ExtractedNode.new(meta.name, newstream, self)
+      node.add_child extracted_node if newstream
+      if self.meta.node_type == :list
+        #Do stuff
+      end
+      return extracted_node
+    end
+
+    # Applies the extraction rules stored in the current StructureNode and all its
+    # descendant children.
+    def apply_extraction_tree_on(root_node, extract_labels=false)
+      extraction_queue = [root_node]
+      until extraction_queue.empty? do
+        new_parent = extraction_queue.shift
+        new_parent.meta.structure.children.values.each do |child|
+          if extract_labels
+            extracted_node=LabelUtils.extract_labeled_region(child, new_parent)
+          else
+            extracted_node=child.extract_from(new_parent)
+          end
+          extraction_queue.push(extracted_node) if extracted_node
+        end
+      end
+      return root_node
+    end
+
+    def item(name, &block)
+      self.add_child(StructureNode.new(name, &block))
+    end
+
+    def list_item(name, &block)
+      self.add_child(StructureNode.new(name, :list, &block))
+    end
+
+    def method_missing(method, *args, &block)
+      if @children.has_key? method
+        @children[method]
+      else
+        super
+      end
+    end
+  end
+end
+
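
A structure definition built with the DSL above might read as follows (illustrative only; the field names are made up, compare the structure.rb files under data/examples/):

    # Illustrative sketch: item adds a plain child node, list_item a :list node;
    # children are later reachable by name through method_missing.
    structure = Ariel::StructureNode.new do |r|
      r.item :title
      r.list_item :entries do |list|
        list.item :name
      end
    end
    structure.entries.meta.node_type  # => :list
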
data/lib/ariel/token.rb
ADDED
@@ -0,0 +1,68 @@
+module Ariel
+
+  # Tokens populate a TokenStream. They know their position in the original
+  # document, can list the wildcards that match them and determine whether a
+  # given string or wildcard is a valid match. During the process of parsing a
+  # labeled document, some tokens may be marked as being a label_tag. These are
+  # filtered from the TokenStream before the rule learning phase.
+  class Token
+    attr_reader :text, :start_loc, :end_loc
+
+    # Each new Token must have a string representing its content, its start position in the
+    # original document (start_loc) and the point at which it ends (end_loc).
+    # For instance, in str="This is an example", if "is" were to be made a
+    # Token it would be given a start_loc of 5 and an end_loc of 7, which is
+    # str[5...7]
+    def initialize(text, start_loc, end_loc, label_tag=false)
+      @text=text.to_s
+      @start_loc=start_loc
+      @end_loc=end_loc
+      @label_tag=label_tag
+    end
+
+    # Returns true or false depending on whether the token was marked as a label
+    # tag when it was initialized.
+    def is_label_tag?
+      @label_tag
+    end
+
+    # Tokens are only equal if they have an equal start_loc, end_loc and text.
+    def ==(t)
+      return (@start_loc==t.start_loc && @end_loc==t.end_loc && @text==t.text)
+    end
+
+    # Tokens are sorted based on their start_loc
+    def <=>(t)
+      @start_loc <=> t.start_loc
+    end
+
+    # Accepts either a string or symbol representing a wildcard in
+    # Wildcards#list. Returns true if the whole Token is consumed by the wildcard or the
+    # string is equal to Token#text, and false if the match fails. Raises an
+    # error if the passed symbol is not a member of Wildcards#list.
+    def matches?(landmark)
+      if landmark.kind_of? Symbol or landmark.kind_of? Regexp
+        if landmark.kind_of? Symbol
+          raise ArgumentError, "#{landmark} is not a valid wildcard." unless Wildcards.list.has_key? landmark
+          regex = Wildcards.list[landmark]
+        else
+          regex = landmark
+        end
+        if self.text[regex] == self.text
+          return true
+        else
+          return false
+        end
+      else
+        return true if landmark==self.text
+      end
+      return false
+    end
+
+    # Returns an array of symbols corresponding to the Wildcards that match the
+    # Token.
+    def matching_wildcards
+      return Wildcards.matching(self.text)
+    end
+  end
+end
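
To illustrate the matching semantics above (illustrative only, not part of the package):

    # Illustrative sketch of Token matching.
    token = Ariel::Token.new("is", 5, 7)
    token.matches?("is")    # => true, exact text match
    token.matches?(/i\w/)   # => true, the regexp consumes the whole token text
    token.is_label_tag?     # => false unless marked at creation time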