ariel 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. data/LICENSE +21 -0
  2. data/README +98 -0
  3. data/bin/ariel +56 -0
  4. data/examples/google_calculator/labeled/1 +43 -0
  5. data/examples/google_calculator/labeled/2 +41 -0
  6. data/examples/google_calculator/labeled/3 +41 -0
  7. data/examples/google_calculator/structure.rb +12 -0
  8. data/examples/google_calculator/structure.yaml +46 -0
  9. data/examples/google_calculator/unlabeled/1 +43 -0
  10. data/examples/google_calculator/unlabeled/2 +43 -0
  11. data/examples/raa/labeled/highline.html +135 -0
  12. data/examples/raa/labeled/mongrel.html +168 -0
  13. data/examples/raa/structure.rb +17 -0
  14. data/examples/raa/structure.yaml +183 -0
  15. data/examples/raa/unlabeled/pdf-writer.html +175 -0
  16. data/lib/ariel/candidate_selector.rb +94 -0
  17. data/lib/ariel/example_document_loader.rb +59 -0
  18. data/lib/ariel/extracted_node.rb +20 -0
  19. data/lib/ariel/label_utils.rb +71 -0
  20. data/lib/ariel/learner.rb +237 -0
  21. data/lib/ariel/node_like.rb +26 -0
  22. data/lib/ariel/rule.rb +112 -0
  23. data/lib/ariel/rule_set.rb +34 -0
  24. data/lib/ariel/structure_node.rb +75 -0
  25. data/lib/ariel/token.rb +68 -0
  26. data/lib/ariel/token_stream.rb +240 -0
  27. data/lib/ariel/wildcards.rb +33 -0
  28. data/lib/ariel.rb +69 -0
  29. data/test/ariel_test_case.rb +15 -0
  30. data/test/fixtures.rb +43 -0
  31. data/test/specs/token_spec.rb +65 -0
  32. data/test/specs/token_stream_spec.rb +43 -0
  33. data/test/specs/wildcards_spec.rb +26 -0
  34. data/test/test_candidate_selector.rb +58 -0
  35. data/test/test_example_document_loader.rb +7 -0
  36. data/test/test_label_utils.rb +15 -0
  37. data/test/test_learner.rb +38 -0
  38. data/test/test_rule.rb +38 -0
  39. data/test/test_structure_node.rb +81 -0
  40. data/test/test_token.rb +16 -0
  41. data/test/test_token_stream.rb +82 -0
  42. data/test/test_wildcards.rb +18 -0
  43. metadata +103 -0
data/lib/ariel/extracted_node.rb ADDED
@@ -0,0 +1,20 @@
+ module Ariel
+   require 'ostruct'
+
+   # Each ExtractedNode has a name, a tokenstream and a structure which points to
+   # the relevant StructureNode.
+   class ExtractedNode
+     include NodeLike
+     attr_accessor :tokenstream
+
+     def initialize(name, tokenstream, structure)
+       @children={}
+       @meta = OpenStruct.new({:name=>name, :structure=>structure})
+       @tokenstream=tokenstream
+     end
+
+     def extracted_text
+       tokenstream.text
+     end
+   end
+ end
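
For orientation, a minimal sketch of ExtractedNode together with the NodeLike mixin defined later in this diff. The Struct standing in for a real Ariel::TokenStream and the symbols passed as the structure argument are invented placeholders.

  require 'ariel'

  # Stand-in for a real Ariel::TokenStream; only #text is needed here.
  FakeStream = Struct.new(:text)

  page  = Ariel::ExtractedNode.new(:root,  FakeStream.new("Title: Ariel"), :root_structure)
  title = Ariel::ExtractedNode.new(:title, FakeStream.new("Ariel"), :title_structure)

  page.add_child title                       # NodeLike keys children by their meta.name
  puts page.children[:title].extracted_text  # => "Ariel"
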
data/lib/ariel/label_utils.rb ADDED
@@ -0,0 +1,71 @@
+ module Ariel
+
+   # A set of methods for use when dealing with strings from labeled documents
+   module LabelUtils
+     S_LABEL="<"
+     E_LABEL=">"
+
+     # Returns an array containing a pair of regular expressions to match a start
+     # label tag and an end label tag. If the tag_contents is not modified the
+     # regular expressions will return any properly formatted label tag. The
+     # namespace to search for can also be modified. The returned regular
+     # expressions are case insensitive.
+     def self.label_regex(tag_contents='\w+', namespace='l')
+       [/#{S_LABEL}#{namespace}:#{tag_contents}#{E_LABEL}/i,
+        /#{S_LABEL}\/#{namespace}:#{tag_contents}#{E_LABEL}/i]
+     end
+
+     # Helper function that returns a regex that will return any open or closing
+     # label tags.
+     def self.any_label_regex()
+       Regexp.union(*self.label_regex)
+     end
+
+     # Removes all labels such as <l:title> from the given string and returns the
+     # result.
+     def self.clean_string(string)
+       string.gsub self.any_label_regex, ''
+     end
+
+     # Extracts the labeled region representing the given structure node from the
+     # parent_extracted_node. A new ExtractedNode is returned to be added as a
+     # child to the parent_extracted_node. Used when loading labeled documents.
+     def self.extract_labeled_region(structure, parent_extracted_node)
+       tokenstream=parent_extracted_node.tokenstream
+       start_idx=self.skip_to_label_tag(tokenstream, structure.meta.name, :open)
+       end_idx=self.skip_to_label_tag(tokenstream.reverse, structure.meta.name, :closed)
+       end_idx=tokenstream.reverse_pos end_idx
+       newstream=tokenstream.slice_by_token_index(start_idx, end_idx)
+       child_node=ExtractedNode.new(structure.meta.name, newstream, structure)
+       parent_extracted_node.add_child child_node
+       return child_node
+     end
+
+     private
+     # Locates a given label tag in a tokenstream
+     def self.skip_to_label_tag(tokenstream, name, type)
+       case type
+       when :open
+         re_index=0
+       when :closed
+         re_index=1
+       end
+       tokenstream.rewind
+       regex = self.label_regex(name.to_s)[re_index]
+       debug "Seeking #{name.to_s} of type #{type}"
+       nesting_level=0
+       tokenstream.each do |token|
+         if token.matches?(regex)
+           return tokenstream.cur_pos if nesting_level==0
+         end
+         if token.matches?(self.label_regex[0])
+           nesting_level+=1
+           debug "Encountered token \"#{token.text}\", nesting level=#{nesting_level}"
+         elsif token.matches?(self.label_regex[1])
+           nesting_level-=1
+           debug "Encountered token \"#{token.text}\", nesting level=#{nesting_level}"
+         end
+       end
+     end
+   end
+ end
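
As a rough usage sketch of the label helpers above (the example string is invented; the default 'l' namespace from the comments is assumed):

  require 'ariel'

  open_re, close_re = Ariel::LabelUtils.label_regex('title')
  labeled = "<l:title>Ariel 0.0.1</l:title> released"

  labeled =~ open_re                        # => 0, the opening tag matches
  Ariel::LabelUtils.clean_string(labeled)   # => "Ariel 0.0.1 released"
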
data/lib/ariel/learner.rb ADDED
@@ -0,0 +1,237 @@
+ module Ariel
+
+   # Implements a fairly standard separate and conquer rule learning system.
+   # Using a list of labeled examples, candidate rules are generated. A rule is
+   # refined until it covers as many as possible of the labeled examples. This
+   # rule is recorded, the covered examples are removed and the process repeats
+   # on the remaining examples. Once all examples are covered, the disjunct of
+   # all generated rules is returned.
+
+   class Learner
+     attr_accessor :current_rule, :current_seed, :candidates, :direction
+
+     # Takes a list of TokenStreams containing labels.
+     def initialize(*examples)
+       if examples.any? {|example| example.label_index.nil?}
+         raise ArgumentError, "Passed a TokenStream with no label"
+       end
+       debug "ATTENTION: New Learner instantiated with #{examples.size} labeled examples"
+       @examples=examples
+       @candidates=[]
+       set_seed
+     end
+
+     # Initiates and operates the whole rule induction process. Finds an example
+     # to use as its seed example, then finds a rule that matches the maximum
+     # number of examples correctly and fails on all others. All matched examples
+     # are then removed and the process is repeated considering all examples that
+     # remain. Returns an array of the rules found (in order).
+     def learn_rule(direction)
+       debug "Searching for a #{direction} rule"
+       @direction=direction
+       @current_rule=Rule.new(direction)
+       combined_rules=[]
+       while not @examples.empty?
+         set_seed unless @examples.include? @current_seed
+         rule = find_best_rule() # Find the rule that matches the most examples and fails on the others
+         prev_size = @examples.size
+         @examples.delete_if {|example| rule.apply_to(example)} #separate and conquer!
+         debug "Removing #{prev_size - @examples.size} examples matched by the generated rule, #{@examples.size} remain"
+         combined_rules << rule
+       end
+       # rule = order_rule(rule) #STALKER paper suggests that the generated rules should be ordered. This doesn't make sense, seeing as they are all generated based only on examples not matched by previous rules
+       debug "Generated rules: #{combined_rules.inspect}"
+       return combined_rules
+     end
+
+     # The seed example is chosen from the array of remaining examples. The
+     # LabeledStream with the fewest tokens before the labeled token is chosen.
+     def set_seed
+       sorted = @examples.sort_by {|example| example.label_index}
+       self.current_seed=sorted.first
+       debug "current_seed=#{current_seed.text}"
+       return current_seed
+     end
+
+     # Using the seed example passed to it, generates a list of initial rule
+     # candidates for further refinement and evaluation. The Token prior to the
+     # labeled token is considered, and separate rules are generated that skip_to that
+     # token's text or any of its matching wildcards.
+     def generate_initial_candidates
+       if current_seed.label_index==0
+         @candidates << Rule.new(@direction)
+       else
+         end_token=current_seed.tokens[current_seed.label_index-1]
+         debug "Creating initial candidates based on #{end_token.text}"
+         @candidates<< Rule.new(@direction, [[end_token.text]])
+         @candidates.concat(@candidates[0].generalise_feature(0))
+         debug "Initial candidates: #{@candidates.inspect} created"
+       end
+       return @candidates.size
+     end
+
+     # Equivalent of LearnDisjunct in STALKER algorithm. Generates initial
+     # candidate rules, refines, and then returns a single rule.
+     def find_best_rule
+       @candidates=[]
+       generate_initial_candidates
+       while true
+         best_refiner = get_best_refiner
+         best_solution = get_best_solution
+         @current_rule = best_refiner
+         break if perfect?(best_solution)
+         refine
+       end
+       # return post_process(best_solution)
+       debug "Rule found: #{best_solution.inspect}"
+       return best_solution
+     end
+
+     # A given rule is perfect if it successfully matches the label on at least
+     # one example and fails all others.
+     def perfect?(rule)
+       perfect_count=0
+       fail_count=0
+       @examples.each do |example|
+         if rule.matches(example, :perfect)
+           perfect_count+=1
+           debug "#{rule.inspect} matches #{example.text} perfectly"
+         elsif rule.matches(example, :fail)
+           fail_count+=1
+           debug "#{rule.inspect} fails to match #{example.text}"
+         end
+       end
+       if (perfect_count >= 1) && (fail_count == (@examples.size - perfect_count))
+         return true
+       else
+         debug "Rule was not perfect, perfect_count=#{perfect_count}, fail_count=#{fail_count}"
+         return false
+       end
+     end
+
+     # Given a list of candidate rules, uses heuristics to determine a rule
+     # considered to be the best refiner. Prefers candidate rules that have:
+     # * Larger coverage = early + correct matches.
+     # * If equal, prefer more early matches - these can be made into fails or perfect matches.
+     #   Intuitively, if there are more early matches the rule is finding features common to all documents.
+     # * If there is a tie, more failed matches wins - we want matches to fail rather than match incorrectly.
+     # * Fewer wildcards - more specific, less likely to match by chance.
+     # * Shorter unconsumed prefixes - closer to matching correctly.
+     # * Fewer tokens in SkipUntil() - perhaps because skip_until relies on slot content rather than
+     #   document structure.
+     # * Longer end landmarks - prefer "local context" landmarks.
+     def get_best_refiner
+       selector = CandidateSelector.new(@candidates, @examples)
+       selector.select_best_by_match_type :early, :perfect #Discriminate on coverage
+       selector.select_best_by_match_type :early
+       selector.select_best_by_match_type :fail
+       selector.select_with_fewer_wildcards
+       selector.select_closest_to_label
+       selector.select_with_longer_end_landmarks
+       best_refiner = selector.random_from_remaining #just pick a random one for now if still multiple
+       debug "best_refiner found => #{best_refiner.inspect}"
+       return best_refiner
+     end
+
+     # Given a list of candidate rules, use heuristics to determine the best solution. Prefers:
+     # * More correct matches
+     # * More failed matches if a tie - failed matches are preferable to incorrect matches.
+     # * Fewer tokens in SkipUntil()
+     # * Fewer wildcards
+     # * Longer end landmarks
+     # * Shorter unconsumed prefixes
+     def get_best_solution
+       selector = CandidateSelector.new(@candidates, @examples)
+       selector.select_best_by_match_type :perfect
+       selector.select_best_by_match_type :fail
+       selector.select_with_fewer_wildcards
+       selector.select_closest_to_label
+       selector.select_with_longer_end_landmarks
+       best_solution = selector.random_from_remaining
+       debug "best_solution found => #{best_solution.inspect}"
+       return best_solution
+     end
+
+     # Oversees both landmark refinements (e.g. changing skip_to("<b>") into
+     # skip_to("Price","<b>")) and topology refinements (changing skip_to(:html_tag)
+     # into a chain of skip_to() commands). Works on the current rule being
+     # generated and the example against which it is being created (the
+     # current seed).
+     def refine
+       @candidates=[]
+       current_rule.landmarks.each_with_index do |landmark, index|
+         add_new_landmarks(landmark, index) #Topology refinements
+         lengthen_landmark(landmark, index) #Landmark refinements
+       end
+       return @candidates.size
+     end
+
+     # Implements landmark refinements. Landmarks are lengthened to make them
+     # more specific.
+     # * Takes a landmark and its index in the current rule.
+     # * Applies the rule consisting of all previous landmarks in the current
+     #   rule, so the landmark can be considered in the context of the point from
+     #   which it shall be applied.
+     # * Every point at which the landmark matches after the cur_loc is considered.
+     # * Two extended landmarks are generated - a landmark that includes the
+     #   token before the match, and a landmark that includes the token after the
+     #   match.
+     # * Rules are generated incorporating these extended landmarks, including
+     #   alternative landmark extensions that use relevant wildcards.
+     def lengthen_landmark(landmark, index)
+       current_seed.rewind #In case apply_rule isn't called as index=0
+       result = @current_rule.partial(0..(index-1)).apply_to current_seed if index > 0 #Don't care about already matched tokens
+       return 0 unless result # Rule doesn't match, no point refining
+       refined_rules=[]
+       width = landmark.size
+       while current_seed.skip_to(*landmark) #Probably should stop when cur_pos > label_index
+         break if current_seed.cur_pos > current_seed.label_index
+         match_start = (current_seed.cur_pos - 1) - width #pos of first matched token
+         match_end = current_seed.cur_pos - 1 #pos of last matched token
+         preceding_token = current_seed.tokens[match_start-1]
+         trailing_token = current_seed.tokens[match_end+1]
+         front_extended_landmark = landmark.clone.insert(0, preceding_token.text) if preceding_token
+         back_extended_landmark = landmark.clone.insert(-1, trailing_token.text) if trailing_token
+         f = current_rule.deep_clone
+         b = current_rule.deep_clone
+         f.landmarks[index] = front_extended_landmark if front_extended_landmark
+         b.landmarks[index] = back_extended_landmark if back_extended_landmark
+         refined_rules << f
+         refined_rules.concat f.generalise_feature(index, 0)
+         refined_rules << b
+         refined_rules.concat b.generalise_feature(index, -1)
+       end
+       @candidates.concat refined_rules
+       debug "#{refined_rules.size} landmark refinements generated"
+       return refined_rules.size
+     end
+
+     # Implements topology refinements - new landmarks are added to the current rule.
+     # * Takes a landmark and its index in the current rule.
+     # * Applies the rule consisting of all landmarks up to and including the
+     #   current landmark to find where it matches.
+     # * Only tokens between the label_index and the position at which the partial rule matches are considered.
+     # * Tokens before the rule match location will have no effect, as adding new
+     #   landmarks before or after the current landmark will not make the rule
+     #   match any earlier.
+     # * For every token in this slice of the TokenStream, a new potential rule
+     #   is created by adding a new landmark consisting of that token. This
+     #   is also done for each of that token's matching wildcards.
+     def add_new_landmarks(landmark, index)
+       topology_refs=[]
+       start_pos = current_rule.partial(0..index).apply_to(current_seed)
+       end_pos = current_seed.label_index #No point adding tokens that occur after the label_index
+       current_seed.tokens[start_pos...end_pos].each do |token|
+         r=current_rule.deep_clone
+         r.landmarks.insert(index+1, [token.text])
+         topology_refs << r
+         topology_refs.concat r.generalise_feature(index+1)
+       end
+       debug "Topology refinements before uniq! #{topology_refs.size}"
+       topology_refs.uniq!
+       @candidates.concat topology_refs
+       debug "#{topology_refs.size} topology refinements generated"
+       return topology_refs.size
+     end
+   end
+ end
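
A hedged sketch of driving the Learner. The labeled Ariel::TokenStream objects are built by token_stream.rb and the example loader, which are not shown in this excerpt, so labeled_streams below is assumed to already exist.

  require 'ariel'

  # labeled_streams is assumed: an Array of Ariel::TokenStream objects, each
  # carrying a label_index that marks the position the rules should locate.
  start_rules = Ariel::Learner.new(*labeled_streams).learn_rule(:forward)
  end_rules   = Ariel::Learner.new(*labeled_streams).learn_rule(:back)

  # Each learn_rule call runs the separate-and-conquer loop above and returns
  # the ordered list of rules that together cover all of the given examples.
  ruleset = Ariel::RuleSet.new(start_rules, end_rules)
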
data/lib/ariel/node_like.rb ADDED
@@ -0,0 +1,26 @@
+ module Ariel
+
+   module NodeLike
+     attr_accessor :parent, :children, :meta
+
+     # Given a Node object, adds it to the hash of children keyed by the
+     # child's meta.name, and sets the child's parent to the current node.
+     # The child can later be looked up under that name.
+     def add_child(node)
+       @children[node.meta.name]=node
+       node.parent = self
+     end
+
+     def each_descendant(include_self=false)
+       if include_self
+         node_queue=[self]
+       else
+         node_queue=self.children.values
+       end
+       until node_queue.empty? do
+         node_queue.concat node_queue.first.children.values
+         yield node_queue.shift
+       end
+     end
+   end
+ end
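
A minimal sketch of the mixin in isolation; DemoNode is an invented class used only to show add_child and the breadth-first each_descendant traversal.

  require 'ariel'
  require 'ostruct'

  class DemoNode
    include Ariel::NodeLike
    def initialize(name)
      @children = {}
      @meta = OpenStruct.new(:name => name)
    end
  end

  root  = DemoNode.new(:root)
  title = DemoNode.new(:title)
  root.add_child title
  title.add_child DemoNode.new(:author)

  root.each_descendant(true) {|node| puts node.meta.name}  # root, title, author (breadth-first)
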
data/lib/ariel/rule.rb ADDED
@@ -0,0 +1,112 @@
+ module Ariel
+
+   # A rule contains an array of landmarks (each of which is an array of
+   # individual landmark features). This landmark array is accessible through
+   # Rule#landmarks. A Rule also has a direction :forward or :back, which
+   # determines whether it is applied from the end or beginning of a tokenstream.
+   class Rule
+     attr_accessor :landmarks, :direction
+     @@RuleMatchData=Struct.new(:token_loc, :type)
+
+     # A rule's direction can be :back or :forward, which determines whether it
+     # is applied from the start or end of the TokenStream. The landmark array
+     # contains an array for each landmark, which consists of one or more
+     # features. e.g. Rule.new(:forward, [[:anything, "Example"], ["Test"]]).
+     def initialize(direction, landmarks=[])
+       @landmarks=landmarks
+       raise(ArgumentError, "Not a valid direction") unless [:forward, :back].include?(direction)
+       @direction=direction
+     end
+
+     # Two rules are equal if they have the same list of landmarks and the same
+     # direction
+     def ==(rule)
+       return ((self.landmarks == rule.landmarks) && self.direction==rule.direction)
+     end
+     alias :eql? :==
+
+     def hash
+       [@landmarks, @direction].hash
+     end
+
+     # Returns a rule that contains only the given range of landmarks.
+     def partial(range)
+       return Rule.new(@direction, @landmarks[range])
+     end
+
+     def deep_clone
+       Marshal::load(Marshal.dump(self))
+     end
+
+     def generalise_feature(landmark_index, feature_index=0)
+       feature=self.landmarks[landmark_index][feature_index]
+       alternates=[]
+       Wildcards.matching(feature) do |wildcard|
+         r=self.deep_clone
+         r.landmarks[landmark_index][feature_index]=wildcard
+         alternates << r
+         yield r if block_given?
+       end
+       return alternates
+     end
+
+     # Returns the number of wildcards included as features in the list of rule
+     # landmarks.
+     def wildcard_count
+       @landmarks.flatten.select {|feature| feature.kind_of? Symbol}.size
+     end
+
+     # Applies the rule to the given TokenStream and
+     # returns nil if the match fails and the token_loc if the match succeeds.
+     # Yields a RuleMatchData Struct with accessors token_loc (the position of the match in the stream)
+     # and type if a block is given. type is nil if the TokenStream has no label,
+     # :perfect if all tokens up to the labeled token are consumed, :early if the rule's final position
+     # is before the labeled token, and :late if it is after. The returned
+     # token_loc is the position in the stream as it was passed in. That is, the
+     # token_loc is always from the left of the given stream whether it is in a
+     # reversed state or not.
+     def apply_to(tokenstream)
+       if tokenstream.reversed?
+         target=tokenstream if @direction==:back
+         target=tokenstream.reverse if @direction==:forward
+       elsif not tokenstream.reversed?
+         target=tokenstream if @direction==:forward
+         target=tokenstream.reverse if @direction==:back
+       end
+       target.rewind #rules are applied from the beginning of the stream
+       @landmarks.each do |landmark|
+         unless target.skip_to(*landmark)
+           return nil
+         end
+       end
+       token_loc=target.cur_pos
+       if @direction==:back && !tokenstream.reversed?
+         token_loc = tokenstream.reverse_pos(token_loc) #Return position from left of given stream
+       end
+       md = @@RuleMatchData.new(token_loc)
+       if target.label_index
+         idx = target.label_index
+         md.type = :perfect if token_loc == idx
+         md.type = :early if token_loc < idx
+         md.type = :late if token_loc > idx
+       end
+       yield md if block_given?
+       return token_loc
+     end
+
+     # Returns true or false depending on whether the match of this rule on the given
+     # tokenstream is of any of the given types (could be a combination of
+     # :perfect, :early, :fail and :late). Only valid on streams with labels.
+     def matches(tokenstream, *types)
+       raise ArgumentError, "No match types given" if types.empty?
+       match = nil
+       apply_to(tokenstream) {|md| match=md.type}
+       match = :fail if match.nil?
+       if types.include? match
+         return true
+       else
+         return false
+       end
+     end
+   end
+ end
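
A brief sketch of building and inspecting a Rule. The :anything wildcard is taken from the initialize comment above; applying the rule (commented out) assumes an Ariel::TokenStream built elsewhere in the gem.

  require 'ariel'

  rule = Ariel::Rule.new(:forward, [["Price", ":"], [:anything]])

  rule.wildcard_count           # => 1
  rule.partial(0..0).landmarks  # => [["Price", ":"]]

  # With a labeled Ariel::TokenStream the rule also reports its match type:
  # loc = rule.apply_to(stream) {|md| puts md.type}  # :perfect, :early or :late
  # rule.matches(stream, :perfect, :early)           # => true or false
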
data/lib/ariel/rule_set.rb ADDED
@@ -0,0 +1,34 @@
+ module Ariel
+
+   # A RuleSet acts as a container for a StructureNode's start and end rules.
+   # These are stored as an ordered array and are applied in turn until there is
+   # a successful match. A RuleSet takes responsibility for applying start and
+   # end rules to extract an ExtractedNode.
+   class RuleSet
+     def initialize(start_rules, end_rules)
+       @start_rules=start_rules
+       @end_rules=end_rules
+     end
+
+     def apply_to(tokenstream)
+       start_idx=nil
+       end_idx=nil
+       @start_rules.each do |rule|
+         start_idx=rule.apply_to tokenstream
+         break if start_idx
+       end
+       @end_rules.each do |rule|
+         end_idx=rule.apply_to tokenstream
+         break if end_idx
+       end
+       if start_idx && end_idx
+         debug "RuleSet matched with start_idx=#{start_idx} and end_idx=#{end_idx}"
+         return nil if end_idx < start_idx
+         return tokenstream.slice_by_token_index(start_idx, end_idx)
+       else
+         debug "No valid match was found"
+         return nil
+       end
+     end
+   end
+ end
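
Putting the two pieces together, a hedged sketch: the landmark text is invented, and stream stands for an Ariel::TokenStream over the target document.

  require 'ariel'

  start_rules = [Ariel::Rule.new(:forward, [["Title", ":"]])]
  end_rules   = [Ariel::Rule.new(:back,    [["Description"]])]
  ruleset     = Ariel::RuleSet.new(start_rules, end_rules)

  # The first start rule and first end rule that match are used to slice out
  # the region between them; nil is returned if either list fails to match.
  # extracted_stream = ruleset.apply_to(stream)
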
data/lib/ariel/structure_node.rb ADDED
@@ -0,0 +1,75 @@
+ module Ariel
+   require 'ostruct'
+
+   # Implements a Node object used to represent the structure of the document
+   # tree. Each node stores start and end rules to extract the desired content
+   # from its parent node. Could be viewed as a rule-storing object.
+   class StructureNode
+     include NodeLike
+     attr_accessor :ruleset
+     def initialize(name=:root, type=:not_list, &block)
+       @children={}
+       @meta = OpenStruct.new({:name=>name, :node_type=>type})
+       yield self if block_given?
+     end
+
+     # Used to extend an already created Node. e.g.
+     #   node.extend_structure do |r|
+     #     r.new_field1
+     #     r.new_field2
+     #   end
+     def extend_structure(&block)
+       yield self if block_given?
+     end
+
+     # Given a Node to apply its rules to, this function will create a new node
+     # and add it as a child of the given node. For StructureNodes of :list type,
+     # the list is extracted and so are each of the list items. In this case,
+     # only the list items are yielded.
+     def extract_from(node)
+       # Will be reimplemented to return an array of extracted items
+       newstream = @ruleset.apply_to(node.tokenstream)
+       extracted_node = ExtractedNode.new(meta.name, newstream, self)
+       node.add_child extracted_node if newstream
+       if self.meta.node_type == :list
+         #Do stuff
+       end
+       return extracted_node
+     end
+
+     # Applies the extraction rules stored in the current StructureNode and all its
+     # descendant children.
+     def apply_extraction_tree_on(root_node, extract_labels=false)
+       extraction_queue = [root_node]
+       until extraction_queue.empty? do
+         new_parent = extraction_queue.shift
+         new_parent.meta.structure.children.values.each do |child|
+           if extract_labels
+             extracted_node=LabelUtils.extract_labeled_region(child, new_parent)
+           else
+             extracted_node=child.extract_from(new_parent)
+           end
+           extraction_queue.push(extracted_node) if extracted_node
+         end
+       end
+       return root_node
+     end
+
+     def item(name, &block)
+       self.add_child(StructureNode.new(name, &block))
+     end
+
+     def list_item(name, &block)
+       self.add_child(StructureNode.new(name, :list, &block))
+     end
+
+     def method_missing(method, *args, &block)
+       if @children.has_key? method
+         @children[method]
+       else
+         super
+       end
+     end
+   end
+ end
+
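
The block-based structure DSL above can be used roughly like this; the field names are invented and the later step of learning and assigning each node's ruleset is omitted (compare the structure.rb files under data/examples/).

  require 'ariel'

  structure = Ariel::StructureNode.new do |r|
    r.item :title
    r.item :body do |b|
      b.list_item :comments
    end
  end

  structure.title                         # children are reachable by name via method_missing
  structure.body.comments.meta.node_type  # => :list
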
data/lib/ariel/token.rb ADDED
@@ -0,0 +1,68 @@
+ module Ariel
+
+   # Tokens populate a TokenStream. They know their position in the original
+   # document, can list the wildcards that match them and determine whether a
+   # given string or wildcard is a valid match. During the process of parsing a
+   # labeled document, some tokens may be marked as being a label_tag. These are
+   # filtered from the TokenStream before the rule learning phase.
+   class Token
+     attr_reader :text, :start_loc, :end_loc
+
+     # Each new Token must have a string representing its content, its start position in the
+     # original document (start_loc) and the point at which it ends (end_loc).
+     # For instance, in str="This is an example", if "is" were to be made a
+     # Token it would be given a start_loc of 5 and an end_loc of 7, which is
+     # str[5...7]
+     def initialize(text, start_loc, end_loc, label_tag=false)
+       @text=text.to_s
+       @start_loc=start_loc
+       @end_loc=end_loc
+       @label_tag=label_tag
+     end
+
+     # Returns true or false depending on whether the token was marked as a label
+     # tag when it was initialized.
+     def is_label_tag?
+       @label_tag
+     end
+
+     # Tokens are only equal if they have an equal start_loc, end_loc and text.
+     def ==(t)
+       return (@start_loc==t.start_loc && @end_loc==t.end_loc && @text==t.text)
+     end
+
+     # Tokens are sorted based on their start_loc
+     def <=>(t)
+       @start_loc <=> t.start_loc
+     end
+
+     # Accepts a string, a Regexp, or a symbol representing a wildcard in
+     # Wildcards#list. Returns true if the whole Token is consumed by the wildcard or the
+     # string is equal to Token#text, and false if the match fails. Raises an
+     # error if the passed symbol is not a member of Wildcards#list.
+     def matches?(landmark)
+       if landmark.kind_of? Symbol or landmark.kind_of? Regexp
+         if landmark.kind_of? Symbol
+           raise ArgumentError, "#{landmark} is not a valid wildcard." unless Wildcards.list.has_key? landmark
+           regex = Wildcards.list[landmark]
+         else
+           regex = landmark
+         end
+         if self.text[regex] == self.text
+           return true
+         else
+           return false
+         end
+       else
+         return true if landmark==self.text
+       end
+       return false
+     end
+
+     # Returns an array of symbols corresponding to the Wildcards that match the
+     # Token.
+     def matching_wildcards
+       return Wildcards.matching(self.text)
+     end
+   end
+ end
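
A short sketch of Token in use; the string and offsets follow the initialize comment above, and :html_tag is the wildcard name mentioned in learner.rb.

  require 'ariel'

  str   = "This is an example"
  token = Ariel::Token.new("is", 5, 7)   # str[5...7]

  token.matches?("is")        # => true, exact text match
  token.matches?(/i./)        # => true, the regex consumes the whole token text
  token.matches?(:html_tag)   # => false, "is" is not consumed by that wildcard
  token.matching_wildcards    # => symbols from Wildcards#list whose regex consumes "is"
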