ariel 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. data/README +49 -83
  2. data/bin/ariel +29 -20
  3. data/examples/google_calculator/structure.rb +2 -2
  4. data/examples/google_calculator/structure.yaml +13 -15
  5. data/examples/raa/labeled/highline.html +5 -4
  6. data/examples/raa/labeled/mongrel.html +9 -8
  7. data/examples/raa/structure.rb +4 -2
  8. data/examples/raa/structure.yaml +94 -78
  9. data/lib/ariel.rb +71 -33
  10. data/lib/ariel/{candidate_selector.rb → candidate_refiner.rb} +39 -38
  11. data/lib/ariel/label_utils.rb +46 -18
  12. data/lib/ariel/labeled_document_loader.rb +77 -0
  13. data/lib/ariel/learner.rb +60 -38
  14. data/lib/ariel/log.rb +67 -0
  15. data/lib/ariel/node.rb +52 -0
  16. data/lib/ariel/node/extracted.rb +90 -0
  17. data/lib/ariel/node/structure.rb +91 -0
  18. data/lib/ariel/rule.rb +114 -32
  19. data/lib/ariel/rule_set.rb +34 -15
  20. data/lib/ariel/token.rb +9 -3
  21. data/lib/ariel/token_stream.rb +32 -17
  22. data/lib/ariel/wildcards.rb +19 -15
  23. data/test/fixtures.rb +45 -3
  24. data/test/specs/candidate_refiner_spec.rb +48 -0
  25. data/test/specs/label_utils_spec.rb +97 -0
  26. data/test/specs/learner_spec.rb +39 -0
  27. data/test/specs/node_extracted_spec.rb +90 -0
  28. data/test/specs/node_spec.rb +76 -0
  29. data/test/specs/node_structure_spec.rb +74 -0
  30. data/test/specs/rule_set_spec.rb +85 -0
  31. data/test/specs/rule_spec.rb +110 -0
  32. data/test/specs/token_stream_spec.rb +100 -7
  33. metadata +21 -28
  34. data/lib/ariel/example_document_loader.rb +0 -59
  35. data/lib/ariel/extracted_node.rb +0 -20
  36. data/lib/ariel/node_like.rb +0 -26
  37. data/lib/ariel/structure_node.rb +0 -75
  38. data/test/ariel_test_case.rb +0 -15
  39. data/test/test_candidate_selector.rb +0 -58
  40. data/test/test_example_document_loader.rb +0 -7
  41. data/test/test_label_utils.rb +0 -15
  42. data/test/test_learner.rb +0 -38
  43. data/test/test_rule.rb +0 -38
  44. data/test/test_structure_node.rb +0 -81
  45. data/test/test_token.rb +0 -16
  46. data/test/test_token_stream.rb +0 -82
  47. data/test/test_wildcards.rb +0 -18
@@ -0,0 +1,67 @@
1
+ require 'singleton'
2
+
3
+ module Ariel
4
+
5
# Very simple Log class. By default it outputs to stdout and ignores messages
# below the :info level (:debug when $DEBUG is set). All state is kept on the
# class itself (its eigenclass); the Singleton instance exists only so that the
# defaults are applied on first use. Use Log.set_level to lower/raise the
# logging level.
class Log
  include Singleton

  SEVERITY={:debug=>0, :info=>1, :warn=>2, :error=>3}

  # Runs once, when the Singleton instance is first requested. Fills in the
  # default output (stdout) and level (:debug if $DEBUG is set, :info if not)
  # without overwriting anything that was configured before the first message
  # was logged.
  def initialize
    self.class.send(:apply_defaults)
  end

  class << self
    # Defines Log.debug, Log.info, Log.warn and Log.error. Each forwards its
    # message to Log.log with the matching level and returns what Log.log
    # returns.
    SEVERITY.keys.each do |level|
      define_method(level) {|message| log message, level}
    end

    # Set the log level to the given key from the SEVERITY constant. Raises
    # ArgumentError for any other value. Returns the level.
    def set_level(level)
      raise ArgumentError, "Invalid log level given" unless SEVERITY.has_key? level
      @log_level=level
    end

    # Returns the level last set, or nil if no level has been set and nothing
    # has been logged yet.
    def current_level
      @log_level
    end

    # Sends all output to stdout (the default).
    def output_to_stdout
      @output=:stdout
    end

    # Sends all output to a file called debug.log in the current directory.
    def output_to_file
      @output=:file
    end

    # Not intended to be used directly, preferred to use the methods
    # corresponding to different severity levels. Returns the formatted
    # message ("level: message") if it was at or above the current level, and
    # nil if it was filtered out.
    def log(message, level)
      instance # makes sure the defaults are in place, even on a direct call
      return nil unless SEVERITY[@log_level] <= SEVERITY[level]
      message = "#{level}: #{message}"
      if @output==:file
        # Append mode, so earlier messages are kept
        File.open('debug.log', 'ab') {|f| f.puts message }
      elsif @output==:stdout
        puts message
      end
      return message
    end

    private

    # Applies the default output and level, leaving already configured values
    # untouched.
    def apply_defaults
      @output ||= :stdout
      @log_level ||= ($DEBUG ? :debug : :info)
    end
  end
end
67
+ end
@@ -0,0 +1,52 @@
1
+ module Ariel
2
+
3
# A generic Node object. As an end user, you have no need to use this. All
# children are stored in a hash keyed by their node_name. #id and #type are
# undefined so they can be used freely as child names in a Node::Structure.
class Node
  # undef_method raises NameError for a method that isn't defined, so only
  # remove the ones the running Ruby actually provides.
  removed_methods=[:id, :type]
  removed_methods.each do |meth|
    undef_method meth if method_defined?(meth) || private_method_defined?(meth)
  end
  attr_accessor :parent, :children, :node_name

  # If the name is a string, it's converted to a symbol. If not it's just
  # stored as is.
  def initialize(name)
    @children={}
    if name.kind_of? String
      @node_name=name.to_sym
    else
      @node_name=name
    end
  end

  # Given a Node object, stores it in the children hash under its node_name
  # (replacing any existing child of that name), sets its parent to the
  # current node, and defines an accessor method on this node named after the
  # child.
  def add_child(node)
    @children[node.node_name]=node
    node.parent = self
    # Trick stolen from OpenStruct: define the accessor on this object's
    # singleton class only, so other nodes are unaffected.
    meta = class << self; self; end
    meta.send(:define_method, node.node_name.to_s.to_sym) {@children[node.node_name]}
  end

  # Yields each descendant node, parents before their children. If passed
  # true will also yield itself (first).
  def each_descendant(include_self=false)
    if include_self
      node_queue=[self]
    else
      node_queue=self.children.values
    end
    until node_queue.empty? do
      # Queue the children of the node at the front before handing it out
      node_queue.concat node_queue.first.children.values
      yield node_queue.shift
    end
  end

  # Short description: class name, node_name, the parent's node_name (or nil)
  # and the names of the children.
  def inspect
    ["#{self.class.name} - node_name=#{self.node_name.inspect};",
     "parent=#{self.parent ? self.parent.node_name.inspect : nil.inspect };",
     "children=#{self.children.keys.inspect};"].join ' '
  end
end
52
+ end
@@ -0,0 +1,90 @@
1
+ require 'ariel/node'
2
+
3
+ module Ariel
4
+
5
+ # Each Node::Extracted has a name, a TokenStream and a structure_node which points to
6
+ # the relevant Node::Structure. Skip straight to #search, #/ and #at for the
7
+ # query interface. This is strongly recommended over using the built in method
8
+ # accessors (a method isn't defined if a given field isn't extracted, so
9
+ # you're going to have to catch a lot of potential errors).
10
+ class Node::Extracted < Node
11
+ attr_accessor :tokenstream, :structure_node
12
+
13
+ def initialize(name, tokenstream, structure) # name is handed to Node#initialize; structure is stored as structure_node
14
+ super(name)
15
+ @structure_node=structure
16
+ @tokenstream=tokenstream
17
+ end
18
+
19
+ # Returns the text contained in the TokenStream (delegates to tokenstream.text).
20
+ def extracted_text
21
+ tokenstream.text
22
+ end
23
+
24
+ # Index based accessor for the Node::Extracted's children. Supports Range objects,
25
+ # which are expanded to the keys they cover. Children that don't exist are dropped.
26
+ # Intended behaviour: Node::Extracted#[0..0] will return an array, while
27
+ # Node::Extracted#[0] will not, the same as Ruby's standard Array class.
28
+ # NOTE(review): the single-result case relies on what `return *result` yields, which differs between Ruby versions - confirm.
29
+ def [](*args)
30
+ dont_splat=false #determines whether to splat or not if there is only a single result
31
+ args.collect! do |arg|
32
+ if arg.kind_of? Range
33
+ arg=arg.to_a
34
+ dont_splat=true
35
+ end
36
+ arg
37
+ end
38
+ args.flatten!
39
+ dont_splat=true if args.size > 1
40
+ result=@children.values_at(*args).compact
41
+ if result.size==1 && dont_splat==true
42
+ return result
43
+ else
44
+ return *result
45
+ end
46
+ end
47
+
48
+ # The preferred way of querying extracted information. If nothing was
49
+ # extracted, an empty array is returned. This is much safer than using
50
+ # Node::Extracted accessors. Consider if your code is reading
51
+ # doc.address.phone_number.area_code - this will raise an error if any one of
52
+ # these were not extracted. (doc/'address/phone_number/area_code') is
53
+ # preferred. Numbered list_items can be queried e.g. (doc/'comment_list/2'),
54
+ # and basic globbing is supported: (doc/'*/*/title').
55
+ def search(search_string)
56
+ queue=search_string.split '/'
57
+ current_term=queue.shift
58
+ return [self] if current_term.nil? #If for some reason nothing is given in the search string
59
+ matches=[]
60
+ if current_term=='*'
61
+ new_matches=self.children.values
62
+ new_matches.sort! {|a, b| a.node_name <=> b.node_name} rescue nil #any error raised while sorting is swallowed by the rescue modifier
63
+ matches.concat new_matches
64
+ elsif current_term[/\d+/]==current_term #term made up only of digits: look the child up by integer key
65
+ matches << @children[current_term.to_i]
66
+ else
67
+ matches << @children[current_term.to_sym]
68
+ end
69
+ if queue.empty?
70
+ return matches.flatten.compact
71
+ else
72
+ return matches.collect {|match| match.search(queue.join('/'))}.flatten.compact
73
+ end
74
+ end
75
+ alias :/ :search
76
+
77
+ # Acts exactly like #search, but returns only the first match or nil if
78
+ # there are no matches.
79
+ def at(search_string)
80
+ self.search(search_string).first
81
+ end
82
+
83
+ def inspect # Node#inspect plus the structure node's name and the extracted text, cut to text[0..100] followed by '...' when longer than 100
84
+ [super,
85
+ "structure_node=#{self.structure_node.node_name.inspect};",
86
+ "extracted_text=\"#{text=self.extracted_text; text.size > 100 ? text[0..100]+'...' : text}\";"
87
+ ].join ' '
88
+ end
89
+ end
90
+ end
@@ -0,0 +1,91 @@
1
+ require 'ariel/node'
2
+
3
+ module Ariel
4
+
5
+ # Implements a Node object used to represent the structure of the document
5
+ # tree. Each node stores a ruleset used to extract the desired content
6
+ # from its parent node. Could be viewed as a rule-storing object.
7
+ class Node::Structure < Node
8
+ attr_accessor :ruleset, :node_type
9
+
10
+ def initialize(name=:root, type=:not_list, &block) # yields the new node to the block, if one is given, so children can be declared inline
11
+ super(name)
12
+ @node_type=type
13
+ yield self if block_given?
14
+ end
15
+
16
+ # Used to extend an already created Node. e.g.
17
+ # node.extend_structure do |r|
18
+ # r.item :new_field1
19
+ # r.item :new_field2
20
+ # end
21
+ def extend_structure(&block)
22
+ yield self if block_given?
23
+ end
24
+
25
+ # Given a Node to apply its rules to, this function will create a new Node::Extracted
26
+ # for each stream the ruleset yields and add it as a child of the given node. It returns an array of the
27
+ # extracted nodes (empty if no ruleset is set). List items are named 0, 1, 2...; anything else takes this node's name.
28
+ def extract_from(node)
29
+ extractions=[]
30
+ i=0
31
+ return extractions if @ruleset.nil? #no extractions if no rule has been learnt
32
+ @ruleset.apply_to(node.tokenstream) do |newstream|
33
+ if self.node_type==:list_item
34
+ new_node_name=i
35
+ i+=1
36
+ else
37
+ new_node_name=@node_name
38
+ end
39
+ extracted_node = Node::Extracted.new(new_node_name, newstream, self)
40
+ node.add_child extracted_node
41
+ extractions << extracted_node
42
+ end
43
+ return extractions
44
+ end
45
+
46
+ # Starting at root_node, extracts every child of each node's structure_node and then processes the newly extracted nodes in turn (first in, first out).
47
+ # With extract_labels true, LabelUtils.extract_labeled_region is used instead of the children's own extract_from. Returns root_node.
48
+ def apply_extraction_tree_on(root_node, extract_labels=false)
49
+ extraction_queue = [root_node]
50
+ until extraction_queue.empty? do
51
+ new_parent = extraction_queue.shift
52
+ new_parent.structure_node.children.values.each do |child|
53
+ if extract_labels
54
+ extractions=LabelUtils.extract_labeled_region(child, new_parent)
55
+ else
56
+ extractions=child.extract_from(new_parent)
57
+ end
58
+ extractions.each {|extracted_node| extraction_queue.push extracted_node}
59
+ end
60
+ end
61
+ return root_node
62
+ end
63
+
64
+ # Use when defining any object that occurs once. #list is a synonym, but
65
+ # it's recommended you use it when defining a container for list_items. The
66
+ # children of a list_item are just items. e.g.
67
+ # structure = Ariel::Node::Structure.new do |r|
68
+ # r.list :comments do |c| # r.item :comments would be equivalent, but less readable
69
+ # c.list_item :comment do |c|
70
+ # c.item :author # Now these are just normal items, as they are extracted once from their parent
71
+ # c.item :date
72
+ # c.item :body
73
+ # end
74
+ # end
75
+ # end
76
+ def item(name, &block)
77
+ self.add_child(Node::Structure.new(name, &block))
78
+ end
79
+ # Extracting a list is really the same as extracting a normal item, but
80
+ # people probably still prefer to call a list a list.
81
+ alias :list :item
82
+
83
+ # See the docs for #item for a discussion of when to use #item and when to
84
+ # use #list_item. The child is created with node_type :list_item.
85
+ def list_item(name, &block)
86
+ self.add_child(Node::Structure.new(name, :list_item, &block))
87
+ end
88
+ end
90
+ end
91
+
@@ -5,17 +5,23 @@ module Ariel
5
5
  # Rule#landmarks. A Rule also has a direction :forward or :back, which
6
6
  # determines whether it is applied from the end or beginning of a tokenstream.
7
7
  class Rule
8
- attr_accessor :landmarks, :direction
8
+ attr_accessor :landmarks, :direction, :exhaustive
9
9
  @@RuleMatchData=Struct.new(:token_loc, :type)
10
-
10
+ @@cache={}
11
+
11
12
  # A rule's direction can be :back or :forward, which determines whether it
12
13
  # is applied from the start or end of the TokenStream. The landmark array
13
14
  # contains an array for each landmark, which consists of one or more
14
- # features. e.g. Rule.new(:forward, [[:anything, "Example"], ["Test"]]).
15
- def initialize(direction, landmarks=[])
15
+ # features. e.g. Rule.new([[:anything, "Example"], ["Test"]], :forward).
16
+ def initialize(landmarks, direction, exhaustive=false)
16
17
  @landmarks=landmarks
17
18
  raise(ArgumentError, "Not a valid direction") unless [:forward, :back].include?(direction)
18
19
  @direction=direction
20
+ @exhaustive=exhaustive
21
+ end
22
+
23
+ def exhaustive?
24
+ @exhaustive
19
25
  end
20
26
 
21
27
  # Two rules are equal if they have the same list of landmarks and the same
@@ -26,12 +32,12 @@ module Ariel
26
32
  alias :eql? :==
27
33
 
28
34
  def hash
29
- [@landmarks, @direction].hash
35
+ [@landmarks, @direction, @exhaustive].hash
30
36
  end
31
37
 
32
38
  # Returns a new rule built from the given range of this rule's landmarks.
33
39
  def partial(range)
34
- return Rule.new(@direction, @landmarks[range])
40
+ return Rule.new(@landmarks[range], @direction)
35
41
  end
36
42
 
37
43
  def deep_clone
@@ -57,7 +63,7 @@ module Ariel
57
63
  end
58
64
 
59
65
  # Given a TokenStream and a rule, applies the rule on the stream and
60
- # returns nil if the match fails and the token_loc if the match succeeds.
66
+ # returns an empty array if the match fails and an array of token_locs if the match succeeds.
61
67
  # Yields a RuleMatchData Struct with accessors token_loc (the position of the match in the stream)
62
68
  # and type if a block is given. type is nil if the TokenStream has no label,
63
69
  # :perfect if all tokens up to the labeled token are consumed, :early if the rule's final position
@@ -65,33 +71,23 @@ module Ariel
65
71
  # token_loc is the position in the stream as it was passed in. That is, the
66
72
  # token_loc is always from the left of the given stream whether it is in a
67
73
  # reversed state or not.
68
- def apply_to(tokenstream)
69
- if tokenstream.reversed?
70
- target=tokenstream if @direction==:back
71
- target=tokenstream.reverse if @direction==:forward
72
- elsif not tokenstream.reversed?
73
- target=tokenstream if @direction==:forward
74
- target=tokenstream.reverse if @direction==:back
75
- end
76
- target.rewind #rules are applied from the beginning of the stream
77
- @landmarks.each do |landmark|
78
- unless target.skip_to(*landmark)
79
- return nil
74
+ def apply_to(tokenstream)
75
+ target=self.class.prepare_tokenstream(tokenstream, @direction)
76
+ cache_check=@@cache[[tokenstream.cache_hash, self.hash]]
77
+ if cache_check
78
+ token_locs=cache_check
79
+ else
80
+ token_locs=[]
81
+ while result=seek_landmarks(target)
82
+ token_locs << correct_match_location(tokenstream, result)
83
+ break unless exhaustive?
80
84
  end
85
+ @@cache[[tokenstream.cache_hash, self.hash]]=token_locs
81
86
  end
82
- token_loc=target.cur_pos
83
- if @direction==:back && !tokenstream.reversed?
84
- token_loc = tokenstream.reverse_pos(token_loc) #Return position from left of given stream
85
- end
86
- md = @@RuleMatchData.new(token_loc)
87
- if target.label_index
88
- idx = target.label_index
89
- md.type = :perfect if token_loc == idx
90
- md.type = :early if token_loc < idx
91
- md.type = :late if token_loc > idx
87
+ if block_given?
88
+ generate_match_data(target, token_locs).each {|md| yield md}
92
89
  end
93
- yield md if block_given?
94
- return token_loc
90
+ return token_locs
95
91
  end
96
92
 
97
93
  # Returns true or false depending on if the match of this rule on the given
@@ -99,8 +95,9 @@ module Ariel
99
95
  # :perfect, :early, :fail and :late). Only valid on streams with labels
100
96
  def matches(tokenstream, *types)
101
97
  raise ArgumentError, "No match types given" if types.empty?
98
+ raise ArgumentError, "Only applicable to tokenstreams containing a label" if tokenstream.label_index.nil?
102
99
  match = nil
103
- apply_to(tokenstream) {|md| match=md.type}
100
+ apply_to(tokenstream) {|md| match=md.type if md.type;}
104
101
  match = :fail if match.nil?
105
102
  if types.include? match
106
103
  return true
@@ -108,5 +105,90 @@ module Ariel
108
105
  return false
109
106
  end
110
107
  end
108
+
109
# Only used in rule learning on labeled tokenstreams. Needed to provide the
# match index most relevant to the currently labeled list item. Applies the
# rule to the tokenstream and returns the token_loc closest to the stream's
# label_index, or nil if there is no suitable match. A preference of :early
# or :late can be passed, which will only return a token_loc at or before the
# stream's label_index, or at or after it, respectively.
def closest_match(tokenstream, preference=:none)
  token_locs=apply_to(tokenstream)
  # The preference has to be handed on, otherwise it is silently ignored
  return find_closest_match(token_locs, tokenstream.label_index, preference)
end
117
+
118
+ # Reverses the given tokenstream if necessary based on its current direction, and
119
+ # the direction given (corresponding to the sort of rule you hope to apply
120
+ # to it): a reversed stream is used as is for :back, an unreversed one for :forward. Returns the resulting stream, rewound.
121
+ def self.prepare_tokenstream(tokenstream, direction)
122
+ if tokenstream.reversed?
123
+ target=tokenstream if direction==:back
124
+ target=tokenstream.reverse if direction==:forward
125
+ elsif not tokenstream.reversed?
126
+ target=tokenstream if direction==:forward
127
+ target=tokenstream.reverse if direction==:back
128
+ end
129
+ target.rewind #rules are applied from the beginning of the stream
130
+ return target
131
+ end
132
+
133
+ private
134
+
135
+ # Finds the sequence of landmarks contained in the Rule instance in the
136
+ # given tokenstream, calling skip_to once per landmark. The logic of reversing or rewinding the stream if necessary
137
+ # is left to the method that uses it. Returns nil as soon as one landmark can't be skipped to, otherwise the position (cur_pos) from the
138
+ # beginning of whatever tokenstream it was passed. This location should be
139
+ # corrected by correct_match_location
140
+ def seek_landmarks(tokenstream)
141
+ @landmarks.each do |landmark|
142
+ unless tokenstream.skip_to(*landmark)
143
+ return nil
144
+ end
145
+ end
146
+ return tokenstream.cur_pos
147
+ end
148
+
149
+ # Takes the original tokenstream passed to apply_to and reverses the match
150
+ # location (via reverse_pos) if required, so the match location returned to the user will be
151
+ # the index from the left of the passed tokenstream. It is left unchanged when the stream's reversed state already matches the rule's direction.
152
+ def correct_match_location(tokenstream, match_loc)
153
+ if tokenstream.reversed?
154
+ result=match_loc if @direction==:back
155
+ result=tokenstream.reverse_pos(match_loc) if @direction==:forward
156
+ elsif not tokenstream.reversed?
157
+ result=match_loc if @direction==:forward
158
+ result=tokenstream.reverse_pos(match_loc) if @direction==:back
159
+ end
160
+ return result
161
+ end
162
+
163
+ def generate_match_data(tokenstream, token_locs) # returns an array with one RuleMatchData per token_loc, in the same order
164
+ result=[] # NOTE(review): apply_to passes its prepared (possibly reversed) stream here - confirm its label_index is comparable with token_locs
165
+ if tokenstream.label_index
166
+ closest_match=find_closest_match(token_locs, tokenstream.label_index)
167
+ end
168
+ token_locs.each do |token_loc|
169
+ md = @@RuleMatchData.new(token_loc)
170
+ if tokenstream.label_index && token_loc==closest_match #only the match closest to the label is given a type; all others keep type nil
171
+ idx = tokenstream.label_index
172
+ md.type = :perfect if token_loc == idx
173
+ md.type = :early if token_loc < idx
174
+ md.type = :late if token_loc > idx
175
+ end
176
+ result << md
177
+ end
178
+ return result
179
+ end
180
+
181
# Returns the entry of token_locs that is nearest to label_index, or nil if
# no entry qualifies. With preference :early only locations at or before
# label_index are considered, with :late only locations at or after it; any
# other preference considers all of them.
def find_closest_match(token_locs, label_index, preference=:none)
  if preference==:early
    token_locs = token_locs.reject {|token_loc| token_loc > label_index}
  elsif preference==:late
    # Must be a comparison: a bitwise OR here is always truthy and would
    # throw away every location.
    token_locs = token_locs.reject {|token_loc| token_loc < label_index}
  end
  token_locs.sort_by {|token_loc| (label_index-token_loc).abs}.first
end
189
+
190
+ def self.clear_cache # empties the class-wide @@cache in which apply_to stores its token_locs; defined on the class itself, so the bare `private` above does not apply to it
191
+ @@cache.clear
192
+ end
111
193
  end
112
194
  end