ariel 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. data/README +49 -83
  2. data/bin/ariel +29 -20
  3. data/examples/google_calculator/structure.rb +2 -2
  4. data/examples/google_calculator/structure.yaml +13 -15
  5. data/examples/raa/labeled/highline.html +5 -4
  6. data/examples/raa/labeled/mongrel.html +9 -8
  7. data/examples/raa/structure.rb +4 -2
  8. data/examples/raa/structure.yaml +94 -78
  9. data/lib/ariel.rb +71 -33
  10. data/lib/ariel/{candidate_selector.rb → candidate_refiner.rb} +39 -38
  11. data/lib/ariel/label_utils.rb +46 -18
  12. data/lib/ariel/labeled_document_loader.rb +77 -0
  13. data/lib/ariel/learner.rb +60 -38
  14. data/lib/ariel/log.rb +67 -0
  15. data/lib/ariel/node.rb +52 -0
  16. data/lib/ariel/node/extracted.rb +90 -0
  17. data/lib/ariel/node/structure.rb +91 -0
  18. data/lib/ariel/rule.rb +114 -32
  19. data/lib/ariel/rule_set.rb +34 -15
  20. data/lib/ariel/token.rb +9 -3
  21. data/lib/ariel/token_stream.rb +32 -17
  22. data/lib/ariel/wildcards.rb +19 -15
  23. data/test/fixtures.rb +45 -3
  24. data/test/specs/candidate_refiner_spec.rb +48 -0
  25. data/test/specs/label_utils_spec.rb +97 -0
  26. data/test/specs/learner_spec.rb +39 -0
  27. data/test/specs/node_extracted_spec.rb +90 -0
  28. data/test/specs/node_spec.rb +76 -0
  29. data/test/specs/node_structure_spec.rb +74 -0
  30. data/test/specs/rule_set_spec.rb +85 -0
  31. data/test/specs/rule_spec.rb +110 -0
  32. data/test/specs/token_stream_spec.rb +100 -7
  33. metadata +21 -28
  34. data/lib/ariel/example_document_loader.rb +0 -59
  35. data/lib/ariel/extracted_node.rb +0 -20
  36. data/lib/ariel/node_like.rb +0 -26
  37. data/lib/ariel/structure_node.rb +0 -75
  38. data/test/ariel_test_case.rb +0 -15
  39. data/test/test_candidate_selector.rb +0 -58
  40. data/test/test_example_document_loader.rb +0 -7
  41. data/test/test_label_utils.rb +0 -15
  42. data/test/test_learner.rb +0 -38
  43. data/test/test_rule.rb +0 -38
  44. data/test/test_structure_node.rb +0 -81
  45. data/test/test_token.rb +0 -16
  46. data/test/test_token_stream.rb +0 -82
  47. data/test/test_wildcards.rb +0 -18
@@ -0,0 +1,67 @@
1
+ require 'singleton'
2
+
3
module Ariel

  # Very simple Log class. By default outputs to stdout and ignores messages
  # below :info level. Should probably get rid of the usage of Singleton as it's
  # used very little, with the class's eigenclass/singleton class used mostly
  # for the same purpose. Use Log.set_level to lower/raise the logging level.
  class Log
    include Singleton

    SEVERITY={:debug=>0, :info=>1, :warn=>2, :error=>3}

    # Runs once, when the singleton instance is first requested. Fills in the
    # defaults (output to stdout, level :debug if $DEBUG is set and :info if
    # not), but only for settings that have not been chosen yet, so a
    # Log.set_level or Log.output_to_file call made before the first message
    # is logged is not silently overwritten.
    def initialize
      self.class.output_to_stdout if self.class.current_output.nil?
      if self.class.current_level.nil?
        self.class.set_level($DEBUG ? :debug : :info)
      end
    end

    class << self
      # Defines Log.debug, Log.info, Log.warn and Log.error, each of which
      # logs the given message at that severity.
      SEVERITY.keys.each do |level|
        define_method(level) {|message| log message, level}
      end

      # Set the log level to the given key from the SEVERITY constant.
      # Raises ArgumentError for any other value.
      def set_level(level)
        if SEVERITY.has_key? level
          @log_level=level
        else
          raise ArgumentError, "Invalid log level given"
        end
      end

      # The currently configured level, or nil if none has been set yet.
      def current_level
        @log_level
      end

      # The currently configured output (:stdout or :file), or nil if none
      # has been set yet.
      def current_output
        @output
      end

      # Sends all output to stdout.
      def output_to_stdout
        @output=:stdout
      end

      # Sends all output to a file called debug.log in the current directory.
      def output_to_file
        @output=:file
      end

      # Not intended to be used directly, preferred to use the methods
      # corresponding to different severity levels. Returns the formatted
      # message if it was logged and nil if it was below the current level.
      def log(message, level)
        instance # guarantees the defaults are in place before comparing levels
        if SEVERITY[@log_level] <= SEVERITY[level]
          message = "#{level}: #{message}"
          if @output==:file
            File.open('debug.log', 'ab') {|f| f.puts message }
          elsif @output==:stdout
            puts message
          end
          return message
        end
        return nil
      end
    end
  end
end
@@ -0,0 +1,52 @@
1
module Ariel

  # A generic Node object. As an end user, you have no need to use this. All
  # children are stored in a hash keyed by node name. #id and #type are
  # undefined so they can be used freely as part of a Node::Structure
  class Node
    # Object#id and Object#type only exist on old Rubies; undef_method raises
    # NameError for a method that is not defined, so only remove what is there.
    [:id, :type].each do |meth|
      undef_method meth if method_defined? meth
    end
    attr_accessor :parent, :children, :node_name

    # If the name is a string, it's converted to a symbol. If not it's just
    # stored as is.
    def initialize(name)
      @children={}
      if name.kind_of? String
        @node_name=name.to_sym
      else
        @node_name=name
      end
    end

    # Given a Node object, stores it in the hash of children under its
    # node_name, sets its parent to the current node, and defines an accessor
    # method on this node named after the child.
    def add_child(node)
      @children[node.node_name]=node
      node.parent = self
      # Trick stolen from OpenStruct
      meta = class << self; self; end
      meta.send(:define_method, node.node_name.to_s.to_sym) {@children[node.node_name]}
    end

    # Yields each descendant node, nearest generations first. If passed true
    # will also yield itself (before any descendant).
    def each_descendant(include_self=false)
      if include_self
        node_queue=[self]
      else
        node_queue=self.children.values
      end
      until node_queue.empty? do
        # queue the children of the node about to be yielded
        node_queue.concat node_queue.first.children.values
        yield node_queue.shift
      end
    end

    # Short summary: class name, node name, parent's name and child names.
    def inspect
      ["#{self.class.name} - node_name=#{self.node_name.inspect};",
        "parent=#{self.parent ? self.parent.node_name.inspect : nil.inspect };",
        "children=#{self.children.keys.inspect};"].join ' '
    end
  end
end
@@ -0,0 +1,90 @@
1
+ require 'ariel/node'
2
+
3
+ module Ariel
4
+
5
+ # Each Node::Extracted has a name, a TokenStream and a structure which points to
6
+ # the relevant Node::Structure. Skip straight to #search, #/ and #at for the
7
+ # query interface. This is strongly recommended over using the built in method
8
+ # accessors (a method isn't defined if a given field isn't extracted, so
9
+ # you're going to have to catch a lot of potential errors).
10
+ class Node::Extracted < Node
11
+ attr_accessor :tokenstream, :structure_node
12
+
13
# Creates an extracted node called +name+ that holds +tokenstream+ and
# remembers +structure+ as its structure_node.
def initialize(name, tokenstream, structure)
  super(name)
  @tokenstream, @structure_node = tokenstream, structure
end
18
+
19
# Returns the text of this node's TokenStream (whatever its #text gives).
def extracted_text
  stream = tokenstream
  stream.text
end
23
+
24
# Index based accessor for the Node::Extracted's children. Supports Range objects.
# Aims to provide behaviour that makes sense, especially when a Node has
# list children. Node::Extracted#[0..0] will return an array, while
# Node::Extracted#[0] will not. This behaviour is the same as Ruby's standard
# Array class. Children that do not exist are skipped; nil is returned when
# nothing at all matches.
def [](*args)
  # Ranges (and nested arrays) are expanded into the individual keys.
  keys = args.collect {|arg| arg.kind_of?(Range) ? arg.to_a : arg}.flatten
  # A Range argument, or more than one key, always asks for an Array.
  wants_array = keys.size > 1 || args.any? {|arg| arg.kind_of? Range}
  result = @children.values_at(*keys).compact
  # Spelled out rather than relying on `return *result`, whose unwrapping of
  # empty and one-element arrays differs between Ruby versions.
  case result.size
  when 0 then nil
  when 1 then wants_array ? result : result.first
  else result
  end
end
47
+
48
# The preferred way of querying extracted information. If nothing was
# extracted, an empty array is returned. This is much safer than using
# Node::Extracted accessors. Consider if your code is reading
# doc.address.phone_number.area_code - this will raise an error if any one of
# these were not extracted. (doc/'address/phone_number/area_code') is
# preferred. Numbered list_items can be queried e.g. (doc/'comment_list/2'),
# and basic globbing is supported: (doc/'*/*/title').
def search(search_string)
  current_term, *remaining = search_string.split('/')
  return [self] if current_term.nil? #If for some reason nothing is given in the search string
  matches =
    if current_term=='*'
      found = children.values
      begin
        found.sort! {|a, b| a.node_name <=> b.node_name}
      rescue StandardError
        # names that cannot be compared with each other: keep the hash's order
      end
      found
    elsif current_term[/\d+/]==current_term
      [@children[current_term.to_i]]
    else
      [@children[current_term.to_sym]]
    end
  # Drop children that were not extracted *before* descending, so a missing
  # intermediate node produces an empty result instead of calling nil.search.
  matches = matches.flatten.compact
  return matches if remaining.empty?
  rest = remaining.join('/')
  matches.collect {|match| match.search(rest)}.flatten.compact
end
alias :/ :search
76
+
77
# Same query language as #search, but hands back just the first match
# (nil when the search finds nothing).
def at(search_string)
  found = self.search(search_string)
  found.first
end
82
+
83
# Extends the inherited inspect string with the name of the structure node
# and the extracted text, which is cut short with '...' once it is longer
# than 100 characters.
def inspect
  summary = super
  structure_part = "structure_node=#{self.structure_node.node_name.inspect};"
  text = self.extracted_text
  preview = text.size > 100 ? text[0..100]+'...' : text
  [summary, structure_part, "extracted_text=\"#{preview}\";"].join ' '
end
89
+ end
90
+ end
@@ -0,0 +1,91 @@
1
+ require 'ariel/node'
2
+
3
+ module Ariel
4
+
5
+ # Implements a Node object used to represent the structure of the document
6
+ # tree. Each node stores start and end rules to extract the desired content
7
+ # from its parent node. Could be viewed as a rule-storing object.
8
+ class Node::Structure < Node
9
+ attr_accessor :ruleset, :node_type
10
+
11
# Creates a structure node called +name+ (default :root) of the given
# node_type (default :not_list). When a block is given the new node is
# passed to it, so children can be declared inline.
def initialize(name=:root, type=:not_list, &block)
  super(name)
  @node_type = type
  block.call(self) if block
end
16
+
17
# Passes this node to the given block so further fields can be added to
# an existing structure, e.g.
#   node.extend_structure do |r|
#     r.item :new_field1
#     r.item :new_field2
#   end
# Returns the block's value, or nil when no block is given.
def extend_structure(&block)
  return nil unless block
  block.call(self)
end
25
+
26
# Applies this node's ruleset to the tokenstream of the given node. Every
# stream the ruleset yields becomes a new Node::Extracted that is added as
# a child of the given node. list_item extractions are named 0, 1, 2, ...
# in the order they are yielded; anything else takes this node's name.
# Returns the array of extracted nodes (empty if no ruleset has been
# learnt yet).
def extract_from(node)
  extractions = []
  return extractions if @ruleset.nil?
  @ruleset.apply_to(node.tokenstream) do |newstream|
    # the number of extractions so far doubles as the next list index
    child_name = self.node_type==:list_item ? extractions.size : @node_name
    extracted = Node::Extracted.new(child_name, newstream, self)
    node.add_child extracted
    extractions << extracted
  end
  extractions
end
46
+
47
# Walks the structure tree breadth-first starting from root_node's
# structure_node: each structure child is extracted from the node being
# processed, and every node extracted that way is queued to be processed
# in turn. With extract_labels set, LabelUtils.extract_labeled_region is
# used in place of the structure child's own extract_from. Returns
# root_node.
def apply_extraction_tree_on(root_node, extract_labels=false)
  pending = [root_node]
  until pending.empty?
    current = pending.shift
    current.structure_node.children.values.each do |child_structure|
      found =
        if extract_labels
          LabelUtils.extract_labeled_region(child_structure, current)
        else
          child_structure.extract_from(current)
        end
      found.each {|extracted| pending << extracted}
    end
  end
  root_node
end
64
+
65
# Declares a child that occurs once in its parent. #list is a synonym; it
# reads better when the child is a container for list_items. The children
# of a list_item are plain items again, e.g.
#   structure = Ariel::Node::Structure.new do |r|
#     r.list :comments do |c|     # r.item :comments would be equivalent, but less readable
#       c.list_item :comment do |c|
#         c.item :author          # normal items: extracted once from their parent
#         c.item :date
#         c.item :body
#       end
#     end
#   end
def item(name, &block)
  child = Node::Structure.new(name, &block)
  self.add_child(child)
end
# Extracting a list is really the same as extracting a normal item, but
# people probably still prefer to call a list a list.
alias :list :item
83
+
84
# Declares a child of type :list_item. See the docs for #item for a
# discussion of when to use #item and when to use #list_item.
def list_item(name, &block)
  child = Node::Structure.new(name, :list_item, &block)
  self.add_child(child)
end
89
+ end
90
+ end
91
+
@@ -5,17 +5,23 @@ module Ariel
5
5
  # Rule#landmarks. A Rule also has a direction :forward or :back, which
6
6
  # determines whether it is applied from the end or beginning of a tokenstream.
7
7
  class Rule
8
- attr_accessor :landmarks, :direction
8
+ attr_accessor :landmarks, :direction, :exhaustive
9
9
  @@RuleMatchData=Struct.new(:token_loc, :type)
10
-
10
+ @@cache={}
11
+
11
12
  # A rule's direction can be :back or :forward, which determines whether it
12
13
  # is applied from the start or end of the TokenStream. The landmark array
13
14
  # contains an array for each landmark, which consists of one or more
14
- # features. e.g. Rule.new(:forward, [[:anything, "Example"], ["Test"]]).
15
- def initialize(direction, landmarks=[])
15
+ # features. e.g. Rule.new([[:anything, "Example"], ["Test"]], :forward).
16
+ def initialize(landmarks, direction, exhaustive=false)
16
17
  @landmarks=landmarks
17
18
  raise(ArgumentError, "Not a valid direction") unless [:forward, :back].include?(direction)
18
19
  @direction=direction
20
+ @exhaustive=exhaustive
21
+ end
22
+
23
+ def exhaustive?
24
+ @exhaustive
19
25
  end
20
26
 
21
27
  # Two rules are equal if they have the same list of landmarks and the same
@@ -26,12 +32,12 @@ module Ariel
26
32
  alias :eql? :==
27
33
 
28
34
  def hash
29
- [@landmarks, @direction].hash
35
+ [@landmarks, @direction, @exhaustive].hash
30
36
  end
31
37
 
32
38
  # Returns a rule that contains a given range of
33
39
  def partial(range)
34
- return Rule.new(@direction, @landmarks[range])
40
+ return Rule.new(@landmarks[range], @direction)
35
41
  end
36
42
 
37
43
  def deep_clone
@@ -57,7 +63,7 @@ module Ariel
57
63
  end
58
64
 
59
65
  # Given a TokenStream and a rule, applies the rule on the stream and
60
- # returns nil if the match fails and the token_loc if the match succeeds.
66
+ # returns an empty array if the match fails and an array of token_locs if the match succeeds.
61
67
  # Yields a RuleMatchData Struct with accessors token_loc (the position of the match in the stream)
62
68
  # and type if a block is given. type is nil if the TokenStream has no label,
63
69
  # :perfect if all tokens up to the labeled token are consumed, :early if the rule's final position
@@ -65,33 +71,23 @@ module Ariel
65
71
  # token_loc is the position in the stream as it was passed in. That is, the
66
72
  # token_loc is always from the left of the given stream whether it is in a
67
73
  # reversed state or not.
68
- def apply_to(tokenstream)
69
- if tokenstream.reversed?
70
- target=tokenstream if @direction==:back
71
- target=tokenstream.reverse if @direction==:forward
72
- elsif not tokenstream.reversed?
73
- target=tokenstream if @direction==:forward
74
- target=tokenstream.reverse if @direction==:back
75
- end
76
- target.rewind #rules are applied from the beginning of the stream
77
- @landmarks.each do |landmark|
78
- unless target.skip_to(*landmark)
79
- return nil
74
+ def apply_to(tokenstream)
75
+ target=self.class.prepare_tokenstream(tokenstream, @direction)
76
+ cache_check=@@cache[[tokenstream.cache_hash, self.hash]]
77
+ if cache_check
78
+ token_locs=cache_check
79
+ else
80
+ token_locs=[]
81
+ while result=seek_landmarks(target)
82
+ token_locs << correct_match_location(tokenstream, result)
83
+ break unless exhaustive?
80
84
  end
85
+ @@cache[[tokenstream.cache_hash, self.hash]]=token_locs
81
86
  end
82
- token_loc=target.cur_pos
83
- if @direction==:back && !tokenstream.reversed?
84
- token_loc = tokenstream.reverse_pos(token_loc) #Return position from left of given stream
85
- end
86
- md = @@RuleMatchData.new(token_loc)
87
- if target.label_index
88
- idx = target.label_index
89
- md.type = :perfect if token_loc == idx
90
- md.type = :early if token_loc < idx
91
- md.type = :late if token_loc > idx
87
+ if block_given?
88
+ generate_match_data(target, token_locs).each {|md| yield md}
92
89
  end
93
- yield md if block_given?
94
- return token_loc
90
+ return token_locs
95
91
  end
96
92
 
97
93
  # Returns true or false depending on if the match of this rule on the given
@@ -99,8 +95,9 @@ module Ariel
99
95
  # :perfect, :early, :fail and :late). Only valid on streams with labels
100
96
  def matches(tokenstream, *types)
101
97
  raise ArgumentError, "No match types given" if types.empty?
98
+ raise ArgumentError, "Only applicable to tokenstreams containing a label" if tokenstream.label_index.nil?
102
99
  match = nil
103
- apply_to(tokenstream) {|md| match=md.type}
100
+ apply_to(tokenstream) {|md| match=md.type if md.type;}
104
101
  match = :fail if match.nil?
105
102
  if types.include? match
106
103
  return true
@@ -108,5 +105,90 @@ module Ariel
108
105
  return false
109
106
  end
110
107
  end
108
+
109
# Only used in rule learning on labeled tokenstreams. Needed to provide the
# match index most relevant to the currently labeled list item. A preference
# of :early or :late can be passed, which will only return a token_loc at or
# before the stream's label_index (:early) or at or after it (:late). Returns
# nil if no match satisfies the preference.
def closest_match(tokenstream, preference=:none)
  token_locs = apply_to(tokenstream)
  label_index = tokenstream.label_index
  # Honour the documented preference here, before picking the nearest match.
  case preference
  when :early
    token_locs = token_locs.reject {|token_loc| token_loc > label_index}
  when :late
    token_locs = token_locs.reject {|token_loc| token_loc < label_index}
  end
  find_closest_match(token_locs, label_index)
end
117
+
118
# Returns a rewound tokenstream facing the way a rule of the given
# direction needs: the stream itself when its current orientation already
# suits the direction, otherwise its reverse.
def self.prepare_tokenstream(tokenstream, direction)
  reversed = tokenstream.reversed?
  target =
    case direction
    when :forward then reversed ? tokenstream.reverse : tokenstream
    when :back then reversed ? tokenstream : tokenstream.reverse
    end
  target.rewind #rules are applied from the beginning of the stream
  target
end
132
+
133
+ private
134
+
135
# Skips the given tokenstream to each of this rule's landmarks in turn,
# stopping at the first landmark that cannot be found. Returns nil in that
# case, otherwise the stream's cur_pos after the last landmark. Reversing
# or rewinding the stream beforehand is the caller's job, and the returned
# position is relative to the stream as passed in, so it should be
# corrected by correct_match_location.
def seek_landmarks(tokenstream)
  found_all = @landmarks.all? {|landmark| tokenstream.skip_to(*landmark)}
  found_all ? tokenstream.cur_pos : nil
end
148
+
149
# Takes the original tokenstream passed to apply_to and converts the match
# location with reverse_pos if required, so the location returned to the
# user is the index from the left of the tokenstream as it was passed in.
# No conversion is needed when the stream's orientation already suits this
# rule's direction.
def correct_match_location(tokenstream, match_loc)
  reversed = tokenstream.reversed?
  case @direction
  when :forward then reversed ? tokenstream.reverse_pos(match_loc) : match_loc
  when :back then reversed ? match_loc : tokenstream.reverse_pos(match_loc)
  end
end
162
+
163
# Builds one RuleMatchData per token_loc. If the tokenstream has a
# label_index, the match closest to it is given a type: :perfect when it
# sits on the label, :early when before it, :late when after it. Every
# other match keeps a nil type.
def generate_match_data(tokenstream, token_locs)
  label = tokenstream.label_index
  nearest = label ? find_closest_match(token_locs, label) : nil
  token_locs.map do |token_loc|
    md = @@RuleMatchData.new(token_loc)
    if label && token_loc == nearest
      if token_loc == label
        md.type = :perfect
      elsif token_loc < label
        md.type = :early
      elsif token_loc > label
        md.type = :late
      end
    end
    md
  end
end
180
+
181
# Returns the token_loc nearest to label_index, or nil if there is no
# candidate. With a preference of :early, locations after the label are
# discarded first; with :late, locations before it are.
def find_closest_match(token_locs, label_index, preference=:none)
  if preference==:early
    token_locs = token_locs.reject {|token_loc| token_loc > label_index}
  elsif preference==:late
    # must be a comparison: a bitwise OR here is always truthy and would
    # throw away every candidate
    token_locs = token_locs.reject {|token_loc| token_loc < label_index}
  end
  token_locs.sort_by {|token_loc| (label_index-token_loc).abs}.first
end
+ end
189
+
190
# Empties the class-wide cache in which apply_to stores the token_locs it
# found for a given tokenstream/rule pair.
def self.clear_cache
  @@cache.clear
end
111
193
  end
112
194
  end