scrubyt 0.2.6 → 0.2.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (32) hide show
  1. data/CHANGELOG +59 -12
  2. data/Rakefile +2 -2
  3. data/lib/scrubyt.rb +24 -6
  4. data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
  5. data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
  6. data/lib/scrubyt/core/scraping/constraint.rb +53 -57
  7. data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
  8. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
  9. data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
  10. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
  11. data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
  12. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
  13. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
  14. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
  15. data/lib/scrubyt/core/scraping/pattern.rb +292 -157
  16. data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
  17. data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
  18. data/lib/scrubyt/core/shared/extractor.rb +122 -163
  19. data/lib/scrubyt/output/export.rb +59 -174
  20. data/lib/scrubyt/output/post_processor.rb +4 -3
  21. data/lib/scrubyt/output/result.rb +8 -9
  22. data/lib/scrubyt/output/result_dumper.rb +81 -42
  23. data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
  24. data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
  25. data/lib/scrubyt/utils/shared_utils.rb +39 -26
  26. data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
  27. data/lib/scrubyt/utils/xpathutils.rb +31 -30
  28. data/test/unittests/constraint_test.rb +11 -7
  29. data/test/unittests/extractor_test.rb +6 -6
  30. data/test/unittests/filter_test.rb +66 -66
  31. metadata +22 -15
  32. data/lib/scrubyt/core/scraping/filter.rb +0 -201
@@ -1,27 +1,27 @@
1
1
  module Scrubyt
2
2
  ##
3
3
  #=<tt>Rejecting result instances based on further rules</tt>
4
- #
4
+ #
5
5
  #The two most trivial problems with a set of rules is that they match either less
6
6
  #or more instances than we would like them to. Constraints are a way to remedy the second problem:
7
7
  #they serve as a tool to filter out some result instances based on rules. A typical
8
8
  #example:
9
- #
9
+ #
10
10
  #* *ensure_presence_of_ancestor_pattern* consider this model:
11
11
  # <book>
12
12
  # <author>...</author>
13
13
  # <title>...</title>
14
14
  # </book>
15
- #
15
+ #
16
16
  #If I attach the *ensure_presence_of_ancestor_pattern* to the pattern 'book' with values
17
- #'author' and 'title', only those books will be matched which have an author and a
17
+ #'author' and 'title', only those books will be matched which have an author and a
18
18
  #title (i.e. the child patterns author and title must extract something). This is a way
19
- #to say 'a book MUST have an author and a title'.
19
+ #to say 'a book MUST have an author and a title'.
20
20
  class Constraint
21
21
  #There are more possible ways of applying/checking constraints in the case of
22
- #ones that can not be checked in the context node (e.g. ensure_presence_of -
22
+ #ones that can not be checked in the context node (e.g. ensure_presence_of -
23
23
  #since it may require the evaluation of child patterns of the context pattern to
24
- #arbitray level)
24
+ #arbitray level)
25
25
  #
26
26
  #In such cases, the possibilities are:
27
27
  #
@@ -29,56 +29,54 @@ module Scrubyt
29
29
  # pattern is evaluated. This can mess things up, since if any ancestor node uses
30
30
  # the sinks of predecessor(s) other than the context node, those need to be evaluated
31
31
  # too, and we may run into a cyclic dependency or at least a complicated recursion
32
- #
33
- #2) Post processing - evaluate normally and throw out results which do not pass the
32
+ #
33
+ #2) Post processing - evaluate normally and throw out results which do not pass the
34
34
  # constraint
35
35
  #
36
36
  #2b) Do it on the XML level - most probably this solution will be implemented
37
-
37
+
38
38
  # Different constraint types
39
39
  CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN = 0
40
40
  CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE = 1
41
41
  CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE = 2
42
42
  CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE = 3
43
43
  CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE = 4
44
-
45
-
46
- attr_reader :type, :target, :parent_filter
47
-
44
+
45
+
46
+ attr_reader :type, :target
47
+
48
48
  #Add 'ensure presence of ancestor pattern' constraint
49
-
49
+
50
50
  #If this type of constraint is added to a pattern, it must have an ancestor pattern
51
51
  #(child pattern, or child pattern of a child pattern, etc.) denoted by "ancestor"
52
52
  #'Has an ancestor pattern' means that the ancestor pattern actually extracts something
53
53
  #(just by looking at the wrapper model, the ancestor pattern is always present)
54
54
  #Note that from this type of constraint there is no 'ensure_absence' version, since
55
- #I could not think about an use case for that
56
- def self.add_ensure_presence_of_pattern(parent_filter, ancestor)
57
- Constraint.new(parent_filter, ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN)
55
+ #I could not think about an use case for that
56
+ def self.add_ensure_presence_of_pattern(ancestor)
57
+ Constraint.new(ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN)
58
58
  end
59
-
59
+
60
60
  #Add 'ensure absence of attribute' constraint
61
-
61
+
62
62
  #If this type of constraint is added to a pattern, the HTML node it targets
63
63
  #must NOT have an attribute named "attribute_name" with the value "attribute_value"
64
- def self.add_ensure_absence_of_attribute(parent_filter, attribute_hash)
65
- Constraint.new(parent_filter,
66
- attribute_hash,
64
+ def self.add_ensure_absence_of_attribute(attribute_hash)
65
+ Constraint.new(attribute_hash,
67
66
  CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
68
67
  end
69
-
68
+
70
69
  #Add 'ensure presence of attribute' constraint
71
-
70
+
72
71
  #If this type of constraint is added to a pattern, the HTML node it targets
73
72
  #must have an attribute named "attribute_name" with the value "attribute_value"
74
- def self.add_ensure_presence_of_attribute(parent_filter, attribute_hash)
75
- Constraint.new(parent_filter,
76
- attribute_hash,
73
+ def self.add_ensure_presence_of_attribute(attribute_hash)
74
+ Constraint.new(attribute_hash,
77
75
  CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
78
76
  end
79
-
80
- #Add 'ensure absence of ancestor node' constraint
81
-
77
+
78
+ #Add 'ensure absence of ancestor node' constraint
79
+
82
80
  #If this type of constraint is added to a pattern, the HTML node extracted by the pattern
83
81
  #must NOT contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
84
82
  #
@@ -88,14 +86,13 @@ module Scrubyt
88
86
  #class' => 'wide' it has to be written as [{'class' => ['small','wide']}]
89
87
  #
90
88
  #"attributes" can be empty - in this case just the 'node_name' is checked
91
- def self.add_ensure_absence_of_ancestor_node(parent_filter, node_name, attributes)
92
- Constraint.new(parent_filter,
93
- [node_name, attributes],
94
- CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
89
+ def self.add_ensure_absence_of_ancestor_node(node_name, attributes)
90
+ Constraint.new([node_name, attributes],
91
+ CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
95
92
  end
96
-
97
- #Add 'ensure presence of ancestor node' constraint
98
-
93
+
94
+ #Add 'ensure presence of ancestor node' constraint
95
+
99
96
  #If this type of constraint is added to a pattern, the HTML node extracted by the pattern
100
97
  #must contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
101
98
  #
@@ -105,12 +102,11 @@ module Scrubyt
105
102
  #class' => 'wide' it has to be written as [{'class' => ['small','wide']}]
106
103
  #
107
104
  #"attributes" can be empty - in this case just the 'node_name' is checked
108
- def self.add_ensure_presence_of_ancestor_node(parent_filter, node_name, attributes)
109
- Constraint.new(parent_filter,
110
- [node_name, attributes],
111
- CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
105
+ def self.add_ensure_presence_of_ancestor_node(node_name, attributes)
106
+ Constraint.new([node_name, attributes],
107
+ CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
112
108
  end
113
-
109
+
114
110
  #Evaluate the constraint; if this function returns true,
115
111
  #it means that the constraint passed, i.e. its filter will be added to the extracted
116
112
  #content of the pattern
@@ -123,21 +119,20 @@ module Scrubyt
123
119
  attribute_present(result)
124
120
  when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE
125
121
  !attribute_present(result)
126
- when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
127
- ancestor_node_present(result)
128
- when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
122
+ when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
123
+ ancestor_node_present(result)
124
+ when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
129
125
  !ancestor_node_present(result)
130
126
  end
131
127
  end
132
-
128
+
133
129
  private
134
130
  #We would not like these to be called from outside
135
- def initialize(parent_filter, target, type)
136
- @type = type
137
- @parent_filter = parent_filter
131
+ def initialize(target, type)
138
132
  @target = target
139
- end
140
-
133
+ @type = type
134
+ end
135
+
141
136
  #Implementation of the ancestor node presence test
142
137
  #Check the documentation of the add_ensure_presence_of_ancestor_node method
143
138
  #for further information on the result parameter
@@ -153,21 +148,22 @@ module Scrubyt
153
148
  end
154
149
  false
155
150
  end
156
-
151
+
157
152
  def attribute_present(result)
153
+ return unless result.is_a? Hpricot::Elem
158
154
  match = true
159
155
  #If v = nil, the value of the attribute can be arbitrary;
160
156
  #Therefore, in this case we just have to make sure that the attribute is
161
157
  #present (i.e. != nil), we don't care about the value
162
158
  @target.each do |k,v|
163
159
  if v == nil
164
- match &&= (result.attributes[k.to_s] != nil)
160
+ match &&= (result.attributes[k.to_s] != nil)
165
161
  else
166
- match &&= (result.attributes[k.to_s] == v.to_s)
167
- end
162
+ match &&= (result.attributes[k.to_s] == v.to_s)
163
+ end
168
164
  end
169
165
  match
170
166
  end
171
-
167
+
172
168
  end #end of class
173
169
  end #end of module
@@ -10,58 +10,35 @@ module Scrubyt
10
10
  #I will not document the functions since these are just forwarders; See the 'real'
11
11
  #functions with their documentation in Scrubyt::Constraint.rb
12
12
  class ConstraintAdder
13
-
14
- def self.ensure_presence_of_pattern(pattern, ancestor_node_name)
15
- pattern.filters[0].ensure_presence_of_pattern(ancestor_node_name)
16
- pattern #To make chaining possible
13
+
14
+ def self.ensure_presence_of_pattern(ancestor_node_name)
15
+ Constraint.add_ensure_presence_of_pattern(ancestor_node_name)
17
16
  end
18
17
 
19
- def self.ensure_presence_of_ancestor_node(pattern, ancestor_node_name, attributes=[])
20
- pattern.filters[0].ensure_presence_of_ancestor_node(ancestor_node_name,
21
- prepare_attributes(attributes))
22
- pattern #To make chaining possible
18
+ def self.ensure_presence_of_ancestor_node(ancestor_node_name, attributes=[])
19
+ Constraint.add_ensure_presence_of_ancestor_node(ancestor_node_name,
20
+ prepare_attributes(attributes))
23
21
  end
24
22
 
25
- def self.ensure_absence_of_ancestor_node(pattern, ancestor_node_name, attributes=[])
26
- pattern.filters[0].ensure_absence_of_ancestor_node(ancestor_node_name,
23
+ def self.ensure_absence_of_ancestor_node(ancestor_node_name, attributes=[])
24
+ Constraint.add_ensure_absence_of_ancestor_node(ancestor_node_name,
27
25
  prepare_attributes(attributes))
28
- pattern #To make chaining possible
29
26
  end
30
27
 
31
- def self.ensure_presence_of_attribute(pattern, attribute_hash)
32
- pattern.filters[0].ensure_presence_of_attribute(attribute_hash)
33
- pattern #To make chaining possible
28
+ def self.ensure_presence_of_attribute(attribute_hash)
29
+ Constraint.add_ensure_presence_of_attribute(attribute_hash)
34
30
  end
35
31
 
36
- def self.ensure_absence_of_attribute(pattern, attribute_hash)
37
- pattern.filters[0].ensure_absence_of_attribute(attribute_hash)
38
- pattern #To make chaining possible
32
+ def self.ensure_absence_of_attribute(attribute_hash)
33
+ Constraint.add_ensure_absence_of_attribute(attribute_hash)
39
34
  end
40
-
41
- private
42
- def self.find_by_name(root_pattern, name)
43
- @found_pattern = nil
44
- find_by_name_recursive(root_pattern, name)
45
- if (@found_pattern == nil)
46
- #$Logger.error("Fatal: No pattern named #{name} exists!")
47
- puts "Fatal: No pattern named #{name} exists!"
48
- end
49
- @found_pattern
50
- end
51
-
52
- def self.find_by_name_recursive(pattern, name)
53
- if pattern.name == name
54
- @found_pattern = pattern
55
- else
56
- pattern.children.each {|child| find_by_name_recursive(child, name)}
57
- end
58
- end
59
-
35
+
36
+ private
60
37
  def self.prepare_attributes(attributes)
61
38
  attribute_pairs = []
62
39
  attributes.each do |key, value|
63
40
  if (value.instance_of? Array)
64
- value.each {|val| attribute_pairs << [key,val]}
41
+ value.each {|val| attribute_pairs << [key,val]}
65
42
  else
66
43
  attribute_pairs << [key, value]
67
44
  end
@@ -0,0 +1,17 @@
1
+ module Scrubyt
2
+ class AttributeFilter < BaseFilter
3
+
4
+ def evaluate(source)
5
+ elem = XPathUtils.find_nearest_node_with_attribute(source, @example)
6
+ if elem.is_a? Hpricot::Elem
7
+ return [elem.attributes[@example]]
8
+ else
9
+ return nil
10
+ end
11
+ end
12
+
13
+ def to_sexp
14
+ [:str, @example]
15
+ end #end of method to_sexp
16
+ end #End of class AttributeFilter
17
+ end #End of module Scrubyt
@@ -0,0 +1,111 @@
1
+ module Scrubyt
2
+ ##
3
+ #=<tt>Filter out relevant pieces from the parent pattern</tt>
4
+ #
5
+ #A Scrubyt extractor is almost like a waterfall: water is pouring from the top until
6
+ #it reaches the bottom. The biggest difference is that instead of water, a HTML
7
+ #document travels through the space.
8
+ #
9
+ #Of course Scrubyt would not make much sense if the same document would arrive at
10
+ #the bottom that was poured in at the top - since in this case we might use an
11
+ #identity transformation (i.e. do nothing with the input) as well.
12
+ #
13
+ #This is where filters come in: as their name says, they filter the stuff that is
14
+ #pouring from above, to leave the interesting parts and discard the rest.
15
+ #The working of a filter will be explained most easily by the help of an example.
16
+ #Let's consider that we would like to extract information from a webshop; Concretely
17
+ #we are interested in the name of the items and the URL pointing to the image of the
18
+ #item.
19
+ #
20
+ #To accomplish this, first we select the items with the pattern item (a pattern is
21
+ #a logical grouping of filters; see Pattern documentation) Then our new
22
+ #context is the result extracted by the 'item' pattern; For every 'item' pattern, further
23
+ #extract the name and the image of the item; and finally, extract the href attribute
24
+ #of the image. Let's see an illustration:
25
+ #
26
+ # root --> This pattern is called a 'root pattern', It is invisible to you
27
+ # | and basically it represents the document; it has no filters
28
+ # +-- item --> Filter what's coming from above (the whole document) to get
29
+ # | relevant pieces of data (in this case webshop items)
30
+ # +-- name --> Again, filter what's coming from above (a webshop item) and
31
+ # | leave only item names after this operation
32
+ # +-- image --> This time filter the image of the item
33
+ # |
34
+ # +-- href --> And finally, from the image elements, get the attribute 'href'
35
+ class BaseFilter
36
+ #Type of the example this filter is extracted with
37
+
38
+ #XPath example, like html/body/tr/td[1] etc.
39
+ EXAMPLE_TYPE_XPATH = 0
40
+ #String from the document, for example 'Canon EOS 300 D'.
41
+ EXAMPLE_TYPE_STRING = 1
42
+ #Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
43
+ EXAMPLE_TYPE_IMAGE = 2
44
+ #No example - the actual XPath is determined from the children XPaths (their LCA)
45
+ EXAMPLE_TYPE_CHILDREN = 3
46
+ #Regexp example, like /\d+@*\d+[a-z]/
47
+ EXAMPLE_TYPE_REGEXP = 4
48
+ #Compound example, like :contains => 'goodies'
49
+ EXAMPLE_TYPE_COMPOUND = 5
50
+
51
+ attr_accessor(:example_type, :parent_pattern, :temp_sink,
52
+ :constraints, :xpath, :regexp, :example, :source, :sink)
53
+
54
+ def self.create(parent_pattern, example=nil)
55
+
56
+ filter_name = (parent_pattern.type.to_s.split("_").map!{|e| e.capitalize }.join) + 'Filter'
57
+ if filter_name == 'RootFilter'
58
+ BaseFilter.new(parent_pattern, example)
59
+ else
60
+ instance_eval("#{filter_name}.new(parent_pattern, example)")
61
+ end
62
+ end
63
+
64
+ #Dispatcher method to add constraints; of course, as with any method_missing, this method
65
+ #should not be called directly
66
+ #TODO still used?
67
+ def method_missing(method_name, *args, &block)
68
+ case method_name.to_s
69
+ when /^ensure.+/
70
+ constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
71
+ else
72
+ raise NoMethodError.new(method_name.to_s, method_name.to_s, args)
73
+ end
74
+ end
75
+
76
+ def to_sexp
77
+ nil
78
+ end
79
+
80
+ private
81
+ #We don't want this to be accessible from outside
82
+ def initialize(parent_pattern, example)
83
+ @example_type = BaseFilter.determine_example_type(example)
84
+ @parent_pattern = parent_pattern
85
+ @sink = [] #output of a filter
86
+ @source = [] #input of a filter
87
+ @example = example
88
+ @xpath = nil #The xpath to evaluate this filter
89
+ @constraints = [] #list of constraints
90
+ end
91
+
92
+ def self.determine_example_type(example)
93
+ if example.instance_of? Regexp
94
+ EXAMPLE_TYPE_REGEXP
95
+ elsif example.instance_of? Hash
96
+ EXAMPLE_TYPE_COMPOUND
97
+ else
98
+ case example
99
+ when nil
100
+ EXAMPLE_TYPE_CHILDREN
101
+ when /\.(jpg|png|gif|jpeg)$/
102
+ EXAMPLE_TYPE_IMAGE
103
+ when /^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)*$/
104
+ (example.include? '/' || example.include?('[')) ? EXAMPLE_TYPE_XPATH : EXAMPLE_TYPE_STRING
105
+ else
106
+ EXAMPLE_TYPE_STRING
107
+ end
108
+ end
109
+ end #end of method
110
+ end #End of class
111
+ end #End of module
@@ -0,0 +1,14 @@
1
+ module Scrubyt
2
+ class DetailPageFilter < BaseFilter
3
+
4
+ def evaluate(source)
5
+ if source.is_a? String
6
+ result = @parent_pattern.evaluation_context.extractor.evaluate_subextractor(source, @parent_pattern, @parent_pattern.resolve)
7
+ else
8
+ result = @parent_pattern.evaluation_context.extractor.evaluate_subextractor(
9
+ XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href'],
10
+ @parent_pattern, @parent_pattern.resolve)
11
+ end
12
+ end #end of method
13
+ end #End of class DetailPageFilter
14
+ end #End of module Scrubyt
@@ -0,0 +1,49 @@
1
+ require 'net/http'
2
+ require 'fileutils'
3
+
4
+ module Scrubyt
5
+ class DownloadFilter < BaseFilter
6
+
7
+ def evaluate(source)
8
+ download_file(source)
9
+ end #end of method
10
+
11
+ def to_sexp
12
+ [:str, @example]
13
+ end #end of method to_sexp
14
+
15
+ private
16
+ def download_file(source)
17
+ host_name = @parent_pattern.evaluation_context.extractor.get_host_name
18
+ outfile = nil
19
+ base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
20
+ return '' if source.size < 4
21
+ file_name = source.scan(/.+\/(.*)/)[0][0]
22
+ Net::HTTP.start(base_url) { |http|
23
+ resp = http.get(source)
24
+ outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
25
+ FileUtils.mkdir_p @example
26
+ open(outfile, 'wb') {|f| f.write(resp.body) }
27
+ }
28
+ outfile.scan(/.+\/(.*)/)[0][0]
29
+ end
30
+
31
+ def self.find_nonexisting_file_name(file_name)
32
+ already_found = false
33
+ loop do
34
+ if File.exists? file_name
35
+ if already_found
36
+ last_no = file_name.scan(/_(\d+)\./)[0][0]
37
+ file_name.sub!(/_#{last_no}\./) {"_#{(last_no.to_i+1).to_s}."}
38
+ else
39
+ file_name.sub!(/\./) {"_1\."}
40
+ already_found = true
41
+ end
42
+ else
43
+ break
44
+ end
45
+ end
46
+ file_name
47
+ end #end of method
48
+ end #End of class DownloadFilter
49
+ end #End of module Scrubyt