scrubyt 0.2.6 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. data/CHANGELOG +59 -12
  2. data/Rakefile +2 -2
  3. data/lib/scrubyt.rb +24 -6
  4. data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
  5. data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
  6. data/lib/scrubyt/core/scraping/constraint.rb +53 -57
  7. data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
  8. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
  9. data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
  10. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
  11. data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
  12. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
  13. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
  14. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
  15. data/lib/scrubyt/core/scraping/pattern.rb +292 -157
  16. data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
  17. data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
  18. data/lib/scrubyt/core/shared/extractor.rb +122 -163
  19. data/lib/scrubyt/output/export.rb +59 -174
  20. data/lib/scrubyt/output/post_processor.rb +4 -3
  21. data/lib/scrubyt/output/result.rb +8 -9
  22. data/lib/scrubyt/output/result_dumper.rb +81 -42
  23. data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
  24. data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
  25. data/lib/scrubyt/utils/shared_utils.rb +39 -26
  26. data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
  27. data/lib/scrubyt/utils/xpathutils.rb +31 -30
  28. data/test/unittests/constraint_test.rb +11 -7
  29. data/test/unittests/extractor_test.rb +6 -6
  30. data/test/unittests/filter_test.rb +66 -66
  31. metadata +22 -15
  32. data/lib/scrubyt/core/scraping/filter.rb +0 -201
@@ -1,27 +1,27 @@
1
1
  module Scrubyt
2
2
  ##
3
3
  #=<tt>Rejecting result instances based on further rules</tt>
4
- #
4
+ #
5
5
  #The two most trivial problems with a set of rules is that they match either less
6
6
  #or more instances than we would like them to. Constraints are a way to remedy the second problem:
7
7
  #they serve as a tool to filter out some result instances based on rules. A typical
8
8
  #example:
9
- #
9
+ #
10
10
  #* *ensure_presence_of_ancestor_pattern* consider this model:
11
11
  # <book>
12
12
  # <author>...</author>
13
13
  # <title>...</title>
14
14
  # </book>
15
- #
15
+ #
16
16
  #If I attach the *ensure_presence_of_ancestor_pattern* to the pattern 'book' with values
17
- #'author' and 'title', only those books will be matched which have an author and a
17
+ #'author' and 'title', only those books will be matched which have an author and a
18
18
  #title (i.e.the child patterns author and title must extract something). This is a way
19
- #to say 'a book MUST have an author and a title'.
19
+ #to say 'a book MUST have an author and a title'.
20
20
  class Constraint
21
21
  #There are more possible ways of applying/checking constraints in the case of
22
- #ones that can not be checked in the context node (e.g. ensure_presence_of -
22
+ #ones that can not be checked in the context node (e.g. ensure_presence_of -
23
23
  #since it may require the evaluation of child patterns of the context pattern to
24
- #arbitray level)
24
+ #arbitray level)
25
25
  #
26
26
  #In such cases, the possibilities are:
27
27
  #
@@ -29,56 +29,54 @@ module Scrubyt
29
29
  # pattern is evaluated. This can mess things up, since if any ancestor node uses
30
30
  # the sinks of predecessor(s) other than the context node, those need to be evaluated
31
31
  # too, and we may run into a cyclic dependency or at least a complicated recursion
32
- #
33
- #2) Post processing - evaluate normally and throw out results which do not pass the
32
+ #
33
+ #2) Post processing - evaluate normally and throw out results which do not pass the
34
34
  # constraint
35
35
  #
36
36
  #2b) Do it on the XML level - most probably this solution will be implemented
37
-
37
+
38
38
  # Different constraint types
39
39
  CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN = 0
40
40
  CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE = 1
41
41
  CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE = 2
42
42
  CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE = 3
43
43
  CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE = 4
44
-
45
-
46
- attr_reader :type, :target, :parent_filter
47
-
44
+
45
+
46
+ attr_reader :type, :target
47
+
48
48
  #Add 'ensure presence of ancestor pattern' constraint
49
-
49
+
50
50
  #If this type of constraint is added to a pattern, it must have an ancestor pattern
51
51
  #(child pattern, or child pattern of a child pattern, etc.) denoted by "ancestor"
52
52
  #'Has an ancestor pattern' means that the ancestor pattern actually extracts something
53
53
  #(just by looking at the wrapper model, the ancestor pattern is always present)
54
54
  #Note that from this type of constraint there is no 'ensure_absence' version, since
55
- #I could not think about an use case for that
56
- def self.add_ensure_presence_of_pattern(parent_filter, ancestor)
57
- Constraint.new(parent_filter, ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN)
55
+ #I could not think about an use case for that
56
+ def self.add_ensure_presence_of_pattern(ancestor)
57
+ Constraint.new(ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN)
58
58
  end
59
-
59
+
60
60
  #Add 'ensure absence of attribute' constraint
61
-
61
+
62
62
  #If this type of constraint is added to a pattern, the HTML node it targets
63
63
  #must NOT have an attribute named "attribute_name" with the value "attribute_value"
64
- def self.add_ensure_absence_of_attribute(parent_filter, attribute_hash)
65
- Constraint.new(parent_filter,
66
- attribute_hash,
64
+ def self.add_ensure_absence_of_attribute(attribute_hash)
65
+ Constraint.new(attribute_hash,
67
66
  CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
68
67
  end
69
-
68
+
70
69
  #Add 'ensure presence of attribute' constraint
71
-
70
+
72
71
  #If this type of constraint is added to a pattern, the HTML node it targets
73
72
  #must have an attribute named "attribute_name" with the value "attribute_value"
74
- def self.add_ensure_presence_of_attribute(parent_filter, attribute_hash)
75
- Constraint.new(parent_filter,
76
- attribute_hash,
73
+ def self.add_ensure_presence_of_attribute(attribute_hash)
74
+ Constraint.new(attribute_hash,
77
75
  CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
78
76
  end
79
-
80
- #Add 'ensure absence of ancestor node' constraint
81
-
77
+
78
+ #Add 'ensure absence of ancestor node' constraint
79
+
82
80
  #If this type of constraint is added to a pattern, the HTML node extracted by the pattern
83
81
  #must NOT contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
84
82
  #
@@ -88,14 +86,13 @@ module Scrubyt
88
86
  #class' => 'wide' it has to be written as [{'class' => ['small','wide']}]
89
87
  #
90
88
  #"attributes" can be empty - in this case just the 'node_name' is checked
91
- def self.add_ensure_absence_of_ancestor_node(parent_filter, node_name, attributes)
92
- Constraint.new(parent_filter,
93
- [node_name, attributes],
94
- CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
89
+ def self.add_ensure_absence_of_ancestor_node(node_name, attributes)
90
+ Constraint.new([node_name, attributes],
91
+ CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
95
92
  end
96
-
97
- #Add 'ensure presence of ancestor node' constraint
98
-
93
+
94
+ #Add 'ensure presence of ancestor node' constraint
95
+
99
96
  #If this type of constraint is added to a pattern, the HTML node extracted by the pattern
100
97
  #must contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
101
98
  #
@@ -105,12 +102,11 @@ module Scrubyt
105
102
  #class' => 'wide' it has to be written as [{'class' => ['small','wide']}]
106
103
  #
107
104
  #"attributes" can be empty - in this case just the 'node_name' is checked
108
- def self.add_ensure_presence_of_ancestor_node(parent_filter, node_name, attributes)
109
- Constraint.new(parent_filter,
110
- [node_name, attributes],
111
- CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
105
+ def self.add_ensure_presence_of_ancestor_node(node_name, attributes)
106
+ Constraint.new([node_name, attributes],
107
+ CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
112
108
  end
113
-
109
+
114
110
  #Evaluate the constraint; if this function returns true,
115
111
  #it means that the constraint passed, i.e. its filter will be added to the extracted
116
112
  #content of the pattern
@@ -123,21 +119,20 @@ module Scrubyt
123
119
  attribute_present(result)
124
120
  when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE
125
121
  !attribute_present(result)
126
- when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
127
- ancestor_node_present(result)
128
- when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
122
+ when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
123
+ ancestor_node_present(result)
124
+ when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
129
125
  !ancestor_node_present(result)
130
126
  end
131
127
  end
132
-
128
+
133
129
  private
134
130
  #We would not like these to be called from outside
135
- def initialize(parent_filter, target, type)
136
- @type = type
137
- @parent_filter = parent_filter
131
+ def initialize(target, type)
138
132
  @target = target
139
- end
140
-
133
+ @type = type
134
+ end
135
+
141
136
  #Implementation of the ancestor node presence test
142
137
  #Check the documentation of the add_ensure_presence_of_ancestor_node method
143
138
  #for further information on the result parameter
@@ -153,21 +148,22 @@ module Scrubyt
153
148
  end
154
149
  false
155
150
  end
156
-
151
+
157
152
  def attribute_present(result)
153
+ return unless result.is_a? Hpricot::Elem
158
154
  match = true
159
155
  #If v = nil, the value of the attribute can be arbitrary;
160
156
  #Therefore, in this case we just have to make sure that the attribute is
161
157
  #present (i.e. != nil), we don't care about the value
162
158
  @target.each do |k,v|
163
159
  if v == nil
164
- match &&= (result.attributes[k.to_s] != nil)
160
+ match &&= (result.attributes[k.to_s] != nil)
165
161
  else
166
- match &&= (result.attributes[k.to_s] == v.to_s)
167
- end
162
+ match &&= (result.attributes[k.to_s] == v.to_s)
163
+ end
168
164
  end
169
165
  match
170
166
  end
171
-
167
+
172
168
  end #end of class
173
169
  end #end of module
@@ -10,58 +10,35 @@ module Scrubyt
10
10
  #I will not document the functions since these are just forwarders; See the 'real'
11
11
  #functions with their documentation in Scrubyt::Constraint.rb
12
12
  class ConstraintAdder
13
-
14
- def self.ensure_presence_of_pattern(pattern, ancestor_node_name)
15
- pattern.filters[0].ensure_presence_of_pattern(ancestor_node_name)
16
- pattern #To make chaining possible
13
+
14
+ def self.ensure_presence_of_pattern(ancestor_node_name)
15
+ Constraint.add_ensure_presence_of_pattern(ancestor_node_name)
17
16
  end
18
17
 
19
- def self.ensure_presence_of_ancestor_node(pattern, ancestor_node_name, attributes=[])
20
- pattern.filters[0].ensure_presence_of_ancestor_node(ancestor_node_name,
21
- prepare_attributes(attributes))
22
- pattern #To make chaining possible
18
+ def self.ensure_presence_of_ancestor_node(ancestor_node_name, attributes=[])
19
+ Constraint.add_ensure_presence_of_ancestor_node(ancestor_node_name,
20
+ prepare_attributes(attributes))
23
21
  end
24
22
 
25
- def self.ensure_absence_of_ancestor_node(pattern, ancestor_node_name, attributes=[])
26
- pattern.filters[0].ensure_absence_of_ancestor_node(ancestor_node_name,
23
+ def self.ensure_absence_of_ancestor_node(ancestor_node_name, attributes=[])
24
+ Constraint.add_ensure_absence_of_ancestor_node(ancestor_node_name,
27
25
  prepare_attributes(attributes))
28
- pattern #To make chaining possible
29
26
  end
30
27
 
31
- def self.ensure_presence_of_attribute(pattern, attribute_hash)
32
- pattern.filters[0].ensure_presence_of_attribute(attribute_hash)
33
- pattern #To make chaining possible
28
+ def self.ensure_presence_of_attribute(attribute_hash)
29
+ Constraint.add_ensure_presence_of_attribute(attribute_hash)
34
30
  end
35
31
 
36
- def self.ensure_absence_of_attribute(pattern, attribute_hash)
37
- pattern.filters[0].ensure_absence_of_attribute(attribute_hash)
38
- pattern #To make chaining possible
32
+ def self.ensure_absence_of_attribute(attribute_hash)
33
+ Constraint.add_ensure_absence_of_attribute(attribute_hash)
39
34
  end
40
-
41
- private
42
- def self.find_by_name(root_pattern, name)
43
- @found_pattern = nil
44
- find_by_name_recursive(root_pattern, name)
45
- if (@found_pattern == nil)
46
- #$Logger.error("Fatal: No pattern named #{name} exists!")
47
- puts "Fatal: No pattern named #{name} exists!"
48
- end
49
- @found_pattern
50
- end
51
-
52
- def self.find_by_name_recursive(pattern, name)
53
- if pattern.name == name
54
- @found_pattern = pattern
55
- else
56
- pattern.children.each {|child| find_by_name_recursive(child, name)}
57
- end
58
- end
59
-
35
+
36
+ private
60
37
  def self.prepare_attributes(attributes)
61
38
  attribute_pairs = []
62
39
  attributes.each do |key, value|
63
40
  if (value.instance_of? Array)
64
- value.each {|val| attribute_pairs << [key,val]}
41
+ value.each {|val| attribute_pairs << [key,val]}
65
42
  else
66
43
  attribute_pairs << [key, value]
67
44
  end
@@ -0,0 +1,17 @@
1
module Scrubyt
  #Filter whose result is the value of an attribute instead of a node.
  #The attribute name is taken from @example (set by BaseFilter#initialize).
  class AttributeFilter < BaseFilter

    #Ask XPathUtils for the node nearest to +source+ that carries the attribute
    #named by @example. If an Hpricot::Elem comes back, return the attribute's
    #value wrapped in a one-element array; otherwise return nil.
    def evaluate(source)
      node = XPathUtils.find_nearest_node_with_attribute(source, @example)
      return nil unless node.is_a? Hpricot::Elem
      [node.attributes[@example]]
    end

    #S-expression form of this filter: a :str node holding the example
    def to_sexp
      [:str, @example]
    end #end of method to_sexp
  end #End of class AttributeFilter
end #End of module Scrubyt
@@ -0,0 +1,111 @@
1
module Scrubyt
  ##
  #=<tt>Filter out relevant pieces from the parent pattern</tt>
  #
  #A Scrubyt extractor is almost like a waterfall: water is pouring from the top until
  #it reaches the bottom. The biggest difference is that instead of water, a HTML
  #document travels through the space.
  #
  #Of course Scrubyt would not make much sense if the same document would arrive at
  #the bottom that was poured in at the top - since in this case we might use an
  #identity transformation (i.e. do nothing with the input) as well.
  #
  #This is where filters come in: as their name says, they filter the stuff that is
  #pouring from above, to leave the interesting parts and discard the rest.
  #The working of a filter will be explained most easily by the help of an example.
  #Let's consider that we would like to extract information from a webshop; Concretely
  #we are interested in the name of the items and the URL pointing to the image of the
  #item.
  #
  #To accomplish this, first we select the items with the pattern item (a pattern is
  #a logical grouping of filters; see Pattern documentation) Then our new
  #context is the result extracted by the 'item' pattern; For every 'item' pattern, further
  #extract the name and the image of the item; and finally, extract the href attribute
  #of the image. Let's see an illustration:
  #
  #   root       --> This pattern is called a 'root pattern', It is invisible to you
  #   |              and basically it represents the document; it has no filters
  #   +-- item   --> Filter what's coming from above (the whole document) to get
  #       |          relevant pieces of data (in this case webshop items)
  #       +-- name   --> Again, filter what's coming from above (a webshop item) and
  #       |              leave only item names after this operation
  #       +-- image  --> This time filter the image of the item
  #           |
  #           +-- href  --> And finally, from the image elements, get the attribute 'href'
  class BaseFilter
    #Type of the example this filter is extracted with

    #XPath example, like html/body/tr/td[1] etc.
    EXAMPLE_TYPE_XPATH = 0
    #String from the document, for example 'Canon EOS 300 D'.
    EXAMPLE_TYPE_STRING = 1
    #Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
    EXAMPLE_TYPE_IMAGE = 2
    #No example - the actual XPath is determined from the children XPaths (their LCA)
    EXAMPLE_TYPE_CHILDREN = 3
    #Regexp example, like /\d+@*\d+[a-z]/
    EXAMPLE_TYPE_REGEXP = 4
    #Compound example, like :contains => 'goodies'
    EXAMPLE_TYPE_COMPOUND = 5

    attr_accessor(:example_type, :parent_pattern, :temp_sink,
                  :constraints, :xpath, :regexp, :example, :source, :sink)

    #Factory: build the filter class matching the type of +parent_pattern+.
    #
    #The class name is derived from parent_pattern.type by camel-casing it and
    #appending 'Filter' (e.g. :detail_page => DetailPageFilter). The type 'root'
    #maps to BaseFilter itself. The class is resolved with Scrubyt.const_get
    #instead of evaluating a generated string, so an unexpected type raises
    #NameError rather than being executed as code.
    def self.create(parent_pattern, example=nil)
      filter_name = parent_pattern.type.to_s.split('_').map { |part| part.capitalize }.join + 'Filter'
      return BaseFilter.new(parent_pattern, example) if filter_name == 'RootFilter'
      Scrubyt.const_get(filter_name).new(parent_pattern, example)
    end

    #Classify +example+ into one of the EXAMPLE_TYPE_* constants.
    #
    #Regexp and Hash instances are recognized by class; nil means 'no example';
    #any other value is matched against the image-extension and XPath regexps
    #and falls back to EXAMPLE_TYPE_STRING.
    #
    #Note: this is a class method, so it is public regardless of any +private+
    #section marker; it is kept public for callers that may rely on that.
    def self.determine_example_type(example)
      return EXAMPLE_TYPE_REGEXP if example.instance_of? Regexp
      return EXAMPLE_TYPE_COMPOUND if example.instance_of? Hash
      case example
      when nil
        EXAMPLE_TYPE_CHILDREN
      when /\.(jpg|png|gif|jpeg)$/
        EXAMPLE_TYPE_IMAGE
      when /^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)*$/
        #The regexp above only matches strings starting with '/', so a match is
        #always an XPath; no further '/' or '[' check is needed
        EXAMPLE_TYPE_XPATH
      else
        EXAMPLE_TYPE_STRING
      end
    end #end of method

    #Dispatcher method to add constraints; of course, as with any method_missing, this method
    #should not be called directly
    #
    #Any call whose name starts with 'ensure' is forwarded to the Constraint factory
    #method 'add_<name>' and the resulting constraint is appended to +constraints+.
    #Only the caller's arguments are forwarded: the Constraint factories take
    #the constraint target(s) only and no parent filter.
    #Everything else is handed to the default implementation (NoMethodError).
    #TODO still used?
    def method_missing(method_name, *args, &block)
      if method_name.to_s =~ /^ensure.+/
        constraints << Constraint.send("add_#{method_name}".to_sym, *args)
      else
        super
      end
    end

    #Keep respond_to? in sync with the names handled by method_missing
    def respond_to_missing?(method_name, include_private = false)
      method_name.to_s =~ /^ensure.+/ ? true : super
    end

    #The base filter has no S-expression representation
    def to_sexp
      nil
    end

    private
    #We don't want this to be accessible from outside
    def initialize(parent_pattern, example)
      @example_type = BaseFilter.determine_example_type(example)
      @parent_pattern = parent_pattern
      @sink = [] #output of a filter
      @source = [] #input of a filter
      @example = example
      @xpath = nil #The xpath to evaluate this filter
      @constraints = [] #list of constraints
    end
  end #End of class
end #End of module
@@ -0,0 +1,14 @@
1
module Scrubyt
  #Filter that hands a URL to the extractor's sub-extractor evaluation.
  class DetailPageFilter < BaseFilter

    #If +source+ is already a String it is used as the URL as-is; otherwise the
    #URL is the 'href' attribute of the node XPathUtils reports as nearest to
    #+source+ with such an attribute. The URL is passed to
    #evaluate_subextractor together with the parent pattern and the value of
    #its +resolve+, and whatever that call returns is returned.
    def evaluate(source)
      extractor = @parent_pattern.evaluation_context.extractor
      url = if source.is_a? String
              source
            else
              XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
            end
      extractor.evaluate_subextractor(url, @parent_pattern, @parent_pattern.resolve)
    end #end of method
  end #End of class DetailPageFilter
end #End of module Scrubyt
@@ -0,0 +1,49 @@
1
+ require 'net/http'
2
+ require 'fileutils'
3
+
4
module Scrubyt
  #Filter that downloads the file referenced by its input over HTTP and stores it
  #in the directory named by @example.
  class DownloadFilter < BaseFilter

    #Download +source+ and return the name (without directory) of the file written
    def evaluate(source)
      download_file(source)
    end #end of method

    #S-expression form of this filter: a :str node holding the example
    def to_sexp
      [:str, @example]
    end #end of method to_sexp

    #Return a path based on +file_name+ that does not exist on disk yet.
    #
    #If +file_name+ is free it is returned unchanged. Otherwise a counter is
    #inserted before the first dot of the *base name* (file.jpg => file_1.jpg,
    #file_2.jpg, ...) until an unused name is found. Dots in the directory part
    #are never touched, names without any dot get the counter appended, and the
    #string passed in is not modified.
    #
    #This is a class method, so it is public regardless of any +private+
    #section marker; it is kept public for callers that may rely on that.
    def self.find_nonexisting_file_name(file_name)
      return file_name unless File.exist?(file_name)
      last_slash = file_name.rindex('/')
      directory = last_slash ? file_name[0..last_slash] : ''
      base_name = last_slash ? file_name[(last_slash + 1)..-1] : file_name
      stem, dot, rest = base_name.partition('.')
      counter = 1
      loop do
        candidate = "#{directory}#{stem}_#{counter}#{dot}#{rest}"
        return candidate unless File.exist?(candidate)
        counter += 1
      end
    end #end of method

    private
    #Fetch +source+ from the host the extractor reports and write the response
    #body into the @example directory (created if needed). Returns '' without
    #downloading when +source+ is shorter than 4 characters.
    def download_file(source)
      host_name = @parent_pattern.evaluation_context.extractor.get_host_name
      outfile = nil
      #Net::HTTP.start wants the bare host, so strip the scheme and the path
      base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
      return '' if source.size < 4
      file_name = source.scan(/.+\/(.*)/)[0][0]
      Net::HTTP.start(base_url) { |http|
        resp = http.get(source)
        outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
        FileUtils.mkdir_p @example
        #File.open rather than Kernel#open: the target is always a plain file path
        File.open(outfile, 'wb') {|f| f.write(resp.body) }
      }
      File.basename(outfile)
    end
  end #End of class DownloadFilter
end #End of module Scrubyt