sutch-scrubyt 0.4.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. data/CHANGELOG +350 -0
  2. data/COPYING +340 -0
  3. data/README +121 -0
  4. data/Rakefile +101 -0
  5. data/lib/scrubyt.rb +45 -0
  6. data/lib/scrubyt/core/navigation/agents/firewatir.rb +253 -0
  7. data/lib/scrubyt/core/navigation/agents/mechanize.rb +289 -0
  8. data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
  9. data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
  10. data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
  11. data/lib/scrubyt/core/scraping/constraint.rb +169 -0
  12. data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
  13. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
  14. data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
  15. data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
  16. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
  17. data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
  18. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
  19. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
  20. data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
  21. data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
  22. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
  23. data/lib/scrubyt/core/scraping/pattern.rb +359 -0
  24. data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
  25. data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
  26. data/lib/scrubyt/core/shared/extractor.rb +168 -0
  27. data/lib/scrubyt/logging.rb +154 -0
  28. data/lib/scrubyt/output/post_processor.rb +139 -0
  29. data/lib/scrubyt/output/result.rb +44 -0
  30. data/lib/scrubyt/output/result_dumper.rb +154 -0
  31. data/lib/scrubyt/output/result_node.rb +140 -0
  32. data/lib/scrubyt/output/scrubyt_result.rb +42 -0
  33. data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
  34. data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
  35. data/lib/scrubyt/utils/shared_utils.rb +58 -0
  36. data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
  37. data/lib/scrubyt/utils/xpathutils.rb +202 -0
  38. data/test/blackbox_test.rb +60 -0
  39. data/test/blackbox_tests/basic/multi_root.rb +6 -0
  40. data/test/blackbox_tests/basic/simple.rb +5 -0
  41. data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
  42. data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
  43. data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
  44. data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
  45. metadata +117 -0
@@ -0,0 +1,169 @@
1
module Scrubyt
  ##
  #=<tt>Rejecting result instances based on further rules</tt>
  #
  #The two most trivial problems with a set of rules are that they match either fewer
  #or more instances than we would like them to. Constraints are a way to remedy the
  #second problem: they serve as a tool to filter out some result instances based on
  #rules. A typical example:
  #
  #* *ensure_presence_of_pattern* consider this model:
  #  <book>
  #    <author>...</author>
  #    <title>...</title>
  #  </book>
  #
  #If I attach the *ensure_presence_of_pattern* constraint to the pattern 'book' with
  #values 'author' and 'title', only those books will be matched which have an author
  #and a title (i.e. the child patterns author and title must extract something). This
  #is a way to say 'a book MUST have an author and a title'.
  #
  #Instances are built through the add_ensure_* factory methods; each instance stores
  #a +target+ (what to look for) and a +type+ (one of the CONSTRAINT_TYPE_* constants).
  class Constraint
    #There are more possible ways of applying/checking constraints in the case of
    #ones that can not be checked in the context node (e.g. ensure_presence_of -
    #since it may require the evaluation of child patterns of the context pattern to
    #arbitrary level)
    #
    #In such cases, the possibilities are:
    #
    #1) make a depth-first evaluation from the context pattern until the needed ancestor
    #   pattern is evaluated. This can mess things up, since if any ancestor node uses
    #   the sinks of predecessor(s) other than the context node, those need to be evaluated
    #   too, and we may run into a cyclic dependency or at least a complicated recursion
    #
    #2) Post processing - evaluate normally and throw out results which do not pass the
    #   constraint
    #
    #2b) Do it on the XML level - most probably this solution will be implemented

    # Different constraint types
    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN = 0
    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE = 1
    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE = 2
    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE = 3
    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE = 4

    attr_reader :type, :target

    #Add 'ensure presence of ancestor pattern' constraint
    #
    #If this type of constraint is added to a pattern, it must have an ancestor pattern
    #(child pattern, or child pattern of a child pattern, etc.) denoted by "ancestor".
    #'Has an ancestor pattern' means that the ancestor pattern actually extracts something
    #(just by looking at the wrapper model, the ancestor pattern is always present).
    #Note that from this type of constraint there is no 'ensure_absence' version, since
    #I could not think of a use case for that.
    def self.add_ensure_presence_of_pattern(ancestor)
      Constraint.new(ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN)
    end

    #Add 'ensure absence of attribute' constraint
    #
    #If this type of constraint is added to a pattern, the HTML node it targets
    #must NOT have the attributes given in "attribute_hash" (name => value; a nil
    #value stands for 'any value').
    def self.add_ensure_absence_of_attribute(attribute_hash)
      Constraint.new(attribute_hash,
                     CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
    end

    #Add 'ensure presence of attribute' constraint
    #
    #If this type of constraint is added to a pattern, the HTML node it targets
    #must have every attribute given in "attribute_hash" (name => value; a nil
    #value stands for 'any value').
    def self.add_ensure_presence_of_attribute(attribute_hash)
      Constraint.new(attribute_hash,
                     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
    end

    #Add 'ensure absence of ancestor node' constraint
    #
    #If this type of constraint is added to a pattern, the HTML node extracted by the pattern
    #must NOT contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
    #
    #"attributes" is a list of attribute name/value entries; the check itself reads
    #element [0] of every entry as the attribute name and element [1] as its value,
    #i.e. it works on [name, value] pairs such as
    #  [['font', 'red'], ['href', 'http://www.google.com']]
    #
    #"attributes" can be empty - in this case just the 'node_name' is checked
    def self.add_ensure_absence_of_ancestor_node(node_name, attributes)
      Constraint.new([node_name, attributes],
                     CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
    end

    #Add 'ensure presence of ancestor node' constraint
    #
    #If this type of constraint is added to a pattern, the HTML node extracted by the pattern
    #must contain a HTML ancestor node called 'node_name'; when 'attributes' is not empty,
    #at least one of the given name/value entries has to match such a node.
    #
    #"attributes" is a list of attribute name/value entries; the check itself reads
    #element [0] of every entry as the attribute name and element [1] as its value,
    #i.e. it works on [name, value] pairs such as
    #  [['font', 'red'], ['href', 'http://www.google.com']]
    #
    #"attributes" can be empty - in this case just the 'node_name' is checked
    def self.add_ensure_presence_of_ancestor_node(node_name, attributes)
      Constraint.new([node_name, attributes],
                     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
    end

    #Evaluate the constraint; if this function returns true,
    #it means that the constraint passed, i.e. its filter will be added to the extracted
    #content of the pattern.
    #
    #Returns nil when the constraint type is none of the CONSTRAINT_TYPE_* values
    #(no branch of the case matches).
    def check(result)
      case @type
      #checked after evaluation, so here always return true
      when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN
        true
      when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE
        attribute_present(result)
      when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE
        !attribute_present(result)
      when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
        ancestor_node_present(result)
      when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
        !ancestor_node_present(result)
      end
    end

    private

    #We would not like this to be called from outside; use the add_ensure_* factories
    def initialize(target, type)
      @target = target
      @type = type
    end

    #Implementation of the ancestor node presence test.
    #
    #@target is [node_name, attributes]. The lookup is done with result.search and
    #the XPath "//node_name" (optionally with an [@name='value'] predicate), so the
    #nodes are looked up below +result+. Returns true as soon as one entry matches,
    #false otherwise.
    def ancestor_node_present(result)
      node_name = @target[0]
      node_attributes = @target[1]
      #without attributes only the node name has to be found
      return !result.search("//#{node_name}").empty? if node_attributes.empty?

      node_attributes.any? do |pair|
        !result.search("//#{node_name}[@#{pair[0]}='#{pair[1]}']").empty?
      end
    end

    #Implementation of the attribute presence test.
    #
    #Returns nil (not false) when +result+ is not a Hpricot::Elem; as a consequence
    #the 'absence' constraint, which negates this value, passes for such results.
    #Otherwise returns true only if every entry of @target matches.
    def attribute_present(result)
      return unless result.is_a? Hpricot::Elem

      @target.all? do |name, value|
        actual = result.attributes[name.to_s]
        #If value is nil, the value of the attribute can be arbitrary; in this case we
        #just have to make sure that the attribute is present, we don't care about its value
        value.nil? ? !actual.nil? : actual == value.to_s
      end
    end

  end #end of class
end #end of module
@@ -0,0 +1,49 @@
1
module Scrubyt
  ##
  #=<tt>Utility class for adding constraints</tt>
  #
  #Originally methods of Pattern - but since Pattern was already too heavy (and after
  #all, adding a constraint (logically) does not belong to Pattern anyway) it was moved
  #to this utility class. In pattern everything that begins with ensure_
  #is automatically dispatched here.
  #
  #The ensure_* functions are just forwarders; see the 'real' functions with their
  #documentation in Scrubyt::Constraint. The only work done here is the normalization
  #of the attribute specification for the two *_ancestor_node constraints.
  class ConstraintAdder

    def self.ensure_presence_of_pattern(ancestor_node_name)
      Constraint.add_ensure_presence_of_pattern(ancestor_node_name)
    end

    def self.ensure_presence_of_ancestor_node(ancestor_node_name, attributes=[])
      Constraint.add_ensure_presence_of_ancestor_node(ancestor_node_name,
                                                      prepare_attributes(attributes))
    end

    def self.ensure_absence_of_ancestor_node(ancestor_node_name, attributes=[])
      Constraint.add_ensure_absence_of_ancestor_node(ancestor_node_name,
                                                     prepare_attributes(attributes))
    end

    def self.ensure_presence_of_attribute(attribute_hash)
      Constraint.add_ensure_presence_of_attribute(attribute_hash)
    end

    def self.ensure_absence_of_attribute(attribute_hash)
      Constraint.add_ensure_absence_of_attribute(attribute_hash)
    end

    #Normalize an attribute specification into a flat list of [key, value] pairs.
    #
    #Accepted forms of "attributes":
    #* a hash:              {'class' => ['small', 'wide'], 'id' => 'x'}
    #* an array of pairs:   [['class', 'small'], ['id', 'x']]
    #* an array of hashes:  [{'font' => 'red'}, {'href' => 'http://www.google.com'}]
    #
    #A value that is an Array is expanded to one pair per element, so
    #{'class' => ['small', 'wide']} yields [['class', 'small'], ['class', 'wide']].
    #
    #This method stays publicly callable: a bare 'private' has no effect on methods
    #defined with 'def self.', so it never was private.
    def self.prepare_attributes(attributes)
      entries = []
      if attributes.is_a?(Hash)
        entries = attributes.to_a
      else
        attributes.each do |entry|
          #a hash inside the array contributes all of its key/value pairs
          if entry.is_a?(Hash)
            entries.concat(entry.to_a)
          else
            entries << entry
          end
        end
      end

      attribute_pairs = []
      entries.each do |key, value|
        if value.instance_of?(Array)
          value.each { |single| attribute_pairs << [key, single] }
        else
          attribute_pairs << [key, value]
        end
      end
      attribute_pairs
    end #end of method prepare_attributes
  end #end of class ConstraintAdder
end #end of module Scrubyt
@@ -0,0 +1,14 @@
1
module Scrubyt
  #Filter that extracts the value of a single attribute; the attribute name is
  #the filter's example (@example).
  class AttributeFilter < BaseFilter

    #Ask XPathUtils.find_nearest_node_with_attribute for a node (starting from
    #+source+) carrying the attribute named by @example.
    #
    #Returns a one-element array holding the attribute value when the lookup
    #yields a Hpricot::Elem, nil otherwise.
    def evaluate(source)
      node = XPathUtils.find_nearest_node_with_attribute(source, @example)
      return nil unless node.is_a?(Hpricot::Elem)

      [node.attributes[@example]]
    end

  end #End of class AttributeFilter
end #End of module Scrubyt
@@ -0,0 +1,112 @@
1
module Scrubyt
  ##
  #=<tt>Filter out relevant pieces from the parent pattern</tt>
  #
  #A Scrubyt extractor is almost like a waterfall: water is pouring from the top until
  #it reaches the bottom. The biggest difference is that instead of water, a HTML
  #document travels through the space.
  #
  #Of course Scrubyt would not make much sense if the same document would arrive at
  #the bottom that was poured in at the top - since in this case we might use an
  #identity transformation (i.e. do nothing with the input) as well.
  #
  #This is where filters come in: as their name says, they filter the stuff that is
  #pouring from above, to leave the interesting parts and discard the rest.
  #The working of a filter will be explained most easily by the help of an example.
  #Let's consider that we would like to extract information from a webshop; Concretely
  #we are interested in the name of the items and the URL pointing to the image of the
  #item.
  #
  #To accomplish this, first we select the items with the pattern item (a pattern is
  #a logical grouping of filters; see Pattern documentation) Then our new
  #context is the result extracted by the 'item' pattern; For every 'item' pattern, further
  #extract the name and the image of the item; and finally, extract the href attribute
  #of the image. Let's see an illustration:
  #
  #   root       --> This pattern is called a 'root pattern', It is invisible to you
  #   |              and basically it represents the document; it has no filters
  #   +-- item   --> Filter what's coming from above (the whole document) to get
  #       |          relevant pieces of data (in this case webshop items)
  #       +-- name   --> Again, filter what's coming from above (a webshop item) and
  #       |              leave only item names after this operation
  #       +-- image  --> This time filter the image of the item
  #           |
  #           +-- href  --> And finally, from the image elements, get the attribute 'href'
  class BaseFilter
    #Type of the example this filter is extracted with

    #XPath example, like html/body/tr/td[1] etc.
    EXAMPLE_TYPE_XPATH = 0
    #String from the document, for example 'Canon EOS 300 D'.
    EXAMPLE_TYPE_STRING = 1
    #Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
    EXAMPLE_TYPE_IMAGE = 2
    #No example - the actual XPath is determined from the children XPaths (their LCA)
    EXAMPLE_TYPE_CHILDREN = 3

    #Regexp example, like /\d+@*\d+[a-z]/
    EXAMPLE_TYPE_REGEXP = 4
    #Compound example, like :contains => 'goodies'
    EXAMPLE_TYPE_COMPOUND = 5

    attr_accessor(:example_type, :parent_pattern, :temp_sink,
                  :constraints, :xpath, :regexp, :example, :final_result)

    #Factory: build the filter matching the type of +parent_pattern+.
    #
    #The class name is derived from parent_pattern.type by camel-casing it and
    #appending 'Filter' (:detail_page => DetailPageFilter). The type :root maps
    #to BaseFilter itself. The class is resolved with const_get instead of
    #evaluating a generated source string, so only a constant lookup can happen;
    #an unknown type raises NameError.
    def self.create(parent_pattern, example=nil)
      filter_name = parent_pattern.type.to_s.split('_').map { |part| part.capitalize }.join + 'Filter'
      return BaseFilter.new(parent_pattern, example) if filter_name == 'RootFilter'

      Scrubyt.const_get(filter_name).new(parent_pattern, example)
    end

    #Classify +example+ into one of the EXAMPLE_TYPE_* constants.
    #
    #Regexp => REGEXP, Hash => COMPOUND, nil => CHILDREN; a string ending in an
    #image extension (optionally followed by an [index]) => IMAGE; a string with
    #the shape of an absolute XPath => XPATH; everything else => STRING.
    def self.determine_example_type(example)
      if example.instance_of? Regexp
        EXAMPLE_TYPE_REGEXP
      elsif example.instance_of? Hash
        EXAMPLE_TYPE_COMPOUND
      else
        case example
        when nil
          EXAMPLE_TYPE_CHILDREN
        when /\.(jpg|png|gif|jpeg)(\[\d+\])?$/
          EXAMPLE_TYPE_IMAGE
        when /^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)*(\[@.+=.+\])?(\/@.+)?$/
          #Both include? calls need their own parentheses: without them the '||'
          #binds to the argument and only '/' is ever tested.
          (example.include?('/') || example.include?('[')) ? EXAMPLE_TYPE_XPATH : EXAMPLE_TYPE_STRING
        else
          EXAMPLE_TYPE_STRING
        end
      end
    end #end of method

    #Dispatcher method to add constraints; of course, as with any method_missing, this method
    #should not be called directly
    #
    #Every call whose name starts with 'ensure' is forwarded to Constraint.add_<name>
    #and the returned constraint is appended to +constraints+; all other names go to
    #the default method_missing.
    #
    #NOTE(review): the filter itself is passed as the first argument of the add_*
    #call - confirm that the Constraint factories in use accept it.
    #TODO still used?
    alias_method :throw_method_missing, :method_missing
    def method_missing(method_name, *args, &block)
      case method_name.to_s
      when /^ensure.+/
        constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
      else
        throw_method_missing(method_name, *args, &block)
      end
    end

    #Keep respond_to? in sync with the names handled by method_missing
    def respond_to_missing?(method_name, include_private = false)
      method_name.to_s =~ /^ensure.+/ ? true : super
    end

    private

    #We don't want this to be accessible from outside; use BaseFilter.create.
    #An :xpath example type on the parent pattern forces EXAMPLE_TYPE_XPATH,
    #otherwise the type is derived from the example itself.
    def initialize(parent_pattern, example)
      case parent_pattern.example_type
      when :xpath
        @example_type = EXAMPLE_TYPE_XPATH
      else
        @example_type = BaseFilter.determine_example_type(example)
      end
      @parent_pattern = parent_pattern
      @example = example
      @xpath = nil #The xpath to evaluate this filter
      @constraints = [] #list of constraints
    end
  end #End of class
end #End of module
@@ -0,0 +1,9 @@
1
module Scrubyt
  #Filter whose result does not depend on the document: it always yields the
  #example it was created with.
  class ConstantFilter < BaseFilter

    #+source+ is ignored; the stored example (@example) is the result.
    def evaluate(source)
      @example
    end

  end #End of class ConstantFilter
end #End of module Scrubyt
@@ -0,0 +1,37 @@
1
module Scrubyt
  #Filter that follows a link to a detail page and runs a nested extractor on it.
  class DetailPageFilter < BaseFilter

    #Fetch the detail page referenced by +source+ and return the nested
    #extractor's results.
    #
    #+source+ is either the URL itself (a String) or a node from which the 'href'
    #attribute is taken via XPathUtils.find_nearest_node_with_attribute.
    #
    #The parent extractor's page and host name are stored before the fetch and
    #put back afterwards. A failing fetch is only logged; evaluation continues.
    def evaluate(source)
      url = if source.is_a?(String)
              source
            else
              XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
            end

      @parent_pattern.extractor.store_page
      original_host_name = @parent_pattern.extractor.get_host_name
      @parent_pattern.extractor.restore_host_name

      begin
        FetchAction.fetch(url, :resolve => @parent_pattern.resolve)
      rescue
        Scrubyt.log(:ERROR, "Couldn't get page, probably returned 404 or 500 status code")
      end

      #The nested extractor is created on first use and re-evaluated afterwards
      root_results =
        if @detail_extractor.nil?
          @detail_extractor = Extractor.new(@parent_pattern.extractor.mode,
                                            @parent_pattern.referenced_extractor)
          @detail_extractor.result
        else
          @detail_extractor.evaluate_extractor
        end

      @parent_pattern.extractor.restore_page
      @parent_pattern.extractor.store_host_name(original_host_name)

      root_results
    end

  end
end
@@ -0,0 +1,64 @@
1
+ require 'net/http'
2
+ require 'fileutils'
3
+
4
module Scrubyt
  #Filter that downloads the file referenced by its source into the directory
  #given as the filter's example (@example).
  class DownloadFilter < BaseFilter

    #Download +source+; see download_file for the return values.
    def evaluate(source)
      download_file(source)
    end #end of method

    #Return a path based on +file_name+ that does not exist on disk yet.
    #
    #If +file_name+ is free it is returned as is. Otherwise a counter is added to
    #the file's base name - before its first dot, or at the end if it has none -
    #and increased until the path is free:
    #  dir/a.jpg => dir/a_1.jpg => dir/a_2.jpg ...
    #  dir/file  => dir/file_1  => dir/file_2 ...
    #
    #Only the part after the last '/' is touched, so dots in directory names
    #(e.g. './downloads') are left alone. +file_name+ itself is not modified.
    def self.find_nonexisting_file_name(file_name)
      return file_name unless File.exist?(file_name)

      last_slash = file_name.rindex('/')
      dir_part = last_slash ? file_name[0..last_slash] : ''
      base = last_slash ? file_name[(last_slash + 1)..-1] : file_name
      first_dot = base.index('.')
      stem = first_dot ? base[0...first_dot] : base
      rest = first_dot ? base[first_dot..-1] : ''

      counter = 1
      candidate = "#{dir_part}#{stem}_#{counter}#{rest}"
      while File.exist?(candidate)
        counter += 1
        candidate = "#{dir_part}#{stem}_#{counter}#{rest}"
      end
      candidate
    end #end of method

    private

    #Fetch +source+ over HTTP and store the response body below @example.
    #
    #Returns '' for sources shorter than 4 characters, nil if the file name is
    #listed in @parent_pattern.except, "[FAILED]<file name>" after a timeout and
    #the name of the written file (without directory) otherwise.
    #
    #NOTE(review): only 'http://' URLs are recognized by the host pattern below.
    def download_file(source)
      return '' if source.size < 4
      host_name = (source =~ /^http/ ? source : @parent_pattern.extractor.get_host_name)
      host_name += "/" if host_name[-1..-1] != "/"
      base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
      file_name = source.scan(/.+\/(.*)/)[0][0]
      return nil if @parent_pattern.except.include? file_name

      outfile = nil
      Net::HTTP.start(base_url) do |http|
        Scrubyt.log :INFO, "downloading: #{source.scan(/\s*(.+)/)[0][0]}"
        begin
          ua = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)'
          #The request path has to be a String. The former scan call returned a nested
          #Array, which only worked while Array#to_s joined the elements (Ruby 1.8).
          #NOTE(review): for a relative +source+ nothing follows the host in host_name,
          #so the path is '' here - confirm the intended request path for that case.
          path = host_name[/http:\/\/#{Regexp.escape(base_url)}(.+)\//, 1].to_s
          resp = http.get(path, {'User-Agent'=> ua})
          outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
          FileUtils.mkdir_p @example
          #File.open instead of Kernel#open: the path must never be treated as a command
          File.open(outfile, 'wb') {|f| f.write(resp.body) }
        rescue Timeout::Error
          outfile = "[FAILED]#{file_name}"
        end
      end
      #Strip the directory; the "[FAILED]..." marker contains no '/' and is returned whole
      outfile[/.+\/(.*)/, 1] || outfile
    end
  end #End of class DownloadFilter
end #End of module Scrubyt