scrubber-scrubyt 0.4.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45)
  1. data/CHANGELOG +343 -0
  2. data/COPYING +340 -0
  3. data/README +99 -0
  4. data/Rakefile +101 -0
  5. data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
  6. data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
  7. data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
  8. data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
  9. data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
  10. data/lib/scrubyt/core/scraping/constraint.rb +169 -0
  11. data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
  12. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
  13. data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
  14. data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
  15. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
  16. data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
  17. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
  18. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
  19. data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
  20. data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
  21. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
  22. data/lib/scrubyt/core/scraping/pattern.rb +359 -0
  23. data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
  24. data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
  25. data/lib/scrubyt/core/shared/extractor.rb +167 -0
  26. data/lib/scrubyt/logging.rb +154 -0
  27. data/lib/scrubyt/output/post_processor.rb +139 -0
  28. data/lib/scrubyt/output/result.rb +44 -0
  29. data/lib/scrubyt/output/result_dumper.rb +154 -0
  30. data/lib/scrubyt/output/result_node.rb +140 -0
  31. data/lib/scrubyt/output/scrubyt_result.rb +42 -0
  32. data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
  33. data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
  34. data/lib/scrubyt/utils/shared_utils.rb +58 -0
  35. data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
  36. data/lib/scrubyt/utils/xpathutils.rb +202 -0
  37. data/lib/scrubyt.rb +43 -0
  38. data/test/blackbox_test.rb +60 -0
  39. data/test/blackbox_tests/basic/multi_root.rb +6 -0
  40. data/test/blackbox_tests/basic/simple.rb +5 -0
  41. data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
  42. data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
  43. data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
  44. data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
  45. metadata +115 -0
@@ -0,0 +1,112 @@
1
+ module Scrubyt
2
+ ##
3
+ #=<tt>Filter out relevant pieces from the parent pattern</tt>
4
+ #
5
+ #A Scrubyt extractor is almost like a waterfall: water is pouring from the top until
6
+ #it reaches the bottom. The biggest difference is that instead of water, a HTML
7
+ #document travels through the space.
8
+ #
9
+ #Of course Scrubyt would not make much sense if the same document would arrive at
10
+ #the bottom that was poured in at the top - since in this case we might use an
11
+ #indentity transformation (i.e. do nothing with the input) as well.
12
+ #
13
+ #This is where filters came in: as they name says, they filter the stuff that is
14
+ #pouring from above, to leave the interesting parts and discard the rest.
15
+ #The working of a filter will be explained most easily by the help of an example.
16
+ #Let's consider that we would like to extract information from a webshop; Concretely
17
+ #we are interested in the name of the items and the URL pointing to the image of the
18
+ #item.
19
+ #
20
+ #To accomplish this, first we select the items with the pattern item (a pattern is
21
+ #a logical grouping of fillters; see Pattern documentation) Then our new
22
+ #context is the result extracted by the 'item' pattern; For every 'item' pattern, further
23
+ #extract the name and the image of the item; and finally, extract the href attribute
24
+ #of the image. Let's see an illustration:
25
+ #
26
+ # root --> This pattern is called a 'root pattern', It is invisible to you
27
+ # | and basically it represents the document; it has no filters
28
+ # +-- item --> Filter what's coming from above (the whole document) to get
29
+ # | relevant pieces of data (in this case webshop items)
30
+ # +-- name --> Again, filter what's coming from above (a webshop item) and
31
+ # | leave only item names after this operation
32
+ # +-- image --> This time filter the image of the item
33
+ # |
34
+ # +-- href --> And finally, from the image elements, get the attribute 'href'
35
+ class BaseFilter
36
+ #Type of the example this filter is extracted with
37
+
38
+ #XPath example, like html/body/tr/td[1] etc.
39
+ EXAMPLE_TYPE_XPATH = 0
40
+ #String from the document, for example 'Canon EOS 300 D'.
41
+ EXAMPLE_TYPE_STRING = 1
42
+ #Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
43
+ EXAMPLE_TYPE_IMAGE = 2
44
+ #No example - the actual XPath is determined from the children XPaths (their LCA)
45
+ EXAMPLE_TYPE_CHILDREN = 3
46
+
47
+ #Regexp example, like /\d+@*\d+[a-z]/
48
+ EXAMPLE_TYPE_REGEXP = 4
49
+ #Compound example, like :contains => 'goodies'
50
+ EXAMPLE_TYPE_COMPOUND = 5
51
+
52
+ attr_accessor(:example_type, :parent_pattern, :temp_sink,
53
+ :constraints, :xpath, :regexp, :example, :final_result)
54
+
55
+ def self.create(parent_pattern, example=nil)
56
+ filter_name = (parent_pattern.type.to_s.split("_").map!{|e| e.capitalize }.join) + 'Filter'
57
+ if filter_name == 'RootFilter'
58
+ BaseFilter.new(parent_pattern, example)
59
+ else
60
+ instance_eval("#{filter_name}.new(parent_pattern, example)")
61
+ end
62
+ end
63
+
64
+ #Dispatcher method to add constraints; of course, as with any method_missing, this method
65
+ #should not be called directly
66
+
67
+ #TODO still used?
68
+ alias_method :throw_method_missing, :method_missing
69
+ def method_missing(method_name, *args, &block)
70
+ case method_name.to_s
71
+ when /^ensure.+/
72
+ constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
73
+ else
74
+ throw_method_missing(method_name, *args, &block)
75
+ end
76
+ end
77
+
78
+ private
79
+ #We don't want this to be accessible from outside
80
+ def initialize(parent_pattern, example)
81
+ case parent_pattern.example_type
82
+ when :xpath
83
+ @example_type = EXAMPLE_TYPE_XPATH
84
+ else
85
+ @example_type = BaseFilter.determine_example_type(example)
86
+ end
87
+ @parent_pattern = parent_pattern
88
+ @example = example
89
+ @xpath = nil #The xpath to evaluate this filter
90
+ @constraints = [] #list of constraints
91
+ end
92
+
93
+ def self.determine_example_type(example)
94
+ if example.instance_of? Regexp
95
+ EXAMPLE_TYPE_REGEXP
96
+ elsif example.instance_of? Hash
97
+ EXAMPLE_TYPE_COMPOUND
98
+ else
99
+ case example
100
+ when nil
101
+ EXAMPLE_TYPE_CHILDREN
102
+ when /\.(jpg|png|gif|jpeg)(\[\d+\])?$/
103
+ EXAMPLE_TYPE_IMAGE
104
+ when /^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)*(\[@.+=.+\])?(\/@.+)?$/
105
+ (example.include? '/' || example.include?('[')) ? EXAMPLE_TYPE_XPATH : EXAMPLE_TYPE_STRING
106
+ else
107
+ EXAMPLE_TYPE_STRING
108
+ end
109
+ end
110
+ end #end of method
111
+ end #End of class
112
+ end #End of module
@@ -0,0 +1,9 @@
1
module Scrubyt
  #Filter that ignores whatever arrives from the parent pattern and always
  #yields the constant example value it was configured with.
  class ConstantFilter < BaseFilter

    #Returns @example unconditionally; +source+ is accepted for interface
    #compatibility with the other filters but is never inspected.
    def evaluate(source)
      @example
    end

  end #End of class ConstantFilter
end #End of module Scrubyt
@@ -0,0 +1,37 @@
1
module Scrubyt
  #Filter that follows a link to a detail page and evaluates a separate
  #extractor (@parent_pattern.referenced_extractor) against that page.
  class DetailPageFilter < BaseFilter

    #Fetch the detail page referenced by +source+ and return the detail
    #extractor's root results.
    #
    #+source+ is either the URL itself (a String) or a document node, in which
    #case the nearest node carrying an 'href' attribute supplies the URL.
    def evaluate(source)
      if source.is_a?(String)
        url = source
      else
        url = XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
      end
      #Save the current page and host name so the extractor can be put back
      #into its original state after the detour to the detail page
      @parent_pattern.extractor.store_page
      original_host_name = @parent_pattern.extractor.get_host_name
      @parent_pattern.extractor.restore_host_name

      begin
        FetchAction.fetch url, :resolve => @parent_pattern.resolve
      rescue
        #NOTE(review): bare rescue swallows every StandardError (parse errors,
        #typos in FetchAction, ...), not only HTTP failures - consider logging
        #the actual exception
        Scrubyt.log :ERROR, "Couldn't get page, probably returned 404 or 500 status code"
      end


      #Build the detail extractor lazily on the first call; on subsequent
      #calls only re-evaluate it against the freshly fetched page
      if @detail_extractor.nil?
        @detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
        root_results = @detail_extractor.result
      else
        root_results = @detail_extractor.evaluate_extractor
      end



      #Return to the page we came from before yielding the detail results
      @parent_pattern.extractor.restore_page
      @parent_pattern.extractor.store_host_name original_host_name

      root_results
    end

  end
end
@@ -0,0 +1,64 @@
1
require 'net/http'
require 'fileutils'

module Scrubyt
  #Filter that downloads the file referenced by its input to the directory
  #given as the filter's example (@example), returning the saved file name.
  class DownloadFilter < BaseFilter

    #Download the file referenced by +source+ (an absolute or relative URL
    #string); returns the basename the file was stored under, '' for inputs
    #too short to be a URL, or nil when the file is on the except list.
    def evaluate(source)
      download_file(source)
    end #end of method

    private
    def download_file(source)
      return '' if source.size < 4
      #Absolute URLs carry their own host; relative ones borrow the
      #extractor's current host
      host_name = (source =~ /^http/ ? source : @parent_pattern.extractor.get_host_name)
      outfile = nil
      host_name += "/" if host_name[-1..-1] != "/"
      base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
      file_name = source.scan(/.+\/(.*)/)[0][0]
      return nil if @parent_pattern.except.include? file_name
      Net::HTTP.start(base_url) { |http|
        Scrubyt.log :INFO, "downloading: #{source.scan(/\s*(.+)/)[0][0]}"
        begin
          ua = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)'
          #[0][0] added: scan returns [["..."]] and Net::HTTP#get needs a
          #String path, not an Array (the original passed the raw scan result)
          #NOTE(review): for relative sources this path is derived from the
          #host name, not from +source+ - verify it reaches the right file
          path = host_name.scan(/http:\/\/#{base_url}(.+)\//)[0][0]
          resp = http.get(path, {'User-Agent'=> ua})
          outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
          FileUtils.mkdir_p @example
          open(outfile, 'wb') {|f| f.write(resp.body) }
        rescue Timeout::Error
          #Mark the download as failed but keep going; the marker ends up in
          #the extraction result
          outfile = "[FAILED]#{file_name}"
        end
      }
      outfile.scan(/.+\/(.*)/)[0][0]
    end

    #Return a file name that does not exist yet: first collision appends _1
    #(before the extension if there is one), later collisions bump the counter.
    def self.find_nonexisting_file_name(file_name)
      already_found = false
      loop do
        #File.exist? - the exists? alias was removed in Ruby 3.2
        if File.exist? file_name
          if already_found
            if file_name.include?('.')
              last_no = file_name.scan(/_(\d+)\./)[0][0]
              file_name.sub!(/_#{last_no}\./) {"_#{(last_no.to_i+1).to_s}."}
            else
              last_no = file_name.scan(/_(\d+)$/)[0][0]
              file_name.sub!(/_#{last_no}$/) {"_#{(last_no.to_i+1).to_s}"}
            end
          else
            if file_name.include?('.')
              file_name.sub!(/\./) {"_1\."}
              already_found = true
            else
              file_name << '_1'
              already_found = true
            end
          end
        else
          break
        end
      end
      file_name
    end #end of method
  end #End of class DownloadFilter
end #End of module Scrubyt
@@ -0,0 +1,9 @@
1
module Scrubyt
  #Filter that yields the raw markup inside the matched element instead of
  #its extracted text.
  class HtmlSubtreeFilter < BaseFilter

    #Serialize the children of +source+ (an Hpricot element) to an HTML string.
    def evaluate(source)
      source.inner_html
    end

  end #End of class HtmlSubtreeFilter
end #End of module Scrubyt
@@ -0,0 +1,13 @@
1
module Scrubyt
  #Filter that applies the example Regexp (@example) to its input and yields
  #the flattened list of captures/matches.
  class RegexpFilter < BaseFilter

    #Scan +source+ with @example. A non-String source (an Hpricot node) is
    #first reduced to its tag-stripped inner HTML.
    def evaluate(source)
      text = source.is_a?(String) ? source : source.inner_html.gsub(/<.*?>/, '')
      text.scan(@example).flatten
    end

  end #End of class RegexpFilter
end #End of module Scrubyt
@@ -0,0 +1,11 @@
1
module Scrubyt
  #Filter that delegates extraction to a user-supplied callable (@example),
  #passing it the textual content of the input.
  class ScriptFilter < BaseFilter

    #Call @example with +source+ itself when it is a String, otherwise with
    #the tag-stripped inner HTML of the node.
    def evaluate(source)
      if source.is_a? String
        @example.call source
      else
        @example.call source.inner_html.gsub(/<.*?>/, "")
      end
    end

  end #End of class ScriptFilter
end #End of module Scrubyt
@@ -0,0 +1,34 @@
1
module Scrubyt
  #Filter driven by a textual example of the form "element[text]:index",
  #e.g. "td[Canon]:0", "td[Canon]:all", or "find(foo|bar)".
  class TextFilter < BaseFilter

    #Evaluate the text example against +source+ and return the matching
    #element(s) (walked up to the element named in the example), [] / "" when
    #nothing matches.
    def evaluate(source)
      return find_string(source) if @example =~ /^find\(/
      final_element_name = @example.scan(/^(.+?)\[/)[0][0]
      text = Regexp.escape(@example.scan(/\[(.+?)\]/)[0][0])

      #Optional trailing index: "]:3" picks one match, "]:all" keeps them all,
      #absent means index 0 (nil.to_i == 0).
      #Rewritten with an explicit flag: the original kept index as either an
      #Integer or ["all"] and tested it with `index.is_a? Fixnum` - Fixnum was
      #removed in Ruby 3.2, and `index[0] == "all"` on an Integer silently
      #performed bit access instead of the intended comparison.
      index_spec = @example.scan(/\]:(.+)/).flatten[0]
      all_matches = (index_spec == "all")
      matches = SharedUtils.traverse_for_match(source, /#{text}/)
      result = all_matches ? matches : matches[index_spec.to_i]
      return "" unless result

      if all_matches
        result.inject([]) {|a,r| a << XPathUtils.traverse_up_until_name(r,final_element_name); a}
      else
        [XPathUtils.traverse_up_until_name(result,final_element_name)]
      end
    end

    #Handle "find(a|b|...)" examples: return the first alternative that occurs
    #(case-insensitively) anywhere in +source+, wrapped in an array, or [].
    def find_string(source)
      str = @example.scan(/find\((.+)\)/).flatten[0]
      strings_to_find = str.include?('|') ? str.split('|') : [str]
      strings_to_find.each do |s|
        result = SharedUtils.traverse_for_match(source,/#{s}/i)
        return [s] unless result.empty?
      end
      return []
    end

  end #End of class TextFilter
end #End of module Scrubyt
34
+
@@ -0,0 +1,138 @@
1
module Scrubyt
  #The workhorse filter: evaluates an XPath (generated from the example)
  #against the document tree, optionally narrowing the result with a
  #generated regexp or an attribute selector.
  class TreeFilter < BaseFilter

    #Evaluate this filter against +source+ (an Hpricot document/element).
    #Returns the matched elements, their attribute values when the XPath ends
    #in /@attr, or regexp capture strings when @regexp is set.
    def evaluate(source)
      #A string example may already have produced a final result at XPath
      #generation time; just replay it
      return [@final_result] if @final_result
      #Crude hack! Drop it after it will be supported in Hpricot
      if @xpath =~ /.+\/@.+$/
        #Hpricot can't select attributes, so strip the /@attr tail off the
        #XPath and remember it in @example for the attribute step below
        @example = @xpath
        @xpath = @xpath.scan(/^(.+?)\/@/)[0][0]
      end
      result = source/@xpath

      Scrubyt.log :ACTION, "Evaluating #{@parent_pattern.name} with #{@xpath}"

      #Normalize: a single node becomes a one-element list
      xpath_results = Hpricot::Elements === result ? result : [result]

      if @example =~ /.+\/@.+$/
        #Attribute selection deferred from the hack above
        result_attribute = @example.scan(/.+\/@(.+?)$/)[0][0]
        xpath_results.map! {|r| r.attributes[result_attribute] }
      end
      if @regexp == nil
        xpath_results
      else
        regexp_results = []
        xpath_results.each do |entry|
          #NOTE(review): this uses result.inner_html, not entry - every
          #iteration scans the same (whole) result; looks like it was meant
          #to be entry.inner_html - confirm before changing
          text = SharedUtils.prepare_text_for_comparison(result.inner_html)
          if text =~ @regexp
            regexp_results << $1
          end
        end
        regexp_results
      end
    end

    #For string examples matched inside a larger text node, build a regexp
    #that re-selects the example's position: changing parts of the text become
    #.*? and the selected part becomes the (.*?) capture. No-op unless the
    #example is a string, a node sink exists, and the match is a proper
    #substring of the node text.
    def generate_regexp_for_example
      return if @example_type != EXAMPLE_TYPE_STRING
      return if @temp_sink.nil?
      return if @temp_sink.is_a? String
      return if @example =~ /.+\[.+\]$/

      text = SharedUtils.prepare_text_for_comparison(@temp_sink.inner_html.gsub(/<.*?>/, ''))
      match_range = @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0)
      return if match_range == (0..text.length)

      @regexp = text
      #Replace ranges back-to-front so earlier offsets stay valid
      @temp_sink.changing_ranges.sort.reverse.each do |range|
        @regexp[range] = if range == match_range then '<<<regexp_selection>>>' else '<<<regexp_changing>>>' end
      end
      #Escape first, then swap the placeholders for regexp syntax
      @regexp = Regexp.escape(@regexp)
      @regexp = @regexp.gsub('<<<regexp_changing>>>', '.*?')
      @regexp = @regexp.gsub('<<<regexp_selection>>>', '(.*?)')
      @regexp = '^' + @regexp + '$'
      @regexp = /#{@regexp}/
    end


    #For all the tree patterns, generate an XPath based on the given example
    #Also this method should not be called directly; It is automatically called for every tree
    #pattern directly after wrapper definition
    def generate_XPath_for_example(next_page_example=false)
      #puts "generating example for: #{@parent_pattern.name}"
      #puts @example_type
      case @example_type
      when EXAMPLE_TYPE_XPATH
        #Trivial case: the example already is the XPath
        @xpath = @example
      when EXAMPLE_TYPE_STRING
        @temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.extractor.get_hpricot_doc,
                                                             @example,
                                                             next_page_example)
        return if @temp_sink == nil
        #A String sink means the lookup already produced the final value
        if @temp_sink.is_a? String
          @final_result = @temp_sink
          return
        end

        #Attach the matched character range to the sink node so that
        #generate_regexp_for_example can later tell selected from changing text
        mark_changing_ranges = lambda { |element, range|
          element.instance_eval do
            @changing_ranges ||= [] << range
            def changing_ranges
              @changing_ranges
            end
          end
        }
        mark_changing_ranges.call(@temp_sink, @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0))
        #Next-page links must stay exact; otherwise indices depend on whether
        #the pattern generalizes
        write_indices = next_page_example ? true : !@parent_pattern.generalize
        @xpath = XPathUtils.generate_XPath(@temp_sink, nil, write_indices)
      when EXAMPLE_TYPE_CHILDREN
        #No own example: derive this pattern's XPath as the lowest common
        #ancestor of the children's sinks, one filter per example index
        current_example_index = 0
        loop do
          all_child_temp_sinks = []
          @parent_pattern.children.each do |child_pattern|
            all_child_temp_sinks << child_pattern.filters[current_example_index].temp_sink if child_pattern.filters[current_example_index].temp_sink
          end
          result = all_child_temp_sinks.pop
          if all_child_temp_sinks.empty?
            #Single child: its parent is the common ancestor
            result = result.parent
          else
            all_child_temp_sinks.each do |child_sink|
              result = XPathUtils.lowest_common_ancestor(result, child_sink)
            end
          end
          xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(result, nil, false) :
                                               XPathUtils.generate_XPath(result, nil, true)
          #Grow the filter list on demand for additional example indices
          if @parent_pattern.filters.size < current_example_index + 1
            @parent_pattern.filters << Scrubyt::BaseFilter.create(@parent_pattern)
          end
          @parent_pattern.filters[current_example_index].xpath = xpath
          @parent_pattern.filters[current_example_index].temp_sink = result
          #Rewrite each child's XPath relative to the freshly found ancestor
          @parent_pattern.children.each do |child_pattern|
            next if child_pattern.type == :detail_page
            child_pattern.filters[current_example_index].xpath =
              child_pattern.generalize ? XPathUtils.generate_generalized_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result) :
                                         XPathUtils.generate_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result)
          end
          break if @parent_pattern.children[0].filters.size == current_example_index + 1
          current_example_index += 1
        end
      when EXAMPLE_TYPE_IMAGE
        @temp_sink = XPathUtils.find_image(@parent_pattern.extractor.get_hpricot_doc, @example)
        @xpath = XPathUtils.generate_XPath(@temp_sink, nil, true)
      when EXAMPLE_TYPE_COMPOUND
        @temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.extractor.get_hpricot_doc,
                                                                          @example,
                                                                          next_page_example)
        @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
                                              XPathUtils.generate_XPath(@temp_sink, nil, true)
      end
    end

    #Rewrite @xpath to be relative to +parent_xpath+ (expanding attribute
    #predicates in the parent first), but only when @xpath is still absolute.
    def generate_relative_XPath(parent_xpath)
      parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.extractor.get_hpricot_doc,
                                              parent_xpath,
                                              @parent_pattern.parent.generalize) if parent_xpath =~ /(\[@.+=.+\])$/
      @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
    end

  end #End of class TreeFilter
end #End of module Scrubyt