scrubber-scrubyt 0.4.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. data/CHANGELOG +343 -0
  2. data/COPYING +340 -0
  3. data/README +99 -0
  4. data/Rakefile +101 -0
  5. data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
  6. data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
  7. data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
  8. data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
  9. data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
  10. data/lib/scrubyt/core/scraping/constraint.rb +169 -0
  11. data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
  12. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
  13. data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
  14. data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
  15. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
  16. data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
  17. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
  18. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
  19. data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
  20. data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
  21. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
  22. data/lib/scrubyt/core/scraping/pattern.rb +359 -0
  23. data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
  24. data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
  25. data/lib/scrubyt/core/shared/extractor.rb +167 -0
  26. data/lib/scrubyt/logging.rb +154 -0
  27. data/lib/scrubyt/output/post_processor.rb +139 -0
  28. data/lib/scrubyt/output/result.rb +44 -0
  29. data/lib/scrubyt/output/result_dumper.rb +154 -0
  30. data/lib/scrubyt/output/result_node.rb +140 -0
  31. data/lib/scrubyt/output/scrubyt_result.rb +42 -0
  32. data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
  33. data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
  34. data/lib/scrubyt/utils/shared_utils.rb +58 -0
  35. data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
  36. data/lib/scrubyt/utils/xpathutils.rb +202 -0
  37. data/lib/scrubyt.rb +43 -0
  38. data/test/blackbox_test.rb +60 -0
  39. data/test/blackbox_tests/basic/multi_root.rb +6 -0
  40. data/test/blackbox_tests/basic/simple.rb +5 -0
  41. data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
  42. data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
  43. data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
  44. data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
  45. metadata +115 -0
@@ -0,0 +1,253 @@
1
+ require 'rubygems'
2
+ require 'mechanize'
3
module Scrubyt
  ##
  #=<tt>Fetching pages (and related functionality)</tt>
  #
  #Since a lot of things are happening during (and before) the fetching of a
  #document, the fetching related functionality lives in a separate module -
  #so if you are looking for anything which is loading a document (even by
  #submitting a form or clicking a link) and related things like setting a
  #proxy etc. you should find it here.
  module Navigation
    module Mechanize

      ##
      #Hook run when this module is included; all navigation actions are
      #defined as singleton methods of the including module (_base_).
      #
      #NOTE(review): the class variables below are written inside a block, so
      #their owner depends on Ruby's lexical class variable rules - the layout
      #is intentionally kept exactly as it was.
      def self.included(base)
        base.module_eval do
          @@agent = WWW::Mechanize.new
          @@current_doc_url = nil
          @@current_doc_protocol = nil
          @@base_dir = nil
          @@host_name = nil
          @@history = []

          ##
          #Action to fetch a document (either a file or a http address)
          #
          #*parameters*
          #
          #_doc_url_ - the url or file name to fetch
          #
          #_args_ - optionally a hash as first extra argument; the keys read
          #are :mechanize_doc (an already loaded page), :html (a string to be
          #used as the page), :resolve (how to resolve a relative url - :full,
          #:host or a custom prefix) and :basic_auth
          def self.fetch(doc_url, *args)
            mechanize_doc = nil
            resolve = :full

            unless args.empty?
              options = args[0]
              mechanize_doc = options[:mechanize_doc]
              #Without an explicit :resolve fall back to :full (the same value
              #as in the no-options case) instead of leaving it nil, which
              #broke the resolving of relative urls
              resolve = options[:resolve] || :full
              basic_auth = options[:basic_auth]
              parse_and_set_basic_auth(basic_auth) if basic_auth
              html = options[:html]
              if html
                #Resets the remembered protocol, see determine_protocol
                @@current_doc_protocol = 'string'
                mechanize_doc = WWW::Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html)
              end
            end

            @@current_doc_url = doc_url
            @@current_doc_protocol = determine_protocol

            if mechanize_doc.nil? && @@current_doc_protocol != 'file'
              handle_relative_path(doc_url)
              handle_relative_url(doc_url, resolve)
              Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
              @@mechanize_doc = @@agent.get(@@current_doc_url)
            else
              @@mechanize_doc = mechanize_doc
            end

            if @@current_doc_protocol == 'file'
              #The block form closes the handle right after reading
              source = open(@@current_doc_url) { |io| io.read }
              @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(source))
            else
              @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
              store_host_name(self.get_current_doc_url) # in case we're on a new host
            end
          end

          ##
          #Submit the last form
          #
          #*parameters*
          #
          #_index_ - nil to submit without a button, the index of the button
          #to use, or (together with _type_) the name of the button
          #
          #_sleep_time_ - not used by this agent
          #
          #_type_ - if set and _index_ is a String, the form is submitted
          #through the button with that name
          #
          #The form is submitted exactly once; the resulting page is loaded
          #by process_submit.
          def self.submit(index=nil, sleep_time=nil, type=nil)
            Scrubyt.log :ACTION, 'Submitting form...'
            if index.nil?
              process_submit(@@current_form)
            elsif index.is_a?(String) && !type.nil?
              #----- added by nickmerwin@gmail.com -----
              button = @@current_form.buttons.detect { |b| b.name == index }
              process_submit(@@current_form, button, type)
            else
              process_submit(@@current_form, @@current_form.buttons[index])
            end
          end

          ##
          #Click the link specified by the text (or by a compound example
          #hash); _wait_secs_ is not used by this agent
          def self.click_link(link_spec,index = 0,wait_secs=0)
            Scrubyt.log :ACTION, "Clicking link specified by: #{link_spec.inspect}"
            if link_spec.is_a? Hash
              clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
            else
              clicked_elem = SimpleExampleLookup.find_node_from_text(@@hpricot_doc, link_spec, false, index)
            end
            clicked_elem = XPathUtils.find_nearest_node_with_attribute(clicked_elem, 'href')
            result_page = @@agent.click(clicked_elem)
            @@current_doc_url = result_page.uri.to_s
            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
            fetch(@@current_doc_url, :mechanize_doc => result_page)
          end

          ##
          #Follow the href of the index-th <area> element of the current page
          def self.click_image_map(index = 0)
            Scrubyt.log :ACTION, "Clicking image map at index: #{index.inspect}"
            uri = @@mechanize_doc.search("//area")[index]['href']
            result_page = @@agent.get(uri)
            @@current_doc_url = result_page.uri.to_s
            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
            fetch(@@current_doc_url, :mechanize_doc => result_page)
          end

          ##
          #Remember the host (scheme + host, without trailing slash) of the
          #current page; falls back to _doc_url_ if no host is known yet.
          #The first stored value is kept as the original host name.
          def self.store_host_name(doc_url)
            if %w[http https].include?(@@current_doc_protocol)
              #Take the captured host only (not the whole match, which would
              #already contain the scheme); nil if the uri has another shape
              host = @@mechanize_doc.uri.to_s[%r{\A#{@@current_doc_protocol}://([^/?#]+)}, 1]
              @@host_name = "#{@@current_doc_protocol}://#{host}" if host
            end
            @@host_name = doc_url if @@host_name.nil?
            @@host_name = @@host_name[0..-2] if @@host_name[-1..-1] == '/'
            @@original_host_name ||= @@host_name
          end #end of method store_host_name

          ##
          #Determine the protocol ('http', 'https' or 'file') from the current
          #document url. An url without a scheme keeps the previous protocol
          #if that was http or https.
          def self.determine_protocol
            previous = @@current_doc_protocol
            detected = case @@current_doc_url
                       when /^https/ then 'https'
                       when /^http/, /^www/ then 'http'
                       else 'file'
                       end
            return previous if detected == 'file' && %w[http https].include?(previous)
            detected
          end

          ##
          #Prefix _doc_url_ with the remembered base directory unless it
          #already contains it; the current url is left alone otherwise.
          def self.handle_relative_path(doc_url)
            if @@base_dir.nil?
              @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == 'file'
            elsif !doc_url.include?(@@base_dir)
              @@current_doc_url = @@base_dir + doc_url
            end
          end

          ##
          #Turn a relative _doc_url_ into an absolute one and store it as the
          #current document url. _resolve_ is :full (prefix with the stored
          #host name), :host (prefix with scheme + host only) or a custom
          #prefix. Urls starting with http are left untouched.
          def self.handle_relative_url(doc_url, resolve)
            return if doc_url =~ /^http/
            unless doc_url =~ /^\//
              if doc_url[0..0] == '?'
                #Query-only link: append it to the url of the current page
                #(a workaround, mechanize's own resolving would be cleaner)
                current_uri = @@mechanize_doc.uri.to_s
                current_uri = @@agent.history.first.uri.to_s if current_uri =~ /\/popup\//
                if current_uri.include?('?')
                  current_uri = current_uri.scan(/.+\//)[0]
                else
                  current_uri += '/' unless current_uri[-1..-1] == '/'
                end
                @@current_doc_url = current_uri + doc_url
                return
              end
              doc_url = '/' + doc_url
            end
            case resolve
            when :full
              @@current_doc_url = (@@host_name + doc_url) if !@@host_name.nil? && !doc_url.include?(@@host_name)
              #NOTE(review): uniq also drops path segments that legitimately
              #occur twice (e.g. /a/b/a) - kept as is, confirm before changing
              @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
            when :host
              base_host_name = (@@host_name.count("/") == 2 ? @@host_name : @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0])
              @@current_doc_url = base_host_name + doc_url
            else
              #custom resolving
              @@current_doc_url = resolve + doc_url
            end
          end

          ##
          #Action to fill a textfield with a query string; extra arguments
          #are accepted and ignored
          def self.fill_textfield(textfield_name, query_string, *unused)
            lookup_form_for_tag('input','textfield',textfield_name,query_string)
            #Plain assignment - no eval, so quotes in the value are harmless
            @@current_form[textfield_name.to_s] = query_string.to_s
          end

          ##
          #Action to fill a textarea with text
          def self.fill_textarea(textarea_name, text)
            lookup_form_for_tag('textarea','textarea',textarea_name,text)
            @@current_form[textarea_name.to_s] = text.to_s
          end

          ##
          #Action for selecting an option from a dropdown box; the option is
          #looked up by its (stripped) text
          def self.select_option(selectlist_name, option)
            lookup_form_for_tag('select','select list',selectlist_name,option)
            select_list = @@current_form.fields.find { |field| field.name == selectlist_name }
            searched_option = select_list.options.find { |opt| opt.text.strip == option }
            searched_option.click
          end

          ##
          #Action to check the checkbox with the given name
          def self.check_checkbox(checkbox_name)
            lookup_form_for_tag('input','checkbox',checkbox_name, '')
            @@current_form.checkboxes.name(checkbox_name).check
          end

          ##
          #Action to check the index-th radiobutton with the given name
          def self.check_radiobutton(checkbox_name, index=0)
            lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
            @@current_form.radiobuttons.name(checkbox_name)[index].check
          end

          #private
          ##
          #Submit _current_form_ (through _button_ if given; directly on the
          #form if _type_ is set, through the agent otherwise) and load the
          #resulting page
          def self.process_submit(current_form, button=nil, type=nil)
            if button.nil?
              result_page = @@agent.submit(current_form)
            elsif type
              result_page = current_form.submit(button)
            else
              result_page = @@agent.submit(current_form, button)
            end
            @@current_doc_url = result_page.uri.to_s
            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
            fetch(@@current_doc_url, :mechanize_doc => result_page)
          end

          ##
          #Find the widget (_tag_ with the given name attribute) in the
          #current document and make its enclosing form the current form
          def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
            Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
            #Index the search result directly (a block-less map does not
            #return an array on newer Rubies)
            widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]")[index]
            form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
            find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
          end

          ##
          #Set the current form to the first form of the current page whose
          #attribute (the first of _possible_attrs_ present on _tag_, or the
          #last one tried) equals that of _tag_; nil if there is no such form.
          #Always returns nil.
          def self.find_form_based_on_tag(tag, possible_attrs)
            lookup_attribute_name = nil
            lookup_attribute_value = nil

            possible_attrs.each do |attr|
              lookup_attribute_name = attr
              lookup_attribute_value = tag.attributes[attr]
              break unless lookup_attribute_value.nil?
            end

            @@current_form = FetchAction.get_mechanize_doc.forms.find do |form|
              form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
            end
            nil
          end
        end
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
module Scrubyt
  ##
  #=<tt>Fetching pages (and related functionality)</tt>
  #
  #Holds the state shared by the fetching code (current url, protocol, host
  #name, page history, current form) together with the accessors for it.
  #The actions which actually load a document (fetching, submitting a form,
  #clicking a link etc.) are not defined in this file.
  module FetchAction
    @@current_doc_url = nil
    @@current_doc_protocol = nil
    @@base_dir = nil
    @@host_name = nil
    @@history = []
    @@current_form = nil

    ##
    # At any given point, the current document url can be queried with this
    # method; typically used when the navigation is over and the result
    # document is passed to the wrapper
    def self.get_current_doc_url
      @@current_doc_url
    end

    ##
    # The current document as stored in @@mechanize_doc
    def self.get_mechanize_doc
      @@mechanize_doc
    end

    ##
    # The current document as stored in @@hpricot_doc
    def self.get_hpricot_doc
      @@hpricot_doc
    end

    ##
    # The currently stored host name
    def get_host_name
      @@host_name
    end

    ##
    # Reset the host name to the original one; does nothing when the
    # current protocol is 'file'
    def restore_host_name
      @@host_name = @@original_host_name unless @@current_doc_protocol == 'file'
    end

    ##
    # Push the current hpricot document onto the history stack
    def store_page
      @@history << @@hpricot_doc
    end

    ##
    # Make the most recently stored document the current one again
    def restore_page
      @@hpricot_doc = @@history.pop
    end

    ##
    # Instance level forwarder to FetchAction.store_host_name
    def store_host_name(doc_url)
      FetchAction.store_host_name(doc_url)
    end
  end
end
@@ -0,0 +1,95 @@
1
module Scrubyt
  ##
  #=<tt>Describing actions which interact with the page</tt>
  #
  #This module contains all the actions that are used to navigate on web pages;
  #first of all, *fetch* for downloading the pages - then various actions
  #like filling textfields, submitting forms, clicking links and more.
  #Every action is a thin forwarder to the method of FetchAction with the
  #same name.
  module NavigationActions

    ##
    #Give every extended object a @current_form instance variable (nil)
    def self.extend_object(obj)
      super(obj)
      obj.instance_eval do
        @current_form = nil
      end
    end

    ##
    #Action to fill a textfield with a query string
    #
    #*parameters*
    #
    #_textfield_name_ - the name of the textfield (e.g. the name of the google
    #search textfield is 'q')
    #
    #_query_string_ - the string that should be entered into the textfield
    #
    #_use_value_ - passed through to FetchAction unchanged
    def fill_textfield(textfield_name, query_string, use_value = nil)
      FetchAction.fill_textfield(textfield_name, query_string, 0, use_value)
    end

    ##
    #Like fill_textfield, with _sleep_secs_ passed on to FetchAction
    def fill_textfield_and_wait(textfield_name, query_string, sleep_secs=0, use_value=nil)
      FetchAction.fill_textfield(textfield_name, query_string, sleep_secs, use_value)
    end

    ##
    #Action to fill a textarea with text
    def fill_textarea(textarea_name, text)
      FetchAction.fill_textarea(textarea_name, text)
    end

    ##
    #Action for selecting an option from a dropdown box
    def select_option(selectlist_name, option)
      FetchAction.select_option(selectlist_name, option)
    end

    ##
    #Action to check a checkbox
    def check_checkbox(checkbox_name)
      FetchAction.check_checkbox(checkbox_name)
    end

    ##
    #Action to check the index-th radiobutton with the given name
    def check_radiobutton(checkbox_name, index=0)
      #Forward the index given by the caller (it used to be reset to 0)
      FetchAction.check_radiobutton(checkbox_name, index)
    end

    ##
    #Fetch the document
    def fetch(*args)
      FetchAction.fetch(*args)
    end

    ##
    #Submit the current form
    #
    #_index_ and _type_ select the button, see FetchAction.submit
    def submit(index=nil, type=nil)
      #Same argument order as in submit_and_wait: index, sleep time, type
      FetchAction.submit(index, nil, type)
    end

    ##
    #Submit the current form, passing _sleep_time_ on to FetchAction
    def submit_and_wait(sleep_time, index=nil, type=nil)
      FetchAction.submit(index, sleep_time, type)
    end

    ##
    #Click the link specified by the text
    def click_link(link_spec,index=0)
      FetchAction.click_link(link_spec,index, 0)
    end

    ##
    #Click the (first) link specified by the text, passing _sleep_secs_ on
    #to FetchAction
    def click_link_and_wait(link_spec, sleep_secs=0)
      FetchAction.click_link(link_spec, 0, sleep_secs)
    end

    ##
    #Forwarder to FetchAction.click_by_xpath
    def click_by_xpath(xpath)
      FetchAction.click_by_xpath(xpath)
    end

    ##
    #Forwarder to FetchAction.click_image_map
    def click_image_map(index=0)
      FetchAction.click_image_map(index)
    end

    ##
    #Forwarder to FetchAction.frame
    def frame(attribute,value)
      FetchAction.frame(attribute,value)
    end

    ##
    #Forwarder to FetchAction.wait
    def wait(time=1)
      FetchAction.wait(time)
    end
  end
end
@@ -0,0 +1,30 @@
1
module Scrubyt
  ##
  #=<tt>Represents a compound example</tt>
  #
  #There are two types of string examples in scRUBYt! right now:
  #the simple example and the compound example. The simple example
  #is specified by a string, and a compound example is specified with
  #:contains, :begins_with and :ends_with descriptors - which can be
  #both regexps or strings
  class CompoundExample

    #The keys a compound example hash may use
    DESCRIPTORS = [:contains, :begins_with, :ends_with]

    attr_accessor :descriptor_hash

    #_descriptor_hash_ - hash of descriptor => string or regexp
    def initialize(descriptor_hash)
      @descriptor_hash = descriptor_hash
    end

    ##
    #Is the hash passed to this function a compound example descriptor hash,
    #i.e. is every key of it one of DESCRIPTORS? (An empty hash qualifies.)
    #Need to decide this when parsing pattern parameters
    def self.compound_example?(hash)
      hash.all? { |key, _value| DESCRIPTORS.include?(key) }
    end # end of method
  end # end of class CompoundExample
end # end of module Scrubyt
@@ -0,0 +1,169 @@
1
module Scrubyt
  ##
  #=<tt>Rejecting result instances based on further rules</tt>
  #
  #The two most trivial problems with a set of rules is that they match either less
  #or more instances than we would like them to. Constraints are a way to remedy the
  #second problem: they serve as a tool to filter out some result instances based on
  #rules. A typical example:
  #
  #* *ensure_presence_of_ancestor_pattern* consider this model:
  #   <book>
  #     <author>...</author>
  #     <title>...</title>
  #   </book>
  #
  #If I attach the *ensure_presence_of_ancestor_pattern* to the pattern 'book' with values
  #'author' and 'title', only those books will be matched which have an author and a
  #title (i.e. the child patterns author and title must extract something). This is a way
  #to say 'a book MUST have an author and a title'.
  class Constraint
    #There are more possible ways of applying/checking constraints in the case of
    #ones that can not be checked in the context node (e.g. ensure_presence_of -
    #since it may require the evaluation of child patterns of the context pattern to
    #arbitrary level)
    #
    #In such cases, the possibilities are:
    #
    #1) make a depth-first evaluation from the context pattern until the needed ancestor
    #   pattern is evaluated. This can mess things up, since if any ancestor node uses
    #   the sinks of predecessor(s) other than the context node, those need to be evaluated
    #   too, and we may run into a cyclic dependency or at least a complicated recursion
    #
    #2) Post processing - evaluate normally and throw out results which do not pass the
    #   constraint
    #
    #2b) Do it on the XML level - most probably this solution will be implemented

    # Different constraint types
    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN = 0
    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE = 1
    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE = 2
    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE = 3
    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE = 4

    attr_reader :type, :target

    #Add 'ensure presence of ancestor pattern' constraint
    #
    #If this type of constraint is added to a pattern, it must have an ancestor pattern
    #(child pattern, or child pattern of a child pattern, etc.) denoted by "ancestor".
    #'Has an ancestor pattern' means that the ancestor pattern actually extracts something
    #(just by looking at the wrapper model, the ancestor pattern is always present).
    #Note that from this type of constraint there is no 'ensure_absence' version, since
    #I could not think about an use case for that
    def self.add_ensure_presence_of_pattern(ancestor)
      new(ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN)
    end

    #Add 'ensure absence of attribute' constraint
    #
    #If this type of constraint is added to a pattern, the HTML node it targets
    #must NOT have the attributes given in "attribute_hash" (name => value;
    #a nil value stands for 'any value')
    def self.add_ensure_absence_of_attribute(attribute_hash)
      new(attribute_hash, CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
    end

    #Add 'ensure presence of attribute' constraint
    #
    #If this type of constraint is added to a pattern, the HTML node it targets
    #must have the attributes given in "attribute_hash" (name => value;
    #a nil value stands for 'any value')
    def self.add_ensure_presence_of_attribute(attribute_hash)
      new(attribute_hash, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
    end

    #Add 'ensure absence of ancestor node' constraint
    #
    #If this type of constraint is added to a pattern, the HTML node extracted by the
    #pattern must NOT contain a HTML node called 'node_name' with any of the
    #attribute pairs in 'attributes'.
    #
    #"attributes" is stored as given and is read by the check as a list of
    #[name, value] pairs; ConstraintAdder prepares this list from the user input.
    #
    #"attributes" can be empty - in this case just the 'node_name' is checked
    def self.add_ensure_absence_of_ancestor_node(node_name, attributes)
      new([node_name, attributes], CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
    end

    #Add 'ensure presence of ancestor node' constraint
    #
    #If this type of constraint is added to a pattern, the HTML node extracted by the
    #pattern must contain a HTML node called 'node_name' with at least one of the
    #attribute pairs in 'attributes'.
    #
    #"attributes" is stored as given and is read by the check as a list of
    #[name, value] pairs; ConstraintAdder prepares this list from the user input.
    #
    #"attributes" can be empty - in this case just the 'node_name' is checked
    def self.add_ensure_presence_of_ancestor_node(node_name, attributes)
      new([node_name, attributes], CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
    end

    #Evaluate the constraint; if this function returns true,
    #it means that the constraint passed, i.e. its filter will be added to the extracted
    #content of the pattern
    def check(result)
      case @type
      when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN
        #checked after evaluation, so here always return true
        true
      when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE
        attribute_present(result)
      when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE
        !attribute_present(result)
      when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
        ancestor_node_present(result)
      when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
        !ancestor_node_present(result)
      end
    end

    private
    #We would not like these to be called from outside
    def initialize(target, type)
      @target = target
      @type = type
    end

    #Implementation of the ancestor node presence test:
    #true if searching "//node_name" (restricted by one of the attribute
    #pairs, if there are any) in the result finds something
    def ancestor_node_present(result)
      node_name, node_attributes = @target[0], @target[1]
      if node_attributes.empty?
        return !result.search("//#{node_name}").empty?
      end
      node_attributes.each do |pair|
        hits = result.search("//#{node_name}[@#{pair[0]}='#{pair[1]}']")
        return true unless hits.empty?
      end
      false
    end

    #True if the result has every attribute listed in the target hash;
    #returns nil for anything which is not a Hpricot::Elem
    def attribute_present(result)
      return unless result.is_a? Hpricot::Elem
      @target.all? do |name, expected|
        actual = result.attributes[name.to_s]
        #A nil value means the attribute just has to be present,
        #we don't care about its value
        expected == nil ? actual != nil : actual == expected.to_s
      end
    end

  end #end of class
end #end of module
@@ -0,0 +1,49 @@
1
module Scrubyt
  ##
  #=<tt>Utility class for adding constraints</tt>
  #
  #Originally methods of Pattern - but since Pattern was already too heavy (and after
  #all, adding a constraint (logically) does not belong to Pattern anyway) it was moved
  #to this utility class. In pattern everything that begins with ensure_
  #is automatically dispatched here.
  #
  #The functions are just forwarders; see the 'real' functions with their
  #documentation in Scrubyt::Constraint
  class ConstraintAdder

    def self.ensure_presence_of_pattern(ancestor_node_name)
      Constraint.add_ensure_presence_of_pattern(ancestor_node_name)
    end

    def self.ensure_presence_of_ancestor_node(ancestor_node_name, attributes=[])
      Constraint.add_ensure_presence_of_ancestor_node(ancestor_node_name,
                                                      prepare_attributes(attributes))
    end

    def self.ensure_absence_of_ancestor_node(ancestor_node_name, attributes=[])
      Constraint.add_ensure_absence_of_ancestor_node(ancestor_node_name,
                                                     prepare_attributes(attributes))
    end

    def self.ensure_presence_of_attribute(attribute_hash)
      Constraint.add_ensure_presence_of_attribute(attribute_hash)
    end

    def self.ensure_absence_of_attribute(attribute_hash)
      Constraint.add_ensure_absence_of_attribute(attribute_hash)
    end

    ##
    #Flatten the attribute description into a list of [name, value] pairs.
    #
    #_attributes_ may be a hash ({'class' => 'small'}), an array of pairs
    #([['class', 'small']]) or an array of hashes
    #([{'font' => 'red'}, {'href' => 'http://www.google.com'}]). A value
    #which is an array ('class' => ['small', 'wide']) results in one pair
    #per element.
    #
    #This helper was never really private (a bare +private+ does not affect
    #methods defined with +def self.+), so it stays callable from outside.
    def self.prepare_attributes(attributes)
      attribute_pairs = []
      attributes.each do |entry|
        #A hash inside an array holds the pairs itself; anything else
        #(a pair or a single name) is treated as one entry
        entries = entry.is_a?(Hash) ? entry.to_a : [entry]
        entries.each do |key, value|
          if value.is_a?(Array)
            value.each { |val| attribute_pairs << [key, val] }
          else
            attribute_pairs << [key, value]
          end
        end
      end
      attribute_pairs
    end #end of method prepare_attributes
  end #end of class ConstraintAdder
end #end of module Scrubyt
@@ -0,0 +1,14 @@
1
module Scrubyt
  ##
  #Filter extracting the value of the attribute named by the example
  class AttributeFilter < BaseFilter

    ##
    #Look up the nearest node (as found by
    #XPathUtils.find_nearest_node_with_attribute) carrying the attribute
    #named @example.
    #
    #Returns a one element array with the value of that attribute, or nil
    #if the lookup did not yield a Hpricot::Elem
    def evaluate(source)
      node = XPathUtils.find_nearest_node_with_attribute(source, @example)
      return nil unless node.is_a? Hpricot::Elem
      [node.attributes[@example]]
    end

  end #End of class AttributeFilter
end #End of module Scrubyt