scrubber-scrubyt 0.4.11
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +343 -0
- data/COPYING +340 -0
- data/README +99 -0
- data/Rakefile +101 -0
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
- data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
- data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
- data/lib/scrubyt/core/scraping/constraint.rb +169 -0
- data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
- data/lib/scrubyt/core/scraping/pattern.rb +359 -0
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
- data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
- data/lib/scrubyt/core/shared/extractor.rb +167 -0
- data/lib/scrubyt/logging.rb +154 -0
- data/lib/scrubyt/output/post_processor.rb +139 -0
- data/lib/scrubyt/output/result.rb +44 -0
- data/lib/scrubyt/output/result_dumper.rb +154 -0
- data/lib/scrubyt/output/result_node.rb +140 -0
- data/lib/scrubyt/output/scrubyt_result.rb +42 -0
- data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
- data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
- data/lib/scrubyt/utils/shared_utils.rb +58 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
- data/lib/scrubyt/utils/xpathutils.rb +202 -0
- data/lib/scrubyt.rb +43 -0
- data/test/blackbox_test.rb +60 -0
- data/test/blackbox_tests/basic/multi_root.rb +6 -0
- data/test/blackbox_tests/basic/simple.rb +5 -0
- data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
- data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
- data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
- data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
- metadata +115 -0
@@ -0,0 +1,253 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'mechanize'
|
3
|
+
module Scrubyt
|
4
|
+
##
|
5
|
+
#=<tt>Fetching pages (and related functionality)</tt>
|
6
|
+
#
|
7
|
+
#Since lot of things are happening during (and before)
|
8
|
+
#the fetching of a document, I decided to move out fetching related
|
9
|
+
#functionality to a separate class - so if you are looking for anything
|
10
|
+
#which is loading a document (even by submitting a form or clicking a link)
|
11
|
+
#and related things like setting a proxy etc. you should find it here.
|
12
|
+
module Navigation
|
13
|
+
module Mechanize
|
14
|
+
|
15
|
+
def self.included(base)
|
16
|
+
base.module_eval do
|
17
|
+
@@agent = WWW::Mechanize.new
|
18
|
+
@@current_doc_url = nil
|
19
|
+
@@current_doc_protocol = nil
|
20
|
+
@@base_dir = nil
|
21
|
+
@@host_name = nil
|
22
|
+
@@history = []
|
23
|
+
|
24
|
+
##
|
25
|
+
#Action to fetch a document (either a file or a http address)
|
26
|
+
#
|
27
|
+
#*parameters*
|
28
|
+
#
|
29
|
+
#_doc_url_ - the url or file name to fetch
|
30
|
+
##
# Loads a document and makes it the current one.
#
# *parameters*
#
# _doc_url_ - the url or file name to fetch
#
# _args_ - optional; when given, args[0] is an options hash read for:
# * :mechanize_doc - an already loaded page, used instead of requesting doc_url
# * :html - raw HTML which is wrapped into a WWW::Mechanize::Page
# * :resolve - how relative urls are resolved (:full, :host or a custom
#   prefix string, see handle_relative_url); defaults to :full
# * :basic_auth - handed to parse_and_set_basic_auth (defined outside this file section)
def self.fetch(doc_url, *args)
  if args.size > 0
    options = args[0]
    mechanize_doc = options[:mechanize_doc]
    html = options[:html]
    # Fall back to :full so that handle_relative_url never receives nil
    # (it would otherwise try to evaluate nil + doc_url).
    resolve = options[:resolve] || :full
    basic_auth = options[:basic_auth]
    parse_and_set_basic_auth(basic_auth) if basic_auth
    if html
      # NOTE(review): this value is overwritten by determine_protocol a few
      # lines below - confirm whether the 'string' protocol is meant to survive.
      @@current_doc_protocol = 'string'
      mechanize_doc = WWW::Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html)
    end
  else
    mechanize_doc = nil
    resolve = :full
  end

  @@current_doc_url = doc_url
  @@current_doc_protocol = determine_protocol

  if mechanize_doc.nil? && @@current_doc_protocol != 'file'
    # Nothing preloaded and not a local file: resolve the url and request it.
    handle_relative_path(doc_url)
    handle_relative_url(doc_url, resolve)
    Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
    @@mechanize_doc = @@agent.get(@@current_doc_url)
  else
    @@mechanize_doc = mechanize_doc
  end

  if @@current_doc_protocol == 'file'
    # Block form of open so the file handle is closed as soon as it is read.
    file_content = open(@@current_doc_url) { |file| file.read }
    @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(file_content))
  else
    @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
    store_host_name(self.get_current_doc_url) # in case we're on a new host
  end
end
|
70
|
+
|
71
|
+
##
|
72
|
+
#Submit the last form;
|
73
|
+
##
# Submits the current form (the one selected by the last fill/select/check
# action) and loads the page that comes back.
#
# *parameters*
#
# _index_ - nil submits the form without a button; a String together with a
# non-nil _type_ selects the button by its name; anything else is used as an
# index into the form's buttons
#
# _sleep_time_ - accepted but not used by this agent
#
# _type_ - only checked for being non-nil, see _index_
def self.submit(index=nil, sleep_time=nil, type=nil)
  Scrubyt.log :ACTION, 'Submitting form...'
  # Each branch submits the form exactly once. Calling process_submit here as
  # well would send the same form to the server a second time, and its result
  # would be discarded by the fetch below anyway.
  if index == nil
    result_page = @@agent.submit(@@current_form)
  #----- added by nickmerwin@gmail.com -----
  elsif index.class == String && !type.nil?
    button = @@current_form.buttons.detect{|b| b.name == index}
    result_page = @@current_form.submit(button)
  #-----------------------------------------
  else
    result_page = @@agent.submit(@@current_form, @@current_form.buttons[index])
  end
  @@current_doc_url = result_page.uri.to_s
  Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
  fetch(@@current_doc_url, :mechanize_doc => result_page)
end
|
91
|
+
|
92
|
+
##
|
93
|
+
#Click the link specified by the text
|
94
|
+
##
# Clicks a link of the current document and loads the page it leads to.
#
# *parameters*
#
# _link_spec_ - a Hash is looked up through CompoundExampleLookup, any other
# value through SimpleExampleLookup
#
# _index_ - forwarded to the lookup
#
# _wait_secs_ - accepted but not used by this agent
def self.click_link(link_spec,index = 0,wait_secs=0)
  Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
  matched_node =
    if link_spec.is_a? Hash
      CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
    else
      SimpleExampleLookup.find_node_from_text(@@hpricot_doc, link_spec, false, index)
    end
  # The matched node itself may not carry the href; look for the nearest node that does.
  link_node = XPathUtils.find_nearest_node_with_attribute(matched_node, 'href')
  result_page = @@agent.click(link_node)
  @@current_doc_url = result_page.uri.to_s
  Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
  fetch(@@current_doc_url, :mechanize_doc => result_page)
end
|
107
|
+
|
108
|
+
##
# Follows the href of the index-th <area> element of the current page and
# loads the resulting page.
def self.click_image_map(index = 0)
  Scrubyt.log :ACTION, "Clicking image map at index: %p" % index
  area = @@mechanize_doc.search("//area")[index]
  result_page = @@agent.get(area['href'])
  @@current_doc_url = result_page.uri.to_s
  Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
  fetch(@@current_doc_url, :mechanize_doc => result_page)
end
|
116
|
+
|
117
|
+
##
# Remembers the host part (scheme://host[:port]) of the current document in
# @@host_name; the first host ever stored is kept in @@original_host_name.
#
# *parameters*
#
# _doc_url_ - used as the host name only when no host name is known yet
def self.store_host_name(doc_url)
  if @@current_doc_protocol == 'http' || @@current_doc_protocol == 'https'
    # Capture only "scheme://host" (group 1). No match leaves the previous
    # host untouched instead of failing on nil.
    host_match = @@mechanize_doc.uri.to_s.match(%r{\A(https?://[^/?#]+)})
    @@host_name = host_match[1] if host_match
  end
  @@host_name = doc_url if @@host_name == nil
  # Drop one trailing slash, if there is a host name at all.
  @@host_name = @@host_name[0..-2] if @@host_name && @@host_name =~ %r{/\z}
  @@original_host_name ||= @@host_name
end #end of method store_host_name
|
124
|
+
|
125
|
+
##
# Derives the protocol ('https', 'http' or 'file') from @@current_doc_url.
# Urls starting with 'www' count as 'http'. A url that looks like a file is
# still reported with the previous protocol if that was 'http' or 'https'.
def self.determine_protocol
  previous = @@current_doc_protocol
  url = @@current_doc_url
  detected =
    if /^https/ === url
      'https'
    elsif /^http/ === url || /^www/ === url
      'http'
    else
      'file'
    end
  return detected unless detected == 'file'
  return 'http' if previous == 'http'
  return 'https' if previous == 'https'
  detected
end
|
141
|
+
|
142
|
+
##
# Keeps track of the base directory for file documents.
#
# While no base directory is known, it is taken from _doc_url_ (everything up
# to the last slash), but only for the 'file' protocol. Once it is known, it
# is prepended to a _doc_url_ that does not already contain it.
def self.handle_relative_path(doc_url)
  if @@base_dir == nil
    @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == 'file'
  elsif !doc_url.include?(@@base_dir)
    # Assign only when something has to be prepended: the current url must not
    # be reset to nil for a doc_url that already contains the base directory.
    # Plain substring test, so regexp metacharacters in the path are harmless.
    @@current_doc_url = @@base_dir + doc_url
  end
end
|
149
|
+
|
150
|
+
##
# Turns a relative _doc_url_ into an absolute one and stores the result in
# @@current_doc_url. Urls starting with 'http' are left alone.
#
# *parameters*
#
# _doc_url_ - the url to resolve
#
# _resolve_ - :full prepends @@host_name, :host prepends only the
# scheme://host part of @@host_name, any other value is itself used as the prefix
def self.handle_relative_url(doc_url, resolve)
  return if doc_url =~ /^http/

  unless doc_url =~ /^\//
    if doc_url[0..0] == '?'
      # Query-only url: append it to the url of the current page
      # (or of the first page in the agent history when inside a popup).
      base_uri = @@mechanize_doc.uri.to_s
      base_uri = @@agent.history.first.uri.to_s if base_uri =~ /\/popup\//
      if base_uri.include?('?')
        base_uri = base_uri.scan(/.+\//)[0]
      elsif base_uri[-1..-1] != '/'
        base_uri += '/'
      end
      @@current_doc_url = base_uri + doc_url
      return
    end
    doc_url = '/' + doc_url
  end

  case resolve
  when :full
    host_missing = @@host_name != nil && doc_url !~ /#{@@host_name}/
    @@current_doc_url = (@@host_name + doc_url) if host_missing
    # Removes repeated '/'-separated segments from the assembled url.
    @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
  when :host
    base_host_name =
      if @@host_name.count("/") == 2
        @@host_name
      else
        @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0]
      end
    @@current_doc_url = base_host_name + doc_url
  else
    # custom resolving: the given prefix is used verbatim
    @@current_doc_url = resolve + doc_url
  end
end
|
179
|
+
|
180
|
+
##
# Selects the form containing the input named _textfield_name_ and sets that
# field to _query_string_. Extra arguments are accepted and ignored.
def self.fill_textfield(textfield_name, query_string, *unused)
  lookup_form_for_tag('input','textfield',textfield_name,query_string)
  # Direct assignment instead of eval on an interpolated string: values that
  # contain quotes work, and field content is never executed as Ruby code.
  @@current_form[textfield_name.to_s] = query_string.to_s
end
|
184
|
+
|
185
|
+
##
|
186
|
+
#Action to fill a textarea with text
|
187
|
+
##
# Selects the form containing the textarea named _textarea_name_ and sets its
# content to _text_.
def self.fill_textarea(textarea_name, text)
  lookup_form_for_tag('textarea','textarea',textarea_name,text)
  # Direct assignment instead of eval on an interpolated string: text that
  # contains quotes works, and it is never executed as Ruby code.
  @@current_form[textarea_name.to_s] = text.to_s
end
|
191
|
+
|
192
|
+
##
|
193
|
+
#Action for selecting an option from a dropdown box
|
194
|
+
##
# Selects the form containing the select list named _selectlist_name_ and
# clicks the option whose stripped text equals _option_.
def self.select_option(selectlist_name, option)
  lookup_form_for_tag('select','select list',selectlist_name,option)
  dropdown = @@current_form.fields.find { |field| field.name == selectlist_name }
  dropdown.options.find { |candidate| candidate.text.strip == option }.click
end
|
200
|
+
|
201
|
+
##
# Selects the form containing the checkbox named _checkbox_name_ and checks it.
def self.check_checkbox(checkbox_name)
  lookup_form_for_tag('input','checkbox',checkbox_name, '')
  checkbox = @@current_form.checkboxes.name(checkbox_name)
  checkbox.check
end
|
205
|
+
|
206
|
+
##
# Selects the form containing the radio buttons named _checkbox_name_ and
# checks the one at position _index_.
def self.check_radiobutton(checkbox_name, index=0)
  lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
  radio_group = @@current_form.radiobuttons.name(checkbox_name)
  radio_group[index].check
end
|
210
|
+
|
211
|
+
#private
|
212
|
+
##
# Submits _current_form_ and loads the resulting page.
#
# Without a _button_ the agent submits the form as is. With a button and a
# non-nil _type_ the form submits itself with that button; with a button but
# no _type_ the agent submits the form with the button.
def self.process_submit(current_form, button=nil, type=nil)
  result_page =
    if button.nil?
      @@agent.submit(current_form)
    elsif type
      current_form.submit(button)
    else
      @@agent.submit(current_form, button)
    end
  @@current_doc_url = result_page.uri.to_s
  Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
  fetch(@@current_doc_url, :mechanize_doc => result_page)
end
|
224
|
+
|
225
|
+
##
# Finds the index-th _tag_ element whose name attribute is _name_attribute_
# in the current Hpricot document, walks up to its enclosing form and makes
# that form the current one (see find_form_based_on_tag).
#
# _widget_name_ and _query_string_ are only used for the log message.
def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
  Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
  # to_a rather than a blockless map: on Ruby 1.9+ map without a block returns
  # an Enumerator, which cannot be indexed.
  widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").to_a[index]
  form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
  find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
end
|
231
|
+
|
232
|
+
##
# Sets @@current_form to the first form of the current Mechanize page that
# agrees with _tag_ on one identifying attribute.
#
# The attribute compared is the first entry of _possible_attrs_ that _tag_
# has a value for; if it has none of them, the last entry is compared (with
# a nil value). @@current_form ends up nil when no form matches.
# Always returns nil.
def self.find_form_based_on_tag(tag, possible_attrs)
  lookup_attribute_name = possible_attrs.detect { |a| tag.attributes[a] != nil }
  lookup_attribute_name = possible_attrs.last if lookup_attribute_name.nil?
  lookup_attribute_value = lookup_attribute_name.nil? ? nil : tag.attributes[lookup_attribute_name]

  @@current_form = FetchAction.get_mechanize_doc.forms.detect do |form|
    form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
  end
  nil
end
|
249
|
+
end
|
250
|
+
end
|
251
|
+
end
|
252
|
+
end
|
253
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module Scrubyt
  ##
  #=<tt>Fetching pages (and related functionality)</tt>
  #
  # Holds the navigation state shared by the fetching code: the url and
  # protocol of the current document, the base directory, the host name, the
  # current form and a stack of stored documents. The module-level readers are
  # called as FetchAction.get_...; the remaining methods are instance methods
  # for objects that mix this module in.
  module FetchAction
    @@current_doc_url = nil
    @@current_doc_protocol = nil
    @@base_dir = nil
    @@host_name = nil
    @@history = []
    @@current_form = nil

    ##
    # Url of the current document. Typically queried when the navigation is
    # over and the result document is passed to the wrapper.
    def self.get_current_doc_url
      @@current_doc_url
    end

    # The current page as stored in @@mechanize_doc.
    def self.get_mechanize_doc
      @@mechanize_doc
    end

    # The current page as stored in @@hpricot_doc.
    def self.get_hpricot_doc
      @@hpricot_doc
    end

    # The host name currently stored.
    def get_host_name
      @@host_name
    end

    # Resets the host name to the original one; does nothing for file documents.
    def restore_host_name
      return if @@current_doc_protocol == 'file'
      @@host_name = @@original_host_name
    end

    # Pushes the current Hpricot document onto the history stack.
    def store_page
      @@history.push @@hpricot_doc
    end

    # Makes the most recently stored document the current Hpricot document again.
    def restore_page
      @@hpricot_doc = @@history.pop
    end

    # Instance-level forwarder to FetchAction.store_host_name.
    def store_host_name(doc_url)
      FetchAction.store_host_name(doc_url)
    end
  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
module Scrubyt
  ##
  #=<tt>Describing actions which interact with the page</tt>
  #
  # This module contains all the actions that are used to navigate on web
  # pages: first of all *fetch* for downloading the pages, then various
  # actions like filling textfields, submitting forms, clicking links and
  # more. Every action simply forwards to FetchAction.
  module NavigationActions

    # Runs when the module is added to an object with Object#extend;
    # initializes @current_form on that object.
    def self.extend_object(obj)
      super(obj)
      obj.instance_eval do
        @current_form = nil
      end
    end

    ##
    # Action to fill a textfield with a query string
    #
    # *parameters*
    #
    # _textfield_name_ - the name of the textfield (e.g. the name of the google
    # search textfield is 'q')
    #
    # _query_string_ - the string that should be entered into the textfield
    #
    # _use_value_ - forwarded unchanged to FetchAction.fill_textfield
    def fill_textfield(textfield_name, query_string, use_value = nil)
      FetchAction.fill_textfield(textfield_name, query_string, 0, use_value)
    end

    # Same as fill_textfield, additionally forwarding _sleep_secs_.
    def fill_textfield_and_wait(textfield_name, query_string, sleep_secs=0, use_value=nil)
      FetchAction.fill_textfield(textfield_name, query_string, sleep_secs, use_value)
    end

    ##
    # Action to fill a textarea with text
    def fill_textarea(textarea_name, text)
      FetchAction.fill_textarea(textarea_name, text)
    end

    ##
    # Action for selecting an option from a dropdown box
    def select_option(selectlist_name, option)
      FetchAction.select_option(selectlist_name, option)
    end

    # Checks the checkbox named _checkbox_name_.
    def check_checkbox(checkbox_name)
      FetchAction.check_checkbox(checkbox_name)
    end

    # Checks the index-th radio button named _checkbox_name_.
    def check_radiobutton(checkbox_name, index=0)
      # Pass the caller's index along; writing index=0 in the call would
      # reassign it and always pick the first radio button.
      FetchAction.check_radiobutton(checkbox_name, index)
    end

    ##
    # Fetch the document
    def fetch(*args)
      FetchAction.fetch(*args)
    end

    ##
    # Submit the current form
    #
    # _index_ and _type_ select the button, see FetchAction.submit.
    def submit(index=nil, type=nil)
      # Argument order is (index, sleep_time, type), the same order
      # submit_and_wait uses; there is no sleep time here.
      # NOTE(review): assumes every agent's submit takes (index, sleep_time, type) - confirm for agents not shown here.
      FetchAction.submit(index, nil, type)
    end

    # Submit the current form, forwarding _sleep_time_ as well.
    def submit_and_wait(sleep_time, index=nil, type=nil)
      FetchAction.submit(index, sleep_time, type)
    end

    ##
    # Click the link specified by the text
    def click_link(link_spec,index=0)
      FetchAction.click_link(link_spec,index, 0)
    end

    # Click the first link matching _link_spec_, forwarding _sleep_secs_.
    def click_link_and_wait(link_spec, sleep_secs=0)
      FetchAction.click_link(link_spec, 0, sleep_secs)
    end

    # Forwards to FetchAction.click_by_xpath.
    def click_by_xpath(xpath)
      FetchAction.click_by_xpath(xpath)
    end

    # Forwards to FetchAction.click_image_map.
    def click_image_map(index=0)
      FetchAction.click_image_map(index)
    end

    # Forwards to FetchAction.frame.
    def frame(attribute,value)
      FetchAction.frame(attribute,value)
    end

    # Forwards to FetchAction.wait.
    def wait(time=1)
      FetchAction.wait(time)
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Scrubyt
  ##
  #=<tt>Represents a compound example</tt>
  #
  # There are two types of string examples in scRUBYt! right now: the simple
  # example, given as a plain string, and the compound example, given as a
  # hash of :contains, :begins_with and :ends_with descriptors - whose values
  # can be regexps or strings.
  class CompoundExample

    # The only keys a compound example hash may contain.
    DESCRIPTORS = [:contains, :begins_with, :ends_with]

    attr_accessor :descriptor_hash

    def initialize(descriptor_hash)
      @descriptor_hash = descriptor_hash
    end

    ##
    # Tells whether _hash_ is a compound example descriptor hash, i.e. whether
    # every one of its keys is listed in DESCRIPTORS (an empty hash qualifies).
    # Needed when parsing pattern parameters.
    def self.compound_example?(hash)
      hash.all? { |descriptor, _value| DESCRIPTORS.include?(descriptor) }
    end
  end # end of class CompoundExample
end # end of module Scrubyt
|
@@ -0,0 +1,169 @@
|
|
1
|
+
module Scrubyt
  ##
  #=<tt>Rejecting result instances based on further rules</tt>
  #
  # The two most trivial problems with a set of rules is that they match either
  # less or more instances than we would like them to. Constraints are a way to
  # remedy the second problem: they serve as a tool to filter out some result
  # instances based on rules. A typical example:
  #
  #* *ensure_presence_of_pattern* consider this model:
  #    <book>
  #      <author>...</author>
  #      <title>...</title>
  #    </book>
  #
  # If I attach *ensure_presence_of_pattern* to the pattern 'book' with values
  # 'author' and 'title', only those books will be matched which have an author
  # and a title (i.e. the child patterns author and title must extract
  # something). This is a way to say 'a book MUST have an author and a title'.
  class Constraint
    # There are more possible ways of applying/checking constraints in the case
    # of ones that can not be checked in the context node (e.g.
    # ensure_presence_of_pattern - since it may require the evaluation of child
    # patterns of the context pattern to arbitrary level)
    #
    # In such cases, the possibilities are:
    #
    # 1) make a depth-first evaluation from the context pattern until the needed
    #    pattern is evaluated. This can mess things up, since if any such node
    #    uses the sinks of predecessor(s) other than the context node, those need
    #    to be evaluated too, and we may run into a cyclic dependency or at least
    #    a complicated recursion
    #
    # 2) Post processing - evaluate normally and throw out results which do not
    #    pass the constraint
    #
    # 2b) Do it on the XML level - most probably this solution will be implemented

    # Different constraint types
    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN = 0
    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE = 1
    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE = 2
    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE = 3
    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE = 4

    attr_reader :type, :target

    # Add 'ensure presence of pattern' constraint
    #
    # If this type of constraint is added to a pattern, it must have a
    # descendant pattern (child pattern, or child pattern of a child pattern,
    # etc.) denoted by "ancestor" which actually extracts something (just by
    # looking at the wrapper model, that pattern is always present).
    # There is no 'ensure_absence' version of this constraint.
    def self.add_ensure_presence_of_pattern(ancestor)
      Constraint.new(ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN)
    end

    # Add 'ensure absence of attribute' constraint
    #
    # If this type of constraint is added to a pattern, the HTML node it targets
    # must NOT have the attributes given in "attribute_hash" (name => value; a
    # nil value stands for 'any value').
    def self.add_ensure_absence_of_attribute(attribute_hash)
      Constraint.new(attribute_hash,
                     CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
    end

    # Add 'ensure presence of attribute' constraint
    #
    # If this type of constraint is added to a pattern, the HTML node it targets
    # must have the attributes given in "attribute_hash" (name => value; a nil
    # value stands for 'any value').
    def self.add_ensure_presence_of_attribute(attribute_hash)
      Constraint.new(attribute_hash,
                     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
    end

    # Add 'ensure absence of ancestor node' constraint
    #
    # If this type of constraint is added to a pattern, the HTML node extracted
    # by the pattern must NOT contain a HTML node called 'node_name' matching
    # one of the 'attributes'.
    #
    # "attributes" is a list of [name, value] pairs as checked by
    # ancestor_node_present; it can be empty - in this case just the
    # 'node_name' is checked.
    def self.add_ensure_absence_of_ancestor_node(node_name, attributes)
      Constraint.new([node_name, attributes],
                     CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
    end

    # Add 'ensure presence of ancestor node' constraint
    #
    # If this type of constraint is added to a pattern, the HTML node extracted
    # by the pattern MUST contain a HTML node called 'node_name' matching one
    # of the 'attributes'.
    #
    # "attributes" is a list of [name, value] pairs as checked by
    # ancestor_node_present; it can be empty - in this case just the
    # 'node_name' is checked.
    def self.add_ensure_presence_of_ancestor_node(node_name, attributes)
      Constraint.new([node_name, attributes],
                     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
    end

    # Evaluate the constraint against "result"; a true return value means that
    # the constraint passed. Pattern presence is checked after evaluation, so
    # that type always passes here. An unknown type yields nil.
    def check(result)
      case @type
      when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN
        true
      when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE
        attribute_present(result)
      when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE
        !attribute_present(result)
      when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
        ancestor_node_present(result)
      when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
        !ancestor_node_present(result)
      end
    end

    private
    # Instances are meant to be created through the add_ensure_* methods.
    def initialize(target, type)
      @target = target
      @type = type
    end

    # Implementation of the ancestor node test. @target is
    # [node_name, attribute_pairs]. With no pairs, true if result.search finds
    # any //node_name; otherwise true if any single [name, value] pair is
    # found as //node_name[@name='value'].
    def ancestor_node_present(result)
      node_name, node_attributes = @target
      if node_attributes.empty?
        return !result.search("//#{node_name}").empty?
      end
      node_attributes.any? do |pair|
        !result.search("//#{node_name}[@#{pair[0]}='#{pair[1]}']").empty?
      end
    end

    # Implementation of the attribute test: true if "result" is an
    # Hpricot::Elem and every name => value entry of @target holds for it.
    # A nil value only requires the attribute to be present. Anything that is
    # not an Hpricot::Elem yields false.
    def attribute_present(result)
      return false unless result.is_a? Hpricot::Elem
      @target.all? do |name, expected|
        actual = result.attributes[name.to_s]
        expected.nil? ? !actual.nil? : actual == expected.to_s
      end
    end

  end #end of class
end #end of module
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Scrubyt
  ##
  #=<tt>Utility class for adding constraints</tt>
  #
  # Originally methods of Pattern - but since Pattern was already too heavy (and
  # after all, adding a constraint (logically) does not belong to Pattern
  # anyway) it was moved to this utility class. In pattern everything that
  # begins with ensure_ is automatically dispatched here.
  #
  # The ensure_* methods are just forwarders; see the 'real' functions with
  # their documentation in Scrubyt::Constraint
  class ConstraintAdder

    def self.ensure_presence_of_pattern(ancestor_node_name)
      Constraint.add_ensure_presence_of_pattern(ancestor_node_name)
    end

    def self.ensure_presence_of_ancestor_node(ancestor_node_name, attributes=[])
      Constraint.add_ensure_presence_of_ancestor_node(ancestor_node_name,
                                                      prepare_attributes(attributes))
    end

    def self.ensure_absence_of_ancestor_node(ancestor_node_name, attributes=[])
      Constraint.add_ensure_absence_of_ancestor_node(ancestor_node_name,
                                                     prepare_attributes(attributes))
    end

    def self.ensure_presence_of_attribute(attribute_hash)
      Constraint.add_ensure_presence_of_attribute(attribute_hash)
    end

    def self.ensure_absence_of_attribute(attribute_hash)
      Constraint.add_ensure_absence_of_attribute(attribute_hash)
    end

    # Flattens an attribute specification into a list of [name, value] pairs.
    #
    # "attributes" may be a hash ({'class' => 'small'}), an array of hashes
    # ([{'font' => 'red'}, {'href' => 'http://www.google.com'}]) or an array
    # of [name, value] pairs. A value that is an array yields one pair per
    # element, e.g. {'class' => ['small','wide']} becomes
    # [['class','small'], ['class','wide']].
    #
    # This is a helper for the ensure_*_ancestor_node methods. It stays
    # publicly callable: a bare 'private' has no effect on methods defined
    # with 'def self.'.
    def self.prepare_attributes(attributes)
      # First bring every accepted input shape down to a flat list of entries.
      entries = []
      if attributes.is_a?(Hash)
        entries = attributes.to_a
      else
        attributes.each do |item|
          if item.is_a?(Hash)
            entries.concat(item.to_a)
          else
            entries << item
          end
        end
      end

      attribute_pairs = []
      entries.each do |key, value|
        if (value.instance_of? Array)
          value.each {|val| attribute_pairs << [key,val]}
        else
          attribute_pairs << [key, value]
        end
      end
      return attribute_pairs
    end #end of method prepare_attributes
  end #end of class ConstraintAdder
end #end of module Scrubyt
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Scrubyt
  # Filter that yields the value of an attribute; the attribute name is the
  # filter's example (@example).
  class AttributeFilter < BaseFilter

    # Looks for the nearest node to _source_ carrying the attribute named
    # @example (via XPathUtils.find_nearest_node_with_attribute). Returns a
    # one-element array with the attribute value when an Hpricot::Elem was
    # found, nil otherwise.
    def evaluate(source)
      elem = XPathUtils.find_nearest_node_with_attribute(source, @example)
      return nil unless elem.is_a? Hpricot::Elem
      [elem.attributes[@example]]
    end

  end #End of class AttributeFilter
end #End of module Scrubyt
|