watir_crawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .idea
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ --format progress
3
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in watir_crawler.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Dmitry T
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # WatirCrawler
2
+
3
+ A Watir-based web crawler.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'watir_crawler'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install watir_crawler
18
+
19
+ ## Usage
20
+
21
+ See `examples/example.rb` for a complete example crawler.
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require 'bundler/gem_tasks'
2
+
3
+ require 'rspec/core/rake_task'
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
7
+
@@ -0,0 +1,115 @@
1
+ require 'bundler/setup'
2
+ require 'watir_crawler'
3
+
4
+ #WatirCrawler.logger.level = Logger::INFO
5
+ #WatirCrawler.debug = true
6
+
7
# Example crawler built on WatirCrawler::Base. Every public method runs inside
# its own +browser_session+, so each call starts and stops the browser.
class CrawlerExample < WatirCrawler::Base
  # Reads the news list on the Yandex front page.
  #
  # Returns a Hash mapping the text of each list item to the URI of its link.
  def yandex_news
    browser_session do
      goto 'http://yandex.ru'

      list_xpath = "//ul[@class='b-news-list']"
      wait(list_xpath)

      # Relative xpaths inside the block are resolved against the news list.
      pull(list_xpath) do
        headlines = pull(:all, "./li").map(&:text)
        uris      = pull(:all, "./li/a").map(&:uri)

        Hash[headlines.zip(uris)]
      end
    end
  end

  # Opens the login frameset page; performs no further interaction.
  def goto_login
    browser_session do
      goto 'https://myaccount.alagasco.com/ccsuces/public/frameset_top_html.jsp'
    end
  end

  # Reads Google News section by section.
  #
  # Returns a Hash mapping each section name to an Array of article titles.
  def google_news
    browser_session do
      goto 'http://news.google.com/'

      sections_xpath = "//div[@class='section-stream-content']//div[@class='section-list-content']/div"
      wait(sections_xpath)

      pull(:all, sections_xpath).each_with_object({}) do |section, news|
        # Scope the relative lookups below to the current section.
        pull(section.node_xpath) do
          name = pull(".//span[@class='section-name']").text
          news[name] = pull(:all, ".//span[@class='titletext']").map(&:text)
        end
      end
    end
  end

  # Searches hideme.ru for an HTTP proxy listening on +proxy_port+.
  #
  # Returns a two-element Array: [proxy ip text, proxy_port].
  # Raises RuntimeError ('No proxy found') when the result table has no first row.
  def get_proxy(proxy_port = 3128)
    browser_session do
      goto 'http://hideme.ru/proxy-list/'

      wait("//input[@id='c_all']").clear
      sleep 1

      wait("//select[@id='country']").select_value('JP')
      wait("//input[@id='t_h']").set                  # set http proxy
      wait("//input[@id='maxtime']").set 1400         # set proxy timeout
      wait("//input[@id='ports']").set proxy_port     # set proxy port
      wait("//a[contains(@href,'search()')]").click   # search !

      # take the 1st proxy ip from the result table
      table_xpath = "//table[@class='pl']"
      wait(table_xpath)

      ip_cell = pull(table_xpath) { pull(".//tr[2]/td[1]") }
      raise 'No proxy found' unless ip_cell

      [ip_cell.text, proxy_port]
    end
  end

  # Configures the Firefox profile to use the given HTTP proxy, then returns
  # the text of the "ip" element shown by whatsmyip.org.
  def via_proxy(proxy_ip, proxy_port)
    browser_profile do |firefox_profile|
      firefox_profile['network.proxy.type']      = 1
      firefox_profile['network.proxy.http']      = proxy_ip
      firefox_profile['network.proxy.http_port'] = proxy_port.to_i
    end

    browser_session do
      goto 'http://www.whatsmyip.org/'
      wait("//span[@id='ip']").text
    end
  end
end
89
+
90
+ # ----------------------------------------------------------------------------------------------------------------------
91
+
92
# Timeouts handed to the crawler (merged over the library defaults).
timeouts = {
  :page_load    => 150,
  :wait_timeout => 100 # wait for element on the page
}

# Run every example in turn; only a too-slow site is reported gracefully,
# any other error propagates.
begin
  crawler = CrawlerExample.new(timeouts)

  puts 'Last google news'
  puts crawler.google_news.inspect

  puts 'Last yandex news'
  puts crawler.yandex_news.inspect

  proxy = crawler.get_proxy
  puts 'Found proxy: ' + proxy.join(':')

  puts "Current proxy: #{crawler.via_proxy(*proxy)}"
rescue WatirCrawler::SiteTooSlow
  puts '', 'ERROR: Site too slow'
end
115
+
@@ -0,0 +1,5 @@
1
module WatirCrawler
  # Common ancestor of the crawler classes; its only job is to mix in the
  # logging helpers so subclasses get +log+ at class and instance level.
  class Abstract
    include WatirCrawler::Loggable
  end
end
@@ -0,0 +1,216 @@
1
+ require 'timeout' # fix error 'uninitialized constant WatirCrawler::Base::Timeout'
2
+
3
module WatirCrawler
  # Base class for crawlers. Wraps a WatirCrawler::Browser and provides a small
  # XPath-driven DSL: +goto+, +exec+, +pull+, +wait+ and +exist?+.
  #
  # +pull+ with a block pushes the found element onto an internal scope stack,
  # so relative xpaths ("./...") inside the block are resolved against it.
  class Base < Abstract
    # Message fragments of low-level errors that point at the WebDriver/Firefox
    # connection itself; anything else is reported as SiteChanged.
    WEBDRIVER_FAILURE_PATTERNS = [
      /Connection reset by peer/,                   # SystemCallError
      /Connection refused/,                         # Errno::ECONNREFUSED
      /unable to obtain stable firefox connection/, # Selenium::WebDriver::Error::WebDriverError
      /unable to bind to locking port/              # Selenium::WebDriver::Error::WebDriverError
    ].freeze

    attr_reader :timeouts

    # timeouts - Hash merged over { :wait_timeout => 150 } and also passed on
    #            to WatirCrawler::Browser.
    def initialize(timeouts = {})
      @elements_path = [] # kept from the original; not read anywhere in this class
      @timeouts = { :wait_timeout => 150 }.merge(timeouts)
      @browser = WatirCrawler::Browser.new(@timeouts)
    end

    # Yields the Firefox profile (when a block is given) and returns it.
    def browser_profile
      @browser.profile do |profile|
        yield profile if block_given?
      end
    end

    # The underlying Watir browser, or nil before the browser was started.
    def browser
      @browser.browser
    end

    def browser_start
      @browser.start
    end

    def browser_stop
      @browser.stop
    end

    # Starts the browser, runs the block with error translation and timing,
    # and always stops the browser afterwards. Returns the block's value.
    def browser_session
      timer do
        catch_error do
          browser_start
          yield
        end
      end
    ensure
      browser_stop
    end

    # Logs the start and end of the block together with the elapsed time.
    def timer
      # Taken before logging so the ensure clause never sees a nil start time.
      start_time = Time.now
      log.info "Session start"
      yield
    ensure
      log.info "Session end, elapsed time: #{Time.now - start_time}"
    end

    # Runs the block and translates low-level failures:
    # * timeouts                      -> SiteTooSlow
    # * known WebDriver/Firefox errors -> WebdriverError
    # * other system/WebDriver errors  -> SiteChanged
    def catch_error
      yield
    rescue Timeout::Error,                                   # http connection with driver
           Selenium::WebDriver::Error::TimeOutError,         # browser.driver.manage.timeouts.page_load
           Selenium::WebDriver::Error::ScriptTimeOutError    # browser.driver.manage.timeouts.script_timeout

      log.error "Site is too slow at page: '#{current_url_for_log}'"
      raise SiteTooSlow

    rescue SystemCallError,                                  # includes Errno::ECONNREFUSED, 'Connection reset by peer'
           Selenium::WebDriver::Error::WebDriverError => e   # 'unable to obtain stable firefox connection in 60 seconds (127.0.0.1:7055)'
                                                             # 'unable to bind to locking port 7054 within 45 seconds'
      log "#{e.class}: #{e.message} \n#{e.backtrace.join("\n")}"

      webdriver_failure = WEBDRIVER_FAILURE_PATTERNS.any? { |pattern| pattern =~ e.message }
      raise(webdriver_failure ? WebdriverError : SiteChanged)
    end

    # --- commands

    # Navigates to +url+ unless the browser is already there.
    def goto url
      browser.goto url if url != browser.url
    end

    # Executes a JavaScript snippet in the browser.
    def exec script
      browser.execute_script(script)
    end

    # --------------------------------------------------------------------------------------------------------------------
    # Finds elements by xpath.
    #
    # args - any mix of xpath Strings and option Symbols:
    #        :present? (default) or :exist? - predicate an element must satisfy
    #        :first (default) or :all       - return one element or an Array
    #
    # Without a block: returns the first matching element (or nil), or an Array
    # of all matching elements with :all.
    # With a block: the first matching element becomes the scope for relative
    # xpaths inside the block; returns the block's value and raises SiteChanged
    # when nothing matched.
    #
    # Raises RuntimeError for unknown option symbols.
    def pull *args, &block
      opts, xpaths = args.flatten.partition { |arg| arg.is_a?(Symbol) }
      opt_mode  = opts.delete(:exist?) || opts.delete(:present?) || :present? # default is :present?
      opt_first = !!opts.delete(:first) || !opts.delete(:all)                 # default is true, return 1st element
      raise "Unknown options: '#{opts.inspect}'" if opts.any?

      # Keep xpaths whose first node satisfies the mode, expand them to all
      # nodes, then filter every node by the same mode.
      matching_xpaths = xpaths.select { |xpath| node_for(xpath).public_send(opt_mode) }
      candidates      = matching_xpaths.map { |xpath| nodes_for(xpath) }.flatten
      elements        = candidates.select { |node| node.public_send(opt_mode) }

      # flash result nodes
      elements = elements.take(1) if opt_first
      elements.each { |node| node.flash unless node.is_a?(Watir::Frame) }

      first_element = elements.first

      if block
        raise SiteChanged, "Not found elements for xpath: #{xpaths.inspect}" if first_element.nil?

        # Push/pop are paired right around the block, so a failure before the
        # push or a retry after it can never unbalance the scope stack.
        nodes_path << first_element.node_xpath
        begin
          yield
        ensure
          nodes_path.pop
        end
      elsif opt_first
        first_element && subtype_for(first_element)
      else
        elements.map { |element| subtype_for(element) }
      end
    rescue Selenium::WebDriver::Error::StaleElementReferenceError,
           Selenium::WebDriver::Error::ObsoleteElementError
      # The page changed under us; look the elements up again.
      sleep 1
      retry
    end

    # --------------------------------------------------------------------------------------------------------------------
    # Waits (up to timeouts[:wait_timeout]) until +pull+ finds something for the
    # given arguments, or until the block returns a truthy value.
    #
    # :first - return the first matching element, DEFAULT OPTION
    # :all   - return all matching elements
    #
    # Raises SiteChanged when the wait times out.
    def wait *xpaths, &block
      # TODO 3: raise_if_site_too_slow if respond_to?(:raise_if_site_too_slow)
      common_wait(*xpaths, &block)
    end

    # --------------------------------------------------------------------------------------------------------------------
    def common_wait *args, &block
      browser.wait_until(@timeouts[:wait_timeout]) do
        # TODO 1: raise_if_firefox_error if respond_to?(:raise_if_firefox_error)
        # TODO 2: raise_if_service_unavailable if respond_to?(:raise_if_service_unavailable)

        # Nothing to wait for: run the checks once and leave.
        return nil unless args.any? || block

        found = pull(args)
        # With :all an empty Array is truthy; treat it as "not found yet" so
        # the wait keeps polling instead of returning immediately.
        found = nil if found.is_a?(Array) && found.empty?

        found || (block && instance_eval(&block))
      end
    rescue Selenium::WebDriver::Error::StaleElementReferenceError,
           Selenium::WebDriver::Error::ObsoleteElementError
      sleep 1
      retry
    rescue Watir::Wait::TimeoutError
      raise SiteChanged
    end
    # --------------------------------------------------------------------------------------------------------------------

    # True when +pull+ (default :present? mode) finds an element for +xpath+.
    def exist? xpath
      !!pull(xpath)
    end

    # --------------------------------------------------------------------------------------------------------------------

    private

    # Stack of node xpaths pushed by nested block-form +pull+ calls.
    def nodes_path
      @nodes_path ||= []
    end

    def node_for xpath
      get_nodes(xpath, :get_all => false).first
    end

    def nodes_for xpath
      get_nodes(xpath, :get_all => true)
    end

    # Resolves +xpath+ to an Array of Watir elements, each tagged with a
    # +node_xpath+ accessor ("<xpath>[<1-based index>]").
    #
    # Relative xpaths are resolved through the current scope stack; locators
    # are invoked with public_send rather than building and eval-ing a string,
    # so quotes inside an xpath cannot break or inject code.
    def get_nodes xpath, opts
      scope_xpaths = xpath_relative?(xpath) ? nodes_path : []

      chain = scope_xpaths.map { |node_xpath| element_name_for(node_xpath) }
      chain << element_name_for(xpath, opts[:get_all])
      log "#{File.basename(__FILE__)}:#{__LINE__}, locator: " + chain.join('.').inspect

      scope = scope_xpaths.reduce(browser) do |context, node_xpath|
        context.public_send(locator_for(node_xpath), :xpath, node_xpath)
      end

      elements = scope.public_send(locator_for(xpath, opts[:get_all]), :xpath, xpath)
      elements = elements.to_a if elements.is_a? Watir::ElementCollection
      elements = [elements].flatten

      elements.each_with_index.map do |element, index|
        tag_node_xpath(element, xpath + "[#{index + 1}]")
      end
    end

    # Name of the Watir locator method for +xpath+: frame(s) for frame/iframe
    # xpaths, element(s) otherwise.
    def locator_for xpath, plural = false
      (xpath_with_frame?(xpath) ? 'frame' : 'element') + (plural ? 's' : '')
    end

    # Ruby-source description of the locator call, e.g. element(:xpath, "//a").
    # +inspect+ yields a correctly escaped string literal.
    def element_name_for xpath, plural = false
      "#{locator_for(xpath, plural)}(:xpath, #{xpath.inspect})"
    end

    # Adds a +node_xpath+ accessor to this one element and sets it.
    def tag_node_xpath element, node_xpath
      class << element
        attr_accessor :node_xpath
      end

      element.node_xpath = node_xpath
      element
    end

    # Converts +node+ to its specific Watir subtype, carrying +node_xpath+ over
    # so callers can still scope a later +pull+ with it.
    # NOTE(review): assumes to_subtype returns a new object — confirm against watir-webdriver.
    def subtype_for node
      tag_node_xpath(node.to_subtype, node.node_xpath)
    end

    # URL for log messages; never raises, because it is called while another
    # error is being handled and the browser may not have started.
    def current_url_for_log
      browser ? browser.url : 'unknown'
    rescue StandardError
      'unknown'
    end

    def xpath_relative? xpath
      xpath =~ /^\.\/.*/ # "./"
    end

    def xpath_with_frame? xpath
      xpath =~ /^[\.]?\/[\/]?[i]?frame.*/ # "//frame", "//iframe", ".//frame", ".//iframe"
    end

  end
end
@@ -0,0 +1,41 @@
1
module WatirCrawler
  # Owns the Firefox-backed Watir browser: holds its profile, applies the
  # configured timeouts on start, and closes it on stop.
  class Browser < Abstract
    # The current Watir::Browser, or nil before the first +start+.
    attr_reader :browser

    # timeouts - Hash merged over the defaults below.
    def initialize(timeouts)
      @browser = nil

      defaults = {
        :http_client_timeout => 120,
        :implicit_wait       => 0,
        :page_load           => 100,
        :script_timeout      => 10
      }
      @timeouts = defaults.merge(timeouts)
    end

    # Lazily creates the Firefox profile, yields it when a block is given,
    # and returns it.
    def profile
      @browser_profile ||= Selenium::WebDriver::Firefox::Profile.new
      yield @browser_profile if block_given?
      @browser_profile
    end

    # Launches Firefox with the current profile unless a live browser exists.
    def start
      return if @browser && @browser.exist?

      # See http://code.google.com/p/selenium/wiki/RubyBindings#Timeouts
      client = Selenium::WebDriver::Remote::Http::Default.new
      client.timeout = @timeouts[:http_client_timeout]

      @browser = Watir::Browser.new(:firefox, :profile => profile, :http_client => client)

      driver_timeouts = @browser.driver.manage.timeouts
      driver_timeouts.implicit_wait  = @timeouts[:implicit_wait]
      driver_timeouts.page_load      = @timeouts[:page_load]
      driver_timeouts.script_timeout = @timeouts[:script_timeout]
    end

    # Closes the browser if one was ever started.
    def stop
      @browser && @browser.close
    end
  end
end
@@ -0,0 +1,14 @@
1
+ require 'nestegg'
2
+
3
module WatirCrawler
  # Root of all crawler errors; mixes in Nestegg's nesting-exception support.
  class Error < StandardError
    include Nestegg::NestingException
  end

  # The WebDriver/Firefox connection itself failed.
  class WebdriverError < Error
  end

  # NOTE(review): not raised by the library code in this gem — presumably for crawler subclasses.
  class ServiceUnavailable < Error
  end

  # A page load, script, or driver connection timed out.
  class SiteTooSlow < Error
  end

  # An expected element was not found, or an unrecognised driver error occurred.
  class SiteChanged < Error
  end

  # NOTE(review): not raised by the library code in this gem.
  class UnknownError < Error
  end
end
@@ -0,0 +1,49 @@
1
+ require 'logger'
2
+
3
module WatirCrawler
  # Logging support.
  #
  # * `extend Loggable`  - the receiver gains logger/logger=/debug/debug= and log.
  # * `include Loggable` - the class and its instances gain log.
  #
  # Logger and debug flag are stored once (class variables of Loggable::Logger)
  # and therefore shared by everything that uses this module.
  module Loggable
    # Accessors for the shared logger and the debug switch.
    module Logger
      # The shared logger; Ruby's ::Logger writing to STDOUT by default.
      def logger
        @@logger ||= ::Logger.new(STDOUT)
      end

      def logger=(logger)
        @@logger = logger
      end

      # Whether `log(msg)` actually writes; false by default.
      def debug
        @@debug ||= false
      end

      def debug=(debug)
        @@debug = debug
      end
    end

    extend Logger

    # The `log` helper itself.
    module Log
      # With a message: writes it at debug level, but only when the debug
      # switch is on. Without a message: returns the shared logger.
      def log msg = nil
        return Loggable.logger unless msg

        Loggable.logger.debug(msg) if Loggable.debug
      end
    end

    class << self
      # Hook for `extend Loggable`.
      def extended(base)
        base.extend Logger
        base.extend Log
      end

      # Hook for `include Loggable`.
      def included(base)
        base.extend Log
        base.send :include, Log
      end
    end
  end

  extend Loggable
end
@@ -0,0 +1,3 @@
1
module WatirCrawler
  # Gem version string; frozen so the constant cannot be mutated in place.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,19 @@
1
require 'uri'
require 'open-uri' # gives URI::HTTP/HTTPS/FTP objects an #open method

# Convenience extensions to Watir elements used by the crawler.
module Watir
  class Element
    # Absolute URI of the element's +src+ (or, failing that, +href+)
    # attribute, resolved against the current page URL.
    #
    # Returns a String, or nil when the element has neither attribute.
    def uri
      url = attribute_value(:src) || attribute_value(:href)
      URI.join(browser.url, url).to_s if url
    end
  end

  class Image
    # Downloads the image and writes it to +filepath+ in binary mode.
    #
    # The download happens before the file is opened, so a failed download
    # does not leave an empty or truncated file behind.
    #
    # Returns +filepath+.
    # Raises ArgumentError when the image has no src/href to download.
    def save_to_file filepath
      source = uri
      raise ArgumentError, 'image has no src/href to download' unless source

      # URI#open (from open-uri) instead of Kernel#open: the URL comes from a
      # web page and must never be interpreted as a local path or a command.
      data = URI.parse(source).open { |remote| remote.read }

      File.open(filepath, 'wb') do |f|
        f.write data
      end

      filepath
    end
  end
end
19
+
@@ -0,0 +1,9 @@
1
+ require 'watir-webdriver'
2
+ require 'watir_crawler/watir-webdriver'
3
+
4
+ require 'watir_crawler/version'
5
+ require 'watir_crawler/errors'
6
+ require 'watir_crawler/loggable'
7
+ require 'watir_crawler/abstract'
8
+ require 'watir_crawler/browser'
9
+ require 'watir_crawler/base'
data/spec/base_spec.rb ADDED
@@ -0,0 +1,234 @@
1
# Integration spec for WatirCrawler::Base, run against temporary local HTML
# pages in a real browser.
#
# Expected failures are asserted with `expect { }.to raise_error`, so an
# example fails when the error is NOT raised (a begin/rescue check passes
# silently in that case).
describe 'WatirCrawler::Base : ' do
  before(:all) do
    #log_file = File.join('/tmp', 'watir_crawler.log')
    #puts "\nSaving debug log to: '#{log_file}'"
    #WatirCrawler.logger = ::Logger.new(log_file)
    #WatirCrawler.debug = true

    @htmlfile = Htmlfile.new

    @crawler = WatirCrawler::Base.new(:wait_timeout => 3)
    @crawler.browser_start
    @crawler.goto @htmlfile.url
  end

  after(:all) do
    @crawler.browser_stop
    @htmlfile.delete
  end

  # --------------------------------------------------------------------------------------------------------------------

  it 'pull' do
    # unknown option
    expect { @crawler.pull(:unknown_option) }.to raise_error(RuntimeError, /Unknown option/i)

    # not exist tag
    @crawler.pull('//not_exist_tag').should be_nil
    @crawler.pull(:first, '//not_exist_tag').should be_nil
    @crawler.pull(:all, '//not_exist_tag').should =~ []

    # exist tag, :present? mode by default
    @crawler.pull('//div').should be_a(Watir::HTMLElement)
    @crawler.pull(:first, '//div').should be_a(Watir::HTMLElement)

    result = @crawler.pull(:all, '//div')
    result.should be_a(Array)
    result.size.should eq(3)

    # exist tag, :present? mode
    @crawler.pull(:present?, '//div').should be_a(Watir::HTMLElement)
    @crawler.pull(:present?, :first, '//div').should be_a(Watir::HTMLElement)

    result = @crawler.pull(:present?, :all, '//div')
    result.should be_a(Array)
    result.size.should eq(3)

    # exist tag, :exist? mode
    @crawler.pull(:exist?, '//div').should be_a(Watir::HTMLElement)
    @crawler.pull(:exist?, :first, '//div').should be_a(Watir::HTMLElement)

    result = @crawler.pull(:exist?, :all, '//div')
    result.should be_a(Array)
    result.size.should eq(4)

    # hidden tag, :present? mode
    @crawler.pull(:present?, "//div[@id='4']").should be_nil

    result = @crawler.pull(:present?, :all, "//div[@id='4']")
    result.should be_a(Array)
    result.should be_empty

    # hidden tag, :exist? mode
    @crawler.pull(:exist?, "//div[@id='4']").should be_a(Watir::HTMLElement)

    result = @crawler.pull(:exist?, :all, "//div[@id='4']")
    result.should be_a(Array)
    result.size.should eq(1)
  end

  # --------------------------------------------------------------------------------------------------------------------

  it 'wait' do
    @crawler.wait.should be_nil

    # wait without params but with block
    @crawler.wait{ true }.should be_true

    expect { @crawler.wait{ false } }.to raise_error(WatirCrawler::SiteChanged)

    # unknown option
    expect { @crawler.wait(:unknown_option) }.to raise_error(RuntimeError, /Unknown option/i)

    # not exist tag
    expect { @crawler.wait('//not_exist_tag') }.to raise_error(WatirCrawler::SiteChanged)

    # exist tag, :present? mode by default
    @crawler.wait('//div').should be_a(Watir::HTMLElement)

    # exist tag, :present? mode
    @crawler.wait(:present?, '//div').should be_a(Watir::HTMLElement)

    # hidden tag, :present? mode
    expect { @crawler.wait(:present?, "//div[@id='4']") }.to raise_error(WatirCrawler::SiteChanged)

    # exist tag, :exist? mode
    @crawler.wait(:exist?, "//div[@id='1']").should be_a(Watir::HTMLElement)

    # hidden tag, :exist? mode
    @crawler.wait(:exist?, "//div[@id='4']").should be_a(Watir::HTMLElement)
  end

  # --------------------------------------------------------------------------------------------------------------------

  it 'exist?' do
    @crawler.exist?("//div[@id='1']").should be_true
    @crawler.exist?('//not_exist_tag').should be_false
  end

  # --------------------------------------------------------------------------------------------------------------------

  it 'pull for nested elements' do
    @crawler.pull("//span[text()='Test span 1']").should be_a(Watir::HTMLElement)
    @crawler.pull(".//span[text()='Test span 1']").should be_a(Watir::HTMLElement)

    result = @crawler.pull(:all, '//span')
    result.should be_a(Array)
    result.size.should eq(2)

    result = @crawler.pull(:all, './/span')
    result.should be_a(Array)
    result.size.should eq(2)

    @crawler.pull("//p[@id='level_1']") do
      @crawler.pull("//span[text()='Test span 1']").should be_a(Watir::HTMLElement)
      @crawler.pull(".//span[text()='Test span 1']").should be_nil

      result = @crawler.pull(:all, '//span')
      result.should be_a(Array)
      result.size.should eq(2)

      result = @crawler.pull(:all, './/span')
      result.should be_a(Array)
      result.should be_empty

      @crawler.pull("//p[@id='level_2']") do
        @crawler.pull("//span[text()='Test span 1']").should be_a(Watir::HTMLElement)
        @crawler.pull(".//span[text()='Test span 1']").should be_a(Watir::HTMLElement)

        result = @crawler.pull(:all, '//span')
        result.should be_a(Array)
        result.size.should eq(2)

        result = @crawler.pull(:all, './/span')
        result.should be_a(Array)
        result.size.should eq(2)
      end
    end

  end

  it 'wait for nested elements' do
    @crawler.wait("//span[text()='Test span 1']").should be_a(Watir::HTMLElement)
    @crawler.wait(".//span[text()='Test span 1']").should be_a(Watir::HTMLElement)

    result = @crawler.wait(:all, '//span')
    result.should be_a(Array)
    result.size.should eq(2)

    result = @crawler.wait(:all, './/span')
    result.should be_a(Array)
    result.size.should eq(2)

    @crawler.pull("//p[@id='level_1']") do
      @crawler.wait("//span[text()='Test span 1']").should be_a(Watir::HTMLElement)

      expect { @crawler.wait(".//span[text()='Test span 1']") }.to raise_error(WatirCrawler::SiteChanged)

      result = @crawler.wait(:all, '//span')
      result.should be_a(Array)
      result.size.should eq(2)

      expect { @crawler.wait('.//span') }.to raise_error(WatirCrawler::SiteChanged)

      @crawler.pull("//p[@id='level_2']") do
        @crawler.wait("//span[text()='Test span 1']").should be_a(Watir::HTMLElement)
        @crawler.wait(".//span[text()='Test span 1']").should be_a(Watir::HTMLElement)

        result = @crawler.wait(:all, '//span')
        result.should be_a(Array)
        result.size.should eq(2)

        result = @crawler.wait(:all, './/span')
        result.should be_a(Array)
        result.size.should eq(2)
      end
    end
  end

  it 'frame', :frame => true do
    @crawler.pull('//iframe') do
      @crawler.pull(".//span[text()='Test span 11']").should be_a(Watir::HTMLElement)

      @crawler.pull(".//span[text()='Test span 21']").should be_nil
      @crawler.pull('.//iframe') do
        @crawler.pull("//span[text()='Test span 1']").should be_a(Watir::HTMLElement)
        @crawler.pull("//span[text()='Test span 11']").should be_nil
        @crawler.pull("//span[text()='Test span 21']").should be_nil
        @crawler.pull(".//span[text()='Test span 11']").should be_nil
        @crawler.pull(".//span[text()='Test span 21']").should be_a(Watir::HTMLElement)
      end
    end
  end

end
@@ -0,0 +1,189 @@
1
+ RSpec.configure do |config|
2
+ config.treat_symbols_as_metadata_keys_with_true_values = true
3
+ config.run_all_when_everything_filtered = true
4
+ config.filter_run :focus
5
+ end
6
+
7
+ require 'watir_crawler'
8
+ WORKING_DIRECTORY = '/tmp'
9
+ $debug = true
10
+ # ----------------------------------------------------------------------------------------------------------------------
11
+
12
+ require 'tempfile'
13
+
14
# Spec fixture: writes a small set of temporary HTML pages and exposes the
# file:// URL of the main one.
#
# The main page embeds frame 1 in an iframe, and frame 1 embeds frame 2. All
# pages share one layout (three visible divs, one hidden div, two nested
# paragraphs holding two spans) and differ only in the id/text prefix.
class Htmlfile
  def initialize
    # Innermost pages first: each outer page embeds the path of the file it frames.
    @frameset = write_page('frameset', frameset_content)
    @frame2   = write_page('frame2', frame2_content)
    @frame1   = write_page('frame1', frame1_content)
    @file     = write_page('test', content)
  end

  # HTML of the main test page (embeds frame 1).
  def content
    # TODO: embed #{frameset} as well
    page('Test', '', nil, frame1)
  end

  # <iframe> markup pointing at the frame 1 file.
  def frame1
    iframe('loginframe1', @frame1)
  end

  # <iframe> markup pointing at the frame 2 file.
  def frame2
    iframe('loginframe2', @frame2)
  end

  # HTML of frame 1 (ids/texts prefixed with "1"; embeds frame 2).
  def frame1_content
    page('Frame 1', '1', 'Frame 1', frame2)
  end

  # HTML of frame 2 (ids/texts prefixed with "2"; embeds nothing).
  def frame2_content
    page('Frame 2', '2', 'Frame 2')
  end

  # <frameset> markup whose three frames all point at the frameset file.
  # TODO: not embedded in any page yet.
  def frameset
    src = file_url(@frameset)

    [
      '<frameset rows="80,*" cols="*">',
      %(<frame src="#{src}" name="topFrame">),
      '<frameset cols="80,*">',
      %(<frame src="#{src}" name="leftFrame">),
      %(<frame src="#{src}" name="mainFrame">),
      '</frameset>',
      '</frameset>'
    ].join("\n") + "\n"
  end

  # HTML of the page shown inside the frameset (ids/texts prefixed with "11").
  def frameset_content
    page('Frameset Frame', '11', 'Frameset Frame')
  end

  # file:// URL of the main test page.
  def url
    file_url @file
  end

  def file_url file
    "file://#{file.path}"
  end

  # Removes every temporary file created by this fixture.
  def delete
    [
      @frameset,
      @frame1,
      @frame2,
      @file
    ].each(&:unlink)
  end

  private

  # Writes +html+ to a closed Tempfile named "<basename>....html" and returns
  # it. The explicit extension lets the browser recognise the file as HTML by
  # name rather than by content sniffing.
  def write_page(basename, html)
    file = Tempfile.new([basename, '.html'])
    file.write(html)
    file.close
    file
  end

  # Builds one HTML document of the shared layout.
  #
  # title     - text of <title>
  # id_prefix - String prepended to every div id, paragraph level and span number
  # heading   - optional <h2> text
  # embedded  - optional markup inserted just before </body>
  def page(title, id_prefix, heading = nil, embedded = nil)
    lines = ['<!DOCTYPE html>', '<html>', '<head>', "<title>#{title}</title>", '</head>', '<body>']
    lines << "<h2>#{heading}</h2>" if heading

    (1..3).each { |n| lines << %(<div id="#{id_prefix}#{n}">div #{n}</div>) }
    lines << %(<div id="#{id_prefix}4" style="display:none;">div 4</div>)

    lines << %(<p id="level_#{id_prefix}1">) << %(<p id="level_#{id_prefix}2">)
    lines << "<span>Test span #{id_prefix}1</span>" << "<span>Test span #{id_prefix}2</span>"
    lines << '</p>' << '</p>'

    lines << embedded if embedded
    lines << '</body>' << '</html>'

    lines.join("\n") + "\n"
  end

  # <iframe> element with the given id whose src is +file+'s URL.
  def iframe(id, file)
    %(<iframe id="#{id}" src="#{file_url file}" style=""></iframe>\n)
  end
end
188
+
189
+
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
# Make lib/ loadable so the version constant can be read below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'watir_crawler/version'

Gem::Specification.new do |gem|
  gem.name          = 'watir_crawler'
  gem.version       = WatirCrawler::VERSION
  gem.authors       = ['Dmitry T']
  gem.email         = ['atlancer@gmail.com']
  gem.description   = 'A watir based web crawler'
  gem.summary       = 'A watir based web crawler'
  gem.homepage      = ''
  gem.license       = 'MIT'

  # Package exactly what git tracks.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  # Runtime dependencies.
  gem.add_dependency 'nestegg'
  gem.add_dependency 'settingslogic'
  gem.add_dependency 'watir-webdriver', '~> 0.6.4'

  # Development-only dependencies.
  gem.add_development_dependency 'bundler', '~> 1.3'
  gem.add_development_dependency 'rspec', '~> 2.6'
  gem.add_development_dependency 'rake'
end
metadata ADDED
@@ -0,0 +1,168 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: watir_crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Dmitry T
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-01-21 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nestegg
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: settingslogic
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: watir-webdriver
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: 0.6.4
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 0.6.4
62
+ - !ruby/object:Gem::Dependency
63
+ name: bundler
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ~>
68
+ - !ruby/object:Gem::Version
69
+ version: '1.3'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ version: '1.3'
78
+ - !ruby/object:Gem::Dependency
79
+ name: rspec
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ~>
84
+ - !ruby/object:Gem::Version
85
+ version: '2.6'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ~>
92
+ - !ruby/object:Gem::Version
93
+ version: '2.6'
94
+ - !ruby/object:Gem::Dependency
95
+ name: rake
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ description: A watir based web crawler
111
+ email:
112
+ - atlancer@gmail.com
113
+ executables: []
114
+ extensions: []
115
+ extra_rdoc_files: []
116
+ files:
117
+ - .gitignore
118
+ - .rspec
119
+ - Gemfile
120
+ - LICENSE.txt
121
+ - README.md
122
+ - Rakefile
123
+ - examples/example.rb
124
+ - lib/watir_crawler.rb
125
+ - lib/watir_crawler/abstract.rb
126
+ - lib/watir_crawler/base.rb
127
+ - lib/watir_crawler/browser.rb
128
+ - lib/watir_crawler/errors.rb
129
+ - lib/watir_crawler/loggable.rb
130
+ - lib/watir_crawler/version.rb
131
+ - lib/watir_crawler/watir-webdriver.rb
132
+ - spec/base_spec.rb
133
+ - spec/spec_helper.rb
134
+ - watir_crawler.gemspec
135
+ homepage: ''
136
+ licenses:
137
+ - MIT
138
+ post_install_message:
139
+ rdoc_options: []
140
+ require_paths:
141
+ - lib
142
+ required_ruby_version: !ruby/object:Gem::Requirement
143
+ none: false
144
+ requirements:
145
+ - - ! '>='
146
+ - !ruby/object:Gem::Version
147
+ version: '0'
148
+ segments:
149
+ - 0
150
+ hash: -715382381
151
+ required_rubygems_version: !ruby/object:Gem::Requirement
152
+ none: false
153
+ requirements:
154
+ - - ! '>='
155
+ - !ruby/object:Gem::Version
156
+ version: '0'
157
+ segments:
158
+ - 0
159
+ hash: -715382381
160
+ requirements: []
161
+ rubyforge_project:
162
+ rubygems_version: 1.8.25
163
+ signing_key:
164
+ specification_version: 3
165
+ summary: A watir based web crawler
166
+ test_files:
167
+ - spec/base_spec.rb
168
+ - spec/spec_helper.rb