scraypa 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 428e660dbb1d40ffb042053e7dbc0198b21aff72
4
+ data.tar.gz: fe07d8c05e7f458a41a234221ddab2c1c79f357c
5
+ SHA512:
6
+ metadata.gz: 1757aa2cb798597a6588c57975dfa516048286f2c43eebcbd79affb2fba56519fb034b15f44c8b01a0fc80938db5465f578e2cb85b1428ed6f7282a4afc35d77
7
+ data.tar.gz: 983a91b91f510bdf93142f6614262af81398a33ddcb41fd02e25b408ee944a2e06afaa41e760939bba7b37ab306638d392edb7015b342c27847851d2e9a36a25
@@ -0,0 +1,13 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ /.idea/
11
+
12
+ # rspec failure tracking
13
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.3.1
5
+ before_install: gem install bundler -v 1.14.6
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in scraypa.gemspec
4
+ gemspec
5
+
@@ -0,0 +1,90 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
+ ## Uncomment and set this to only include directories you want to watch
5
+ #directories %w(lib lib/scraypa/ lib/scraypa/eye/ spec) \
6
+ # .select{|d| Dir.exist?(d) ? d : UI.warning("Directory #{d} does not exist")}
7
+
8
+ ## Note: if you are using the `directories` clause above and you are not
9
+ ## watching the project directory ('.'), then you will want to move
10
+ ## the Guardfile to a watched dir and symlink it back, e.g.
11
+ #
12
+ # $ mkdir config
13
+ # $ mv Guardfile config/
14
+ # $ ln -s config/Guardfile .
15
+ #
16
+ # and, you'll have to watch "config/Guardfile" instead of "Guardfile"
17
+
18
+ # Note: The cmd option is now required due to the increasing number of ways
19
+ # rspec may be run, below are examples of the most common uses.
20
+ # * bundler: 'bundle exec rspec'
21
+ # * bundler binstubs: 'bin/rspec'
22
+ # * spring: 'bin/rspec' (This will use spring if running and you have
23
+ # installed the spring binstubs per the docs)
24
+ # * zeus: 'zeus rspec' (requires the server to be started separately)
25
+ # * 'just' rspec: 'rspec'
26
+ =begin
27
+ guard :rspec, cmd: "bundle exec rspec" do
28
+ require "guard/rspec/dsl"
29
+ dsl = Guard::RSpec::Dsl.new(self)
30
+
31
+ # Feel free to open issues for suggestions and improvements
32
+
33
+ # RSpec files
34
+ rspec = dsl.rspec
35
+ watch(rspec.spec_helper) { rspec.spec_dir }
36
+ watch(rspec.spec_support) { rspec.spec_dir }
37
+ watch(rspec.spec_files)
38
+
39
+ # Ruby files
40
+ ruby = dsl.ruby
41
+ dsl.watch_spec_files_for(ruby.lib_files)
42
+
43
+ # Rails files
44
+ rails = dsl.rails(view_extensions: %w(erb haml slim))
45
+ dsl.watch_spec_files_for(rails.app_files)
46
+ dsl.watch_spec_files_for(rails.views)
47
+
48
+ watch(rails.controllers) do |m|
49
+ [
50
+ rspec.spec.call("routing/#{m[1]}_routing"),
51
+ rspec.spec.call("controllers/#{m[1]}_controller"),
52
+ rspec.spec.call("acceptance/#{m[1]}")
53
+ ]
54
+ end
55
+
56
+ # Rails config changes
57
+ watch(rails.spec_helper) { rspec.spec_dir }
58
+ watch(rails.routes) { "#{rspec.spec_dir}/routing" }
59
+ watch(rails.app_controller) { "#{rspec.spec_dir}/controllers" }
60
+
61
+ # Capybara features specs
62
+ watch(rails.view_dirs) { |m| rspec.spec.call("features/#{m[1]}") }
63
+ watch(rails.layouts) { |m| rspec.spec.call("features/#{m[1]}") }
64
+
65
+ # Turnip features and steps
66
+ watch(%r{^spec/acceptance/(.+)\.feature$})
67
+ watch(%r{^spec/acceptance/steps/(.+)_steps\.rb$}) do |m|
68
+ Dir[File.join("**/#{m[1]}.feature")][0] || "spec/acceptance"
69
+ end
70
+ end
71
+ =end
72
+
73
+ guard :rspec, cmd: 'bundle exec rspec' do
74
+ require "guard/rspec/dsl"
75
+ dsl = Guard::RSpec::Dsl.new(self)
76
+
77
+ # RSpec files
78
+ rspec = dsl.rspec
79
+ watch(rspec.spec_helper) { rspec.spec_dir }
80
+ watch(rspec.spec_support) { rspec.spec_dir }
81
+ watch(rspec.spec_files)
82
+
83
+ # Ruby files
84
+ ruby = dsl.ruby
85
+ dsl.watch_spec_files_for(ruby.lib_files)
86
+
87
+ #watch(%r{^spec/.+_spec\.rb$})
88
+ #watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
89
+ #watch('spec/spec_helper.rb') { "spec" }
90
+ end
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2017 joshweir
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,142 @@
1
+ # Scraypa
2
+
3
+ A Ruby gem to scrape web content with configuration options including:
4
+
5
+ 1. [JavaScript support](#javascript-support)
6
+ 2. [The Onion Router (Tor)](#tor)
7
+ 3. [Disguise](#disguise) (TODO)
8
+
9
+ Scraypa is essentially a wrapper for the light-weight
10
+ [Rest Client](https://github.com/rest-client/rest-client) (if you don't require JavaScript support)
11
+ or [Capybara](https://github.com/teamcapybara/capybara) (for Javascript support).
12
+
13
+ ## Why?
14
+
15
+ A web scraper that can be configured to support javascript and/or Tor. If javascript is not required,
16
+ it will use the lighter Rest Client. Scraypa is an attempt to remove the complexities associated with web agent setup.
17
+
18
+ ## Installation
19
+
20
+ ### Install Tor (optional)
21
+
22
+ If you want to use Tor, install tor:
23
+
24
+ `sudo apt-get install tor`
25
+
26
+ ### Install Headless Chrome (optional)
27
+
28
+ If you want to use `:headless_chromium` with capybara, install
29
+ headless chrome by following instructions here:
30
+
31
+ http://blog.faraday.io/headless-chromium/
32
+
33
+ For ubuntu I did this:
34
+
35
+ 1. Install chromium:
36
+
37
+ git clone https://github.com/scheib/chromium-latest-linux.git
38
+ cd chromium-latest-linux
39
+ ./update-and-run.sh
40
+
41
+ 2. Install chromedriver by [following the build instructions](https://chromium.googlesource.com/chromium/src/+/master/docs/linux_build_instructions.md).
42
+
43
+ ### Install Scraypa
44
+
45
+ Add this line to your application's Gemfile:
46
+
47
+ ```ruby
48
+ gem 'scraypa'
49
+ ```
50
+
51
+ And then execute:
52
+
53
+ $ bundle install
54
+
55
+ Or install it yourself as:
56
+
57
+ $ gem install scraypa
58
+
59
+ ## Usage
60
+
61
+ response = Scraypa.visit(method: :get,
62
+ url: "http://example.com")
63
+
64
+ #the response contains the RestClient response object
65
+ response.code
66
+ #-> 200
67
+ response.to_str
68
+ #-> http://example.com content
69
+
70
+ By default Scraypa uses the rest-client gem which does
71
+ not support Javascript. The `#visit` method wraps the
72
+ [`RestClient#execute` method](https://github.com/rest-client/rest-client#passing-advanced-options)
73
+ so you can pass in whatever `RestClient#execute` will accept,
74
+ for example:
75
+
76
+ Scraypa.visit(method: :get,
77
+ url: 'http://example.com/resource',
78
+ timeout: 10,
79
+ headers: {params: {foo: 'bar'}})
80
+
81
+ ➔ GET http://example.com/resource?foo=bar
82
+
83
+ ### Javascript Support
84
+
85
+ Capybara is used for Javascript support:
86
+
87
+ #configure Scraypa to #use_capybara
88
+ #and choose your capybara driver, here is poltergeist:
89
+ Scraypa.configure do |config|
90
+ config.use_capybara = true
91
+ config.driver = :poltergeist
92
+ config.driver_options = {
93
+ :phantomjs => Phantomjs.path,
94
+ :js_errors => false,
95
+ :phantomjs_options => ["--web-security=true"]
96
+ }
97
+
98
+ #or you could instead use headless_chromium:
99
+ #config.driver = :headless_chromium
100
+ #config.driver_options = {
101
+ # browser: :chrome,
102
+ # desired_capabilities: Selenium::WebDriver::Remote::Capabilities.chrome(
103
+ # "chromeOptions" => {
104
+ # "binary" => "/home/resrev/chromium/src/out/Default/chrome",
105
+ # "args" => %w{headless no-sandbox disable-gpu}
106
+ # }
107
+ # )
108
+ #}
109
+ end
110
+
111
+ #when using capybara, just the url parameter is required:
112
+ response = Scraypa.visit(url: "http://example.com")
113
+
114
+ #the response contains the capybara page object
115
+ response.status_code
116
+ #-> 200
117
+ response.text
118
+ #-> http://example.com content
119
+
120
+ #execute some javascript:
121
+ response.execute_script(
122
+ "document.getElementsByTagName('body')[0].innerHTML = 'changed content';")
123
+ response.text
124
+ #-> "changed content"
125
+
126
+ ### Tor
127
+
128
+ TODO
129
+
130
+ ### Disguise
131
+
132
+ TODO
133
+
134
+ ## Contributing
135
+
136
+ Bug reports and pull requests are welcome on GitHub at https://github.com/joshweir/scraypa.
137
+
138
+
139
+ ## License
140
+
141
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
142
+
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "scraypa"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,189 @@
1
+ require "scraypa/version"
2
+ require "scraypa/configuration"
3
+ require "scraypa/visit/visit_interface"
4
+ require "scraypa/visit/visit_rest_client"
5
+ require "scraypa/visit/visit_capabara_poltergeist"
6
+ require "scraypa/visit/visit_capabara_headless_chromium"
7
+ require "scraypa/visit/visit_factory"
8
+ require "scraypa/user_agent/user_agent_abstract"
9
+ require "scraypa/user_agent/user_agent_common_aliases_lists"
10
+ require "scraypa/user_agent/user_agent_iterator"
11
+ require "scraypa/user_agent/user_agent_random"
12
+ require "scraypa/user_agent/user_agent_factory"
13
+ require "scraypa/throttle"
14
+ require "scraypa/driver_resetter"
15
+ require 'tormanager'
16
+
17
+ module Scraypa
18
+ class TorNotSupportedByAgent < StandardError; end
19
+ class UnrecognisedUserAgentsMethod < StandardError; end
20
+ class CapybaraDriverUnsupported < StandardError; end
21
+ class HeadlessChromiumMissingConfig < StandardError; end
22
+
23
+ class << self
24
+ attr_accessor :agent, :tor_process, :tor_ip_control, :tor_proxy,
25
+ :throttle, :user_agent_retriever, :driver_resetter
26
+
27
+ def configuration
28
+ @configuration ||= Configuration.new
29
+ end
30
+
31
+ def configuration=(config)
32
+ @configuration = config
33
+ end
34
+
35
+ def reset
36
+ @configuration = Configuration.new
37
+ reset_throttle
38
+ setup_scraypa
39
+ @configuration
40
+ end
41
+
42
+     def configure # yields the memoized configuration to the caller's block, then applies it
43
+       yield(configuration).tap{ # tap returns the block's result after running the steps below
44
+         validate_configuration
45
+         setup_scraypa
46
+       }
47
+     end
48
+
49
+     def visit params={} # params are handed unchanged to the agent's #execute
50
+       setup_scraypa unless @agent # build the agent lazily on the first visit
51
+       visit_with_throttle params
52
+     end
53
+
54
+ def change_tor_ip_address
55
+ @tor_ip_control.get_new_ip if using_tor?
56
+ end
57
+
58
+ def user_agent
59
+ @user_agent_retriever ?
60
+ @user_agent_retriever.current_user_agent : nil
61
+ end
62
+
63
+ private
64
+
65
+ def validate_configuration
66
+ headless_chromium_with_tor_is_invalid
67
+ end
68
+
69
+ def headless_chromium_with_tor_is_invalid
70
+ raise TorNotSupportedByAgent,
71
+ "Capybara :headless_chromium does not support Tor" if
72
+ using_tor? && @configuration.driver == :headless_chromium
73
+ end
74
+
75
+ def setup_scraypa
76
+ setup_user_agent
77
+ setup_tor
78
+ setup_driver_resetter
79
+ setup_agent
80
+ setup_throttle
81
+ end
82
+
83
+ def setup_user_agent
84
+ @user_agent_retriever =
85
+ @configuration.user_agent ?
86
+ UserAgentFactory.build(
87
+ merge_user_agent_list_limit_for_chrome(
88
+ @configuration.user_agent)) : nil
89
+ end
90
+
91
+ def merge_user_agent_list_limit_for_chrome config
92
+ @configuration.driver == :headless_chromium &&
93
+ !config[:list_limit] ?
94
+ config.merge({list_limit: 30}) : config
95
+ end
96
+
97
+     def setup_tor # (re)starts Tor when enabled but not running; tears it down when disabled but still running
98
+       ensure_tor_options_are_configured
99
+       using_tor? && !tor_running_in_current_process? ?
100
+           reset_tor :
101
+           (!using_tor? && tor_running_in_current_process? ?
102
+               destruct_tor : nil)
103
+     end
104
+
105
+ def ensure_tor_options_are_configured
106
+ if using_tor?
107
+ @configuration.tor_options ||= {}
108
+ @configuration.tor_options[:tor_port] ||= 9050
109
+ @configuration.tor_options[:control_port] ||= 50500
110
+ else
111
+ @configuration.tor_options = nil
112
+ end
113
+ end
114
+
115
+ def using_tor?
116
+ @configuration.tor
117
+ end
118
+
119
+ def tor_running_in_current_process?
120
+ @configuration.tor_options &&
121
+ @configuration.tor_options[:tor_port] ?
122
+ TorManager::TorProcess
123
+ .tor_running_on?(port: @configuration.tor_options[:tor_port],
124
+ parent_pid: Process.pid) :
125
+ TorManager::TorProcess
126
+ .tor_running_on?(parent_pid: Process.pid)
127
+ end
128
+
129
+ def reset_tor
130
+ destruct_tor
131
+ initialize_tor(@configuration.tor_options) if @configuration.tor
132
+ end
133
+
134
+ def initialize_tor params={}
135
+ @tor_process = TorManager::TorProcess.new params || {}
136
+ @tor_proxy =
137
+ TorManager::Proxy.new tor_process: @tor_process
138
+ @tor_ip_control = TorManager::IpAddressControl.new(
139
+ tor_process: @tor_process, tor_proxy: @tor_proxy)
140
+ @tor_process.start
141
+ end
142
+
143
+ def destruct_tor
144
+ @tor_process.stop if @tor_process
145
+ TorManager::TorProcess.stop_obsolete_processes
146
+ @tor_ip_control = nil
147
+ @tor_proxy = nil
148
+ @tor_process = nil
149
+ end
150
+
151
+ def setup_driver_resetter
152
+ @driver_resetter =
153
+ DriverResetter.new(
154
+ @configuration.reset_driver_every_n_requests)
155
+ end
156
+
157
+ def setup_agent
158
+ @agent = Scraypa::VisitFactory
159
+ .build(config: @configuration,
160
+ tor_proxy: @tor_proxy,
161
+ driver_resetter: @driver_resetter,
162
+ user_agent_retriever: @user_agent_retriever)
163
+ end
164
+
165
+ def setup_throttle
166
+ @throttle = Throttle.new seconds: @configuration.throttle_seconds if
167
+ throttle_config_has_changed?
168
+ end
169
+
170
+     def throttle_config_has_changed? # true when a usable throttle (Hash, or number > 0) is configured and differs from the current one
171
+       @configuration.throttle_seconds &&
172
+           (@configuration.throttle_seconds.is_a?(Hash) ||
173
+               @configuration.throttle_seconds.to_f > 0) &&
174
+           (!@throttle || @throttle.seconds != @configuration.throttle_seconds) # NOTE(review): never true once throttle_seconds is cleared, so an existing @throttle is kept - confirm intended
175
+     end
176
+
177
+ def visit_with_throttle params
178
+ @throttle.throttle if @throttle
179
+ response = @agent.execute(params)
180
+ @throttle.last_request_time = Time.now if @throttle
181
+ response
182
+ end
183
+
184
+ def reset_throttle
185
+ @throttle.last_request_time = nil if @throttle
186
+ @throttle = nil
187
+ end
188
+ end
189
+ end