scraypa 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 428e660dbb1d40ffb042053e7dbc0198b21aff72
4
+ data.tar.gz: fe07d8c05e7f458a41a234221ddab2c1c79f357c
5
+ SHA512:
6
+ metadata.gz: 1757aa2cb798597a6588c57975dfa516048286f2c43eebcbd79affb2fba56519fb034b15f44c8b01a0fc80938db5465f578e2cb85b1428ed6f7282a4afc35d77
7
+ data.tar.gz: 983a91b91f510bdf93142f6614262af81398a33ddcb41fd02e25b408ee944a2e06afaa41e760939bba7b37ab306638d392edb7015b342c27847851d2e9a36a25
@@ -0,0 +1,13 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ /.idea/
11
+
12
+ # rspec failure tracking
13
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.3.1
5
+ before_install: gem install bundler -v 1.14.6
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in scraypa.gemspec
4
+ gemspec
5
+
@@ -0,0 +1,90 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
+ ## Uncomment and set this to only include directories you want to watch
5
+ #directories %w(lib lib/scraypa/ lib/scraypa/eye/ spec) \
6
+ # .select{|d| Dir.exists?(d) ? d : UI.warning("Directory #{d} does not exist")}
7
+
8
+ ## Note: if you are using the `directories` clause above and you are not
9
+ ## watching the project directory ('.'), then you will want to move
10
+ ## the Guardfile to a watched dir and symlink it back, e.g.
11
+ #
12
+ # $ mkdir config
13
+ # $ mv Guardfile config/
14
+ # $ ln -s config/Guardfile .
15
+ #
16
+ # and, you'll have to watch "config/Guardfile" instead of "Guardfile"
17
+
18
+ # Note: The cmd option is now required due to the increasing number of ways
19
+ # rspec may be run, below are examples of the most common uses.
20
+ # * bundler: 'bundle exec rspec'
21
+ # * bundler binstubs: 'bin/rspec'
22
+ # * spring: 'bin/rspec' (This will use spring if running and you have
23
+ # installed the spring binstubs per the docs)
24
+ # * zeus: 'zeus rspec' (requires the server to be started separately)
25
+ # * 'just' rspec: 'rspec'
26
+ =begin
27
+ guard :rspec, cmd: "bundle exec rspec" do
28
+ require "guard/rspec/dsl"
29
+ dsl = Guard::RSpec::Dsl.new(self)
30
+
31
+ # Feel free to open issues for suggestions and improvements
32
+
33
+ # RSpec files
34
+ rspec = dsl.rspec
35
+ watch(rspec.spec_helper) { rspec.spec_dir }
36
+ watch(rspec.spec_support) { rspec.spec_dir }
37
+ watch(rspec.spec_files)
38
+
39
+ # Ruby files
40
+ ruby = dsl.ruby
41
+ dsl.watch_spec_files_for(ruby.lib_files)
42
+
43
+ # Rails files
44
+ rails = dsl.rails(view_extensions: %w(erb haml slim))
45
+ dsl.watch_spec_files_for(rails.app_files)
46
+ dsl.watch_spec_files_for(rails.views)
47
+
48
+ watch(rails.controllers) do |m|
49
+ [
50
+ rspec.spec.call("routing/#{m[1]}_routing"),
51
+ rspec.spec.call("controllers/#{m[1]}_controller"),
52
+ rspec.spec.call("acceptance/#{m[1]}")
53
+ ]
54
+ end
55
+
56
+ # Rails config changes
57
+ watch(rails.spec_helper) { rspec.spec_dir }
58
+ watch(rails.routes) { "#{rspec.spec_dir}/routing" }
59
+ watch(rails.app_controller) { "#{rspec.spec_dir}/controllers" }
60
+
61
+ # Capybara features specs
62
+ watch(rails.view_dirs) { |m| rspec.spec.call("features/#{m[1]}") }
63
+ watch(rails.layouts) { |m| rspec.spec.call("features/#{m[1]}") }
64
+
65
+ # Turnip features and steps
66
+ watch(%r{^spec/acceptance/(.+)\.feature$})
67
+ watch(%r{^spec/acceptance/steps/(.+)_steps\.rb$}) do |m|
68
+ Dir[File.join("**/#{m[1]}.feature")][0] || "spec/acceptance"
69
+ end
70
+ end
71
+ =end
72
+
73
# Guard plugin configuration: runs the suite via `bundle exec rspec` for the
# watched RSpec and library files declared below.
guard :rspec, cmd: 'bundle exec rspec' do
  require "guard/rspec/dsl"
  guard_dsl = Guard::RSpec::Dsl.new(self)

  # RSpec files: the helper and support files map to the whole spec
  # directory, individual spec files map to themselves.
  spec_paths = guard_dsl.rspec
  [spec_paths.spec_helper, spec_paths.spec_support].each do |pattern|
    watch(pattern) { spec_paths.spec_dir }
  end
  watch(spec_paths.spec_files)

  # Ruby files: library files are mapped to their specs by the DSL helper.
  guard_dsl.watch_spec_files_for(guard_dsl.ruby.lib_files)

  # Hand-written equivalents of the watchers above, kept for reference:
  #watch(%r{^spec/.+_spec\.rb$})
  #watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
  #watch('spec/spec_helper.rb') { "spec" }
end
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2017 joshweir
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,142 @@
1
+ # Scraypa
2
+
3
+ A Ruby gem to scrape web content with configuration options including:
4
+
5
+ 1. [JavaScript support](#javascript-support)
6
+ 2. [The Onion Router (Tor)](#tor)
7
+ 3. [Disguise](#disguise) (TODO)
8
+
9
+ Scraypa is essentially a wrapper for the light-weight
10
+ [Rest Client](https://github.com/rest-client/rest-client) (if you don't require JavaScript support)
11
+ or [Capybara](https://github.com/teamcapybara/capybara) (for Javascript support).
12
+
13
+ ## Why?
14
+
15
+ A web scraper that can be configured to support javascript and/or Tor. If javascript is not required,
16
+ it will use the lighter Rest Client. Scraypa is an attempt to remove the complexities associated with web agent setup.
17
+
18
+ ## Installation
19
+
20
+ ### Install Tor (optional)
21
+
22
+ If you want to use Tor, install tor:
23
+
24
+ `sudo apt-get install tor`
25
+
26
+ ### Install Headless Chrome (optional)
27
+
28
+ If you want to use `:headless_chromium` with capybara, install
29
+ headless chrome by following instructions here:
30
+
31
+ http://blog.faraday.io/headless-chromium/
32
+
33
+ For ubuntu I did this:
34
+
35
+ 1. Install chromium:
36
+
37
+ git clone https://github.com/scheib/chromium-latest-linux.git
38
+ cd chromium-latest-linux
39
+ ./update-and-run.sh
40
+
41
+ 2. Install chromedriver by [following the build instructions](https://chromium.googlesource.com/chromium/src/+/master/docs/linux_build_instructions.md).
42
+
43
+ ### Install Scraypa
44
+
45
+ Add this line to your application's Gemfile:
46
+
47
+ ```ruby
48
+ gem 'scraypa'
49
+ ```
50
+
51
+ And then execute:
52
+
53
+ $ bundle install
54
+
55
+ Or install it yourself as:
56
+
57
+ $ gem install scraypa
58
+
59
+ ## Usage
60
+
61
+ response = Scraypa.visit(method: :get,
62
+ url: "http://example.com")
63
+
64
+ #the response contains the RestClient response object
65
+ response.code
66
+ #-> 200
67
+ response.to_str
68
+ #-> http://example.com content
69
+
70
+ By default Scraypa uses the rest-client gem which does
71
+ not support Javascript. The `#visit` method wraps the
72
+ [`RestClient#execute` method](https://github.com/rest-client/rest-client#passing-advanced-options)
73
+ so you can pass in whatever `RestClient#execute` will accept,
74
+ for example:
75
+
76
+ Scraypa.visit(method: :get,
77
+ url: 'http://example.com/resource',
78
+ timeout: 10,
79
+ headers: {params: {foo: 'bar'}})
80
+
81
+ ➔ GET http://example.com/resource?foo=bar
82
+
83
+ ### Javascript Support
84
+
85
+ Capybara is used for Javascript support:
86
+
87
+ #configure Scraypa to #use_capybara
88
+ #and choose your capybara driver, here is poltergeist:
89
+ Scraypa.configure do |config|
90
+ config.use_capybara = true
91
+ config.driver = :poltergeist
92
+ config.driver_options = {
93
+ :phantomjs => Phantomjs.path,
94
+ :js_errors => false,
95
+ :phantomjs_options => ["--web-security=true"]
96
+ }
97
+
98
+ #or you could instead use headless_chromium:
99
+ #config.driver = :headless_chromium
100
+ #config.driver_options = {
101
+ # browser: :chrome,
102
+ # desired_capabilities: Selenium::WebDriver::Remote::Capabilities.chrome(
103
+ # "chromeOptions" => {
104
+ # "binary" => "/home/resrev/chromium/src/out/Default/chrome",
105
+ # "args" => %w{headless no-sandbox disable-gpu}
106
+ # }
107
+ # )
108
+ #}
109
+ end
110
+
111
+ #when using capybara, just the url parameter is required:
112
+ response = Scraypa.visit(url: "http://example.com")
113
+
114
+ #the response contains the capybara page object
115
+ response.status_code
116
+ #-> 200
117
+ response.text
118
+ #-> http://example.com content
119
+
120
+ #execute some javascript:
121
+ response.execute_script(
122
+ "document.getElementsByTagName('body')[0].innerHTML = 'changed content';")
123
+ response.text
124
+ #-> "changed content"
125
+
126
+ ### Tor
127
+
128
+ TODO
129
+
130
+ ### Disguise
131
+
132
+ TODO
133
+
134
+ ## Contributing
135
+
136
+ Bug reports and pull requests are welcome on GitHub at https://github.com/joshweir/scraypa.
137
+
138
+
139
+ ## License
140
+
141
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
142
+
@@ -0,0 +1,6 @@
1
# Rake entry point for the gem: pulls in Bundler's gem tasks and defines the
# RSpec task.
require "bundler/gem_tasks"
require "rspec/core/rake_task"

# Defines the `spec` task.
RSpec::Core::RakeTask.new(:spec)

# A bare `rake` invocation runs the spec task.
task default: :spec
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "scraypa"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,189 @@
1
+ require "scraypa/version"
2
+ require "scraypa/configuration"
3
+ require "scraypa/visit/visit_interface"
4
+ require "scraypa/visit/visit_rest_client"
5
+ require "scraypa/visit/visit_capabara_poltergeist"
6
+ require "scraypa/visit/visit_capabara_headless_chromium"
7
+ require "scraypa/visit/visit_factory"
8
+ require "scraypa/user_agent/user_agent_abstract"
9
+ require "scraypa/user_agent/user_agent_common_aliases_lists"
10
+ require "scraypa/user_agent/user_agent_iterator"
11
+ require "scraypa/user_agent/user_agent_random"
12
+ require "scraypa/user_agent/user_agent_factory"
13
+ require "scraypa/throttle"
14
+ require "scraypa/driver_resetter"
15
+ require 'tormanager'
16
+
17
# Top-level facade of the gem. All state (configuration, agent, Tor handles,
# throttle, user agent retriever, driver resetter) is held in instance
# variables of the module itself and exposed through the accessors below.
module Scraypa
  # Raised when Tor is enabled together with the :headless_chromium driver.
  class TorNotSupportedByAgent < StandardError; end
  class UnrecognisedUserAgentsMethod < StandardError; end
  class CapybaraDriverUnsupported < StandardError; end
  class HeadlessChromiumMissingConfig < StandardError; end

  class << self
    attr_accessor :agent, :tor_process, :tor_ip_control, :tor_proxy,
                  :throttle, :user_agent_retriever, :driver_resetter

    # @return [Configuration] the current configuration, lazily created on
    #   first access.
    def configuration
      @configuration ||= Configuration.new
    end

    # Replaces the configuration object. Does not rebuild the agent; the new
    # settings are picked up by the next #configure / #reset (or by the first
    # #visit when no agent exists yet).
    def configuration=(config)
      @configuration = config
    end

    # Discards the current configuration and throttle, then rebuilds every
    # collaborator from a fresh default Configuration.
    #
    # @return [Configuration] the new configuration
    def reset
      @configuration = Configuration.new
      reset_throttle
      setup_scraypa
      @configuration
    end

    # Yields the configuration for modification, validates it and rebuilds
    # the collaborators.
    #
    # @yieldparam config [Configuration]
    # @return [Object] whatever the block returned
    # @raise [TorNotSupportedByAgent] if Tor is combined with
    #   :headless_chromium
    def configure
      block_result = yield(configuration)
      validate_configuration
      setup_scraypa
      block_result
    end

    # Performs a request through the configured agent, applying the throttle
    # when one is set. Sets everything up on first use, so calling this
    # without a prior #configure works with the default configuration.
    #
    # @param params [Hash] passed straight to the agent's #execute
    # @return [Object] the agent's response
    def visit(params = {})
      setup_scraypa unless @agent
      visit_with_throttle(params)
    end

    # Asks the Tor IP controller for a new IP address.
    #
    # @return [Object, nil] the controller's result, or nil when Tor is not
    #   enabled or has not been started yet
    def change_tor_ip_address
      return unless using_tor? && @tor_ip_control
      @tor_ip_control.get_new_ip
    end

    # @return [Object, nil] the user agent currently reported by the
    #   retriever, or nil when no retriever is configured
    def user_agent
      @user_agent_retriever.current_user_agent if @user_agent_retriever
    end

    private

    # NOTE: every helper below reads the configuration through the
    # #configuration accessor rather than @configuration. The ivar is nil
    # until the accessor (or #reset) has run, which previously made
    # `Scraypa.visit` without a prior `configure` fail with NoMethodError.

    def validate_configuration
      headless_chromium_with_tor_is_invalid
    end

    def headless_chromium_with_tor_is_invalid
      return unless using_tor? && configuration.driver == :headless_chromium
      raise TorNotSupportedByAgent,
            "Capybara :headless_chromium does not support Tor"
    end

    # Rebuilds all collaborators. Order matters: the agent is built from the
    # user agent retriever, Tor proxy and driver resetter created before it.
    def setup_scraypa
      setup_user_agent
      setup_tor
      setup_driver_resetter
      setup_agent
      setup_throttle
    end

    def setup_user_agent
      user_agent_config = configuration.user_agent
      @user_agent_retriever =
        if user_agent_config
          UserAgentFactory.build(
            merge_user_agent_list_limit_for_chrome(user_agent_config))
        end
    end

    # Adds a default list_limit of 30 for the :headless_chromium driver when
    # the caller did not supply one; otherwise returns the config untouched.
    def merge_user_agent_list_limit_for_chrome(config)
      return config unless configuration.driver == :headless_chromium
      return config if config[:list_limit]
      config.merge(list_limit: 30)
    end

    # Starts Tor when it is wanted but not running under this process, and
    # tears it down when it is running but no longer wanted.
    def setup_tor
      ensure_tor_options_are_configured
      if using_tor? && !tor_running_in_current_process?
        reset_tor
      elsif !using_tor? && tor_running_in_current_process?
        destruct_tor
      end
    end

    # Fills in default ports when Tor is enabled; clears the options when it
    # is not.
    def ensure_tor_options_are_configured
      if using_tor?
        configuration.tor_options ||= {}
        configuration.tor_options[:tor_port] ||= 9050
        configuration.tor_options[:control_port] ||= 50500
      else
        configuration.tor_options = nil
      end
    end

    def using_tor?
      configuration.tor
    end

    # Queries TorManager for a Tor process owned by this Ruby process,
    # narrowing by port when one is configured.
    def tor_running_in_current_process?
      tor_options = configuration.tor_options
      if tor_options && tor_options[:tor_port]
        TorManager::TorProcess
          .tor_running_on?(port: tor_options[:tor_port],
                           parent_pid: Process.pid)
      else
        TorManager::TorProcess.tor_running_on?(parent_pid: Process.pid)
      end
    end

    def reset_tor
      destruct_tor
      initialize_tor(configuration.tor_options) if configuration.tor
    end

    # Builds the Tor process, proxy and IP controller, then starts the
    # process.
    def initialize_tor(params = {})
      @tor_process = TorManager::TorProcess.new(params || {})
      @tor_proxy = TorManager::Proxy.new(tor_process: @tor_process)
      @tor_ip_control = TorManager::IpAddressControl.new(
        tor_process: @tor_process, tor_proxy: @tor_proxy)
      @tor_process.start
    end

    # Stops the managed Tor process (if any), asks TorManager to stop
    # obsolete processes and drops all Tor handles.
    def destruct_tor
      @tor_process.stop if @tor_process
      TorManager::TorProcess.stop_obsolete_processes
      @tor_ip_control = nil
      @tor_proxy = nil
      @tor_process = nil
    end

    def setup_driver_resetter
      @driver_resetter =
        DriverResetter.new(configuration.reset_driver_every_n_requests)
    end

    def setup_agent
      @agent = Scraypa::VisitFactory
               .build(config: configuration,
                      tor_proxy: @tor_proxy,
                      driver_resetter: @driver_resetter,
                      user_agent_retriever: @user_agent_retriever)
    end

    # Creates a new Throttle only when throttling is enabled and its setting
    # differs from the current throttle.
    # NOTE(review): an existing throttle is kept when throttle_seconds is
    # later set to nil/0 via #configure; only #reset clears it. Confirm that
    # this is intended.
    def setup_throttle
      return unless throttle_config_has_changed?
      @throttle = Throttle.new(seconds: configuration.throttle_seconds)
    end

    def throttle_config_has_changed?
      seconds = configuration.throttle_seconds
      seconds &&
        (seconds.is_a?(Hash) || seconds.to_f > 0) &&
        (!@throttle || @throttle.seconds != seconds)
    end

    # Waits on the throttle (when present), executes the request and records
    # the completion time on the throttle.
    def visit_with_throttle(params)
      @throttle.throttle if @throttle
      response = @agent.execute(params)
      @throttle.last_request_time = Time.now if @throttle
      response
    end

    def reset_throttle
      @throttle.last_request_time = nil if @throttle
      @throttle = nil
    end
  end
end