scraypa 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rspec +2 -0
- data/.travis.yml +5 -0
- data/Gemfile +5 -0
- data/Guardfile +90 -0
- data/LICENSE.txt +21 -0
- data/README.md +142 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/scraypa.rb +189 -0
- data/lib/scraypa/configuration.rb +20 -0
- data/lib/scraypa/driver_resetter.rb +37 -0
- data/lib/scraypa/throttle.rb +27 -0
- data/lib/scraypa/user_agent/user_agent_abstract.rb +15 -0
- data/lib/scraypa/user_agent/user_agent_common_aliases_lists.rb +27 -0
- data/lib/scraypa/user_agent/user_agent_factory.rb +19 -0
- data/lib/scraypa/user_agent/user_agent_iterator.rb +97 -0
- data/lib/scraypa/user_agent/user_agent_random.rb +66 -0
- data/lib/scraypa/version.rb +3 -0
- data/lib/scraypa/visit/visit_capabara_headless_chromium.rb +120 -0
- data/lib/scraypa/visit/visit_capabara_poltergeist.rb +76 -0
- data/lib/scraypa/visit/visit_factory.rb +18 -0
- data/lib/scraypa/visit/visit_interface.rb +13 -0
- data/lib/scraypa/visit/visit_rest_client.rb +39 -0
- data/scraypa.gemspec +42 -0
- metadata +322 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 428e660dbb1d40ffb042053e7dbc0198b21aff72
|
4
|
+
data.tar.gz: fe07d8c05e7f458a41a234221ddab2c1c79f357c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1757aa2cb798597a6588c57975dfa516048286f2c43eebcbd79affb2fba56519fb034b15f44c8b01a0fc80938db5465f578e2cb85b1428ed6f7282a4afc35d77
|
7
|
+
data.tar.gz: 983a91b91f510bdf93142f6614262af81398a33ddcb41fd02e25b408ee944a2e06afaa41e760939bba7b37ab306638d392edb7015b342c27847851d2e9a36a25
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Guardfile
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
# A sample Guardfile
|
2
|
+
# More info at https://github.com/guard/guard#readme
|
3
|
+
|
4
|
+
## Uncomment and set this to only include directories you want to watch
|
5
|
+
#directories %w(lib lib/scraypa/ lib/scraypa/eye/ spec) \
|
6
|
+
# .select{|d| Dir.exists?(d) ? d : UI.warning("Directory #{d} does not exist")}
|
7
|
+
|
8
|
+
## Note: if you are using the `directories` clause above and you are not
|
9
|
+
## watching the project directory ('.'), then you will want to move
|
10
|
+
## the Guardfile to a watched dir and symlink it back, e.g.
|
11
|
+
#
|
12
|
+
# $ mkdir config
|
13
|
+
# $ mv Guardfile config/
|
14
|
+
# $ ln -s config/Guardfile .
|
15
|
+
#
|
16
|
+
# and, you'll have to watch "config/Guardfile" instead of "Guardfile"
|
17
|
+
|
18
|
+
# Note: The cmd option is now required due to the increasing number of ways
|
19
|
+
# rspec may be run, below are examples of the most common uses.
|
20
|
+
# * bundler: 'bundle exec rspec'
|
21
|
+
# * bundler binstubs: 'bin/rspec'
|
22
|
+
# * spring: 'bin/rspec' (This will use spring if running and you have
|
23
|
+
# installed the spring binstubs per the docs)
|
24
|
+
# * zeus: 'zeus rspec' (requires the server to be started separately)
|
25
|
+
# * 'just' rspec: 'rspec'
|
26
|
+
=begin
|
27
|
+
guard :rspec, cmd: "bundle exec rspec" do
|
28
|
+
require "guard/rspec/dsl"
|
29
|
+
dsl = Guard::RSpec::Dsl.new(self)
|
30
|
+
|
31
|
+
# Feel free to open issues for suggestions and improvements
|
32
|
+
|
33
|
+
# RSpec files
|
34
|
+
rspec = dsl.rspec
|
35
|
+
watch(rspec.spec_helper) { rspec.spec_dir }
|
36
|
+
watch(rspec.spec_support) { rspec.spec_dir }
|
37
|
+
watch(rspec.spec_files)
|
38
|
+
|
39
|
+
# Ruby files
|
40
|
+
ruby = dsl.ruby
|
41
|
+
dsl.watch_spec_files_for(ruby.lib_files)
|
42
|
+
|
43
|
+
# Rails files
|
44
|
+
rails = dsl.rails(view_extensions: %w(erb haml slim))
|
45
|
+
dsl.watch_spec_files_for(rails.app_files)
|
46
|
+
dsl.watch_spec_files_for(rails.views)
|
47
|
+
|
48
|
+
watch(rails.controllers) do |m|
|
49
|
+
[
|
50
|
+
rspec.spec.call("routing/#{m[1]}_routing"),
|
51
|
+
rspec.spec.call("controllers/#{m[1]}_controller"),
|
52
|
+
rspec.spec.call("acceptance/#{m[1]}")
|
53
|
+
]
|
54
|
+
end
|
55
|
+
|
56
|
+
# Rails config changes
|
57
|
+
watch(rails.spec_helper) { rspec.spec_dir }
|
58
|
+
watch(rails.routes) { "#{rspec.spec_dir}/routing" }
|
59
|
+
watch(rails.app_controller) { "#{rspec.spec_dir}/controllers" }
|
60
|
+
|
61
|
+
# Capybara features specs
|
62
|
+
watch(rails.view_dirs) { |m| rspec.spec.call("features/#{m[1]}") }
|
63
|
+
watch(rails.layouts) { |m| rspec.spec.call("features/#{m[1]}") }
|
64
|
+
|
65
|
+
# Turnip features and steps
|
66
|
+
watch(%r{^spec/acceptance/(.+)\.feature$})
|
67
|
+
watch(%r{^spec/acceptance/steps/(.+)_steps\.rb$}) do |m|
|
68
|
+
Dir[File.join("**/#{m[1]}.feature")][0] || "spec/acceptance"
|
69
|
+
end
|
70
|
+
end
|
71
|
+
=end
|
72
|
+
|
73
|
+
# Active Guard configuration for this gem: runs specs through
# `bundle exec rspec`, using the watch patterns supplied by Guard::RSpec::Dsl.
guard :rspec, cmd: 'bundle exec rspec' do
  require "guard/rspec/dsl"
  rspec_dsl = Guard::RSpec::Dsl.new(self)

  # RSpec files: the spec helper and the support files both map to the
  # whole spec directory; spec files are watched as-is.
  rspec = rspec_dsl.rspec
  [rspec.spec_helper, rspec.spec_support].each do |pattern|
    watch(pattern) { rspec.spec_dir }
  end
  watch(rspec.spec_files)

  # Ruby files: library sources are mapped to specs by the DSL helper.
  rspec_dsl.watch_spec_files_for(rspec_dsl.ruby.lib_files)

  # Hand-written equivalents, kept for reference:
  #watch(%r{^spec/.+_spec\.rb$})
  #watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
  #watch('spec/spec_helper.rb') { "spec" }
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2017 joshweir
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
# Scraypa
|
2
|
+
|
3
|
+
A Ruby gem to scrape web content with configuration options including:
|
4
|
+
|
5
|
+
1. [JavaScript support](#javascript-support)
|
6
|
+
2. [The Onion Router (Tor)](#tor)
|
7
|
+
3. [Disguise](#disguise) (TODO)
|
8
|
+
|
9
|
+
Scraypa is essentially a wrapper for the light-weight
|
10
|
+
[Rest Client](https://github.com/rest-client/rest-client) (if you don't require javascript support)
|
11
|
+
or [Capybara](https://github.com/teamcapybara/capybara) (for Javascript support).
|
12
|
+
|
13
|
+
## Why?
|
14
|
+
|
15
|
+
A web scraper that can be configured to support javascript and/or Tor. If javascript is not required,
|
16
|
+
it will use the lighter Rest Client. Scraypa is an attempt to remove the complexities associated with web agent setup.
|
17
|
+
|
18
|
+
## Installation
|
19
|
+
|
20
|
+
### Install Tor (optional)
|
21
|
+
|
22
|
+
If you want to use Tor, install tor:
|
23
|
+
|
24
|
+
`sudo apt-get install tor`
|
25
|
+
|
26
|
+
### Install Headless Chrome (optional)
|
27
|
+
|
28
|
+
If you want to use the `:headless_chromium` driver with capybara, install
|
29
|
+
headless chrome by following instructions here:
|
30
|
+
|
31
|
+
http://blog.faraday.io/headless-chromium/
|
32
|
+
|
33
|
+
For ubuntu I did this:
|
34
|
+
|
35
|
+
1. Install chromium:
|
36
|
+
|
37
|
+
git clone https://github.com/scheib/chromium-latest-linux.git
|
38
|
+
cd chromium-latest-linux
|
39
|
+
./update-and-run.sh
|
40
|
+
|
41
|
+
2. Install chromedriver by [following the build instructions](https://chromium.googlesource.com/chromium/src/+/master/docs/linux_build_instructions.md).
|
42
|
+
|
43
|
+
### Install Scraypa
|
44
|
+
|
45
|
+
Add this line to your application's Gemfile:
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
gem 'scraypa'
|
49
|
+
```
|
50
|
+
|
51
|
+
And then execute:
|
52
|
+
|
53
|
+
$ bundle install
|
54
|
+
|
55
|
+
Or install it yourself as:
|
56
|
+
|
57
|
+
$ gem install scraypa
|
58
|
+
|
59
|
+
## Usage
|
60
|
+
|
61
|
+
response = Scraypa.visit(method: :get,
|
62
|
+
url: "http://example.com")
|
63
|
+
|
64
|
+
#the response contains the RestClient response object
|
65
|
+
response.code
|
66
|
+
#-> 200
|
67
|
+
response.to_str
|
68
|
+
#-> http://example.com content
|
69
|
+
|
70
|
+
By default Scraypa uses the rest-client gem which does
|
71
|
+
not support Javascript. The `#visit` method wraps the
|
72
|
+
[`RestClient#execute` method](https://github.com/rest-client/rest-client#passing-advanced-options)
|
73
|
+
so you can pass in whatever `RestClient#execute` will accept,
|
74
|
+
for example:
|
75
|
+
|
76
|
+
Scraypa.visit(method: :get,
|
77
|
+
url: 'http://example.com/resource',
|
78
|
+
timeout: 10,
|
79
|
+
headers: {params: {foo: 'bar'}})
|
80
|
+
|
81
|
+
➔ GET http://example.com/resource?foo=bar
|
82
|
+
|
83
|
+
### Javascript Support
|
84
|
+
|
85
|
+
Capybara is used for Javascript support:
|
86
|
+
|
87
|
+
#configure Scraypa to #use_capybara
|
88
|
+
#and choose your capybara driver, here is poltergeist:
|
89
|
+
Scraypa.configure do |config|
|
90
|
+
config.use_capybara = true
|
91
|
+
config.driver = :poltergeist
|
92
|
+
config.driver_options = {
|
93
|
+
:phantomjs => Phantomjs.path,
|
94
|
+
:js_errors => false,
|
95
|
+
:phantomjs_options => ["--web-security=true"]
|
96
|
+
}
|
97
|
+
|
98
|
+
#or you could instead use headless_chrome:
|
99
|
+
#config.driver = :headless_chromium
|
100
|
+
#config.driver_options = {
|
101
|
+
# browser: :chrome,
|
102
|
+
# desired_capabilities: Selenium::WebDriver::Remote::Capabilities.chrome(
|
103
|
+
# "chromeOptions" => {
|
104
|
+
# "binary" => "/home/resrev/chromium/src/out/Default/chrome",
|
105
|
+
# "args" => %w{headless no-sandbox disable-gpu}
|
106
|
+
# }
|
107
|
+
# )
|
108
|
+
#}
|
109
|
+
end
|
110
|
+
|
111
|
+
#when using capybara, just the url parameter is required:
|
112
|
+
response = Scraypa.visit(url: "http://example.com")
|
113
|
+
|
114
|
+
#the response contains the capybara page object
|
115
|
+
response.status_code
|
116
|
+
#-> 200
|
117
|
+
response.text
|
118
|
+
#-> http://example.com content
|
119
|
+
|
120
|
+
#execute some javascript:
|
121
|
+
response.execute_script(
|
122
|
+
"document.getElementsByTagName('body')[0].innerHTML = 'changed content';")
|
123
|
+
response.text
|
124
|
+
#-> "changed content"
|
125
|
+
|
126
|
+
### Tor
|
127
|
+
|
128
|
+
TODO
|
129
|
+
|
130
|
+
### Disguise
|
131
|
+
|
132
|
+
TODO
|
133
|
+
|
134
|
+
## Contributing
|
135
|
+
|
136
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/joshweir/scraypa.
|
137
|
+
|
138
|
+
|
139
|
+
## License
|
140
|
+
|
141
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
142
|
+
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "scraypa"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/lib/scraypa.rb
ADDED
@@ -0,0 +1,189 @@
|
|
1
|
+
require "scraypa/version"
|
2
|
+
require "scraypa/configuration"
|
3
|
+
require "scraypa/visit/visit_interface"
|
4
|
+
require "scraypa/visit/visit_rest_client"
|
5
|
+
require "scraypa/visit/visit_capabara_poltergeist"
|
6
|
+
require "scraypa/visit/visit_capabara_headless_chromium"
|
7
|
+
require "scraypa/visit/visit_factory"
|
8
|
+
require "scraypa/user_agent/user_agent_abstract"
|
9
|
+
require "scraypa/user_agent/user_agent_common_aliases_lists"
|
10
|
+
require "scraypa/user_agent/user_agent_iterator"
|
11
|
+
require "scraypa/user_agent/user_agent_random"
|
12
|
+
require "scraypa/user_agent/user_agent_factory"
|
13
|
+
require "scraypa/throttle"
|
14
|
+
require "scraypa/driver_resetter"
|
15
|
+
require 'tormanager'
|
16
|
+
|
17
|
+
# Facade for the Scraypa web scraper.
#
# All state lives on the module's singleton: the current Configuration, the
# agent built by VisitFactory, the optional Tor process/proxy/IP control, the
# optional Throttle, the optional user agent retriever and the DriverResetter.
#
# Every helper reads settings through the lazy `configuration` accessor
# rather than the raw `@configuration` ivar, so `Scraypa.visit` works even
# when neither `configure` nor `reset` has been called beforehand.
module Scraypa
  # Raised by `configure` when Tor is enabled together with the
  # :headless_chromium driver.
  class TorNotSupportedByAgent < StandardError; end
  # The following errors are declared here but not raised in this file;
  # presumably they are raised by the user agent / visit classes.
  class UnrecognisedUserAgentsMethod < StandardError; end
  class CapybaraDriverUnsupported < StandardError; end
  class HeadlessChromiumMissingConfig < StandardError; end

  class << self
    attr_accessor :agent, :tor_process, :tor_ip_control, :tor_proxy,
                  :throttle, :user_agent_retriever, :driver_resetter

    # Replaces the current configuration object. Note that nothing is
    # rebuilt until `configure`, `reset` or the first `visit` runs.
    attr_writer :configuration

    # @return [Configuration] the current configuration, created on demand
    def configuration
      @configuration ||= Configuration.new
    end

    # Discards the current configuration and throttle, then rebuilds
    # everything from a fresh Configuration.
    #
    # @return [Configuration] the new configuration
    def reset
      @configuration = Configuration.new
      reset_throttle
      setup_scraypa
      @configuration
    end

    # Yields the configuration for modification, validates it and then
    # rebuilds the user agent retriever, Tor, driver resetter, agent and
    # throttle from it.
    #
    # @yieldparam config [Configuration]
    # @return [Object] whatever the block returned
    # @raise [TorNotSupportedByAgent] when Tor is combined with
    #   the :headless_chromium driver (setup is skipped in that case)
    def configure
      yield(configuration).tap do
        validate_configuration
        setup_scraypa
      end
    end

    # Performs a request through the configured agent, honouring the
    # throttle when one is configured. Sets everything up first if no
    # agent has been built yet.
    #
    # @param params [Hash] passed unchanged to the agent's #execute
    # @return [Object] the agent's response
    def visit(params = {})
      setup_scraypa unless @agent
      visit_with_throttle(params)
    end

    # Asks the Tor IP control for a new IP; does nothing unless Tor is
    # enabled in the configuration.
    def change_tor_ip_address
      @tor_ip_control.get_new_ip if using_tor?
    end

    # @return [Object, nil] the retriever's current user agent, or nil when
    #   no user agent retriever is configured
    def user_agent
      @user_agent_retriever.current_user_agent if @user_agent_retriever
    end

    private

    def validate_configuration
      headless_chromium_with_tor_is_invalid
    end

    def headless_chromium_with_tor_is_invalid
      return unless using_tor? && configuration.driver == :headless_chromium

      raise TorNotSupportedByAgent,
            "Capybara :headless_chromium does not support Tor"
    end

    # Rebuilds every collaborator. The order matters: the agent is built
    # from the Tor proxy, driver resetter and user agent retriever.
    def setup_scraypa
      setup_user_agent
      setup_tor
      setup_driver_resetter
      setup_agent
      setup_throttle
    end

    def setup_user_agent
      user_agent_config = configuration.user_agent
      @user_agent_retriever =
        if user_agent_config
          UserAgentFactory.build(
            merge_user_agent_list_limit_for_chrome(user_agent_config)
          )
        end
    end

    # For the :headless_chromium driver, defaults :list_limit to 30 when
    # the caller did not set one. Returns a new hash; `config` is not mutated.
    def merge_user_agent_list_limit_for_chrome(config)
      if configuration.driver == :headless_chromium && !config[:list_limit]
        config.merge(list_limit: 30)
      else
        config
      end
    end

    # Starts Tor when it is wanted but not running under this process,
    # and tears it down when it is running but no longer wanted.
    def setup_tor
      ensure_tor_options_are_configured
      tor_running = tor_running_in_current_process?
      if using_tor?
        reset_tor unless tor_running
      elsif tor_running
        destruct_tor
      end
    end

    # Fills in default ports (tor 9050, control 50500) when Tor is enabled;
    # clears the Tor options otherwise.
    def ensure_tor_options_are_configured
      if using_tor?
        configuration.tor_options ||= {}
        configuration.tor_options[:tor_port] ||= 9050
        configuration.tor_options[:control_port] ||= 50500
      else
        configuration.tor_options = nil
      end
    end

    def using_tor?
      configuration.tor
    end

    # Queries TorManager for a Tor process whose parent is this process,
    # narrowed to the configured tor port when one is set.
    def tor_running_in_current_process?
      tor_options = configuration.tor_options
      tor_port = tor_options && tor_options[:tor_port]

      query = {}
      query[:port] = tor_port if tor_port
      query[:parent_pid] = Process.pid
      TorManager::TorProcess.tor_running_on?(**query)
    end

    def reset_tor
      destruct_tor
      initialize_tor(configuration.tor_options) if using_tor?
    end

    def initialize_tor(params = {})
      @tor_process = TorManager::TorProcess.new(params || {})
      @tor_proxy = TorManager::Proxy.new(tor_process: @tor_process)
      @tor_ip_control = TorManager::IpAddressControl.new(
        tor_process: @tor_process, tor_proxy: @tor_proxy
      )
      @tor_process.start
    end

    def destruct_tor
      @tor_process.stop if @tor_process
      TorManager::TorProcess.stop_obsolete_processes
      @tor_ip_control = nil
      @tor_proxy = nil
      @tor_process = nil
    end

    def setup_driver_resetter
      @driver_resetter =
        DriverResetter.new(configuration.reset_driver_every_n_requests)
    end

    def setup_agent
      @agent = Scraypa::VisitFactory.build(
        config: configuration,
        tor_proxy: @tor_proxy,
        driver_resetter: @driver_resetter,
        user_agent_retriever: @user_agent_retriever
      )
    end

    # Builds a new Throttle only when the configured value is usable and
    # differs from the current throttle (see throttle_config_has_changed?).
    def setup_throttle
      return unless throttle_config_has_changed?

      @throttle = Throttle.new(seconds: configuration.throttle_seconds)
    end

    # Truthy when throttle_seconds is a Hash or a positive number AND there
    # is either no throttle yet or its seconds differ from the configuration.
    def throttle_config_has_changed?
      seconds = configuration.throttle_seconds
      seconds &&
        (seconds.is_a?(Hash) || seconds.to_f > 0) &&
        (!@throttle || @throttle.seconds != seconds)
    end

    # Throttles (if configured), executes the request, then records the
    # request time on the throttle.
    def visit_with_throttle(params)
      @throttle.throttle if @throttle
      response = @agent.execute(params)
      @throttle.last_request_time = Time.now if @throttle
      response
    end

    def reset_throttle
      @throttle.last_request_time = nil if @throttle
      @throttle = nil
    end
  end
end
|