scrapey 0.0.8 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/scrapey.rb +2 -2
- data/lib/scrapey/cache.rb +6 -0
- data/lib/scrapey/cache/disk.rb +7 -0
- data/lib/scrapey/constants.rb +1 -1
- data/lib/scrapey/multi.rb +17 -10
- data/lib/scrapey/scrapey.rb +4 -2
- data/lib/scrapey/tee.rb +1 -1
- data/lib/scrapey/tor.rb +15 -0
- data/template/src/template.rb +3 -3
- data/template/template.iss +13 -1
- metadata +3 -2
data/lib/scrapey.rb
CHANGED
@@ -17,7 +17,7 @@ include Scrapey
 @agent.user_agent = "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
 @agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
 # default output file
-@output = 'output.csv'
+@output = File.join BASEDIR, 'output.csv'

 # read config file
 config_file = "#{BASEDIR}/config/config.yml"
@@ -25,4 +25,4 @@ config_file = "#{BASEDIR}/config/config.yml"

 init_db if @config['database']

-$stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "
+$stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
data/lib/scrapey/cache.rb
CHANGED
data/lib/scrapey/cache/disk.rb
CHANGED
@@ -1,3 +1,5 @@
+require 'fileutils'
+
 module Scrapey

 def cache_filename url
@@ -22,4 +24,9 @@ module Scrapey
 def save_cache url, doc, options = {}
 File.open(cache_filename(url), "wb") {|f| f << Marshal.dump(doc) }
 end
+
+def delete_cache url
+FileUtils.rm cache_filename(url)
+end
+
 end
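The new delete_cache helper makes it possible to evict a single stale entry and force a re-fetch. A minimal usage sketch (not part of the diff): the URL is purely illustrative, and assigning @use_cache directly is an assumption, since cache.rb also changed in this release but its hunk is not expanded here.

require 'scrapey'

@use_cache = true                 # assumption: flag checked by the caching code
url = 'http://example.com/listing'
delete_cache url                  # removes the Marshal dump written by save_cache
                                  # (FileUtils.rm raises if no cache file exists)
page = get url                    # fetched fresh and cached again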
data/lib/scrapey/constants.rb
CHANGED
data/lib/scrapey/multi.rb
CHANGED
@@ -5,18 +5,25 @@ class HTTPClient; def warn str; end; end

 module Scrapey
 def multi_get_or_post method, all_urls, options = {}
-all_urls.reject!{|url| is_cached? url} if @use_cache
-return unless all_urls.size > 0

-
-
-
-
-
-
+# some sensible defaults
+threads = options[:threads] || 20
+on_success = options[:on_success] || :on_success
+on_error = options[:on_error] || :on_error
+user_agent = options[:user_agent] || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
+proxy = options[:proxy] || nil
+timeout = options[:timeout] || 1000
+follow_redirect = options[:follow_redirect] || true

 @lock ||= Mutex.new
-
+
+@http_clients ||= threads.times.map do
+c = HTTPClient.new proxy, user_agent
+c.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
+c.receive_timeout = timeout
+yield c if block_given?
+c
+end

 debug 'starting multi'

@@ -24,7 +31,7 @@ module Scrapey
 urls.each_with_index.map do |url, i|
 Thread.new do
 begin
-response = @http_clients[i].send method, url, options[:query], options[:headers]
+response = @http_clients[i].send method, url, options[:query], options[:headers], :follow_redirect => follow_redirect
 rescue Exception => e
 error = e
 end
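The hunk header's HTTPClient monkey-patch suggests multi.rb is now built on httpclient rather than em-http-request (the old template comment removed below). A usage sketch, not part of the diff: it assumes scrapey/multi must still be required separately, and the on_success/on_error callback signatures are assumptions, since the dispatch to them is outside these hunks. Note also that options[:follow_redirect] || true and options[:proxy] || nil can never yield a falsy override, so passing :follow_redirect => false still follows redirects.

require 'scrapey'
require 'scrapey/multi'   # assumption: not loaded by scrapey.rb automatically

# Callback names match the defaults above; their exact signatures are assumed.
def on_success url, response
  puts "#{url} -> #{response.status}"
end

def on_error url, error
  puts "#{url} failed: #{error.message}"
end

urls = ['http://example.com/a', 'http://example.com/b']   # illustrative only
multi_get_or_post 'get', urls,
  :threads => 5,
  :timeout => 60,
  :follow_redirect => true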
data/lib/scrapey/scrapey.rb
CHANGED
@@ -1,7 +1,7 @@
 module Scrapey

 def get_or_post method, url, options={}, *args
-agent =
+agent = ['goto', 'visit'].include?(method) ? @browser : @agent
 _retries = options.delete :retries
 _sleep = options.delete :sleep
 begin
@@ -15,7 +15,8 @@ module Scrapey
 return doc if doc

 page = agent.send *new_args
-
+str = page.respond_to?('root') ? page.root.to_s : page.body
+save_cache(url, str) if @use_cache

 #exit if Object.const_defined? :Ocra
 page
@@ -36,6 +37,7 @@ module Scrapey
 def post *args; get_or_post 'post', *args; end
 def head *args; get_or_post 'head', *args; end
 def goto *args; get_or_post 'goto', *args; end
+def visit *args; get_or_post 'visit', *args; end

 def set_proxy *args
 @agent.set_proxy *args
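With these hunks, get_or_post caches the fetched body (page.root for parsed pages, page.body otherwise) and routes 'goto'/'visit' to @browser instead of @agent. A brief sketch, not part of the diff; the :retries/:sleep handling and the @browser setup are outside these hunks, so they are assumptions here:

require 'scrapey'

# :retries and :sleep are stripped from the options before the request is made.
page = get 'http://example.com/', :retries => 3, :sleep => 5   # illustrative URL

# 'goto' and 'visit' dispatch to @browser rather than @agent, so a browser
# object (e.g. Watir) is assumed to have been assigned elsewhere:
# @browser = Watir::Browser.new
# visit 'http://example.com/account'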
data/lib/scrapey/tee.rb
CHANGED
data/lib/scrapey/tor.rb
ADDED
@@ -0,0 +1,15 @@
+require 'net/telnet'
+
+module Scrapey
+def use_tor
+set_proxy('localhost', 8118)
+end
+
+def change_identity
+debug "changing identity..."
+localhost = Net::Telnet::new("Host" => "localhost", "Port" => "9051", "Timeout" => 10, "Prompt" => /250 OK\n/)
+localhost.cmd('AUTHENTICATE ""') {|c| raise "Cannot authenticate to Tor" unless "250 OK\n" == c}
+localhost.cmd('signal NEWNYM') {|c| raise "Cannot switch Tor to new route" unless "250 OK\n" == c}
+localhost.close
+end
+end
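The new tor.rb pairs a local HTTP proxy on port 8118 (Privoxy-style, forwarding to Tor) with Tor's control port on 9051. A minimal sketch of how it might be used, not part of the diff; the require path and the Tor/Privoxy setup are assumptions:

require 'scrapey'
require 'scrapey/tor'   # assumption: not loaded by scrapey.rb automatically

# Assumes Tor is running with ControlPort 9051 open to an empty-password
# AUTHENTICATE, and an HTTP proxy on localhost:8118 routing through Tor.
use_tor                                       # proxies @agent via localhost:8118
page = get 'http://check.torproject.org/'    # illustrative URL

change_identity                               # sends "signal NEWNYM" to the control port
page = get 'http://check.torproject.org/'    # later requests may exit from a new IP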
data/template/src/template.rb
CHANGED
@@ -1,9 +1,6 @@
 require 'scrapey'
 require 'pry'

-# some skeleton code that I like to start with
-# require 'scrapey/multi' #=> requires em-http-request
-
 # sample customizations...
 # @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
 # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
@@ -28,3 +25,6 @@ fields 'name', 'address', 'zip'

 page = get @url
 scrape page.at('div')
+
+#@csv.close
+#%x{call #{@output}}
data/template/template.iss
CHANGED
@@ -3,10 +3,22 @@ AppName=Template Scraper
 AppVersion=1.0
 DefaultDirName={localappdata}\Template Scraper
 DefaultGroupName=Template Scraper
+// ChangesAssociations=yes

 [Files]
 Source: "config\*"; DestDir: "{app}\config";
 Source: "src\*"; DestDir: "{app}\src";
+Source: "icon.ico"; DestDir: "{app}";

 [Icons]
-Name: "{group}\Template Scraper"; Filename: "{app}\template.exe"
+Name: "{group}\Template Scraper"; Filename: "{app}\template.exe"
+// Name: "{group}\Edit Config"; Filename: "{win}\notepad.exe"; Parameters: "{app}\config\config.yml"; Comment: "Config"; WorkingDir: "{app}\config"
+// Name: "{group}\View Output File"; Filename: "{app}\output.csv"; WorkingDir: "{app}"
+Name: "{group}\Uninstall Template Scraper"; Filename: "{uninstallexe}"
+
+[Registry]
+// Root: HKCR; Subkey: SystemFileAssociations\.csv\shell\Template Scraper; ValueType: string; Flags: uninsdeletekey deletekey; ValueName: Icon; ValueData: """{app}\icon.ico"""
+// Root: HKCR; Subkey: SystemFileAssociations\.csv\shell\Template Scraper\command; ValueType: string; ValueData: """{app}\template.exe"" ""%1"""; Flags: uninsdeletekey deletekey
+
+[Run]
+//Filename: schtasks.exe; Parameters:" /CREATE /TN ""Scheduled Template"" /TR ""{app}\template.exe"" /SC DAILY /ST 02:30"
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrapey
 version: !ruby/object:Gem::Version
-version: 0.0.8
+version: 0.0.11
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-
+date: 2012-12-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
 name: mechanize
@@ -88,6 +88,7 @@ files:
 - lib/scrapey/scrapey.rb
 - lib/scrapey/tee.rb
 - lib/scrapey/template.rb
+- lib/scrapey/tor.rb
 - lib/scrapey.rb
 - scrapey.gemspec
 - template/config/config.yml