scrapey 0.0.8 → 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/scrapey.rb +2 -2
- data/lib/scrapey/cache.rb +6 -0
- data/lib/scrapey/cache/disk.rb +7 -0
- data/lib/scrapey/constants.rb +1 -1
- data/lib/scrapey/multi.rb +17 -10
- data/lib/scrapey/scrapey.rb +4 -2
- data/lib/scrapey/tee.rb +1 -1
- data/lib/scrapey/tor.rb +15 -0
- data/template/src/template.rb +3 -3
- data/template/template.iss +13 -1
- metadata +3 -2
data/lib/scrapey.rb
CHANGED
@@ -17,7 +17,7 @@ include Scrapey
|
|
17
17
|
@agent.user_agent = "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
|
18
18
|
@agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
19
19
|
# default output file
|
20
|
-
@output = 'output.csv'
|
20
|
+
@output = File.join BASEDIR, 'output.csv'
|
21
21
|
|
22
22
|
# read config file
|
23
23
|
config_file = "#{BASEDIR}/config/config.yml"
|
@@ -25,4 +25,4 @@ config_file = "#{BASEDIR}/config/config.yml"
|
|
25
25
|
|
26
26
|
init_db if @config['database']
|
27
27
|
|
28
|
-
$stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "
|
28
|
+
$stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
|
data/lib/scrapey/cache.rb
CHANGED
data/lib/scrapey/cache/disk.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
1
3
|
module Scrapey
|
2
4
|
|
3
5
|
def cache_filename url
|
@@ -22,4 +24,9 @@ module Scrapey
|
|
22
24
|
def save_cache url, doc, options = {}
|
23
25
|
File.open(cache_filename(url), "wb") {|f| f << Marshal.dump(doc) }
|
24
26
|
end
|
27
|
+
|
28
|
+
def delete_cache url
|
29
|
+
FileUtils.rm cache_filename(url)
|
30
|
+
end
|
31
|
+
|
25
32
|
end
|
data/lib/scrapey/constants.rb
CHANGED
data/lib/scrapey/multi.rb
CHANGED
@@ -5,18 +5,25 @@ class HTTPClient; def warn str; end; end
|
|
5
5
|
|
6
6
|
module Scrapey
|
7
7
|
def multi_get_or_post method, all_urls, options = {}
|
8
|
-
all_urls.reject!{|url| is_cached? url} if @use_cache
|
9
|
-
return unless all_urls.size > 0
|
10
8
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
9
|
+
# some sensible defaults
|
10
|
+
threads = options[:threads] || 20
|
11
|
+
on_success = options[:on_success] || :on_success
|
12
|
+
on_error = options[:on_error] || :on_error
|
13
|
+
user_agent = options[:user_agent] || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
|
14
|
+
proxy = options[:proxy] || nil
|
15
|
+
timeout = options[:timeout] || 1000
|
16
|
+
follow_redirect = options[:follow_redirect] || true
|
17
17
|
|
18
18
|
@lock ||= Mutex.new
|
19
|
-
|
19
|
+
|
20
|
+
@http_clients ||= threads.times.map do
|
21
|
+
c = HTTPClient.new proxy, user_agent
|
22
|
+
c.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
23
|
+
c.receive_timeout = timeout
|
24
|
+
yield c if block_given?
|
25
|
+
c
|
26
|
+
end
|
20
27
|
|
21
28
|
debug 'starting multi'
|
22
29
|
|
@@ -24,7 +31,7 @@ module Scrapey
|
|
24
31
|
urls.each_with_index.map do |url, i|
|
25
32
|
Thread.new do
|
26
33
|
begin
|
27
|
-
response = @http_clients[i].send method, url, options[:query], options[:headers]
|
34
|
+
response = @http_clients[i].send method, url, options[:query], options[:headers], :follow_redirect => follow_redirect
|
28
35
|
rescue Exception => e
|
29
36
|
error = e
|
30
37
|
end
|
data/lib/scrapey/scrapey.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module Scrapey
|
2
2
|
|
3
3
|
def get_or_post method, url, options={}, *args
|
4
|
-
agent =
|
4
|
+
agent = ['goto', 'visit'].include?(method) ? @browser : @agent
|
5
5
|
_retries = options.delete :retries
|
6
6
|
_sleep = options.delete :sleep
|
7
7
|
begin
|
@@ -15,7 +15,8 @@ module Scrapey
|
|
15
15
|
return doc if doc
|
16
16
|
|
17
17
|
page = agent.send *new_args
|
18
|
-
|
18
|
+
str = page.respond_to?('root') ? page.root.to_s : page.body
|
19
|
+
save_cache(url, str) if @use_cache
|
19
20
|
|
20
21
|
#exit if Object.const_defined? :Ocra
|
21
22
|
page
|
@@ -36,6 +37,7 @@ module Scrapey
|
|
36
37
|
def post *args; get_or_post 'post', *args; end
|
37
38
|
def head *args; get_or_post 'head', *args; end
|
38
39
|
def goto *args; get_or_post 'goto', *args; end
|
40
|
+
def visit *args; get_or_post 'visit', *args; end
|
39
41
|
|
40
42
|
def set_proxy *args
|
41
43
|
@agent.set_proxy *args
|
data/lib/scrapey/tee.rb
CHANGED
data/lib/scrapey/tor.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'net/telnet'
|
2
|
+
|
3
|
+
module Scrapey
|
4
|
+
def use_tor
|
5
|
+
set_proxy('localhost', 8118)
|
6
|
+
end
|
7
|
+
|
8
|
+
def change_identity
|
9
|
+
debug "changing identity..."
|
10
|
+
localhost = Net::Telnet::new("Host" => "localhost", "Port" => "9051", "Timeout" => 10, "Prompt" => /250 OK\n/)
|
11
|
+
localhost.cmd('AUTHENTICATE ""') {|c| raise "Cannot authenticate to Tor" unless "250 OK\n" == c}
|
12
|
+
localhost.cmd('signal NEWNYM') {|c| raise "Cannot switch Tor to new route" unless "250 OK\n" == c}
|
13
|
+
localhost.close
|
14
|
+
end
|
15
|
+
end
|
data/template/src/template.rb
CHANGED
@@ -1,9 +1,6 @@
|
|
1
1
|
require 'scrapey'
|
2
2
|
require 'pry'
|
3
3
|
|
4
|
-
# some skeleton code that I like to start with
|
5
|
-
# require 'scrapey/multi' #=> requires em-http-request
|
6
|
-
|
7
4
|
# sample customizations...
|
8
5
|
# @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
|
9
6
|
# @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
|
@@ -28,3 +25,6 @@ fields 'name', 'address', 'zip'
|
|
28
25
|
|
29
26
|
page = get @url
|
30
27
|
scrape page.at('div')
|
28
|
+
|
29
|
+
#@csv.close
|
30
|
+
#%x{call #{@output}}
|
data/template/template.iss
CHANGED
@@ -3,10 +3,22 @@ AppName=Template Scraper
|
|
3
3
|
AppVersion=1.0
|
4
4
|
DefaultDirName={localappdata}\Template Scraper
|
5
5
|
DefaultGroupName=Template Scraper
|
6
|
+
// ChangesAssociations=yes
|
6
7
|
|
7
8
|
[Files]
|
8
9
|
Source: "config\*"; DestDir: "{app}\config";
|
9
10
|
Source: "src\*"; DestDir: "{app}\src";
|
11
|
+
Source: "icon.ico"; DestDir: "{app}";
|
10
12
|
|
11
13
|
[Icons]
|
12
|
-
Name: "{group}\Template Scraper"; Filename: "{app}\template.exe"
|
14
|
+
Name: "{group}\Template Scraper"; Filename: "{app}\template.exe"
|
15
|
+
// Name: "{group}\Edit Config"; Filename: "{win}\notepad.exe"; Parameters: "{app}\config\config.yml"; Comment: "Config"; WorkingDir: "{app}\config"
|
16
|
+
// Name: "{group}\View Output File"; Filename: "{app}\output.csv"; WorkingDir: "{app}"
|
17
|
+
Name: "{group}\Uninstall Template Scraper"; Filename: "{uninstallexe}"
|
18
|
+
|
19
|
+
[Registry]
|
20
|
+
// Root: HKCR; Subkey: SystemFileAssociations\.csv\shell\Template Scraper; ValueType: string; Flags: uninsdeletekey deletekey; ValueName: Icon; ValueData: """{app}\icon.ico"""
|
21
|
+
// Root: HKCR; Subkey: SystemFileAssociations\.csv\shell\Template Scraper\command; ValueType: string; ValueData: """{app}\template.exe"" ""%1"""; Flags: uninsdeletekey deletekey
|
22
|
+
|
23
|
+
[Run]
|
24
|
+
//Filename: schtasks.exe; Parameters:" /CREATE /TN ""Scheduled Template"" /TR ""{app}\template.exe"" /SC DAILY /ST 02:30"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapey
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.8
|
4
|
+
version: 0.0.11
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-12-01 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
@@ -88,6 +88,7 @@ files:
|
|
88
88
|
- lib/scrapey/scrapey.rb
|
89
89
|
- lib/scrapey/tee.rb
|
90
90
|
- lib/scrapey/template.rb
|
91
|
+
- lib/scrapey/tor.rb
|
91
92
|
- lib/scrapey.rb
|
92
93
|
- scrapey.gemspec
|
93
94
|
- template/config/config.yml
|