scrapey 0.0.8 → 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
data/lib/scrapey.rb CHANGED
@@ -17,7 +17,7 @@ include Scrapey
17
17
  @agent.user_agent = "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
18
18
  @agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
19
19
  # default output file
20
- @output = 'output.csv'
20
+ @output = File.join BASEDIR, 'output.csv'
21
21
 
22
22
  # read config file
23
23
  config_file = "#{BASEDIR}/config/config.yml"
@@ -25,4 +25,4 @@ config_file = "#{BASEDIR}/config/config.yml"
25
25
 
26
26
  init_db if @config['database']
27
27
 
28
- $stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "a"))
28
+ $stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
data/lib/scrapey/cache.rb CHANGED
@@ -11,4 +11,10 @@ module Scrapey
11
11
  end
12
12
  end
13
13
 
14
+ def disable_cache
15
+ @use_cache = false
16
+ yield
17
+ @use_cache = true
18
+ end
19
+
14
20
  end
@@ -1,3 +1,5 @@
1
+ require 'fileutils'
2
+
1
3
  module Scrapey
2
4
 
3
5
  def cache_filename url
@@ -22,4 +24,9 @@ module Scrapey
22
24
  def save_cache url, doc, options = {}
23
25
  File.open(cache_filename(url), "wb") {|f| f << Marshal.dump(doc) }
24
26
  end
27
+
28
+ def delete_cache url
29
+ FileUtils.rm cache_filename(url)
30
+ end
31
+
25
32
  end
@@ -1,5 +1,5 @@
1
1
  module Scrapey
2
- VERSION = "0.0.8"
2
+ VERSION = "0.0.11"
3
3
  BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
4
4
  URL = "https://github.com/monkeysuffrage/scrapey"
5
5
  end
data/lib/scrapey/multi.rb CHANGED
@@ -5,18 +5,25 @@ class HTTPClient; def warn str; end; end
5
5
 
6
6
  module Scrapey
7
7
  def multi_get_or_post method, all_urls, options = {}
8
- all_urls.reject!{|url| is_cached? url} if @use_cache
9
- return unless all_urls.size > 0
10
8
 
11
- threads = options[:threads] || 20
12
- on_success = options[:on_success] || :on_success
13
- on_error = options[:on_error] || :on_error
14
- user_agent = options[:user_agent] || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
15
- proxy = options[:proxy] || nil
16
- timeout = options[:timeout] || 1000
9
+ # some sensible defaults
10
+ threads = options[:threads] || 20
11
+ on_success = options[:on_success] || :on_success
12
+ on_error = options[:on_error] || :on_error
13
+ user_agent = options[:user_agent] || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
14
+ proxy = options[:proxy] || nil
15
+ timeout = options[:timeout] || 1000
16
+ follow_redirect = options[:follow_redirect] || true
17
17
 
18
18
  @lock ||= Mutex.new
19
- @http_clients ||= threads.times.map{HTTPClient.new(options[:proxies] ? options[:proxies].rotate!.first : proxy, user_agent).tap{|c| c.ssl_config.verify_mode, c.receive_timeout, c.ssl_config.verify_callback = OpenSSL::SSL::VERIFY_NONE, timeout, proc{true}}}
19
+
20
+ @http_clients ||= threads.times.map do
21
+ c = HTTPClient.new proxy, user_agent
22
+ c.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
23
+ c.receive_timeout = timeout
24
+ yield c if block_given?
25
+ c
26
+ end
20
27
 
21
28
  debug 'starting multi'
22
29
 
@@ -24,7 +31,7 @@ module Scrapey
24
31
  urls.each_with_index.map do |url, i|
25
32
  Thread.new do
26
33
  begin
27
- response = @http_clients[i].send method, url, options[:query], options[:headers]
34
+ response = @http_clients[i].send method, url, options[:query], options[:headers], :follow_redirect => follow_redirect
28
35
  rescue Exception => e
29
36
  error = e
30
37
  end
@@ -1,7 +1,7 @@
1
1
  module Scrapey
2
2
 
3
3
  def get_or_post method, url, options={}, *args
4
- agent = method == 'goto' ? @browser : @agent
4
+ agent = ['goto', 'visit'].include?(method) ? @browser : @agent
5
5
  _retries = options.delete :retries
6
6
  _sleep = options.delete :sleep
7
7
  begin
@@ -15,7 +15,8 @@ module Scrapey
15
15
  return doc if doc
16
16
 
17
17
  page = agent.send *new_args
18
- save_cache(url, page.root.to_s) if @use_cache
18
+ str = page.respond_to?('root') ? page.root.to_s : page.body
19
+ save_cache(url, str) if @use_cache
19
20
 
20
21
  #exit if Object.const_defined? :Ocra
21
22
  page
@@ -36,6 +37,7 @@ module Scrapey
36
37
  def post *args; get_or_post 'post', *args; end
37
38
  def head *args; get_or_post 'head', *args; end
38
39
  def goto *args; get_or_post 'goto', *args; end
40
+ def visit *args; get_or_post 'visit', *args; end
39
41
 
40
42
  def set_proxy *args
41
43
  @agent.set_proxy *args
data/lib/scrapey/tee.rb CHANGED
@@ -5,7 +5,7 @@ module Scrapey
5
5
  end
6
6
 
7
7
  def write *args
8
- @targets.each {|t| t.write(*args)}
8
+ @targets.each {|t| t.write(*args); t.flush}
9
9
  end
10
10
 
11
11
  def flush *args
@@ -0,0 +1,15 @@
1
+ require 'net/telnet'
2
+
3
+ module Scrapey
4
+ def use_tor
5
+ set_proxy('localhost', 8118)
6
+ end
7
+
8
+ def change_identity
9
+ debug "changing identity..."
10
+ localhost = Net::Telnet::new("Host" => "localhost", "Port" => "9051", "Timeout" => 10, "Prompt" => /250 OK\n/)
11
+ localhost.cmd('AUTHENTICATE ""') {|c| raise "Cannot authenticate to Tor" unless "250 OK\n" == c}
12
+ localhost.cmd('signal NEWNYM') {|c| raise "Cannot switch Tor to new route" unless "250 OK\n" == c}
13
+ localhost.close
14
+ end
15
+ end
@@ -1,9 +1,6 @@
1
1
  require 'scrapey'
2
2
  require 'pry'
3
3
 
4
- # some skeleton code that I like to start with
5
- # require 'scrapey/multi' #=> requires em-http-request
6
-
7
4
  # sample customizations...
8
5
  # @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
9
6
  # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
@@ -28,3 +25,6 @@ fields 'name', 'address', 'zip'
28
25
 
29
26
  page = get @url
30
27
  scrape page.at('div')
28
+
29
+ #@csv.close
30
+ #%x{call #{@output}}
@@ -3,10 +3,22 @@ AppName=Template Scraper
3
3
  AppVersion=1.0
4
4
  DefaultDirName={localappdata}\Template Scraper
5
5
  DefaultGroupName=Template Scraper
6
+ // ChangesAssociations=yes
6
7
 
7
8
  [Files]
8
9
  Source: "config\*"; DestDir: "{app}\config";
9
10
  Source: "src\*"; DestDir: "{app}\src";
11
+ Source: "icon.ico"; DestDir: "{app}";
10
12
 
11
13
  [Icons]
12
- Name: "{group}\Template Scraper"; Filename: "{app}\template.exe"
14
+ Name: "{group}\Template Scraper"; Filename: "{app}\template.exe"
15
+ // Name: "{group}\Edit Config"; Filename: "{win}\notepad.exe"; Parameters: "{app}\config\config.yml"; Comment: "Config"; WorkingDir: "{app}\config"
16
+ // Name: "{group}\View Output File"; Filename: "{app}\output.csv"; WorkingDir: "{app}"
17
+ Name: "{group}\Uninstall Template Scraper"; Filename: "{uninstallexe}"
18
+
19
+ [Registry]
20
+ // Root: HKCR; Subkey: SystemFileAssociations\.csv\shell\Template Scraper; ValueType: string; Flags: uninsdeletekey deletekey; ValueName: Icon; ValueData: """{app}\icon.ico"""
21
+ // Root: HKCR; Subkey: SystemFileAssociations\.csv\shell\Template Scraper\command; ValueType: string; ValueData: """{app}\template.exe"" ""%1"""; Flags: uninsdeletekey deletekey
22
+
23
+ [Run]
24
+ //Filename: schtasks.exe; Parameters:" /CREATE /TN ""Scheduled Template"" /TR ""{app}\template.exe"" /SC DAILY /ST 02:30"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.11
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-30 00:00:00.000000000 Z
12
+ date: 2012-12-01 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
@@ -88,6 +88,7 @@ files:
88
88
  - lib/scrapey/scrapey.rb
89
89
  - lib/scrapey/tee.rb
90
90
  - lib/scrapey/template.rb
91
+ - lib/scrapey/tor.rb
91
92
  - lib/scrapey.rb
92
93
  - scrapey.gemspec
93
94
  - template/config/config.yml