scrapey 0.0.8 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/scrapey.rb CHANGED
@@ -17,7 +17,7 @@ include Scrapey
17
17
  @agent.user_agent = "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
18
18
  @agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
19
19
  # default output file
20
- @output = 'output.csv'
20
+ @output = File.join BASEDIR, 'output.csv'
21
21
 
22
22
  # read config file
23
23
  config_file = "#{BASEDIR}/config/config.yml"
@@ -25,4 +25,4 @@ config_file = "#{BASEDIR}/config/config.yml"
25
25
 
26
26
  init_db if @config['database']
27
27
 
28
- $stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "a"))
28
+ $stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
data/lib/scrapey/cache.rb CHANGED
@@ -11,4 +11,10 @@ module Scrapey
11
11
  end
12
12
  end
13
13
 
14
+ def disable_cache
15
+ @use_cache = false
16
+ yield
17
+ @use_cache = true
18
+ end
19
+
14
20
  end
@@ -1,3 +1,5 @@
1
+ require 'fileutils'
2
+
1
3
  module Scrapey
2
4
 
3
5
  def cache_filename url
@@ -22,4 +24,9 @@ module Scrapey
22
24
  def save_cache url, doc, options = {}
23
25
  File.open(cache_filename(url), "wb") {|f| f << Marshal.dump(doc) }
24
26
  end
27
+
28
+ def delete_cache url
29
+ FileUtils.rm cache_filename(url)
30
+ end
31
+
25
32
  end
@@ -1,5 +1,5 @@
1
1
  module Scrapey
2
- VERSION = "0.0.8"
2
+ VERSION = "0.0.11"
3
3
  BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
4
4
  URL = "https://github.com/monkeysuffrage/scrapey"
5
5
  end
data/lib/scrapey/multi.rb CHANGED
@@ -5,18 +5,25 @@ class HTTPClient; def warn str; end; end
5
5
 
6
6
  module Scrapey
7
7
  def multi_get_or_post method, all_urls, options = {}
8
- all_urls.reject!{|url| is_cached? url} if @use_cache
9
- return unless all_urls.size > 0
10
8
 
11
- threads = options[:threads] || 20
12
- on_success = options[:on_success] || :on_success
13
- on_error = options[:on_error] || :on_error
14
- user_agent = options[:user_agent] || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
15
- proxy = options[:proxy] || nil
16
- timeout = options[:timeout] || 1000
9
+ # some sensible defaults
10
+ threads = options[:threads] || 20
11
+ on_success = options[:on_success] || :on_success
12
+ on_error = options[:on_error] || :on_error
13
+ user_agent = options[:user_agent] || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
14
+ proxy = options[:proxy] || nil
15
+ timeout = options[:timeout] || 1000
16
+ follow_redirect = options[:follow_redirect] || true
17
17
 
18
18
  @lock ||= Mutex.new
19
- @http_clients ||= threads.times.map{HTTPClient.new(options[:proxies] ? options[:proxies].rotate!.first : proxy, user_agent).tap{|c| c.ssl_config.verify_mode, c.receive_timeout, c.ssl_config.verify_callback = OpenSSL::SSL::VERIFY_NONE, timeout, proc{true}}}
19
+
20
+ @http_clients ||= threads.times.map do
21
+ c = HTTPClient.new proxy, user_agent
22
+ c.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
23
+ c.receive_timeout = timeout
24
+ yield c if block_given?
25
+ c
26
+ end
20
27
 
21
28
  debug 'starting multi'
22
29
 
@@ -24,7 +31,7 @@ module Scrapey
24
31
  urls.each_with_index.map do |url, i|
25
32
  Thread.new do
26
33
  begin
27
- response = @http_clients[i].send method, url, options[:query], options[:headers]
34
+ response = @http_clients[i].send method, url, options[:query], options[:headers], :follow_redirect => follow_redirect
28
35
  rescue Exception => e
29
36
  error = e
30
37
  end
@@ -1,7 +1,7 @@
1
1
  module Scrapey
2
2
 
3
3
  def get_or_post method, url, options={}, *args
4
- agent = method == 'goto' ? @browser : @agent
4
+ agent = ['goto', 'visit'].include?(method) ? @browser : @agent
5
5
  _retries = options.delete :retries
6
6
  _sleep = options.delete :sleep
7
7
  begin
@@ -15,7 +15,8 @@ module Scrapey
15
15
  return doc if doc
16
16
 
17
17
  page = agent.send *new_args
18
- save_cache(url, page.root.to_s) if @use_cache
18
+ str = page.respond_to?('root') ? page.root.to_s : page.body
19
+ save_cache(url, str) if @use_cache
19
20
 
20
21
  #exit if Object.const_defined? :Ocra
21
22
  page
@@ -36,6 +37,7 @@ module Scrapey
36
37
  def post *args; get_or_post 'post', *args; end
37
38
  def head *args; get_or_post 'head', *args; end
38
39
  def goto *args; get_or_post 'goto', *args; end
40
+ def visit *args; get_or_post 'visit', *args; end
39
41
 
40
42
  def set_proxy *args
41
43
  @agent.set_proxy *args
data/lib/scrapey/tee.rb CHANGED
@@ -5,7 +5,7 @@ module Scrapey
5
5
  end
6
6
 
7
7
  def write *args
8
- @targets.each {|t| t.write(*args)}
8
+ @targets.each {|t| t.write(*args); t.flush}
9
9
  end
10
10
 
11
11
  def flush *args
@@ -0,0 +1,15 @@
1
+ require 'net/telnet'
2
+
3
+ module Scrapey
4
+ def use_tor
5
+ set_proxy('localhost', 8118)
6
+ end
7
+
8
+ def change_identity
9
+ debug "changing identity..."
10
+ localhost = Net::Telnet::new("Host" => "localhost", "Port" => "9051", "Timeout" => 10, "Prompt" => /250 OK\n/)
11
+ localhost.cmd('AUTHENTICATE ""') {|c| raise "Cannot authenticate to Tor" unless "250 OK\n" == c}
12
+ localhost.cmd('signal NEWNYM') {|c| raise "Cannot switch Tor to new route" unless "250 OK\n" == c}
13
+ localhost.close
14
+ end
15
+ end
@@ -1,9 +1,6 @@
1
1
  require 'scrapey'
2
2
  require 'pry'
3
3
 
4
- # some skeleton code that I like to start with
5
- # require 'scrapey/multi' #=> requires em-http-request
6
-
7
4
  # sample customizations...
8
5
  # @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
9
6
  # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
@@ -28,3 +25,6 @@ fields 'name', 'address', 'zip'
28
25
 
29
26
  page = get @url
30
27
  scrape page.at('div')
28
+
29
+ #@csv.close
30
+ #%x{call #{@output}}
@@ -3,10 +3,22 @@ AppName=Template Scraper
3
3
  AppVersion=1.0
4
4
  DefaultDirName={localappdata}\Template Scraper
5
5
  DefaultGroupName=Template Scraper
6
+ // ChangesAssociations=yes
6
7
 
7
8
  [Files]
8
9
  Source: "config\*"; DestDir: "{app}\config";
9
10
  Source: "src\*"; DestDir: "{app}\src";
11
+ Source: "icon.ico"; DestDir: "{app}";
10
12
 
11
13
  [Icons]
12
- Name: "{group}\Template Scraper"; Filename: "{app}\template.exe"
14
+ Name: "{group}\Template Scraper"; Filename: "{app}\template.exe"
15
+ // Name: "{group}\Edit Config"; Filename: "{win}\notepad.exe"; Parameters: "{app}\config\config.yml"; Comment: "Config"; WorkingDir: "{app}\config"
16
+ // Name: "{group}\View Output File"; Filename: "{app}\output.csv"; WorkingDir: "{app}"
17
+ Name: "{group}\Uninstall Template Scraper"; Filename: "{uninstallexe}"
18
+
19
+ [Registry]
20
+ // Root: HKCR; Subkey: SystemFileAssociations\.csv\shell\Template Scraper; ValueType: string; Flags: uninsdeletekey deletekey; ValueName: Icon; ValueData: """{app}\icon.ico"""
21
+ // Root: HKCR; Subkey: SystemFileAssociations\.csv\shell\Template Scraper\command; ValueType: string; ValueData: """{app}\template.exe"" ""%1"""; Flags: uninsdeletekey deletekey
22
+
23
+ [Run]
24
+ //Filename: schtasks.exe; Parameters:" /CREATE /TN ""Scheduled Template"" /TR ""{app}\template.exe"" /SC DAILY /ST 02:30"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.11
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-30 00:00:00.000000000 Z
12
+ date: 2012-12-01 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
@@ -88,6 +88,7 @@ files:
88
88
  - lib/scrapey/scrapey.rb
89
89
  - lib/scrapey/tee.rb
90
90
  - lib/scrapey/template.rb
91
+ - lib/scrapey/tor.rb
91
92
  - lib/scrapey.rb
92
93
  - scrapey.gemspec
93
94
  - template/config/config.yml