scrapey 0.0.12 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/scrapey.rb +11 -13
- data/lib/scrapey/constants.rb +1 -1
- data/lib/scrapey/scrapey.rb +10 -0
- data/template/Rakefile +8 -7
- data/template/src/downloader.rb +28 -0
- data/template/src/template.rb +5 -3
- metadata +3 -2
data/lib/scrapey.rb
CHANGED
@@ -3,7 +3,6 @@ require 'csv'
|
|
3
3
|
require 'json'
|
4
4
|
require 'yaml'
|
5
5
|
require 'unf_ext'
|
6
|
-
require 'coderay/tokens_proxy'
|
7
6
|
|
8
7
|
require "scrapey/scrapey"
|
9
8
|
require "scrapey/constants"
|
@@ -12,19 +11,18 @@ require "scrapey/database"
|
|
12
11
|
require "scrapey/multi"
|
13
12
|
require "scrapey/tee"
|
14
13
|
|
15
|
-
|
14
|
+
# don't do this stuff in rails:
|
15
|
+
unless defined? Rails
|
16
|
+
Scrapey::init binding
|
16
17
|
|
17
|
-
#
|
18
|
-
@
|
19
|
-
@agent.user_agent = "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
|
20
|
-
@agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
21
|
-
# default output file
|
22
|
-
@output = File.join BASEDIR, 'output.csv'
|
18
|
+
# default output file
|
19
|
+
@output = File.join BASEDIR, 'output.csv'
|
23
20
|
|
24
|
-
# read config file
|
25
|
-
config_file = "#{BASEDIR}/config/config.yml"
|
26
|
-
@config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
|
21
|
+
# read config file
|
22
|
+
config_file = "#{BASEDIR}/config/config.yml"
|
23
|
+
@config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
|
27
24
|
|
28
|
-
init_db if @config['database']
|
25
|
+
init_db if @config['database']
|
29
26
|
|
30
|
-
$stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
|
27
|
+
$stderr = Scrapey::Tee.new(STDERR, File.open("#{BASEDIR}/errors.log", "w"))
|
28
|
+
end
|
data/lib/scrapey/constants.rb
CHANGED
data/lib/scrapey/scrapey.rb
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
module Scrapey
|
2
2
|
|
3
|
+
def self.init b
|
4
|
+
eval "include Scrapey", b
|
5
|
+
|
6
|
+
# some defaults that I like
|
7
|
+
eval "@agent ||= Mechanize.new{|a| a.history.max_size = 10}", b
|
8
|
+
eval "@agent.user_agent = 'Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}'", b
|
9
|
+
eval "@agent.verify_mode = OpenSSL::SSL::VERIFY_NONE", b
|
10
|
+
end
|
11
|
+
|
12
|
+
|
3
13
|
def get_or_post method, url, options={}, *args
|
4
14
|
agent = ['goto', 'visit'].include?(method) ? @browser : @agent
|
5
15
|
_retries = options.delete :retries
|
data/template/Rakefile
CHANGED
@@ -13,13 +13,14 @@ end
|
|
13
13
|
|
14
14
|
desc "Copy installer to dropbox folder"
|
15
15
|
task 'dropbox' do
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
16
|
+
Dir.glob('Output/*').map{|x| x[/[^\/]*$/]}.each do |file|
|
17
|
+
raise 'no dropbox folder!' unless ENV['DROPBOX']
|
18
|
+
folder = [ENV['DROPBOX'], name].join('/').squeeze('/')
|
19
|
+
FileUtils.mkdir(folder) unless File.exists?(folder)
|
20
|
+
FileUtils.cp "Output/#{file}", folder
|
21
|
+
url = [ENV['DROPBOX_public_url'], name, file].join('/').squeeze('/')
|
22
|
+
puts "uploaded to #{url}"
|
23
|
+
end
|
23
24
|
end
|
24
25
|
|
25
26
|
task 'git' do
|
data/template/src/downloader.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'scrapey'
|
2
|
+
require 'pry'
|
3
|
+
|
4
|
+
use_cache
|
5
|
+
|
6
|
+
# File.open("#{BASEDIR}/config/urls.txt", 'w'){|f| f<< (0..100).map{|i| "http://www.example.com/id=#{i}"} * "\n"}
|
7
|
+
@queue = File.read("#{BASEDIR}/config/urls.txt").split("\n").shuffle
|
8
|
+
|
9
|
+
def download agent
|
10
|
+
while url = @queue.shift
|
11
|
+
if is_cached? url
|
12
|
+
puts 'skipping'
|
13
|
+
next
|
14
|
+
end
|
15
|
+
page = agent.get url
|
16
|
+
save_cache url, page.body
|
17
|
+
puts url
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
threads = []
|
22
|
+
5.times do
|
23
|
+
threads << Thread.new { download Mechanize.new{|a| a.history.max_size, a.verify_mode = 10, OpenSSL::SSL::VERIFY_NONE}}
|
24
|
+
end
|
25
|
+
|
26
|
+
threads.each { |t| t.join }
|
27
|
+
|
28
|
+
binding.pry
|
data/template/src/template.rb
CHANGED
@@ -14,17 +14,19 @@ def scrape div
|
|
14
14
|
save item
|
15
15
|
exit if defined? Ocra
|
16
16
|
rescue StandardError => e
|
17
|
+
puts e.message, e.backtrace
|
17
18
|
binding.pry
|
18
19
|
end
|
19
20
|
|
20
|
-
#use_cache :redis
|
21
21
|
|
22
22
|
fields 'name', 'address', 'zip'
|
23
23
|
|
24
24
|
@url = "http://www.example.com/"
|
25
25
|
|
26
|
-
|
27
|
-
|
26
|
+
with_cache do
|
27
|
+
page = get @url
|
28
|
+
binding.pry
|
29
|
+
end
|
28
30
|
|
29
31
|
#@csv.close
|
30
32
|
#%x{call #{@output}}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapey
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.12
|
4
|
+
version: 0.0.13
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-03-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
@@ -96,6 +96,7 @@ files:
|
|
96
96
|
- template/icon.ico
|
97
97
|
- template/output.csv
|
98
98
|
- template/Rakefile
|
99
|
+
- template/src/downloader.rb
|
99
100
|
- template/src/schema.rb
|
100
101
|
- template/src/template.rb
|
101
102
|
- template/template.iss
|