scrapey 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/scrapey +4 -0
- data/lib/scrapey.rb +10 -77
- data/lib/scrapey/cache.rb +22 -0
- data/lib/scrapey/database.rb +14 -0
- data/lib/scrapey/scrapey.rb +72 -0
- data/lib/scrapey/template.rb +16 -0
- data/lib/scrapey/version.rb +1 -1
- data/scrapey.gemspec +4 -4
- data/template/config/config.yml +23 -0
- data/template/src/template.rb +0 -0
- metadata +35 -6
- data/.gitignore +0 -17
data/bin/scrapey
ADDED
data/lib/scrapey.rb
CHANGED
@@ -1,79 +1,12 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
|
5
|
-
module Scrapey
|
6
|
-
BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
|
7
|
-
|
8
|
-
def get_or_post method, url, options={}, *args
|
9
|
-
agent = method == 'goto' ? @browser : @agent
|
10
|
-
_retries = options.delete :retries
|
11
|
-
_sleep = options.delete :sleep
|
12
|
-
begin
|
13
|
-
new_args = method, url
|
14
|
-
unless options.empty? && args.empty?
|
15
|
-
new_args << options
|
16
|
-
args.each{|arg| new_args << arg}
|
17
|
-
end
|
18
|
-
|
19
|
-
doc = load_cache(url) if @use_cache
|
20
|
-
return doc if doc
|
21
|
-
|
22
|
-
page = agent.send *new_args
|
23
|
-
save_cache(url, page.body) if @use_cache
|
24
|
-
|
25
|
-
#exit if Object.const_defined? :Ocra
|
26
|
-
page
|
27
|
-
rescue Exception => e
|
28
|
-
case
|
29
|
-
when defined? on_error
|
30
|
-
on_error e
|
31
|
-
get_or_post method, url, options, *args
|
32
|
-
when _retries && _retries > 0
|
33
|
-
puts "Error. Retries remaining: #{options[:retries]}"
|
34
|
-
sleep _sleep if _sleep
|
35
|
-
get_or_post method, url, options.merge({:retries => _retries - 1, :sleep => _sleep}), *args
|
36
|
-
else raise e
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
def get *args; get_or_post 'get', *args; end
|
42
|
-
def post *args; get_or_post 'post', *args; end
|
43
|
-
def head *args; get_or_post 'head', *args; end
|
44
|
-
def goto *args; get_or_post 'goto', *args; end
|
1
|
+
require 'mechanize'
|
2
|
+
require 'csv'
|
3
|
+
require 'json'
|
4
|
+
require 'yaml'
|
45
5
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
def fields *args
|
51
|
-
@fields = args
|
52
|
-
end
|
53
|
-
|
54
|
-
def save item
|
55
|
-
unless @csv && !@csv.closed?
|
56
|
-
@csv = CSV.open @output, 'w'
|
57
|
-
@csv << @fields if @fields
|
58
|
-
end
|
59
|
-
case
|
60
|
-
when item.is_a?(Array) then @csv << item
|
61
|
-
when item.is_a?(Hash) then @csv << @fields.map{|f| item[f]}
|
62
|
-
else raise "unsupported type: #{item.class}"
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
def visited? url
|
67
|
-
@visited ||= []
|
68
|
-
return true if @visited.include? url
|
69
|
-
@visited << url
|
70
|
-
false
|
71
|
-
end
|
72
|
-
|
73
|
-
def ts
|
74
|
-
Time.now.to_i.to_s
|
75
|
-
end
|
76
|
-
end
|
6
|
+
require "scrapey/version"
|
7
|
+
require "scrapey/scrapey"
|
8
|
+
require "scrapey/cache"
|
9
|
+
require "scrapey/database"
|
77
10
|
|
78
11
|
include Scrapey
|
79
12
|
|
@@ -81,7 +14,7 @@ include Scrapey
|
|
81
14
|
@agent ||= Mechanize.new{|a| a.history.max_size = 10}
|
82
15
|
@agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
|
83
16
|
|
84
|
-
#
|
17
|
+
# default output file
|
85
18
|
@output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
|
86
19
|
|
87
20
|
# read config file
|
@@ -89,7 +22,7 @@ config_file = "#{BASEDIR}/config/config.yml"
|
|
89
22
|
@config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
|
90
23
|
|
91
24
|
if @config['database']
|
92
|
-
['active_record', @config['database']['adapter'], 'tzinfo', 'active_support/
|
25
|
+
['active_record', @config['database']['adapter'], 'tzinfo', 'active_support/all'].each{|lib| require lib}
|
93
26
|
ActiveRecord::Base.establish_connection(@config['database'])
|
94
27
|
end
|
95
28
|
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Scrapey
|
2
|
+
def use_cache
|
3
|
+
@use_cache = true
|
4
|
+
@config['cache_dir'] ||= "#{BASEDIR}/cache"
|
5
|
+
FileUtils.mkdir_p @config['cache_dir']
|
6
|
+
end
|
7
|
+
|
8
|
+
def cache_filename url
|
9
|
+
@config['cache_dir'] + "/" + Digest::MD5.hexdigest(url) + ".cache"
|
10
|
+
end
|
11
|
+
|
12
|
+
def load_cache url
|
13
|
+
filename = cache_filename url
|
14
|
+
return nil unless File::exists?(filename)
|
15
|
+
puts "Loading #{filename} from cache"
|
16
|
+
Nokogiri::HTML File.read(filename)
|
17
|
+
end
|
18
|
+
|
19
|
+
def save_cache url,doc
|
20
|
+
File.open(cache_filename(url), 'wb') {|f| f.write(doc) }
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Scrapey
|
2
|
+
def tables *args
|
3
|
+
args.each do |arg|
|
4
|
+
#eval("class #{arg} < ActiveRecord::Base; end")
|
5
|
+
Object.const_set(arg, Class.new(ActiveRecord::Base) {})
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
def truncate *args
|
10
|
+
args.each do |arg|
|
11
|
+
ActiveRecord::Base.connection.execute("TRUNCATE TABLE #{Object.const_get(arg).table_name}")
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
module Scrapey
|
2
|
+
BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
|
3
|
+
|
4
|
+
def get_or_post method, url, options={}, *args
|
5
|
+
agent = method == 'goto' ? @browser : @agent
|
6
|
+
_retries = options.delete :retries
|
7
|
+
_sleep = options.delete :sleep
|
8
|
+
begin
|
9
|
+
new_args = method, url
|
10
|
+
unless options.empty? && args.empty?
|
11
|
+
new_args << options
|
12
|
+
args.each{|arg| new_args << arg}
|
13
|
+
end
|
14
|
+
|
15
|
+
doc = load_cache(url) if @use_cache
|
16
|
+
return doc if doc
|
17
|
+
|
18
|
+
page = agent.send *new_args
|
19
|
+
save_cache(url, page.body) if @use_cache
|
20
|
+
|
21
|
+
#exit if Object.const_defined? :Ocra
|
22
|
+
page
|
23
|
+
rescue Exception => e
|
24
|
+
case
|
25
|
+
when defined? on_error
|
26
|
+
on_error e
|
27
|
+
get_or_post method, url, options, *args
|
28
|
+
when _retries && _retries > 0
|
29
|
+
puts "Error. Retries remaining: #{options[:retries]}"
|
30
|
+
sleep _sleep if _sleep
|
31
|
+
get_or_post method, url, options.merge({:retries => _retries - 1, :sleep => _sleep}), *args
|
32
|
+
else raise e
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def get *args; get_or_post 'get', *args; end
|
38
|
+
def post *args; get_or_post 'post', *args; end
|
39
|
+
def head *args; get_or_post 'head', *args; end
|
40
|
+
def goto *args; get_or_post 'goto', *args; end
|
41
|
+
|
42
|
+
def set_proxy *args
|
43
|
+
@agent.set_proxy *args
|
44
|
+
end
|
45
|
+
|
46
|
+
def fields *args
|
47
|
+
@fields = args
|
48
|
+
end
|
49
|
+
|
50
|
+
def save item
|
51
|
+
unless @csv && !@csv.closed?
|
52
|
+
@csv = CSV.open @output, 'w'
|
53
|
+
@csv << @fields if @fields
|
54
|
+
end
|
55
|
+
case
|
56
|
+
when item.is_a?(Array) then @csv << item
|
57
|
+
when item.is_a?(Hash) then @csv << @fields.map{|f| item[f]}
|
58
|
+
else raise "unsupported type: #{item.class}"
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
def visited? url
|
63
|
+
@visited ||= []
|
64
|
+
return true if @visited.include? url
|
65
|
+
@visited << url
|
66
|
+
false
|
67
|
+
end
|
68
|
+
|
69
|
+
def ts
|
70
|
+
Time.now.to_i.to_s
|
71
|
+
end
|
72
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
module Scrapey
|
4
|
+
module Template
|
5
|
+
def self.generate name, *args
|
6
|
+
puts "creating new scrapey project: #{name}..."
|
7
|
+
template = File.expand_path('../../../template', __FILE__)
|
8
|
+
FileUtils.cp_r template, name
|
9
|
+
Dir.chdir name
|
10
|
+
Dir.glob(['*/*.*', '*.*']).grep(/template/).each do |fn|
|
11
|
+
FileUtils.mv fn, fn.gsub('template', name)
|
12
|
+
end
|
13
|
+
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
data/lib/scrapey/version.rb
CHANGED
data/scrapey.gemspec
CHANGED
@@ -1,9 +1,6 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
require File.expand_path('../lib/scrapey/version', __FILE__)
|
3
3
|
|
4
|
-
gem 'mechanize'
|
5
|
-
gem 'json'
|
6
|
-
|
7
4
|
Gem::Specification.new do |gem|
|
8
5
|
gem.authors = ["P Guardiario"]
|
9
6
|
gem.email = ["pguardiario@gmail.com"]
|
@@ -11,11 +8,14 @@ Gem::Specification.new do |gem|
|
|
11
8
|
gem.summary = %q{A simple scraping framework}
|
12
9
|
gem.homepage = ""
|
13
10
|
|
14
|
-
gem.files = `git ls-files`.split($\)
|
11
|
+
# gem.files = `git ls-files`.split($\)
|
12
|
+
gem.files = `find * -type f | grep -v pkg`.split($\)
|
15
13
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
16
14
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
17
15
|
gem.name = "scrapey"
|
18
16
|
gem.require_paths = ["lib"]
|
19
17
|
gem.version = Scrapey::VERSION
|
18
|
+
gem.add_dependency(%q<mechanize>, ["~> 2.5.0"])
|
19
|
+
gem.add_dependency(%q<json>, ["~> 1.7.0"])
|
20
20
|
end
|
21
21
|
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# you can put any configurable options in here
|
2
|
+
# login information for example
|
3
|
+
email: foo@gmail.com
|
4
|
+
password: 12345
|
5
|
+
|
6
|
+
=begin
|
7
|
+
# example database section
|
8
|
+
database:
|
9
|
+
adapter: mysql
|
10
|
+
database: template
|
11
|
+
username: root
|
12
|
+
password: 12345
|
13
|
+
host: localhost
|
14
|
+
encoding: 'utf8'
|
15
|
+
=end
|
16
|
+
|
17
|
+
=begin
|
18
|
+
# example proxies section
|
19
|
+
proxies:
|
20
|
+
- www.host1.com:80
|
21
|
+
- localhost:8080
|
22
|
+
- 127.0.0.1:8888
|
23
|
+
=end
|
File without changes
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapey
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,23 +9,52 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-07-
|
13
|
-
dependencies:
|
12
|
+
date: 2012-07-24 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: mechanize
|
16
|
+
requirement: &16919208 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 2.5.0
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *16919208
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: json
|
27
|
+
requirement: &16115544 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ~>
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.7.0
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *16115544
|
14
36
|
description: A simple scraping framework
|
15
37
|
email:
|
16
38
|
- pguardiario@gmail.com
|
17
|
-
executables:
|
39
|
+
executables:
|
40
|
+
- scrapey
|
18
41
|
extensions: []
|
19
42
|
extra_rdoc_files: []
|
20
43
|
files:
|
21
|
-
- .gitignore
|
22
44
|
- Gemfile
|
23
45
|
- LICENSE
|
24
46
|
- README.md
|
25
47
|
- Rakefile
|
26
|
-
-
|
48
|
+
- bin/scrapey
|
49
|
+
- lib/scrapey/cache.rb
|
50
|
+
- lib/scrapey/database.rb
|
51
|
+
- lib/scrapey/scrapey.rb
|
52
|
+
- lib/scrapey/template.rb
|
27
53
|
- lib/scrapey/version.rb
|
54
|
+
- lib/scrapey.rb
|
28
55
|
- scrapey.gemspec
|
56
|
+
- template/config/config.yml
|
57
|
+
- template/src/template.rb
|
29
58
|
homepage: ''
|
30
59
|
licenses: []
|
31
60
|
post_install_message:
|