scrapey 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/bin/scrapey ADDED
@@ -0,0 +1,4 @@
#!/usr/bin/env ruby
# Command-line entry point for the scrapey gem.
#
# Usage: scrapey <project_name>
#
# Forwards every command-line argument to Scrapey::Template.generate, whose
# first argument is the name of the project to create.
require 'scrapey/template'

# Without a project name the generator cannot do anything useful; print a
# short usage line and exit non-zero instead of surfacing an ArgumentError
# backtrace.
abort 'usage: scrapey <project_name>' if ARGV.empty?

Scrapey::Template.generate(*ARGV)
data/lib/scrapey.rb CHANGED
@@ -1,79 +1,12 @@
1
- require "scrapey/version"
2
- require "scrapey/src/cache"
3
- require "scrapey/src/database"
4
-
5
- module Scrapey
6
- BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
7
-
8
- def get_or_post method, url, options={}, *args
9
- agent = method == 'goto' ? @browser : @agent
10
- _retries = options.delete :retries
11
- _sleep = options.delete :sleep
12
- begin
13
- new_args = method, url
14
- unless options.empty? && args.empty?
15
- new_args << options
16
- args.each{|arg| new_args << arg}
17
- end
18
-
19
- doc = load_cache(url) if @use_cache
20
- return doc if doc
21
-
22
- page = agent.send *new_args
23
- save_cache(url, page.body) if @use_cache
24
-
25
- #exit if Object.const_defined? :Ocra
26
- page
27
- rescue Exception => e
28
- case
29
- when defined? on_error
30
- on_error e
31
- get_or_post method, url, options, *args
32
- when _retries && _retries > 0
33
- puts "Error. Retries remaining: #{options[:retries]}"
34
- sleep _sleep if _sleep
35
- get_or_post method, url, options.merge({:retries => _retries - 1, :sleep => _sleep}), *args
36
- else raise e
37
- end
38
- end
39
- end
40
-
41
- def get *args; get_or_post 'get', *args; end
42
- def post *args; get_or_post 'post', *args; end
43
- def head *args; get_or_post 'head', *args; end
44
- def goto *args; get_or_post 'goto', *args; end
1
+ require 'mechanize'
2
+ require 'csv'
3
+ require 'json'
4
+ require 'yaml'
45
5
 
46
- def set_proxy *args
47
- @agent.set_proxy *args
48
- end
49
-
50
- def fields *args
51
- @fields = args
52
- end
53
-
54
- def save item
55
- unless @csv && !@csv.closed?
56
- @csv = CSV.open @output, 'w'
57
- @csv << @fields if @fields
58
- end
59
- case
60
- when item.is_a?(Array) then @csv << item
61
- when item.is_a?(Hash) then @csv << @fields.map{|f| item[f]}
62
- else raise "unsupported type: #{item.class}"
63
- end
64
- end
65
-
66
- def visited? url
67
- @visited ||= []
68
- return true if @visited.include? url
69
- @visited << url
70
- false
71
- end
72
-
73
- def ts
74
- Time.now.to_i.to_s
75
- end
76
- end
6
+ require "scrapey/version"
7
+ require "scrapey/scrapey"
8
+ require "scrapey/cache"
9
+ require "scrapey/database"
77
10
 
78
11
  include Scrapey
79
12
 
@@ -81,7 +14,7 @@ include Scrapey
81
14
  @agent ||= Mechanize.new{|a| a.history.max_size = 10}
82
15
  @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
83
16
 
84
- # defaulkt output file
17
+ # default output file
85
18
  @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
86
19
 
87
20
  # read config file
@@ -89,7 +22,7 @@ config_file = "#{BASEDIR}/config/config.yml"
89
22
  @config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
90
23
 
91
24
  if @config['database']
92
- ['active_record', @config['database']['adapter'], 'tzinfo', 'active_support/multibyte'].each{|lib| require lib}
25
+ ['active_record', @config['database']['adapter'], 'tzinfo', 'active_support/all'].each{|lib| require lib}
93
26
  ActiveRecord::Base.establish_connection(@config['database'])
94
27
  end
95
28
 
@@ -0,0 +1,22 @@
require 'fileutils'
require 'digest/md5'

# File-based page cache helpers mixed into the scraping script.
module Scrapey
  # Enable caching for subsequent requests.
  #
  # Sets @use_cache, defaults @config['cache_dir'] to "#{BASEDIR}/cache" when
  # no directory is configured, and creates that directory (including
  # parents) if it does not exist yet.
  def use_cache
    @use_cache = true
    @config ||= {} # tolerate use before any configuration has been loaded
    @config['cache_dir'] ||= "#{BASEDIR}/cache"
    FileUtils.mkdir_p @config['cache_dir']
  end

  # Path of the cache file for +url+: the MD5 hex digest of the URL with a
  # ".cache" extension, inside the configured cache directory.
  def cache_filename url
    File.join(@config['cache_dir'], "#{Digest::MD5.hexdigest(url)}.cache")
  end

  # Return the cached document for +url+ parsed with Nokogiri::HTML, or nil
  # when nothing has been cached for it.
  #
  # NOTE(review): Nokogiri is not required in this file; presumably it is
  # loaded through mechanize by the gem's entry file - confirm.
  def load_cache url
    filename = cache_filename url
    # File.exist? replaces the deprecated File.exists?, which no longer
    # exists on Ruby 3.2+.
    return nil unless File.exist?(filename)
    puts "Loading #{filename} from cache"
    Nokogiri::HTML File.read(filename)
  end

  # Write +doc+ (the raw page body) to the cache file for +url+, in binary
  # mode so the bytes are stored untouched.
  def save_cache url, doc
    File.open(cache_filename(url), 'wb') { |f| f.write(doc) }
  end
end
@@ -0,0 +1,14 @@
# Database helpers mixed into the scraping script. Both methods expect
# ActiveRecord to be loaded and connected before they are called.
module Scrapey
  # Define one top-level ActiveRecord::Base subclass per given name, e.g.
  # `tables 'Movie', 'Actor'` makes ::Movie and ::Actor available.
  #
  # A name that already refers to an ActiveRecord model is left alone, so
  # calling this twice does not replace the class (and emits no "already
  # initialized constant" warning).
  #
  # Returns the list of names it was given.
  def tables *args
    args.each do |arg|
      if Object.const_defined?(arg)
        existing = Object.const_get(arg)
        next if existing.is_a?(Class) && existing < ActiveRecord::Base
      end
      Object.const_set(arg, Class.new(ActiveRecord::Base))
    end
  end

  # Issue TRUNCATE TABLE for the table behind each named model.
  #
  # The table name is passed through the connection's quote_table_name so
  # reserved words and unusual characters cannot break the statement.
  #
  # Returns the list of names it was given.
  def truncate *args
    connection = ActiveRecord::Base.connection
    args.each do |arg|
      table = connection.quote_table_name(Object.const_get(arg).table_name)
      connection.execute("TRUNCATE TABLE #{table}")
    end
  end
end
@@ -0,0 +1,72 @@
# Core scraping helpers. The methods rely on instance variables set up by
# the including script: @agent (and @browser for 'goto'), @use_cache,
# @output and @fields.
module Scrapey
  # Directory of the running script, with a trailing "/src" removed.
  BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')

  # Perform a request through the agent, with optional caching and retries.
  #
  # method  - name of the agent method to invoke ('get', 'post', 'head' or
  #           'goto'; 'goto' is sent to @browser, everything else to @agent).
  # url     - the address to request; also the cache key.
  # options - forwarded to the agent, except for two keys consumed here:
  #           :retries (attempts left after a failure) and :sleep (seconds to
  #           wait before a retry). The caller's object is not modified.
  # args    - any further arguments, forwarded to the agent unchanged.
  #
  # Returns the cached document when caching is enabled and the URL has been
  # cached, otherwise whatever the agent returns.
  # Re-raises the original error once no on_error hook is defined and no
  # retries remain.
  def get_or_post method, url, options={}, *args
    agent = method == 'goto' ? @browser : @agent
    options = options.dup # work on a copy so the caller's options survive
    _retries = options.delete :retries
    _sleep = options.delete :sleep
    begin
      new_args = [method, url]
      # Only pass options/args on when there is something to pass.
      unless options.empty? && args.empty?
        new_args << options
        new_args.concat args
      end

      doc = load_cache(url) if @use_cache
      return doc if doc

      page = agent.send(*new_args)
      save_cache(url, page.body) if @use_cache

      page
    # StandardError rather than Exception: Ctrl-C (Interrupt) and exit
    # (SystemExit) must stop the script instead of triggering a retry.
    rescue StandardError => e
      case
      when defined?(on_error)
        # A user-supplied hook takes over error handling; the request is
        # then attempted again with the remaining options.
        on_error e
        get_or_post method, url, options, *args
      when _retries && _retries > 0
        # :retries was removed from options above, so report the local.
        puts "Error. Retries remaining: #{_retries}"
        sleep _sleep if _sleep
        get_or_post method, url, options.merge({:retries => _retries - 1, :sleep => _sleep}), *args
      else raise
      end
    end
  end

  # Convenience wrappers: each forwards its arguments to get_or_post with the
  # corresponding method name.
  def get *args; get_or_post 'get', *args; end
  def post *args; get_or_post 'post', *args; end
  def head *args; get_or_post 'head', *args; end
  def goto *args; get_or_post 'goto', *args; end

  # Forward proxy settings to the agent.
  def set_proxy *args
    @agent.set_proxy(*args)
  end

  # Declare the output columns. They become the CSV header row and the key
  # order used when saving a Hash.
  def fields *args
    @fields = args
  end

  # Append one row to the CSV output file.
  #
  # The file at @output is opened on first use (or when the previous handle
  # was closed) and the header row is written if fields were declared.
  #
  # item - an Array (written as is) or a Hash (values picked in the order of
  #        the declared fields).
  #
  # Raises RuntimeError for any other type, or for a Hash when no fields
  # have been declared.
  def save item
    unless @csv && !@csv.closed?
      @csv = CSV.open @output, 'w'
      @csv << @fields if @fields
    end
    case item
    when Array then @csv << item
    when Hash
      raise "fields must be declared before saving a Hash" unless @fields
      @csv << @fields.map { |f| item[f] }
    else raise "unsupported type: #{item.class}"
    end
  end

  # Return true when +url+ has been seen before; otherwise remember it and
  # return false.
  def visited? url
    @visited ||= []
    return true if @visited.include? url
    @visited << url
    false
  end

  # Current Unix timestamp as a String.
  def ts
    Time.now.to_i.to_s
  end
end
@@ -0,0 +1,16 @@
require 'fileutils'

module Scrapey
  # Project generator behind the `scrapey` command.
  module Template
    # Directory holding the project skeleton, resolved relative to this file.
    def self.template_dir
      File.expand_path('../../../template', __FILE__)
    end

    # Create a new project by copying the skeleton to +name+ and renaming
    # every file whose name contains "template" after the project.
    #
    # name - destination path of the new project; its last path component is
    #        used as the project name when renaming files.
    # args - accepted for command-line compatibility and ignored.
    #
    # Raises ArgumentError when no name is given or the destination already
    # exists (copying into an existing directory would nest the skeleton
    # inside it instead of creating the project).
    def self.generate name, *args
      raise ArgumentError, 'project name is required' if name.nil? || name.to_s.strip.empty?
      name = name.to_s
      raise ArgumentError, "#{name} already exists" if File.exist?(name)

      puts "creating new scrapey project: #{name}..."
      project = File.basename(name)
      FileUtils.mkdir_p File.dirname(name)
      FileUtils.cp_r template_dir, name

      # Block form restores the caller's working directory afterwards.
      Dir.chdir(name) do
        candidates = Dir.glob(['*/*.*', '*.*'])
        candidates.select { |fn| File.basename(fn).include?('template') }.each do |fn|
          dir, base = File.split(fn)
          # Rename only the file itself, never its directory part.
          FileUtils.mv fn, File.join(dir, base.gsub('template', project))
        end
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Scrapey
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/scrapey.gemspec CHANGED
@@ -1,9 +1,6 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
  require File.expand_path('../lib/scrapey/version', __FILE__)
3
3
 
4
- gem 'mechanize'
5
- gem 'json'
6
-
7
4
  Gem::Specification.new do |gem|
8
5
  gem.authors = ["P Guardiario"]
9
6
  gem.email = ["pguardiario@gmail.com"]
@@ -11,11 +8,14 @@ Gem::Specification.new do |gem|
11
8
  gem.summary = %q{A simple scraping framework}
12
9
  gem.homepage = ""
13
10
 
14
- gem.files = `git ls-files`.split($\)
11
+ # gem.files = `git ls-files`.split($\)
12
+ gem.files = `find * -type f | grep -v pkg`.split($\)
15
13
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
16
14
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
17
15
  gem.name = "scrapey"
18
16
  gem.require_paths = ["lib"]
19
17
  gem.version = Scrapey::VERSION
18
+ gem.add_dependency(%q<mechanize>, ["~> 2.5.0"])
19
+ gem.add_dependency(%q<json>, ["~> 1.7.0"])
20
20
  end
21
21
 
@@ -0,0 +1,23 @@
1
+ # you can put any configurable options in here
2
+ # login information for example
3
+ email: foo@gmail.com
4
+ password: 12345
5
+
6
+ #
7
+ # example database section (remove the leading "# " from the lines below to enable)
8
+ # database:
9
+ #   adapter: mysql
10
+ #   database: template
11
+ #   username: root
12
+ #   password: 12345
13
+ #   host: localhost
14
+ #   encoding: 'utf8'
15
+ #
16
+
17
+ #
18
+ # example proxies section (remove the leading "# " from the lines below to enable)
19
+ # proxies:
20
+ #   - www.host1.com:80
21
+ #   - localhost:8080
22
+ #   - 127.0.0.1:8888
23
+ #
File without changes
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,23 +9,52 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-22 00:00:00.000000000 Z
13
- dependencies: []
12
+ date: 2012-07-24 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: mechanize
16
+ requirement: &16919208 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 2.5.0
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *16919208
25
+ - !ruby/object:Gem::Dependency
26
+ name: json
27
+ requirement: &16115544 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ version: 1.7.0
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *16115544
14
36
  description: A simple scraping framework
15
37
  email:
16
38
  - pguardiario@gmail.com
17
- executables: []
39
+ executables:
40
+ - scrapey
18
41
  extensions: []
19
42
  extra_rdoc_files: []
20
43
  files:
21
- - .gitignore
22
44
  - Gemfile
23
45
  - LICENSE
24
46
  - README.md
25
47
  - Rakefile
26
- - lib/scrapey.rb
48
+ - bin/scrapey
49
+ - lib/scrapey/cache.rb
50
+ - lib/scrapey/database.rb
51
+ - lib/scrapey/scrapey.rb
52
+ - lib/scrapey/template.rb
27
53
  - lib/scrapey/version.rb
54
+ - lib/scrapey.rb
28
55
  - scrapey.gemspec
56
+ - template/config/config.yml
57
+ - template/src/template.rb
29
58
  homepage: ''
30
59
  licenses: []
31
60
  post_install_message:
data/.gitignore DELETED
@@ -1,17 +0,0 @@
1
- *.gem
2
- *.rbc
3
- .bundle
4
- .config
5
- .yardoc
6
- Gemfile.lock
7
- InstalledFiles
8
- _yardoc
9
- coverage
10
- doc/
11
- lib/bundler/man
12
- pkg
13
- rdoc
14
- spec/reports
15
- test/tmp
16
- test/version_tmp
17
- tmp