scrapey 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/scrapey ADDED
@@ -0,0 +1,4 @@
#!/usr/bin/env ruby
# Command-line entry point for the scrapey project generator.
#
# Usage: scrapey <project_name>
#
# All command-line arguments are forwarded to Scrapey::Template.generate;
# the first one is the name of the project to create.
require 'scrapey/template'

# Template.generate requires a name; without this guard an argument-less
# invocation would die with a bare ArgumentError backtrace.
abort 'usage: scrapey <project_name>' if ARGV.empty?

Scrapey::Template.generate(*ARGV)
data/lib/scrapey.rb CHANGED
@@ -1,79 +1,12 @@
1
- require "scrapey/version"
2
- require "scrapey/src/cache"
3
- require "scrapey/src/database"
4
-
5
- module Scrapey
6
- BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
7
-
8
- def get_or_post method, url, options={}, *args
9
- agent = method == 'goto' ? @browser : @agent
10
- _retries = options.delete :retries
11
- _sleep = options.delete :sleep
12
- begin
13
- new_args = method, url
14
- unless options.empty? && args.empty?
15
- new_args << options
16
- args.each{|arg| new_args << arg}
17
- end
18
-
19
- doc = load_cache(url) if @use_cache
20
- return doc if doc
21
-
22
- page = agent.send *new_args
23
- save_cache(url, page.body) if @use_cache
24
-
25
- #exit if Object.const_defined? :Ocra
26
- page
27
- rescue Exception => e
28
- case
29
- when defined? on_error
30
- on_error e
31
- get_or_post method, url, options, *args
32
- when _retries && _retries > 0
33
- puts "Error. Retries remaining: #{options[:retries]}"
34
- sleep _sleep if _sleep
35
- get_or_post method, url, options.merge({:retries => _retries - 1, :sleep => _sleep}), *args
36
- else raise e
37
- end
38
- end
39
- end
40
-
41
- def get *args; get_or_post 'get', *args; end
42
- def post *args; get_or_post 'post', *args; end
43
- def head *args; get_or_post 'head', *args; end
44
- def goto *args; get_or_post 'goto', *args; end
1
+ require 'mechanize'
2
+ require 'csv'
3
+ require 'json'
4
+ require 'yaml'
45
5
 
46
- def set_proxy *args
47
- @agent.set_proxy *args
48
- end
49
-
50
- def fields *args
51
- @fields = args
52
- end
53
-
54
- def save item
55
- unless @csv && !@csv.closed?
56
- @csv = CSV.open @output, 'w'
57
- @csv << @fields if @fields
58
- end
59
- case
60
- when item.is_a?(Array) then @csv << item
61
- when item.is_a?(Hash) then @csv << @fields.map{|f| item[f]}
62
- else raise "unsupported type: #{item.class}"
63
- end
64
- end
65
-
66
- def visited? url
67
- @visited ||= []
68
- return true if @visited.include? url
69
- @visited << url
70
- false
71
- end
72
-
73
- def ts
74
- Time.now.to_i.to_s
75
- end
76
- end
6
+ require "scrapey/version"
7
+ require "scrapey/scrapey"
8
+ require "scrapey/cache"
9
+ require "scrapey/database"
77
10
 
78
11
  include Scrapey
79
12
 
@@ -81,7 +14,7 @@ include Scrapey
81
14
  @agent ||= Mechanize.new{|a| a.history.max_size = 10}
82
15
  @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
83
16
 
84
- # defaulkt output file
17
+ # default output file
85
18
  @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
86
19
 
87
20
  # read config file
@@ -89,7 +22,7 @@ config_file = "#{BASEDIR}/config/config.yml"
89
22
  @config = File.exists?(config_file) ? YAML::load(File.open(config_file)) : {}
90
23
 
91
24
  if @config['database']
92
- ['active_record', @config['database']['adapter'], 'tzinfo', 'active_support/multibyte'].each{|lib| require lib}
25
+ ['active_record', @config['database']['adapter'], 'tzinfo', 'active_support/all'].each{|lib| require lib}
93
26
  ActiveRecord::Base.establish_connection(@config['database'])
94
27
  end
95
28
 
@@ -0,0 +1,22 @@
# This file uses FileUtils and Digest::MD5 directly, so require them here
# instead of relying on another library having loaded them first.
require 'digest/md5'
require 'fileutils'

# Simple on-disk page cache for Scrapey, keyed by the MD5 of the URL.
#
# These helpers read @config (expected to be a Hash that the including
# script has already set up) and BASEDIR (defined outside this file).
module Scrapey
  # Enables caching: sets @use_cache, defaults @config['cache_dir'] to
  # "#{BASEDIR}/cache" when it is not configured, and creates the directory.
  def use_cache
    @use_cache = true
    @config['cache_dir'] ||= "#{BASEDIR}/cache"
    FileUtils.mkdir_p @config['cache_dir']
  end

  # Returns the cache file path for +url+:
  # "<cache_dir>/<md5 hex digest of url>.cache".
  def cache_filename url
    "#{@config['cache_dir']}/#{Digest::MD5.hexdigest(url)}.cache"
  end

  # Returns the cached content for +url+ parsed with Nokogiri::HTML, or nil
  # when no cache file exists.
  # NOTE(review): Nokogiri is not required in this file; presumably it is
  # loaded by the caller (e.g. via mechanize) - confirm.
  def load_cache url
    filename = cache_filename url
    # File.exists? is deprecated and was removed in Ruby 3.2.
    return nil unless File.exist?(filename)
    puts "Loading #{filename} from cache"
    Nokogiri::HTML File.read(filename)
  end

  # Writes +doc+ (the raw page body) to the cache file for +url+ in binary
  # mode, replacing any previous content.
  def save_cache url, doc
    File.open(cache_filename(url), 'wb') { |f| f.write(doc) }
  end
end
@@ -0,0 +1,14 @@
# ActiveRecord convenience helpers for Scrapey scripts.
module Scrapey
  # Defines one top-level ActiveRecord::Base subclass per given name, e.g.
  # `tables 'Movie', 'Actor'` creates ::Movie and ::Actor.
  # Returns the list of names it was given.
  def tables *args
    args.each do |arg|
      # Built with Class.new/const_set rather than eval of a class definition.
      Object.const_set(arg, Class.new(ActiveRecord::Base))
    end
  end

  # Issues TRUNCATE TABLE for the table behind each named model constant.
  # The table name is quoted by the connection adapter so reserved words and
  # unusual names do not break (or alter) the statement.
  def truncate *args
    connection = ActiveRecord::Base.connection
    args.each do |arg|
      table = connection.quote_table_name(Object.const_get(arg).table_name)
      connection.execute("TRUNCATE TABLE #{table}")
    end
  end
end
@@ -0,0 +1,72 @@
require 'csv'
require 'fileutils'
require 'set'

# Core scraping helpers: HTTP requests with retry and optional caching,
# CSV output, and visited-URL tracking.
module Scrapey
  # Directory of the running script ($0), with a trailing "/src" removed.
  BASEDIR = File.expand_path(File.dirname($0)).sub(%r{/src\z}, '')

  # Sends +method+ ('get', 'post', 'head' or 'goto') with +url+ to the agent
  # and returns the resulting page.
  #
  # 'goto' is dispatched to @browser, everything else to @agent. Two keys of
  # +options+ are consumed here and never forwarded to the agent:
  #   :retries - how many more attempts to make after a failure
  #   :sleep   - seconds to sleep before each retry
  # Any remaining options and +args+ are passed on to the agent.
  #
  # When @use_cache is set, a cache hit for +url+ is returned immediately
  # (the value of load_cache, not an agent page) and fetched bodies are
  # stored with save_cache.
  #
  # On failure: if an +on_error+ method is defined it is called with the
  # exception and the request is attempted again (without a retry limit);
  # otherwise the request is retried while :retries remain; otherwise the
  # exception is re-raised.
  def get_or_post method, url, options={}, *args
    agent = method == 'goto' ? @browser : @agent
    # Work on a copy so the caller's hash keeps its :retries / :sleep keys.
    options  = options.dup
    _retries = options.delete :retries
    _sleep   = options.delete :sleep
    begin
      new_args = [method, url]
      new_args.push(options, *args) unless options.empty? && args.empty?

      doc = load_cache(url) if @use_cache
      return doc if doc

      page = agent.send(*new_args)
      save_cache(url, page.body) if @use_cache
      page
    # StandardError only: Interrupt (Ctrl-C) and SystemExit must not be
    # swallowed and turned into retries.
    rescue StandardError => e
      if defined?(on_error)
        on_error e
        get_or_post method, url, options, *args
      elsif _retries && _retries > 0
        # :retries was already removed from options, so report the local.
        puts "Error. Retries remaining: #{_retries}"
        sleep _sleep if _sleep
        get_or_post method, url, options.merge({:retries => _retries - 1, :sleep => _sleep}), *args
      else
        raise
      end
    end
  end

  # Shorthands for get_or_post with the corresponding method name.
  def get *args;  get_or_post 'get',  *args; end
  def post *args; get_or_post 'post', *args; end
  def head *args; get_or_post 'head', *args; end
  def goto *args; get_or_post 'goto', *args; end

  # Forwards its arguments to @agent.set_proxy.
  def set_proxy *args
    @agent.set_proxy(*args)
  end

  # Declares the output columns; used as the CSV header row and as the key
  # order when saving Hash items.
  def fields *args
    @fields = args
  end

  # Appends +item+ to the CSV file at @output, opening it (and writing the
  # header row, if fields are known) on first use or after it was closed.
  #
  # An Array is written as-is. A Hash is written in the order of the declared
  # fields; when no fields were declared, the keys of the first Hash saved
  # become the fields. Any other type raises "unsupported type: <class>"
  # before the file is touched.
  def save item
    unless item.is_a?(Array) || item.is_a?(Hash)
      raise "unsupported type: #{item.class}"
    end
    @fields ||= item.keys if item.is_a?(Hash)

    unless @csv && !@csv.closed?
      # The output directory may not exist yet in a fresh project.
      FileUtils.mkdir_p File.dirname(@output)
      @csv = CSV.open @output, 'w'
      @csv << @fields if @fields
    end

    @csv << (item.is_a?(Hash) ? @fields.map { |f| item[f] } : item)
  end

  # Returns true if +url+ was passed to this method before; otherwise
  # records it and returns false.
  def visited? url
    # A Set keeps the membership test cheap on large crawls; only include?
    # and << are used, so a pre-assigned Array in @visited still works.
    @visited ||= Set.new
    return true if @visited.include? url
    @visited << url
    false
  end

  # Current Unix timestamp (whole seconds) as a String.
  def ts
    Time.now.to_i.to_s
  end
end
@@ -0,0 +1,16 @@
require 'fileutils'

module Scrapey
  # Project generator used by the `scrapey` executable.
  module Template
    # Creates a new project at +name+ by copying the bundled template
    # directory (resolved as "../../../template" relative to this file) and
    # renaming every copied file whose file name contains "template" so that
    # it uses the project name instead. Extra arguments are accepted and
    # ignored.
    #
    # Raises ArgumentError if +name+ already exists, because copying onto an
    # existing directory would nest the template inside it.
    def self.generate name, *args
      raise ArgumentError, "#{name} already exists" if File.exist?(name)

      puts "creating new scrapey project: #{name}..."
      template = File.expand_path('../../../template', __FILE__)
      FileUtils.cp_r template, name

      # +name+ may be a path; only its last component goes into file names.
      project = File.basename(name)
      # Block form restores the caller's working directory afterwards.
      Dir.chdir(name) do
        Dir.glob(['*/*.*', '*.*']).each do |fn|
          dir, base = File.split(fn)
          # Rename on the file name only, never on directory components.
          next unless base.include?('template')
          FileUtils.mv fn, File.join(dir, base.gsub('template', project))
        end
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Scrapey
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/scrapey.gemspec CHANGED
@@ -1,9 +1,6 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
  require File.expand_path('../lib/scrapey/version', __FILE__)
3
3
 
4
- gem 'mechanize'
5
- gem 'json'
6
-
7
4
  Gem::Specification.new do |gem|
8
5
  gem.authors = ["P Guardiario"]
9
6
  gem.email = ["pguardiario@gmail.com"]
@@ -11,11 +8,14 @@ Gem::Specification.new do |gem|
11
8
  gem.summary = %q{A simple scraping framework}
12
9
  gem.homepage = ""
13
10
 
14
- gem.files = `git ls-files`.split($\)
11
+ # gem.files = `git ls-files`.split($\)
12
+ gem.files = `find * -type f | grep -v pkg`.split($\)
15
13
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
16
14
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
17
15
  gem.name = "scrapey"
18
16
  gem.require_paths = ["lib"]
19
17
  gem.version = Scrapey::VERSION
18
+ gem.add_dependency(%q<mechanize>, ["~> 2.5.0"])
19
+ gem.add_dependency(%q<json>, ["~> 1.7.0"])
20
20
  end
21
21
 
@@ -0,0 +1,23 @@
1
+ # you can put any configurable options in here
2
+ # login information for example
3
+ email: foo@gmail.com
4
+ password: 12345
5
+
6
+ # (YAML has no =begin/=end block comments; uncomment the lines below to enable)
7
+ # example database section
8
+ # database:
9
+ #   adapter: mysql
10
+ #   database: template
11
+ #   username: root
12
+ #   password: 12345
13
+ #   host: localhost
14
+ #   encoding: 'utf8'
15
+ #
16
+
17
+ # (YAML has no =begin/=end block comments; uncomment the lines below to enable)
18
+ # example proxies section
19
+ # proxies:
20
+ #   - www.host1.com:80
21
+ #   - localhost:8080
22
+ #   - 127.0.0.1:8888
23
+ #
File without changes
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,23 +9,52 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-22 00:00:00.000000000 Z
13
- dependencies: []
12
+ date: 2012-07-24 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: mechanize
16
+ requirement: &16919208 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 2.5.0
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *16919208
25
+ - !ruby/object:Gem::Dependency
26
+ name: json
27
+ requirement: &16115544 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ version: 1.7.0
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *16115544
14
36
  description: A simple scraping framework
15
37
  email:
16
38
  - pguardiario@gmail.com
17
- executables: []
39
+ executables:
40
+ - scrapey
18
41
  extensions: []
19
42
  extra_rdoc_files: []
20
43
  files:
21
- - .gitignore
22
44
  - Gemfile
23
45
  - LICENSE
24
46
  - README.md
25
47
  - Rakefile
26
- - lib/scrapey.rb
48
+ - bin/scrapey
49
+ - lib/scrapey/cache.rb
50
+ - lib/scrapey/database.rb
51
+ - lib/scrapey/scrapey.rb
52
+ - lib/scrapey/template.rb
27
53
  - lib/scrapey/version.rb
54
+ - lib/scrapey.rb
28
55
  - scrapey.gemspec
56
+ - template/config/config.yml
57
+ - template/src/template.rb
29
58
  homepage: ''
30
59
  licenses: []
31
60
  post_install_message:
data/.gitignore DELETED
@@ -1,17 +0,0 @@
1
- *.gem
2
- *.rbc
3
- .bundle
4
- .config
5
- .yardoc
6
- Gemfile.lock
7
- InstalledFiles
8
- _yardoc
9
- coverage
10
- doc/
11
- lib/bundler/man
12
- pkg
13
- rdoc
14
- spec/reports
15
- test/tmp
16
- test/version_tmp
17
- tmp