scrapey 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in scrapey.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 P Guardiario
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Scrapey
2
+
3
+ A simple scraping framework.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'scrapey'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install scrapey
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
data/lib/scrapey.rb ADDED
@@ -0,0 +1,95 @@
1
+ require "scrapey/version"
2
+ require "scrapey/src/cache"
3
+ require "scrapey/src/database"
4
+
5
# Helper methods for scraping scripts. The module is mixed into the including
# object and works through instance variables on it: @agent (and @browser for
# 'goto'), @use_cache, @output, @fields, @csv and @visited.
module Scrapey
  # Directory of the running script ($0) with a trailing "/src" removed.
  BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/, '')

  # Sends +method+ to the agent with +url+ and returns the resulting page.
  #
  # method  - 'get', 'post', 'head' or 'goto'; 'goto' is dispatched to @browser,
  #           everything else to @agent.
  # url     - passed through to the agent and used as the cache key.
  # options - forwarded to the agent after :retries (number of extra attempts)
  #           and :sleep (seconds to wait between attempts) have been removed.
  #           The caller's object is not modified.
  # args    - extra positional arguments forwarded to the agent.
  #
  # When @use_cache is set, a cached document from load_cache(url) is returned
  # without contacting the agent, and fresh page bodies go to save_cache.
  #
  # On error: if an on_error method is defined it is called with the exception
  # and the request is repeated; otherwise the request is retried while
  # :retries is positive; otherwise the exception is re-raised.
  def get_or_post(method, url, options = {}, *args)
    agent = method == 'goto' ? @browser : @agent
    # Strip the control keys from a copy so a hash reused by the caller keeps
    # its :retries/:sleep settings for the next call.
    request_options = options.dup
    retries_left = request_options.delete :retries
    pause = request_options.delete :sleep
    begin
      new_args = [method, url]
      unless request_options.empty? && args.empty?
        new_args << request_options
        new_args.concat args
      end

      doc = load_cache(url) if @use_cache
      return doc if doc

      page = agent.send(*new_args)
      save_cache(url, page.body) if @use_cache

      #exit if Object.const_defined? :Ocra
      page
    rescue StandardError => e
      # StandardError rather than Exception: Interrupt (Ctrl-C) and SystemExit
      # must propagate instead of being retried.
      if defined?(on_error)
        on_error e
        get_or_post method, url, request_options, *args
      elsif retries_left && retries_left > 0
        puts "Error. Retries remaining: #{retries_left}"
        sleep pause if pause
        get_or_post method, url, request_options.merge(:retries => retries_left - 1, :sleep => pause), *args
      else
        raise
      end
    end
  end

  # Thin wrappers that fix the first argument of get_or_post.
  def get(*args); get_or_post 'get', *args; end
  def post(*args); get_or_post 'post', *args; end
  def head(*args); get_or_post 'head', *args; end
  def goto(*args); get_or_post 'goto', *args; end

  # Forwards all arguments to @agent.set_proxy.
  def set_proxy(*args)
    @agent.set_proxy(*args)
  end

  # Records the column names used for the CSV header and for Hash rows.
  def fields(*args)
    @fields = args
  end

  # Appends +item+ as one row of the CSV file at @output. The file is opened
  # in 'w' mode on first use (or after it was closed) and the @fields header
  # is written when fields were declared.
  #
  # item - an Array (written as is) or a Hash (values picked in @fields order;
  #        note that @fields must have been set via #fields for Hash rows).
  #
  # Raises RuntimeError for any other type.
  def save(item)
    unless @csv && !@csv.closed?
      @csv = CSV.open @output, 'w'
      @csv << @fields if @fields
    end
    case item
    when Array then @csv << item
    when Hash then @csv << @fields.map { |f| item[f] }
    else raise "unsupported type: #{item.class}"
    end
  end

  # Returns true if +url+ was passed to this method before; otherwise
  # remembers it and returns false.
  def visited?(url)
    @visited ||= []
    return true if @visited.include? url
    @visited << url
    false
  end

  # Current Unix time in seconds, as a String.
  def ts
    Time.now.to_i.to_s
  end
end
77
+
78
# Mix the helpers into the top-level object so scripts can call them directly.
include Scrapey

# some defaults that I like
@agent ||= Mechanize.new { |a| a.history.max_size = 10 }
@agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'

# default output file
@output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")

# read config file
config_file = "#{BASEDIR}/config/config.yml"
# File.exist? is used because File.exists? was removed in Ruby 3.2.
# YAML.load_file opens and closes the file itself, so no handle is leaked.
# The trailing `|| {}` also covers an empty config file, which parses to a
# falsy value and would otherwise break the @config['database'] lookup below.
@config = (File.exist?(config_file) && YAML.load_file(config_file)) || {}

# Only load ActiveRecord (and the configured adapter) when a database section
# is present in the config.
if @config['database']
  ['active_record', @config['database']['adapter'], 'tzinfo', 'active_support/multibyte'].each { |lib| require lib }
  ActiveRecord::Base.establish_connection(@config['database'])
end
95
+
@@ -0,0 +1,3 @@
1
# Version constant for the gem, kept in its own file so the gemspec can load
# it without loading the rest of the library.
module Scrapey
  # Release number reported in the gem specification.
  VERSION = '0.0.1'
end
data/scrapey.gemspec ADDED
@@ -0,0 +1,21 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/scrapey/version', __FILE__)
3
+
4
# Gem specification for scrapey. Runtime dependencies are declared on the
# specification itself; a bare `gem 'name'` call at the top of a gemspec only
# activates the gem while the spec is being evaluated and leaves the packaged
# gem with an empty dependency list.
Gem::Specification.new do |gem|
  gem.authors       = ["P Guardiario"]
  gem.email         = ["pguardiario@gmail.com"]
  gem.description   = %q{A simple scraping framework}
  gem.summary       = %q{A simple scraping framework}
  gem.homepage      = ""

  # Package every file tracked by git; executables and test files are derived
  # from that list by path prefix.
  gem.files         = `git ls-files`.split($\)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "scrapey"
  gem.require_paths = ["lib"]
  gem.version       = Scrapey::VERSION

  # Installed automatically together with scrapey.
  gem.add_dependency 'mechanize'
  gem.add_dependency 'json'
end
21
+
metadata ADDED
@@ -0,0 +1,53 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scrapey
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - P Guardiario
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-22 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: A simple scraping framework
15
+ email:
16
+ - pguardiario@gmail.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - .gitignore
22
+ - Gemfile
23
+ - LICENSE
24
+ - README.md
25
+ - Rakefile
26
+ - lib/scrapey.rb
27
+ - lib/scrapey/version.rb
28
+ - scrapey.gemspec
29
+ homepage: ''
30
+ licenses: []
31
+ post_install_message:
32
+ rdoc_options: []
33
+ require_paths:
34
+ - lib
35
+ required_ruby_version: !ruby/object:Gem::Requirement
36
+ none: false
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ! '>='
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ requirements: []
48
+ rubyforge_project:
49
+ rubygems_version: 1.8.11
50
+ signing_key:
51
+ specification_version: 3
52
+ summary: A simple scraping framework
53
+ test_files: []