scrapey 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Gems are fetched from the public RubyGems index.
source "https://rubygems.org"

# The dependency list itself lives in scrapey.gemspec; pull it in from there.
gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 P Guardiario
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Scrapey
2
+
3
+ A simple scraping framework.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'scrapey'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install scrapey
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
data/lib/scrapey.rb ADDED
@@ -0,0 +1,95 @@
1
+ require "scrapey/version"
2
+ require "scrapey/src/cache"
3
+ require "scrapey/src/database"
4
+
5
# Mixin of scraping helpers: page fetching with optional caching and retries,
# CSV output, and visited-URL bookkeeping.
#
# The methods read instance variables that the including object is expected to
# provide: @agent (receives get/post/head and set_proxy), @browser (receives
# goto), @use_cache, @output (path of the CSV file) and, optionally, @fields.
# load_cache / save_cache are not defined here; presumably they come from
# "scrapey/src/cache" -- confirm against that file.
module Scrapey
  # Directory of the running script ($0), with a trailing "/src" removed.
  BASEDIR = File.expand_path(File.dirname($0)).sub(%r{/src\z}, '')

  # Sends +method+ to the agent (or to @browser when method is 'goto') and
  # returns the resulting page.
  #
  # @param method [String] message to send: 'get', 'post', 'head' or 'goto'
  # @param url [String] address to fetch; also used as the cache key
  # @param options [Hash] forwarded to the agent after the control keys
  #   :retries (number of extra attempts) and :sleep (seconds to wait between
  #   attempts) have been removed
  # @param args extra positional arguments forwarded to the agent
  # @return the cached document when @use_cache is set and load_cache finds
  #   one, otherwise whatever the agent returns
  # @raise [StandardError] the agent's error once retries are exhausted and no
  #   on_error hook is defined
  def get_or_post(method, url, options = {}, *args)
    # Work on a copy so the caller's hash keeps its :retries / :sleep entries.
    options = options.dup
    agent = method == 'goto' ? @browser : @agent
    _retries = options.delete :retries
    _sleep = options.delete :sleep
    begin
      new_args = [method, url]
      # Only forward the hash when there is something to forward, so a plain
      # fetch calls agent.get(url) rather than agent.get(url, {}).
      new_args.push(options, *args) unless options.empty? && args.empty?

      doc = load_cache(url) if @use_cache
      return doc if doc

      page = agent.send(*new_args)
      save_cache(url, page.body) if @use_cache

      page
    rescue StandardError => e
      # StandardError rather than Exception: Ctrl-C and exit must not be retried.
      if defined?(on_error)
        # A user-supplied hook takes precedence over the retry counter; the
        # request is repeated after the hook returns.
        on_error e
        get_or_post method, url, options, *args
      elsif _retries && _retries > 0
        puts "Error. Retries remaining: #{_retries}"
        sleep _sleep if _sleep
        get_or_post method, url, options.merge(:retries => _retries - 1, :sleep => _sleep), *args
      else
        raise
      end
    end
  end

  # Convenience wrappers; each takes (url, options = {}, *args).
  def get(*args)
    get_or_post 'get', *args
  end

  def post(*args)
    get_or_post 'post', *args
  end

  def head(*args)
    get_or_post 'head', *args
  end

  def goto(*args)
    get_or_post 'goto', *args
  end

  # Forwards its arguments unchanged to @agent.set_proxy.
  def set_proxy(*args)
    @agent.set_proxy(*args)
  end

  # Declares the column names used for the CSV header and for Hash rows.
  def fields(*args)
    @fields = args
  end

  # Appends one row to the CSV file at @output, opening it (and writing the
  # header, if fields were declared) on first use or after it was closed.
  #
  # @param item [Array, Hash] an Array is written as-is; a Hash is written in
  #   the order given by the declared fields
  # @raise [RuntimeError] for any other type, or for a Hash when no fields
  #   have been declared
  def save(item)
    unless @csv && !@csv.closed?
      @csv = CSV.open @output, 'w'
      @csv << @fields if @fields
    end
    case item
    when Array then @csv << item
    when Hash
      raise "fields must be declared before saving a Hash" unless @fields
      @csv << @fields.map { |f| item[f] }
    else raise "unsupported type: #{item.class}"
    end
  end

  # Returns true when +url+ was seen before; otherwise records it and
  # returns false.
  def visited?(url)
    @visited ||= []
    return true if @visited.include? url
    @visited << url
    false
  end

  # Current Unix time in whole seconds, as a String.
  def ts
    Time.now.to_i.to_s
  end
end
77
+
78
# Mix the helpers into the top-level object so that scripts requiring this
# file can call get, save, visited? etc. directly.
include Scrapey

# some defaults that I like
@agent ||= Mechanize.new { |a| a.history.max_size = 10 }
@agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'

# default output file; only the file name goes through strftime so that a '%'
# in BASEDIR is not mistaken for a format directive
@output = "#{BASEDIR}/Output/#{Time.now.strftime('output_%Y_%m_%d_%H_%M_%S.csv')}"

# read config file; a missing or empty file yields an empty configuration
config_file = "#{BASEDIR}/config/config.yml"
@config = (File.exist?(config_file) && YAML.load_file(config_file)) || {}

# The database libraries are loaded only when the config asks for a database.
if @config['database']
  ['active_record', @config['database']['adapter'], 'tzinfo', 'active_support/multibyte'].each { |lib| require lib }
  ActiveRecord::Base.establish_connection(@config['database'])
end
95
+
@@ -0,0 +1,3 @@
1
# Version constant for the Scrapey gem.
module Scrapey
  # Release number; scrapey.gemspec reads it as the gem version.
  VERSION = '0.0.1'
end
data/scrapey.gemspec ADDED
@@ -0,0 +1,21 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for scrapey. The version is read from lib/scrapey/version.rb.
require File.expand_path('../lib/scrapey/version', __FILE__)

Gem::Specification.new do |gem|
  gem.name          = "scrapey"
  gem.version       = Scrapey::VERSION
  gem.authors       = ["P Guardiario"]
  gem.email         = ["pguardiario@gmail.com"]
  gem.description   = %q{A simple scraping framework}
  gem.summary       = %q{A simple scraping framework}
  gem.homepage      = ""
  # The package ships an MIT LICENSE file.
  gem.licenses      = ["MIT"]

  # Split on the input record separator (newline), one path per line.
  # $\ is nil by default, which would split on any whitespace.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Declared as runtime dependencies so they are installed with the gem;
  # a top-level `gem '...'` call only activates a gem while this file loads.
  gem.add_dependency 'mechanize'
  gem.add_dependency 'json'
end
21
+
metadata ADDED
@@ -0,0 +1,53 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scrapey
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - P Guardiario
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-22 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: A simple scraping framework
15
+ email:
16
+ - pguardiario@gmail.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - .gitignore
22
+ - Gemfile
23
+ - LICENSE
24
+ - README.md
25
+ - Rakefile
26
+ - lib/scrapey.rb
27
+ - lib/scrapey/version.rb
28
+ - scrapey.gemspec
29
+ homepage: ''
30
+ licenses: []
31
+ post_install_message:
32
+ rdoc_options: []
33
+ require_paths:
34
+ - lib
35
+ required_ruby_version: !ruby/object:Gem::Requirement
36
+ none: false
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ! '>='
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ requirements: []
48
+ rubyforge_project:
49
+ rubygems_version: 1.8.11
50
+ signing_key:
51
+ specification_version: 3
52
+ summary: A simple scraping framework
53
+ test_files: []