scrapey 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -16,14 +16,22 @@ Or install it yourself as:
16
16
 
17
17
  $ gem install scrapey
18
18
 
19
- ## Usage
19
+ ## Examples
20
20
 
21
- TODO: Write usage instructions here
21
+ ### Concurrent downloads
22
22
 
23
- ## Contributing
23
+ ```ruby
24
+ require 'scrapey'
25
+ require 'scrapey/multi'
24
26
 
25
- 1. Fork it
26
- 2. Create your feature branch (`git checkout -b my-new-feature`)
27
- 3. Commit your changes (`git commit -am 'Added some feature'`)
28
- 4. Push to the branch (`git push origin my-new-feature`)
29
- 5. Create new Pull Request
27
+ fields 'url', 'title'
28
+
29
+ def scrape url, response
30
+ doc = Nokogiri::HTML response
31
+ @items << {'url' => url, 'title' => doc.at('title').text}
32
+ end
33
+
34
+ @items = []
35
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com/', 'http://www.bing.com/'], 3, :scrape
36
+ @items.each{|item| save item}
37
+ ```
data/examples/multi.rb ADDED
@@ -0,0 +1,13 @@
1
+ require 'scrapey'
2
+ require 'scrapey/multi'
3
+
4
+ fields 'url', 'title'
5
+
6
+ def scrape url, response
7
+ doc = Nokogiri::HTML response
8
+ @items << {'url' => url, 'title' => doc.at('title').text}
9
+ end
10
+
11
+ @items = []
12
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com/', 'http://www.bing.com/'], 3, :scrape
13
+ @items.each{|item| save item}
@@ -0,0 +1,4 @@
1
+ module Scrapey
2
+ BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
3
+ ENV['SSL_FILE'] = "#{Gem.dir}/gems/scrapey-#{Scrapey::VERSION}/ssl/cacert.pem"
4
+ end
@@ -1,7 +1,6 @@
1
1
  module Scrapey
2
2
  def tables *args
3
3
  args.each do |arg|
4
- #eval("class #{arg} < ActiveRecord::Base; end")
5
4
  Object.const_set(arg, Class.new(ActiveRecord::Base) {})
6
5
  end
7
6
  end
@@ -0,0 +1,25 @@
1
+ require 'em-http-request'
2
+
3
+ module Scrapey
4
+ def multi_get all_urls, num_threads = 20, callback = :save_cache
5
+ all_urls.each_slice(num_threads) do |urls|
6
+ next unless urls.size > 0
7
+ EventMachine.run do
8
+ multi = EventMachine::MultiRequest.new
9
+ urls.each_with_index do |url, i|
10
+ multi.add i, EventMachine::HttpRequest.new(url).get(:redirects => 10)
11
+ end
12
+ multi.callback do
13
+ (0...multi.requests.length).each do |i|
14
+ if multi.responses[:callback][i]
15
+ send callback, urls[i], multi.responses[:callback][i].response
16
+ else
17
+ puts "problem downloading #{urls[i]}!"
18
+ end
19
+ end
20
+ EventMachine.stop
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
@@ -1,5 +1,4 @@
1
1
  module Scrapey
2
- BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
3
2
 
4
3
  def get_or_post method, url, options={}, *args
5
4
  agent = method == 'goto' ? @browser : @agent
@@ -13,7 +12,7 @@ module Scrapey
13
12
  end
14
13
 
15
14
  doc = load_cache(url) if @use_cache
16
- return doc if doc
15
+ return doc if doc
17
16
 
18
17
  page = agent.send *new_args
19
18
  save_cache(url, page.body) if @use_cache
@@ -23,8 +22,7 @@ module Scrapey
23
22
  rescue Exception => e
24
23
  case
25
24
  when defined? on_error
26
- on_error e
27
- get_or_post method, url, options, *args
25
+ return on_error e, method, url, options, *args
28
26
  when _retries && _retries > 0
29
27
  puts "Error. Retries remaining: #{options[:retries]}"
30
28
  sleep _sleep if _sleep
@@ -59,12 +57,12 @@ module Scrapey
59
57
  end
60
58
  end
61
59
 
62
- def visited? url
63
- @visited ||= []
64
- return true if @visited.include? url
65
- @visited << url
66
- false
67
- end
60
+ def visited? url
61
+ @visited ||= []
62
+ return true if @visited.include? url
63
+ @visited << url
64
+ false
65
+ end
68
66
 
69
67
  def ts
70
68
  Time.now.to_i.to_s
@@ -1,3 +1,3 @@
1
1
  module Scrapey
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
data/lib/scrapey.rb CHANGED
@@ -3,8 +3,9 @@ require 'csv'
3
3
  require 'json'
4
4
  require 'yaml'
5
5
 
6
- require "scrapey/version"
7
6
  require "scrapey/scrapey"
7
+ require "scrapey/version"
8
+ require "scrapey/constants"
8
9
  require "scrapey/cache"
9
10
  require "scrapey/database"
10
11
 
@@ -15,7 +16,7 @@ include Scrapey
15
16
  @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
16
17
 
17
18
  # default output file
18
- @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
19
+ @output = 'output.csv'
19
20
 
20
21
  # read config file
21
22
  config_file = "#{BASEDIR}/config/config.yml"
@@ -3,21 +3,17 @@
3
3
  email: foo@gmail.com
4
4
  password: 12345
5
5
 
6
- =begin
7
6
  # example database section
8
- database:
9
- adapter: mysql
10
- database: template
11
- username: root
12
- password: 12345
13
- host: localhost
14
- encoding: 'utf8'
15
- =end
7
+ #database:
8
+ # adapter: mysql
9
+ # database: template
10
+ # username: root
11
+ # password: 12345
12
+ # host: localhost
13
+ # encoding: 'utf8'
16
14
 
17
- =begin
18
15
  # example proxies section
19
- proxies:
20
- - www.host1.com:80
21
- - localhost:8080
22
- - 127.0.0.1:8888
23
- =end
16
+ #proxies:
17
+ #- www.host1.com:80
18
+ #- localhost:8080
19
+ #- 127.0.0.1:8888
@@ -0,0 +1,4 @@
1
+ require 'scrapey'
2
+
3
+ # customizations...
4
+ # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-24 00:00:00.000000000 Z
12
+ date: 2012-07-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &16919208 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,15 @@ dependencies:
21
21
  version: 2.5.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *16919208
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 2.5.0
25
30
  - !ruby/object:Gem::Dependency
26
31
  name: json
27
- requirement: &16115544 !ruby/object:Gem::Requirement
32
+ requirement: !ruby/object:Gem::Requirement
28
33
  none: false
29
34
  requirements:
30
35
  - - ~>
@@ -32,7 +37,12 @@ dependencies:
32
37
  version: 1.7.0
33
38
  type: :runtime
34
39
  prerelease: false
35
- version_requirements: *16115544
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 1.7.0
36
46
  description: A simple scraping framework
37
47
  email:
38
48
  - pguardiario@gmail.com
@@ -46,8 +56,11 @@ files:
46
56
  - README.md
47
57
  - Rakefile
48
58
  - bin/scrapey
59
+ - examples/multi.rb
49
60
  - lib/scrapey/cache.rb
61
+ - lib/scrapey/constants.rb
50
62
  - lib/scrapey/database.rb
63
+ - lib/scrapey/multi.rb
51
64
  - lib/scrapey/scrapey.rb
52
65
  - lib/scrapey/template.rb
53
66
  - lib/scrapey/version.rb
@@ -75,7 +88,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
75
88
  version: '0'
76
89
  requirements: []
77
90
  rubyforge_project:
78
- rubygems_version: 1.8.11
91
+ rubygems_version: 1.8.23
79
92
  signing_key:
80
93
  specification_version: 3
81
94
  summary: A simple scraping framework