scrapey 0.0.2 → 0.0.3

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
data/README.md CHANGED
@@ -16,14 +16,22 @@ Or install it yourself as:
16
16
 
17
17
  $ gem install scrapey
18
18
 
19
- ## Usage
19
+ ## Examples
20
20
 
21
- TODO: Write usage instructions here
21
+ ### Concurrent downloads
22
22
 
23
- ## Contributing
23
+ ```ruby
24
+ require 'scrapey'
25
+ require 'scrapey/multi'
24
26
 
25
- 1. Fork it
26
- 2. Create your feature branch (`git checkout -b my-new-feature`)
27
- 3. Commit your changes (`git commit -am 'Added some feature'`)
28
- 4. Push to the branch (`git push origin my-new-feature`)
29
- 5. Create new Pull Request
27
+ fields 'url', 'title'
28
+
29
+ def scrape url, response
30
+ doc = Nokogiri::HTML response
31
+ @items << {'url' => url, 'title' => doc.at('title').text}
32
+ end
33
+
34
+ @items = []
35
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], 3, :scrape
36
+ @items.each{|item| save item}
37
+ ```
data/examples/multi.rb ADDED
@@ -0,0 +1,13 @@
1
+ require 'scrapey'
2
+ require 'scrapey/multi'
3
+
4
+ fields 'url', 'title'
5
+
6
+ def scrape url, response
7
+ doc = Nokogiri::HTML response
8
+ @items << {'url' => url, 'title' => doc.at('title').text}
9
+ end
10
+
11
+ @items = []
12
+ multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], 3, :scrape
13
+ @items.each{|item| save item}
data/lib/scrapey/constants.rb ADDED
@@ -0,0 +1,4 @@
1
+ module Scrapey
2
+ BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
3
+ ENV['SSL_FILE'] = "#{Gem.dir}/gems/scrapey-#{Scrapey::VERSION}/ssl/cacert.pem"
4
+ end
data/lib/scrapey/database.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  module Scrapey
2
2
  def tables *args
3
3
  args.each do |arg|
4
- #eval("class #{arg} < ActiveRecord::Base; end")
5
4
  Object.const_set(arg, Class.new(ActiveRecord::Base) {})
6
5
  end
7
6
  end
data/lib/scrapey/multi.rb ADDED
@@ -0,0 +1,25 @@
1
+ require 'em-http-request'
2
+
3
+ module Scrapey
4
+ def multi_get all_urls, num_threads = 20, callback = :save_cache
5
+ all_urls.each_slice(num_threads) do |urls|
6
+ next unless urls.size > 0
7
+ EventMachine.run do
8
+ multi = EventMachine::MultiRequest.new
9
+ urls.each_with_index do |url, i|
10
+ multi.add i, EventMachine::HttpRequest.new(url).get(:redirects => 10)
11
+ end
12
+ multi.callback do
13
+ (0...multi.requests.length).each do |i|
14
+ if multi.responses[:callback][i]
15
+ send callback, urls[i], multi.responses[:callback][i].response
16
+ else
17
+ puts "problem downloading #{urls[i]}!"
18
+ end
19
+ end
20
+ EventMachine.stop
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
data/lib/scrapey/scrapey.rb CHANGED
@@ -1,5 +1,4 @@
1
1
  module Scrapey
2
- BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
3
2
 
4
3
  def get_or_post method, url, options={}, *args
5
4
  agent = method == 'goto' ? @browser : @agent
@@ -13,7 +12,7 @@ module Scrapey
13
12
  end
14
13
 
15
14
  doc = load_cache(url) if @use_cache
16
- return doc if doc
15
+ return doc if doc
17
16
 
18
17
  page = agent.send *new_args
19
18
  save_cache(url, page.body) if @use_cache
@@ -23,8 +22,7 @@ module Scrapey
23
22
  rescue Exception => e
24
23
  case
25
24
  when defined? on_error
26
- on_error e
27
- get_or_post method, url, options, *args
25
+ return on_error e, method, url, options, *args
28
26
  when _retries && _retries > 0
29
27
  puts "Error. Retries remaining: #{options[:retries]}"
30
28
  sleep _sleep if _sleep
@@ -59,12 +57,12 @@ module Scrapey
59
57
  end
60
58
  end
61
59
 
62
- def visited? url
63
- @visited ||= []
64
- return true if @visited.include? url
65
- @visited << url
66
- false
67
- end
60
+ def visited? url
61
+ @visited ||= []
62
+ return true if @visited.include? url
63
+ @visited << url
64
+ false
65
+ end
68
66
 
69
67
  def ts
70
68
  Time.now.to_i.to_s
data/lib/scrapey/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Scrapey
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
data/lib/scrapey.rb CHANGED
@@ -3,8 +3,9 @@ require 'csv'
3
3
  require 'json'
4
4
  require 'yaml'
5
5
 
6
- require "scrapey/version"
7
6
  require "scrapey/scrapey"
7
+ require "scrapey/version"
8
+ require "scrapey/constants"
8
9
  require "scrapey/cache"
9
10
  require "scrapey/database"
10
11
 
@@ -15,7 +16,7 @@ include Scrapey
15
16
  @agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
16
17
 
17
18
  # default output file
18
- @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
19
+ @output = 'output.csv'
19
20
 
20
21
  # read config file
21
22
  config_file = "#{BASEDIR}/config/config.yml"
@@ -3,21 +3,17 @@
3
3
  email: foo@gmail.com
4
4
  password: 12345
5
5
 
6
- =begin
7
6
  # example database section
8
- database:
9
- adapter: mysql
10
- database: template
11
- username: root
12
- password: 12345
13
- host: localhost
14
- encoding: 'utf8'
15
- =end
7
+ #database:
8
+ # adapter: mysql
9
+ # database: template
10
+ # username: root
11
+ # password: 12345
12
+ # host: localhost
13
+ # encoding: 'utf8'
16
14
 
17
- =begin
18
15
  # example proxies section
19
- proxies:
20
- - www.host1.com:80
21
- - localhost:8080
22
- - 127.0.0.1:8888
23
- =end
16
+ #proxies:
17
+ #- www.host1.com:80
18
+ #- localhost:8080
19
+ #- 127.0.0.1:8888
@@ -0,0 +1,4 @@
1
+ require 'scrapey'
2
+
3
+ # customizations...
4
+ # @output = Time.now.strftime("#{BASEDIR}/Output/output_%Y_%m_%d_%H_%M_%S.csv")
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-24 00:00:00.000000000 Z
12
+ date: 2012-07-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &16919208 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,15 @@ dependencies:
21
21
  version: 2.5.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *16919208
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 2.5.0
25
30
  - !ruby/object:Gem::Dependency
26
31
  name: json
27
- requirement: &16115544 !ruby/object:Gem::Requirement
32
+ requirement: !ruby/object:Gem::Requirement
28
33
  none: false
29
34
  requirements:
30
35
  - - ~>
@@ -32,7 +37,12 @@ dependencies:
32
37
  version: 1.7.0
33
38
  type: :runtime
34
39
  prerelease: false
35
- version_requirements: *16115544
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 1.7.0
36
46
  description: A simple scraping framework
37
47
  email:
38
48
  - pguardiario@gmail.com
@@ -46,8 +56,11 @@ files:
46
56
  - README.md
47
57
  - Rakefile
48
58
  - bin/scrapey
59
+ - examples/multi.rb
49
60
  - lib/scrapey/cache.rb
61
+ - lib/scrapey/constants.rb
50
62
  - lib/scrapey/database.rb
63
+ - lib/scrapey/multi.rb
51
64
  - lib/scrapey/scrapey.rb
52
65
  - lib/scrapey/template.rb
53
66
  - lib/scrapey/version.rb
@@ -75,7 +88,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
75
88
  version: '0'
76
89
  requirements: []
77
90
  rubyforge_project:
78
- rubygems_version: 1.8.11
91
+ rubygems_version: 1.8.23
79
92
  signing_key:
80
93
  specification_version: 3
81
94
  summary: A simple scraping framework