scrapey 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +16 -8
- data/examples/multi.rb +13 -0
- data/lib/scrapey/constants.rb +4 -0
- data/lib/scrapey/database.rb +0 -1
- data/lib/scrapey/multi.rb +25 -0
- data/lib/scrapey/scrapey.rb +8 -10
- data/lib/scrapey/version.rb +1 -1
- data/lib/scrapey.rb +3 -2
- data/template/config/config.yml +11 -15
- data/template/src/template.rb +4 -0
- metadata +20 -7
data/README.md
CHANGED
@@ -16,14 +16,22 @@ Or install it yourself as:
|
|
16
16
|
|
17
17
|
$ gem install scrapey
|
18
18
|
|
19
|
-
##
|
19
|
+
## Examples
|
20
20
|
|
21
|
-
|
21
|
+
### Concurrent downloads
|
22
22
|
|
23
|
-
|
23
|
+
```ruby
|
24
|
+
require 'scrapey'
|
25
|
+
require 'scrapey/multi'
|
24
26
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
27
|
+
fields 'url', 'title'
|
28
|
+
|
29
|
+
def scrape url, response
|
30
|
+
doc = Nokogiri::HTML response
|
31
|
+
@items << {'url' => url, 'title' => doc.at('title').text}
|
32
|
+
end
|
33
|
+
|
34
|
+
@items = []
|
35
|
+
multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], 3, :scrape
|
36
|
+
@items.each{|item| save item}
|
37
|
+
```
|
data/examples/multi.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'scrapey'
|
2
|
+
require 'scrapey/multi'
|
3
|
+
|
4
|
+
fields 'url', 'title'
|
5
|
+
|
6
|
+
def scrape url, response
|
7
|
+
doc = Nokogiri::HTML response
|
8
|
+
@items << {'url' => url, 'title' => doc.at('title').text}
|
9
|
+
end
|
10
|
+
|
11
|
+
@items = []
|
12
|
+
multi_get ['http://www.yahoo.com/', 'http://www.google.com.', 'http://www.bing.com/'], 3, :scrape
|
13
|
+
@items.each{|item| save item}
|
data/lib/scrapey/multi.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'em-http-request'
|
2
|
+
|
3
|
+
module Scrapey
|
4
|
+
def multi_get all_urls, num_threads = 20, callback = :save_cache
|
5
|
+
all_urls.each_slice(num_threads) do |urls|
|
6
|
+
next unless urls.size > 0
|
7
|
+
EventMachine.run do
|
8
|
+
multi = EventMachine::MultiRequest.new
|
9
|
+
urls.each_with_index do |url, i|
|
10
|
+
multi.add i, EventMachine::HttpRequest.new(url).get(:redirects => 10)
|
11
|
+
end
|
12
|
+
multi.callback do
|
13
|
+
(0...multi.requests.length).each do |i|
|
14
|
+
if multi.responses[:callback][i]
|
15
|
+
send callback, urls[i], multi.responses[:callback][i].response
|
16
|
+
else
|
17
|
+
puts "problem downloading #{urls[i]}!"
|
18
|
+
end
|
19
|
+
end
|
20
|
+
EventMachine.stop
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
data/lib/scrapey/scrapey.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
module Scrapey
|
2
|
-
BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
|
3
2
|
|
4
3
|
def get_or_post method, url, options={}, *args
|
5
4
|
agent = method == 'goto' ? @browser : @agent
|
@@ -13,7 +12,7 @@ module Scrapey
|
|
13
12
|
end
|
14
13
|
|
15
14
|
doc = load_cache(url) if @use_cache
|
16
|
-
|
15
|
+
return doc if doc
|
17
16
|
|
18
17
|
page = agent.send *new_args
|
19
18
|
save_cache(url, page.body) if @use_cache
|
@@ -23,8 +22,7 @@ module Scrapey
|
|
23
22
|
rescue Exception => e
|
24
23
|
case
|
25
24
|
when defined? on_error
|
26
|
-
on_error e
|
27
|
-
get_or_post method, url, options, *args
|
25
|
+
return on_error e, method, url, options, *args
|
28
26
|
when _retries && _retries > 0
|
29
27
|
puts "Error. Retries remaining: #{options[:retries]}"
|
30
28
|
sleep _sleep if _sleep
|
@@ -59,12 +57,12 @@ module Scrapey
|
|
59
57
|
end
|
60
58
|
end
|
61
59
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
60
|
+
def visited? url
|
61
|
+
@visited ||= []
|
62
|
+
return true if @visited.include? url
|
63
|
+
@visited << url
|
64
|
+
false
|
65
|
+
end
|
68
66
|
|
69
67
|
def ts
|
70
68
|
Time.now.to_i.to_s
|
data/lib/scrapey/version.rb
CHANGED
data/lib/scrapey.rb
CHANGED
@@ -3,8 +3,9 @@ require 'csv'
|
|
3
3
|
require 'json'
|
4
4
|
require 'yaml'
|
5
5
|
|
6
|
-
require "scrapey/version"
|
7
6
|
require "scrapey/scrapey"
|
7
|
+
require "scrapey/version"
|
8
|
+
require "scrapey/constants"
|
8
9
|
require "scrapey/cache"
|
9
10
|
require "scrapey/database"
|
10
11
|
|
@@ -15,7 +16,7 @@ include Scrapey
|
|
15
16
|
@agent.user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5'
|
16
17
|
|
17
18
|
# default output file
|
18
|
-
@output =
|
19
|
+
@output = 'output.csv'
|
19
20
|
|
20
21
|
# read config file
|
21
22
|
config_file = "#{BASEDIR}/config/config.yml"
|
data/template/config/config.yml
CHANGED
@@ -3,21 +3,17 @@
|
|
3
3
|
email: foo@gmail.com
|
4
4
|
password: 12345
|
5
5
|
|
6
|
-
=begin
|
7
6
|
# example database section
|
8
|
-
database:
|
9
|
-
adapter: mysql
|
10
|
-
database: template
|
11
|
-
username: root
|
12
|
-
password: 12345
|
13
|
-
host: localhost
|
14
|
-
encoding: 'utf8'
|
15
|
-
=end
|
7
|
+
#database:
|
8
|
+
# adapter: mysql
|
9
|
+
# database: template
|
10
|
+
# username: root
|
11
|
+
# password: 12345
|
12
|
+
# host: localhost
|
13
|
+
# encoding: 'utf8'
|
16
14
|
|
17
|
-
=begin
|
18
15
|
# example proxies section
|
19
|
-
proxies:
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
=end
|
16
|
+
#proxies:
|
17
|
+
#- www.host1.com:80
|
18
|
+
#- localhost:8080
|
19
|
+
#- 127.0.0.1:8888
|
data/template/src/template.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapey
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-07-
|
12
|
+
date: 2012-07-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,15 @@ dependencies:
|
|
21
21
|
version: 2.5.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 2.5.0
|
25
30
|
- !ruby/object:Gem::Dependency
|
26
31
|
name: json
|
27
|
-
requirement:
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
28
33
|
none: false
|
29
34
|
requirements:
|
30
35
|
- - ~>
|
@@ -32,7 +37,12 @@ dependencies:
|
|
32
37
|
version: 1.7.0
|
33
38
|
type: :runtime
|
34
39
|
prerelease: false
|
35
|
-
version_requirements:
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 1.7.0
|
36
46
|
description: A simple scraping framework
|
37
47
|
email:
|
38
48
|
- pguardiario@gmail.com
|
@@ -46,8 +56,11 @@ files:
|
|
46
56
|
- README.md
|
47
57
|
- Rakefile
|
48
58
|
- bin/scrapey
|
59
|
+
- examples/multi.rb
|
49
60
|
- lib/scrapey/cache.rb
|
61
|
+
- lib/scrapey/constants.rb
|
50
62
|
- lib/scrapey/database.rb
|
63
|
+
- lib/scrapey/multi.rb
|
51
64
|
- lib/scrapey/scrapey.rb
|
52
65
|
- lib/scrapey/template.rb
|
53
66
|
- lib/scrapey/version.rb
|
@@ -75,7 +88,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
75
88
|
version: '0'
|
76
89
|
requirements: []
|
77
90
|
rubyforge_project:
|
78
|
-
rubygems_version: 1.8.
|
91
|
+
rubygems_version: 1.8.23
|
79
92
|
signing_key:
|
80
93
|
specification_version: 3
|
81
94
|
summary: A simple scraping framework
|