reid 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/lib/reid.rb +5 -0
- data/lib/reid/reid.rb +31 -0
- data/lib/reid/requester.rb +43 -0
- metadata +60 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
OTBmYTkyYzZjYzU3OTZiYTJhYzViYTZkN2M4ZWMyZWJjODUzOGUzYw==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
YjhhZTBjZGQyOWU3M2VkNjU1ZjA3NGZhYWE1ZDgyN2I2NTNjYjFjOA==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
N2MyMzRjYTNkYzBhMTFkNWEwNGM0YzRmYWFmZTc4MzkyZWEyMWUyZWJjNGY4
|
10
|
+
Mzg1YzRhYWYwZThiYzk0ZWZkODI5Y2Q1Y2IxZGE4MmQwOTJmODc2Zjc1ZDNl
|
11
|
+
NjdmMjM4NjI4MWI0MGY1YzllZTVjMzViNDk0YThhM2JmMTE4ZmY=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
MTZkYzdkMmZhNTEyNDRiNjU3M2MwZjQzYTEyNDEzNzVmYjI1ZWY4Y2I5NmY5
|
14
|
+
NGE1MGI2NmY1ODZmNjhjNWIxMmVlNGU0OTY2ZGZjMmRjYmE0NjJiMzhmNDE0
|
15
|
+
YzhjMDgxODAxNTk4NjNlMzNjMmRmNTljOGJiZjUyYjM1ZjAxNmY=
|
data/lib/reid.rb
ADDED
data/lib/reid/reid.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
# Ties a Requester (which fetches documents) to a list of scraping
# operations that pull values out of each fetched document.
class Reid
  # requester_options - Hash handed unchanged to Requester.new.
  def initialize(requester_options = {})
    @requester = Requester.new(requester_options)
  end

  # Runs every operation against +doc+ and returns the record they fill in.
  #
  # doc        - object responding to #xpath and #css.
  # operations - collection of indexable entries: index 0 is a callable
  #              invoked with (matched_nodes, record), index 1 is the selector,
  #              and index 3 picks the query type (:xpath uses doc.xpath,
  #              anything else uses doc.css). Index 2 is not read here.
  #
  # Returns the Hash that the callables were given to populate.
  def scrape_doc(doc, operations)
    record = {}
    operations.each do |operation|
      # Same receiver and arguments either way; only the query method differs.
      if operation[3] == :xpath
        operation[0].call(doc.xpath(operation[1]), record)
      else
        operation[0].call(doc.css(operation[1]), record)
      end
    end
    record
  end

  # Fetches +url+ through the requester and scrapes the resulting document.
  #
  # Returns the record produced by #scrape_doc.
  def scrape_page(url, operations)
    fetched = @requester.request(url)
    scrape_doc(fetched, operations)
  end

  # Repeatedly asks +url_crawler+ for the next URL, passing it the previously
  # fetched document (nil on the first call), until it returns a falsy value.
  # Each scraped record is handed to +store_function+ via #call.
  #
  # Returns nil.
  def crawl(url_crawler, operations, store_function)
    page = nil
    while (next_url = url_crawler.next(page))
      page = @requester.request(next_url)
      store_function.call(scrape_doc(page, operations))
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
# Fetches a URL and parses it with Nokogiri::HTML, optionally enforcing a
# minimum interval between requests and retrying failed requests with a
# delay that grows by a constant factor after every failure.
class Requester
  # options - Hash of settings (all optional):
  #   :min_request_interval - minimum number of seconds between the start of
  #                           two requests; falsy disables throttling.
  #   :error_log            - object whose #insert(record) is called with
  #                           {:time, :message} for every failed attempt.
  #   :max_backoff_time     - once the pending retry delay reaches this value
  #                           the request gives up; falsy retries forever.
  #   :initial_delay        - seconds to wait before the first retry (1.0).
  #                           The historical misspelling :intial_delay is
  #                           still honoured for backward compatibility.
  #   :multiplicand         - factor applied to the delay per retry (1.3).
  def initialize(options = {})
    @min_request_interval = options[:min_request_interval] || false
    @error_log = options[:error_log] || false
    @max_backoff_time = options[:max_backoff_time] || false
    @initial_delay = options[:initial_delay] || options[:intial_delay] || 1.0
    @multiplicand = options[:multiplicand] || 1.3
    # The throttle clock starts at construction time when throttling is on.
    @previous_request = @min_request_interval && Time.now
  end

  # Fetches +url+ and returns the parsed document, retrying on failure.
  #
  # url  - String (or URI) to open.
  # time - retry delay to resume from; when falsy (the default) the request
  #        is throttled via #damper and retries start at the initial delay.
  #
  # Returns whatever Nokogiri::HTML returns for the fetched content.
  # Raises RuntimeError once the retry delay reaches :max_backoff_time.
  def request(url, time = false)
    damper if !time && @min_request_interval

    delay = time
    # Iterative rather than recursive so an uncapped retry sequence cannot
    # grow the call stack.
    loop do
      begin
        return fetch_document(url)
      rescue StandardError => e
        # StandardError only: Interrupt/SystemExit must still stop the process.
        log_error(e)
        delay = wait_before_retry(delay)
      end
    end
  end

  # Sleeps for whatever remains of the minimum request interval, then records
  # the current time as the moment of the latest request. Does nothing when
  # no minimum interval is configured.
  def damper
    return unless @min_request_interval

    # Elapsed time is sampled once so the sleep argument can never go negative.
    remaining = @min_request_interval - (Time.now - @previous_request)
    sleep(remaining) if remaining > 0
    @previous_request = Time.now
  end

  private

  # Opens +url+ and parses it, closing the underlying IO afterwards.
  # NOTE(review): strings that are not scheme://... URLs fall through to
  # Kernel#open, which treats a leading "|" as a command to run - do not pass
  # untrusted, non-URL strings here.
  def fetch_document(url)
    parser = lambda { |io| Nokogiri::HTML(io) }
    if URI.respond_to?(:open)
      URI.open(url, &parser)
    else
      # Older open-uri versions only patch Kernel#open.
      open(url, &parser)
    end
  end

  # Records a failed attempt in the error log, if one was configured.
  def log_error(error)
    return unless @error_log

    @error_log.insert(:time => Time.now, :message => error.message)
  end

  # Sleeps for the current retry delay and returns the delay to use next time.
  # Raises RuntimeError when the delay has reached the configured maximum.
  def wait_before_retry(delay)
    unless delay
      sleep @initial_delay
      return @initial_delay * @multiplicand
    end

    if @max_backoff_time && delay >= @max_backoff_time
      raise "Problem with request. Max backoff time exceided."
    end

    sleep delay
    delay * @multiplicand
  end
end
|
metadata
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: reid
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Dan Breczinski
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-07-23 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.5'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.5'
|
27
|
+
description: Reid is a simple tool for crawling web pages and throttling requests
|
28
|
+
email: pt2323@gmail.com
|
29
|
+
executables: []
|
30
|
+
extensions: []
|
31
|
+
extra_rdoc_files: []
|
32
|
+
files:
|
33
|
+
- lib/reid.rb
|
34
|
+
- lib/reid/reid.rb
|
35
|
+
- lib/reid/requester.rb
|
36
|
+
homepage: https://github.com/danpaul/reid
|
37
|
+
licenses:
|
38
|
+
- MIT (http://opensource.org/licenses/MIT)
|
39
|
+
metadata: {}
|
40
|
+
post_install_message:
|
41
|
+
rdoc_options: []
|
42
|
+
require_paths:
|
43
|
+
- lib
|
44
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ! '>='
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0'
|
49
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
requirements: []
|
55
|
+
rubyforge_project:
|
56
|
+
rubygems_version: 2.0.5
|
57
|
+
signing_key:
|
58
|
+
specification_version: 4
|
59
|
+
summary: Reid is a gem to help structure web scraping.
|
60
|
+
test_files: []
|