reid 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/lib/reid.rb +5 -0
- data/lib/reid/reid.rb +31 -0
- data/lib/reid/requester.rb +43 -0
- metadata +60 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
OTBmYTkyYzZjYzU3OTZiYTJhYzViYTZkN2M4ZWMyZWJjODUzOGUzYw==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
YjhhZTBjZGQyOWU3M2VkNjU1ZjA3NGZhYWE1ZDgyN2I2NTNjYjFjOA==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
N2MyMzRjYTNkYzBhMTFkNWEwNGM0YzRmYWFmZTc4MzkyZWEyMWUyZWJjNGY4
|
10
|
+
Mzg1YzRhYWYwZThiYzk0ZWZkODI5Y2Q1Y2IxZGE4MmQwOTJmODc2Zjc1ZDNl
|
11
|
+
NjdmMjM4NjI4MWI0MGY1YzllZTVjMzViNDk0YThhM2JmMTE4ZmY=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
MTZkYzdkMmZhNTEyNDRiNjU3M2MwZjQzYTEyNDEzNzVmYjI1ZWY4Y2I5NmY5
|
14
|
+
NGE1MGI2NmY1ODZmNjhjNWIxMmVlNGU0OTY2ZGZjMmRjYmE0NjJiMzhmNDE0
|
15
|
+
YzhjMDgxODAxNTk4NjNlMzNjMmRmNTljOGJiZjUyYjM1ZjAxNmY=
|
data/lib/reid.rb
ADDED
data/lib/reid/reid.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
# Ties a Requester to a list of scraping operations.
#
# Each operation is indexed positionally: element 0 is a callable that
# receives the selected nodes and the record being built, element 1 is the
# selector string, and element 3 selects the query type (:xpath, anything
# else means CSS).
class Reid
  # @param requester_options [Hash] handed unchanged to Requester.new
  def initialize(requester_options = {})
    @requester = Requester.new(requester_options)
  end

  # Runs every operation against +doc+ and collects the results.
  #
  # @param doc [#xpath, #css] document to query
  # @param operations [#each] operations as described on the class
  # @return [Hash] the record the operation callables filled in
  def scrape_doc(doc, operations)
    operations.each_with_object({}) do |operation, record|
      use_xpath = (operation[3] == :xpath)
      handler = operation[0]
      selector = operation[1]
      nodes = use_xpath ? doc.xpath(selector) : doc.css(selector)
      handler.call(nodes, record)
    end
  end

  # Requests +url+ through the requester and scrapes the resulting document.
  #
  # @param url [String] address to request
  # @param operations [#each] operations as described on the class
  # @return [Hash] the scraped record
  def scrape_page(url, operations)
    scrape_doc(@requester.request(url), operations)
  end

  # Keeps asking +url_crawler+ for the next URL (passing the previously
  # fetched document, nil on the first call) until it returns a falsy value,
  # scraping each page and handing the record to +store_function+.
  #
  # @param url_crawler [#next] yields the next URL given the last document
  # @param operations [#each] operations as described on the class
  # @param store_function [#call] receives each scraped record
  # @return [nil]
  def crawl(url_crawler, operations, store_function)
    doc = nil
    # `while` rather than `loop`, so a StopIteration raised by the crawler
    # still propagates to the caller.
    while (url = url_crawler.next(doc))
      doc = @requester.request(url)
      record = scrape_doc(doc, operations)
      store_function.call(record)
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
# Fetches a URL and parses it with Nokogiri, optionally spacing out
# successive requests and retrying failures with a geometrically growing
# delay.
class Requester
  # @param options [Hash]
  # @option options [Numeric] :min_request_interval minimum number of seconds
  #   between first-attempt requests (throttling is off when omitted)
  # @option options [#insert] :error_log object whose #insert is called with
  #   a {:time, :message} hash for every failed attempt (off when omitted)
  # @option options [Numeric] :max_backoff_time once the pending retry delay
  #   reaches this value the request is abandoned (retries forever when
  #   omitted)
  # @option options [Numeric] :initial_delay seconds to wait before the first
  #   retry (default 1.0); the historical misspelling :intial_delay is still
  #   honoured
  # @option options [Numeric] :multiplicand factor applied to the delay after
  #   each failed attempt (default 1.3)
  def initialize(options = {})
    @min_request_interval = (options[:min_request_interval] || false)
    @error_log = (options[:error_log] || false)
    @max_backoff_time = (options[:max_backoff_time] || false)
    # Accept the correctly spelled key first, then the legacy typo.
    @initial_delay = (options[:initial_delay] || options[:intial_delay] || 1.0)
    @multiplicand = (options[:multiplicand] || 1.3)
    @previous_request = (@min_request_interval && Time.now)
  end

  # Requests +url+ and returns the parsed document, retrying on failure.
  #
  # @param url [String] address to fetch
  # @param time [Numeric, false] delay to use for the next retry; a truthy
  #   value marks the call as a retry and skips throttling
  # @return [Object] whatever Nokogiri::HTML returns for the response body
  # @raise [RuntimeError] when the retry delay reaches :max_backoff_time
  def request(url, time = false)
    damper if !time && @min_request_interval
    delay = time
    begin
      fetch(url)
    rescue StandardError => e
      # StandardError only: Interrupt/SystemExit must not be retried.
      log_error(e)
      if delay && @max_backoff_time && delay >= @max_backoff_time
        raise "Problem with request. Max backoff time exceided."
      end
      wait = delay || @initial_delay
      sleep wait
      delay = wait * @multiplicand
      # Bounded by :max_backoff_time when one is configured.
      retry
    end
  end

  # Sleeps for whatever remains of the minimum request interval, then stamps
  # the current time as the latest request.
  def damper
    # Measure once so the value compared and the value slept on agree and the
    # sleep argument can never be negative.
    remaining = @min_request_interval - (Time.now - @previous_request)
    sleep(remaining) if remaining > 0
    @previous_request = Time.now
  end

  private

  # Opens +url+ and parses the response, closing the stream afterwards.
  def fetch(url)
    parse = proc { |io| Nokogiri::HTML(io) }
    # open-uri stopped patching Kernel#open in Ruby 3.0; URI.open exists from
    # Ruby 2.5, so fall back to the bare call only on older interpreters.
    URI.respond_to?(:open) ? URI.open(url, &parse) : open(url, &parse)
  end

  # Records a failed attempt in the configured error log, if any.
  def log_error(error)
    return unless @error_log
    @error_log.insert({ :time => Time.now, :message => error.message })
  end
end
|
metadata
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: reid
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Dan Breczinski
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-07-23 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.5'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.5'
|
27
|
+
description: Reid is a simple tool for crawling web pages and throttling requests
|
28
|
+
email: pt2323@gmail.com
|
29
|
+
executables: []
|
30
|
+
extensions: []
|
31
|
+
extra_rdoc_files: []
|
32
|
+
files:
|
33
|
+
- lib/reid.rb
|
34
|
+
- lib/reid/reid.rb
|
35
|
+
- lib/reid/requester.rb
|
36
|
+
homepage: https://github.com/danpaul/reid
|
37
|
+
licenses:
|
38
|
+
- MIT (http://opensource.org/licenses/MIT)
|
39
|
+
metadata: {}
|
40
|
+
post_install_message:
|
41
|
+
rdoc_options: []
|
42
|
+
require_paths:
|
43
|
+
- lib
|
44
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ! '>='
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0'
|
49
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
requirements: []
|
55
|
+
rubyforge_project:
|
56
|
+
rubygems_version: 2.0.5
|
57
|
+
signing_key:
|
58
|
+
specification_version: 4
|
59
|
+
summary: Reid is a gem to help structure web scraping.
|
60
|
+
test_files: []
|