reid 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ OTBmYTkyYzZjYzU3OTZiYTJhYzViYTZkN2M4ZWMyZWJjODUzOGUzYw==
5
+ data.tar.gz: !binary |-
6
+ YjhhZTBjZGQyOWU3M2VkNjU1ZjA3NGZhYWE1ZDgyN2I2NTNjYjFjOA==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ N2MyMzRjYTNkYzBhMTFkNWEwNGM0YzRmYWFmZTc4MzkyZWEyMWUyZWJjNGY4
10
+ Mzg1YzRhYWYwZThiYzk0ZWZkODI5Y2Q1Y2IxZGE4MmQwOTJmODc2Zjc1ZDNl
11
+ NjdmMjM4NjI4MWI0MGY1YzllZTVjMzViNDk0YThhM2JmMTE4ZmY=
12
+ data.tar.gz: !binary |-
13
+ MTZkYzdkMmZhNTEyNDRiNjU3M2MwZjQzYTEyNDEzNzVmYjI1ZWY4Y2I5NmY5
14
+ NGE1MGI2NmY1ODZmNjhjNWIxMmVlNGU0OTY2ZGZjMmRjYmE0NjJiMzhmNDE0
15
+ YzhjMDgxODAxNTk4NjNlMzNjMmRmNTljOGJiZjUyYjM1ZjAxNmY=
@@ -0,0 +1,5 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+
4
+ require 'reid/reid'
5
+ require 'reid/requester.rb'
@@ -0,0 +1,31 @@
1
+ require 'nokogiri'
2
+
3
# Drives scraping: fetches documents through a Requester and runs a list of
# extraction operations against each one.
class Reid
  # requester_options - Hash passed unchanged to Requester.new.
  def initialize(requester_options = {})
    @requester = Requester.new(requester_options)
  end

  # Runs every operation against +doc+ and returns the Hash they filled in.
  #
  # doc        - object responding to #xpath and #css.
  # operations - enumerable of indexable entries: entry[0] is a callable
  #              invoked as call(nodes, record), entry[1] is the selector,
  #              and entry[3] == :xpath selects an XPath lookup (anything
  #              else means a CSS lookup). entry[2] is not read here.
  #
  # Returns the record Hash shared by all operations.
  def scrape_doc(doc, operations)
    operations.each_with_object({}) do |operation, record|
      handler = operation[0]
      selector = operation[1]
      nodes = operation[3] == :xpath ? doc.xpath(selector) : doc.css(selector)
      handler.call(nodes, record)
    end
  end

  # Requests +url+ through the requester and scrapes the resulting document.
  def scrape_page(url, operations)
    scrape_doc(@requester.request(url), operations)
  end

  # Repeatedly asks +url_crawler+ for the next URL (handing it the previously
  # fetched document, nil on the first call), scrapes that page and passes the
  # record to +store_function+. Stops when the crawler returns nil or false.
  #
  # Returns nil.
  def crawl(url_crawler, operations, store_function)
    doc = nil
    url = url_crawler.next(doc)
    while url
      doc = @requester.request(url)
      store_function.call(scrape_doc(doc, operations))
      url = url_crawler.next(doc)
    end
  end
end
@@ -0,0 +1,43 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+
4
# Fetches URLs and parses them with Nokogiri, optionally throttling requests
# and retrying failed ones with an exponentially growing delay.
class Requester
  # options - Hash of settings (all optional):
  #   :min_request_interval - minimum seconds between two requests; throttling
  #                           is disabled when absent.
  #   :error_log            - object responding to insert(hash); receives a
  #                           {:time, :message} record for every failed attempt.
  #   :max_backoff_time     - once the pending retry delay reaches this many
  #                           seconds the request gives up and raises. When
  #                           absent, retries continue indefinitely.
  #   :initial_delay        - seconds to wait before the first retry
  #                           (default 1.0). The historical misspelling
  #                           :intial_delay is still honoured.
  #   :multiplicand         - factor applied to the delay after every retry
  #                           (default 1.3).
  def initialize(options = {})
    @min_request_interval = options[:min_request_interval] || false
    @error_log = options[:error_log] || false
    @max_backoff_time = options[:max_backoff_time] || false
    # Accept the correctly spelled key first, then the legacy typo.
    @initial_delay = options[:initial_delay] || options[:intial_delay] || 1.0
    @multiplicand = options[:multiplicand] || 1.3
    @previous_request = @min_request_interval && Time.now
  end

  # Fetches +url+ and returns the parsed document.
  #
  # url  - the address to fetch.
  # time - delay (seconds) to use for the next retry; false (the default)
  #        marks a fresh request, which is throttled via #damper and starts
  #        its backoff at the configured initial delay.
  #
  # Raises RuntimeError when the retry delay has reached :max_backoff_time.
  # Only StandardError failures are retried, so Interrupt and SystemExit
  # still stop the process.
  def request(url, time = false)
    damper if !time && @min_request_interval
    delay = time
    begin
      fetch(url)
    rescue StandardError => e
      log_error(e)
      if delay
        if @max_backoff_time && delay >= @max_backoff_time
          raise "Problem with request. Max backoff time exceided."
        end
      else
        delay = @initial_delay
      end
      sleep delay
      delay *= @multiplicand
      retry
    end
  end

  # Sleeps for whatever remains of the minimum request interval, then records
  # the current time as the moment of the latest request. Does nothing when
  # no minimum interval is configured.
  def damper
    return unless @min_request_interval
    # Sample the clock once so the sleep argument can never turn negative.
    elapsed = Time.now - @previous_request
    sleep(@min_request_interval - elapsed) if elapsed < @min_request_interval
    @previous_request = Time.now
  end

  private

  # Opens +url+ and parses the response; the block form closes the stream.
  # URI.open is preferred because Kernel#open no longer handles URLs on
  # Ruby 3.0+; older Rubies without a public URI.open fall back to open.
  def fetch(url)
    parse = lambda { |io| Nokogiri::HTML(io) }
    if URI.respond_to?(:open)
      URI.open(url, &parse)
    else
      open(url, &parse)
    end
  end

  # Records a failed attempt in the error log, when one is configured.
  def log_error(error)
    return unless @error_log
    @error_log.insert({ :time => Time.now, :message => error.message })
  end
end
metadata ADDED
@@ -0,0 +1,60 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: reid
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Dan Breczinski
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-07-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.5'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.5'
27
+ description: Reid is a simple tool for crawling web pages and throttling requests
28
+ email: pt2323@gmail.com
29
+ executables: []
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - lib/reid.rb
34
+ - lib/reid/reid.rb
35
+ - lib/reid/requester.rb
36
+ homepage: https://github.com/danpaul/reid
37
+ licenses:
38
+ - MIT (http://opensource.org/licenses/MIT)
39
+ metadata: {}
40
+ post_install_message:
41
+ rdoc_options: []
42
+ require_paths:
43
+ - lib
44
+ required_ruby_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ! '>='
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ required_rubygems_version: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ requirements: []
55
+ rubyforge_project:
56
+ rubygems_version: 2.0.5
57
+ signing_key:
58
+ specification_version: 4
59
+ summary: Reid is a gem to help structure web scraping.
60
+ test_files: []