reid 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ OTBmYTkyYzZjYzU3OTZiYTJhYzViYTZkN2M4ZWMyZWJjODUzOGUzYw==
5
+ data.tar.gz: !binary |-
6
+ YjhhZTBjZGQyOWU3M2VkNjU1ZjA3NGZhYWE1ZDgyN2I2NTNjYjFjOA==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ N2MyMzRjYTNkYzBhMTFkNWEwNGM0YzRmYWFmZTc4MzkyZWEyMWUyZWJjNGY4
10
+ Mzg1YzRhYWYwZThiYzk0ZWZkODI5Y2Q1Y2IxZGE4MmQwOTJmODc2Zjc1ZDNl
11
+ NjdmMjM4NjI4MWI0MGY1YzllZTVjMzViNDk0YThhM2JmMTE4ZmY=
12
+ data.tar.gz: !binary |-
13
+ MTZkYzdkMmZhNTEyNDRiNjU3M2MwZjQzYTEyNDEzNzVmYjI1ZWY4Y2I5NmY5
14
+ NGE1MGI2NmY1ODZmNjhjNWIxMmVlNGU0OTY2ZGZjMmRjYmE0NjJiMzhmNDE0
15
+ YzhjMDgxODAxNTk4NjNlMzNjMmRmNTljOGJiZjUyYjM1ZjAxNmY=
@@ -0,0 +1,5 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+
4
+ require 'reid/reid'
5
+ require 'reid/requester.rb'
@@ -0,0 +1,31 @@
1
+ require 'nokogiri'
2
+
3
# Runs user-supplied scraping operations against parsed documents, fetching
# the documents through a Requester.
class Reid
  # @param requester_options [Hash] handed unchanged to Requester.new.
  def initialize(requester_options = {})
    @requester = Requester.new(requester_options)
  end

  # Applies every operation to +doc+ and collects the results in one record.
  #
  # Each operation is an indexable entry read as follows:
  #   [0] a callable invoked as call(nodes, record); it is expected to write
  #       whatever it extracts into +record+
  #   [1] the selector string
  #   [3] :xpath to evaluate the selector with doc.xpath; any other value
  #       (including a missing entry) uses doc.css
  # Index 2 is not read by this class.
  #
  # @param doc [#xpath, #css] document to query
  # @param operations [#each] collection of operation entries
  # @return [Hash] the record that was passed to every operation
  def scrape_doc(doc, operations)
    record = {}
    operations.each do |operation|
      selector = operation[1]
      nodes = operation[3] == :xpath ? doc.xpath(selector) : doc.css(selector)
      operation[0].call(nodes, record)
    end
    record
  end

  # Fetches +url+ through the requester and scrapes the resulting document.
  #
  # @param url [String] address passed to the requester
  # @param operations [#each] see #scrape_doc
  # @return [Hash] the scraped record
  def scrape_page(url, operations)
    scrape_doc(@requester.request(url), operations)
  end

  # Walks a sequence of pages, scraping each one and handing the record to
  # +store_function+.
  #
  # +url_crawler.next+ is called with the previously fetched document (nil on
  # the first call) and must return the next URL, or a falsy value to stop.
  #
  # @param url_crawler [#next] produces the next URL from the previous document
  # @param operations [#each] see #scrape_doc
  # @param store_function [#call] receives each scraped record
  # @return [nil]
  def crawl(url_crawler, operations, store_function)
    doc = nil
    while (url = url_crawler.next(doc))
      doc = @requester.request(url)
      store_function.call(scrape_doc(doc, operations))
    end
  end
end
@@ -0,0 +1,43 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+
4
# Fetches pages and parses them with Nokogiri, optionally spacing requests
# out and retrying failed requests with a growing delay.
class Requester
  # @param options [Hash]
  # @option options [Numeric] :min_request_interval minimum number of seconds
  #   enforced by #damper between requests; throttling is off when omitted
  # @option options [#insert] :error_log object that receives
  #   insert(:time => Time, :message => String) for every failed attempt;
  #   nothing is logged when omitted
  # @option options [Numeric] :max_backoff_time once the next retry delay
  #   reaches this value #request gives up; retries never stop when omitted
  # @option options [Numeric] :initial_delay seconds to wait after the first
  #   failure (default 1.0). The misspelled key :intial_delay, which was the
  #   only one earlier code read, is still honoured.
  # @option options [Numeric] :multiplicand factor applied to the delay after
  #   every failed attempt (default 1.3)
  def initialize(options = {})
    @min_request_interval = options[:min_request_interval] || false
    @error_log = options[:error_log] || false
    @max_backoff_time = options[:max_backoff_time] || false
    @initial_delay = options[:initial_delay] || options[:intial_delay] || 1.0
    @multiplicand = options[:multiplicand] || 1.3
    @previous_request = @min_request_interval && Time.now
  end

  # Fetches and parses +url+, retrying on failure.
  #
  # After a failure the method sleeps and tries again; each wait is the
  # previous one multiplied by the configured multiplicand. The
  # max_backoff_time limit is checked against the delay about to be used, and
  # is not applied to the very first wait of a call made without +time+.
  #
  # @param url [String] address to fetch
  # @param time [Numeric, false] delay to use for the next retry; callers
  #   normally leave this at its default. Throttling via #damper only happens
  #   when it is falsy.
  # @return [Object] the parsed document
  # @raise [RuntimeError] when the retry delay has reached max_backoff_time
  def request(url, time = false)
    damper if !time && @min_request_interval
    delay = time
    begin
      fetch(url)
    rescue StandardError => e
      # StandardError only: Interrupt, SystemExit and similar must reach the
      # caller instead of being retried.
      log_error(e)
      if delay && @max_backoff_time && delay >= @max_backoff_time
        raise "Problem with request. Max backoff time exceided."
      end
      pause = delay || @initial_delay
      sleep pause
      delay = pause * @multiplicand
      # Retrying in place keeps the stack flat however many attempts it takes.
      retry
    end
  end

  # Sleeps for whatever remains of min_request_interval since the previous
  # request, then records the current time as the new previous request.
  # Does nothing when no interval is configured.
  def damper
    return unless @min_request_interval

    # Sample the clock once so the sleep argument cannot become negative.
    elapsed = Time.now - @previous_request
    sleep(@min_request_interval - elapsed) if elapsed < @min_request_interval
    @previous_request = Time.now
  end

  private

  # Opens +url+ and parses the response, closing the stream afterwards.
  # URI.open is preferred because Kernel#open stopped handling URLs in
  # Ruby 3.0; the bare call is kept for rubies that predate URI.open.
  def fetch(url)
    if URI.respond_to?(:open)
      URI.open(url) { |io| Nokogiri::HTML(io) }
    else
      open(url) { |io| Nokogiri::HTML(io) }
    end
  end

  # Writes a failed attempt to the configured error log, if there is one.
  def log_error(error)
    return unless @error_log

    @error_log.insert(:time => Time.now, :message => error.message)
  end
end
metadata ADDED
@@ -0,0 +1,60 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: reid
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Dan Breczinski
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-07-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.5'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.5'
27
+ description: Reid is a simple tool for crawling web pages and throttling requests
28
+ email: pt2323@gmail.com
29
+ executables: []
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - lib/reid.rb
34
+ - lib/reid/reid.rb
35
+ - lib/reid/requester.rb
36
+ homepage: https://github.com/danpaul/reid
37
+ licenses:
38
+ - MIT (http://opensource.org/licenses/MIT)
39
+ metadata: {}
40
+ post_install_message:
41
+ rdoc_options: []
42
+ require_paths:
43
+ - lib
44
+ required_ruby_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ! '>='
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ required_rubygems_version: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ requirements: []
55
+ rubyforge_project:
56
+ rubygems_version: 2.0.5
57
+ signing_key:
58
+ specification_version: 4
59
+ summary: Reid is a gem to help structure web scraping.
60
+ test_files: []