hash_spidey 0.0.1 → 0.0.2

This diff reflects the changes between publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
@@ -1,112 +1,108 @@
  module HashSpidey
  module Strategies
-
  module HashStore

- def initialize(attrs = {})
- @url_collection = {}
- @error_collection = []
-
- super(attrs)
- end
+ def initialize(attrs = {})
+ @url_collection = {}
+ @error_collection = []

- #### process strategies
+ super(attrs)
+ end

+ #### process strategies

- ## conveinence methods
- def crawls
- @url_collection.select{|k,v| v.crawled?}
- end

+ ## conveinence methods
+ def crawls
+ @url_collection.select{|k,v| v.crawled?}
+ end

- def uncrawled
- @url_collection.reject{|k,v| v.crawled?}
- end

- def records
- @url_collection.select{|k,v| v.recorded?}
- end
+ def uncrawled
+ @url_collection.reject{|k,v| v.crawled?}
+ end

- def process_crawl(url, page)
- h_url = @url_collection[url]
- h_url.mark_as_crawled(page)
- end
+ def records
+ @url_collection.select{|k,v| v.recorded?}
+ end

+ def process_crawl(url, page)
+ h_url = @url_collection[url]
+ h_url.mark_as_crawled(page)
+ end

- def crawl(options = {})
- @crawl_started_at = Time.now
- @until = Time.now + options[:crawl_for] if options[:crawl_for]

- i = 0
- each_url do |url, handler, default_data|
- break if options[:max_urls] && i >= options[:max_urls]
- begin
- page = agent.get(url)
- Spidey.logger.info "Handling #{url.inspect}"
- process_crawl(url, page)
- send handler, page, default_data
- rescue => ex
- add_error url: url, handler: handler, error: ex
- end
- sleep request_interval if request_interval > 0
- i += 1
- end
- end
+ def crawl(options = {})
+ @crawl_started_at = Time.now
+ @until = Time.now + options[:crawl_for] if options[:crawl_for]
+
+ i = 0
+ each_url do |url, handler, default_data|
+ break if options[:max_urls] && i >= options[:max_urls]
+ begin
+ page = agent.get(url)
+ Spidey.logger.info "Handling #{url.inspect}"
+ process_crawl(url, page)
+ send handler, page, default_data
+ rescue => ex
+ add_error url: url, handler: handler, error: ex
+ end
+ sleep request_interval if request_interval > 0
+ i += 1
+ end
+ end


- def handle(url, handler, handle_data = {})
- Spidey.logger.info "Queueing #{url.inspect[0..200]}..."
+ def handle(url, handler, handle_data = {})
+ Spidey.logger.info "Queueing #{url.inspect[0..200]}..."

- spider_name = self.class.name
- @url_collection[url] ||= HashUrlRecord.spidey_handle( url, handler, spider_name, handle_data )
- end
+ spider_name = self.class.name
+ @url_collection[url] ||= HashUrlRecord.spidey_handle( url, handler, spider_name, handle_data )
+ end

- # expects @url_collection to have :url, but if not, creates new HashUrlRecord
- def record(data_hashie)
- url = data_hashie.url
- h_url = @url_collection[url] || HashUrlRecord.new(url)
+ # expects @url_collection to have :url, but if not, creates new HashUrlRecord
+ def record(data_hashie)
+ url = data_hashie.url
+ h_url = @url_collection[url] || HashUrlRecord.new(url)

- # set the content and record_timestamp of the HashUrlRecord
- h_url.record_content(data_hashie.content)
+ # set the content and record_timestamp of the HashUrlRecord
+ h_url.record_content(data_hashie.content)

- # reassign, update collection
- @url_collection[url] = h_url
- end
+ # reassign, update collection
+ @url_collection[url] = h_url
+ end


- # wrapper around #record
- def record_page(page, default_data={})
- msh = Hashie::Mash.new(default_data)
- msh.url = page.uri.to_s
- msh.content = page.content
+ # wrapper around #record
+ def record_page(page, default_data={})
+ msh = Hashie::Mash.new(default_data)
+ msh.url = page.uri.to_s
+ msh.content = page.content

- record(msh)
- end
+ record(msh)
+ end

- def each_url(&block)
- while h_url = get_next_url_hash
- yield h_url.url, h_url.handler, h_url.handle_data
- end
- end
+ def each_url(&block)
+ while h_url = get_next_url_hash
+ yield h_url.url, h_url.handler, h_url.handle_data
+ end
+ end

- protected
+ protected

- def add_error(attrs)
- @error_collection << attrs
- Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
- end
+ def add_error(attrs)
+ @error_collection << attrs
+ Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
+ end


  private

  def get_next_url_hash
  return nil if (@until && Time.now >= @until) # exceeded time bound
-
  # uncrawled is a filtered collection
  uncrawled.values.first
  end
-
-
  end
  end
  end
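
The hunk above is the HashStore strategy module, whose public surface is #handle, #crawl, and the #crawls/#uncrawled/#records filters, with #record_page as the usual way to persist a fetched page. As a rough usage sketch (not part of this diff), a spider mixing in the strategy might be driven as follows; the Spidey::AbstractSpider base class (from the spidey gem) and the :process_page handler name are assumptions for illustration.

    require 'hash_spidey'

    class ExampleSpider < Spidey::AbstractSpider     # assumed base class from the spidey gem
      include HashSpidey::Strategies::HashStore

      # Handlers are invoked as `send handler, page, default_data`.
      def process_page(page, default_data = {})
        record_page(page, default_data)              # stores the page content keyed by URL
      end
    end

    spider = ExampleSpider.new(request_interval: 0)
    spider.handle "http://www.example.com/", :process_page
    spider.crawl max_urls: 10                        # or crawl_for: 60 to bound the run in seconds

    spider.crawls.keys      # URLs already fetched
    spider.uncrawled.keys   # URLs still queued
    spider.records.keys     # URLs whose content has been recorded
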
@@ -1,3 +1,3 @@
  module HashSpidey
- VERSION = "0.0.1"
+ VERSION = "0.0.2"
  end
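
The only change in this file is the version constant. Consumers picking up the new release would pin it in a Gemfile along these lines (illustrative, not part of this diff):

    gem 'hash_spidey', '0.0.2'
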
@@ -16,13 +16,13 @@ describe HashSpidey::Strategies::HashStore do
  end

  end
-
+
  context 'generic #handle' do

  before(:each) do
  FakeWeb.register_uri(:get, "http://www.example.com/", :body => "Hello World", code: 200,
  "content-type"=>"text/html; charset=UTF-8"
- )
+ )
  @spider = TestSpider.new request_interval: 0
  @spider.handle "http://www.example.com/", :process_size
  @spider.crawl
@@ -36,21 +36,20 @@ describe HashSpidey::Strategies::HashStore do

  it 'should update #crawled_timestamp' do
  @crawled_url = @spider.crawls.values.first
- expect( @crawled_url.url ).to eq 'http://www.example.com/'
- expect( @crawled_url.crawled_timestamp > @crawled_url.initialized_timestamp).to be_true
+ expect( @crawled_url.url ).to eq 'http://www.example.com/'
+ expect( @crawled_url.crawled_timestamp > @crawled_url.initialized_timestamp).to be_true
  end

  it 'should have #crawls act as a Hash' do
  expect( @spider.crawls['http://www.example.com/'].url).to eq 'http://www.example.com/'
  end
-
+
  it "should not add duplicate URLs" do
- @spider.handle "http://www.example.com/", :process_something_else # second time
- expect( @spider.crawls.count ).to eq 1
+ @spider.handle "http://www.example.com/", :process_something_else # second time
+ expect( @spider.crawls.count ).to eq 1
  end
-
- context '@crawl_record' do

+ context '@crawl_record' do
  before(:each) do
  @crawled_url = @spider.crawls["http://www.example.com/"]
  end
@@ -60,13 +59,10 @@ describe HashSpidey::Strategies::HashStore do
  end

  it 'should respond to header#content-type' do
- expect(@crawled_url.header['content-type']).to eq "text/html; charset=UTF-8"
+ expect(@crawled_url.crawleheader['content-type']).to eq "text/html; charset=UTF-8"
  end
  end
  end
-
-
-
  end

@@ -91,8 +87,4 @@ describe HashSpidey::Strategies::HashStore do
  end
  end
  end
-
-
-
-
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: hash_spidey
  version: !ruby/object:Gem::Version
- version: 0.0.1
+ version: 0.0.2
  prerelease:
  platform: ruby
  authors: