hash_spidey 0.0.1 → 0.0.2

@@ -1,112 +1,108 @@
  module HashSpidey
  module Strategies
-
  module HashStore

- def initialize(attrs = {})
- @url_collection = {}
- @error_collection = []
-
- super(attrs)
- end
+ def initialize(attrs = {})
+ @url_collection = {}
+ @error_collection = []

- #### process strategies
+ super(attrs)
+ end

+ #### process strategies

- ## conveinence methods
- def crawls
- @url_collection.select{|k,v| v.crawled?}
- end

+ ## conveinence methods
+ def crawls
+ @url_collection.select{|k,v| v.crawled?}
+ end

- def uncrawled
- @url_collection.reject{|k,v| v.crawled?}
- end

- def records
- @url_collection.select{|k,v| v.recorded?}
- end
+ def uncrawled
+ @url_collection.reject{|k,v| v.crawled?}
+ end

- def process_crawl(url, page)
- h_url = @url_collection[url]
- h_url.mark_as_crawled(page)
- end
+ def records
+ @url_collection.select{|k,v| v.recorded?}
+ end

+ def process_crawl(url, page)
+ h_url = @url_collection[url]
+ h_url.mark_as_crawled(page)
+ end

- def crawl(options = {})
- @crawl_started_at = Time.now
- @until = Time.now + options[:crawl_for] if options[:crawl_for]

- i = 0
- each_url do |url, handler, default_data|
- break if options[:max_urls] && i >= options[:max_urls]
- begin
- page = agent.get(url)
- Spidey.logger.info "Handling #{url.inspect}"
- process_crawl(url, page)
- send handler, page, default_data
- rescue => ex
- add_error url: url, handler: handler, error: ex
- end
- sleep request_interval if request_interval > 0
- i += 1
- end
- end
+ def crawl(options = {})
+ @crawl_started_at = Time.now
+ @until = Time.now + options[:crawl_for] if options[:crawl_for]
+
+ i = 0
+ each_url do |url, handler, default_data|
+ break if options[:max_urls] && i >= options[:max_urls]
+ begin
+ page = agent.get(url)
+ Spidey.logger.info "Handling #{url.inspect}"
+ process_crawl(url, page)
+ send handler, page, default_data
+ rescue => ex
+ add_error url: url, handler: handler, error: ex
+ end
+ sleep request_interval if request_interval > 0
+ i += 1
+ end
+ end


- def handle(url, handler, handle_data = {})
- Spidey.logger.info "Queueing #{url.inspect[0..200]}..."
+ def handle(url, handler, handle_data = {})
+ Spidey.logger.info "Queueing #{url.inspect[0..200]}..."

- spider_name = self.class.name
- @url_collection[url] ||= HashUrlRecord.spidey_handle( url, handler, spider_name, handle_data )
- end
+ spider_name = self.class.name
+ @url_collection[url] ||= HashUrlRecord.spidey_handle( url, handler, spider_name, handle_data )
+ end

- # expects @url_collection to have :url, but if not, creates new HashUrlRecord
- def record(data_hashie)
- url = data_hashie.url
- h_url = @url_collection[url] || HashUrlRecord.new(url)
+ # expects @url_collection to have :url, but if not, creates new HashUrlRecord
+ def record(data_hashie)
+ url = data_hashie.url
+ h_url = @url_collection[url] || HashUrlRecord.new(url)

- # set the content and record_timestamp of the HashUrlRecord
- h_url.record_content(data_hashie.content)
+ # set the content and record_timestamp of the HashUrlRecord
+ h_url.record_content(data_hashie.content)

- # reassign, update collection
- @url_collection[url] = h_url
- end
+ # reassign, update collection
+ @url_collection[url] = h_url
+ end


- # wrapper around #record
- def record_page(page, default_data={})
- msh = Hashie::Mash.new(default_data)
- msh.url = page.uri.to_s
- msh.content = page.content
+ # wrapper around #record
+ def record_page(page, default_data={})
+ msh = Hashie::Mash.new(default_data)
+ msh.url = page.uri.to_s
+ msh.content = page.content

- record(msh)
- end
+ record(msh)
+ end

- def each_url(&block)
- while h_url = get_next_url_hash
- yield h_url.url, h_url.handler, h_url.handle_data
- end
- end
+ def each_url(&block)
+ while h_url = get_next_url_hash
+ yield h_url.url, h_url.handler, h_url.handle_data
+ end
+ end

- protected
+ protected

- def add_error(attrs)
- @error_collection << attrs
- Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
- end
+ def add_error(attrs)
+ @error_collection << attrs
+ Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
+ end


  private

  def get_next_url_hash
  return nil if (@until && Time.now >= @until) # exceeded time bound
-
  # uncrawled is a filtered collection
  uncrawled.values.first
  end
-
-
  end
  end
  end
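
The HashStore strategy above is only re-indented and de-blanked in this release; its public surface (#initialize, #handle, #crawl, #crawls, #uncrawled, #records, #record, #record_page) is unchanged. For orientation, a minimal usage sketch follows. It assumes the spidey gem's Spidey::AbstractSpider base class (which the strategy's super(attrs) call implies) and an illustrative ExampleSpider, handler name, and URL that are not part of the gem.

require 'hash_spidey'

# Hypothetical spider for illustration only.
class ExampleSpider < Spidey::AbstractSpider
  include HashSpidey::Strategies::HashStore

  # #crawl dispatches via `send handler, page, default_data`,
  # so handlers take the fetched page plus a data hash.
  def process_index(page, default_data = {})
    record_page(page, default_data)  # stores the page's url and content via #record
  end
end

spider = ExampleSpider.new request_interval: 0
spider.handle "http://www.example.com/", :process_index
spider.crawl max_urls: 10    # crawl_for: <seconds> would set the @until time bound

spider.crawls     # Hash of URL records that have been crawled
spider.uncrawled  # Hash of URL records still queued
spider.records    # Hash of URL records whose content has been recorded
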
@@ -1,3 +1,3 @@
  module HashSpidey
- VERSION = "0.0.1"
+ VERSION = "0.0.2"
  end
@@ -16,13 +16,13 @@ describe HashSpidey::Strategies::HashStore do
  end

  end
-
+
  context 'generic #handle' do

  before(:each) do
  FakeWeb.register_uri(:get, "http://www.example.com/", :body => "Hello World", code: 200,
  "content-type"=>"text/html; charset=UTF-8"
- )
+ )
  @spider = TestSpider.new request_interval: 0
  @spider.handle "http://www.example.com/", :process_size
  @spider.crawl
@@ -36,21 +36,20 @@ describe HashSpidey::Strategies::HashStore do
 
  it 'should update #crawled_timestamp' do
  @crawled_url = @spider.crawls.values.first
- expect( @crawled_url.url ).to eq 'http://www.example.com/'
- expect( @crawled_url.crawled_timestamp > @crawled_url.initialized_timestamp).to be_true
+ expect( @crawled_url.url ).to eq 'http://www.example.com/'
+ expect( @crawled_url.crawled_timestamp > @crawled_url.initialized_timestamp).to be_true
  end

  it 'should have #crawls act as a Hash' do
  expect( @spider.crawls['http://www.example.com/'].url).to eq 'http://www.example.com/'
  end
-
+
  it "should not add duplicate URLs" do
- @spider.handle "http://www.example.com/", :process_something_else # second time
- expect( @spider.crawls.count ).to eq 1
+ @spider.handle "http://www.example.com/", :process_something_else # second time
+ expect( @spider.crawls.count ).to eq 1
  end
-
- context '@crawl_record' do

+ context '@crawl_record' do
  before(:each) do
  @crawled_url = @spider.crawls["http://www.example.com/"]
  end
@@ -60,13 +59,10 @@ describe HashSpidey::Strategies::HashStore do
  end

  it 'should respond to header#content-type' do
- expect(@crawled_url.header['content-type']).to eq "text/html; charset=UTF-8"
+ expect(@crawled_url.crawleheader['content-type']).to eq "text/html; charset=UTF-8"
  end
  end
  end
-
-
-
  end

@@ -91,8 +87,4 @@ describe HashSpidey::Strategies::HashStore do
  end
  end
  end
-
-
-
-
  end
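
The spec changes above only touch the #handle and #crawl examples; the #record path can also be driven directly with a Hashie::Mash carrying url and content, which is what #record_page builds internally. A rough sketch, reusing the hypothetical ExampleSpider from the earlier example and assuming HashUrlRecord#recorded? becomes true once record_content has been called:

require 'hashie'

spider = ExampleSpider.new request_interval: 0

# #record falls back to HashUrlRecord.new(url) when the URL was never queued via #handle.
data = Hashie::Mash.new(url: "http://www.example.com/about", content: "<html>...</html>")
spider.record(data)

spider.records.keys  # expected to include "http://www.example.com/about"
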
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: hash_spidey
  version: !ruby/object:Gem::Version
- version: 0.0.1
+ version: 0.0.2
  prerelease:
  platform: ruby
  authors: