hash_spidey 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/examples/wikipedia_article_spider.rb +28 -0
- data/hash_spidey.gemspec +1 -1
- data/lib/hash_spidey/crawl_record.rb +11 -8
- data/lib/hash_spidey/hash_url_record.rb +9 -3
- data/lib/hash_spidey/strategies/hash_store_strategy.rb +15 -11
- data/lib/hash_spidey/version.rb +1 -1
- data/spec/spiders/hash_store_strategy_spec.rb +10 -5
- data/spec/unit/hash_url_record_spec.rb +2 -2
- metadata +3 -2
@@ -0,0 +1,28 @@
|
|
1
|
+
# obviously, you should just use Wikipedia's API
|
2
|
+
class WikipediaArticleSpider < HashSpidey::AbstractSpider
|
3
|
+
|
4
|
+
def initialize(first_article_url, opts)
|
5
|
+
super(opts)
|
6
|
+
handle first_article_url, :process_article
|
7
|
+
end
|
8
|
+
|
9
|
+
def process_article(page, default_opts={})
|
10
|
+
|
11
|
+
record_page(page)
|
12
|
+
|
13
|
+
page.search('a').select{|a| a['href'] =~ /wiki\/Category:/}.each do |a|
|
14
|
+
href = resolve_url( a['href'], page)
|
15
|
+
handle href, :process_category_page
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def process_category_page(page, default_opts={})
|
20
|
+
title = page.title
|
21
|
+
page_count_text = page.search('#mw-pages > p')[0].text.match(/[\d,]+ total\./)
|
22
|
+
datastr = "#{title} has #{page_count_text}"
|
23
|
+
|
24
|
+
record_data(page, datastr)
|
25
|
+
end
|
26
|
+
|
27
|
+
|
28
|
+
end
|
data/hash_spidey.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.version = HashSpidey::VERSION
|
9
9
|
spec.authors = ["dannguyen"]
|
10
10
|
spec.email = ["dansonguyen@gmail.com"]
|
11
|
-
spec.description = %q{An implementation of joeyAghion's Spidey
|
11
|
+
spec.description = %q{An implementation of Artsy's joeyAghion's Spidey::AbstractSpider}
|
12
12
|
spec.summary = %q{Uses a Hash object to store crawling process, which it can then dump to an external store}
|
13
13
|
spec.homepage = "http://github.com/dannguyen"
|
14
14
|
spec.license = "MIT"
|
@@ -3,22 +3,25 @@ require 'mechanize'
|
|
3
3
|
|
4
4
|
module HashSpidey
|
5
5
|
|
6
|
-
class CrawlRecord
|
6
|
+
class CrawlRecord
|
7
7
|
|
8
8
|
META_ATTS = %w(crawled_timestamp title header code response_header_charset meta_charset detected_encoding content_type)
|
9
9
|
attr_reader :crawled_timestamp
|
10
10
|
|
11
11
|
def initialize(obj, timestamp)
|
12
12
|
@crawled_timestamp = timestamp
|
13
|
-
@page_object = obj
|
14
|
-
end
|
15
13
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
msh
|
14
|
+
|
15
|
+
@page_object = META_ATTS.inject(Hashie::Mash.new) do |msh, att|
|
16
|
+
msh[att] = obj.send(att) if obj.respond_to?(att)
|
17
|
+
msh
|
20
18
|
end
|
21
|
-
|
19
|
+
|
20
|
+
@page_object.crawled_timestamp = @crawled_timestamp
|
21
|
+
end
|
22
|
+
|
23
|
+
def to_hash
|
24
|
+
return @page_object
|
22
25
|
end
|
23
26
|
|
24
27
|
protected
|
@@ -7,7 +7,7 @@ module HashSpidey
|
|
7
7
|
attr_reader :url, :code,
|
8
8
|
:initialized_timestamp, :crawled_timestamp, :recorded_timestamp,
|
9
9
|
:content, :handler, :spider, :handle_data,
|
10
|
-
:crawl_metadata
|
10
|
+
:crawl_metadata, :parsed_data
|
11
11
|
|
12
12
|
|
13
13
|
# convenience name for spidey
|
@@ -31,8 +31,11 @@ module HashSpidey
|
|
31
31
|
end
|
32
32
|
|
33
33
|
|
34
|
-
def
|
35
|
-
|
34
|
+
def mark_record(obj)
|
35
|
+
obj = Hashie::Mash.new(obj) if obj.is_a?(Hash)
|
36
|
+
|
37
|
+
@content = obj.content if obj.respond_to?(:content)
|
38
|
+
@parsed_data = obj.parsed_data if obj.respond_to?(:parsed_data)
|
36
39
|
@recorded_timestamp = Time.now
|
37
40
|
end
|
38
41
|
|
@@ -51,6 +54,9 @@ module HashSpidey
|
|
51
54
|
!(crawled_timestamp.nil?)
|
52
55
|
end
|
53
56
|
|
57
|
+
def has_content?
|
58
|
+
!(@content.nil? || @content.empty?)
|
59
|
+
end
|
54
60
|
|
55
61
|
## this is just an alias
|
56
62
|
|
@@ -5,6 +5,7 @@ module HashSpidey
|
|
5
5
|
def initialize(attrs = {})
|
6
6
|
@url_collection = {}
|
7
7
|
@error_collection = []
|
8
|
+
agent.user_agent = "Abstract Spider"
|
8
9
|
|
9
10
|
super(attrs)
|
10
11
|
end
|
@@ -42,8 +43,8 @@ module HashSpidey
|
|
42
43
|
begin
|
43
44
|
page = agent.get(url)
|
44
45
|
Spidey.logger.info "Handling #{url.inspect}"
|
45
|
-
process_crawl(url, page)
|
46
46
|
send handler, page, default_data
|
47
|
+
process_crawl(url, page)
|
47
48
|
rescue => ex
|
48
49
|
add_error url: url, handler: handler, error: ex
|
49
50
|
end
|
@@ -61,27 +62,30 @@ module HashSpidey
|
|
61
62
|
end
|
62
63
|
|
63
64
|
# expects @url_collection to have :url, but if not, creates new HashUrlRecord
|
64
|
-
|
65
|
-
|
65
|
+
# data_hashie should have :content and/or :parsed_data
|
66
|
+
def record(url, data_hashie)
|
66
67
|
h_url = @url_collection[url] || HashUrlRecord.new(url)
|
67
68
|
|
68
69
|
# set the content and record_timestamp of the HashUrlRecord
|
69
|
-
h_url.
|
70
|
+
h_url.mark_record(data_hashie)
|
70
71
|
|
71
72
|
# reassign, update collection
|
72
73
|
@url_collection[url] = h_url
|
73
74
|
end
|
74
75
|
|
76
|
+
# convenience method, expecting :page to be a Nokogiri::Page
|
77
|
+
def record_page(page)
|
78
|
+
url = page.uri.to_s
|
79
|
+
record(url, content: page.content)
|
80
|
+
end
|
75
81
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
msh.url = page.uri.to_s
|
80
|
-
msh.content = page.content
|
81
|
-
|
82
|
-
record(msh)
|
82
|
+
def record_data(page, data)
|
83
|
+
url = page.uri.to_s
|
84
|
+
record(url, parsed_data: data)
|
83
85
|
end
|
84
86
|
|
87
|
+
|
88
|
+
|
85
89
|
def each_url(&block)
|
86
90
|
while h_url = get_next_url_hash
|
87
91
|
yield h_url.url, h_url.handler, h_url.handle_data
|
data/lib/hash_spidey/version.rb
CHANGED
@@ -59,20 +59,23 @@ describe HashSpidey::Strategies::HashStore do
|
|
59
59
|
end
|
60
60
|
|
61
61
|
it 'should respond to header#content-type' do
|
62
|
-
expect(@crawled_url.
|
62
|
+
expect(@crawled_url.header['content-type']).to eq "text/html; charset=UTF-8"
|
63
63
|
end
|
64
64
|
end
|
65
65
|
end
|
66
66
|
end
|
67
67
|
|
68
68
|
|
69
|
-
context 'generic #
|
69
|
+
context 'generic #record_page' do
|
70
70
|
describe '#records' do
|
71
71
|
before(:each) do
|
72
|
+
FakeWeb.register_uri(:get, "http://www.example.com/", :body => "Hello World", code: 200,
|
73
|
+
"content-type"=>"text/html; charset=UTF-8"
|
74
|
+
)
|
72
75
|
|
73
|
-
@data = Hashie::Mash.new url: 'http://www.example.com/', content: 'Hello World'
|
74
76
|
@spider = TestSpider.new request_interval: 0
|
75
|
-
@
|
77
|
+
@page = Mechanize.new.get("http://www.example.com/")
|
78
|
+
@spider.record_page @page
|
76
79
|
end
|
77
80
|
|
78
81
|
it "should add to records" do
|
@@ -81,7 +84,9 @@ describe HashSpidey::Strategies::HashStore do
|
|
81
84
|
end
|
82
85
|
|
83
86
|
it 'should update existing result' do
|
84
|
-
@
|
87
|
+
@page.stub(:content){ 'Bye World' }
|
88
|
+
|
89
|
+
@spider.record_page @page
|
85
90
|
expect(@spider.records['http://www.example.com/'].content).to eq 'Bye World'
|
86
91
|
expect(@spider.records.count).to eq 1
|
87
92
|
end
|
@@ -37,9 +37,9 @@ describe HashSpidey::HashUrlRecord do
|
|
37
37
|
@hurl = HashUrlRecord.new "http://www.example.com"
|
38
38
|
end
|
39
39
|
|
40
|
-
describe '#
|
40
|
+
describe '#mark_record' do
|
41
41
|
before(:each) do
|
42
|
-
@hurl.
|
42
|
+
@hurl.mark_record content: 'hello'
|
43
43
|
end
|
44
44
|
|
45
45
|
it 'should set @recorded_timestamp' do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hash_spidey
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -91,7 +91,7 @@ dependencies:
|
|
91
91
|
- - ! '>='
|
92
92
|
- !ruby/object:Gem::Version
|
93
93
|
version: '0'
|
94
|
-
description: An implementation of joeyAghion's Spidey
|
94
|
+
description: An implementation of Artsy's joeyAghion's Spidey::AbstractSpider
|
95
95
|
email:
|
96
96
|
- dansonguyen@gmail.com
|
97
97
|
executables: []
|
@@ -103,6 +103,7 @@ files:
|
|
103
103
|
- LICENSE.txt
|
104
104
|
- README.md
|
105
105
|
- Rakefile
|
106
|
+
- examples/wikipedia_article_spider.rb
|
106
107
|
- hash_spidey.gemspec
|
107
108
|
- lib/hash_spidey.rb
|
108
109
|
- lib/hash_spidey/crawl_record.rb
|