hash_spidey 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/examples/wikipedia_article_spider.rb +28 -0
- data/hash_spidey.gemspec +1 -1
- data/lib/hash_spidey/crawl_record.rb +11 -8
- data/lib/hash_spidey/hash_url_record.rb +9 -3
- data/lib/hash_spidey/strategies/hash_store_strategy.rb +15 -11
- data/lib/hash_spidey/version.rb +1 -1
- data/spec/spiders/hash_store_strategy_spec.rb +10 -5
- data/spec/unit/hash_url_record_spec.rb +2 -2
- metadata +3 -2

data/examples/wikipedia_article_spider.rb
ADDED
@@ -0,0 +1,28 @@
+# obviously, you should just use Wikipedia's API
+class WikipediaArticleSpider < HashSpidey::AbstractSpider
+
+  def initialize(first_article_url, opts)
+    super(opts)
+    handle first_article_url, :process_article
+  end
+
+  def process_article(page, default_opts={})
+
+    record_page(page)
+
+    page.search('a').select{|a| a['href'] =~ /wiki\/Category:/}.each do |a|
+      href = resolve_url( a['href'], page)
+      handle href, :process_category_page
+    end
+  end
+
+  def process_category_page(page, default_opts={})
+    title = page.title
+    page_count_text = page.search('#mw-pages > p')[0].text.match(/[\d,]+ total\./)
+    datastr = "#{title} has #{page_count_text}"
+
+    record_data(page, datastr)
+  end
+
+
+end
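For orientation, a spider like the one above is normally driven by the crawl loop inherited from Spidey::AbstractSpider. A minimal, hypothetical run script follows; the seed URL, the request_interval value, and the way records are printed are assumptions, not part of this release:

    # hypothetical driver script; not included in the gem
    require 'hash_spidey'
    require_relative 'wikipedia_article_spider'

    spider = WikipediaArticleSpider.new(
      "http://en.wikipedia.org/wiki/Ada_Lovelace",  # assumed seed article
      request_interval: 1                           # assumed option inherited from Spidey
    )

    spider.crawl max_urls: 10   # crawl loop comes from Spidey::AbstractSpider

    # the HashStore strategy keeps results in memory; inspect what was recorded
    spider.records.each do |url, record|
      puts "#{url} => #{record.parsed_data || record.content.to_s[0, 60]}"
    end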
data/hash_spidey.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |spec|
   spec.version       = HashSpidey::VERSION
   spec.authors       = ["dannguyen"]
   spec.email         = ["dansonguyen@gmail.com"]
-  spec.description   = %q{An implementation of joeyAghion's Spidey}
+  spec.description   = %q{An implementation of Artsy's joeyAghion's Spidey::AbstractSpider}
   spec.summary       = %q{Uses a Hash object to store crawling process, which it can then dump to an external store}
   spec.homepage      = "http://github.com/dannguyen"
   spec.license       = "MIT"

data/lib/hash_spidey/crawl_record.rb
CHANGED
@@ -3,22 +3,25 @@ require 'mechanize'
 
 module HashSpidey
 
-  class CrawlRecord
+  class CrawlRecord
 
     META_ATTS = %w(crawled_timestamp title header code response_header_charset meta_charset detected_encoding content_type)
     attr_reader :crawled_timestamp
 
     def initialize(obj, timestamp)
       @crawled_timestamp = timestamp
-      @page_object = obj
-    end
 
-
-
-
-      msh
+
+      @page_object = META_ATTS.inject(Hashie::Mash.new) do |msh, att|
+        msh[att] = obj.send(att) if obj.respond_to?(att)
+        msh
       end
-
+
+      @page_object.crawled_timestamp = @crawled_timestamp
+    end
+
+    def to_hash
+      return @page_object
     end
 
     protected

data/lib/hash_spidey/hash_url_record.rb
CHANGED
@@ -7,7 +7,7 @@ module HashSpidey
   attr_reader :url, :code,
     :initialized_timestamp, :crawled_timestamp, :recorded_timestamp,
     :content, :handler, :spider, :handle_data,
-    :crawl_metadata
+    :crawl_metadata, :parsed_data
 
 
   # convenience name for spidey
@@ -31,8 +31,11 @@ module HashSpidey
   end
 
 
-  def
-
+  def mark_record(obj)
+    obj = Hashie::Mash.new(obj) if obj.is_a?(Hash)
+
+    @content = obj.content if obj.respond_to?(:content)
+    @parsed_data = obj.parsed_data if obj.respond_to?(:parsed_data)
     @recorded_timestamp = Time.now
   end
 
@@ -51,6 +54,9 @@ module HashSpidey
     !(crawled_timestamp.nil?)
   end
 
+  def has_content?
+    !(@content.nil? || @content.empty?)
+  end
 
   ## this is just an alias
 
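As a rough sketch of how the new HashUrlRecord pieces fit together (the values below are illustrative, not taken from the gem's specs):

    record = HashSpidey::HashUrlRecord.new("http://www.example.com/")
    record.has_content?        # => false, since @content is still nil

    # mark_record wraps a plain Hash in a Hashie::Mash and copies over
    # whichever of :content / :parsed_data the object responds to
    record.mark_record(content: "<html>hello</html>", parsed_data: { title: "Example" })

    record.has_content?        # => true
    record.parsed_data         # the parsed value, exposed via the new attr_reader
    record.recorded_timestamp  # set to Time.now by mark_record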

data/lib/hash_spidey/strategies/hash_store_strategy.rb
CHANGED
@@ -5,6 +5,7 @@ module HashSpidey
   def initialize(attrs = {})
     @url_collection = {}
     @error_collection = []
+    agent.user_agent = "Abstract Spider"
 
     super(attrs)
   end
@@ -42,8 +43,8 @@ module HashSpidey
   begin
     page = agent.get(url)
     Spidey.logger.info "Handling #{url.inspect}"
-    process_crawl(url, page)
     send handler, page, default_data
+    process_crawl(url, page)
   rescue => ex
     add_error url: url, handler: handler, error: ex
   end
@@ -61,27 +62,30 @@ module HashSpidey
   end
 
   # expects @url_collection to have :url, but if not, creates new HashUrlRecord
-
-
+  # data_hashie should have :content and/or :parsed_data
+  def record(url, data_hashie)
     h_url = @url_collection[url] || HashUrlRecord.new(url)
 
     # set the content and record_timestamp of the HashUrlRecord
-    h_url.
+    h_url.mark_record(data_hashie)
 
     # reassign, update collection
     @url_collection[url] = h_url
   end
 
+  # convenience method, expecting :page to be a Nokogiri::Page
+  def record_page(page)
+    url = page.uri.to_s
+    record(url, content: page.content)
+  end
 
-
-
-
-    msh.url = page.uri.to_s
-    msh.content = page.content
-
-    record(msh)
+  def record_data(page, data)
+    url = page.uri.to_s
+    record(url, parsed_data: data)
   end
 
+
+
   def each_url(&block)
     while h_url = get_next_url_hash
       yield h_url.url, h_url.handler, h_url.handle_data
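In short, record_page stores the raw page body keyed by its URL, while record_data stores an arbitrary parsed value under the same key. A handler might combine the two roughly like this (hypothetical handler, not part of the gem):

    def process_listing(page, default_opts = {})
      record_page(page)                              # keeps page.content under page.uri.to_s

      headline = page.search('h1').first
      record_data(page, headline && headline.text)   # stored as parsed_data for the same URL
    end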
data/lib/hash_spidey/version.rb
CHANGED

data/spec/spiders/hash_store_strategy_spec.rb
CHANGED
@@ -59,20 +59,23 @@ describe HashSpidey::Strategies::HashStore do
       end
 
       it 'should respond to header#content-type' do
-        expect(@crawled_url.
+        expect(@crawled_url.header['content-type']).to eq "text/html; charset=UTF-8"
       end
     end
   end
 end
 
 
-  context 'generic #
+  context 'generic #record_page' do
     describe '#records' do
       before(:each) do
+        FakeWeb.register_uri(:get, "http://www.example.com/", :body => "Hello World", code: 200,
+          "content-type"=>"text/html; charset=UTF-8"
+        )
 
-        @data = Hashie::Mash.new url: 'http://www.example.com/', content: 'Hello World'
         @spider = TestSpider.new request_interval: 0
-        @
+        @page = Mechanize.new.get("http://www.example.com/")
+        @spider.record_page @page
       end
 
       it "should add to records" do
@@ -81,7 +84,9 @@ describe HashSpidey::Strategies::HashStore do
       end
 
       it 'should update existing result' do
-        @
+        @page.stub(:content){ 'Bye World' }
+
+        @spider.record_page @page
         expect(@spider.records['http://www.example.com/'].content).to eq 'Bye World'
         expect(@spider.records.count).to eq 1
       end

data/spec/unit/hash_url_record_spec.rb
CHANGED
@@ -37,9 +37,9 @@ describe HashSpidey::HashUrlRecord do
     @hurl = HashUrlRecord.new "http://www.example.com"
   end
 
-  describe '#
+  describe '#mark_record' do
     before(:each) do
-      @hurl.
+      @hurl.mark_record content: 'hello'
     end
 
     it 'should set @recorded_timestamp' do
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: hash_spidey
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 prerelease:
 platform: ruby
 authors:
@@ -91,7 +91,7 @@ dependencies:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
-description: An implementation of joeyAghion's Spidey
+description: An implementation of Artsy's joeyAghion's Spidey::AbstractSpider
 email:
 - dansonguyen@gmail.com
 executables: []
@@ -103,6 +103,7 @@ files:
 - LICENSE.txt
 - README.md
 - Rakefile
+- examples/wikipedia_article_spider.rb
 - hash_spidey.gemspec
 - lib/hash_spidey.rb
 - lib/hash_spidey/crawl_record.rb