hash_spidey 0.0.2 → 0.0.3

examples/wikipedia_article_spider.rb ADDED
@@ -0,0 +1,28 @@
+ # obviously, you should just use Wikipedia's API
+ class WikipediaArticleSpider < HashSpidey::AbstractSpider
+
+   def initialize(first_article_url, opts)
+     super(opts)
+     handle first_article_url, :process_article
+   end
+
+   def process_article(page, default_opts={})
+
+     record_page(page)
+
+     page.search('a').select{|a| a['href'] =~ /wiki\/Category:/}.each do |a|
+       href = resolve_url( a['href'], page)
+       handle href, :process_category_page
+     end
+   end
+
+   def process_category_page(page, default_opts={})
+     title = page.title
+     page_count_text = page.search('#mw-pages > p')[0].text.match(/[\d,]+ total\./)
+     datastr = "#{title} has #{page_count_text}"
+
+     record_data(page, datastr)
+   end
+
+
+ end
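
For orientation, here is a rough sketch of how this example spider might be driven. The start URL, the require_relative path, and the #crawl call (inherited from the upstream Spidey::AbstractSpider) are assumptions for illustration; only #records and the request_interval option appear elsewhere in this gem's specs.

    require 'hash_spidey'
    require_relative 'examples/wikipedia_article_spider'   # assuming this runs from the repo root

    spider = WikipediaArticleSpider.new(
      "http://en.wikipedia.org/wiki/Ruby_(programming_language)",   # hypothetical start URL
      request_interval: 1                                           # seconds between requests
    )
    spider.crawl max_urls: 10   # assumed Spidey-style crawl option; not defined in this diff

    # the Hash-backed store can then be inspected or dumped elsewhere
    spider.records.each do |url, record|
      puts "#{url}: #{record.parsed_data.inspect}"
    end
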
data/hash_spidey.gemspec CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |spec|
  spec.version = HashSpidey::VERSION
  spec.authors = ["dannguyen"]
  spec.email = ["dansonguyen@gmail.com"]
- spec.description = %q{An implementation of joeyAghion's Spidey class at Artsy}
+ spec.description = %q{An implementation of Artsy's joeyAghion's Spidey::AbstractSpider}
  spec.summary = %q{Uses a Hash object to store crawling process, which it can then dump to an external store}
  spec.homepage = "http://github.com/dannguyen"
  spec.license = "MIT"
lib/hash_spidey/crawl_record.rb CHANGED
@@ -3,22 +3,25 @@ require 'mechanize'

  module HashSpidey

- class CrawlRecord < BasicObject
+ class CrawlRecord

  META_ATTS = %w(crawled_timestamp title header code response_header_charset meta_charset detected_encoding content_type)
  attr_reader :crawled_timestamp

  def initialize(obj, timestamp)
  @crawled_timestamp = timestamp
- @page_object = obj
- end

- def to_hash
- msh = Hashie::Mash.new
- META_ATTS.each do |att|
- msh[att] = self.send(att) if self.respond_to?(att)
+
+ @page_object = META_ATTS.inject(Hashie::Mash.new) do |msh, att|
+ msh[att] = obj.send(att) if obj.respond_to?(att)
+ msh
  end
- return msh
+
+ @page_object.crawled_timestamp = @crawled_timestamp
+ end
+
+ def to_hash
+ return @page_object
  end

  protected
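
To illustrate the reworked CrawlRecord: the metadata Mash is now built eagerly in #initialize (copying only the META_ATTS the page object responds to), and #to_hash simply returns it. The OpenStruct stand-in below and its attribute values are invented for illustration; in practice a Mechanize::Page would be passed in.

    require 'ostruct'
    require 'hash_spidey'

    # stand-in for a crawled page; only the attributes it responds to are copied
    fake_page = OpenStruct.new(title: "Category:Foo", code: "200", content_type: "text/html")

    record = HashSpidey::CrawlRecord.new(fake_page, Time.now)

    mash = record.to_hash        # the Hashie::Mash assembled in #initialize
    mash.title                   # => "Category:Foo"
    mash.crawled_timestamp       # the timestamp merged in after the META_ATTS copy
    mash.key?('header')          # => false, since the stand-in has no #header
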
@@ -7,7 +7,7 @@ module HashSpidey
  attr_reader :url, :code,
  :initialized_timestamp, :crawled_timestamp, :recorded_timestamp,
  :content, :handler, :spider, :handle_data,
- :crawl_metadata
+ :crawl_metadata, :parsed_data


  # convenience name for spidey
@@ -31,8 +31,11 @@ module HashSpidey
  end


- def record_content(ct)
- @content = ct
+ def mark_record(obj)
+ obj = Hashie::Mash.new(obj) if obj.is_a?(Hash)
+
+ @content = obj.content if obj.respond_to?(:content)
+ @parsed_data = obj.parsed_data if obj.respond_to?(:parsed_data)
  @recorded_timestamp = Time.now
  end

@@ -51,6 +54,9 @@ module HashSpidey
  !(crawled_timestamp.nil?)
  end

+ def has_content?
+ !(@content.nil? || @content.empty?)
+ end

  ## this is just an alias
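
A quick sketch of the renamed #mark_record and the new #has_content? and :parsed_data reader on a bare HashUrlRecord; the URL and the content / parsed_data strings are arbitrary examples. A plain Hash is wrapped in a Hashie::Mash, so either key may be supplied on its own.

    require 'hash_spidey'

    h_url = HashSpidey::HashUrlRecord.new("http://www.example.com/")
    h_url.mark_record(content: "<html>Hello</html>", parsed_data: "Category:Foo has 12 total.")

    h_url.content              # => "<html>Hello</html>"
    h_url.parsed_data          # => "Category:Foo has 12 total."
    h_url.has_content?         # => true
    h_url.recorded_timestamp   # set to Time.now by #mark_record
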
@@ -5,6 +5,7 @@ module HashSpidey
  def initialize(attrs = {})
  @url_collection = {}
  @error_collection = []
+ agent.user_agent = "Abstract Spider"

  super(attrs)
  end
@@ -42,8 +43,8 @@ module HashSpidey
  begin
  page = agent.get(url)
  Spidey.logger.info "Handling #{url.inspect}"
- process_crawl(url, page)
  send handler, page, default_data
+ process_crawl(url, page)
  rescue => ex
  add_error url: url, handler: handler, error: ex
  end
@@ -61,27 +62,30 @@ module HashSpidey
  end

  # expects @url_collection to have :url, but if not, creates new HashUrlRecord
- def record(data_hashie)
- url = data_hashie.url
+ # data_hashie should have :content and/or :parsed_data
+ def record(url, data_hashie)
  h_url = @url_collection[url] || HashUrlRecord.new(url)

  # set the content and record_timestamp of the HashUrlRecord
- h_url.record_content(data_hashie.content)
+ h_url.mark_record(data_hashie)

  # reassign, update collection
  @url_collection[url] = h_url
  end

+ # convenience method, expecting :page to be a Nokogiri::Page
+ def record_page(page)
+ url = page.uri.to_s
+ record(url, content: page.content)
+ end

- # wrapper around #record
- def record_page(page, default_data={})
- msh = Hashie::Mash.new(default_data)
- msh.url = page.uri.to_s
- msh.content = page.content
-
- record(msh)
+ def record_data(page, data)
+ url = page.uri.to_s
+ record(url, parsed_data: data)
  end

+
+
  def each_url(&block)
  while h_url = get_next_url_hash
  yield h_url.url, h_url.handler, h_url.handle_data
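
The two convenience wrappers split what #record_page used to do: #record_page stores the raw page body, while the new #record_data stores whatever the handler parsed out of it, both keyed by page.uri.to_s. A minimal handler sketch follows; the spider class, handler name, and CSS selector are made up for illustration.

    require 'hash_spidey'

    class HeadlineSpider < HashSpidey::AbstractSpider
      def initialize(start_url, opts = {})
        super(opts)
        handle start_url, :process_page
      end

      def process_page(page, default_data = {})
        record_page(page)                             # raw content, keyed by the page's URL

        headline = page.search('h1').first
        record_data(page, headline.text) if headline  # parsed_data for the same URL record
      end
    end
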
@@ -1,3 +1,3 @@
  module HashSpidey
- VERSION = "0.0.2"
+ VERSION = "0.0.3"
  end
@@ -59,20 +59,23 @@ describe HashSpidey::Strategies::HashStore do
  end

  it 'should respond to header#content-type' do
- expect(@crawled_url.crawleheader['content-type']).to eq "text/html; charset=UTF-8"
+ expect(@crawled_url.header['content-type']).to eq "text/html; charset=UTF-8"
  end
  end
  end
  end


- context 'generic #record' do
+ context 'generic #record_page' do
  describe '#records' do
  before(:each) do
+ FakeWeb.register_uri(:get, "http://www.example.com/", :body => "Hello World", code: 200,
+ "content-type"=>"text/html; charset=UTF-8"
+ )

- @data = Hashie::Mash.new url: 'http://www.example.com/', content: 'Hello World'
  @spider = TestSpider.new request_interval: 0
- @spider.record @data
+ @page = Mechanize.new.get("http://www.example.com/")
+ @spider.record_page @page
  end

  it "should add to records" do
@@ -81,7 +84,9 @@ describe HashSpidey::Strategies::HashStore do
  end

  it 'should update existing result' do
- @spider.record Hashie::Mash.new url: 'http://www.example.com/', content: 'Bye World'
+ @page.stub(:content){ 'Bye World' }
+
+ @spider.record_page @page
  expect(@spider.records['http://www.example.com/'].content).to eq 'Bye World'
  expect(@spider.records.count).to eq 1
  end
@@ -37,9 +37,9 @@ describe HashSpidey::HashUrlRecord do
  @hurl = HashUrlRecord.new "http://www.example.com"
  end

- describe '#record_content' do
+ describe '#mark_record' do
  before(:each) do
- @hurl.record_content 'hello'
+ @hurl.mark_record content: 'hello'
  end

  it 'should set @recorded_timestamp' do
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: hash_spidey
  version: !ruby/object:Gem::Version
- version: 0.0.2
+ version: 0.0.3
  prerelease:
  platform: ruby
  authors:
@@ -91,7 +91,7 @@ dependencies:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
- description: An implementation of joeyAghion's Spidey class at Artsy
+ description: An implementation of Artsy's joeyAghion's Spidey::AbstractSpider
  email:
  - dansonguyen@gmail.com
  executables: []
@@ -103,6 +103,7 @@ files:
  - LICENSE.txt
  - README.md
  - Rakefile
+ - examples/wikipedia_article_spider.rb
  - hash_spidey.gemspec
  - lib/hash_spidey.rb
  - lib/hash_spidey/crawl_record.rb