hash_spidey 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
examples/wikipedia_article_spider.rb ADDED
@@ -0,0 +1,28 @@
+# obviously, you should just use Wikipedia's API
+class WikipediaArticleSpider < HashSpidey::AbstractSpider
+
+  def initialize(first_article_url, opts)
+    super(opts)
+    handle first_article_url, :process_article
+  end
+
+  def process_article(page, default_opts={})
+
+    record_page(page)
+
+    page.search('a').select{|a| a['href'] =~ /wiki\/Category:/}.each do |a|
+      href = resolve_url( a['href'], page)
+      handle href, :process_category_page
+    end
+  end
+
+  def process_category_page(page, default_opts={})
+    title = page.title
+    page_count_text = page.search('#mw-pages > p')[0].text.match(/[\d,]+ total\./)
+    datastr = "#{title} has #{page_count_text}"
+
+    record_data(page, datastr)
+  end
+
+
+end
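For orientation, here is a minimal sketch of how this example spider might be driven. It is not part of the package: it assumes HashSpidey::AbstractSpider inherits Spidey's `#crawl` entry point and the `#records` hash used in the specs further down, and the starting article URL is only illustrative.

```ruby
# Sketch only — assumes Spidey's #crawl / #records API is inherited unchanged.
require 'hash_spidey'
require_relative 'examples/wikipedia_article_spider'  # path relative to the gem root

# Hypothetical starting point; any article URL would do.
spider = WikipediaArticleSpider.new(
  "https://en.wikipedia.org/wiki/Web_crawler",
  request_interval: 1
)
spider.crawl max_urls: 10

spider.records.each do |url, record|
  puts "#{url}: #{record.parsed_data}"   # parsed_data is set by record_data above
end
```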
data/hash_spidey.gemspec CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |spec|
   spec.version = HashSpidey::VERSION
   spec.authors = ["dannguyen"]
   spec.email = ["dansonguyen@gmail.com"]
-  spec.description = %q{An implementation of joeyAghion's Spidey class at Artsy}
+  spec.description = %q{An implementation of Artsy's joeyAghion's Spidey::AbstractSpider}
   spec.summary = %q{Uses a Hash object to store crawling process, which it can then dump to an external store}
   spec.homepage = "http://github.com/dannguyen"
   spec.license = "MIT"
lib/hash_spidey/crawl_record.rb CHANGED
@@ -3,22 +3,25 @@ require 'mechanize'
 
 module HashSpidey
 
-  class CrawlRecord < BasicObject
+  class CrawlRecord
 
     META_ATTS = %w(crawled_timestamp title header code response_header_charset meta_charset detected_encoding content_type)
     attr_reader :crawled_timestamp
 
     def initialize(obj, timestamp)
       @crawled_timestamp = timestamp
-      @page_object = obj
-    end
 
-    def to_hash
-      msh = Hashie::Mash.new
-      META_ATTS.each do |att|
-        msh[att] = self.send(att) if self.respond_to?(att)
+
+      @page_object = META_ATTS.inject(Hashie::Mash.new) do |msh, att|
+        msh[att] = obj.send(att) if obj.respond_to?(att)
+        msh
       end
-      return msh
+
+      @page_object.crawled_timestamp = @crawled_timestamp
+    end
+
+    def to_hash
+      return @page_object
     end
 
     protected
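With this rework, `CrawlRecord#to_hash` simply returns the `Hashie::Mash` built up front in `#initialize`. A rough sketch of the resulting behavior, assuming the record is constructed from a Mechanize page as the META_ATTS list above suggests (the URL is only an example):

```ruby
# Sketch, not part of the diff.
require 'mechanize'
require 'hash_spidey'

page   = Mechanize.new.get("http://www.example.com/")
record = HashSpidey::CrawlRecord.new(page, Time.now)

mash = record.to_hash      # the Hashie::Mash built in #initialize
mash.title                 # copied from page.title (only attrs the page responds to are copied)
mash.crawled_timestamp     # the timestamp passed to the constructor
```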
@@ -7,7 +7,7 @@ module HashSpidey
     attr_reader :url, :code,
       :initialized_timestamp, :crawled_timestamp, :recorded_timestamp,
       :content, :handler, :spider, :handle_data,
-      :crawl_metadata
+      :crawl_metadata, :parsed_data
 
 
     # convenience name for spidey
@@ -31,8 +31,11 @@ module HashSpidey
     end
 
 
-    def record_content(ct)
-      @content = ct
+    def mark_record(obj)
+      obj = Hashie::Mash.new(obj) if obj.is_a?(Hash)
+
+      @content = obj.content if obj.respond_to?(:content)
+      @parsed_data = obj.parsed_data if obj.respond_to?(:parsed_data)
       @recorded_timestamp = Time.now
     end
 
@@ -51,6 +54,9 @@
       !(crawled_timestamp.nil?)
     end
 
+    def has_content?
+      !(@content.nil? || @content.empty?)
+    end
 
     ## this is just an alias
 
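A short sketch of the new `#mark_record` / `#has_content?` pair, based on the hunks above and the updated HashUrlRecord spec at the bottom of this diff (the URL is just an example):

```ruby
# Sketch, not part of the diff.
require 'hash_spidey'

h_url = HashSpidey::HashUrlRecord.new("http://www.example.com")

h_url.mark_record content: 'hello'   # plain hashes are wrapped in a Hashie::Mash internally
h_url.content                        # => "hello"
h_url.has_content?                   # => true
h_url.recorded_timestamp             # set by mark_record via Time.now

h_url.mark_record parsed_data: "42 categories"
h_url.parsed_data                    # => "42 categories"
```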
@@ -5,6 +5,7 @@ module HashSpidey
       def initialize(attrs = {})
         @url_collection = {}
         @error_collection = []
+        agent.user_agent = "Abstract Spider"
 
         super(attrs)
       end
@@ -42,8 +43,8 @@ module HashSpidey
         begin
           page = agent.get(url)
           Spidey.logger.info "Handling #{url.inspect}"
-          process_crawl(url, page)
           send handler, page, default_data
+          process_crawl(url, page)
         rescue => ex
           add_error url: url, handler: handler, error: ex
         end
@@ -61,27 +62,30 @@
       end
 
       # expects @url_collection to have :url, but if not, creates new HashUrlRecord
-      def record(data_hashie)
-        url = data_hashie.url
+      # data_hashie should have :content and/or :parsed_data
+      def record(url, data_hashie)
         h_url = @url_collection[url] || HashUrlRecord.new(url)
 
         # set the content and record_timestamp of the HashUrlRecord
-        h_url.record_content(data_hashie.content)
+        h_url.mark_record(data_hashie)
 
         # reassign, update collection
         @url_collection[url] = h_url
       end
 
+      # convenience method, expecting :page to be a Nokogiri::Page
+      def record_page(page)
+        url = page.uri.to_s
+        record(url, content: page.content)
+      end
 
-      # wrapper around #record
-      def record_page(page, default_data={})
-        msh = Hashie::Mash.new(default_data)
-        msh.url = page.uri.to_s
-        msh.content = page.content
-
-        record(msh)
+      def record_data(page, data)
+        url = page.uri.to_s
+        record(url, parsed_data: data)
       end
 
+
+
       def each_url(&block)
         while h_url = get_next_url_hash
           yield h_url.url, h_url.handler, h_url.handle_data
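In short, `#record` now takes the URL as its own argument, and the Mechanize-page conveniences are `#record_page` (raw content) and `#record_data` (parsed results). A sketch of the new call pattern, reusing the TestSpider helper class from the specs below (any spider using this strategy behaves the same; the specs stub the URL with FakeWeb, live code would hit the network):

```ruby
# Sketch, not part of the diff; TestSpider comes from the spec support files.
require 'mechanize'

spider = TestSpider.new request_interval: 0
page   = Mechanize.new.get("http://www.example.com/")

spider.record_page page                            # keyed by page.uri.to_s, stores page.content
spider.record_data page, "Example has 42 pages"    # attaches parsed_data to the same URL's record

record = spider.records["http://www.example.com/"]
record.parsed_data   # => "Example has 42 pages"
```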
@@ -1,3 +1,3 @@
 module HashSpidey
-  VERSION = "0.0.2"
+  VERSION = "0.0.3"
 end
@@ -59,20 +59,23 @@ describe HashSpidey::Strategies::HashStore do
         end
 
         it 'should respond to header#content-type' do
-          expect(@crawled_url.crawleheader['content-type']).to eq "text/html; charset=UTF-8"
+          expect(@crawled_url.header['content-type']).to eq "text/html; charset=UTF-8"
         end
       end
     end
   end
 
 
-  context 'generic #record' do
+  context 'generic #record_page' do
     describe '#records' do
       before(:each) do
+        FakeWeb.register_uri(:get, "http://www.example.com/", :body => "Hello World", code: 200,
+          "content-type"=>"text/html; charset=UTF-8"
+        )
 
-        @data = Hashie::Mash.new url: 'http://www.example.com/', content: 'Hello World'
         @spider = TestSpider.new request_interval: 0
-        @spider.record @data
+        @page = Mechanize.new.get("http://www.example.com/")
+        @spider.record_page @page
       end
 
       it "should add to records" do
@@ -81,7 +84,9 @@ describe HashSpidey::Strategies::HashStore do
       end
 
       it 'should update existing result' do
-        @spider.record Hashie::Mash.new url: 'http://www.example.com/', content: 'Bye World'
+        @page.stub(:content){ 'Bye World' }
+
+        @spider.record_page @page
         expect(@spider.records['http://www.example.com/'].content).to eq 'Bye World'
         expect(@spider.records.count).to eq 1
       end
@@ -37,9 +37,9 @@ describe HashSpidey::HashUrlRecord do
     @hurl = HashUrlRecord.new "http://www.example.com"
   end
 
-  describe '#record_content' do
+  describe '#mark_record' do
     before(:each) do
-      @hurl.record_content 'hello'
+      @hurl.mark_record content: 'hello'
     end
 
     it 'should set @recorded_timestamp' do
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: hash_spidey
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 prerelease:
 platform: ruby
 authors:
@@ -91,7 +91,7 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
-description: An implementation of joeyAghion's Spidey class at Artsy
+description: An implementation of Artsy's joeyAghion's Spidey::AbstractSpider
 email:
 - dansonguyen@gmail.com
 executables: []
@@ -103,6 +103,7 @@ files:
 - LICENSE.txt
 - README.md
 - Rakefile
+- examples/wikipedia_article_spider.rb
 - hash_spidey.gemspec
 - lib/hash_spidey.rb
 - lib/hash_spidey/crawl_record.rb