reliefweb_scraper 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,18 +1,16 @@
1
1
  Reliefweb Scraper
2
2
  =================
3
3
 
4
- * Scrapes featured disasters from http://reliefweb.int/home
5
- * Retrieves extra information by searching http://glidenumber.net
4
+ * Feeds featured disasters from http://reliefweb.int/
6
5
 
7
- Objects such as Reliefweb::Disaster and Glidenumber::Record are subclasses of Hashie::Mash,
8
- so you can work with them in the following ways:
6
+ Reliefweb::Disaster is a subclass of Hashie::Mash, so you can do the following:
9
7
 
10
8
  disaster = Reliefweb.featured_disasters.first
11
9
  puts disaster["title"]
12
10
  puts disaster.title
13
11
 
14
12
 
15
- * Run "rake featured_disasters:fetch" to see the disaster data that is scraped
13
+ * Run "rake featured_disasters:fetch" to see the disaster data
16
14
  * Run "rake spec" to run the rspec tests
17
15
  ** Note: run "rm spec/fixtures/vcr_cassettes/*.yml" if you want to re-record the vcr specs
18
16
 
@@ -21,11 +19,7 @@ Examples
21
19
 
22
20
  Return an array of featured disasters:
23
21
 
24
- Reliefweb.featured_disasters(:verbose => true)
25
-
26
- Fetch disaster information from http://glidenumber.net :
27
-
28
- Glidenumber.find("OT-2011-000110-UGA")
22
+ Reliefweb.featured_disasters
29
23
 
30
24
 
31
25
  History
@@ -33,3 +27,4 @@ History
33
27
 
34
28
  * version 0.2.0 - initial release
35
29
  * version 0.3.1 - updated to reflect new changes to the ReliefWeb website
30
+ * version 0.4.0 - using new RSS feed from ReliefWeb, no need for GlideNumber anymore
data/Rakefile CHANGED
@@ -15,7 +15,7 @@ namespace :featured_disasters do
15
15
  task :fetch do
16
16
  require 'reliefweb'
17
17
  require 'awesome_print'
18
- ap Reliefweb.featured_disasters(:verbose => true)
18
+ ap Reliefweb.featured_disasters
19
19
  end
20
20
  end
21
21
 
data/lib/reliefweb.rb CHANGED
@@ -1,35 +1,27 @@
1
- require 'date'
2
1
  require 'hashie'
3
- require 'mechanize'
4
- require File.join(File.dirname(__FILE__), 'glidenumber')
2
+ require 'nokogiri'
3
+ require 'open-uri'
5
4
 
6
5
  module Reliefweb
7
- class ParseError < StandardError; end
8
6
 
9
- def self.agent
10
- @agent ||= Mechanize.new
11
- end
7
+ class Disaster < Hashie::Mash; end
12
8
 
13
9
  def self.featured_disasters(options = {})
14
- agent.get("http://reliefweb.int/home")
15
- links = agent.page.parser.css("div#middle-right.page-middle div.region div.block-content div.view div.view-content table.views-view-grid tbody tr td div.grid-item div.views-field span.field-content a")
16
- raise ParseError.new("Could not find any featured disasters on Reliefweb") if links.empty?
17
- links.map do |link|
18
- title = link.text.sub(/-[^-]*$/, '').strip
19
- href = link.attributes["href"].to_s
20
- puts "== Fetching glide number from #{href} (#{title})..." if options[:verbose]
21
- reliefweb_glide = fetch_glidenumber(href)
22
- puts "== Fetching glide details for #{reliefweb_glide}..." if options[:verbose]
23
- record = Glidenumber.find(reliefweb_glide)
24
- Disaster.new({:title => title, :url => URI.join("http://reliefweb.int", href).to_s}.merge(record.to_hash))
10
+ url = 'http://reliefweb.int/disasters/rss.xml?sl=environment-disaster_listing'
11
+ xml = Nokogiri::XML(open(url))
12
+ xml.xpath('//channel/item').map do |item|
13
+ title = item.xpath('title').text.sub(/-[^-]*$/, '').strip
14
+ url = item.xpath('link').text
15
+ country_code = item.xpath('reliefweb:iso3').map(&:text).join(', ')
16
+ date = Time.parse(item.xpath('pubDate').text)
17
+ glidenumber = item.xpath('reliefweb:glide').text
18
+ disaster_type = item.xpath('reliefweb:disaster_type').map(&:text).join(', ')
19
+ current = true
20
+ Disaster.new({:title => title, :url => url, :country_code => country_code,
21
+ :date => date, :glidenumber => glidenumber, :disaster_type => disaster_type,
22
+ :current => current})
25
23
  end
26
24
  end
27
25
 
28
- def self.fetch_glidenumber(link)
29
- agent.get(link)
30
- agent.page.parser.css("div.views-field-entity-id-3 div.field-content").text.strip
31
- end
32
-
33
- class Disaster < Hashie::Mash; end
34
26
  end
35
27
 
data/reliefweb_scraper.gemspec CHANGED
@@ -3,21 +3,21 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "reliefweb_scraper"
6
- s.version = "0.3.1"
6
+ s.version = "0.4.0"
7
7
  s.platform = Gem::Platform::RUBY
8
8
  s.authors = ["Nathan Broadbent", "Stanley Lau", "Stephen Kenworthy"]
9
9
  s.email = ["itdept@crossroads.org.hk"]
10
10
  s.homepage = "http://www.crossroads.org.hk"
11
- s.summary = %q{Scrape Reliefweb featured disasters}
12
- s.description = %q{Scrapes reliefweb's featured disasters, and pulls additional information from glidenumber.net}
11
+ s.summary = %q{Feeds disasters from Reliefweb}
12
+ s.description = %q{Gathers disasters from Reliefweb via RSS and presents them as a Disaster hashie}
13
13
 
14
14
  s.files = `git ls-files`.split("\n")
15
15
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
16
16
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
17
17
  s.require_paths = ["lib"]
18
18
 
19
- s.add_dependency('mechanize', ">= 2.0.1")
20
- s.add_dependency('hashie', ">= 1.1.0")
19
+ s.add_dependency('nokogiri', ">= 1.5.0")
20
+ s.add_dependency('hashie', ">= 1.1.0")
21
21
 
22
22
  s.add_development_dependency("rspec", "~> 2.5.0")
23
23
  s.add_development_dependency("vcr", "~> 1.11.1")