reliefweb_scraper 0.3.1 → 0.4.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,18 +1,16 @@
1
1
  Reliefweb Scraper
2
2
  =================
3
3
 
4
- * Scrapes featured disasters from http://reliefweb.int/home
5
- * Retrieves extra information by searching http://glidenumber.net
4
+ * Feeds featured disasters from http://reliefweb.int/
6
5
 
7
- Objects such as Reliefweb::Disaster and Glidenumber::Record are subclasses of Hashie::Mash,
8
- so you can work with them in the following ways:
6
+ Reliefweb::Disaster is a subclass of Hashie::Mash, so you can do the following:
9
7
 
10
8
  disaster = Reliefweb.featured_disasters.first
11
9
  puts disaster["title"]
12
10
  puts disaster.title
13
11
 
14
12
 
15
- * Run "rake featured_disasters:fetch" to see the disaster data that is scraped
13
+ * Run "rake featured_disasters:fetch" to see the disaster data
16
14
  * Run "rake spec" to run the rspec tests
17
15
  ** Note: run "rm spec/fixtures/vcr_cassettes/*.yml" if you want to re-record the vcr specs
18
16
 
@@ -21,11 +19,7 @@ Examples
21
19
 
22
20
  Return an array of featured disasters:
23
21
 
24
- Reliefweb.featured_disasters(:verbose => true)
25
-
26
- Fetch disaster information from http://glidenumber.net :
27
-
28
- Glidenumber.find("OT-2011-000110-UGA")
22
+ Reliefweb.featured_disasters
29
23
 
30
24
 
31
25
  History
@@ -33,3 +27,4 @@ History
33
27
 
34
28
  * version 0.2.0 - initial release
35
29
  * version 0.3.1 - updated to reflect new changes to the ReliefWeb website
30
+ * version 0.4.0 - using new RSS feed from ReliefWeb, no need for GlideNumber anymore
data/Rakefile CHANGED
@@ -15,7 +15,7 @@ namespace :featured_disasters do
15
15
  task :fetch do
16
16
  require 'reliefweb'
17
17
  require 'awesome_print'
18
- ap Reliefweb.featured_disasters(:verbose => true)
18
+ ap Reliefweb.featured_disasters
19
19
  end
20
20
  end
21
21
 
data/lib/reliefweb.rb CHANGED
@@ -1,35 +1,27 @@
1
- require 'date'
2
1
  require 'hashie'
3
- require 'mechanize'
4
- require File.join(File.dirname(__FILE__), 'glidenumber')
2
+ require 'nokogiri'
3
+ require 'open-uri'
5
4
 
6
5
  module Reliefweb
7
- class ParseError < StandardError; end
8
6
 
9
- def self.agent
10
- @agent ||= Mechanize.new
11
- end
7
+ class Disaster < Hashie::Mash; end
12
8
 
13
9
  def self.featured_disasters(options = {})
14
- agent.get("http://reliefweb.int/home")
15
- links = agent.page.parser.css("div#middle-right.page-middle div.region div.block-content div.view div.view-content table.views-view-grid tbody tr td div.grid-item div.views-field span.field-content a")
16
- raise ParseError.new("Could not find any featured disasters on Reliefweb") if links.empty?
17
- links.map do |link|
18
- title = link.text.sub(/-[^-]*$/, '').strip
19
- href = link.attributes["href"].to_s
20
- puts "== Fetching glide number from #{href} (#{title})..." if options[:verbose]
21
- reliefweb_glide = fetch_glidenumber(href)
22
- puts "== Fetching glide details for #{reliefweb_glide}..." if options[:verbose]
23
- record = Glidenumber.find(reliefweb_glide)
24
- Disaster.new({:title => title, :url => URI.join("http://reliefweb.int", href).to_s}.merge(record.to_hash))
10
+ url = 'http://reliefweb.int/disasters/rss.xml?sl=environment-disaster_listing'
11
+ xml = Nokogiri::XML(open(url))
12
+ xml.xpath('//channel/item').map do |item|
13
+ title = item.xpath('title').text.sub(/-[^-]*$/, '').strip
14
+ url = item.xpath('link').text
15
+ country_code = item.xpath('reliefweb:iso3').map(&:text).join(', ')
16
+ date = Time.parse(item.xpath('pubDate').text)
17
+ glidenumber = item.xpath('reliefweb:glide').text
18
+ disaster_type = item.xpath('reliefweb:disaster_type').map(&:text).join(', ')
19
+ current = true
20
+ Disaster.new({:title => title, :url => url, :country_code => country_code,
21
+ :date => date, :glidenumber => glidenumber, :disaster_type => disaster_type,
22
+ :current => current})
25
23
  end
26
24
  end
27
25
 
28
- def self.fetch_glidenumber(link)
29
- agent.get(link)
30
- agent.page.parser.css("div.views-field-entity-id-3 div.field-content").text.strip
31
- end
32
-
33
- class Disaster < Hashie::Mash; end
34
26
  end
35
27
 
@@ -3,21 +3,21 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "reliefweb_scraper"
6
- s.version = "0.3.1"
6
+ s.version = "0.4.0"
7
7
  s.platform = Gem::Platform::RUBY
8
8
  s.authors = ["Nathan Broadbent", "Stanley Lau", "Stephen Kenworthy"]
9
9
  s.email = ["itdept@crossroads.org.hk"]
10
10
  s.homepage = "http://www.crossroads.org.hk"
11
- s.summary = %q{Scrape Reliefweb featured disasters}
12
- s.description = %q{Scrapes reliefweb's featured disasters, and pulls additional information from glidenumber.net}
11
+ s.summary = %q{Feeds disasters from Reliefweb}
12
+ s.description = %q{Gathers disasters from Reliefweb via RSS and presents them as a Disaster hashie}
13
13
 
14
14
  s.files = `git ls-files`.split("\n")
15
15
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
16
16
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
17
17
  s.require_paths = ["lib"]
18
18
 
19
- s.add_dependency('mechanize', ">= 2.0.1")
20
- s.add_dependency('hashie', ">= 1.1.0")
19
+ s.add_dependency('nokogiri', ">= 1.5.0")
20
+ s.add_dependency('hashie', ">= 1.1.0")
21
21
 
22
22
  s.add_development_dependency("rspec", "~> 2.5.0")
23
23
  s.add_development_dependency("vcr", "~> 1.11.1")