RubyGems - web-page-parser - Versions diffs - 0.25 → 1.0.0 - Mend

web-page-parser 0.25 → 1.0.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (41)

checksums.yaml +7 -0
checksums.yaml.gz.sig +1 -0
data.tar.gz.sig +0 -0
data/README.rdoc +5 -0
data/lib/web-page-parser.rb +31 -0
data/lib/web-page-parser/base_parser.rb +92 -42
data/lib/web-page-parser/http.rb +63 -0
data/lib/web-page-parser/parser_factory.rb +0 -1
data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +72 -9
data/lib/web-page-parser/parsers/guardian_page_parser.rb +51 -11
data/lib/web-page-parser/parsers/independent_page_parser.rb +56 -0
data/lib/web-page-parser/parsers/new_york_times_page_parser.rb +108 -0
data/lib/web-page-parser/parsers/washingtonpost_page_parser.rb +59 -0
data/spec/base_parser_spec.rb +24 -8
data/spec/fixtures/bbc_news/19957138.stm.html +1974 -0
data/spec/fixtures/bbc_news/20230333.stm.html +2529 -0
data/spec/fixtures/bbc_news/21528631.html +2021 -0
data/spec/fixtures/bbc_news/8040164.stm.html +3095 -0
data/spec/fixtures/cassette_library/BbcNewsPageParserV4.yml +1743 -0
data/spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html +4713 -0
data/spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html +4371 -0
data/spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html +4150 -0
data/spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html +4401 -0
data/spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html +4454 -0
data/spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html +4455 -0
data/spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html +4368 -0
data/spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html +919 -0
data/spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html +328 -0
data/spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html +1164 -0
data/spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html +1 -0
data/spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html +3664 -0
data/spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html +3729 -0
data/spec/parser_factory_spec.rb +3 -3
data/spec/parsers/bbc_news_page_spec.rb +223 -3
data/spec/parsers/guardian_page_spec.rb +157 -4
data/spec/parsers/independent_page_parser_spec.rb +152 -0
data/spec/parsers/new_york_times_page_parser_spec.rb +190 -0
data/spec/parsers/washingtonpost_page_parser_spec.rb +114 -0
data/spec/spec_helper.rb +5 -0
metadata +167 -59
metadata.gz.sig +2 -0

data/spec/parser_factory_spec.rb CHANGED

@@ -1,11 +1,11 @@
-$:.unshift File.join(File.dirname(__FILE__), '../lib')
-require 'web-page-parser'
+require 'spec_helper'
 include WebPageParser
 describe ParserFactory do
   it "should load parsers in the parsers directory" do
-    ParserFactory.factories.first.to_s.should == "TestPageParserFactory"
+    pfl = ParserFactory.factories.collect { |f| f.to_s }
+    pfl.should include "TestPageParserFactory"
   end
   it "should provide the right PageParser for the given url" do

data/spec/parsers/bbc_news_page_spec.rb CHANGED

@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
 $:.unshift File.join(File.dirname(__FILE__), '../../lib')
-require 'spec/base_parser_spec'
+$:.unshift File.join(File.dirname(__FILE__), '../../spec')
+require 'base_parser_spec'
+require 'spec_helper'
 require 'web-page-parser'
 include WebPageParser
@@ -44,16 +46,208 @@ describe BbcNewsPageParserFactory do
       BbcNewsPageParserFactory.can_parse?(:url => url).should be_nil
     end
   end
   it "should ignore 'in pictures' articles" do
     BbcNewsPageParserFactory.can_parse?(:url => 'http://news.bbc.co.uk/1/hi/in_pictures/8039882.stm').should be_nil
   end
 end
+describe BbcNewsPageParserV5 do
+  describe "downloaded article with non-utf8" do
+    page = BbcNewsPageParserV5.new(:url => "http://news.bbc.co.uk/1/hi/uk_politics/7984711.stm")
+    page.hash.should_not == nil
+    page.hash.should_not == ""
+  end
+  describe "Oscar Pistorius article" do
+    it_should_behave_like AllPageParsers
+    before do
+      @valid_options = {
+        :url => 'http://www.bbc.co.uk/news/world-africa-21528631',
+        :page => File.read("spec/fixtures/bbc_news/21528631.html"),
+        :valid_hash => ''
+      }
+      @pa = BbcNewsPageParserV5.new(@valid_options)
+    end
+    it "should parse the title" do
+      @pa.title.should == "Oscar Pistorius detective on attempted murder charges"
+    end
+    it "should parse the content" do
+      @pa.content.first.should == "The South African detective leading the Oscar Pistorius inquiry is facing seven charges of attempted murder, police have confirmed."
+      @pa.content.last.should == "In London he made history by becoming the first double-amputee to run in the Olympics, making the semi-final of the 400m."
+      @pa.content.should include "Reinstated charges"
+      @pa.content.should include "Mr Roux said this was a strong, loving relationship and that there was no motive to kill."
+      @pa.content.should include "The three were arrested in 2011, Eyewitness News says, citing police."
+      @pa.content.size.should == 38
+    end
+    it "should exclude the twitter feed" do
+      @pa.content.to_s.should_not =~ /Live tweets/
+      @pa.content.to_s.should_not =~ /An old mystery resurfaces/
+    end
+    it "should parse the publication date" do
+      # 2013/02/21 14:10:58
+      @pa.date.should == DateTime.parse("Feb 21 14:10:58 +0000 2013")
+    end
+  end
+  describe "UK economy article" do
+    before do
+      @valid_options = {
+        :url => 'http://www.bbc.co.uk/news/business-11125504',
+        :page => File.read("spec/fixtures/bbc_news/11125504.html"),
+        :valid_hash => 'd9e201abec3f4b9e38865b5135281978'
+      }
+      @pa = BbcNewsPageParserV5.new(@valid_options)
+    end
+    it "should parse the title" do
+      @pa.title.should == "UK economy 'to pick up in near term'"
+    end
+    it "should parse the content" do
+      @pa.content[0].should == "The British Chambers of Commerce (BCC) has upgraded its forecast for the UK's short term economic prospects, but said interest rates must be kept low to aid recovery."
+      @pa.content.last.should == '"Failure to get this right poses the biggest risk to recovery."'
+      @pa.content.size.should == 18
+    end
+  end
+  it "should ignore embedded-hyper content" do
+    @pa = BbcNewsPageParserV5.new(:page => File.read('spec/fixtures/bbc_news/12921632.html'))
+    @pa.content.to_s.should_not =~ /Fake and real quotes/
+  end
+  it "should parse the content of an article with market data" do
+    @pa = BbcNewsPageParserV5.new(:page => File.read('spec/fixtures/bbc_news/13293006.html'))
+    @pa.content.to_s.should_not =~ /Market Data/
+    @pa.content.to_s.should_not =~ /Last updated at/
+    @pa.content.size.should == 13
+  end
+  it "should ignore the twitter widget" do
+    pa = BbcNewsPageParserV5.new(:url => "http://www.bbc.co.uk/news/world-us-canada-20230333", :page => File.read("spec/fixtures/bbc_news/20230333.stm.html"))
+    pa.title.should == "US election: Results declared from some states"
+    pa.content.first.should == "President Barack Obama and challenger Mitt Romney remain locked in a tight race as US election results stream in."
+    pa.content.to_s.should_not =~ /US Election Tweets/
+    pa.content.last.should == "The BBC is providing full online live results of the US presidential election. More details here ."
+    pa.content.should include "Legal battles feared"
+  end
+  it "should ignore the 'latest' twitter widget" do
+    pa = BbcNewsPageParserV5.new(:url => "http://www.bbc.co.uk/news/uk-19957138", :page => File.read("spec/fixtures/bbc_news/19957138.stm.html"))
+    pa.title.should == "Gary McKinnon extradition to US blocked by Theresa May"
+    pa.content.to_s.should_not =~ /High Noon for Abu Qatada?/
+    pa.content.to_s.should_not =~ /Content from Twitter./
+    pa.content.last.should == "Mr McKinnon was arrested in 2002 and again in 2005 before an order for his extradition was made in July 2006 under the 2003 Extradition Act."
+  end
+  describe "Derrick Bird article" do
+    before do
+      @valid_options = {
+        :url => 'http://news.bbc.co.uk/1/hi/england/10249066.stm',
+        :page => File.read("spec/fixtures/bbc_news/10249066.stm.html"),
+        :valid_hash => '43634596a9f1cfb59bb9548282043119' # Differs from V3 as title is obtained more accurately
+      }
+      @pa = BbcNewsPageParserV5.new(@valid_options)
+    end
+    it "should parse the title" do
+      @pa.title.should == "Gunman's family unaware of motive for killings"
+    end
+    it "should parse the content" do
+      @pa.content[0].should == 'The family of gunman Derrick Bird say they have no idea why he carried out the "horrific" shootings in Cumbria.'
+      @pa.content.last.should == '"We appreciate what they are suffering at this time. We cannot offer any reason why Derrick took it upon himself to commit these crimes."'
+      @pa.content.size.should == 24
+    end
+    it "should parse the publication date" do
+      # 2010/06/06 13:48:45
+      @pa.date.should == DateTime.parse("Jun 06 13:48:45 +0000 2010")
+    end
+    it "should calculate a valid hash of the content" do
+      @pa.hash.should == @valid_options[:valid_hash]
+    end
+  end
+  describe "Obama invite article" do
+    before do
+      @valid_options = {
+        :url => 'http://news.bbc.co.uk/1/hi/world/middle_east/8011268.stm',
+        :page => File.read("spec/fixtures/bbc_news/8011268.stm.html"),
+        :valid_hash => 'd9e201abec3f4b9e38865b5135281978'
+      }
+      @pa = BbcNewsPageParserV5.new(@valid_options)
+    end
+    it "should parse the title" do
+      @pa.title.should == "Obama invites Middle East heads"
+    end
+    it "should parse the date in UTC" do
+      # 2009/04/21 19:50:44
+      @pa.date.should == DateTime.parse("Apr 21 19:50:44 +0000 2009")
+      @pa.date.zone.should == '+00:00'
+    end
+    it "should parse the content" do
+      @pa.content.first.should == "US officials say the leaders of Israel, Egypt and the Palestinians have been invited for talks in Washington in a new push for Middle East peace."
+      @pa.content.last.should == "The US supports a two-state solution, with Israel existing peacefully alongside a Palestinian state."
+      @pa.content.size.should == 15
+    end
+    it "should decode html entities" do
+      @pa.content[8].should == 'He added: "We are actively working to finalise dates for the visits."'
+    end
+    it "should calculate a valid hash of the content" do
+      @pa.hash.should == @valid_options[:valid_hash]
+    end
+  end
+  describe "Woodward mortgage article" do
+    before do
+      @valid_options = {
+        :url => 'http://news.bbc.co.uk/1/hi/northern_ireland/8040164.stm',
+        :page => File.read("spec/fixtures/bbc_news/8040164.stm.html"),
+        :valid_hash => ''
+      }
+      @pa = BbcNewsPageParserV5.new(@valid_options)
+    end
+    it "should convert iso-8859-1 in the title to utf8" do
+      @pa.title.should == "£100K mortgage claim by Woodward"
+    end
+    it "should convert iso-8859-1 in the content to utf8" do
+      @pa.content.first.should =~ /£100,000/
+    end
+  end
+  it "should parse the content of an article with two captions" do
+    @pa = BbcNewsPageParserV5.new({ :url => "http://news.bbc.co.uk/1/hi/politics/10341015.stm",
+                                    :page => File.read("spec/fixtures/bbc_news/10341015.stm.html"),
+                                    :valid_hash => 'unknown'
+                                  })
+    @pa.content[0].should == "The coalition government has cancelled 12 projects totalling £2bn agreed to by the previous Labour government since the start of 2010."
+    @pa.content[1].should == "These include an £80m loan to Sheffield Forgemasters and new programmes for the young unemployed, Chief Secretary to the Treasury Danny Alexander told MPs."
+    @pa.content[2].should == 'Mr Alexander said the cuts were necessary to tackle the budget deficit and would be done in a "fair" way.'
+  end
+end
 describe BbcNewsPageParserV4 do
   it_should_behave_like AllPageParsers
   before do
-    @valid_options = {
+    @valid_options = {
       :url => 'http://www.bbc.co.uk/news/business-11125504',
       :page => File.read("spec/fixtures/bbc_news/11125504.html"),
       :valid_hash => 'd9e201abec3f4b9e38865b5135281978'
@@ -83,6 +277,32 @@ describe BbcNewsPageParserV4 do
     @pa.content.to_s.should_not =~ /Fake and real quotes/
   end
+  it "should retrieve the article from the bbc website" do
+    @pa = BbcNewsPageParserV4.new(:url => @valid_options[:url])
+    @pa.title.should == "UK economy 'to pick up in near term'"
+  end
+  it "should ignore the twitter widget" do
+    pa = BbcNewsPageParserV4.new(:url => "http://www.bbc.co.uk/news/world-us-canada-20230333", :page => File.read("spec/fixtures/bbc_news/20230333.stm.html"))
+    pa.title.should == "US election: Results declared from some states"
+    pa.content.first.should == "President Barack Obama and challenger Mitt Romney remain locked in a tight race as US election results stream in."
+    pa.content.to_s.should_not =~ /US Election Tweets/
+    pa.content.last.should == "Are you a voter in one of the swing states? Send us your comments on the election campaign using the form below."
+  end
+  it "should ignore the 'latest' twitter widget" do
+    pa = BbcNewsPageParserV4.new(:url => "http://www.bbc.co.uk/news/uk-19957138", :page => File.read("spec/fixtures/bbc_news/19957138.stm.html"))
+    pa.title.should == "Gary McKinnon extradition to US blocked by Theresa May"
+    pa.content.to_s.should_not =~ /High Noon for Abu Qatada?/
+    pa.content.to_s.should_not =~ /Content from Twitter./
+    pa.content.last.should == "Mr McKinnon was arrested in 2002 and again in 2005 before an order for his extradition was made in July 2006 under the 2003 Extradition Act."
+  end
+  it "should retrieve an old iso-8859-1 article without getting upset about encoding" do
+    @pa = BbcNewsPageParserV4.new(:url => "http://www.bbc.co.uk/news/magazine-20761954")
+    @pa.title.should == "Quiz of the Year: 52 weeks 52 questions, part four"
+  end
 end

data/spec/parsers/guardian_page_spec.rb CHANGED

@@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-
-$:.unshift File.join(File.dirname(__FILE__), '../../lib')
-require 'spec/base_parser_spec'
-require 'web-page-parser'
+require 'spec_helper'
 include WebPageParser
 describe GuardianPageParserFactory do
@@ -11,6 +9,9 @@ describe GuardianPageParserFactory do
                    "http://www.guardian.co.uk/commentisfree/2012/jan/27/ian-jack-battle-for-scotland",
                    "http://www.guardian.co.uk/environment/bike-blog/2012/jan/27/hgv-cyclists-safety-bike-blog",
                    "http://www.guardian.co.uk/tv-and-radio/2012/jan/26/well-take-manhattan-david-bailey",
+                   "http://www.theguardian.com/world/2013/aug/24/syria-cameron-obama-intervention",
+                   "http://www.theguardian.com/commentisfree/2013/aug/25/coalition-leaders-change-tune-rawnsley",
+                   "http://www.theguardian.com/uk-news/2013/aug/25/police-officer-cleared-taser-brighton"
                   ]
     @invalid_urls = [
                      "http://www.guardian.co.uk/business",
@@ -23,7 +24,9 @@ describe GuardianPageParserFactory do
                      "http://www.guardian.co.uk/uk/video/2012/may/13/occupy-protesters-clash-police-video",
                      "http://www.guardian.co.uk/uk/gallery/2012/may/10/public-sector-protests-in-pictures",
                      "http://www.guardian.co.uk/media/video/2012/may/24/chris-huhne-partner-privacy-case-video",
-                     "http://www.guardian.co.uk/business/poll/2012/may/09/greek-exit-euro-inevitable"
+                     "http://www.guardian.co.uk/business/poll/2012/may/09/greek-exit-euro-inevitable",
+                     "http://www.theguardian.com/global-development",
+                     "http://www.theguardian.com/uk/business"
                     ]
   end
@@ -95,4 +98,154 @@ describe GuardianPageParserV1 do
       @pa.hash.should == @valid_options[:valid_hash]
     end
   end
+  describe "when parsing the barack obama-nicki-minaj article" do
+    before do
+      @valid_options = {
+        :url => 'http://www.guardian.co.uk/music/2012/oct/16/barack-obama-nicki-minaj-mariah-carey',
+        :page => File.read("spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html"),
+        :valid_hash => '22fe55dc3664662ac6c1c79eac584754'
+      }
+      @pa = GuardianPageParserV1.new(@valid_options)
+    end
+    it "should not include +explainerText+" do
+      @pa.hash.should == @valid_options[:valid_hash]
+      @pa.content.to_s.should_not =~ /explainerText/
+    end
+  end
+  describe "when parsing the anger-grows article with the explainerText javascript" do
+    before do
+      @valid_options = {
+        :url => 'http://www.guardian.co.uk/business/2012/jan/27/anger-grows-rbs-chiefs-bonus',
+        :page => File.read("spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html"),
+        :valid_hash => '04108a9a7e3196da185e4d10432740a1'
+      }
+      @pa = GuardianPageParserV1.new(@valid_options)
+    end
+    it "should have the same hash as before" do
+      @pa.hash.should == @valid_options[:valid_hash]
+    end
+    it "should not include +explainerText+" do
+      @pa.content.to_s.should_not =~ /explainerText/
+    end
+  end
+end
+describe GuardianPageParserV2 do
+  describe "when parsing the anger-grows article" do
+    before do
+      @valid_options = {
+        :url => 'http://www.guardian.co.uk/business/2012/jan/27/anger-grows-rbs-chiefs-bonus',
+        :page => File.read("spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html"),
+        :valid_hash => '04108a9a7e3196da185e4d10432740a1'
+      }
+      @pa = GuardianPageParserV2.new(@valid_options)
+    end
+    it "should parse the title" do
+      @pa.title.should == "Anger grows over RBS chief's £900,000 bonus"
+    end
+    it "should parse the date in UTC" do
+      @pa.date.should == DateTime.parse("Fri Jan 27 12:58:53 +0000 2012")
+      @pa.date.zone.should == '+00:00'
+    end
+    it "should parse the content" do
+      @pa.content[0].should == "Ed Miliband and Boris Johnson have joined the chorus of criticism over the decision by the Royal Bank of Scotland to award its chief executive a bonus of nearly £1m."
+      @pa.content[7].should == 'Speaking from the World Economic Forum in Davos, Switzerland, Johnson described the bonus as "absolutely bewildering" and said it should have been blocked by ministers.'
+      @pa.content[38].should == '"Even to be considering this at a time when we are struggling to get our economies growing is quite simply madness," he told leaders in a speech to the World Economic Forum.'
+      @pa.content.last.should == "."
+      @pa.content.size.should == 40
+      @pa.hash.should == @valid_options[:valid_hash]
+    end
+  end
+  describe "when parsing the syria-libya-middle-east article" do
+    before do
+      @valid_options = {
+        :url => 'http://www.guardian.co.uk/world/middle-east-live/2011/jun/22/syria-libya-middle-east-unrest-live?INTCMP=ILCNETTXT3487',
+        :page => File.read("spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html"),
+        :valid_hash => 'a2ed6d79e1fd834df80e2d603b36be22' # changed from V1 due to html stripping
+      }
+      @pa = GuardianPageParserV2.new(@valid_options)
+    end
+    it "should parse the title" do
+      @pa.title.should == "Bahrain, Syria and Middle East unrest - Wednesday 22 June 2011"
+    end
+    it "should parse the content" do
+      @pa.content[0].should == "9.31am:Welcome to Middle East Live. There's so much happening across the region that it's difficult to know which stories to watch today. Here's a run down of the latest developments by country:"
+      @pa.content[1].should == "Bahrain"
+      @pa.content[6].should == "When I see children being killed, I must have misgivings. That's why I warned about the risk of civilian casualties... You can't have a decisive ending. Now is the time to do whatever we can to reach a political solution."
+      @pa.content.last.should == "(That's it from us today. Thanks for your comments)."
+      @pa.hash.should == @valid_options[:valid_hash]
+    end
+  end
+  describe "when parsing the barack obama-nicki-minaj article" do
+    before do
+      @valid_options = {
+        :url => 'http://www.guardian.co.uk/music/2012/oct/16/barack-obama-nicki-minaj-mariah-carey',
+        :page => File.read("spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html"),
+        :valid_hash => '22fe55dc3664662ac6c1c79eac584754'
+      }
+      @pa = GuardianPageParserV2.new(@valid_options)
+    end
+    it "should not include +explainerText+" do
+      @pa.hash.should == @valid_options[:valid_hash]
+      @pa.content.to_s.should_not =~ /explainerText/
+    end
+  end
+  describe "when parsing the anger-grows article with the explainerText javascript" do
+    before do
+      @valid_options = {
+        :url => 'http://www.guardian.co.uk/business/2012/jan/27/anger-grows-rbs-chiefs-bonus',
+        :page => File.read("spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html"),
+        :valid_hash => '04108a9a7e3196da185e4d10432740a1'
+      }
+      @pa = GuardianPageParserV2.new(@valid_options)
+    end
+    it "should have the same hash as before" do
+      @pa.hash.should == @valid_options[:valid_hash]
+    end
+    it "should not include +explainerText+" do
+      @pa.content.to_s.should_not =~ /explainerText/
+    end
+  end
+  describe "when parsing the nhs-patient-data article" do
+    before do
+      @valid_options = {
+        :url => 'http://www.theguardian.com/society/2014/jan/19/nhs-patient-data-available-companies-buy',
+        :page => File.read('spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html'),
+        :valid_hash => '0ae4a335bfd96ee3345350814f1e9f97'
+      }
+      @pa = GuardianPageParserV2.new(@valid_options)
+    end
+    it "should parse the title" do
+      @pa.title.should == 'NHS patient data to be made available for sale to drug and insurance companies'
+    end
+    it "should parse the content" do
+      @pa.content[0].should == 'Drug and insurance companies will from later this year be able to buy information on patients including mental health conditions and diseases such as cancer, as well as smoking and drinking habits, once a single English database of medical data has been created.'
+      @pa.content.last.should == 'A spokesperson said: "A phased rollout of care.data is being readied over a three month period with first extractions  from March allowing time for the HSCIC to assess the quality of the data and the linkage before making the data available. We think it would be wrong to exclude private companies simply on ideological grounds; instead, the test should be how the company wants to use the data to improve NHS care."'
+      @pa.content.size.should == 21
+      @pa.hash.should == @valid_options[:valid_hash]
+    end
+  end
 end