RubyGems - web-page-parser - Versions diffs - 0.23 → 0.25 - Mend

web-page-parser 0.23 → 0.25

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (9)

data/LICENSE +1 -1
data/README.rdoc +17 -7
data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +3 -2
data/lib/web-page-parser/parsers/guardian_page_parser.rb +45 -0
data/spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html +3556 -0
data/spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html +7624 -0
data/spec/parsers/bbc_news_page_spec.rb +1 -1
data/spec/parsers/guardian_page_spec.rb +98 -0
metadata +56 -65

data/spec/parsers/bbc_news_page_spec.rb CHANGED

@@ -14,7 +14,7 @@ describe BbcNewsPageParserFactory do
                    "http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm",
                    "http://www.bbc.co.uk/news/business-11125504",
                    "http://www.bbc.co.uk/news/10604897",
-                   "http://www.bbc.co.uk/news/world-middle-east-13373006"
+                   "http://www.bbc.co.uk/news/world-middle-east-18229870#sa-ns_mchannel=rss&ns_source=PublicRSS20-sa"
                   ]
     @invalid_urls = [
                      "http://news.bbc.co.uk/2/hi/health/default.stm",

data/spec/parsers/guardian_page_spec.rb ADDED

@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+$:.unshift File.join(File.dirname(__FILE__), '../../lib')
+require 'spec/base_parser_spec'
+require 'web-page-parser'
+include WebPageParser
+describe GuardianPageParserFactory do
+  before do
+    @valid_urls = [
+                   "http://www.guardian.co.uk/business/2012/jan/27/anger-grows-rbs-chiefs-bonus",
+                   "http://www.guardian.co.uk/commentisfree/2012/jan/27/ian-jack-battle-for-scotland",
+                   "http://www.guardian.co.uk/environment/bike-blog/2012/jan/27/hgv-cyclists-safety-bike-blog",
+                   "http://www.guardian.co.uk/tv-and-radio/2012/jan/26/well-take-manhattan-david-bailey",
+                  ]
+    @invalid_urls = [
+                     "http://www.guardian.co.uk/business",
+                     "http://www.guardian.co.uk/mobile/apps",
+                     "http://www.guardian.co.uk/business/nils-pratley-on-finance",
+                     "http://www.guardian.co.uk/commentisfree/commentisfree+uk/uk",
+                     "http://www.guardian.co.uk/help/feeds",
+                     "http://www.guardian.co.uk/uk/cartoon/2012/jan/28/nicolas-sarkozy-caricature",
+                     "http://www.guardian.co.uk/commentisfree/poll/2012/jan/30/smacking-children-david-lammy",
+                     "http://www.guardian.co.uk/uk/video/2012/may/13/occupy-protesters-clash-police-video",
+                     "http://www.guardian.co.uk/uk/gallery/2012/may/10/public-sector-protests-in-pictures",
+                     "http://www.guardian.co.uk/media/video/2012/may/24/chris-huhne-partner-privacy-case-video",
+                     "http://www.guardian.co.uk/business/poll/2012/may/09/greek-exit-euro-inevitable"
+                    ]
+  end
+  it "should detect guardian articles from the url" do
+    @valid_urls.each do |url|
+      GuardianPageParserFactory.can_parse?(:url => url).should be_true
+    end
+  end
+  it "should ignore pages with the wrong url format" do
+    @invalid_urls.each do |url|
+      GuardianPageParserFactory.can_parse?(:url => url).should be_nil
+    end
+  end
+end
+describe GuardianPageParserV1 do
+  describe "when parsing the anger-grows article" do
+    before do
+      @valid_options = {
+        :url => 'http://www.guardian.co.uk/business/2012/jan/27/anger-grows-rbs-chiefs-bonus',
+        :page => File.read("spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html"),
+        :valid_hash => '04108a9a7e3196da185e4d10432740a1'
+      }
+      @pa = GuardianPageParserV1.new(@valid_options)
+    end
+    it "should parse the title" do
+      @pa.title.should == "Anger grows over RBS chief's £900,000 bonus"
+    end
+    it "should parse the date in UTC" do
+      @pa.date.should == DateTime.parse("Fri Jan 27 12:58:53 +0000 2012")
+      @pa.date.zone.should == '+00:00'
+    end
+    it "should parse the content" do
+      @pa.content[0].should == "Ed Miliband and Boris Johnson have joined the chorus of criticism over the decision by the Royal Bank of Scotland to award its chief executive a bonus of nearly £1m."
+      @pa.content[7].should == 'Speaking from the World Economic Forum in Davos, Switzerland, Johnson described the bonus as "absolutely bewildering" and said it should have been blocked by ministers.'
+      @pa.content[38].should == '"Even to be considering this at a time when we are struggling to get our economies growing is quite simply madness," he told leaders in a speech to the World Economic Forum.'
+      @pa.content.last.should == "."
+      @pa.content.size.should == 40
+      @pa.hash.should == @valid_options[:valid_hash]
+    end
+  end
+  describe "when parsing the syria-libya-middle-east article" do
+    before do
+      @valid_options = {
+        :url => 'http://www.guardian.co.uk/world/middle-east-live/2011/jun/22/syria-libya-middle-east-unrest-live?INTCMP=ILCNETTXT3487',
+        :page => File.read("spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html"),
+        :valid_hash => '19427d70638b8d787a004f31ede29757'
+      }
+      @pa = GuardianPageParserV1.new(@valid_options)
+    end
+    it "should parse the title" do
+      @pa.title.should == "Bahrain, Syria and Middle East unrest - Wednesday 22 June 2011"
+    end
+    it "should parse the content" do
+      @pa.content[0].should == "9.31am:Welcome to Middle East Live. There's so much happening across the region that it's difficult to know which stories to watch today. Here's a run down of the latest developments by country:"
+      @pa.content[1].should == "Bahrain"
+      @pa.content[6].should == "When I see children being killed, I must have misgivings. That's why I warned about the risk of civilian casualties... You can't have a decisive ending. Now is the time to do whatever we can to reach a political solution."
+      @pa.content.last.should == "(That's it from us today. Thanks for your comments)."
+      @pa.hash.should == @valid_options[:valid_hash]
+    end
+  end
+end

metadata CHANGED

@@ -1,74 +1,71 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: web-page-parser
-version: !ruby/object:Gem::Version
-  hash: 37
-  prerelease: false
-  segments:
-  - 0
-  - 23
-  version: "0.23"
+version: !ruby/object:Gem::Version
+  version: '0.25'
+  prerelease:
 platform: ruby
-authors:
+authors:
 - John Leach
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-05-15 00:00:00 +01:00
-default_executable:
-dependencies:
-- !ruby/object:Gem::Dependency
+date: 2012-06-05 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
   name: oniguruma
-  prerelease: false
-  requirement: &id001 !ruby/object:Gem::Requirement
+  requirement: !ruby/object:Gem::Requirement
     none: false
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        hash: 19
-        segments:
-        - 1
-        - 1
-        - 0
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
         version: 1.1.0
   type: :runtime
-  version_requirements: *id001
-- !ruby/object:Gem::Dependency
-  name: htmlentities
   prerelease: false
-  requirement: &id002 !ruby/object:Gem::Requirement
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.1.0
+- !ruby/object:Gem::Dependency
+  name: htmlentities
+  requirement: !ruby/object:Gem::Requirement
     none: false
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        hash: 63
-        segments:
-        - 4
-        - 0
-        - 0
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
         version: 4.0.0
   type: :runtime
-  version_requirements: *id002
-description: A Ruby library to parse the content out of web pages, such as BBC News pages.  Used by the News Sniffer project.
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 4.0.0
+description: A Ruby library to parse the content out of web pages, such as BBC News
+  pages and Guardian articles. Used by the News Sniffer project.
 email: john@johnleach.co.uk
 executables: []
 extensions: []
-extra_rdoc_files:
+extra_rdoc_files:
 - README.rdoc
 - LICENSE
-files:
+files:
 - lib/web-page-parser/parser_factory.rb
 - lib/web-page-parser/base_parser.rb
 - lib/web-page-parser/parsers/test_page_parser.rb
+- lib/web-page-parser/parsers/guardian_page_parser.rb
 - lib/web-page-parser/parsers/bbc_news_page_parser.rb
 - lib/web-page-parser.rb
 - README.rdoc
 - LICENSE
 - spec/parser_factory_spec.rb
 - spec/base_parser_spec.rb
+- spec/parsers/guardian_page_spec.rb
 - spec/parsers/bbc_news_page_spec.rb
+- spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
+- spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
 - spec/fixtures/bbc_news/10341015.stm.html
 - spec/fixtures/bbc_news/8029015.stm.html
 - spec/fixtures/bbc_news/7745137.stm.html
@@ -80,44 +77,37 @@ files:
 - spec/fixtures/bbc_news/11125504.html
 - spec/fixtures/bbc_news/6072486.stm.html
 - spec/spec.opts
-has_rdoc: true
 homepage: http://github.com/johnl/web-page-parser/tree/master
 licenses: []
 post_install_message:
 rdoc_options: []
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
+required_ruby_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ">="
-    - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
-      version: "0"
-required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ">="
-    - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
-      version: "0"
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
 rubyforge_project: web-page-parser
-rubygems_version: 1.3.7
+rubygems_version: 1.8.23
 signing_key:
 specification_version: 3
 summary: A parser for web pages
-test_files:
+test_files:
 - spec/parser_factory_spec.rb
 - spec/base_parser_spec.rb
+- spec/parsers/guardian_page_spec.rb
 - spec/parsers/bbc_news_page_spec.rb
+- spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
+- spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
 - spec/fixtures/bbc_news/10341015.stm.html
 - spec/fixtures/bbc_news/8029015.stm.html
 - spec/fixtures/bbc_news/7745137.stm.html
@@ -129,3 +119,4 @@ test_files:
 - spec/fixtures/bbc_news/11125504.html
 - spec/fixtures/bbc_news/6072486.stm.html
 - spec/spec.opts
+has_rdoc: true