RubyGems - web-page-parser - Versions diffs - 0.10 → 0.21 - Mend

web-page-parser 0.10 → 0.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

data/lib/web-page-parser/base_parser.rb +1 -1
data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +45 -5
data/spec/fixtures/bbc_news/10249066.stm.html +1361 -0
data/spec/fixtures/bbc_news/10341015.stm.html +1278 -0
data/spec/fixtures/bbc_news/11125504.html +1481 -0
data/spec/parsers/bbc_news_page_spec.rb +65 -3
metadata +58 -23

data/spec/parsers/bbc_news_page_spec.rb CHANGED

@@ -10,17 +10,24 @@ describe BbcNewsPageParserFactory do
                    "http://news.bbc.co.uk/1/hi/northern_ireland/7996478.stm",
                    "http://news.bbc.co.uk/1/hi/uk/7995652.stm",
                    "http://news.bbc.co.uk/1/hi/england/derbyshire/7996494.stm",
-                   "http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm"
+                   "http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm",
+                   "http://www.bbc.co.uk/news/business-11125504",
+                   "http://www.bbc.co.uk/news/10604897"
                   ]
     @invalid_urls = [
                      "http://news.bbc.co.uk/2/hi/health/default.stm",
                      "http://news.bbc.co.uk/2/low/europe/default.stm",
                      "http://news.bbc.co.uk/2/hi/in_pictures/default.stm",
                      "http://news.bbc.co.uk/sport",
+                     "http://news.bbc.co.uk/sport1/hi/tennis/8951357.stm",
                      "http://newsforums.bbc.co.uk/nol/thread.jspa?forumID=6422&edition=1&ttl=20090509133749",
                      "http://www.bbc.co.uk/blogs/nickrobinson/",
                      "http://news.bbc.co.uk/hi/english/static/in_depth/health/2000/heart_disease/default.stm",
-                     "http://news.bbc.co.uk/1/shared/spl/hi/pop_ups/08/middle_east_views_on_netanyahu0s_us_visit/html/1.stm"
+                     "http://news.bbc.co.uk/1/shared/spl/hi/pop_ups/08/middle_east_views_on_netanyahu0s_us_visit/html/1.stm",
+                     "http://www.bbc.co.uk/blogs/theeditors/",
+                     "http://www.bbc.co.uk/news/have_your_say/",
+                     "http://news.bbc.co.uk/1/hi/magazine/default.stm",
+                     "http://news.bbc.co.uk/1/hi/in_pictures/default.stm"
                     ]
   end
@@ -41,6 +48,61 @@ describe BbcNewsPageParserFactory do
   end
 end
+describe BbcNewsPageParserV4 do
+  it_should_behave_like AllPageParsers
+  before do
+    @valid_options = {
+      :url => 'http://www.bbc.co.uk/news/business-11125504',
+      :page => File.read("spec/fixtures/bbc_news/11125504.html"),
+      :valid_hash => 'd9e201abec3f4b9e38865b5135281978'
+    }
+    @pa = BbcNewsPageParserV4.new(@valid_options)
+  end
+  it "should parse the title" do
+    @pa.title.should == "UK economy 'to pick up in near term'"
+  end
+  it "should parse the content" do
+    @pa.content[0].should == "The British Chambers of Commerce (BCC) has upgraded its forecast for the UK's short term economic prospects, but said interest rates must be kept low to aid recovery."
+    @pa.content.last.should == '"Failure to get this right poses the biggest risk to recovery."'
+    @pa.content.size.should == 18
+  end
+end
+describe BbcNewsPageParserV3 do
+  it_should_behave_like AllPageParsers
+  before do
+    @valid_options = {
+      :url => 'http://news.bbc.co.uk/1/hi/england/10249066.stm',
+      :page => File.read("spec/fixtures/bbc_news/10249066.stm.html"),
+      :valid_hash => 'd9e201abec3f4b9e38865b5135281978'
+    }
+    @pa = BbcNewsPageParserV3.new(@valid_options)
+  end
+  it "should parse the content" do
+    @pa.content[0].should == 'The family of gunman Derrick Bird say they have no idea why he carried out the "horrific" shootings in Cumbria.'
+    @pa.content.last.should == '"We appreciate what they are suffering at this time. We cannot offer any reason why Derrick took it upon himself to commit these crimes."'
+    @pa.content.size.should == 24
+  end
+  it "should parse the content of an article with two captions" do
+    @pa = BbcNewsPageParserV3.new({ :url => "http://news.bbc.co.uk/1/hi/politics/10341015.stm",
+                                    :page => File.read("spec/fixtures/bbc_news/10341015.stm.html"),
+                                    :valid_hash => 'unknown'
+                                  })
+    @pa.content[0].should == "The coalition government has cancelled 12 projects totalling £2bn agreed to by the previous Labour government since the start of 2010."
+    @pa.content[1].should == "These include an £80m loan to Sheffield Forgemasters and new programmes for the young unemployed, Chief Secretary to the Treasury Danny Alexander told MPs."
+    @pa.content[2].should == 'Mr Alexander said the cuts were necessary to tackle the budget deficit and would be done in a "fair" way.'
+  end
+end
 describe BbcNewsPageParserV2 do
   it_should_behave_like AllPageParsers
   before do
@@ -140,5 +202,5 @@ describe BbcNewsPageParserV1 do
     @pa.title.should match Regexp.new("John's")
     @pa.title.should match /sucks & blows/
   end
 end

metadata CHANGED

@@ -1,7 +1,12 @@
 --- !ruby/object:Gem::Specification
 name: web-page-parser
 version: !ruby/object:Gem::Version
-  version: "0.10"
+  hash: 33
+  prerelease: false
+  segments:
+  - 0
+  - 21
+  version: "0.21"
 platform: ruby
 authors:
 - John Leach
@@ -9,29 +14,41 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-06-20 00:00:00 +01:00
+date: 2010-08-30 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
   name: oniguruma
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        hash: 19
+        segments:
+        - 1
+        - 1
+        - 0
         version: 1.1.0
-    version:
+  type: :runtime
+  version_requirements: *id001
 - !ruby/object:Gem::Dependency
   name: htmlentities
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        hash: 63
+        segments:
+        - 4
+        - 0
+        - 0
         version: 4.0.0
-    version:
+  type: :runtime
+  version_requirements: *id002
 description: A Ruby library to parse the content out of web pages, such as BBC News pages.  Used by the News Sniffer project.
 email: john@johnleach.co.uk
 executables: []
@@ -42,51 +59,69 @@ extra_rdoc_files:
 - README.rdoc
 - LICENSE
 files:
-- lib/web-page-parser
+- lib/web-page-parser/parser_factory.rb
 - lib/web-page-parser/base_parser.rb
-- lib/web-page-parser/parsers
 - lib/web-page-parser/parsers/test_page_parser.rb
 - lib/web-page-parser/parsers/bbc_news_page_parser.rb
-- lib/web-page-parser/parser_factory.rb
 - lib/web-page-parser.rb
 - README.rdoc
 - LICENSE
+- spec/parser_factory_spec.rb
+- spec/base_parser_spec.rb
+- spec/parsers/bbc_news_page_spec.rb
+- spec/fixtures/bbc_news/10341015.stm.html
+- spec/fixtures/bbc_news/8029015.stm.html
+- spec/fixtures/bbc_news/7745137.stm.html
+- spec/fixtures/bbc_news/8063681.stm.html
+- spec/fixtures/bbc_news/10249066.stm.html
+- spec/fixtures/bbc_news/8011268.stm.html
+- spec/fixtures/bbc_news/11125504.html
+- spec/fixtures/bbc_news/6072486.stm.html
+- spec/spec.opts
 has_rdoc: true
 homepage: http://github.com/johnl/web-page-parser/tree/master
+licenses: []
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
       version: "0"
-  version:
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
       version: "0"
-  version:
 requirements: []
 rubyforge_project: web-page-parser
-rubygems_version: 1.3.1
+rubygems_version: 1.3.7
 signing_key:
-specification_version: 2
+specification_version: 3
 summary: A parser for web pages
 test_files:
 - spec/parser_factory_spec.rb
 - spec/base_parser_spec.rb
-- spec/fixtures
-- spec/fixtures/bbc_news
+- spec/parsers/bbc_news_page_spec.rb
+- spec/fixtures/bbc_news/10341015.stm.html
+- spec/fixtures/bbc_news/8029015.stm.html
+- spec/fixtures/bbc_news/7745137.stm.html
 - spec/fixtures/bbc_news/8063681.stm.html
+- spec/fixtures/bbc_news/10249066.stm.html
 - spec/fixtures/bbc_news/8011268.stm.html
+- spec/fixtures/bbc_news/11125504.html
 - spec/fixtures/bbc_news/6072486.stm.html
-- spec/fixtures/bbc_news/8029015.stm.html
-- spec/fixtures/bbc_news/7745137.stm.html
-- spec/parsers
-- spec/parsers/bbc_news_page_spec.rb
 - spec/spec.opts