RubyGems - web-page-parser - Versions diffs - 0.10 → 0.21 - Mend

web-page-parser 0.10 → 0.21

Files changed (7):

data/lib/web-page-parser/base_parser.rb +1 -1
data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +45 -5
data/spec/fixtures/bbc_news/10249066.stm.html +1361 -0
data/spec/fixtures/bbc_news/10341015.stm.html +1278 -0
data/spec/fixtures/bbc_news/11125504.html +1481 -0
data/spec/parsers/bbc_news_page_spec.rb +65 -3
metadata +58 -23

data/lib/web-page-parser/base_parser.rb (changed)

@@ -120,7 +120,7 @@ module WebPageParser
     # Convert the encoding of the given text if necessary
     def iconv(s)
-      if class_const(:ICONV)
+      if class_const(:ICONV) and ICONV
         class_const(:ICONV).iconv(s)
       else
         s

data/lib/web-page-parser/parsers/bbc_news_page_parser.rb (changed)

@@ -2,8 +2,8 @@
 module WebPageParser
     class BbcNewsPageParserFactory < WebPageParser::ParserFactory
-      URL_RE = ORegexp.new("news\.bbc\.co\.uk/.*/[0-9]+\.stm")
-      INVALID_URL_RE = ORegexp.new("in_pictures|pop_ups")
+      URL_RE = ORegexp.new("(www|news)\.bbc\.co\.uk/.+/([a-z]+-)?[0-9]+(\.stm)?$")
+      INVALID_URL_RE = ORegexp.new("in_pictures|pop_ups|sport1")
       def self.can_parse?(options)
         if INVALID_URL_RE.match(options[:url])
@@ -14,7 +14,7 @@ module WebPageParser
       end
       def self.create(options = {})
-        BbcNewsPageParserV2.new(options)
+        BbcNewsPageParserV4.new(options)
       end
     end
@@ -65,9 +65,9 @@ module WebPageParser
       STRIP_BLOCKS_RE = ORegexp.new('<(table|noscript|script|object|form)[^>]*>.*?</\1>', 'i')
       STRIP_TAGS_RE = ORegexp.new('</?(b|div|img|tr|td|br|font|span)[^>]*>','i')
       STRIP_COMMENTS_RE = ORegexp.new('<!--.*?-->')
-      STRIP_CAPTIONS_RE = ORegexp.new('<!-- caption .+<!-- END - caption -->')
+      STRIP_CAPTIONS_RE = ORegexp.new('<!-- caption .+?<!-- END - caption -->')
       WHITESPACE_RE = ORegexp.new('[\t ]+')
-      PARA_RE = Regexp.new('</?p[^>]*>')
+      PARA_RE = Regexp.new('</?p[^>]*>', Regexp::IGNORECASE)
       private
@@ -90,4 +90,44 @@ module WebPageParser
       end
     end
+    class BbcNewsPageParserV3 < BbcNewsPageParserV2
+      CONTENT_RE = ORegexp.new('<div id="story\-body">(.*?)<div class="bookmark-list">', 'm')
+      STRIP_FEATURES_RE = ORegexp.new('<div class="story-feature">(.*?)</div>', 'm')
+      STRIP_MARKET_DATA_WIDGET_RE = ORegexp.new('<\!\-\- S MD_WIDGET.*? E MD_WIDGET \-\->')
+      ICONV = nil # BBC news is now in utf8
+      def content_processor
+        @content = STRIP_FEATURES_RE.gsub(@content, '')
+        @content = STRIP_MARKET_DATA_WIDGET_RE.gsub(@content, '')
+        super
+      end
+    end
+    class BbcNewsPageParserV4 < BbcNewsPageParserV3
+      CONTENT_RE = ORegexp.new('<div class=.story-body.>(.*?)<!-- / story\-body', 'm')
+      STRIP_PAGE_BOOKMARKS = ORegexp.new('<div id="page-bookmark-links-head".+?</div>', 'm')
+      STRIP_STORY_DATE = ORegexp.new('<span class="date".+?</span>', 'm')
+      STRIP_STORY_LASTUPDATED = ORegexp.new('<span class="time\-text".+?</span>', 'm')
+      STRIP_STORY_TIME = ORegexp.new('<span class="time".+?</span>', 'm')
+      TITLE_RE = ORegexp.new('<h1 class="story\-header">(.+?)</h1>', 'm')
+      STRIP_CAPTIONS_RE2 = ORegexp.new('<div class=.caption.+?</div>','m')
+      STRIP_HIDDEN_A = ORegexp.new('<a class=.hidden.+?</a>','m')
+      STRIP_STORY_FEATURE = ORegexp.new('<div class=.story\-feature.+?</div>', 'm')
+      STRIP_HYPERPUFF_RE = ORegexp.new('<div class=.embedded-hyper.+?<div class=.hyperpuff.+?</div>.+?</div>', 'm')
+      def content_processor
+        @content = STRIP_PAGE_BOOKMARKS.gsub(@content, '')
+        @content = STRIP_STORY_DATE.gsub(@content, '')
+        @content = STRIP_STORY_LASTUPDATED.gsub(@content, '')
+        @content = STRIP_STORY_TIME.gsub(@content, '')
+        @content = TITLE_RE.gsub(@content, '')
+        @content = STRIP_CAPTIONS_RE2.gsub(@content, '')
+        @content = STRIP_HIDDEN_A.gsub(@content, '')
+        @content = STRIP_STORY_FEATURE.gsub(@content, '')
+        @content = STRIP_HYPERPUFF_RE.gsub(@content, '')
+        super
+      end
+    end
 end