web-page-parser 0.25 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +7 -0
  2. checksums.yaml.gz.sig +1 -0
  3. data.tar.gz.sig +0 -0
  4. data/README.rdoc +5 -0
  5. data/lib/web-page-parser.rb +31 -0
  6. data/lib/web-page-parser/base_parser.rb +92 -42
  7. data/lib/web-page-parser/http.rb +63 -0
  8. data/lib/web-page-parser/parser_factory.rb +0 -1
  9. data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +72 -9
  10. data/lib/web-page-parser/parsers/guardian_page_parser.rb +51 -11
  11. data/lib/web-page-parser/parsers/independent_page_parser.rb +56 -0
  12. data/lib/web-page-parser/parsers/new_york_times_page_parser.rb +108 -0
  13. data/lib/web-page-parser/parsers/washingtonpost_page_parser.rb +59 -0
  14. data/spec/base_parser_spec.rb +24 -8
  15. data/spec/fixtures/bbc_news/19957138.stm.html +1974 -0
  16. data/spec/fixtures/bbc_news/20230333.stm.html +2529 -0
  17. data/spec/fixtures/bbc_news/21528631.html +2021 -0
  18. data/spec/fixtures/bbc_news/8040164.stm.html +3095 -0
  19. data/spec/fixtures/cassette_library/BbcNewsPageParserV4.yml +1743 -0
  20. data/spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html +4713 -0
  21. data/spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html +4371 -0
  22. data/spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html +4150 -0
  23. data/spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html +4401 -0
  24. data/spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html +4454 -0
  25. data/spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html +4455 -0
  26. data/spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html +4368 -0
  27. data/spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html +919 -0
  28. data/spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html +328 -0
  29. data/spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html +1164 -0
  30. data/spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html +1 -0
  31. data/spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html +3664 -0
  32. data/spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html +3729 -0
  33. data/spec/parser_factory_spec.rb +3 -3
  34. data/spec/parsers/bbc_news_page_spec.rb +223 -3
  35. data/spec/parsers/guardian_page_spec.rb +157 -4
  36. data/spec/parsers/independent_page_parser_spec.rb +152 -0
  37. data/spec/parsers/new_york_times_page_parser_spec.rb +190 -0
  38. data/spec/parsers/washingtonpost_page_parser_spec.rb +114 -0
  39. data/spec/spec_helper.rb +5 -0
  40. metadata +167 -59
  41. metadata.gz.sig +2 -0
checksums.yaml
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+ metadata.gz: e1076ec5c2d36f32055c1d8996bd632ae9ec8c41
+ data.tar.gz: bd448302e1e04cf6d022f747959b0740367e75af
+ SHA512:
+ metadata.gz: 31f642be9f27c32b59fd2cdf0e1fd19f17fb6f0f55f10d506cae97923bc72ca64b508aa296f2bce5345a6bbbec98d7b8ffd3f7fe92ae80fff914dad29e906c16
+ data.tar.gz: 330bfa9cf1e96e7c0cd98c51ca1f0d63de85fcfec11d074fdcac5ac538d86dd3e6f7237f39d28c24d32121fe04fd6331caf4c96d674b7f1923c5262b53b89574
checksums.yaml.gz.sig
@@ -0,0 +1 @@
+ (binary signature data)

data.tar.gz.sig
Binary file
data/README.rdoc
@@ -31,6 +31,11 @@ they change.
  puts page.date # 2009-05-09T18:58:59+00:00
  puts page.content.first # The wife of author Ken Follett and ...

+ == Ruby 1.8 support
+
+ Installing the Oniguruma gem on Ruby 1.8 will make Web Page Parser run
+ faster, it's highly recommended but not required.
+
  == More Info

  Web Page Parser was written by {John Leach}[http://johnleach.co.uk]
data/lib/web-page-parser.rb
@@ -1,4 +1,35 @@
  # $:.unshift File.join(File.dirname(__FILE__), 'web-page-parser')

+ # Try using oniguruma on Ruby 1.8, if it's available
+ if RUBY_VERSION =~ /^1.8/
+ begin
+ require 'oniguruma'
+ rescue LoadError
+ end
+ end
+
+ # New Sniffer was originally developed against oniguruma, so when it's
+ # not available we just provide a compatible interface. This is a bit
+ # silly, especially for Ruby 1.9 (where it's built in!), but it saves
+ # changing lots of code.
+ unless defined?(Oniguruma)
+ module Oniguruma
+ class ORegexp < Regexp
+
+ def self.new(r, options = "")
+ ropts = 0
+ ropts = ropts | Regexp::MULTILINE if options =~ /m/
+ ropts = ropts | Regexp::IGNORECASE if options =~ /i/
+ super(r, ropts)
+ end
+
+ def gsub(a, b)
+ a.gsub(self, b)
+ end
+ end
+ end
+ end
+
+ require 'web-page-parser/http.rb'
  require 'web-page-parser/base_parser.rb'
  require 'web-page-parser/parser_factory.rb'
data/lib/web-page-parser/base_parser.rb
@@ -2,46 +2,115 @@
  module WebPageParser
  require 'digest'
  require 'date'
- require 'oniguruma'
  require 'htmlentities'
- require 'iconv'

- # BaseParse is designed to be sub-classed to write new parsers. It
- # provides some basic help but most of the work needs to be done by
- # the sub-class.
+ class RetrieveError < StandardError ; end
+
+ class BaseParser
+
+ class << self
+ attr_accessor :retrieve_session
+ end
+
+ attr_reader :url, :guid
+
+ # takes a hash of options. The :url option passes the page url, and
+ # the :page option passes the raw html page content for parsing
+ def initialize(options = { })
+ @url = options[:url]
+ @page = options[:page]
+ end
+
+ # return the page contents, retrieving it from the server if necessary
+ def page
+ @page ||= retrieve_page
+ end
+
+ # request the page from the server and return the raw contents
+ def retrieve_page(rurl = nil)
+ durl = rurl || url
+ return nil unless durl
+ durl = filter_url(durl) if self.respond_to?(:filter_url)
+ self.class.retrieve_session ||= WebPageParser::HTTP::Session.new
+ self.class.retrieve_session.get(durl)
+ end
+
+ def title
+ @title
+ end
+
+ def content
+ @content || []
+ end
+
+ def date
+ end
+
+ # Return a hash representing the textual content of this web page
+ def hash
+ digest = Digest::MD5.new
+ digest << title.to_s
+ digest << content.join('').to_s
+ digest.to_s
+ end
+
+ end
+
+ # BaseRegexpParser is designed to be sub-classed to write new
+ # parsers that use regular. It provides some basic help but most of
+ # the work needs to be done by the sub-class.
  #
  # Simple pages could be implemented by just defining new regular
  # expression constants, but more advanced parsing can be achieved
  # with the *_processor methods.
  #
- class BaseParser
+ class BaseRegexpParser < BaseParser
  include Oniguruma

- attr_reader :url, :guid, :page
-
- ICONV = Iconv.new("utf8", "iso-8859-1")

  # The regular expression to extract the title
  TITLE_RE = //
-
+
  # The regular expression to extract the date
  DATE_RE = //
-
+
  # The regular expression to extract the content
  CONTENT_RE = //
-
+
  # The regular expression to find all characters that should be
  # removed from any content.
  KILL_CHARS_RE = ORegexp.new('[\n\r]+')
-
+
  # The object used to turn HTML entities into real charaters
  HTML_ENTITIES_DECODER = HTMLEntities.new

- # takes a has of options. The :url option passes the page url, and
- # the :page option passes the raw html page content for parsing
  def initialize(options = { })
- @url = options[:url]
- @page = options[:page]
+ super(options)
+ @page = encode(@page)
+ end
+
+ # Handle any string encoding
+ def encode(s)
+ return s if s.nil?
+ return s if s.valid_encoding?
+ if s.force_encoding("iso-8859-1").valid_encoding?
+ return s.encode('utf-8', 'iso-8859-1')
+ end
+ s
+ end
+
+ # return the page contents, retrieving it from the server if necessary
+ def page
+ @page ||= retrieve_page
+ end
+
+ # request the page from the server and return the raw contents
+ def retrieve_page(rurl = nil)
+ durl = rurl || url
+ return nil unless durl
+ durl = filter_url(durl) if self.respond_to?(:filter_url)
+ self.class.retrieve_session ||= WebPageParser::HTTP::Session.new
+ encode(self.class.retrieve_session.get(durl))
  end

  # The title method returns the title of the web page.
@@ -54,7 +123,6 @@ module WebPageParser
  if matches = class_const(:TITLE_RE).match(page)
  @title = matches[1].to_s.strip
  title_processor
- @title = iconv(@title)
  @title = decode_entities(@title)
  end
  end
@@ -89,59 +157,41 @@ module WebPageParser
  matches = class_const(:CONTENT_RE).match(page)
  if matches
  @content = class_const(:KILL_CHARS_RE).gsub(matches[1].to_s, '')
- @content = iconv(@content)
  content_processor
  @content.collect! { |p| decode_entities(p.strip) }
- @content.delete_if { |p| p == '' or p.nil? }
+ @content.delete_if { |p| p == '' or p.nil? }
  end
  @content = [] if @content.nil?
  @content
  end

- # Return a hash representing the textual content of this web page
- def hash
- digest = Digest::MD5.new
- digest << title.to_s
- digest << content.to_s
- digest.to_s
- end
-
  # Convert html entities to unicode
  def decode_entities(s)
  HTML_ENTITIES_DECODER.decode(s)
  end
-
+
  private
-
+
  # get the constant from this objects class
  def class_const(sym)
  self.class.const_get(sym)
  end

- # Convert the encoding of the given text if necessary
- def iconv(s)
- if class_const(:ICONV) and ICONV
- class_const(:ICONV).iconv(s)
- else
- s
- end
- end
-
  # Custom content parsing. It should split the @content up into an
  # array of paragraphs. Conversion to utf8 is done after this method.
  def content_processor
  @content = @content.split(/<p>/)
  end
-
+
  # Custom date parsing. It should parse @date into a DateTime object
  def date_processor
  end
-
+
  # Custom title parsing. It should clean up @title as
  # necessary. Conversion to utf8 is done after this method.
  def title_processor
  end
-
+
  end

data/lib/web-page-parser/http.rb
@@ -0,0 +1,63 @@
+ module WebPageParser
+ module HTTP
+ require 'curb'
+ require 'zlib'
+
+ class Response < String
+ attr_accessor :curl
+
+ def initialize(s, curl)
+ self.curl = curl
+ super(s)
+ end
+ end
+
+ class Session
+
+ class CurlError < StandardError ; end
+
+ def curl
+ @curl ||= Curl::Easy.new do |c|
+ c.timeout = 8
+ c.connect_timeout = 8
+ c.dns_cache_timeout = 600
+ c.enable_cookies = true
+ c.follow_location = true
+ c.autoreferer = true
+ c.headers["User-Agent"] = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4'
+ c.headers["Accept-encoding"] = 'gzip, deflate'
+ end
+ end
+
+ def get(url)
+ curl.url = url
+ if curl.perform == false
+ raise CurlError, "curl.perform returned false"
+ end
+ uncompressed = gunzip(curl.body_str)
+ uncompressed = inflate(curl.body_str) if uncompressed.nil?
+ final_body = uncompressed || curl.body_str
+ if final_body.respond_to?(:force_encoding)
+ # Not sure if this is right. works for BBC/Guardian/New York Times anyway
+ final_body.force_encoding("utf-8")
+ end
+ Response.new(final_body, curl)
+ end
+
+ def inflate(s)
+ Zlib::Inflate.inflate(s)
+ rescue Zlib::DataError
+ nil
+ end
+
+ def gunzip(s)
+ s = StringIO.new(s)
+ Zlib::GzipReader.new(s).read
+ rescue Zlib::DataError
+ rescue Zlib::GzipFile::Error
+ nil
+ end
+ end
+ end
+
+ end
data/lib/web-page-parser/parser_factory.rb
@@ -1,5 +1,4 @@
  module WebPageParser
- require 'oniguruma'
  class ParserFactory
  include Oniguruma

data/lib/web-page-parser/parsers/bbc_news_page_parser.rb
@@ -15,7 +15,7 @@ module WebPageParser
  end

  def self.create(options = {})
- BbcNewsPageParserV4.new(options)
+ BbcNewsPageParserV5.new(options)
  end
  end

@@ -23,7 +23,7 @@ module WebPageParser
  # old News Sniffer BbcNewsPage class did. This should only ever
  # be used for backwards compatability with News Sniffer and is
  # never supplied for use by a factory.
- class BbcNewsPageParserV1 < WebPageParser::BaseParser
+ class BbcNewsPageParserV1 < WebPageParser::BaseRegexpParser

  TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
  DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
@@ -34,11 +34,11 @@ module WebPageParser

  def hash
  # Old News Sniffer only hashed the content, not the title
- Digest::MD5.hexdigest(content.to_s)
+ Digest::MD5.hexdigest(content.join('').to_s)
  end

  private
-
+
  def date_processor
  begin
  # OPD is in GMT/UTC, which DateTime seems to use by default
@@ -58,7 +58,7 @@ module WebPageParser
  end

  # BbcNewsPageParserV2 parses BBC News web pages
- class BbcNewsPageParserV2 < WebPageParser::BaseParser
+ class BbcNewsPageParserV2 < WebPageParser::BaseRegexpParser

  TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
  DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
@@ -96,8 +96,8 @@ module WebPageParser
  CONTENT_RE = ORegexp.new('<div id="story\-body">(.*?)<div class="bookmark-list">', 'm')
  STRIP_FEATURES_RE = ORegexp.new('<div class="story-feature">(.*?)</div>', 'm')
  STRIP_MARKET_DATA_WIDGET_RE = ORegexp.new('<\!\-\- S MD_WIDGET.*? E MD_WIDGET \-\->')
- ICONV = nil # BBC news is now in utf8
-
+ # BBC news is now in utf8
+
  def content_processor
  @content = STRIP_FEATURES_RE.gsub(@content, '')
  @content = STRIP_MARKET_DATA_WIDGET_RE.gsub(@content, '')
@@ -118,7 +118,8 @@ module WebPageParser
  STRIP_HYPERPUFF_RE = ORegexp.new('<div class=.embedded-hyper.+?<div class=.hyperpuff.+?</div>.+?</div>', 'm')
  STRIP_MARKETDATA_RE = ORegexp.new('<div class=.market\-data.+?</div>', 'm')
  STRIP_EMBEDDEDHYPER_RE = ORegexp.new('<div class=.embedded\-hyper.+?</div>', 'm')
-
+ STRIP_TWITTER_WIDGET_RE = ORegexp.new('<div[^>]+twitter\-module.*?</ul>','m')
+ STRIP_TWITTER_WIDGET2_RE = ORegexp.new('<ul[^>]+tweets.+?</ul>.+?<ul[^>]+links.+?</ul>', 'm')
  def content_processor
  @content = STRIP_PAGE_BOOKMARKS.gsub(@content, '')
  @content = STRIP_STORY_DATE.gsub(@content, '')
@@ -131,8 +132,70 @@ module WebPageParser
  @content = STRIP_HYPERPUFF_RE.gsub(@content, '')
  @content = STRIP_MARKETDATA_RE.gsub(@content, '')
  @content = STRIP_EMBEDDEDHYPER_RE.gsub(@content, '')
+ @content = STRIP_TWITTER_WIDGET_RE.gsub(@content, '')
+ @content = STRIP_TWITTER_WIDGET2_RE.gsub(@content, '')
  super
  end
  end
-
+
+
+ class BbcNewsPageParserV5 < WebPageParser::BaseParser
+ require 'nokogiri'
+
+ def html_doc
+ @html_document ||= Nokogiri::HTML(page)
+ end
+
+ def title
+ return @title if @title
+ @title = html_doc.css('h1.story-header').text.strip
+
+ # for older bbc articles
+ if @title.empty?
+ @title = html_doc.css('div#meta-information h1').text.strip
+ end
+
+ # for very old bbc articles
+ if @title.empty?
+ if headline_meta = html_doc.at_css('meta[name=Headline]')
+ @title = headline_meta['content'].to_s.strip
+ end
+ end
+
+ @title
+ end
+
+ def content
+ return @content if @content
+ @content = []
+ story_body = html_doc.css('div.story-body')
+
+ # for older bbc articles
+ if story_body.children.empty?
+ story_body = html_doc.css('div#story-body')
+ end
+
+ # for very old bbc articles
+ if story_body.children.empty?
+ story_body = html_doc.css('td.storybody')
+ end
+
+ story_body.children.each do |n|
+ @content << n.text.strip if n.name == 'p'
+ @content << n.text.strip if n.name == 'span' and n['class'].include? 'cross-head'
+ end
+ @content
+ end
+
+ def date
+ return @date if @date
+ if date_meta = html_doc.at_css('meta[name=OriginalPublicationDate]')
+ @date = DateTime.parse(date_meta['content']) rescue nil
+ end
+ @date
+ end
+
+ end
+
+
  end
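
A minimal usage sketch of the 1.0.0 API introduced above (the option names, accessors and parser classes come straight from the hunks; the file name and URL are placeholders, and the gem plus its dependencies such as nokogiri, curb and htmlentities are assumed to be installed):

  require 'web-page-parser'

  # Parse an already-fetched BBC article body; :page and :url are the documented options
  parser = WebPageParser::BbcNewsPageParserV5.new(:page => File.read("article.html"))

  puts parser.title         # headline, extracted with Nokogiri CSS selectors
  puts parser.content.first # first paragraph of the story body
  puts parser.date          # DateTime parsed from the OriginalPublicationDate meta tag
  puts parser.hash          # MD5 digest of title plus content

  # Or let the factory pick the current parser version and fetch the page over HTTP
  parser = WebPageParser::BbcNewsPageParserFactory.create(:url => "http://news.bbc.co.uk/...")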