web-page-parser 0.10
- data/LICENSE +22 -0
- data/README.rdoc +31 -0
- data/lib/web-page-parser/base_parser.rb +149 -0
- data/lib/web-page-parser/parser_factory.rb +54 -0
- data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +93 -0
- data/lib/web-page-parser/parsers/test_page_parser.rb +15 -0
- data/lib/web-page-parser.rb +4 -0
- data/spec/base_parser_spec.rb +67 -0
- data/spec/fixtures/bbc_news/6072486.stm.html +1318 -0
- data/spec/fixtures/bbc_news/7745137.stm.html +2177 -0
- data/spec/fixtures/bbc_news/8011268.stm.html +2899 -0
- data/spec/fixtures/bbc_news/8029015.stm.html +2417 -0
- data/spec/fixtures/bbc_news/8063681.stm.html +2382 -0
- data/spec/parser_factory_spec.rb +18 -0
- data/spec/parsers/bbc_news_page_spec.rb +144 -0
- data/spec/spec.opts +4 -0
- metadata +92 -0
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
+The MIT License
+
+Copyright (c) 2009 John Leach <john@johnleach.co.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
data/README.rdoc
ADDED
@@ -0,0 +1,31 @@
+= Web Page Parser
+
+Web Page Parser is a Ruby library to parse the content out of web
+pages, such as BBC News pages. It strips all non-textual stuff out,
+leaving the title, publication date and an array of paragraphs. It
+currently only supports BBC News pages but new parsers are planned and
+can be added easily.
+
+It is used by the {News Sniffer}[http://www.newssniffer.co.uk] project, which parses and archives news
+articles to keep track of how they change.
+
+== Example usage
+
+  require 'web-page-parser'
+  require 'open-uri'
+
+  url = "http://news.bbc.co.uk/1/hi/uk/8041972.stm"
+  page_data = open(url).read
+
+  page = WebPageParser::ParserFactory.parser_for(:url => url, :page => page_data)
+
+  puts page.title         # MPs hit back over expenses claims
+  puts page.date          # 2009-05-09T18:58:59+00:00
+  puts page.content.first # The wife of author Ken Follett and ...
+
+== More Info
+
+Web Page Parser was written by {John Leach}[http://johnleach.co.uk].
+
+The code is available on {github}[http://github.com/johnl/web-page-parser/tree/master].
+
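The README says new parsers can be added easily; the pattern is a ParserFactory
subclass that recognises the site's URLs, paired with a BaseParser subclass that
defines the extraction regexps. A minimal sketch, assuming a hypothetical
example.com article layout (the class names and regular expressions below are
illustrative only, not part of the gem):

  module WebPageParser
    class ExamplePageParserFactory < ParserFactory
      # Hypothetical URL pattern, for illustration only
      URL_RE = ORegexp.new('www\.example\.com/articles/')

      def self.can_parse?(options = {})
        URL_RE.match(options[:url])
      end

      def self.create(options = {})
        ExamplePageParser.new(options)
      end
    end

    class ExamplePageParser < BaseParser
      # Assumed markup: an <h1> title, a date <span>, and a body <div>;
      # override date_processor to turn @date into a DateTime if needed
      TITLE_RE   = ORegexp.new('<h1>(.*?)</h1>', 'm')
      DATE_RE    = ORegexp.new('<span class="date">(.*?)</span>', 'm')
      CONTENT_RE = ORegexp.new('<div id="body">(.*?)</div>', 'm')
    end
  end

Defining the factory class is all the registration needed, because
ParserFactory.inherited (see parser_factory.rb below) records every subclass.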
data/lib/web-page-parser/base_parser.rb
ADDED
@@ -0,0 +1,149 @@
+
+module WebPageParser
+  require 'digest'
+  require 'date'
+  require 'oniguruma'
+  require 'htmlentities'
+  require 'iconv'
+
+  # BaseParser is designed to be sub-classed to write new parsers. It
+  # provides some basic help but most of the work needs to be done by
+  # the sub-class.
+  #
+  # Simple pages could be implemented by just defining new regular
+  # expression constants, but more advanced parsing can be achieved
+  # with the *_processor methods.
+  #
+  class BaseParser
+    include Oniguruma
+
+    attr_reader :url, :guid, :page
+
+    ICONV = Iconv.new("utf8", "iso-8859-1")
+
+    # The regular expression to extract the title
+    TITLE_RE = //
+
+    # The regular expression to extract the date
+    DATE_RE = //
+
+    # The regular expression to extract the content
+    CONTENT_RE = //
+
+    # The regular expression to find all characters that should be
+    # removed from any content.
+    KILL_CHARS_RE = ORegexp.new('[\n\r]+')
+
+    # The object used to turn HTML entities into real characters
+    HTML_ENTITIES_DECODER = HTMLEntities.new
+
+    # Takes a hash of options. The :url option passes the page url, and
+    # the :page option passes the raw html page content for parsing
+    def initialize(options = { })
+      @url = options[:url]
+      @page = options[:page]
+    end
+
+    # The title method returns the title of the web page.
+    #
+    # It does the basic extraction using the TITLE_RE regular
+    # expression and handles text encoding. More advanced parsing can
+    # be done by overriding the title_processor method.
+    def title
+      return @title if @title
+      if matches = class_const(:TITLE_RE).match(page)
+        @title = matches[1].to_s.strip
+        title_processor
+        @title = iconv(@title)
+        @title = decode_entities(@title)
+      end
+    end
+
+    # The date method returns the timestamp of the web page, as a
+    # DateTime object.
+    #
+    # It does the basic extraction using the DATE_RE regular
+    # expression but the work of converting the text into a DateTime
+    # object needs to be done by the date_processor method.
+    def date
+      return @date if @date
+      if matches = class_const(:DATE_RE).match(page)
+        @date = matches[1].to_s.strip
+        date_processor
+        @date
+      end
+    end
+
+    # The content method returns the important body text of the web page.
+    #
+    # It does basic extraction and pre-processing of the page content
+    # and then calls the content_processor method for any other more
+    # custom processing work that needs doing. Lastly, it does some
+    # basic post processing and returns the content as an array of
+    # paragraphs.
+    #
+    # When writing a new parser, the CONTENT_RE constant should be
+    # defined in the subclass. The KILL_CHARS_RE constant can be
+    # overridden if necessary.
+    def content
+      return @content if @content
+      matches = class_const(:CONTENT_RE).match(page)
+      if matches
+        @content = class_const(:KILL_CHARS_RE).gsub(matches[1].to_s, '')
+        @content = iconv(@content)
+        content_processor
+        @content.collect! { |p| decode_entities(p.strip) }
+        @content.delete_if { |p| p == '' or p.nil? }
+      end
+      @content = [] if @content.nil?
+      @content
+    end
+
+    # Return a hash representing the textual content of this web page
+    def hash
+      digest = Digest::MD5.new
+      digest << title.to_s
+      digest << content.to_s
+      digest.to_s
+    end
+
+    # Convert html entities to unicode
+    def decode_entities(s)
+      HTML_ENTITIES_DECODER.decode(s)
+    end
+
+    private
+
+    # Get the constant from this object's class
+    def class_const(sym)
+      self.class.const_get(sym)
+    end
+
+    # Convert the encoding of the given text if necessary
+    def iconv(s)
+      if class_const(:ICONV)
+        class_const(:ICONV).iconv(s)
+      else
+        s
+      end
+    end
+
+    # Custom content parsing. It should split the @content up into an
+    # array of paragraphs. Conversion to utf8 is done after this method.
+    def content_processor
+      @content = @content.split(/<p>/)
+    end
+
+    # Custom date parsing. It should parse @date into a DateTime object
+    def date_processor
+    end
+
+    # Custom title parsing. It should clean up @title as
+    # necessary. Conversion to utf8 is done after this method.
+    def title_processor
+    end
+
+  end
+
+end
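A note on the hash method above: it digests the title and the content but not
the date, so two versions of a page hash equal unless the visible text changed.
A sketch of News Sniffer-style change detection, assuming old_html and new_html
hold two fetches of the same article (both variables are hypothetical):

  old_version = WebPageParser::BbcNewsPageParserV2.new(:url => url, :page => old_html)
  new_version = WebPageParser::BbcNewsPageParserV2.new(:url => url, :page => new_html)
  # Equal hashes mean the headline and body text are unchanged
  puts "article revised" if old_version.hash != new_version.hash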
data/lib/web-page-parser/parser_factory.rb
ADDED
@@ -0,0 +1,54 @@
+module WebPageParser
+  require 'oniguruma'
+  class ParserFactory
+    include Oniguruma
+
+    # Return true if the Parser can handle the given page. options
+    # hash must have a :url key
+    def can_parse?(options = {})
+      false
+    end
+
+    # Allocate a new parser. options hash is passed to new method of
+    # parser class.
+    def create(options = {})
+      nil
+    end
+
+    @@factories = []
+
+    def self.add_factory(f)
+      @@factories << f unless @@factories.include? f
+    end
+
+    def self.factories
+      @@factories
+    end
+
+    # Return a PageParser that can parse the given page. options hash
+    # must have a :url key
+    def self.parser_for(options = {})
+      @@factories.each do |factory|
+        return factory.create(options) if factory.can_parse?(options)
+      end
+      nil
+    end
+
+    # Load all the plugins in the given directory
+    def self.load(dirname)
+      Dir.open(dirname).each do |fn|
+        next unless fn =~ /page_parser\.rb$/
+        require File.join(dirname, fn)
+      end
+    end
+
+    # Keep track of any newly defined factories
+    def self.inherited(factory)
+      self.add_factory(factory)
+    end
+
+  end
+
+  ParserFactory.load(File.join(File.dirname(__FILE__), 'parsers'))
+
+end
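Because ParserFactory.inherited pushes every subclass onto @@factories, merely
defining a factory class makes parser_for consider it; there is no explicit
registration call. A short sketch (MyParserFactory is hypothetical):

  class MyParserFactory < WebPageParser::ParserFactory
    def self.can_parse?(options = {})
      options[:url].to_s.include?('example.org')
    end

    def self.create(options = {})
      WebPageParser::BaseParser.new(options)
    end
  end

  # The inherited hook has already registered the new factory
  WebPageParser::ParserFactory.factories.include?(MyParserFactory) # => true
  WebPageParser::ParserFactory.parser_for(:url => 'http://example.org/news/1')
  # => a WebPageParser::BaseParser instance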
data/lib/web-page-parser/parsers/bbc_news_page_parser.rb
ADDED
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+module WebPageParser
+
+  class BbcNewsPageParserFactory < WebPageParser::ParserFactory
+    URL_RE = ORegexp.new('news\.bbc\.co\.uk/.*/[0-9]+\.stm')
+    INVALID_URL_RE = ORegexp.new('in_pictures|pop_ups')
+
+    def self.can_parse?(options)
+      if INVALID_URL_RE.match(options[:url])
+        nil
+      else
+        URL_RE.match(options[:url])
+      end
+    end
+
+    def self.create(options = {})
+      BbcNewsPageParserV2.new(options)
+    end
+  end
+
+  # BbcNewsPageParserV1 parses BBC News web pages exactly like the
+  # old News Sniffer BbcNewsPage class did. This should only ever
+  # be used for backwards compatibility with News Sniffer and is
+  # never supplied for use by a factory.
+  class BbcNewsPageParserV1 < WebPageParser::BaseParser
+
+    TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
+    DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
+    CONTENT_RE = ORegexp.new('S (?:SF) -->(.*?)<!-- E BO', 'm')
+    STRIP_TAGS_RE = ORegexp.new('</?(div|img|tr|td|!--|table)[^>]*>', 'i')
+    WHITESPACE_RE = ORegexp.new('\t|&nbsp;')
+    PARA_RE = Regexp.new(/<p>/i)
+
+    def hash
+      # Old News Sniffer only hashed the content, not the title
+      Digest::MD5.hexdigest(content.to_s)
+    end
+
+    private
+
+    def date_processor
+      begin
+        # OPD is in GMT/UTC, which DateTime seems to use by default
+        @date = DateTime.parse(@date)
+      rescue ArgumentError
+        @date = Time.now.utc
+      end
+    end
+
+    def content_processor
+      @content = STRIP_TAGS_RE.gsub(@content, '')
+      @content = WHITESPACE_RE.gsub(@content, '')
+      @content = decode_entities(@content)
+      @content = @content.split(PARA_RE)
+    end
+
+  end
+
+  # BbcNewsPageParserV2 parses BBC News web pages
+  class BbcNewsPageParserV2 < WebPageParser::BaseParser
+
+    TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
+    DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
+    CONTENT_RE = ORegexp.new('S BO -->(.*?)<!-- E BO', 'm')
+    STRIP_BLOCKS_RE = ORegexp.new('<(table|noscript|script|object|form)[^>]*>.*?</\1>', 'i')
+    STRIP_TAGS_RE = ORegexp.new('</?(b|div|img|tr|td|br|font|span)[^>]*>', 'i')
+    STRIP_COMMENTS_RE = ORegexp.new('<!--.*?-->')
+    STRIP_CAPTIONS_RE = ORegexp.new('<!-- caption .+<!-- END - caption -->')
+    WHITESPACE_RE = ORegexp.new('[\t ]+')
+    PARA_RE = Regexp.new('</?p[^>]*>')
+
+    private
+
+    def content_processor
+      @content = STRIP_CAPTIONS_RE.gsub(@content, '')
+      @content = STRIP_COMMENTS_RE.gsub(@content, '')
+      @content = STRIP_BLOCKS_RE.gsub(@content, '')
+      @content = STRIP_TAGS_RE.gsub(@content, '')
+      @content = WHITESPACE_RE.gsub(@content, ' ')
+      @content = @content.split(PARA_RE)
+    end
+
+    def date_processor
+      begin
+        # OPD is in GMT/UTC, which DateTime seems to use by default
+        @date = DateTime.parse(@date)
+      rescue ArgumentError
+        @date = Time.now.utc
+      end
+    end
+
+  end
+end
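For example, the factory accepts ordinary story URLs but rejects picture
galleries and pop-ups via INVALID_URL_RE (the URLs below are illustrative):

  factory = WebPageParser::BbcNewsPageParserFactory
  factory.can_parse?(:url => "http://news.bbc.co.uk/1/hi/uk/8041972.stm")
  # => truthy match data, so parser_for hands back a BbcNewsPageParserV2
  factory.can_parse?(:url => "http://news.bbc.co.uk/1/hi/in_pictures/8041972.stm")
  # => nil, so the page is skipped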
data/lib/web-page-parser/parsers/test_page_parser.rb
ADDED
@@ -0,0 +1,15 @@
+class TestPageParserFactory < WebPageParser::ParserFactory
+  @url_regexp = Regexp.new("www.example.com")
+
+  def self.can_parse?(options = {})
+    @url_regexp.match(options[:url])
+  end
+
+  def self.create(options = {})
+    TestPageParser.new(options)
+  end
+end
+
+class TestPageParser < WebPageParser::BaseParser
+
+end
data/spec/base_parser_spec.rb
ADDED
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+$:.unshift File.join(File.dirname(__FILE__), '../lib')
+require 'web-page-parser'
+
+share_as :AllPageParsers do
+  it "is initialized with a hash containing :url and :page keys" do
+    wpp = WebPageParser::BaseParser.new(@valid_options)
+    wpp.url.should == @valid_options[:url]
+    wpp.page.should == @valid_options[:page]
+  end
+
+  it "should return an empty array when there is no content available" do
+    content = WebPageParser::BaseParser.new.content
+    content.should be_a_kind_of Array
+    content.empty?.should be_true
+  end
+
+  context "when hashing the content" do
+    before :each do
+      @wpp = WebPageParser::BaseParser.new(@valid_options)
+      @hash = @wpp.hash
+    end
+
+    it "calculates a hash using the title" do
+      @wpp.instance_eval("@title='different'")
+      @wpp.hash.should_not == @hash
+    end
+
+    it "does not calculate a hash using the date" do
+      @wpp.instance_eval("@date=Time.now")
+      @wpp.hash.should == @hash
+    end
+
+    it "calculates a hash using the content" do
+      @wpp.instance_eval("@content='different'")
+      @wpp.hash.should_not == @hash
+    end
+  end
+end
+
+describe WebPageParser::BaseParser do
+  it_should_behave_like AllPageParsers
+
+  before :each do
+    @valid_options = {
+      :url => 'http://news.bbc.co.uk',
+      :page => '<html></html>',
+      :valid_hash => 'cfcd208495d565ef66e7dff9f98764da'
+    }
+  end
+
+  it "should decode basic html entities" do
+    bp = WebPageParser::BaseParser.new
+    entities = {
+      '&quot;' => '"',
+      '&apos;' => "'",
+      '&amp;' => "&",
+      '&pound;' => '£',
+      '&aacute;' => 'á'
+    }
+    entities.each do |e,v|
+      bp.decode_entities(e).should == v
+    end
+  end
+
+end