web-page-parser 0.10

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License
2
+
3
+ Copyright (c) 2009 John Leach <john@johnleach.co.uk>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
22
+
data/README.rdoc ADDED
@@ -0,0 +1,31 @@
1
+ = Web Page Parser
2
+
3
+ Web Page Parser is a Ruby library to parse the content out of web
4
+ pages, such as BBC News pages. It strips all non-textual stuff out,
5
+ leaving the title, publication date and an array of paragraphs. It
6
+ currently only supports BBC News pages but new parsers are planned and
7
+ can be added easily.
8
+
9
+ It is used by the {News Sniffer}[http://www.newssniffer.co.uk] project, which parses and archives news
10
+ articles to keep track of how they change.
11
+
12
+ == Example usage
13
+
14
+ require 'web-page-parser'
15
+ require 'open-uri'
16
+
17
+ url = "http://news.bbc.co.uk/1/hi/uk/8041972.stm"
18
+ page_data = open(url).read
19
+
20
+ page = WebPageParser::ParserFactory.parser_for(:url => url, :page => page_data)
21
+
22
+ puts page.title # MPs hit back over expenses claims
23
+ puts page.date # 2009-05-09T18:58:59+00:00
24
+ puts page.content.first # The wife of author Ken Follett and ...
25
+
26
+ == More Info
27
+
28
+ Web Page Parser was written by {John Leach}[http://johnleach.co.uk].
29
+
30
+ The code is available on {github}[http://github.com/johnl/web-page-parser/tree/master].
31
+

module WebPageParser
  require 'digest'
  require 'date'
  require 'oniguruma'
  require 'htmlentities'
  require 'iconv'

  # BaseParser is the abstract parent class for all page parsers. It
  # provides the common extraction pipeline; sub-classes do the real
  # work.
  #
  # Simple pages can be handled by just overriding the regular
  # expression constants below; anything more involved goes in the
  # *_processor hook methods.
  #
  class BaseParser
    include Oniguruma

    attr_reader :url, :guid, :page

    # Converter used to normalise page text to UTF-8.
    ICONV = Iconv.new("utf8", "iso-8859-1")

    # Regexp whose first capture group is the page title.
    TITLE_RE = //

    # Regexp whose first capture group is the publication date.
    DATE_RE = //

    # Regexp whose first capture group is the body content.
    CONTENT_RE = //

    # Characters stripped out of any extracted content.
    KILL_CHARS_RE = ORegexp.new('[\n\r]+')

    # Decoder used to turn HTML entities into real characters.
    HTML_ENTITIES_DECODER = HTMLEntities.new

    # Takes a hash of options. :url is the page url and :page is the
    # raw html page content to be parsed.
    def initialize(options = { })
      @url = options[:url]
      @page = options[:page]
    end

    # Returns the title of the web page (memoized).
    #
    # Basic extraction uses the TITLE_RE regular expression, followed
    # by encoding conversion and entity decoding. Sub-classes can do
    # more advanced clean-up by overriding title_processor.
    def title
      return @title if @title
      md = class_const(:TITLE_RE).match(page)
      if md
        @title = md[1].to_s.strip
        title_processor
        @title = iconv(@title)
        @title = decode_entities(@title)
      end
    end

    # Returns the timestamp of the web page (memoized).
    #
    # Basic extraction uses the DATE_RE regular expression; turning the
    # captured text into a DateTime object is the job of the
    # date_processor hook.
    def date
      return @date if @date
      md = class_const(:DATE_RE).match(page)
      if md
        @date = md[1].to_s.strip
        date_processor
        @date
      end
    end

    # Returns the important body text of the page as an array of
    # paragraphs (empty array when nothing matched).
    #
    # CONTENT_RE does the raw extraction and KILL_CHARS_RE strips
    # unwanted characters; content_processor then splits the text into
    # paragraphs before the common post-processing (entity decoding,
    # blank-paragraph removal) happens here.
    #
    # New parsers should define CONTENT_RE in the sub-class and may
    # override KILL_CHARS_RE if necessary.
    def content
      return @content if @content
      md = class_const(:CONTENT_RE).match(page)
      if md
        @content = class_const(:KILL_CHARS_RE).gsub(md[1].to_s, '')
        @content = iconv(@content)
        content_processor
        @content.map! { |para| decode_entities(para.strip) }
        @content.delete_if { |para| para.nil? || para == '' }
      end
      @content ||= []
      @content
    end

    # Return an MD5 hash string covering the textual content (title and
    # body) of this web page.
    def hash
      md5 = Digest::MD5.new
      md5 << title.to_s
      md5 << content.to_s
      md5.to_s
    end

    # Convert html entities in +text+ to unicode characters.
    def decode_entities(text)
      HTML_ENTITIES_DECODER.decode(text)
    end

    private

    # Fetch a constant via the instance's own class, so sub-class
    # overrides of the RE/ICONV constants are honoured.
    def class_const(sym)
      self.class.const_get(sym)
    end

    # Convert the encoding of +text+ when an ICONV converter is
    # defined, otherwise pass it through untouched.
    def iconv(text)
      converter = class_const(:ICONV)
      converter ? converter.iconv(text) : text
    end

    # Hook: split @content into an array of paragraphs. Entity decoding
    # and blank-paragraph removal run after this method.
    def content_processor
      @content = @content.split(/<p>/)
    end

    # Hook: parse @date into a DateTime object.
    def date_processor
    end

    # Hook: clean up @title as necessary. Encoding conversion and
    # entity decoding run after this method.
    def title_processor
    end

  end

end
module WebPageParser
  require 'oniguruma'

  # ParserFactory is the base class for factories that know which
  # parser can handle a given page. Defining a sub-class automatically
  # registers it (see inherited), and ParserFactory.parser_for then
  # asks each registered factory class in turn for a suitable parser.
  class ParserFactory
    include Oniguruma

    # Return true if the Parser can handle the given page. The options
    # hash must have a :url key. NOTE: parser_for invokes this on the
    # factory *class*, so sub-classes implement it as a class method
    # (def self.can_parse?); this instance stub only documents the
    # expected signature.
    def can_parse?(options = {})
      false
    end

    # Allocate a new parser. The options hash is passed to the new
    # method of the parser class. Like can_parse?, sub-classes
    # implement this as a class method.
    def create(options = {})
      nil
    end

    # Registered factory classes, shared across the inheritance tree.
    @@factories = []

    # Register a factory class, ignoring duplicates.
    def self.add_factory(f)
      @@factories << f unless @@factories.include? f
    end

    # All currently registered factory classes.
    def self.factories
      @@factories
    end

    # Return a PageParser that can parse the given page, or nil when no
    # registered factory volunteers. The options hash must have a :url
    # key.
    def self.parser_for(options = {})
      @@factories.each do |factory|
        return factory.create(options) if factory.can_parse?(options)
      end
      nil
    end

    # Load all the parser plugins (files ending in page_parser.rb) in
    # the given directory. Dir.foreach is used instead of
    # Dir.open(...).each so no directory handle is left open.
    def self.load(dirname)
      Dir.foreach(dirname) do |fn|
        next unless fn =~ /page_parser\.rb$/
        require File.join(dirname, fn)
      end
    end

    # Keep track of any newly defined factories as soon as they are
    # sub-classed.
    def self.inherited(factory)
      self.add_factory(factory)
    end

  end

  ParserFactory.load(File.join(File.dirname(__FILE__), 'parsers'))

end
# -*- coding: utf-8 -*-
module WebPageParser

  # Factory for BBC News story pages: matches story urls (numeric id
  # ending in .stm) and rejects picture galleries and pop-ups.
  class BbcNewsPageParserFactory < WebPageParser::ParserFactory
    # Single-quoted so the \. escapes survive into the pattern. The
    # original double-quoted string turned each "\." into a bare ".",
    # making the dots match any character at all.
    URL_RE = ORegexp.new('news\.bbc\.co\.uk/.*/[0-9]+\.stm')
    INVALID_URL_RE = ORegexp.new("in_pictures|pop_ups")

    # MatchData when the url looks like a parseable story, nil (falsy)
    # otherwise.
    def self.can_parse?(options)
      if INVALID_URL_RE.match(options[:url])
        nil
      else
        URL_RE.match(options[:url])
      end
    end

    # Always hands back the current parser version (V2).
    def self.create(options = {})
      BbcNewsPageParserV2.new(options)
    end
  end

  # BbcNewsPageParserV1 parses BBC News web pages exactly like the
  # old News Sniffer BbcNewsPage class did. This should only ever
  # be used for backwards compatability with News Sniffer and is
  # never supplied for use by a factory.
  class BbcNewsPageParserV1 < WebPageParser::BaseParser

    TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
    DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
    CONTENT_RE = ORegexp.new('S (?:SF) -->(.*?)<!-- E BO', 'm')
    STRIP_TAGS_RE = ORegexp.new('</?(div|img|tr|td|!--|table)[^>]*>','i')
    # NOTE(review): '\t|' alternates with the empty pattern, so this
    # effectively only strips tabs. Looks like a typo, but it is kept
    # byte-identical for News Sniffer backwards compatibility.
    WHITESPACE_RE = ORegexp.new('\t|')
    PARA_RE = Regexp.new(/<p>/i)

    # Old News Sniffer only hashed the content, not the title
    def hash
      Digest::MD5.hexdigest(content.to_s)
    end

    private

    # Parse the OriginalPublicationDate text into a DateTime, falling
    # back to the current utc time when it cannot be parsed.
    def date_processor
      begin
        # OPD is in GMT/UTC, which DateTime seems to use by default
        @date = DateTime.parse(@date)
      rescue ArgumentError
        @date = Time.now.utc
      end
    end

    # Strip layout tags/whitespace, decode entities, then split on <p>.
    def content_processor
      @content = STRIP_TAGS_RE.gsub(@content, '')
      @content = WHITESPACE_RE.gsub(@content, '')
      @content = decode_entities(@content)
      @content = @content.split(PARA_RE)
    end

  end

  # BbcNewsPageParserV2 parses BBC News web pages
  class BbcNewsPageParserV2 < WebPageParser::BaseParser

    TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
    DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
    CONTENT_RE = ORegexp.new('S BO -->(.*?)<!-- E BO', 'm')
    STRIP_BLOCKS_RE = ORegexp.new('<(table|noscript|script|object|form)[^>]*>.*?</\1>', 'i')
    STRIP_TAGS_RE = ORegexp.new('</?(b|div|img|tr|td|br|font|span)[^>]*>','i')
    STRIP_COMMENTS_RE = ORegexp.new('<!--.*?-->')
    STRIP_CAPTIONS_RE = ORegexp.new('<!-- caption .+<!-- END - caption -->')
    WHITESPACE_RE = ORegexp.new('[\t ]+')
    PARA_RE = Regexp.new('</?p[^>]*>')

    private

    # Strip captions/comments/blocks/tags, collapse runs of whitespace
    # to a single space, then split on opening or closing <p> tags.
    def content_processor
      @content = STRIP_CAPTIONS_RE.gsub(@content, '')
      @content = STRIP_COMMENTS_RE.gsub(@content, '')
      @content = STRIP_BLOCKS_RE.gsub(@content, '')
      @content = STRIP_TAGS_RE.gsub(@content, '')
      @content = WHITESPACE_RE.gsub(@content, ' ')
      @content = @content.split(PARA_RE)
    end

    # Parse the OriginalPublicationDate text into a DateTime, falling
    # back to the current utc time when it cannot be parsed.
    def date_processor
      begin
        # OPD is in GMT/UTC, which DateTime seems to use by default
        @date = DateTime.parse(@date)
      rescue ArgumentError
        @date = Time.now.utc
      end
    end

  end
end
# Minimal factory used by the specs: volunteers for any
# www.example.com url and hands back a bare TestPageParser.
class TestPageParserFactory < WebPageParser::ParserFactory
  @url_re = Regexp.new("www.example.com")

  def self.can_parse?(options = {})
    @url_re.match(options[:url])
  end

  def self.create(options = {})
    TestPageParser.new(options)
  end
end

# Parser stub with no behaviour of its own beyond BaseParser.
class TestPageParser < WebPageParser::BaseParser

end
# Pull in the core library. base_parser must load first: the parser
# plugins required by parser_factory sub-class BaseParser.
require 'web-page-parser/base_parser.rb'
require 'web-page-parser/parser_factory.rb'
# -*- coding: utf-8 -*-
$:.unshift File.join(File.dirname(__FILE__), '../lib')
require 'web-page-parser'

# Shared examples run against every page parser; the including example
# group supplies @valid_options.
share_as :AllPageParsers do
  it "is initialized with a hash containing :url and :page keys" do
    parser = WebPageParser::BaseParser.new(@valid_options)
    parser.url.should == @valid_options[:url]
    parser.page.should == @valid_options[:page]
  end

  it "should return an empty array when there is no content available" do
    content = WebPageParser::BaseParser.new.content
    content.should be_a_kind_of Array
    content.empty?.should be_true
  end

  # The hash must cover title and content but ignore the date.
  context "when hashing the content" do
    before :each do
      @parser = WebPageParser::BaseParser.new(@valid_options)
      @hash = @parser.hash
    end

    it "calculates a hash using the title" do
      @parser.instance_eval("@title='different'")
      @parser.hash.should_not == @hash
    end

    it "does not calculates a hash using the date" do
      @parser.instance_eval("@date=Time.now")
      @parser.hash.should == @hash
    end

    it "calculates a hash using the content" do
      @parser.instance_eval("@content='different'")
      @parser.hash.should_not == @hash
    end
  end
end

describe WebPageParser::BaseParser do
  it_should_behave_like AllPageParsers

  before :each do
    @valid_options = {
      :url => 'http://news.bbc.co.uk',
      :page => '<html></html>',
      :valid_hash => 'cfcd208495d565ef66e7dff9f98764da'
    }
  end

  it "should decode basic html entities" do
    parser = WebPageParser::BaseParser.new
    entities = {
      '&quot;' => '"',
      '&apos;' => "'",
      '&amp;' => "&",
      '&pound;' => '£',
      '&aacute;' => 'á'
    }
    entities.each do |entity, expected|
      parser.decode_entities(entity).should == expected
    end
  end

end