web-page-parser 0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,22 @@
+ The MIT License
+
+ Copyright (c) 2009 John Leach <john@johnleach.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
data/README.rdoc ADDED
@@ -0,0 +1,31 @@
+ = Web Page Parser
+
+ Web Page Parser is a Ruby library to parse the content out of web
+ pages, such as BBC News pages. It strips all non-textual stuff out,
+ leaving the title, publication date and an array of paragraphs. It
+ currently only supports BBC News pages but new parsers are planned and
+ can be added easily.
+
+ It is used by the {News Sniffer}[http://www.newssniffer.co.uk] project,
+ which parses and archives news articles to keep track of how they change.
+
+ == Example usage
+
+   require 'web-page-parser'
+   require 'open-uri'
+
+   url = "http://news.bbc.co.uk/1/hi/uk/8041972.stm"
+   page_data = open(url).read
+
+   page = WebPageParser::ParserFactory.parser_for(:url => url, :page => page_data)
+
+   puts page.title         # MPs hit back over expenses claims
+   puts page.date          # 2009-05-09T18:58:59+00:00
+   puts page.content.first # The wife of author Ken Follett and ...
+
+ == More Info
+
+ Web Page Parser was written by {John Leach}[http://johnleach.co.uk].
+
+ The code is available on {github}[http://github.com/johnl/web-page-parser/tree/master].
+
data/lib/web-page-parser/base_parser.rb ADDED
@@ -0,0 +1,149 @@
+
+ module WebPageParser
+   require 'digest'
+   require 'date'
+   require 'oniguruma'
+   require 'htmlentities'
+   require 'iconv'
+
+   # BaseParser is designed to be sub-classed to write new parsers. It
+   # provides some basic help but most of the work needs to be done by
+   # the sub-class.
+   #
+   # Simple pages could be implemented by just defining new regular
+   # expression constants, but more advanced parsing can be achieved
+   # with the *_processor methods.
+   #
+   class BaseParser
+     include Oniguruma
+
+     attr_reader :url, :guid, :page
+
+     ICONV = Iconv.new("utf8", "iso-8859-1")
+
+     # The regular expression to extract the title
+     TITLE_RE = //
+
+     # The regular expression to extract the date
+     DATE_RE = //
+
+     # The regular expression to extract the content
+     CONTENT_RE = //
+
+     # The regular expression to find all characters that should be
+     # removed from any content.
+     KILL_CHARS_RE = ORegexp.new('[\n\r]+')
+
+     # The object used to turn HTML entities into real characters
+     HTML_ENTITIES_DECODER = HTMLEntities.new
+
+     # Takes a hash of options. The :url option passes the page url,
+     # and the :page option passes the raw html page content for parsing
+     def initialize(options = { })
+       @url = options[:url]
+       @page = options[:page]
+     end
+
+     # The title method returns the title of the web page.
+     #
+     # It does the basic extraction using the TITLE_RE regular
+     # expression and handles text encoding. More advanced parsing can
+     # be done by overriding the title_processor method.
+     def title
+       return @title if @title
+       if matches = class_const(:TITLE_RE).match(page)
+         @title = matches[1].to_s.strip
+         title_processor
+         @title = iconv(@title)
+         @title = decode_entities(@title)
+       end
+     end
+
+     # The date method returns the timestamp of the web page, as a
+     # DateTime object.
+     #
+     # It does the basic extraction using the DATE_RE regular
+     # expression but the work of converting the text into a DateTime
+     # object needs to be done by the date_processor method.
+     def date
+       return @date if @date
+       if matches = class_const(:DATE_RE).match(page)
+         @date = matches[1].to_s.strip
+         date_processor
+         @date
+       end
+     end
+
+     # The content method returns the important body text of the web page.
+     #
+     # It does basic extraction and pre-processing of the page content
+     # and then calls the content_processor method for any other more
+     # custom processing work that needs doing. Lastly, it does some
+     # basic post processing and returns the content as an array of
+     # paragraph strings.
+     #
+     # When writing a new parser, the CONTENT_RE constant should be
+     # defined in the subclass. The KILL_CHARS_RE constant can be
+     # overridden if necessary.
+     def content
+       return @content if @content
+       matches = class_const(:CONTENT_RE).match(page)
+       if matches
+         @content = class_const(:KILL_CHARS_RE).gsub(matches[1].to_s, '')
+         @content = iconv(@content)
+         content_processor
+         @content.collect! { |p| decode_entities(p.strip) }
+         @content.delete_if { |p| p == '' or p.nil? }
+       end
+       @content = [] if @content.nil?
+       @content
+     end
+
+     # Return a hash representing the textual content of this web page
+     def hash
+       digest = Digest::MD5.new
+       digest << title.to_s
+       digest << content.to_s
+       digest.to_s
+     end
+
+     # Convert html entities to unicode
+     def decode_entities(s)
+       HTML_ENTITIES_DECODER.decode(s)
+     end
+
+     private
+
+     # Get the named constant from this object's class
+     def class_const(sym)
+       self.class.const_get(sym)
+     end
+
+     # Convert the encoding of the given text if necessary
+     def iconv(s)
+       if class_const(:ICONV)
+         class_const(:ICONV).iconv(s)
+       else
+         s
+       end
+     end
+
+     # Custom content parsing. It should split the @content up into an
+     # array of paragraphs. Conversion to utf8 is done before this
+     # method is called.
+     def content_processor
+       @content = @content.split(/<p>/)
+     end
+
+     # Custom date parsing. It should parse @date into a DateTime object
+     def date_processor
+     end
+
+     # Custom title parsing. It should clean up @title as
+     # necessary. Conversion to utf8 is done after this method.
+     def title_processor
+     end
+
+   end
+
+ end
+
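
The comments above describe the intended extension pattern: a simple parser just defines TITLE_RE, DATE_RE and CONTENT_RE, and overrides the *_processor hooks only when it needs custom behaviour. As a rough sketch of what that looks like in practice (ExamplePageParser, its regular expressions and the markup they assume are illustrative inventions, not code shipped in this gem):

  # A minimal hypothetical BaseParser subclass. The class name and the
  # markup the regexes expect are assumptions for illustration only.
  class ExamplePageParser < WebPageParser::BaseParser
    TITLE_RE   = ORegexp.new('<h1 class="headline">(.*?)</h1>', 'i')
    DATE_RE    = ORegexp.new('<span class="published">(.*?)</span>', 'i')
    CONTENT_RE = ORegexp.new('<!-- article start -->(.*?)<!-- article end -->', 'm')

    private

    # DATE_RE only captures text; turning it into a DateTime is the
    # subclass's job, as the date method's comment notes above.
    def date_processor
      @date = DateTime.parse(@date)
    end
  end

The inherited content_processor then splits the captured block on <p> tags, and BaseParser takes care of iconv conversion, entity decoding and dropping blank paragraphs.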
data/lib/web-page-parser/parser_factory.rb ADDED
@@ -0,0 +1,54 @@
+ module WebPageParser
+   require 'oniguruma'
+   class ParserFactory
+     include Oniguruma
+
+     # Return true if the Parser can handle the given page. options
+     # hash must have a :url key
+     def can_parse?(options = {})
+       false
+     end
+
+     # Allocate a new parser. options hash is passed to the new method
+     # of the parser class.
+     def create(options = {})
+       nil
+     end
+
+     @@factories = []
+
+     def self.add_factory(f)
+       @@factories << f unless @@factories.include? f
+     end
+
+     def self.factories
+       @@factories
+     end
+
+     # Return a PageParser that can parse the given page. options hash
+     # must have a :url key
+     def self.parser_for(options = {})
+       @@factories.each do |factory|
+         return factory.create(options) if factory.can_parse?(options)
+       end
+       nil
+     end
+
+     # Load all the parser plugins in the given directory
+     def self.load(dirname)
+       Dir.open(dirname).each do |fn|
+         next unless fn =~ /page_parser\.rb$/
+         require File.join(dirname, fn)
+       end
+     end
+
+     # Keep track of any newly defined factories
+     def self.inherited(factory)
+       self.add_factory(factory)
+     end
+
+   end
+
+   ParserFactory.load(File.join(File.dirname(__FILE__), 'parsers'))
+
+ end
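
Two details above are easy to miss: the inherited hook means a factory registers itself simply by subclassing ParserFactory, and parser_for calls can_parse? and create on the factory class itself, so subclasses supply them as class methods. A hypothetical sketch (ExamplePageParserFactory and the example.org URL pattern are invented for illustration):

  class ExamplePageParserFactory < WebPageParser::ParserFactory
    URL_RE = ORegexp.new('www\.example\.org/stories/')

    # can_parse? need only return something truthy; here a MatchData
    # or nil, the same convention the BBC factory below uses.
    def self.can_parse?(options = {})
      URL_RE.match(options[:url])
    end

    def self.create(options = {})
      ExamplePageParser.new(options)
    end
  end

  # Subclassing alone was enough to register it:
  WebPageParser::ParserFactory.factories.include?(ExamplePageParserFactory) # => true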
data/lib/web-page-parser/parsers/bbc_news_page_parser.rb ADDED
@@ -0,0 +1,93 @@
+ # -*- coding: utf-8 -*-
+ module WebPageParser
+
+   class BbcNewsPageParserFactory < WebPageParser::ParserFactory
+     URL_RE = ORegexp.new('news\.bbc\.co\.uk/.*/[0-9]+\.stm')
+     INVALID_URL_RE = ORegexp.new('in_pictures|pop_ups')
+
+     def self.can_parse?(options)
+       if INVALID_URL_RE.match(options[:url])
+         nil
+       else
+         URL_RE.match(options[:url])
+       end
+     end
+
+     def self.create(options = {})
+       BbcNewsPageParserV2.new(options)
+     end
+   end
+
+   # BbcNewsPageParserV1 parses BBC News web pages exactly like the
+   # old News Sniffer BbcNewsPage class did. This should only ever
+   # be used for backwards compatibility with News Sniffer and is
+   # never supplied for use by a factory.
+   class BbcNewsPageParserV1 < WebPageParser::BaseParser
+
+     TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
+     DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
+     CONTENT_RE = ORegexp.new('S (?:SF) -->(.*?)<!-- E BO', 'm')
+     STRIP_TAGS_RE = ORegexp.new('</?(div|img|tr|td|!--|table)[^>]*>', 'i')
+     WHITESPACE_RE = ORegexp.new('\t|')
+     PARA_RE = Regexp.new(/<p>/i)
+
+     def hash
+       # Old News Sniffer only hashed the content, not the title
+       Digest::MD5.hexdigest(content.to_s)
+     end
+
+     private
+
+     def date_processor
+       begin
+         # OPD is in GMT/UTC, which DateTime seems to use by default
+         @date = DateTime.parse(@date)
+       rescue ArgumentError
+         @date = Time.now.utc
+       end
+     end
+
+     def content_processor
+       @content = STRIP_TAGS_RE.gsub(@content, '')
+       @content = WHITESPACE_RE.gsub(@content, '')
+       @content = decode_entities(@content)
+       @content = @content.split(PARA_RE)
+     end
+
+   end
+
+   # BbcNewsPageParserV2 parses BBC News web pages
+   class BbcNewsPageParserV2 < WebPageParser::BaseParser
+
+     TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
+     DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
+     CONTENT_RE = ORegexp.new('S BO -->(.*?)<!-- E BO', 'm')
+     STRIP_BLOCKS_RE = ORegexp.new('<(table|noscript|script|object|form)[^>]*>.*?</\1>', 'i')
+     STRIP_TAGS_RE = ORegexp.new('</?(b|div|img|tr|td|br|font|span)[^>]*>', 'i')
+     STRIP_COMMENTS_RE = ORegexp.new('<!--.*?-->')
+     STRIP_CAPTIONS_RE = ORegexp.new('<!-- caption .+<!-- END - caption -->')
+     WHITESPACE_RE = ORegexp.new('[\t ]+')
+     PARA_RE = Regexp.new('</?p[^>]*>')
+
+     private
+
+     def content_processor
+       @content = STRIP_CAPTIONS_RE.gsub(@content, '')
+       @content = STRIP_COMMENTS_RE.gsub(@content, '')
+       @content = STRIP_BLOCKS_RE.gsub(@content, '')
+       @content = STRIP_TAGS_RE.gsub(@content, '')
+       @content = WHITESPACE_RE.gsub(@content, ' ')
+       @content = @content.split(PARA_RE)
+     end
+
+     def date_processor
+       begin
+         # OPD is in GMT/UTC, which DateTime seems to use by default
+         @date = DateTime.parse(@date)
+       rescue ArgumentError
+         @date = Time.now.utc
+       end
+     end
+
+   end
+ end
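
Tying the factory and its parsers together, dispatch for this file works roughly as below. The story URL is the one from the README; the in_pictures URL is an invented counter-example:

  factory = WebPageParser::BbcNewsPageParserFactory
  factory.can_parse?(:url => 'http://news.bbc.co.uk/1/hi/uk/8041972.stm')
  # => MatchData, so ParserFactory.parser_for would pick this factory
  factory.can_parse?(:url => 'http://news.bbc.co.uk/1/hi/in_pictures/8000000.stm')
  # => nil, rejected by INVALID_URL_RE
  factory.create(:url => 'http://news.bbc.co.uk/1/hi/uk/8041972.stm', :page => '<html></html>').class
  # => WebPageParser::BbcNewsPageParserV2 (V1 is never created by the factory)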
@@ -0,0 +1,15 @@
+ class TestPageParserFactory < WebPageParser::ParserFactory
+   @url_regexp = Regexp.new("www.example.com")
+
+   def self.can_parse?(options = {})
+     @url_regexp.match(options[:url])
+   end
+
+   def self.create(options = {})
+     TestPageParser.new(options)
+   end
+ end
+
+ class TestPageParser < WebPageParser::BaseParser
+
+ end
data/lib/web-page-parser.rb ADDED
@@ -0,0 +1,4 @@
+ # $:.unshift File.join(File.dirname(__FILE__), 'web-page-parser')
+
+ require 'web-page-parser/base_parser.rb'
+ require 'web-page-parser/parser_factory.rb'
@@ -0,0 +1,67 @@
+ # -*- coding: utf-8 -*-
+ $:.unshift File.join(File.dirname(__FILE__), '../lib')
+ require 'web-page-parser'
+
+ share_as :AllPageParsers do
+   it "is initialized with a hash containing :url and :page keys" do
+     wpp = WebPageParser::BaseParser.new(@valid_options)
+     wpp.url.should == @valid_options[:url]
+     wpp.page.should == @valid_options[:page]
+   end
+
+   it "should return an empty array when there is no content available" do
+     content = WebPageParser::BaseParser.new.content
+     content.should be_a_kind_of Array
+     content.empty?.should be_true
+   end
+
+   context "when hashing the content" do
+     before :each do
+       @wpp = WebPageParser::BaseParser.new(@valid_options)
+       @hash = @wpp.hash
+     end
+
+     it "calculates a hash using the title" do
+       @wpp.instance_eval("@title='different'")
+       @wpp.hash.should_not == @hash
+     end
+
+     it "does not calculate a hash using the date" do
+       @wpp.instance_eval("@date=Time.now")
+       @wpp.hash.should == @hash
+     end
+
+     it "calculates a hash using the content" do
+       @wpp.instance_eval("@content='different'")
+       @wpp.hash.should_not == @hash
+     end
+   end
+ end
+
+ describe WebPageParser::BaseParser do
+   it_should_behave_like AllPageParsers
+
+   before :each do
+     @valid_options = {
+       :url => 'http://news.bbc.co.uk',
+       :page => '<html></html>',
+       :valid_hash => 'cfcd208495d565ef66e7dff9f98764da'
+     }
+   end
+
+   it "should decode basic html entities" do
+     bp = WebPageParser::BaseParser.new
+     entities = {
+       '&quot;' => '"',
+       '&apos;' => "'",
+       '&amp;' => "&",
+       '&pound;' => '£',
+       '&aacute;' => 'á'
+     }
+     entities.each do |e, v|
+       bp.decode_entities(e).should == v
+     end
+   end
+
+ end
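
Since AllPageParsers is a shared example group, a spec for a future parser could opt into the same checks. A sketch, assuming the hypothetical ExamplePageParser from the earlier notes and an invented fixture path:

  describe ExamplePageParser do
    it_should_behave_like AllPageParsers

    before :each do
      @valid_options = {
        :url  => 'http://www.example.org/stories/1',
        :page => File.read(File.join(File.dirname(__FILE__), 'fixtures', 'example_story.html'))
      }
    end
  end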