web-page-parser 0.10
- data/LICENSE +22 -0
- data/README.rdoc +31 -0
- data/lib/web-page-parser/base_parser.rb +149 -0
- data/lib/web-page-parser/parser_factory.rb +54 -0
- data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +93 -0
- data/lib/web-page-parser/parsers/test_page_parser.rb +15 -0
- data/lib/web-page-parser.rb +4 -0
- data/spec/base_parser_spec.rb +67 -0
- data/spec/fixtures/bbc_news/6072486.stm.html +1318 -0
- data/spec/fixtures/bbc_news/7745137.stm.html +2177 -0
- data/spec/fixtures/bbc_news/8011268.stm.html +2899 -0
- data/spec/fixtures/bbc_news/8029015.stm.html +2417 -0
- data/spec/fixtures/bbc_news/8063681.stm.html +2382 -0
- data/spec/parser_factory_spec.rb +18 -0
- data/spec/parsers/bbc_news_page_spec.rb +144 -0
- data/spec/spec.opts +4 -0
- metadata +92 -0
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
+The MIT License
+
+Copyright (c) 2009 John Leach <john@johnleach.co.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
data/README.rdoc
ADDED
@@ -0,0 +1,31 @@
+= Web Page Parser
+
+Web Page Parser is a Ruby library to parse the content out of web
+pages, such as BBC News pages. It strips all non-textual stuff out,
+leaving the title, publication date and an array of paragraphs. It
+currently only supports BBC News pages but new parsers are planned and
+can be added easily.
+
+It is used by the {News Sniffer}[http://www.newssniffer.co.uk] project, which parses and archives news
+articles to keep track of how they change.
+
+== Example usage
+
+  require 'web-page-parser'
+  require 'open-uri'
+
+  url = "http://news.bbc.co.uk/1/hi/uk/8041972.stm"
+  page_data = open(url).read
+
+  page = WebPageParser::ParserFactory.parser_for(:url => url, :page => page_data)
+
+  puts page.title         # MPs hit back over expenses claims
+  puts page.date          # 2009-05-09T18:58:59+00:00
+  puts page.content.first # The wife of author Ken Follett and ...
+
+== More Info
+
+Web Page Parser was written by {John Leach}[http://johnleach.co.uk].
+
+The code is available on {github}[http://github.com/johnl/web-page-parser/tree/master].
+
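The README says new parsers can be added easily; the pattern is a ParserFactory
subclass that recognises the site's URLs, paired with a BaseParser subclass that
defines the extraction regexps. A minimal sketch, assuming a hypothetical
example.com article layout (the class names and regular expressions below are
illustrative only, not part of the gem):

  module WebPageParser
    class ExamplePageParserFactory < ParserFactory
      # Hypothetical URL pattern, for illustration only
      URL_RE = ORegexp.new('www\.example\.com/articles/')

      def self.can_parse?(options = {})
        URL_RE.match(options[:url])
      end

      def self.create(options = {})
        ExamplePageParser.new(options)
      end
    end

    class ExamplePageParser < BaseParser
      # Assumed markup: an <h1> title, a date <span>, and a body <div>;
      # override date_processor to turn @date into a DateTime if needed
      TITLE_RE   = ORegexp.new('<h1>(.*?)</h1>', 'm')
      DATE_RE    = ORegexp.new('<span class="date">(.*?)</span>', 'm')
      CONTENT_RE = ORegexp.new('<div id="body">(.*?)</div>', 'm')
    end
  end

Defining the factory class is all the registration needed, because
ParserFactory.inherited (see parser_factory.rb below) records every subclass.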
data/lib/web-page-parser/base_parser.rb
ADDED
@@ -0,0 +1,149 @@
+
+module WebPageParser
+  require 'digest'
+  require 'date'
+  require 'oniguruma'
+  require 'htmlentities'
+  require 'iconv'
+
+  # BaseParser is designed to be sub-classed to write new parsers. It
+  # provides some basic help but most of the work needs to be done by
+  # the sub-class.
+  #
+  # Simple pages could be implemented by just defining new regular
+  # expression constants, but more advanced parsing can be achieved
+  # with the *_processor methods.
+  #
+  class BaseParser
+    include Oniguruma
+
+    attr_reader :url, :guid, :page
+
+    ICONV = Iconv.new("utf8", "iso-8859-1")
+
+    # The regular expression to extract the title
+    TITLE_RE = //
+
+    # The regular expression to extract the date
+    DATE_RE = //
+
+    # The regular expression to extract the content
+    CONTENT_RE = //
+
+    # The regular expression to find all characters that should be
+    # removed from any content.
+    KILL_CHARS_RE = ORegexp.new('[\n\r]+')
+
+    # The object used to turn HTML entities into real characters
+    HTML_ENTITIES_DECODER = HTMLEntities.new
+
+    # Takes a hash of options. The :url option passes the page url, and
+    # the :page option passes the raw html page content for parsing
+    def initialize(options = { })
+      @url = options[:url]
+      @page = options[:page]
+    end
+
+    # The title method returns the title of the web page.
+    #
+    # It does the basic extraction using the TITLE_RE regular
+    # expression and handles text encoding. More advanced parsing can
+    # be done by overriding the title_processor method.
+    def title
+      return @title if @title
+      if matches = class_const(:TITLE_RE).match(page)
+        @title = matches[1].to_s.strip
+        title_processor
+        @title = iconv(@title)
+        @title = decode_entities(@title)
+      end
+    end
+
+    # The date method returns the timestamp of the web page, as a
+    # DateTime object.
+    #
+    # It does the basic extraction using the DATE_RE regular
+    # expression but the work of converting the text into a DateTime
+    # object needs to be done by the date_processor method.
+    def date
+      return @date if @date
+      if matches = class_const(:DATE_RE).match(page)
+        @date = matches[1].to_s.strip
+        date_processor
+        @date
+      end
+    end
+
+    # The content method returns the important body text of the web page.
+    #
+    # It does basic extraction and pre-processing of the page content
+    # and then calls the content_processor method for any other more
+    # custom processing work that needs doing. Lastly, it does some
+    # basic post processing and returns the content as an array of
+    # paragraphs.
+    #
+    # When writing a new parser, the CONTENT_RE constant should be
+    # defined in the subclass. The KILL_CHARS_RE constant can be
+    # overridden if necessary.
+    def content
+      return @content if @content
+      matches = class_const(:CONTENT_RE).match(page)
+      if matches
+        @content = class_const(:KILL_CHARS_RE).gsub(matches[1].to_s, '')
+        @content = iconv(@content)
+        content_processor
+        @content.collect! { |p| decode_entities(p.strip) }
+        @content.delete_if { |p| p == '' or p.nil? }
+      end
+      @content = [] if @content.nil?
+      @content
+    end
+
+    # Return a hash representing the textual content of this web page
+    def hash
+      digest = Digest::MD5.new
+      digest << title.to_s
+      digest << content.to_s
+      digest.to_s
+    end
+
+    # Convert html entities to unicode
+    def decode_entities(s)
+      HTML_ENTITIES_DECODER.decode(s)
+    end
+
+    private
+
+    # Get the constant from this object's class
+    def class_const(sym)
+      self.class.const_get(sym)
+    end
+
+    # Convert the encoding of the given text if necessary
+    def iconv(s)
+      if class_const(:ICONV)
+        class_const(:ICONV).iconv(s)
+      else
+        s
+      end
+    end
+
+    # Custom content parsing. It should split the @content up into an
+    # array of paragraphs. Conversion to utf8 is done after this method.
+    def content_processor
+      @content = @content.split(/<p>/)
+    end
+
+    # Custom date parsing. It should parse @date into a DateTime object
+    def date_processor
+    end
+
+    # Custom title parsing. It should clean up @title as
+    # necessary. Conversion to utf8 is done after this method.
+    def title_processor
+    end
+
+  end
+
+end
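A note on the hash method above: it digests the title and the content but not
the date, so two versions of a page hash equal unless the visible text changed.
A sketch of News Sniffer-style change detection, assuming old_html and new_html
hold two fetches of the same article (both variables are hypothetical):

  old_version = WebPageParser::BbcNewsPageParserV2.new(:url => url, :page => old_html)
  new_version = WebPageParser::BbcNewsPageParserV2.new(:url => url, :page => new_html)
  # Equal hashes mean the headline and body text are unchanged
  puts "article revised" if old_version.hash != new_version.hash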
data/lib/web-page-parser/parser_factory.rb
ADDED
@@ -0,0 +1,54 @@
+module WebPageParser
+  require 'oniguruma'
+  class ParserFactory
+    include Oniguruma
+
+    # Return true if the Parser can handle the given page. options
+    # hash must have a :url key
+    def can_parse?(options = {})
+      false
+    end
+
+    # Allocate a new parser. options hash is passed to new method of
+    # parser class.
+    def create(options = {})
+      nil
+    end
+
+    @@factories = []
+
+    def self.add_factory(f)
+      @@factories << f unless @@factories.include? f
+    end
+
+    def self.factories
+      @@factories
+    end
+
+    # Return a PageParser that can parse the given page. options hash
+    # must have a :url key
+    def self.parser_for(options = {})
+      @@factories.each do |factory|
+        return factory.create(options) if factory.can_parse?(options)
+      end
+      nil
+    end
+
+    # Load all the plugins in the given directory
+    def self.load(dirname)
+      Dir.open(dirname).each do |fn|
+        next unless fn =~ /page_parser\.rb$/
+        require File.join(dirname, fn)
+      end
+    end
+
+    # Keep track of any newly defined factories
+    def self.inherited(factory)
+      self.add_factory(factory)
+    end
+
+  end
+
+  ParserFactory.load(File.join(File.dirname(__FILE__), 'parsers'))
+
+end
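Because ParserFactory.inherited pushes every subclass onto @@factories, merely
defining a factory class makes parser_for consider it; there is no explicit
registration call. A short sketch (MyParserFactory is hypothetical):

  class MyParserFactory < WebPageParser::ParserFactory
    def self.can_parse?(options = {})
      options[:url].to_s.include?('example.org')
    end

    def self.create(options = {})
      WebPageParser::BaseParser.new(options)
    end
  end

  # The inherited hook has already registered the new factory
  WebPageParser::ParserFactory.factories.include?(MyParserFactory) # => true
  WebPageParser::ParserFactory.parser_for(:url => 'http://example.org/news/1')
  # => a WebPageParser::BaseParser instance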
data/lib/web-page-parser/parsers/bbc_news_page_parser.rb
ADDED
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+module WebPageParser
+
+  class BbcNewsPageParserFactory < WebPageParser::ParserFactory
+    URL_RE = ORegexp.new('news\.bbc\.co\.uk/.*/[0-9]+\.stm')
+    INVALID_URL_RE = ORegexp.new('in_pictures|pop_ups')
+
+    def self.can_parse?(options)
+      if INVALID_URL_RE.match(options[:url])
+        nil
+      else
+        URL_RE.match(options[:url])
+      end
+    end
+
+    def self.create(options = {})
+      BbcNewsPageParserV2.new(options)
+    end
+  end
+
+  # BbcNewsPageParserV1 parses BBC News web pages exactly like the
+  # old News Sniffer BbcNewsPage class did. This should only ever
+  # be used for backwards compatibility with News Sniffer and is
+  # never supplied for use by a factory.
+  class BbcNewsPageParserV1 < WebPageParser::BaseParser
+
+    TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
+    DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
+    CONTENT_RE = ORegexp.new('S (?:SF) -->(.*?)<!-- E BO', 'm')
+    STRIP_TAGS_RE = ORegexp.new('</?(div|img|tr|td|!--|table)[^>]*>', 'i')
+    WHITESPACE_RE = ORegexp.new('\t|&nbsp;')
+    PARA_RE = Regexp.new(/<p>/i)
+
+    def hash
+      # Old News Sniffer only hashed the content, not the title
+      Digest::MD5.hexdigest(content.to_s)
+    end
+
+    private
+
+    def date_processor
+      begin
+        # OPD is in GMT/UTC, which DateTime seems to use by default
+        @date = DateTime.parse(@date)
+      rescue ArgumentError
+        @date = Time.now.utc
+      end
+    end
+
+    def content_processor
+      @content = STRIP_TAGS_RE.gsub(@content, '')
+      @content = WHITESPACE_RE.gsub(@content, '')
+      @content = decode_entities(@content)
+      @content = @content.split(PARA_RE)
+    end
+
+  end
+
+  # BbcNewsPageParserV2 parses BBC News web pages
+  class BbcNewsPageParserV2 < WebPageParser::BaseParser
+
+    TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
+    DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
+    CONTENT_RE = ORegexp.new('S BO -->(.*?)<!-- E BO', 'm')
+    STRIP_BLOCKS_RE = ORegexp.new('<(table|noscript|script|object|form)[^>]*>.*?</\1>', 'i')
+    STRIP_TAGS_RE = ORegexp.new('</?(b|div|img|tr|td|br|font|span)[^>]*>', 'i')
+    STRIP_COMMENTS_RE = ORegexp.new('<!--.*?-->')
+    STRIP_CAPTIONS_RE = ORegexp.new('<!-- caption .+<!-- END - caption -->')
+    WHITESPACE_RE = ORegexp.new('[\t ]+')
+    PARA_RE = Regexp.new('</?p[^>]*>')
+
+    private
+
+    def content_processor
+      @content = STRIP_CAPTIONS_RE.gsub(@content, '')
+      @content = STRIP_COMMENTS_RE.gsub(@content, '')
+      @content = STRIP_BLOCKS_RE.gsub(@content, '')
+      @content = STRIP_TAGS_RE.gsub(@content, '')
+      @content = WHITESPACE_RE.gsub(@content, ' ')
+      @content = @content.split(PARA_RE)
+    end
+
+    def date_processor
+      begin
+        # OPD is in GMT/UTC, which DateTime seems to use by default
+        @date = DateTime.parse(@date)
+      rescue ArgumentError
+        @date = Time.now.utc
+      end
+    end
+
+  end
+end
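For example, the factory accepts ordinary story URLs but rejects picture
galleries and pop-ups via INVALID_URL_RE (the URLs below are illustrative):

  factory = WebPageParser::BbcNewsPageParserFactory
  factory.can_parse?(:url => "http://news.bbc.co.uk/1/hi/uk/8041972.stm")
  # => truthy match data, so parser_for hands back a BbcNewsPageParserV2
  factory.can_parse?(:url => "http://news.bbc.co.uk/1/hi/in_pictures/8041972.stm")
  # => nil, so the page is skipped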
data/lib/web-page-parser/parsers/test_page_parser.rb
ADDED
@@ -0,0 +1,15 @@
+class TestPageParserFactory < WebPageParser::ParserFactory
+  @url_regexp = Regexp.new("www.example.com")
+
+  def self.can_parse?(options = {})
+    @url_regexp.match(options[:url])
+  end
+
+  def self.create(options = {})
+    TestPageParser.new(options)
+  end
+end
+
+class TestPageParser < WebPageParser::BaseParser
+
+end
data/spec/base_parser_spec.rb
ADDED
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+$:.unshift File.join(File.dirname(__FILE__), '../lib')
+require 'web-page-parser'
+
+share_as :AllPageParsers do
+  it "is initialized with a hash containing :url and :page keys" do
+    wpp = WebPageParser::BaseParser.new(@valid_options)
+    wpp.url.should == @valid_options[:url]
+    wpp.page.should == @valid_options[:page]
+  end
+
+  it "should return an empty array when there is no content available" do
+    content = WebPageParser::BaseParser.new.content
+    content.should be_a_kind_of Array
+    content.empty?.should be_true
+  end
+
+  context "when hashing the content" do
+    before :each do
+      @wpp = WebPageParser::BaseParser.new(@valid_options)
+      @hash = @wpp.hash
+    end
+
+    it "calculates a hash using the title" do
+      @wpp.instance_eval("@title='different'")
+      @wpp.hash.should_not == @hash
+    end
+
+    it "does not calculate a hash using the date" do
+      @wpp.instance_eval("@date=Time.now")
+      @wpp.hash.should == @hash
+    end
+
+    it "calculates a hash using the content" do
+      @wpp.instance_eval("@content='different'")
+      @wpp.hash.should_not == @hash
+    end
+  end
+end
+
+describe WebPageParser::BaseParser do
+  it_should_behave_like AllPageParsers
+
+  before :each do
+    @valid_options = {
+      :url => 'http://news.bbc.co.uk',
+      :page => '<html></html>',
+      :valid_hash => 'cfcd208495d565ef66e7dff9f98764da'
+    }
+  end
+
+  it "should decode basic html entities" do
+    bp = WebPageParser::BaseParser.new
+    entities = {
+      '&quot;' => '"',
+      '&apos;' => "'",
+      '&amp;' => "&",
+      '&pound;' => '£',
+      '&aacute;' => 'á'
+    }
+    entities.each do |e,v|
+      bp.decode_entities(e).should == v
+    end
+  end
+
+end