web-page-parser 0.10
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +22 -0
- data/README.rdoc +31 -0
- data/lib/web-page-parser/base_parser.rb +149 -0
- data/lib/web-page-parser/parser_factory.rb +54 -0
- data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +93 -0
- data/lib/web-page-parser/parsers/test_page_parser.rb +15 -0
- data/lib/web-page-parser.rb +4 -0
- data/spec/base_parser_spec.rb +67 -0
- data/spec/fixtures/bbc_news/6072486.stm.html +1318 -0
- data/spec/fixtures/bbc_news/7745137.stm.html +2177 -0
- data/spec/fixtures/bbc_news/8011268.stm.html +2899 -0
- data/spec/fixtures/bbc_news/8029015.stm.html +2417 -0
- data/spec/fixtures/bbc_news/8063681.stm.html +2382 -0
- data/spec/parser_factory_spec.rb +18 -0
- data/spec/parsers/bbc_news_page_spec.rb +144 -0
- data/spec/spec.opts +4 -0
- metadata +92 -0
@@ -0,0 +1,18 @@
|
|
1
|
+
$:.unshift File.join(File.dirname(__FILE__), '../lib')
|
2
|
+
require 'web-page-parser'
|
3
|
+
include WebPageParser
|
4
|
+
|
5
|
+
describe ParserFactory do
|
6
|
+
|
7
|
+
it "should load parsers in the parsers directory" do
|
8
|
+
ParserFactory.factories.first.to_s.should == "TestPageParserFactory"
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should provide the right PageParser for the given url" do
|
12
|
+
ParserFactory.parser_for(:url => "http://www.example.com").should be_a_kind_of TestPageParser
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should return nil if no PageParser can be found for the given url" do
|
16
|
+
ParserFactory.parser_for(:url => "http://www.nowhere.nodomain").should be_nil
|
17
|
+
end
|
18
|
+
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
$:.unshift File.join(File.dirname(__FILE__), '../../lib')
|
2
|
+
require 'spec/base_parser_spec'
|
3
|
+
require 'web-page-parser'
|
4
|
+
include WebPageParser
|
5
|
+
|
6
|
+
describe BbcNewsPageParserFactory do
|
7
|
+
before do
|
8
|
+
@valid_urls = [
|
9
|
+
"http://news.bbc.co.uk/1/hi/entertainment/6984082.stm",
|
10
|
+
"http://news.bbc.co.uk/1/hi/northern_ireland/7996478.stm",
|
11
|
+
"http://news.bbc.co.uk/1/hi/uk/7995652.stm",
|
12
|
+
"http://news.bbc.co.uk/1/hi/england/derbyshire/7996494.stm",
|
13
|
+
"http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm"
|
14
|
+
]
|
15
|
+
@invalid_urls = [
|
16
|
+
"http://news.bbc.co.uk/2/hi/health/default.stm",
|
17
|
+
"http://news.bbc.co.uk/2/low/europe/default.stm",
|
18
|
+
"http://news.bbc.co.uk/2/hi/in_pictures/default.stm",
|
19
|
+
"http://news.bbc.co.uk/sport",
|
20
|
+
"http://newsforums.bbc.co.uk/nol/thread.jspa?forumID=6422&edition=1&ttl=20090509133749",
|
21
|
+
"http://www.bbc.co.uk/blogs/nickrobinson/",
|
22
|
+
"http://news.bbc.co.uk/hi/english/static/in_depth/health/2000/heart_disease/default.stm",
|
23
|
+
"http://news.bbc.co.uk/1/shared/spl/hi/pop_ups/08/middle_east_views_on_netanyahu0s_us_visit/html/1.stm"
|
24
|
+
]
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should detect bbc news articles from the url" do
|
28
|
+
@valid_urls.each do |url|
|
29
|
+
BbcNewsPageParserFactory.can_parse?(:url => url).should be_true
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should ignore pages with the wrong url format" do
|
34
|
+
@invalid_urls.each do |url|
|
35
|
+
BbcNewsPageParserFactory.can_parse?(:url => url).should be_nil
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should ignore 'in pictures' articles" do
|
40
|
+
BbcNewsPageParserFactory.can_parse?(:url => 'http://news.bbc.co.uk/1/hi/in_pictures/8039882.stm').should be_nil
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
describe BbcNewsPageParserV2 do
|
45
|
+
it_should_behave_like AllPageParsers
|
46
|
+
before do
|
47
|
+
@valid_options = {
|
48
|
+
:url => 'http://news.bbc.co.uk/1/hi/world/middle_east/8011268.stm',
|
49
|
+
:page => File.read("spec/fixtures/bbc_news/8011268.stm.html"),
|
50
|
+
:valid_hash => 'd9e201abec3f4b9e38865b5135281978'
|
51
|
+
}
|
52
|
+
@pa = BbcNewsPageParserV2.new(@valid_options)
|
53
|
+
end
|
54
|
+
|
55
|
+
it "should parse the title" do
|
56
|
+
@pa.title.should == "Obama invites Middle East heads"
|
57
|
+
end
|
58
|
+
|
59
|
+
it "should convert iso-8859-1 in the title to utf8" do
|
60
|
+
page = BbcNewsPageParserV2.new(:page => '<meta name="Headline" content="'+"\243"+'100K mortgage claim by Woodward"')
|
61
|
+
page.title.should == "£100K mortgage claim by Woodward"
|
62
|
+
end
|
63
|
+
|
64
|
+
it "should convert iso-8859-1 in the content to utf8" do
|
65
|
+
page = BbcNewsPageParserV2.new(:page => "S BO -->\243100K mortgage claim by Woodward<!-- E BO")
|
66
|
+
page.content.first.should == "£100K mortgage claim by Woodward"
|
67
|
+
end
|
68
|
+
|
69
|
+
|
70
|
+
it "should parse the date in UTC" do
|
71
|
+
# 2009/04/21 19:50:44
|
72
|
+
@pa.date.should == DateTime.parse("Apr 21 19:50:44 +0000 2009")
|
73
|
+
@pa.date.zone.should == '+00:00'
|
74
|
+
end
|
75
|
+
|
76
|
+
it "should parse the content" do
|
77
|
+
@pa.content[0].should == "US officials say the leaders of Israel, Egypt and the Palestinians have been invited for talks in Washington in a new push for Middle East peace."
|
78
|
+
@pa.content.last.should == "The US supports a two-state solution, with Israel existing peacefully alongside a Palestinian state."
|
79
|
+
@pa.content.size.should == 15
|
80
|
+
end
|
81
|
+
|
82
|
+
it "should decode html entities" do
|
83
|
+
@pa.content[8].should == 'He added: "We are actively working to finalise dates for the visits."'
|
84
|
+
end
|
85
|
+
|
86
|
+
it "should calculate a valid hash of the content" do
|
87
|
+
@pa.hash.should == @valid_options[:valid_hash]
|
88
|
+
end
|
89
|
+
|
90
|
+
it "should parse 'from our own correspondent' pages" do
|
91
|
+
page = BbcNewsPageParserV2.new(:url => "http://news.bbc.co.uk/1/hi/programmes/from_our_own_correspondent/8029015.stm",
|
92
|
+
:page => File.read("spec/fixtures/bbc_news/8029015.stm.html"))
|
93
|
+
page.title.should == "Cairo's terrifying traffic chaos"
|
94
|
+
page.content.first.should == "Christian Fraser discovers that a brush with death on Cairo's congested roads leaves no appetite for life in the fast lane."
|
95
|
+
end
|
96
|
+
|
97
|
+
it "should parse 'magazine' pages" do
|
98
|
+
page = BbcNewsPageParserV2.new(:url => "http://news.bbc.co.uk/1/hi/magazine/8063681.stm",
|
99
|
+
:page => File.read("spec/fixtures/bbc_news/8063681.stm.html"))
|
100
|
+
page.title.should == "My night with Parisien prostitutes"
|
101
|
+
page.content.first.should == "Wandering around the red light district of Paris as a teenager taught me all I need to know - about teenagers, not women, says Laurie Taylor in his weekly column."
|
102
|
+
end
|
103
|
+
|
104
|
+
end
|
105
|
+
|
106
|
+
describe BbcNewsPageParserV1 do
|
107
|
+
before do
|
108
|
+
@valid_options = {
|
109
|
+
:url => 'http://news.bbc.co.uk/1/hi/england/bradford/6072486.stm',
|
110
|
+
:page => File.read("spec/fixtures/bbc_news/6072486.stm.html"),
|
111
|
+
:valid_hash => 'aaf7ed1219eb69c3126ea5d0774fbe7d'
|
112
|
+
}
|
113
|
+
@pa = BbcNewsPageParserV1.new(@valid_options)
|
114
|
+
end
|
115
|
+
|
116
|
+
it "should parse the title" do
|
117
|
+
@pa.title.should == "Son-in-law remanded over killing"
|
118
|
+
end
|
119
|
+
|
120
|
+
it "should parse the date in UTC" do
|
121
|
+
@pa.date.should == DateTime.parse("Sat Oct 21 14:41:10 +0000 2006")
|
122
|
+
@pa.date.zone.should == '+00:00'
|
123
|
+
end
|
124
|
+
|
125
|
+
it "should parse the content exactly like the old News Sniffer library" do
|
126
|
+
@pa.content.first.should == "<B>The son-in-law of a 73-year-old Castleford widow has been charged with her murder.</B>"
|
127
|
+
@pa.content.last.should == 'He denied the charges against him through his solicitor and is due to appear at Leeds Crown Court on Friday.'
|
128
|
+
@pa.content.size.should == 5
|
129
|
+
@pa.hash.should == @valid_options[:valid_hash]
|
130
|
+
end
|
131
|
+
|
132
|
+
it "should convert apostrophe and pound sign html entities in content" do
|
133
|
+
@pa = BbcNewsPageParserV1.new :page => 'S SF -->John&apos;s code sucks &amp; blows<!-- E BO'
|
134
|
+
@pa.content.to_s.should match Regexp.new("John's")
|
135
|
+
@pa.content.to_s.should match /sucks & blows/
|
136
|
+
end
|
137
|
+
|
138
|
+
it "should convert apostrophe and pound sign html entities in page titles" do
|
139
|
+
@pa = BbcNewsPageParserV1.new :page => '<meta name="Headline" content="John&apos;s code sucks &amp; blows!"/>'
|
140
|
+
@pa.title.should match Regexp.new("John's")
|
141
|
+
@pa.title.should match /sucks & blows/
|
142
|
+
end
|
143
|
+
|
144
|
+
end
|
data/spec/spec.opts
ADDED
metadata
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: web-page-parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: "0.10"
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- John Leach
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-06-20 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: oniguruma
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.1.0
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: htmlentities
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 4.0.0
|
34
|
+
version:
|
35
|
+
description: A Ruby library to parse the content out of web pages, such as BBC News pages. Used by the News Sniffer project.
|
36
|
+
email: john@johnleach.co.uk
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- README.rdoc
|
43
|
+
- LICENSE
|
44
|
+
files:
|
45
|
+
- lib/web-page-parser
|
46
|
+
- lib/web-page-parser/base_parser.rb
|
47
|
+
- lib/web-page-parser/parsers
|
48
|
+
- lib/web-page-parser/parsers/test_page_parser.rb
|
49
|
+
- lib/web-page-parser/parsers/bbc_news_page_parser.rb
|
50
|
+
- lib/web-page-parser/parser_factory.rb
|
51
|
+
- lib/web-page-parser.rb
|
52
|
+
- README.rdoc
|
53
|
+
- LICENSE
|
54
|
+
has_rdoc: true
|
55
|
+
homepage: http://github.com/johnl/web-page-parser/tree/master
|
56
|
+
post_install_message:
|
57
|
+
rdoc_options: []
|
58
|
+
|
59
|
+
require_paths:
|
60
|
+
- lib
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: "0"
|
66
|
+
version:
|
67
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: "0"
|
72
|
+
version:
|
73
|
+
requirements: []
|
74
|
+
|
75
|
+
rubyforge_project: web-page-parser
|
76
|
+
rubygems_version: 1.3.1
|
77
|
+
signing_key:
|
78
|
+
specification_version: 2
|
79
|
+
summary: A parser for web pages
|
80
|
+
test_files:
|
81
|
+
- spec/parser_factory_spec.rb
|
82
|
+
- spec/base_parser_spec.rb
|
83
|
+
- spec/fixtures
|
84
|
+
- spec/fixtures/bbc_news
|
85
|
+
- spec/fixtures/bbc_news/8063681.stm.html
|
86
|
+
- spec/fixtures/bbc_news/8011268.stm.html
|
87
|
+
- spec/fixtures/bbc_news/6072486.stm.html
|
88
|
+
- spec/fixtures/bbc_news/8029015.stm.html
|
89
|
+
- spec/fixtures/bbc_news/7745137.stm.html
|
90
|
+
- spec/parsers
|
91
|
+
- spec/parsers/bbc_news_page_spec.rb
|
92
|
+
- spec/spec.opts
|