web-page-parser 0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +22 -0
- data/README.rdoc +31 -0
- data/lib/web-page-parser/base_parser.rb +149 -0
- data/lib/web-page-parser/parser_factory.rb +54 -0
- data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +93 -0
- data/lib/web-page-parser/parsers/test_page_parser.rb +15 -0
- data/lib/web-page-parser.rb +4 -0
- data/spec/base_parser_spec.rb +67 -0
- data/spec/fixtures/bbc_news/6072486.stm.html +1318 -0
- data/spec/fixtures/bbc_news/7745137.stm.html +2177 -0
- data/spec/fixtures/bbc_news/8011268.stm.html +2899 -0
- data/spec/fixtures/bbc_news/8029015.stm.html +2417 -0
- data/spec/fixtures/bbc_news/8063681.stm.html +2382 -0
- data/spec/parser_factory_spec.rb +18 -0
- data/spec/parsers/bbc_news_page_spec.rb +144 -0
- data/spec/spec.opts +4 -0
- metadata +92 -0
@@ -0,0 +1,18 @@
|
|
1
|
+
$:.unshift File.join(File.dirname(__FILE__), '../lib')
|
2
|
+
require 'web-page-parser'
|
3
|
+
include WebPageParser
|
4
|
+
|
5
|
+
# Specs for ParserFactory: checks which factory is registered first and
# which parser (if any) is handed back for a given :url option.
describe ParserFactory do

  it "should load parsers in the parsers directory" do
    # Compare by name: the first registered factory must stringify to the
    # test parser's factory.
    first_factory_name = ParserFactory.factories.first.to_s
    first_factory_name.should == "TestPageParserFactory"
  end

  it "should provide the right PageParser for the given url" do
    parser = ParserFactory.parser_for(:url => "http://www.example.com")
    parser.should be_a_kind_of(TestPageParser)
  end

  it "should return nil if no PageParser can be found for the given url" do
    parser = ParserFactory.parser_for(:url => "http://www.nowhere.nodomain")
    parser.should be_nil
  end

end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
$:.unshift File.join(File.dirname(__FILE__), '../../lib')
|
2
|
+
require 'spec/base_parser_spec'
|
3
|
+
require 'web-page-parser'
|
4
|
+
include WebPageParser
|
5
|
+
|
6
|
+
# Specs for BbcNewsPageParserFactory.can_parse?: the factory is expected to
# answer truthy for the URLs in @valid_urls and nil for everything else.
describe BbcNewsPageParserFactory do
  before do
    # URLs the factory is expected to accept.
    @valid_urls = %w[
      http://news.bbc.co.uk/1/hi/entertainment/6984082.stm
      http://news.bbc.co.uk/1/hi/northern_ireland/7996478.stm
      http://news.bbc.co.uk/1/hi/uk/7995652.stm
      http://news.bbc.co.uk/1/hi/england/derbyshire/7996494.stm
      http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm
    ]
    # URLs the factory is expected to reject.
    @invalid_urls = %w[
      http://news.bbc.co.uk/2/hi/health/default.stm
      http://news.bbc.co.uk/2/low/europe/default.stm
      http://news.bbc.co.uk/2/hi/in_pictures/default.stm
      http://news.bbc.co.uk/sport
      http://newsforums.bbc.co.uk/nol/thread.jspa?forumID=6422&edition=1&ttl=20090509133749
      http://www.bbc.co.uk/blogs/nickrobinson/
      http://news.bbc.co.uk/hi/english/static/in_depth/health/2000/heart_disease/default.stm
      http://news.bbc.co.uk/1/shared/spl/hi/pop_ups/08/middle_east_views_on_netanyahu0s_us_visit/html/1.stm
    ]
  end

  it "should detect bbc news articles from the url" do
    @valid_urls.each { |candidate|
      BbcNewsPageParserFactory.can_parse?(:url => candidate).should be_true
    }
  end

  it "should ignore pages with the wrong url format" do
    @invalid_urls.each { |candidate|
      BbcNewsPageParserFactory.can_parse?(:url => candidate).should be_nil
    }
  end

  it "should ignore 'in pictures' articles" do
    # Has the same shape as an accepted article URL, but sits in the
    # in_pictures section.
    gallery_url = 'http://news.bbc.co.uk/1/hi/in_pictures/8039882.stm'
    BbcNewsPageParserFactory.can_parse?(:url => gallery_url).should be_nil
  end
end
|
43
|
+
|
44
|
+
# Specs for BbcNewsPageParserV2, driven mostly by the saved 8011268 fixture
# page plus a few hand-built page fragments.
describe BbcNewsPageParserV2 do
  it_should_behave_like AllPageParsers

  # Every example starts with a parser built from the fixture page; the
  # :valid_hash entry is the digest the parsed content is expected to have.
  before do
    @valid_options = {
      :url => 'http://news.bbc.co.uk/1/hi/world/middle_east/8011268.stm',
      :page => File.read("spec/fixtures/bbc_news/8011268.stm.html"),
      :valid_hash => 'd9e201abec3f4b9e38865b5135281978'
    }
    @pa = BbcNewsPageParserV2.new(@valid_options)
  end

  it "should parse the title" do
    expected_title = "Obama invites Middle East heads"
    @pa.title.should == expected_title
  end

  it "should convert iso-8859-1 in the title to utf8" do
    # "\243" is the octal escape for byte 0xA3, the pound sign in iso-8859-1.
    headline_meta = '<meta name="Headline" content="' + "\243" + '100K mortgage claim by Woodward"'
    page = BbcNewsPageParserV2.new(:page => headline_meta)
    page.title.should == "£100K mortgage claim by Woodward"
  end

  it "should convert iso-8859-1 in the content to utf8" do
    body_fragment = "S BO -->\243100K mortgage claim by Woodward<!-- E BO"
    page = BbcNewsPageParserV2.new(:page => body_fragment)
    page.content.first.should == "£100K mortgage claim by Woodward"
  end


  it "should parse the date in UTC" do
    # 2009/04/21 19:50:44
    expected_date = DateTime.parse("Apr 21 19:50:44 +0000 2009")
    @pa.date.should == expected_date
    @pa.date.zone.should == '+00:00'
  end

  it "should parse the content" do
    opening = "US officials say the leaders of Israel, Egypt and the Palestinians have been invited for talks in Washington in a new push for Middle East peace."
    closing = "The US supports a two-state solution, with Israel existing peacefully alongside a Palestinian state."
    @pa.content[0].should == opening
    @pa.content.last.should == closing
    @pa.content.size.should == 15
  end

  it "should decode html entities" do
    # The expected paragraph contains literal double quotes.
    quoted_paragraph = 'He added: "We are actively working to finalise dates for the visits."'
    @pa.content[8].should == quoted_paragraph
  end

  it "should calculate a valid hash of the content" do
    expected_digest = @valid_options[:valid_hash]
    @pa.hash.should == expected_digest
  end

  it "should parse 'from our own correspondent' pages" do
    page = BbcNewsPageParserV2.new(
      :url => "http://news.bbc.co.uk/1/hi/programmes/from_our_own_correspondent/8029015.stm",
      :page => File.read("spec/fixtures/bbc_news/8029015.stm.html")
    )
    page.title.should == "Cairo's terrifying traffic chaos"
    page.content.first.should == "Christian Fraser discovers that a brush with death on Cairo's congested roads leaves no appetite for life in the fast lane."
  end

  it "should parse 'magazine' pages" do
    page = BbcNewsPageParserV2.new(
      :url => "http://news.bbc.co.uk/1/hi/magazine/8063681.stm",
      :page => File.read("spec/fixtures/bbc_news/8063681.stm.html")
    )
    page.title.should == "My night with Parisien prostitutes"
    page.content.first.should == "Wandering around the red light district of Paris as a teenager taught me all I need to know - about teenagers, not women, says Laurie Taylor in his weekly column."
  end

end
|
105
|
+
|
106
|
+
# Specs for BbcNewsPageParserV1, driven by the saved 6072486 fixture page
# plus two hand-built fragments for HTML entity handling.
describe BbcNewsPageParserV1 do
  # Every example starts with a parser built from the fixture page; the
  # :valid_hash entry is the digest the parsed content is expected to have.
  before do
    @valid_options = {
      :url => 'http://news.bbc.co.uk/1/hi/england/bradford/6072486.stm',
      :page => File.read("spec/fixtures/bbc_news/6072486.stm.html"),
      :valid_hash => 'aaf7ed1219eb69c3126ea5d0774fbe7d'
    }
    @pa = BbcNewsPageParserV1.new(@valid_options)
  end

  it "should parse the title" do
    @pa.title.should == "Son-in-law remanded over killing"
  end

  it "should parse the date in UTC" do
    @pa.date.should == DateTime.parse("Sat Oct 21 14:41:10 +0000 2006")
    @pa.date.zone.should == '+00:00'
  end

  it "should parse the content exactly like the old News Sniffer library" do
    # The first paragraph is expected to keep its <B> markup.
    @pa.content.first.should == "<B>The son-in-law of a 73-year-old Castleford widow has been charged with her murder.</B>"
    @pa.content.last.should == 'He denied the charges against him through his solicitor and is due to appear at Leeds Crown Court on Friday.'
    @pa.content.size.should == 5
    @pa.hash.should == @valid_options[:valid_hash]
  end

  # NOTE(review): the two descriptions below mention the pound sign, but the
  # assertions only cover the apostrophe and the ampersand.
  it "should convert apostrophe and pound sign html entities in content" do
    # The input must carry the entities in encoded form: a raw apostrophe
    # would end the single-quoted literal, and a pre-decoded input would not
    # exercise the conversion this example is named after.
    @pa = BbcNewsPageParserV1.new :page => 'S SF -->John&apos;s code sucks &amp; blows<!-- E BO'
    @pa.content.to_s.should match(Regexp.new("John's"))
    @pa.content.to_s.should match(/sucks & blows/)
  end

  it "should convert apostrophe and pound sign html entities in page titles" do
    # Same reasoning as above: entities stay encoded in the input.
    @pa = BbcNewsPageParserV1.new :page => '<meta name="Headline" content="John&apos;s code sucks &amp; blows!"/>'
    @pa.title.should match(Regexp.new("John's"))
    @pa.title.should match(/sucks & blows/)
  end

end
|
data/spec/spec.opts
ADDED
metadata
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: web-page-parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: "0.10"
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- John Leach
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-06-20 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: oniguruma
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.1.0
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: htmlentities
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 4.0.0
|
34
|
+
version:
|
35
|
+
description: A Ruby library to parse the content out of web pages, such as BBC News pages. Used by the News Sniffer project.
|
36
|
+
email: john@johnleach.co.uk
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- README.rdoc
|
43
|
+
- LICENSE
|
44
|
+
files:
|
45
|
+
- lib/web-page-parser
|
46
|
+
- lib/web-page-parser/base_parser.rb
|
47
|
+
- lib/web-page-parser/parsers
|
48
|
+
- lib/web-page-parser/parsers/test_page_parser.rb
|
49
|
+
- lib/web-page-parser/parsers/bbc_news_page_parser.rb
|
50
|
+
- lib/web-page-parser/parser_factory.rb
|
51
|
+
- lib/web-page-parser.rb
|
52
|
+
- README.rdoc
|
53
|
+
- LICENSE
|
54
|
+
has_rdoc: true
|
55
|
+
homepage: http://github.com/johnl/web-page-parser/tree/master
|
56
|
+
post_install_message:
|
57
|
+
rdoc_options: []
|
58
|
+
|
59
|
+
require_paths:
|
60
|
+
- lib
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: "0"
|
66
|
+
version:
|
67
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: "0"
|
72
|
+
version:
|
73
|
+
requirements: []
|
74
|
+
|
75
|
+
rubyforge_project: web-page-parser
|
76
|
+
rubygems_version: 1.3.1
|
77
|
+
signing_key:
|
78
|
+
specification_version: 2
|
79
|
+
summary: A parser for web pages
|
80
|
+
test_files:
|
81
|
+
- spec/parser_factory_spec.rb
|
82
|
+
- spec/base_parser_spec.rb
|
83
|
+
- spec/fixtures
|
84
|
+
- spec/fixtures/bbc_news
|
85
|
+
- spec/fixtures/bbc_news/8063681.stm.html
|
86
|
+
- spec/fixtures/bbc_news/8011268.stm.html
|
87
|
+
- spec/fixtures/bbc_news/6072486.stm.html
|
88
|
+
- spec/fixtures/bbc_news/8029015.stm.html
|
89
|
+
- spec/fixtures/bbc_news/7745137.stm.html
|
90
|
+
- spec/parsers
|
91
|
+
- spec/parsers/bbc_news_page_spec.rb
|
92
|
+
- spec/spec.opts
|