web-page-parser 0.10 → 0.21
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/web-page-parser/base_parser.rb +1 -1
- data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +45 -5
- data/spec/fixtures/bbc_news/10249066.stm.html +1361 -0
- data/spec/fixtures/bbc_news/10341015.stm.html +1278 -0
- data/spec/fixtures/bbc_news/11125504.html +1481 -0
- data/spec/parsers/bbc_news_page_spec.rb +65 -3
- metadata +58 -23
@@ -10,17 +10,24 @@ describe BbcNewsPageParserFactory do
|
|
10
10
|
"http://news.bbc.co.uk/1/hi/northern_ireland/7996478.stm",
|
11
11
|
"http://news.bbc.co.uk/1/hi/uk/7995652.stm",
|
12
12
|
"http://news.bbc.co.uk/1/hi/england/derbyshire/7996494.stm",
|
13
|
-
"http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm"
|
13
|
+
"http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm",
|
14
|
+
"http://www.bbc.co.uk/news/business-11125504",
|
15
|
+
"http://www.bbc.co.uk/news/10604897"
|
14
16
|
]
|
15
17
|
@invalid_urls = [
|
16
18
|
"http://news.bbc.co.uk/2/hi/health/default.stm",
|
17
19
|
"http://news.bbc.co.uk/2/low/europe/default.stm",
|
18
20
|
"http://news.bbc.co.uk/2/hi/in_pictures/default.stm",
|
19
21
|
"http://news.bbc.co.uk/sport",
|
22
|
+
"http://news.bbc.co.uk/sport1/hi/tennis/8951357.stm",
|
20
23
|
"http://newsforums.bbc.co.uk/nol/thread.jspa?forumID=6422&edition=1&ttl=20090509133749",
|
21
24
|
"http://www.bbc.co.uk/blogs/nickrobinson/",
|
22
25
|
"http://news.bbc.co.uk/hi/english/static/in_depth/health/2000/heart_disease/default.stm",
|
23
|
-
"http://news.bbc.co.uk/1/shared/spl/hi/pop_ups/08/middle_east_views_on_netanyahu0s_us_visit/html/1.stm"
|
26
|
+
"http://news.bbc.co.uk/1/shared/spl/hi/pop_ups/08/middle_east_views_on_netanyahu0s_us_visit/html/1.stm",
|
27
|
+
"http://www.bbc.co.uk/blogs/theeditors/",
|
28
|
+
"http://www.bbc.co.uk/news/have_your_say/",
|
29
|
+
"http://news.bbc.co.uk/1/hi/magazine/default.stm",
|
30
|
+
"http://news.bbc.co.uk/1/hi/in_pictures/default.stm"
|
24
31
|
]
|
25
32
|
end
|
26
33
|
|
@@ -41,6 +48,61 @@ describe BbcNewsPageParserFactory do
|
|
41
48
|
end
|
42
49
|
end
|
43
50
|
|
51
|
+
describe BbcNewsPageParserV4 do
|
52
|
+
it_should_behave_like AllPageParsers
|
53
|
+
before do
|
54
|
+
@valid_options = {
|
55
|
+
:url => 'http://www.bbc.co.uk/news/business-11125504',
|
56
|
+
:page => File.read("spec/fixtures/bbc_news/11125504.html"),
|
57
|
+
:valid_hash => 'd9e201abec3f4b9e38865b5135281978'
|
58
|
+
}
|
59
|
+
@pa = BbcNewsPageParserV4.new(@valid_options)
|
60
|
+
end
|
61
|
+
|
62
|
+
it "should parse the title" do
|
63
|
+
@pa.title.should == "UK economy 'to pick up in near term'"
|
64
|
+
end
|
65
|
+
|
66
|
+
it "should parse the content" do
|
67
|
+
@pa.content[0].should == "The British Chambers of Commerce (BCC) has upgraded its forecast for the UK's short term economic prospects, but said interest rates must be kept low to aid recovery."
|
68
|
+
@pa.content.last.should == '"Failure to get this right poses the biggest risk to recovery."'
|
69
|
+
@pa.content.size.should == 18
|
70
|
+
end
|
71
|
+
|
72
|
+
end
|
73
|
+
|
74
|
+
|
75
|
+
describe BbcNewsPageParserV3 do
|
76
|
+
it_should_behave_like AllPageParsers
|
77
|
+
before do
|
78
|
+
@valid_options = {
|
79
|
+
:url => 'http://news.bbc.co.uk/1/hi/england/10249066.stm',
|
80
|
+
:page => File.read("spec/fixtures/bbc_news/10249066.stm.html"),
|
81
|
+
:valid_hash => 'd9e201abec3f4b9e38865b5135281978'
|
82
|
+
}
|
83
|
+
@pa = BbcNewsPageParserV3.new(@valid_options)
|
84
|
+
end
|
85
|
+
|
86
|
+
it "should parse the content" do
|
87
|
+
@pa.content[0].should == 'The family of gunman Derrick Bird say they have no idea why he carried out the "horrific" shootings in Cumbria.'
|
88
|
+
@pa.content.last.should == '"We appreciate what they are suffering at this time. We cannot offer any reason why Derrick took it upon himself to commit these crimes."'
|
89
|
+
@pa.content.size.should == 24
|
90
|
+
end
|
91
|
+
|
92
|
+
it "should parse the content of an article with two captions" do
|
93
|
+
@pa = BbcNewsPageParserV3.new({ :url => "http://news.bbc.co.uk/1/hi/politics/10341015.stm",
|
94
|
+
:page => File.read("spec/fixtures/bbc_news/10341015.stm.html"),
|
95
|
+
:valid_hash => 'unknown'
|
96
|
+
})
|
97
|
+
@pa.content[0].should == "The coalition government has cancelled 12 projects totalling £2bn agreed to by the previous Labour government since the start of 2010."
|
98
|
+
@pa.content[1].should == "These include an £80m loan to Sheffield Forgemasters and new programmes for the young unemployed, Chief Secretary to the Treasury Danny Alexander told MPs."
|
99
|
+
@pa.content[2].should == 'Mr Alexander said the cuts were necessary to tackle the budget deficit and would be done in a "fair" way.'
|
100
|
+
end
|
101
|
+
|
102
|
+
|
103
|
+
|
104
|
+
end
|
105
|
+
|
44
106
|
describe BbcNewsPageParserV2 do
|
45
107
|
it_should_behave_like AllPageParsers
|
46
108
|
before do
|
@@ -140,5 +202,5 @@ describe BbcNewsPageParserV1 do
|
|
140
202
|
@pa.title.should match Regexp.new("John's")
|
141
203
|
@pa.title.should match /sucks & blows/
|
142
204
|
end
|
143
|
-
|
205
|
+
|
144
206
|
end
|
metadata
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web-page-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 33
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 21
|
9
|
+
version: "0.21"
|
5
10
|
platform: ruby
|
6
11
|
authors:
|
7
12
|
- John Leach
|
@@ -9,29 +14,41 @@ autorequire:
|
|
9
14
|
bindir: bin
|
10
15
|
cert_chain: []
|
11
16
|
|
12
|
-
date:
|
17
|
+
date: 2010-08-30 00:00:00 +01:00
|
13
18
|
default_executable:
|
14
19
|
dependencies:
|
15
20
|
- !ruby/object:Gem::Dependency
|
16
21
|
name: oniguruma
|
17
|
-
|
18
|
-
|
19
|
-
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
20
25
|
requirements:
|
21
26
|
- - ">="
|
22
27
|
- !ruby/object:Gem::Version
|
28
|
+
hash: 19
|
29
|
+
segments:
|
30
|
+
- 1
|
31
|
+
- 1
|
32
|
+
- 0
|
23
33
|
version: 1.1.0
|
24
|
-
|
34
|
+
type: :runtime
|
35
|
+
version_requirements: *id001
|
25
36
|
- !ruby/object:Gem::Dependency
|
26
37
|
name: htmlentities
|
27
|
-
|
28
|
-
|
29
|
-
|
38
|
+
prerelease: false
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
30
41
|
requirements:
|
31
42
|
- - ">="
|
32
43
|
- !ruby/object:Gem::Version
|
44
|
+
hash: 63
|
45
|
+
segments:
|
46
|
+
- 4
|
47
|
+
- 0
|
48
|
+
- 0
|
33
49
|
version: 4.0.0
|
34
|
-
|
50
|
+
type: :runtime
|
51
|
+
version_requirements: *id002
|
35
52
|
description: A Ruby library to parse the content out of web pages, such as BBC News pages. Used by the News Sniffer project.
|
36
53
|
email: john@johnleach.co.uk
|
37
54
|
executables: []
|
@@ -42,51 +59,69 @@ extra_rdoc_files:
|
|
42
59
|
- README.rdoc
|
43
60
|
- LICENSE
|
44
61
|
files:
|
45
|
-
- lib/web-page-parser
|
62
|
+
- lib/web-page-parser/parser_factory.rb
|
46
63
|
- lib/web-page-parser/base_parser.rb
|
47
|
-
- lib/web-page-parser/parsers
|
48
64
|
- lib/web-page-parser/parsers/test_page_parser.rb
|
49
65
|
- lib/web-page-parser/parsers/bbc_news_page_parser.rb
|
50
|
-
- lib/web-page-parser/parser_factory.rb
|
51
66
|
- lib/web-page-parser.rb
|
52
67
|
- README.rdoc
|
53
68
|
- LICENSE
|
69
|
+
- spec/parser_factory_spec.rb
|
70
|
+
- spec/base_parser_spec.rb
|
71
|
+
- spec/parsers/bbc_news_page_spec.rb
|
72
|
+
- spec/fixtures/bbc_news/10341015.stm.html
|
73
|
+
- spec/fixtures/bbc_news/8029015.stm.html
|
74
|
+
- spec/fixtures/bbc_news/7745137.stm.html
|
75
|
+
- spec/fixtures/bbc_news/8063681.stm.html
|
76
|
+
- spec/fixtures/bbc_news/10249066.stm.html
|
77
|
+
- spec/fixtures/bbc_news/8011268.stm.html
|
78
|
+
- spec/fixtures/bbc_news/11125504.html
|
79
|
+
- spec/fixtures/bbc_news/6072486.stm.html
|
80
|
+
- spec/spec.opts
|
54
81
|
has_rdoc: true
|
55
82
|
homepage: http://github.com/johnl/web-page-parser/tree/master
|
83
|
+
licenses: []
|
84
|
+
|
56
85
|
post_install_message:
|
57
86
|
rdoc_options: []
|
58
87
|
|
59
88
|
require_paths:
|
60
89
|
- lib
|
61
90
|
required_ruby_version: !ruby/object:Gem::Requirement
|
91
|
+
none: false
|
62
92
|
requirements:
|
63
93
|
- - ">="
|
64
94
|
- !ruby/object:Gem::Version
|
95
|
+
hash: 3
|
96
|
+
segments:
|
97
|
+
- 0
|
65
98
|
version: "0"
|
66
|
-
version:
|
67
99
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
100
|
+
none: false
|
68
101
|
requirements:
|
69
102
|
- - ">="
|
70
103
|
- !ruby/object:Gem::Version
|
104
|
+
hash: 3
|
105
|
+
segments:
|
106
|
+
- 0
|
71
107
|
version: "0"
|
72
|
-
version:
|
73
108
|
requirements: []
|
74
109
|
|
75
110
|
rubyforge_project: web-page-parser
|
76
|
-
rubygems_version: 1.3.
|
111
|
+
rubygems_version: 1.3.7
|
77
112
|
signing_key:
|
78
|
-
specification_version:
|
113
|
+
specification_version: 3
|
79
114
|
summary: A parser for web pages
|
80
115
|
test_files:
|
81
116
|
- spec/parser_factory_spec.rb
|
82
117
|
- spec/base_parser_spec.rb
|
83
|
-
- spec/
|
84
|
-
- spec/fixtures/bbc_news
|
118
|
+
- spec/parsers/bbc_news_page_spec.rb
|
119
|
+
- spec/fixtures/bbc_news/10341015.stm.html
|
120
|
+
- spec/fixtures/bbc_news/8029015.stm.html
|
121
|
+
- spec/fixtures/bbc_news/7745137.stm.html
|
85
122
|
- spec/fixtures/bbc_news/8063681.stm.html
|
123
|
+
- spec/fixtures/bbc_news/10249066.stm.html
|
86
124
|
- spec/fixtures/bbc_news/8011268.stm.html
|
125
|
+
- spec/fixtures/bbc_news/11125504.html
|
87
126
|
- spec/fixtures/bbc_news/6072486.stm.html
|
88
|
-
- spec/fixtures/bbc_news/8029015.stm.html
|
89
|
-
- spec/fixtures/bbc_news/7745137.stm.html
|
90
|
-
- spec/parsers
|
91
|
-
- spec/parsers/bbc_news_page_spec.rb
|
92
127
|
- spec/spec.opts
|