web-page-parser 0.10 → 0.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/web-page-parser/base_parser.rb +1 -1
- data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +45 -5
- data/spec/fixtures/bbc_news/10249066.stm.html +1361 -0
- data/spec/fixtures/bbc_news/10341015.stm.html +1278 -0
- data/spec/fixtures/bbc_news/11125504.html +1481 -0
- data/spec/parsers/bbc_news_page_spec.rb +65 -3
- metadata +58 -23
@@ -10,17 +10,24 @@ describe BbcNewsPageParserFactory do
|
|
10
10
|
"http://news.bbc.co.uk/1/hi/northern_ireland/7996478.stm",
|
11
11
|
"http://news.bbc.co.uk/1/hi/uk/7995652.stm",
|
12
12
|
"http://news.bbc.co.uk/1/hi/england/derbyshire/7996494.stm",
|
13
|
-
"http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm"
|
13
|
+
"http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm",
|
14
|
+
"http://www.bbc.co.uk/news/business-11125504",
|
15
|
+
"http://www.bbc.co.uk/news/10604897"
|
14
16
|
]
|
15
17
|
@invalid_urls = [
|
16
18
|
"http://news.bbc.co.uk/2/hi/health/default.stm",
|
17
19
|
"http://news.bbc.co.uk/2/low/europe/default.stm",
|
18
20
|
"http://news.bbc.co.uk/2/hi/in_pictures/default.stm",
|
19
21
|
"http://news.bbc.co.uk/sport",
|
22
|
+
"http://news.bbc.co.uk/sport1/hi/tennis/8951357.stm",
|
20
23
|
"http://newsforums.bbc.co.uk/nol/thread.jspa?forumID=6422&edition=1&ttl=20090509133749",
|
21
24
|
"http://www.bbc.co.uk/blogs/nickrobinson/",
|
22
25
|
"http://news.bbc.co.uk/hi/english/static/in_depth/health/2000/heart_disease/default.stm",
|
23
|
-
"http://news.bbc.co.uk/1/shared/spl/hi/pop_ups/08/middle_east_views_on_netanyahu0s_us_visit/html/1.stm"
|
26
|
+
"http://news.bbc.co.uk/1/shared/spl/hi/pop_ups/08/middle_east_views_on_netanyahu0s_us_visit/html/1.stm",
|
27
|
+
"http://www.bbc.co.uk/blogs/theeditors/",
|
28
|
+
"http://www.bbc.co.uk/news/have_your_say/",
|
29
|
+
"http://news.bbc.co.uk/1/hi/magazine/default.stm",
|
30
|
+
"http://news.bbc.co.uk/1/hi/in_pictures/default.stm"
|
24
31
|
]
|
25
32
|
end
|
26
33
|
|
@@ -41,6 +48,61 @@ describe BbcNewsPageParserFactory do
|
|
41
48
|
end
|
42
49
|
end
|
43
50
|
|
51
|
+
describe BbcNewsPageParserV4 do
|
52
|
+
it_should_behave_like AllPageParsers
|
53
|
+
before do
|
54
|
+
@valid_options = {
|
55
|
+
:url => 'http://www.bbc.co.uk/news/business-11125504',
|
56
|
+
:page => File.read("spec/fixtures/bbc_news/11125504.html"),
|
57
|
+
:valid_hash => 'd9e201abec3f4b9e38865b5135281978'
|
58
|
+
}
|
59
|
+
@pa = BbcNewsPageParserV4.new(@valid_options)
|
60
|
+
end
|
61
|
+
|
62
|
+
it "should parse the title" do
|
63
|
+
@pa.title.should == "UK economy 'to pick up in near term'"
|
64
|
+
end
|
65
|
+
|
66
|
+
it "should parse the content" do
|
67
|
+
@pa.content[0].should == "The British Chambers of Commerce (BCC) has upgraded its forecast for the UK's short term economic prospects, but said interest rates must be kept low to aid recovery."
|
68
|
+
@pa.content.last.should == '"Failure to get this right poses the biggest risk to recovery."'
|
69
|
+
@pa.content.size.should == 18
|
70
|
+
end
|
71
|
+
|
72
|
+
end
|
73
|
+
|
74
|
+
|
75
|
+
describe BbcNewsPageParserV3 do
|
76
|
+
it_should_behave_like AllPageParsers
|
77
|
+
before do
|
78
|
+
@valid_options = {
|
79
|
+
:url => 'http://news.bbc.co.uk/1/hi/england/10249066.stm',
|
80
|
+
:page => File.read("spec/fixtures/bbc_news/10249066.stm.html"),
|
81
|
+
:valid_hash => 'd9e201abec3f4b9e38865b5135281978'
|
82
|
+
}
|
83
|
+
@pa = BbcNewsPageParserV3.new(@valid_options)
|
84
|
+
end
|
85
|
+
|
86
|
+
it "should parse the content" do
|
87
|
+
@pa.content[0].should == 'The family of gunman Derrick Bird say they have no idea why he carried out the "horrific" shootings in Cumbria.'
|
88
|
+
@pa.content.last.should == '"We appreciate what they are suffering at this time. We cannot offer any reason why Derrick took it upon himself to commit these crimes."'
|
89
|
+
@pa.content.size.should == 24
|
90
|
+
end
|
91
|
+
|
92
|
+
it "should parse the content of an article with two captions" do
|
93
|
+
@pa = BbcNewsPageParserV3.new({ :url => "http://news.bbc.co.uk/1/hi/politics/10341015.stm",
|
94
|
+
:page => File.read("spec/fixtures/bbc_news/10341015.stm.html"),
|
95
|
+
:valid_hash => 'unknown'
|
96
|
+
})
|
97
|
+
@pa.content[0].should == "The coalition government has cancelled 12 projects totalling £2bn agreed to by the previous Labour government since the start of 2010."
|
98
|
+
@pa.content[1].should == "These include an £80m loan to Sheffield Forgemasters and new programmes for the young unemployed, Chief Secretary to the Treasury Danny Alexander told MPs."
|
99
|
+
@pa.content[2].should == 'Mr Alexander said the cuts were necessary to tackle the budget deficit and would be done in a "fair" way.'
|
100
|
+
end
|
101
|
+
|
102
|
+
|
103
|
+
|
104
|
+
end
|
105
|
+
|
44
106
|
describe BbcNewsPageParserV2 do
|
45
107
|
it_should_behave_like AllPageParsers
|
46
108
|
before do
|
@@ -140,5 +202,5 @@ describe BbcNewsPageParserV1 do
|
|
140
202
|
@pa.title.should match Regexp.new("John's")
|
141
203
|
@pa.title.should match /sucks & blows/
|
142
204
|
end
|
143
|
-
|
205
|
+
|
144
206
|
end
|
metadata
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web-page-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 33
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 21
|
9
|
+
version: "0.21"
|
5
10
|
platform: ruby
|
6
11
|
authors:
|
7
12
|
- John Leach
|
@@ -9,29 +14,41 @@ autorequire:
|
|
9
14
|
bindir: bin
|
10
15
|
cert_chain: []
|
11
16
|
|
12
|
-
date:
|
17
|
+
date: 2010-08-30 00:00:00 +01:00
|
13
18
|
default_executable:
|
14
19
|
dependencies:
|
15
20
|
- !ruby/object:Gem::Dependency
|
16
21
|
name: oniguruma
|
17
|
-
|
18
|
-
|
19
|
-
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
20
25
|
requirements:
|
21
26
|
- - ">="
|
22
27
|
- !ruby/object:Gem::Version
|
28
|
+
hash: 19
|
29
|
+
segments:
|
30
|
+
- 1
|
31
|
+
- 1
|
32
|
+
- 0
|
23
33
|
version: 1.1.0
|
24
|
-
|
34
|
+
type: :runtime
|
35
|
+
version_requirements: *id001
|
25
36
|
- !ruby/object:Gem::Dependency
|
26
37
|
name: htmlentities
|
27
|
-
|
28
|
-
|
29
|
-
|
38
|
+
prerelease: false
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
30
41
|
requirements:
|
31
42
|
- - ">="
|
32
43
|
- !ruby/object:Gem::Version
|
44
|
+
hash: 63
|
45
|
+
segments:
|
46
|
+
- 4
|
47
|
+
- 0
|
48
|
+
- 0
|
33
49
|
version: 4.0.0
|
34
|
-
|
50
|
+
type: :runtime
|
51
|
+
version_requirements: *id002
|
35
52
|
description: A Ruby library to parse the content out of web pages, such as BBC News pages. Used by the News Sniffer project.
|
36
53
|
email: john@johnleach.co.uk
|
37
54
|
executables: []
|
@@ -42,51 +59,69 @@ extra_rdoc_files:
|
|
42
59
|
- README.rdoc
|
43
60
|
- LICENSE
|
44
61
|
files:
|
45
|
-
- lib/web-page-parser
|
62
|
+
- lib/web-page-parser/parser_factory.rb
|
46
63
|
- lib/web-page-parser/base_parser.rb
|
47
|
-
- lib/web-page-parser/parsers
|
48
64
|
- lib/web-page-parser/parsers/test_page_parser.rb
|
49
65
|
- lib/web-page-parser/parsers/bbc_news_page_parser.rb
|
50
|
-
- lib/web-page-parser/parser_factory.rb
|
51
66
|
- lib/web-page-parser.rb
|
52
67
|
- README.rdoc
|
53
68
|
- LICENSE
|
69
|
+
- spec/parser_factory_spec.rb
|
70
|
+
- spec/base_parser_spec.rb
|
71
|
+
- spec/parsers/bbc_news_page_spec.rb
|
72
|
+
- spec/fixtures/bbc_news/10341015.stm.html
|
73
|
+
- spec/fixtures/bbc_news/8029015.stm.html
|
74
|
+
- spec/fixtures/bbc_news/7745137.stm.html
|
75
|
+
- spec/fixtures/bbc_news/8063681.stm.html
|
76
|
+
- spec/fixtures/bbc_news/10249066.stm.html
|
77
|
+
- spec/fixtures/bbc_news/8011268.stm.html
|
78
|
+
- spec/fixtures/bbc_news/11125504.html
|
79
|
+
- spec/fixtures/bbc_news/6072486.stm.html
|
80
|
+
- spec/spec.opts
|
54
81
|
has_rdoc: true
|
55
82
|
homepage: http://github.com/johnl/web-page-parser/tree/master
|
83
|
+
licenses: []
|
84
|
+
|
56
85
|
post_install_message:
|
57
86
|
rdoc_options: []
|
58
87
|
|
59
88
|
require_paths:
|
60
89
|
- lib
|
61
90
|
required_ruby_version: !ruby/object:Gem::Requirement
|
91
|
+
none: false
|
62
92
|
requirements:
|
63
93
|
- - ">="
|
64
94
|
- !ruby/object:Gem::Version
|
95
|
+
hash: 3
|
96
|
+
segments:
|
97
|
+
- 0
|
65
98
|
version: "0"
|
66
|
-
version:
|
67
99
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
100
|
+
none: false
|
68
101
|
requirements:
|
69
102
|
- - ">="
|
70
103
|
- !ruby/object:Gem::Version
|
104
|
+
hash: 3
|
105
|
+
segments:
|
106
|
+
- 0
|
71
107
|
version: "0"
|
72
|
-
version:
|
73
108
|
requirements: []
|
74
109
|
|
75
110
|
rubyforge_project: web-page-parser
|
76
|
-
rubygems_version: 1.3.
|
111
|
+
rubygems_version: 1.3.7
|
77
112
|
signing_key:
|
78
|
-
specification_version:
|
113
|
+
specification_version: 3
|
79
114
|
summary: A parser for web pages
|
80
115
|
test_files:
|
81
116
|
- spec/parser_factory_spec.rb
|
82
117
|
- spec/base_parser_spec.rb
|
83
|
-
- spec/
|
84
|
-
- spec/fixtures/bbc_news
|
118
|
+
- spec/parsers/bbc_news_page_spec.rb
|
119
|
+
- spec/fixtures/bbc_news/10341015.stm.html
|
120
|
+
- spec/fixtures/bbc_news/8029015.stm.html
|
121
|
+
- spec/fixtures/bbc_news/7745137.stm.html
|
85
122
|
- spec/fixtures/bbc_news/8063681.stm.html
|
123
|
+
- spec/fixtures/bbc_news/10249066.stm.html
|
86
124
|
- spec/fixtures/bbc_news/8011268.stm.html
|
125
|
+
- spec/fixtures/bbc_news/11125504.html
|
87
126
|
- spec/fixtures/bbc_news/6072486.stm.html
|
88
|
-
- spec/fixtures/bbc_news/8029015.stm.html
|
89
|
-
- spec/fixtures/bbc_news/7745137.stm.html
|
90
|
-
- spec/parsers
|
91
|
-
- spec/parsers/bbc_news_page_spec.rb
|
92
127
|
- spec/spec.opts
|