web-page-parser 0.23 → 0.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +1 -1
- data/README.rdoc +17 -7
- data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +3 -2
- data/lib/web-page-parser/parsers/guardian_page_parser.rb +45 -0
- data/spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html +3556 -0
- data/spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html +7624 -0
- data/spec/parsers/bbc_news_page_spec.rb +1 -1
- data/spec/parsers/guardian_page_spec.rb +98 -0
- metadata +56 -65
@@ -14,7 +14,7 @@ describe BbcNewsPageParserFactory do
|
|
14
14
|
"http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm",
|
15
15
|
"http://www.bbc.co.uk/news/business-11125504",
|
16
16
|
"http://www.bbc.co.uk/news/10604897",
|
17
|
-
"http://www.bbc.co.uk/news/world-middle-east-
|
17
|
+
"http://www.bbc.co.uk/news/world-middle-east-18229870#sa-ns_mchannel=rss&ns_source=PublicRSS20-sa"
|
18
18
|
]
|
19
19
|
@invalid_urls = [
|
20
20
|
"http://news.bbc.co.uk/2/hi/health/default.stm",
|
@@ -0,0 +1,98 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
$:.unshift File.join(File.dirname(__FILE__), '../../lib')
|
3
|
+
require 'spec/base_parser_spec'
|
4
|
+
require 'web-page-parser'
|
5
|
+
include WebPageParser
|
6
|
+
|
7
|
+
describe GuardianPageParserFactory do
|
8
|
+
before do
|
9
|
+
@valid_urls = [
|
10
|
+
"http://www.guardian.co.uk/business/2012/jan/27/anger-grows-rbs-chiefs-bonus",
|
11
|
+
"http://www.guardian.co.uk/commentisfree/2012/jan/27/ian-jack-battle-for-scotland",
|
12
|
+
"http://www.guardian.co.uk/environment/bike-blog/2012/jan/27/hgv-cyclists-safety-bike-blog",
|
13
|
+
"http://www.guardian.co.uk/tv-and-radio/2012/jan/26/well-take-manhattan-david-bailey",
|
14
|
+
]
|
15
|
+
@invalid_urls = [
|
16
|
+
"http://www.guardian.co.uk/business",
|
17
|
+
"http://www.guardian.co.uk/mobile/apps",
|
18
|
+
"http://www.guardian.co.uk/business/nils-pratley-on-finance",
|
19
|
+
"http://www.guardian.co.uk/commentisfree/commentisfree+uk/uk",
|
20
|
+
"http://www.guardian.co.uk/help/feeds",
|
21
|
+
"http://www.guardian.co.uk/uk/cartoon/2012/jan/28/nicolas-sarkozy-caricature",
|
22
|
+
"http://www.guardian.co.uk/commentisfree/poll/2012/jan/30/smacking-children-david-lammy",
|
23
|
+
"http://www.guardian.co.uk/uk/video/2012/may/13/occupy-protesters-clash-police-video",
|
24
|
+
"http://www.guardian.co.uk/uk/gallery/2012/may/10/public-sector-protests-in-pictures",
|
25
|
+
"http://www.guardian.co.uk/media/video/2012/may/24/chris-huhne-partner-privacy-case-video",
|
26
|
+
"http://www.guardian.co.uk/business/poll/2012/may/09/greek-exit-euro-inevitable"
|
27
|
+
]
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should detect guardian articles from the url" do
|
31
|
+
@valid_urls.each do |url|
|
32
|
+
GuardianPageParserFactory.can_parse?(:url => url).should be_true
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should ignore pages with the wrong url format" do
|
37
|
+
@invalid_urls.each do |url|
|
38
|
+
GuardianPageParserFactory.can_parse?(:url => url).should be_nil
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
|
44
|
+
|
45
|
+
describe GuardianPageParserV1 do
|
46
|
+
|
47
|
+
describe "when parsing the anger-grows article" do
|
48
|
+
before do
|
49
|
+
@valid_options = {
|
50
|
+
:url => 'http://www.guardian.co.uk/business/2012/jan/27/anger-grows-rbs-chiefs-bonus',
|
51
|
+
:page => File.read("spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html"),
|
52
|
+
:valid_hash => '04108a9a7e3196da185e4d10432740a1'
|
53
|
+
}
|
54
|
+
@pa = GuardianPageParserV1.new(@valid_options)
|
55
|
+
end
|
56
|
+
|
57
|
+
it "should parse the title" do
|
58
|
+
@pa.title.should == "Anger grows over RBS chief's £900,000 bonus"
|
59
|
+
end
|
60
|
+
|
61
|
+
it "should parse the date in UTC" do
|
62
|
+
@pa.date.should == DateTime.parse("Fri Jan 27 12:58:53 +0000 2012")
|
63
|
+
@pa.date.zone.should == '+00:00'
|
64
|
+
end
|
65
|
+
|
66
|
+
it "should parse the content" do
|
67
|
+
@pa.content[0].should == "Ed Miliband and Boris Johnson have joined the chorus of criticism over the decision by the Royal Bank of Scotland to award its chief executive a bonus of nearly £1m."
|
68
|
+
@pa.content[7].should == 'Speaking from the World Economic Forum in Davos, Switzerland, Johnson described the bonus as "absolutely bewildering" and said it should have been blocked by ministers.'
|
69
|
+
@pa.content[38].should == '"Even to be considering this at a time when we are struggling to get our economies growing is quite simply madness," he told leaders in a speech to the World Economic Forum.'
|
70
|
+
@pa.content.last.should == "."
|
71
|
+
@pa.content.size.should == 40
|
72
|
+
@pa.hash.should == @valid_options[:valid_hash]
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
describe "when parsing the syria-libya-middle-east article" do
|
77
|
+
before do
|
78
|
+
@valid_options = {
|
79
|
+
:url => 'http://www.guardian.co.uk/world/middle-east-live/2011/jun/22/syria-libya-middle-east-unrest-live?INTCMP=ILCNETTXT3487',
|
80
|
+
:page => File.read("spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html"),
|
81
|
+
:valid_hash => '19427d70638b8d787a004f31ede29757'
|
82
|
+
}
|
83
|
+
@pa = GuardianPageParserV1.new(@valid_options)
|
84
|
+
end
|
85
|
+
|
86
|
+
it "should parse the title" do
|
87
|
+
@pa.title.should == "Bahrain, Syria and Middle East unrest - Wednesday 22 June 2011"
|
88
|
+
end
|
89
|
+
|
90
|
+
it "should parse the content" do
|
91
|
+
@pa.content[0].should == "9.31am:Welcome to Middle East Live. There's so much happening across the region that it's difficult to know which stories to watch today. Here's a run down of the latest developments by country:"
|
92
|
+
@pa.content[1].should == "Bahrain"
|
93
|
+
@pa.content[6].should == "When I see children being killed, I must have misgivings. That's why I warned about the risk of civilian casualties... You can't have a decisive ending. Now is the time to do whatever we can to reach a political solution."
|
94
|
+
@pa.content.last.should == "(That's it from us today. Thanks for your comments)."
|
95
|
+
@pa.hash.should == @valid_options[:valid_hash]
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
metadata
CHANGED
@@ -1,74 +1,71 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: web-page-parser
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 23
|
9
|
-
version: "0.23"
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.25'
|
5
|
+
prerelease:
|
10
6
|
platform: ruby
|
11
|
-
authors:
|
7
|
+
authors:
|
12
8
|
- John Leach
|
13
9
|
autorequire:
|
14
10
|
bindir: bin
|
15
11
|
cert_chain: []
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
dependencies:
|
20
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2012-06-05 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
21
15
|
name: oniguruma
|
22
|
-
|
23
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
24
17
|
none: false
|
25
|
-
requirements:
|
26
|
-
- -
|
27
|
-
- !ruby/object:Gem::Version
|
28
|
-
hash: 19
|
29
|
-
segments:
|
30
|
-
- 1
|
31
|
-
- 1
|
32
|
-
- 0
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
33
21
|
version: 1.1.0
|
34
22
|
type: :runtime
|
35
|
-
version_requirements: *id001
|
36
|
-
- !ruby/object:Gem::Dependency
|
37
|
-
name: htmlentities
|
38
23
|
prerelease: false
|
39
|
-
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.1.0
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: htmlentities
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
40
33
|
none: false
|
41
|
-
requirements:
|
42
|
-
- -
|
43
|
-
- !ruby/object:Gem::Version
|
44
|
-
hash: 63
|
45
|
-
segments:
|
46
|
-
- 4
|
47
|
-
- 0
|
48
|
-
- 0
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
49
37
|
version: 4.0.0
|
50
38
|
type: :runtime
|
51
|
-
|
52
|
-
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 4.0.0
|
46
|
+
description: A Ruby library to parse the content out of web pages, such as BBC News
|
47
|
+
pages and Guardian articles. Used by the News Sniffer project.
|
53
48
|
email: john@johnleach.co.uk
|
54
49
|
executables: []
|
55
|
-
|
56
50
|
extensions: []
|
57
|
-
|
58
|
-
extra_rdoc_files:
|
51
|
+
extra_rdoc_files:
|
59
52
|
- README.rdoc
|
60
53
|
- LICENSE
|
61
|
-
files:
|
54
|
+
files:
|
62
55
|
- lib/web-page-parser/parser_factory.rb
|
63
56
|
- lib/web-page-parser/base_parser.rb
|
64
57
|
- lib/web-page-parser/parsers/test_page_parser.rb
|
58
|
+
- lib/web-page-parser/parsers/guardian_page_parser.rb
|
65
59
|
- lib/web-page-parser/parsers/bbc_news_page_parser.rb
|
66
60
|
- lib/web-page-parser.rb
|
67
61
|
- README.rdoc
|
68
62
|
- LICENSE
|
69
63
|
- spec/parser_factory_spec.rb
|
70
64
|
- spec/base_parser_spec.rb
|
65
|
+
- spec/parsers/guardian_page_spec.rb
|
71
66
|
- spec/parsers/bbc_news_page_spec.rb
|
67
|
+
- spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
|
68
|
+
- spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
|
72
69
|
- spec/fixtures/bbc_news/10341015.stm.html
|
73
70
|
- spec/fixtures/bbc_news/8029015.stm.html
|
74
71
|
- spec/fixtures/bbc_news/7745137.stm.html
|
@@ -80,44 +77,37 @@ files:
|
|
80
77
|
- spec/fixtures/bbc_news/11125504.html
|
81
78
|
- spec/fixtures/bbc_news/6072486.stm.html
|
82
79
|
- spec/spec.opts
|
83
|
-
has_rdoc: true
|
84
80
|
homepage: http://github.com/johnl/web-page-parser/tree/master
|
85
81
|
licenses: []
|
86
|
-
|
87
82
|
post_install_message:
|
88
83
|
rdoc_options: []
|
89
|
-
|
90
|
-
require_paths:
|
84
|
+
require_paths:
|
91
85
|
- lib
|
92
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
86
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
87
|
none: false
|
94
|
-
requirements:
|
95
|
-
- -
|
96
|
-
- !ruby/object:Gem::Version
|
97
|
-
|
98
|
-
|
99
|
-
- 0
|
100
|
-
version: "0"
|
101
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
88
|
+
requirements:
|
89
|
+
- - ! '>='
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: '0'
|
92
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
102
93
|
none: false
|
103
|
-
requirements:
|
104
|
-
- -
|
105
|
-
- !ruby/object:Gem::Version
|
106
|
-
|
107
|
-
segments:
|
108
|
-
- 0
|
109
|
-
version: "0"
|
94
|
+
requirements:
|
95
|
+
- - ! '>='
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
110
98
|
requirements: []
|
111
|
-
|
112
99
|
rubyforge_project: web-page-parser
|
113
|
-
rubygems_version: 1.
|
100
|
+
rubygems_version: 1.8.23
|
114
101
|
signing_key:
|
115
102
|
specification_version: 3
|
116
103
|
summary: A parser for web pages
|
117
|
-
test_files:
|
104
|
+
test_files:
|
118
105
|
- spec/parser_factory_spec.rb
|
119
106
|
- spec/base_parser_spec.rb
|
107
|
+
- spec/parsers/guardian_page_spec.rb
|
120
108
|
- spec/parsers/bbc_news_page_spec.rb
|
109
|
+
- spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
|
110
|
+
- spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
|
121
111
|
- spec/fixtures/bbc_news/10341015.stm.html
|
122
112
|
- spec/fixtures/bbc_news/8029015.stm.html
|
123
113
|
- spec/fixtures/bbc_news/7745137.stm.html
|
@@ -129,3 +119,4 @@ test_files:
|
|
129
119
|
- spec/fixtures/bbc_news/11125504.html
|
130
120
|
- spec/fixtures/bbc_news/6072486.stm.html
|
131
121
|
- spec/spec.opts
|
122
|
+
has_rdoc: true
|