web-page-parser 0.23 → 0.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,7 +14,7 @@ describe BbcNewsPageParserFactory do
14
14
  "http://news.bbc.co.uk/2/low/uk_news/england/devon/7996447.stm",
15
15
  "http://www.bbc.co.uk/news/business-11125504",
16
16
  "http://www.bbc.co.uk/news/10604897",
17
- "http://www.bbc.co.uk/news/world-middle-east-13373006"
17
+ "http://www.bbc.co.uk/news/world-middle-east-18229870#sa-ns_mchannel=rss&ns_source=PublicRSS20-sa"
18
18
  ]
19
19
  @invalid_urls = [
20
20
  "http://news.bbc.co.uk/2/hi/health/default.stm",
@@ -0,0 +1,98 @@
1
+ # -*- coding: utf-8 -*-
2
+ $:.unshift File.join(File.dirname(__FILE__), '../../lib')
3
+ require 'spec/base_parser_spec'
4
+ require 'web-page-parser'
5
+ include WebPageParser
6
+
7
+ describe GuardianPageParserFactory do
8
+ before do
9
+ @valid_urls = [
10
+ "http://www.guardian.co.uk/business/2012/jan/27/anger-grows-rbs-chiefs-bonus",
11
+ "http://www.guardian.co.uk/commentisfree/2012/jan/27/ian-jack-battle-for-scotland",
12
+ "http://www.guardian.co.uk/environment/bike-blog/2012/jan/27/hgv-cyclists-safety-bike-blog",
13
+ "http://www.guardian.co.uk/tv-and-radio/2012/jan/26/well-take-manhattan-david-bailey",
14
+ ]
15
+ @invalid_urls = [
16
+ "http://www.guardian.co.uk/business",
17
+ "http://www.guardian.co.uk/mobile/apps",
18
+ "http://www.guardian.co.uk/business/nils-pratley-on-finance",
19
+ "http://www.guardian.co.uk/commentisfree/commentisfree+uk/uk",
20
+ "http://www.guardian.co.uk/help/feeds",
21
+ "http://www.guardian.co.uk/uk/cartoon/2012/jan/28/nicolas-sarkozy-caricature",
22
+ "http://www.guardian.co.uk/commentisfree/poll/2012/jan/30/smacking-children-david-lammy",
23
+ "http://www.guardian.co.uk/uk/video/2012/may/13/occupy-protesters-clash-police-video",
24
+ "http://www.guardian.co.uk/uk/gallery/2012/may/10/public-sector-protests-in-pictures",
25
+ "http://www.guardian.co.uk/media/video/2012/may/24/chris-huhne-partner-privacy-case-video",
26
+ "http://www.guardian.co.uk/business/poll/2012/may/09/greek-exit-euro-inevitable"
27
+ ]
28
+ end
29
+
30
+ it "should detect guardian articles from the url" do
31
+ @valid_urls.each do |url|
32
+ GuardianPageParserFactory.can_parse?(:url => url).should be_true
33
+ end
34
+ end
35
+
36
+ it "should ignore pages with the wrong url format" do
37
+ @invalid_urls.each do |url|
38
+ GuardianPageParserFactory.can_parse?(:url => url).should be_nil
39
+ end
40
+ end
41
+
42
+ end
43
+
44
+
45
+ describe GuardianPageParserV1 do
46
+
47
+ describe "when parsing the anger-grows article" do
48
+ before do
49
+ @valid_options = {
50
+ :url => 'http://www.guardian.co.uk/business/2012/jan/27/anger-grows-rbs-chiefs-bonus',
51
+ :page => File.read("spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html"),
52
+ :valid_hash => '04108a9a7e3196da185e4d10432740a1'
53
+ }
54
+ @pa = GuardianPageParserV1.new(@valid_options)
55
+ end
56
+
57
+ it "should parse the title" do
58
+ @pa.title.should == "Anger grows over RBS chief's £900,000 bonus"
59
+ end
60
+
61
+ it "should parse the date in UTC" do
62
+ @pa.date.should == DateTime.parse("Fri Jan 27 12:58:53 +0000 2012")
63
+ @pa.date.zone.should == '+00:00'
64
+ end
65
+
66
+ it "should parse the content" do
67
+ @pa.content[0].should == "Ed Miliband and Boris Johnson have joined the chorus of criticism over the decision by the Royal Bank of Scotland to award its chief executive a bonus of nearly £1m."
68
+ @pa.content[7].should == 'Speaking from the World Economic Forum in Davos, Switzerland, Johnson described the bonus as "absolutely bewildering" and said it should have been blocked by ministers.'
69
+ @pa.content[38].should == '"Even to be considering this at a time when we are struggling to get our economies growing is quite simply madness," he told leaders in a speech to the World Economic Forum.'
70
+ @pa.content.last.should == "."
71
+ @pa.content.size.should == 40
72
+ @pa.hash.should == @valid_options[:valid_hash]
73
+ end
74
+ end
75
+
76
+ describe "when parsing the syria-libya-middle-east article" do
77
+ before do
78
+ @valid_options = {
79
+ :url => 'http://www.guardian.co.uk/world/middle-east-live/2011/jun/22/syria-libya-middle-east-unrest-live?INTCMP=ILCNETTXT3487',
80
+ :page => File.read("spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html"),
81
+ :valid_hash => '19427d70638b8d787a004f31ede29757'
82
+ }
83
+ @pa = GuardianPageParserV1.new(@valid_options)
84
+ end
85
+
86
+ it "should parse the title" do
87
+ @pa.title.should == "Bahrain, Syria and Middle East unrest - Wednesday 22 June 2011"
88
+ end
89
+
90
+ it "should parse the content" do
91
+ @pa.content[0].should == "9.31am:Welcome to Middle East Live. There's so much happening across the region that it's difficult to know which stories to watch today. Here's a run down of the latest developments by country:"
92
+ @pa.content[1].should == "Bahrain"
93
+ @pa.content[6].should == "When I see children being killed, I must have misgivings. That's why I warned about the risk of civilian casualties... You can't have a decisive ending. Now is the time to do whatever we can to reach a political solution."
94
+ @pa.content.last.should == "(That's it from us today. Thanks for your comments)."
95
+ @pa.hash.should == @valid_options[:valid_hash]
96
+ end
97
+ end
98
+ end
metadata CHANGED
@@ -1,74 +1,71 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: web-page-parser
3
- version: !ruby/object:Gem::Version
4
- hash: 37
5
- prerelease: false
6
- segments:
7
- - 0
8
- - 23
9
- version: "0.23"
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.25'
5
+ prerelease:
10
6
  platform: ruby
11
- authors:
7
+ authors:
12
8
  - John Leach
13
9
  autorequire:
14
10
  bindir: bin
15
11
  cert_chain: []
16
-
17
- date: 2011-05-15 00:00:00 +01:00
18
- default_executable:
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
12
+ date: 2012-06-05 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
21
15
  name: oniguruma
22
- prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
24
17
  none: false
25
- requirements:
26
- - - ">="
27
- - !ruby/object:Gem::Version
28
- hash: 19
29
- segments:
30
- - 1
31
- - 1
32
- - 0
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
33
21
  version: 1.1.0
34
22
  type: :runtime
35
- version_requirements: *id001
36
- - !ruby/object:Gem::Dependency
37
- name: htmlentities
38
23
  prerelease: false
39
- requirement: &id002 !ruby/object:Gem::Requirement
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 1.1.0
30
+ - !ruby/object:Gem::Dependency
31
+ name: htmlentities
32
+ requirement: !ruby/object:Gem::Requirement
40
33
  none: false
41
- requirements:
42
- - - ">="
43
- - !ruby/object:Gem::Version
44
- hash: 63
45
- segments:
46
- - 4
47
- - 0
48
- - 0
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
49
37
  version: 4.0.0
50
38
  type: :runtime
51
- version_requirements: *id002
52
- description: A Ruby library to parse the content out of web pages, such as BBC News pages. Used by the News Sniffer project.
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: 4.0.0
46
+ description: A Ruby library to parse the content out of web pages, such as BBC News
47
+ pages and Guardian articles. Used by the News Sniffer project.
53
48
  email: john@johnleach.co.uk
54
49
  executables: []
55
-
56
50
  extensions: []
57
-
58
- extra_rdoc_files:
51
+ extra_rdoc_files:
59
52
  - README.rdoc
60
53
  - LICENSE
61
- files:
54
+ files:
62
55
  - lib/web-page-parser/parser_factory.rb
63
56
  - lib/web-page-parser/base_parser.rb
64
57
  - lib/web-page-parser/parsers/test_page_parser.rb
58
+ - lib/web-page-parser/parsers/guardian_page_parser.rb
65
59
  - lib/web-page-parser/parsers/bbc_news_page_parser.rb
66
60
  - lib/web-page-parser.rb
67
61
  - README.rdoc
68
62
  - LICENSE
69
63
  - spec/parser_factory_spec.rb
70
64
  - spec/base_parser_spec.rb
65
+ - spec/parsers/guardian_page_spec.rb
71
66
  - spec/parsers/bbc_news_page_spec.rb
67
+ - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
68
+ - spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
72
69
  - spec/fixtures/bbc_news/10341015.stm.html
73
70
  - spec/fixtures/bbc_news/8029015.stm.html
74
71
  - spec/fixtures/bbc_news/7745137.stm.html
@@ -80,44 +77,37 @@ files:
80
77
  - spec/fixtures/bbc_news/11125504.html
81
78
  - spec/fixtures/bbc_news/6072486.stm.html
82
79
  - spec/spec.opts
83
- has_rdoc: true
84
80
  homepage: http://github.com/johnl/web-page-parser/tree/master
85
81
  licenses: []
86
-
87
82
  post_install_message:
88
83
  rdoc_options: []
89
-
90
- require_paths:
84
+ require_paths:
91
85
  - lib
92
- required_ruby_version: !ruby/object:Gem::Requirement
86
+ required_ruby_version: !ruby/object:Gem::Requirement
93
87
  none: false
94
- requirements:
95
- - - ">="
96
- - !ruby/object:Gem::Version
97
- hash: 3
98
- segments:
99
- - 0
100
- version: "0"
101
- required_rubygems_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ! '>='
90
+ - !ruby/object:Gem::Version
91
+ version: '0'
92
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
93
  none: false
103
- requirements:
104
- - - ">="
105
- - !ruby/object:Gem::Version
106
- hash: 3
107
- segments:
108
- - 0
109
- version: "0"
94
+ requirements:
95
+ - - ! '>='
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
110
98
  requirements: []
111
-
112
99
  rubyforge_project: web-page-parser
113
- rubygems_version: 1.3.7
100
+ rubygems_version: 1.8.23
114
101
  signing_key:
115
102
  specification_version: 3
116
103
  summary: A parser for web pages
117
- test_files:
104
+ test_files:
118
105
  - spec/parser_factory_spec.rb
119
106
  - spec/base_parser_spec.rb
107
+ - spec/parsers/guardian_page_spec.rb
120
108
  - spec/parsers/bbc_news_page_spec.rb
109
+ - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
110
+ - spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
121
111
  - spec/fixtures/bbc_news/10341015.stm.html
122
112
  - spec/fixtures/bbc_news/8029015.stm.html
123
113
  - spec/fixtures/bbc_news/7745137.stm.html
@@ -129,3 +119,4 @@ test_files:
129
119
  - spec/fixtures/bbc_news/11125504.html
130
120
  - spec/fixtures/bbc_news/6072486.stm.html
131
121
  - spec/spec.opts
122
+ has_rdoc: true