web-page-parser 0.25 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +7 -0
  2. checksums.yaml.gz.sig +1 -0
  3. data.tar.gz.sig +0 -0
  4. data/README.rdoc +5 -0
  5. data/lib/web-page-parser.rb +31 -0
  6. data/lib/web-page-parser/base_parser.rb +92 -42
  7. data/lib/web-page-parser/http.rb +63 -0
  8. data/lib/web-page-parser/parser_factory.rb +0 -1
  9. data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +72 -9
  10. data/lib/web-page-parser/parsers/guardian_page_parser.rb +51 -11
  11. data/lib/web-page-parser/parsers/independent_page_parser.rb +56 -0
  12. data/lib/web-page-parser/parsers/new_york_times_page_parser.rb +108 -0
  13. data/lib/web-page-parser/parsers/washingtonpost_page_parser.rb +59 -0
  14. data/spec/base_parser_spec.rb +24 -8
  15. data/spec/fixtures/bbc_news/19957138.stm.html +1974 -0
  16. data/spec/fixtures/bbc_news/20230333.stm.html +2529 -0
  17. data/spec/fixtures/bbc_news/21528631.html +2021 -0
  18. data/spec/fixtures/bbc_news/8040164.stm.html +3095 -0
  19. data/spec/fixtures/cassette_library/BbcNewsPageParserV4.yml +1743 -0
  20. data/spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html +4713 -0
  21. data/spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html +4371 -0
  22. data/spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html +4150 -0
  23. data/spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html +4401 -0
  24. data/spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html +4454 -0
  25. data/spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html +4455 -0
  26. data/spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html +4368 -0
  27. data/spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html +919 -0
  28. data/spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html +328 -0
  29. data/spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html +1164 -0
  30. data/spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html +1 -0
  31. data/spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html +3664 -0
  32. data/spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html +3729 -0
  33. data/spec/parser_factory_spec.rb +3 -3
  34. data/spec/parsers/bbc_news_page_spec.rb +223 -3
  35. data/spec/parsers/guardian_page_spec.rb +157 -4
  36. data/spec/parsers/independent_page_parser_spec.rb +152 -0
  37. data/spec/parsers/new_york_times_page_parser_spec.rb +190 -0
  38. data/spec/parsers/washingtonpost_page_parser_spec.rb +114 -0
  39. data/spec/spec_helper.rb +5 -0
  40. metadata +167 -59
  41. metadata.gz.sig +2 -0
@@ -1,11 +1,11 @@
1
- $:.unshift File.join(File.dirname(__FILE__), '../lib')
2
- require 'web-page-parser'
1
+ require 'spec_helper'
3
2
  include WebPageParser
4
3
 
5
4
  describe ParserFactory do
6
5
 
7
6
  it "should load parsers in the parsers directory" do
8
- ParserFactory.factories.first.to_s.should == "TestPageParserFactory"
7
+ pfl = ParserFactory.factories.collect { |f| f.to_s }
8
+ pfl.should include "TestPageParserFactory"
9
9
  end
10
10
 
11
11
  it "should provide the right PageParser for the given url" do
@@ -1,6 +1,8 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  $:.unshift File.join(File.dirname(__FILE__), '../../lib')
3
- require 'spec/base_parser_spec'
3
+ $:.unshift File.join(File.dirname(__FILE__), '../../spec')
4
+ require 'base_parser_spec'
5
+ require 'spec_helper'
4
6
  require 'web-page-parser'
5
7
  include WebPageParser
6
8
 
@@ -44,16 +46,208 @@ describe BbcNewsPageParserFactory do
44
46
  BbcNewsPageParserFactory.can_parse?(:url => url).should be_nil
45
47
  end
46
48
  end
47
-
49
+
48
50
  it "should ignore 'in pictures' articles" do
49
51
  BbcNewsPageParserFactory.can_parse?(:url => 'http://news.bbc.co.uk/1/hi/in_pictures/8039882.stm').should be_nil
50
52
  end
51
53
  end
52
54
 
55
+ describe BbcNewsPageParserV5 do
56
+
57
+ describe "downloaded article with non-utf8" do
58
+ page = BbcNewsPageParserV5.new(:url => "http://news.bbc.co.uk/1/hi/uk_politics/7984711.stm")
59
+ page.hash.should_not == nil
60
+ page.hash.should_not == ""
61
+ end
62
+
63
+ describe "Oscar Pistorius article" do
64
+ it_should_behave_like AllPageParsers
65
+ before do
66
+ @valid_options = {
67
+ :url => 'http://www.bbc.co.uk/news/world-africa-21528631',
68
+ :page => File.read("spec/fixtures/bbc_news/21528631.html"),
69
+ :valid_hash => ''
70
+ }
71
+ @pa = BbcNewsPageParserV5.new(@valid_options)
72
+ end
73
+
74
+ it "should parse the title" do
75
+ @pa.title.should == "Oscar Pistorius detective on attempted murder charges"
76
+ end
77
+
78
+ it "should parse the content" do
79
+ @pa.content.first.should == "The South African detective leading the Oscar Pistorius inquiry is facing seven charges of attempted murder, police have confirmed."
80
+ @pa.content.last.should == "In London he made history by becoming the first double-amputee to run in the Olympics, making the semi-final of the 400m."
81
+ @pa.content.should include "Reinstated charges"
82
+ @pa.content.should include "Mr Roux said this was a strong, loving relationship and that there was no motive to kill."
83
+ @pa.content.should include "The three were arrested in 2011, Eyewitness News says, citing police."
84
+ @pa.content.size.should == 38
85
+ end
86
+
87
+ it "should exclude the twitter feed" do
88
+ @pa.content.to_s.should_not =~ /Live tweets/
89
+ @pa.content.to_s.should_not =~ /An old mystery resurfaces/
90
+ end
91
+
92
+ it "should parse the publication date" do
93
+ # 2013/02/21 14:10:58
94
+ @pa.date.should == DateTime.parse("Feb 21 14:10:58 +0000 2013")
95
+ end
96
+ end
97
+
98
+ describe "UK economy article" do
99
+ before do
100
+ @valid_options = {
101
+ :url => 'http://www.bbc.co.uk/news/business-11125504',
102
+ :page => File.read("spec/fixtures/bbc_news/11125504.html"),
103
+ :valid_hash => 'd9e201abec3f4b9e38865b5135281978'
104
+ }
105
+ @pa = BbcNewsPageParserV5.new(@valid_options)
106
+ end
107
+
108
+ it "should parse the title" do
109
+ @pa.title.should == "UK economy 'to pick up in near term'"
110
+ end
111
+
112
+ it "should parse the content" do
113
+ @pa.content[0].should == "The British Chambers of Commerce (BCC) has upgraded its forecast for the UK's short term economic prospects, but said interest rates must be kept low to aid recovery."
114
+ @pa.content.last.should == '"Failure to get this right poses the biggest risk to recovery."'
115
+ @pa.content.size.should == 18
116
+ end
117
+ end
118
+
119
+ it "should ignore embedded-hyper content" do
120
+ @pa = BbcNewsPageParserV5.new(:page => File.read('spec/fixtures/bbc_news/12921632.html'))
121
+ @pa.content.to_s.should_not =~ /Fake and real quotes/
122
+ end
123
+
124
+ it "should parse the content of an article with market data" do
125
+ @pa = BbcNewsPageParserV5.new(:page => File.read('spec/fixtures/bbc_news/13293006.html'))
126
+ @pa.content.to_s.should_not =~ /Market Data/
127
+ @pa.content.to_s.should_not =~ /Last updated at/
128
+ @pa.content.size.should == 13
129
+ end
130
+
131
+ it "should ignore the twitter widget" do
132
+ pa = BbcNewsPageParserV5.new(:url => "http://www.bbc.co.uk/news/world-us-canada-20230333", :page => File.read("spec/fixtures/bbc_news/20230333.stm.html"))
133
+ pa.title.should == "US election: Results declared from some states"
134
+ pa.content.first.should == "President Barack Obama and challenger Mitt Romney remain locked in a tight race as US election results stream in."
135
+ pa.content.to_s.should_not =~ /US Election Tweets/
136
+ pa.content.last.should == "The BBC is providing full online live results of the US presidential election. More details here ."
137
+ pa.content.should include "Legal battles feared"
138
+ end
139
+
140
+ it "should ignore the 'latest' twitter widget" do
141
+ pa = BbcNewsPageParserV5.new(:url => "http://www.bbc.co.uk/news/uk-19957138", :page => File.read("spec/fixtures/bbc_news/19957138.stm.html"))
142
+ pa.title.should == "Gary McKinnon extradition to US blocked by Theresa May"
143
+ pa.content.to_s.should_not =~ /High Noon for Abu Qatada?/
144
+ pa.content.to_s.should_not =~ /Content from Twitter./
145
+ pa.content.last.should == "Mr McKinnon was arrested in 2002 and again in 2005 before an order for his extradition was made in July 2006 under the 2003 Extradition Act."
146
+ end
147
+
148
+ describe "Derrick Bird article" do
149
+ before do
150
+ @valid_options = {
151
+ :url => 'http://news.bbc.co.uk/1/hi/england/10249066.stm',
152
+ :page => File.read("spec/fixtures/bbc_news/10249066.stm.html"),
153
+ :valid_hash => '43634596a9f1cfb59bb9548282043119' # Differs from V3 as title is obtained more accurately
154
+ }
155
+ @pa = BbcNewsPageParserV5.new(@valid_options)
156
+ end
157
+
158
+ it "should parse the title" do
159
+ @pa.title.should == "Gunman's family unaware of motive for killings"
160
+ end
161
+
162
+ it "should parse the content" do
163
+ @pa.content[0].should == 'The family of gunman Derrick Bird say they have no idea why he carried out the "horrific" shootings in Cumbria.'
164
+ @pa.content.last.should == '"We appreciate what they are suffering at this time. We cannot offer any reason why Derrick took it upon himself to commit these crimes."'
165
+ @pa.content.size.should == 24
166
+ end
167
+
168
+ it "should parse the publication date" do
169
+ # 2010/06/06 13:48:45
170
+ @pa.date.should == DateTime.parse("Jun 06 13:48:45 +0000 2010")
171
+ end
172
+
173
+ it "should calculate a valid hash of the content" do
174
+ @pa.hash.should == @valid_options[:valid_hash]
175
+ end
176
+
177
+ end
178
+
179
+ describe "Obama invite article" do
180
+ before do
181
+ @valid_options = {
182
+ :url => 'http://news.bbc.co.uk/1/hi/world/middle_east/8011268.stm',
183
+ :page => File.read("spec/fixtures/bbc_news/8011268.stm.html"),
184
+ :valid_hash => 'd9e201abec3f4b9e38865b5135281978'
185
+ }
186
+ @pa = BbcNewsPageParserV5.new(@valid_options)
187
+ end
188
+
189
+ it "should parse the title" do
190
+ @pa.title.should == "Obama invites Middle East heads"
191
+ end
192
+
193
+ it "should parse the date in UTC" do
194
+ # 2009/04/21 19:50:44
195
+ @pa.date.should == DateTime.parse("Apr 21 19:50:44 +0000 2009")
196
+ @pa.date.zone.should == '+00:00'
197
+ end
198
+
199
+ it "should parse the content" do
200
+ @pa.content.first.should == "US officials say the leaders of Israel, Egypt and the Palestinians have been invited for talks in Washington in a new push for Middle East peace."
201
+ @pa.content.last.should == "The US supports a two-state solution, with Israel existing peacefully alongside a Palestinian state."
202
+ @pa.content.size.should == 15
203
+ end
204
+
205
+ it "should decode html entities" do
206
+ @pa.content[8].should == 'He added: "We are actively working to finalise dates for the visits."'
207
+ end
208
+
209
+ it "should calculate a valid hash of the content" do
210
+ @pa.hash.should == @valid_options[:valid_hash]
211
+ end
212
+ end
213
+
214
+ describe "Woodward mortgage article" do
215
+ before do
216
+ @valid_options = {
217
+ :url => 'http://news.bbc.co.uk/1/hi/northern_ireland/8040164.stm',
218
+ :page => File.read("spec/fixtures/bbc_news/8040164.stm.html"),
219
+ :valid_hash => ''
220
+ }
221
+ @pa = BbcNewsPageParserV5.new(@valid_options)
222
+ end
223
+
224
+ it "should convert iso-8859-1 in the title to utf8" do
225
+ @pa.title.should == "£100K mortgage claim by Woodward"
226
+ end
227
+
228
+ it "should convert iso-8859-1 in the content to utf8" do
229
+ @pa.content.first.should =~ /£100,000/
230
+ end
231
+
232
+ end
233
+
234
+ it "should parse the content of an article with two captions" do
235
+ @pa = BbcNewsPageParserV5.new({ :url => "http://news.bbc.co.uk/1/hi/politics/10341015.stm",
236
+ :page => File.read("spec/fixtures/bbc_news/10341015.stm.html"),
237
+ :valid_hash => 'unknown'
238
+ })
239
+ @pa.content[0].should == "The coalition government has cancelled 12 projects totalling £2bn agreed to by the previous Labour government since the start of 2010."
240
+ @pa.content[1].should == "These include an £80m loan to Sheffield Forgemasters and new programmes for the young unemployed, Chief Secretary to the Treasury Danny Alexander told MPs."
241
+ @pa.content[2].should == 'Mr Alexander said the cuts were necessary to tackle the budget deficit and would be done in a "fair" way.'
242
+ end
243
+
244
+ end
245
+
53
246
  describe BbcNewsPageParserV4 do
54
247
  it_should_behave_like AllPageParsers
248
+
55
249
  before do
56
- @valid_options = {
250
+ @valid_options = {
57
251
  :url => 'http://www.bbc.co.uk/news/business-11125504',
58
252
  :page => File.read("spec/fixtures/bbc_news/11125504.html"),
59
253
  :valid_hash => 'd9e201abec3f4b9e38865b5135281978'
@@ -83,6 +277,32 @@ describe BbcNewsPageParserV4 do
83
277
  @pa.content.to_s.should_not =~ /Fake and real quotes/
84
278
  end
85
279
 
280
+ it "should retrieve the article from the bbc website" do
281
+ @pa = BbcNewsPageParserV4.new(:url => @valid_options[:url])
282
+ @pa.title.should == "UK economy 'to pick up in near term'"
283
+ end
284
+
285
+ it "should ignore the twitter widget" do
286
+ pa = BbcNewsPageParserV4.new(:url => "http://www.bbc.co.uk/news/world-us-canada-20230333", :page => File.read("spec/fixtures/bbc_news/20230333.stm.html"))
287
+ pa.title.should == "US election: Results declared from some states"
288
+ pa.content.first.should == "President Barack Obama and challenger Mitt Romney remain locked in a tight race as US election results stream in."
289
+ pa.content.to_s.should_not =~ /US Election Tweets/
290
+ pa.content.last.should == "Are you a voter in one of the swing states? Send us your comments on the election campaign using the form below."
291
+ end
292
+
293
+ it "should ignore the 'latest' twitter widget" do
294
+ pa = BbcNewsPageParserV4.new(:url => "http://www.bbc.co.uk/news/uk-19957138", :page => File.read("spec/fixtures/bbc_news/19957138.stm.html"))
295
+ pa.title.should == "Gary McKinnon extradition to US blocked by Theresa May"
296
+ pa.content.to_s.should_not =~ /High Noon for Abu Qatada?/
297
+ pa.content.to_s.should_not =~ /Content from Twitter./
298
+ pa.content.last.should == "Mr McKinnon was arrested in 2002 and again in 2005 before an order for his extradition was made in July 2006 under the 2003 Extradition Act."
299
+ end
300
+
301
+ it "should retrieve an old iso-8859-1 article without getting upset about encoding" do
302
+ @pa = BbcNewsPageParserV4.new(:url => "http://www.bbc.co.uk/news/magazine-20761954")
303
+ @pa.title.should == "Quiz of the Year: 52 weeks 52 questions, part four"
304
+ end
305
+
86
306
  end
87
307
 
88
308
 
@@ -1,7 +1,5 @@
1
1
  # -*- coding: utf-8 -*-
2
- $:.unshift File.join(File.dirname(__FILE__), '../../lib')
3
- require 'spec/base_parser_spec'
4
- require 'web-page-parser'
2
+ require 'spec_helper'
5
3
  include WebPageParser
6
4
 
7
5
  describe GuardianPageParserFactory do
@@ -11,6 +9,9 @@ describe GuardianPageParserFactory do
11
9
  "http://www.guardian.co.uk/commentisfree/2012/jan/27/ian-jack-battle-for-scotland",
12
10
  "http://www.guardian.co.uk/environment/bike-blog/2012/jan/27/hgv-cyclists-safety-bike-blog",
13
11
  "http://www.guardian.co.uk/tv-and-radio/2012/jan/26/well-take-manhattan-david-bailey",
12
+ "http://www.theguardian.com/world/2013/aug/24/syria-cameron-obama-intervention",
13
+ "http://www.theguardian.com/commentisfree/2013/aug/25/coalition-leaders-change-tune-rawnsley",
14
+ "http://www.theguardian.com/uk-news/2013/aug/25/police-officer-cleared-taser-brighton"
14
15
  ]
15
16
  @invalid_urls = [
16
17
  "http://www.guardian.co.uk/business",
@@ -23,7 +24,9 @@ describe GuardianPageParserFactory do
23
24
  "http://www.guardian.co.uk/uk/video/2012/may/13/occupy-protesters-clash-police-video",
24
25
  "http://www.guardian.co.uk/uk/gallery/2012/may/10/public-sector-protests-in-pictures",
25
26
  "http://www.guardian.co.uk/media/video/2012/may/24/chris-huhne-partner-privacy-case-video",
26
- "http://www.guardian.co.uk/business/poll/2012/may/09/greek-exit-euro-inevitable"
27
+ "http://www.guardian.co.uk/business/poll/2012/may/09/greek-exit-euro-inevitable",
28
+ "http://www.theguardian.com/global-development",
29
+ "http://www.theguardian.com/uk/business"
27
30
  ]
28
31
  end
29
32
 
@@ -95,4 +98,154 @@ describe GuardianPageParserV1 do
95
98
  @pa.hash.should == @valid_options[:valid_hash]
96
99
  end
97
100
  end
101
+
102
+
103
+
104
+ describe "when parsing the barack obama-nicki-minaj article" do
105
+ before do
106
+ @valid_options = {
107
+ :url => 'http://www.guardian.co.uk/music/2012/oct/16/barack-obama-nicki-minaj-mariah-carey',
108
+ :page => File.read("spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html"),
109
+ :valid_hash => '22fe55dc3664662ac6c1c79eac584754'
110
+ }
111
+ @pa = GuardianPageParserV1.new(@valid_options)
112
+ end
113
+
114
+ it "should not include +explainerText+" do
115
+ @pa.hash.should == @valid_options[:valid_hash]
116
+ @pa.content.to_s.should_not =~ /explainerText/
117
+ end
118
+ end
119
+
120
+ describe "when parsing the anger-grows article with the explainerText javascript" do
121
+ before do
122
+ @valid_options = {
123
+ :url => 'http://www.guardian.co.uk/business/2012/jan/27/anger-grows-rbs-chiefs-bonus',
124
+ :page => File.read("spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html"),
125
+ :valid_hash => '04108a9a7e3196da185e4d10432740a1'
126
+ }
127
+ @pa = GuardianPageParserV1.new(@valid_options)
128
+ end
129
+
130
+ it "should have the same hash as before" do
131
+ @pa.hash.should == @valid_options[:valid_hash]
132
+ end
133
+
134
+ it "should not include +explainerText+" do
135
+ @pa.content.to_s.should_not =~ /explainerText/
136
+ end
137
+ end
138
+ end
139
+
140
+ describe GuardianPageParserV2 do
141
+
142
+ describe "when parsing the anger-grows article" do
143
+ before do
144
+ @valid_options = {
145
+ :url => 'http://www.guardian.co.uk/business/2012/jan/27/anger-grows-rbs-chiefs-bonus',
146
+ :page => File.read("spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html"),
147
+ :valid_hash => '04108a9a7e3196da185e4d10432740a1'
148
+ }
149
+ @pa = GuardianPageParserV2.new(@valid_options)
150
+ end
151
+
152
+ it "should parse the title" do
153
+ @pa.title.should == "Anger grows over RBS chief's £900,000 bonus"
154
+ end
155
+
156
+ it "should parse the date in UTC" do
157
+ @pa.date.should == DateTime.parse("Fri Jan 27 12:58:53 +0000 2012")
158
+ @pa.date.zone.should == '+00:00'
159
+ end
160
+
161
+ it "should parse the content" do
162
+ @pa.content[0].should == "Ed Miliband and Boris Johnson have joined the chorus of criticism over the decision by the Royal Bank of Scotland to award its chief executive a bonus of nearly £1m."
163
+ @pa.content[7].should == 'Speaking from the World Economic Forum in Davos, Switzerland, Johnson described the bonus as "absolutely bewildering" and said it should have been blocked by ministers.'
164
+ @pa.content[38].should == '"Even to be considering this at a time when we are struggling to get our economies growing is quite simply madness," he told leaders in a speech to the World Economic Forum.'
165
+ @pa.content.last.should == "."
166
+ @pa.content.size.should == 40
167
+ @pa.hash.should == @valid_options[:valid_hash]
168
+ end
169
+ end
170
+
171
+ describe "when parsing the syria-libya-middle-east article" do
172
+ before do
173
+ @valid_options = {
174
+ :url => 'http://www.guardian.co.uk/world/middle-east-live/2011/jun/22/syria-libya-middle-east-unrest-live?INTCMP=ILCNETTXT3487',
175
+ :page => File.read("spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html"),
176
+ :valid_hash => 'a2ed6d79e1fd834df80e2d603b36be22' # changed from V1 due to html stripping
177
+ }
178
+ @pa = GuardianPageParserV2.new(@valid_options)
179
+ end
180
+
181
+ it "should parse the title" do
182
+ @pa.title.should == "Bahrain, Syria and Middle East unrest - Wednesday 22 June 2011"
183
+ end
184
+
185
+ it "should parse the content" do
186
+ @pa.content[0].should == "9.31am:Welcome to Middle East Live. There's so much happening across the region that it's difficult to know which stories to watch today. Here's a run down of the latest developments by country:"
187
+ @pa.content[1].should == "Bahrain"
188
+ @pa.content[6].should == "When I see children being killed, I must have misgivings. That's why I warned about the risk of civilian casualties... You can't have a decisive ending. Now is the time to do whatever we can to reach a political solution."
189
+ @pa.content.last.should == "(That's it from us today. Thanks for your comments)."
190
+ @pa.hash.should == @valid_options[:valid_hash]
191
+ end
192
+ end
193
+
194
+ describe "when parsing the barack obama-nicki-minaj article" do
195
+ before do
196
+ @valid_options = {
197
+ :url => 'http://www.guardian.co.uk/music/2012/oct/16/barack-obama-nicki-minaj-mariah-carey',
198
+ :page => File.read("spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html"),
199
+ :valid_hash => '22fe55dc3664662ac6c1c79eac584754'
200
+ }
201
+ @pa = GuardianPageParserV2.new(@valid_options)
202
+ end
203
+
204
+ it "should not include +explainerText+" do
205
+ @pa.hash.should == @valid_options[:valid_hash]
206
+ @pa.content.to_s.should_not =~ /explainerText/
207
+ end
208
+ end
209
+
210
+ describe "when parsing the anger-grows article with the explainerText javascript" do
211
+ before do
212
+ @valid_options = {
213
+ :url => 'http://www.guardian.co.uk/business/2012/jan/27/anger-grows-rbs-chiefs-bonus',
214
+ :page => File.read("spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html"),
215
+ :valid_hash => '04108a9a7e3196da185e4d10432740a1'
216
+ }
217
+ @pa = GuardianPageParserV2.new(@valid_options)
218
+ end
219
+
220
+ it "should have the same hash as before" do
221
+ @pa.hash.should == @valid_options[:valid_hash]
222
+ end
223
+
224
+ it "should not include +explainerText+" do
225
+ @pa.content.to_s.should_not =~ /explainerText/
226
+ end
227
+ end
228
+
229
+ describe "when parsing the nhs-patient-data article" do
230
+ before do
231
+ @valid_options = {
232
+ :url => 'http://www.theguardian.com/society/2014/jan/19/nhs-patient-data-available-companies-buy',
233
+ :page => File.read('spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html'),
234
+ :valid_hash => '0ae4a335bfd96ee3345350814f1e9f97'
235
+ }
236
+ @pa = GuardianPageParserV2.new(@valid_options)
237
+ end
238
+
239
+ it "should parse the title" do
240
+ @pa.title.should == 'NHS patient data to be made available for sale to drug and insurance companies'
241
+ end
242
+
243
+ it "should parse the content" do
244
+ @pa.content[0].should == 'Drug and insurance companies will from later this year be able to buy information on patients including mental health conditions and diseases such as cancer, as well as smoking and drinking habits, once a single English database of medical data has been created.'
245
+ @pa.content.last.should == 'A spokesperson said: "A phased rollout of care.data is being readied over a three month period with first extractions from March allowing time for the HSCIC to assess the quality of the data and the linkage before making the data available. We think it would be wrong to exclude private companies simply on ideological grounds; instead, the test should be how the company wants to use the data to improve NHS care."'
246
+ @pa.content.size.should == 21
247
+ @pa.hash.should == @valid_options[:valid_hash]
248
+ end
249
+ end
250
+
98
251
  end