web-page-parser 1.1.0 → 1.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37)
  1. checksums.yaml +5 -5
  2. checksums.yaml.gz.sig +0 -0
  3. data.tar.gz.sig +0 -0
  4. data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +56 -3
  5. data/lib/web-page-parser/parsers/guardian_page_parser.rb +33 -3
  6. data/lib/web-page-parser/parsers/independent_page_parser.rb +2 -2
  7. data/lib/web-page-parser/parsers/new_york_times_page_parser.rb +8 -3
  8. data/lib/web-page-parser/parsers/rt_page_parser.rb +49 -0
  9. data/lib/web-page-parser/parsers/the_intercept_page_parser.rb +6 -1
  10. data/lib/web-page-parser/parsers/washingtonpost_page_parser.rb +38 -2
  11. data/spec/fixtures/bbc_news/31014941.html +1123 -0
  12. data/spec/fixtures/bbc_news/32271505.html +1168 -0
  13. data/spec/fixtures/bbc_news/32275608.html +1142 -0
  14. data/spec/fixtures/guardian/duplicate-headline.html +2735 -0
  15. data/spec/fixtures/guardian/julian-assange-donald-trump-hillary-clinton-interview.html +1752 -0
  16. data/spec/fixtures/guardian/university-extremist-speakers.html +1590 -0
  17. data/spec/fixtures/independent/boris-johnson.html +1086 -0
  18. data/spec/fixtures/independent/lord-burns.html +726 -0
  19. data/spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one-2018.html +305 -0
  20. data/spec/fixtures/new_york_times/trump-kim-policy.html +305 -0
  21. data/spec/fixtures/rt/338045.html +682 -0
  22. data/spec/fixtures/rt/338237.html +682 -0
  23. data/spec/fixtures/theintercept/pentagon-missionary.html +211 -0
  24. data/spec/fixtures/washingtonpost/israeli-ambassador.html +747 -0
  25. data/spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery-2018.html +381 -0
  26. data/spec/fixtures/washingtonpost/trump-kim-summit.html +379 -0
  27. data/spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain-2018.html +386 -0
  28. data/spec/parsers/bbc_news_page_spec.rb +132 -11
  29. data/spec/parsers/guardian_page_spec.rb +100 -0
  30. data/spec/parsers/independent_page_parser_spec.rb +52 -0
  31. data/spec/parsers/new_york_times_page_parser_spec.rb +75 -10
  32. data/spec/parsers/rt_page_parser_spec.rb +87 -0
  33. data/spec/parsers/the_intercept_page_parser_spec.rb +30 -0
  34. data/spec/parsers/washingtonpost_page_parser_spec.rb +93 -1
  35. data/spec/web-page-parser +1 -0
  36. metadata +98 -56
  37. metadata.gz.sig +0 -0
@@ -248,4 +248,104 @@ describe GuardianPageParserV2 do
248
248
  end
249
249
  end
250
250
 
251
+ describe "when parsing the new format university extremist speakers article" do
252
+ before do
253
+ @valid_options = {
254
+ :url => 'http://www.theguardian.com/uk-news/2015/mar/20/theresa-may-drops-rules-ordering-universities-ban-extremist-speakers',
255
+ :page => File.read("spec/fixtures/guardian/university-extremist-speakers.html"),
256
+ :valid_hash => 'aeeb32a7acd2de155be83fd7de6446cb'
257
+ }
258
+ @pa = GuardianPageParserV2.new(@valid_options)
259
+ end
260
+
261
+ it "should parse the title" do
262
+ @pa.title.should == "Theresa May drops rules on ordering universities to ban extremist speakers"
263
+ end
264
+
265
+ it "should parse the date in UTC" do
266
+ @pa.date.should == DateTime.parse("Friday 20 March 2015 17:38:30 GMT")
267
+ @pa.date.zone.should == '+00:00'
268
+ end
269
+
270
+ it "should parse the content" do
271
+ @pa.content[0].should == "The home secretary, Theresa May, has been forced to drop new statutory rules under which ministers could order universities and colleges to ban external extremist speakers."
272
+ @pa.content.size.should == 19
273
+ @pa.hash.should == @valid_options[:valid_hash]
274
+ end
275
+
276
+ end
277
+
278
+ describe "when parsing the Julian Assange article" do
279
+ before do
280
+ @valid_options = {
281
+ :url => 'https://www.theguardian.com/media/2016/dec/24/julian-assange-donald-trump-hillary-clinton-interview',
282
+ :page => File.read("spec/fixtures/guardian/julian-assange-donald-trump-hillary-clinton-interview.html"),
283
+ :valid_hash => '2757835e9e028a21b5e47c9199ade005'
284
+ }
285
+ @pa = GuardianPageParserV2.new(@valid_options)
286
+ end
287
+
288
+ it "should parse the title" do
289
+ @pa.title.should == "Julian Assange gives guarded praise of Trump and blasts Clinton in interview"
290
+ end
291
+
292
+ it "should parse the date in UTC" do
293
+ @pa.date.should == DateTime.parse("Saturday 24 December 2016 18:36:24 GMT")
294
+ @pa.date.zone.should == '+00:00'
295
+ end
296
+
297
+ it "should parse the content" do
298
+ @pa.content[0].should == "Julian Assange, the founder of WikiLeaks, has offered guarded praise of Donald Trump, arguing the president-elect “is not a DC insider” and could mean an opportunity for positive as well as negative change in the US."
299
+ @pa.content.last.should == "Dozens of journalists have been killed in Russia in the past two decades, and Freedom House considers the Russian press to be “not free” and notes: “The main national news agenda is firmly controlled by the Kremlin. The government sets editorial policy at state-owned television stations, which dominate the media landscape and generate propagandistic content.”"
300
+ @pa.content.size.should == 16
301
+ @pa.hash.should == @valid_options[:valid_hash]
302
+ end
303
+ end
304
+
305
+ describe "when parsing an article with duplicate headlines" do
306
+ before do
307
+ @valid_options = {
308
+ :url => 'https://www.theguardian.com/world/2016/dec/31/russia-syria-ceasefire-un-security-council-damascus-kazakhstan',
309
+ :page => File.read("spec/fixtures/guardian/duplicate-headline.html"),
310
+ }
311
+ @pa = GuardianPageParserV2.new(@valid_options)
312
+ end
313
+
314
+ it "should only return one of the titles" do
315
+ @pa.title.should == "Russia pushes for UN security council support for Syria ceasefire"
316
+ end
317
+
318
+ end
319
+
320
+ describe GuardianPageParserV3 do
321
+ describe "when parsing the Julian Assange article" do
322
+ before do
323
+ @valid_options = {
324
+ :url => 'https://www.theguardian.com/media/2016/dec/24/julian-assange-donald-trump-hillary-clinton-interview',
325
+ :page => File.read("spec/fixtures/guardian/julian-assange-donald-trump-hillary-clinton-interview.html"),
326
+ :valid_hash => 'a94b1cfb7abab286ab4e880e3c440d66'
327
+ }
328
+ @pa = GuardianPageParserV3.new(@valid_options)
329
+ end
330
+
331
+ it "should parse the title" do
332
+ @pa.title.should == "Julian Assange gives guarded praise of Trump and blasts Clinton in interview"
333
+ end
334
+
335
+ it "should parse the date in UTC" do
336
+ @pa.date.should == DateTime.parse("Saturday 24 December 2016 18:36:24 GMT")
337
+ @pa.date.zone.should == '+00:00'
338
+ end
339
+
340
+ it "should parse the content" do
341
+ @pa.content[0].should == "Julian Assange, the founder of WikiLeaks, has offered guarded praise of Donald Trump, arguing the president-elect “is not a DC insider” and could mean an opportunity for positive as well as negative change in the US."
342
+ @pa.content.last.should == "This article was amended on 29 December 2016 to remove a sentence in which it was asserted that Assange “has long had a close relationship with the Putin regime”. A sentence was also amended which paraphrased the interview, suggesting Assange said “there was no need for Wikileaks to undertake a whistleblowing role in Russia because of the open and competitive debate he claimed exists there”. It has been amended to more directly describe the question Assange was responding to when he spoke of Russia’s “many vibrant publications”."
343
+ @pa.content.size.should == 17
344
+ @pa.hash.should == @valid_options[:valid_hash]
345
+ end
346
+
347
+ end
348
+
349
+ end
350
+
251
351
  end
@@ -153,4 +153,56 @@ describe IndependentPageParserV1 do
153
153
  end
154
154
  end
155
155
 
156
+ describe 'when parsing the lord-burns article' do
157
+ before do
158
+ @valid_options = {
159
+ :url => 'http://www.independent.co.uk/news/media/lord-burns-channel-4-chairman-forced-to-step-down-by-ministers-amid-privatisation-fears-a6670691.html',
160
+ :page => File.read('spec/fixtures/independent/lord-burns.html'),
161
+ :valid_hash => 'f0efb3b2ea91266fe4a551867fcf6fb1'
162
+ }
163
+ @pa = IndependentPageParserV1.new(@valid_options)
164
+ end
165
+
166
+ it "should parse the title" do
167
+ @pa.title.should == 'Lord Burns: Channel 4 chairman forced to step down by ministers amid privatisation fears'
168
+ end
169
+
170
+ it "should parse the date" do
171
+ @pa.date.should == DateTime.parse('28 September 2015 18:03:52 BST')
172
+ end
173
+
174
+ it "should calculate the hash correctly" do
175
+ @pa.hash.should == @valid_options[:valid_hash]
176
+ end
177
+
178
+ it "should parse the content" do
179
+ @pa.content[0].should == 'Ministers have forced Lord Burns, the chairman of Channel 4, to step down, fuelling speculation that the not-for-profit broadcaster is being prepared for privatisation.'
180
+ @pa.content.size.should == 6
181
+ end
182
+ end
183
+
184
+ describe 'when parsing the boris-johnson article' do
185
+ before do
186
+ @valid_options = {
187
+ :url => 'http://www.independent.co.uk/news/uk/politics/boris-johnson-warns-low-immigration-could-stall-economic-growth-a6693486.html',
188
+ :page => File.read('spec/fixtures/independent/boris-johnson.html'),
189
+ :valid_hash => 'e9fc5ef9502d3b167c00d5cefb308495'
190
+ }
191
+ @pa = IndependentPageParserV1.new(@valid_options)
192
+ end
193
+
194
+ it "should parse the title" do
195
+ @pa.title.should == 'Boris Johnson warns low immigration could stall economic growth'
196
+ end
197
+
198
+ it "should calculate the hash correctly" do
199
+ @pa.hash.should == @valid_options[:valid_hash]
200
+ end
201
+
202
+ it "should exclude the 'read more' and image widget captions" do
203
+ @pa.content.to_s.should_not =~ /Farage says/
204
+ @pa.content.to_s.should_not =~ /A butcher/
205
+ end
206
+ end
207
+
156
208
  end
@@ -8,10 +8,14 @@ describe NewYorkTimesPageParserFactory do
8
8
  @valid_urls = [
9
9
  "http://www.nytimes.com/2012/01/28/us/politics/no-more-nice-guys-fans-love-nuclear-newt.html?_r=1&ref=us",
10
10
  "http://www.nytimes.com/2012/01/29/business/global/greece-in-talks-with-creditors-on-debt-deal.html",
11
+ "https://www.nytimes.com/2014/01/12/world/europe/show-banned-french-comedian-has-new-one.html",
12
+ "https://www.nytimes.com/2018/06/12/world/asia/trump-kim-policy.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=span-ab-top-region&region=top-news&WT.nav=top-news"
11
13
  ]
12
14
  @invalid_urls = [
13
15
  "http://cityroom.blogs.nytimes.com/2012/01/27/the-week-in-pictures-for-jan-27/",
14
- "http://www.nytimes.com/pages/world/asia/index.html"
16
+ "http://www.nytimes.com/pages/world/asia/index.html",
17
+ "https://www.nytimes.com/section/business?module=SectionsNav&action=click&version=BrowseTree&region=TopBar&contentCollection=Business&pgtype=sectionfront",
18
+ "https://www.nytimes.com/section/technology/personaltech"
15
19
  ]
16
20
  end
17
21
 
@@ -151,35 +155,96 @@ describe NewYorkTimesPageParserV2 do
151
155
  end
152
156
  end
153
157
 
158
+ describe "when parsing the French comedian article with the 2018 formatting" do
159
+ before do
160
+ @valid_options = {
161
+ :url => 'https://www.nytimes.com/2014/01/12/world/europe/show-banned-french-comedian-has-new-one.html',
162
+ :page => File.read('spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one-2018.html'),
163
+ :valid_hash => 'ab9cafaac593c12b5b457a5bfdd3eda5'
164
+ }
165
+ @pa = NewYorkTimesPageParserV2.new(@valid_options)
166
+ end
167
+
168
+ it "should parse the title" do
169
+ @pa.title.should == 'Show Banned, French Comedian Has New One'
170
+ end
171
+
172
+ it "should parse the date" do
173
+ @pa.date.should == DateTime.parse("2014-01-12T05:35:51+00:00")
174
+ end
175
+
176
+ it "should calculate the hash correctly" do
177
+ @pa.hash.should == @valid_options[:valid_hash]
178
+ end
179
+
180
+ it "should parse the content" do
181
+ @pa.content[0].should == 'PARIS — A French comedian said Saturday that he had dropped a show banned for its anti-Semitic language and was planning one that would cause no objections.'
182
+ @pa.content[3].should == '“We live in a democratic country and I have to comply with the laws, despite the blatant political interference,” he said. “As a comedian, I have pushed the debate to the very edge of laughter.”'
183
+ @pa.content.size.should == 18
184
+ end
185
+ end
186
+
187
+
188
+ describe "when parsing trump kim policy article" do
189
+ before do
190
+ @valid_options = {
191
+ :url => 'https://www.nytimes.com/2018/06/12/world/asia/trump-kim-policy.html',
192
+ :page => File.read('spec/fixtures/new_york_times/trump-kim-policy.html'),
193
+ :valid_hash => 'ab62998617a2fb91552122a9ac845e4c'
194
+ }
195
+ @pa = NewYorkTimesPageParserV2.new(@valid_options)
196
+ end
197
+
198
+ it "should parse the title" do
199
+ @pa.title.should == 'Vague on Details, Trump Is Betting on ‘Special Bond’ With Kim to Deliver Deal'
200
+ end
201
+
202
+ it "should parse the date" do
203
+ @pa.date.should == DateTime.parse("2018-06-12T16:35:00+00:00")
204
+ end
205
+
206
+ it "should calculate the hash correctly" do
207
+ @pa.hash.should == @valid_options[:valid_hash]
208
+ end
209
+
210
+ it "should parse the content" do
211
+ @pa.content[0].should == 'SINGAPORE — On paper, there is nothing President Trump extracted from North Korea’s leader, Kim Jong-un, in their summit meeting that Mr. Kim’s father and grandfather had not already given to past American presidents.'
212
+ @pa.content[8].should == '“I don’t know that I’ll ever admit that,” he added, “but I’ll find some kind of an excuse.”'
213
+ @pa.content.last.should == "Whatever he gets, it will be judged by one standard: whether he has “solved” the North Korea problem, as he vowed he would, rather than passing it on to his successor."
214
+ @pa.content.size.should == 28
215
+ end
216
+ end
217
+
154
218
 
155
219
  describe "retrieve_page" do
156
220
  it "should retrieve the article from the nyt website" do
157
- @pa = NewYorkTimesPageParserV1.new(:url => "http://www.nytimes.com/2012/08/22/us/politics/ignoring-calls-to-quit-akin-appeals-to-voters-in-ad.html?hp")
221
+ @pa = NewYorkTimesPageParserV2.new(:url => "http://www.nytimes.com/2012/08/22/us/politics/ignoring-calls-to-quit-akin-appeals-to-voters-in-ad.html?hp")
158
222
  @pa.title.should =~ /ignoring/i
159
223
  end
160
224
 
161
225
  it "should retrieve the full article from the nyt website when given a first page url" do
162
- @pa = NewYorkTimesPageParserV1.new(:url => "http://www.nytimes.com/2012/08/21/world/middleeast/syrian-rebels-coalesce-into-a-fighting-force.html?ref=world")
226
+ @pa = NewYorkTimesPageParserV2.new(:url => "http://www.nytimes.com/2012/08/21/world/middleeast/syrian-rebels-coalesce-into-a-fighting-force.html?ref=world")
163
227
  @pa.content.size.should > 40
164
- @pa = NewYorkTimesPageParserV1.new(:url => "http://www.nytimes.com/2012/08/21/world/middleeast/syrian-rebels-coalesce-into-a-fighting-force.html")
228
+ @pa = NewYorkTimesPageParserV2.new(:url => "http://www.nytimes.com/2012/08/21/world/middleeast/syrian-rebels-coalesce-into-a-fighting-force.html")
165
229
  @pa.content.size.should > 40
166
230
  end
167
231
 
168
232
  it "should retrieve more than the paywall url limit" do
169
233
  urls = []
170
234
  [
171
- "http://feeds.nytimes.com/nyt/rss/HomePage",
235
+ "http://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",
172
236
  "http://rss.nytimes.com/services/xml/rss/nyt/GlobalHome.xml",
173
- "http://feeds.nytimes.com/nyt/rss/NYRegion",
237
+ "http://rss.nytimes.com/services/xml/rss/nyt/NYRegion.xml",
174
238
  "http://www.nytimes.com/services/xml/rss/nyt/World.xml"
175
239
  ].each do |fu|
176
- urls += Net::HTTP.get(URI(fu)).scan(/http:\/\/www.nytimes.com\/[0-9]{4}\/[^<"?]+/)
240
+ next if urls.size > 25
241
+ urls += Net::HTTP.get(URI(fu)).scan(/https:\/\/www.nytimes.com\/[0-9]{4}\/[^<"?]+/)
242
+ urls.uniq!
177
243
  end
178
244
 
179
- urls.uniq!
180
- pending("Failing spec but works in practise. Needs a looksee.") { urls.size.should > 25 }
245
+ urls.size.should > 25
181
246
  urls[0..24].each_with_index do |u,i|
182
- @pa = NewYorkTimesPageParserV1.new(:url => u)
247
+ @pa = NewYorkTimesPageParserV2.new(:url => u)
183
248
  @pa.page.curl.header_str.to_s.scan(/^Location: .*/).grep(/myaccount.nytimes.com/).should be_empty
184
249
  @pa.title.should_not =~ /^Log In/
185
250
  end
@@ -0,0 +1,87 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'spec_helper'
3
+ include WebPageParser
4
+
5
+ describe RTPageParserFactory do
6
+ before do
7
+ @valid_urls = ["https://www.rt.com/uk/338045-privatization-aid-money-dfid/",
8
+ "https://www.rt.com/business/338032-saudi-arabia-megafund-assets/",
9
+ "https://www.rt.com/usa/338131-candy-thief-life-sentence/"]
10
+ @invalid_urls = ["https://www.rt.com/politics/",
11
+ "https://www.rt.com/in-vision/337713-egyptair-hijacker-hostages-free/",
12
+ "https://www.rt.com/in-motion/338243-anti-refugee-rally-uk/"]
13
+ end
14
+
15
+ it "should detect articles from the url" do
16
+ @valid_urls.each do |url|
17
+ RTPageParserFactory.can_parse?(:url => url).should be_true
18
+ end
19
+ end
20
+
21
+ it "should ignore pages with the wrong url format" do
22
+ @invalid_urls.each do |url|
23
+ RTPageParserFactory.can_parse?(:url => url).should be_nil
24
+ end
25
+ end
26
+
27
+ end
28
+
29
+ describe RTPageParserV1 do
30
+
31
+ describe "when parsing the privatised aid article" do
32
+ before do
33
+ @valid_options = {
34
+ :url => 'https://www.rt.com/uk/338045-privatization-aid-money-dfid/',
35
+ :page => File.read("spec/fixtures/rt/338045.html"),
36
+ :valid_hash => '067d06dffae315daf6ab88026fdc7966'
37
+ }
38
+ @pa = RTPageParserV1.new(@valid_options)
39
+ end
40
+
41
+ it "should parse the title" do
42
+ @pa.title.should == "‘Scandal of privatized aid’: Free-market consultants cream off £450mn in UK govt funds"
43
+ end
44
+
45
+ it "should parse the date in UTC" do
46
+ @pa.date.should == DateTime.parse("2016-04-01 15:17")
47
+ @pa.date.zone.should == '+00:00'
48
+ end
49
+
50
+ it "should parse the content" do
51
+ @pa.content[0].should == "Free-market consultants in Britain are taking hundreds of millions of pounds ring-fenced to alleviate poverty in the developing world, as the government continues with its agenda of privatizing aid, a damning report has warned."
52
+ @pa.content[4].should == "The study examined how much of DfID’s work was geared towards supporting market-based development and the private sector in poor states. Recent projects included backing for a “business advocacy capacity development program” in Zimbabwe, and projects to increase private schooling in Kenya."
53
+ @pa.content[22].should == "ASI describes itself as a transparent, objective organization dedicated to making public services more robust. It also claims to support economic growth and civil society, while building “democratic and accountable institutions.”"
54
+ @pa.content.size.should == 23
55
+ @pa.hash.should == @valid_options[:valid_hash]
56
+ end
57
+ end
58
+
59
+ describe "when parsing the trump article" do
60
+ before do
61
+ @valid_options = {
62
+ :url => 'https://www.rt.com/usa/338237-trump-fine-nato-breakup-obsolete/',
63
+ :page => File.read("spec/fixtures/rt/338237.html"),
64
+ :valid_hash => '633221af5d9b9a7161cd40eaaf22253d'
65
+ }
66
+ @pa = RTPageParserV1.new(@valid_options)
67
+ end
68
+
69
+ it "should parse the title" do
70
+ @pa.title.should == "Trump sparks NATO debate: ‘Obsolete’ or ‘tripwire that could lead to World War III’?"
71
+ end
72
+
73
+ it "should parse the date in UTC" do
74
+ @pa.date.should == DateTime.parse("2016-04-03 11:47")
75
+ @pa.date.zone.should == '+00:00'
76
+ end
77
+
78
+ it "should parse the content" do
79
+ @pa.content[0].should == "Republican presidential candidate Donald Trump slammed NATO on the campaign trail this week, saying he can live with breaking up the military alliance, which he calls “obsolete.”"
80
+ @pa.content[4].should == "NATO"
81
+ @pa.content[11].should == "The now-28 member-strong organization has defied its purported promise on a number of occasions."
82
+ @pa.content[24].should == "The cost of NATO"
83
+ @pa.content.size.should == 39
84
+ @pa.hash.should == @valid_options[:valid_hash]
85
+ end
86
+ end
87
+ end
@@ -64,4 +64,34 @@ describe TheInterceptPageParserV1 do
64
64
 
65
65
  end
66
66
 
67
+ describe 'when parsing the pentagon missionary article' do
68
+ before do
69
+ @valid_options = {
70
+ :url => 'https://theintercept.com/2015/10/26/pentagon-missionary-spies-christian-ngo-front-for-north-korea-espionage/',
71
+ :page => File.read('spec/fixtures/theintercept/pentagon-missionary.html'),
72
+ :valid_hash => 'aa8a59955cc0c783f782c5c13701c71d'
73
+ }
74
+ @pa = TheInterceptPageParserV1.new(@valid_options)
75
+ end
76
+
77
+ it 'should parse the title' do
78
+ @pa.title.should == "U.S. Military Used Christian NGO as Front for North Korea Espionage"
79
+ end
80
+
81
+ it 'should parse the content' do
82
+ @pa.content[0].should == 'ON MAY 10, 2007, in the East Room of the White House, President George W. Bush presided over a ceremony honoring the nation’s most accomplished community service leaders. Among those collecting a President’s Volunteer Service Award that afternoon was Kay Hiramine, the Colorado-based founder of a multimillion-dollar humanitarian organization.'
83
+ @pa.content[13].should == 'HISG WAS ESTABLISHED shortly after 9/11, when Hiramine led a group of three friends in creating a humanitarian organization that they hoped could provide disaster relief and sustainable development in poor and war-torn countries around the world, according to the organization’s incorporation documents.'
84
+ @pa.content[83].should == 'This report makes reference to a donation from Working Partners Foundation to Catholic Relief Services, based on Working Partners Foundation’s tax filings. Catholic Relief Services, which conducted a review after publication, said its own records contained no indication it received money from Working Partners Foundation or HISG.'
85
+ @pa.content.last.should == 'Top photo: U.S. President George W. Bush with Kay Hiramine prior to presenting him with a President’s Volunteer Service Award on May 10, 2007, in the East Room of the White House (photo flipped). '
86
+ @pa.content.size.should == 86
87
+ @pa.hash.should == @valid_options[:valid_hash]
88
+ end
89
+
90
+ it 'should parse the date in UTC' do
91
+ @pa.date.should == DateTime.parse('Oct. 26 2015 15:05:22')
92
+ @pa.date.zone.should == '+00:00'
93
+ end
94
+
95
+ end
96
+
67
97
  end
@@ -6,7 +6,8 @@ describe WashingtonPostPageParserFactory do
6
6
  before do
7
7
  @valid_urls = [
8
8
  'http://www.washingtonpost.com/world/will-a-bust-follow-the-boom-in-britain/2014/01/18/3677a6ae-7f9d-11e3-97d3-b9925ce2c57b_story.html?tid=hpModule_04941f10-8a79-11e2-98d9-3012c1cd8d1e&hpid=z16',
9
- 'http://www.washingtonpost.com/business/technology/nsa-program-defenders-question-snowdens-motives/2014/01/19/091fccaa-811d-11e3-bbe5-6a2a3141e3a9_story.html'
9
+ 'http://www.washingtonpost.com/business/technology/nsa-program-defenders-question-snowdens-motives/2014/01/19/091fccaa-811d-11e3-bbe5-6a2a3141e3a9_story.html',
10
+ 'https://www.washingtonpost.com/world/middle_east/israel-ambassador-to-us-sends-anti-boycott-message-with-gift/2015/12/23/652d639c-a99b-11e5-b596-113f59ee069a_story.html'
10
11
  ]
11
12
  @invalid_urls = [
12
13
  'http://www.washingtonpost.com/politics/',
@@ -119,4 +120,95 @@ describe WashingtonPostPageParserV1 do
119
120
 
120
121
  end
121
122
 
123
+ describe 'when parsing the Israeli ambassador article' do
124
+ before do
125
+ @valid_options = {
126
+ :url => 'https://www.washingtonpost.com/world/middle_east/israel-ambassador-to-us-sends-anti-boycott-message-with-gift/2015/12/23/652d639c-a99b-11e5-b596-113f59ee069a_story.html',
127
+ :page => File.read('spec/fixtures/washingtonpost/israeli-ambassador.html'),
128
+ :valid_hash => 'c2e80bf1012949bf3a124576466b1b40'
129
+ }
130
+ @pa = WashingtonPostPageParserV1.new(@valid_options)
131
+ end
132
+
133
+ it "should parse the title" do
134
+ @pa.title.should eq 'Israel ambassador to US sends anti-boycott message with gift'
135
+ end
136
+
137
+ it 'should parse the date in UTC' do
138
+ @pa.date.should eq DateTime.parse("December 23 2015")
139
+ @pa.date.zone.should eq '+00:00'
140
+ end
141
+
142
+ it "should contain no javascript" do
143
+ @pa.content.join(' ').should_not =~ /function/
144
+ end
145
+
146
+ it "should parse the content" do
147
+ @pa.content[0].should eq 'JERUSALEM — Israel’s ambassador to the United States has dispatched a politically charged holiday gift.'
148
+ @pa.content.size.should eq 6
149
+ @pa.hash.should == @valid_options[:valid_hash]
150
+ end
151
+ end
152
+ end
153
+
154
+ describe WashingtonPostPageParserV2 do
155
+ describe 'when parsing the trump kim simmit article' do
156
+ before do
157
+ @valid_options = {
158
+ :url => 'https://www.washingtonpost.com/politics/trump-kim-summit-trump-says-we-have-developed-a-very-special-bond-at-end-of-historic-meeting/2018/06/12/ff43465a-6dba-11e8-bf86-a2351b5ece99_story.html?utm_term=.392b71a75a35',
159
+ :page => File.read('spec/fixtures/washingtonpost/trump-kim-summit.html'),
160
+ :valid_hash => '5b703096157e74b65fdf00fd9227ebbc'
161
+ }
162
+ @pa = WashingtonPostPageParserV2.new(@valid_options)
163
+ end
164
+
165
+ it "should parse the guid" do
166
+ @pa.guid.should eq "ff43465a-6dba-11e8-bf86-a2351b5ece99"
167
+ end
168
+
169
+ it "should parse the title" do
170
+ @pa.title.should eq 'Trump-Kim summit: Trump says after historic meeting, ‘We have developed a very special bond’'
171
+ end
172
+
173
+ it 'should parse the date in UTC' do
174
+ @pa.date.should eq DateTime.parse("2018-06-12T11:47:00-05:00")
175
+ end
176
+
177
+ it "should parse the content" do
178
+ @pa.content[0].should eq 'SINGAPORE — President Trump concluded a historic summit with North Korean leader Kim Jong Un here Tuesday by sketching a path to prosperity for the isolated nation. But it remained highly uncertain whether the young dictator would embrace the offer by agreeing to eliminate his nuclear arsenal.'
179
+ @pa.content.last.should eq 'Carol Morello in Washington and Brian Murphy in Seoul contributed to this report.'
180
+ @pa.content[9].should eq 'Trump said that aides would begin additional talks soon and that he would potentially invite Kim to the White House and be open to visiting Pyongyang “at the appropriate time.” Yet he also acknowledged that disarmament would not come quickly.'
181
+ @pa.content.size.should eq 45
182
+ @pa.hash.should eq @valid_options[:valid_hash]
183
+ end
184
+ end
185
+
186
+ describe 'when parsing the bust-boom article from 2018' do
187
+ before do
188
+ @valid_options = {
189
+ :url => 'https://www.washingtonpost.com/world/will-a-bust-follow-the-boom-in-britain/2014/01/18/3677a6ae-7f9d-11e3-97d3-b9925ce2c57b_story.html?tid=hpModule_04941f10-8a79-11e2-98d9-3012c1cd8d1e&hpid=z16',
190
+ :page => File.read('spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain-2018.html'),
191
+ :valid_hash => 'bbcdda8a8dffcabf71088039fb366e34'
192
+ }
193
+ @pa = WashingtonPostPageParserV2.new(@valid_options)
194
+ end
195
+
196
+ it "should parse the title" do
197
+ @pa.title.should == 'Will a bust follow the boom in Britain?'
198
+ end
199
+
200
+ it 'should parse the date in UTC' do
201
+ @pa.date.should eq DateTime.parse("2014-01-18T05:43:00-05:00")
202
+ @pa.date.zone.should eq '+00:00'
203
+ end
204
+
205
+ it "should parse the content" do
206
+ @pa.content[0].should eq 'LONDON — For decades, the modest two-bedroom apartment off Abbey Road was home to some of London’s neediest, a small, leaky outpost in this city’s vast constellation of public housing.'
207
+ @pa.content[12].should eq 'Crazy in the capital'
208
+ @pa.content.last.should eq '“It’s about time the government did something to help,” he said. “I don’t come from a rich family, so I don’t have parents who will give 15,000 pounds for a deposit. That’s not available to me. I’m genuinely pleased Cameron has done something for the working man, which is me.”'
209
+ @pa.content.size.should eq 25
210
+ @pa.hash.should eq @valid_options[:valid_hash]
211
+ end
212
+ end
213
+
122
214
  end