web-page-parser 1.1.0 → 1.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +56 -3
- data/lib/web-page-parser/parsers/guardian_page_parser.rb +33 -3
- data/lib/web-page-parser/parsers/independent_page_parser.rb +2 -2
- data/lib/web-page-parser/parsers/new_york_times_page_parser.rb +8 -3
- data/lib/web-page-parser/parsers/rt_page_parser.rb +49 -0
- data/lib/web-page-parser/parsers/the_intercept_page_parser.rb +6 -1
- data/lib/web-page-parser/parsers/washingtonpost_page_parser.rb +38 -2
- data/spec/fixtures/bbc_news/31014941.html +1123 -0
- data/spec/fixtures/bbc_news/32271505.html +1168 -0
- data/spec/fixtures/bbc_news/32275608.html +1142 -0
- data/spec/fixtures/guardian/duplicate-headline.html +2735 -0
- data/spec/fixtures/guardian/julian-assange-donald-trump-hillary-clinton-interview.html +1752 -0
- data/spec/fixtures/guardian/university-extremist-speakers.html +1590 -0
- data/spec/fixtures/independent/boris-johnson.html +1086 -0
- data/spec/fixtures/independent/lord-burns.html +726 -0
- data/spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one-2018.html +305 -0
- data/spec/fixtures/new_york_times/trump-kim-policy.html +305 -0
- data/spec/fixtures/rt/338045.html +682 -0
- data/spec/fixtures/rt/338237.html +682 -0
- data/spec/fixtures/theintercept/pentagon-missionary.html +211 -0
- data/spec/fixtures/washingtonpost/israeli-ambassador.html +747 -0
- data/spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery-2018.html +381 -0
- data/spec/fixtures/washingtonpost/trump-kim-summit.html +379 -0
- data/spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain-2018.html +386 -0
- data/spec/parsers/bbc_news_page_spec.rb +132 -11
- data/spec/parsers/guardian_page_spec.rb +100 -0
- data/spec/parsers/independent_page_parser_spec.rb +52 -0
- data/spec/parsers/new_york_times_page_parser_spec.rb +75 -10
- data/spec/parsers/rt_page_parser_spec.rb +87 -0
- data/spec/parsers/the_intercept_page_parser_spec.rb +30 -0
- data/spec/parsers/washingtonpost_page_parser_spec.rb +93 -1
- data/spec/web-page-parser +1 -0
- metadata +98 -56
- metadata.gz.sig +0 -0
@@ -248,4 +248,104 @@ describe GuardianPageParserV2 do
|
|
248
248
|
end
|
249
249
|
end
|
250
250
|
|
251
|
+
describe "when parsing the new format university extremist speakers article" do
|
252
|
+
before do
|
253
|
+
@valid_options = {
|
254
|
+
:url => 'http://www.theguardian.com/uk-news/2015/mar/20/theresa-may-drops-rules-ordering-universities-ban-extremist-speakers',
|
255
|
+
:page => File.read("spec/fixtures/guardian/university-extremist-speakers.html"),
|
256
|
+
:valid_hash => 'aeeb32a7acd2de155be83fd7de6446cb'
|
257
|
+
}
|
258
|
+
@pa = GuardianPageParserV2.new(@valid_options)
|
259
|
+
end
|
260
|
+
|
261
|
+
it "should parse the title" do
|
262
|
+
@pa.title.should == "Theresa May drops rules on ordering universities to ban extremist speakers"
|
263
|
+
end
|
264
|
+
|
265
|
+
it "should parse the date in UTC" do
|
266
|
+
@pa.date.should == DateTime.parse("Friday 20 March 2015 17:38:30 GMT")
|
267
|
+
@pa.date.zone.should == '+00:00'
|
268
|
+
end
|
269
|
+
|
270
|
+
it "should parse the content" do
|
271
|
+
@pa.content[0].should == "The home secretary, Theresa May, has been forced to drop new statutory rules under which ministers could order universities and colleges to ban external extremist speakers."
|
272
|
+
@pa.content.size.should == 19
|
273
|
+
@pa.hash.should == @valid_options[:valid_hash]
|
274
|
+
end
|
275
|
+
|
276
|
+
end
|
277
|
+
|
278
|
+
describe "when parsing the Julian Assange article" do
|
279
|
+
before do
|
280
|
+
@valid_options = {
|
281
|
+
:url => 'https://www.theguardian.com/media/2016/dec/24/julian-assange-donald-trump-hillary-clinton-interview',
|
282
|
+
:page => File.read("spec/fixtures/guardian/julian-assange-donald-trump-hillary-clinton-interview.html"),
|
283
|
+
:valid_hash => '2757835e9e028a21b5e47c9199ade005'
|
284
|
+
}
|
285
|
+
@pa = GuardianPageParserV2.new(@valid_options)
|
286
|
+
end
|
287
|
+
|
288
|
+
it "should parse the title" do
|
289
|
+
@pa.title.should == "Julian Assange gives guarded praise of Trump and blasts Clinton in interview"
|
290
|
+
end
|
291
|
+
|
292
|
+
it "should parse the date in UTC" do
|
293
|
+
@pa.date.should == DateTime.parse("Saturday 24 December 2016 18:36:24 GMT")
|
294
|
+
@pa.date.zone.should == '+00:00'
|
295
|
+
end
|
296
|
+
|
297
|
+
it "should parse the content" do
|
298
|
+
@pa.content[0].should == "Julian Assange, the founder of WikiLeaks, has offered guarded praise of Donald Trump, arguing the president-elect “is not a DC insider” and could mean an opportunity for positive as well as negative change in the US."
|
299
|
+
@pa.content.last.should == "Dozens of journalists have been killed in Russia in the past two decades, and Freedom House considers the Russian press to be “not free” and notes: “The main national news agenda is firmly controlled by the Kremlin. The government sets editorial policy at state-owned television stations, which dominate the media landscape and generate propagandistic content.”"
|
300
|
+
@pa.content.size.should == 16
|
301
|
+
@pa.hash.should == @valid_options[:valid_hash]
|
302
|
+
end
|
303
|
+
end
|
304
|
+
|
305
|
+
describe "when parsing an article with duplicate headlines" do
|
306
|
+
before do
|
307
|
+
@valid_options = {
|
308
|
+
:url => 'https://www.theguardian.com/world/2016/dec/31/russia-syria-ceasefire-un-security-council-damascus-kazakhstan',
|
309
|
+
:page => File.read("spec/fixtures/guardian/duplicate-headline.html"),
|
310
|
+
}
|
311
|
+
@pa = GuardianPageParserV2.new(@valid_options)
|
312
|
+
end
|
313
|
+
|
314
|
+
it "should only return one of the titles" do
|
315
|
+
@pa.title.should == "Russia pushes for UN security council support for Syria ceasefire"
|
316
|
+
end
|
317
|
+
|
318
|
+
end
|
319
|
+
|
320
|
+
describe GuardianPageParserV3 do
|
321
|
+
describe "when parsing the Julian Assange article" do
|
322
|
+
before do
|
323
|
+
@valid_options = {
|
324
|
+
:url => 'https://www.theguardian.com/media/2016/dec/24/julian-assange-donald-trump-hillary-clinton-interview',
|
325
|
+
:page => File.read("spec/fixtures/guardian/julian-assange-donald-trump-hillary-clinton-interview.html"),
|
326
|
+
:valid_hash => 'a94b1cfb7abab286ab4e880e3c440d66'
|
327
|
+
}
|
328
|
+
@pa = GuardianPageParserV3.new(@valid_options)
|
329
|
+
end
|
330
|
+
|
331
|
+
it "should parse the title" do
|
332
|
+
@pa.title.should == "Julian Assange gives guarded praise of Trump and blasts Clinton in interview"
|
333
|
+
end
|
334
|
+
|
335
|
+
it "should parse the date in UTC" do
|
336
|
+
@pa.date.should == DateTime.parse("Saturday 24 December 2016 18:36:24 GMT")
|
337
|
+
@pa.date.zone.should == '+00:00'
|
338
|
+
end
|
339
|
+
|
340
|
+
it "should parse the content" do
|
341
|
+
@pa.content[0].should == "Julian Assange, the founder of WikiLeaks, has offered guarded praise of Donald Trump, arguing the president-elect “is not a DC insider” and could mean an opportunity for positive as well as negative change in the US."
|
342
|
+
@pa.content.last.should == "This article was amended on 29 December 2016 to remove a sentence in which it was asserted that Assange “has long had a close relationship with the Putin regime”. A sentence was also amended which paraphrased the interview, suggesting Assange said “there was no need for Wikileaks to undertake a whistleblowing role in Russia because of the open and competitive debate he claimed exists there”. It has been amended to more directly describe the question Assange was responding to when he spoke of Russia’s “many vibrant publications”."
|
343
|
+
@pa.content.size.should == 17
|
344
|
+
@pa.hash.should == @valid_options[:valid_hash]
|
345
|
+
end
|
346
|
+
|
347
|
+
end
|
348
|
+
|
349
|
+
end
|
350
|
+
|
251
351
|
end
|
@@ -153,4 +153,56 @@ describe IndependentPageParserV1 do
|
|
153
153
|
end
|
154
154
|
end
|
155
155
|
|
156
|
+
describe 'when parsing the lord-burns article' do
|
157
|
+
before do
|
158
|
+
@valid_options = {
|
159
|
+
:url => 'http://www.independent.co.uk/news/media/lord-burns-channel-4-chairman-forced-to-step-down-by-ministers-amid-privatisation-fears-a6670691.html',
|
160
|
+
:page => File.read('spec/fixtures/independent/lord-burns.html'),
|
161
|
+
:valid_hash => 'f0efb3b2ea91266fe4a551867fcf6fb1'
|
162
|
+
}
|
163
|
+
@pa = IndependentPageParserV1.new(@valid_options)
|
164
|
+
end
|
165
|
+
|
166
|
+
it "should parse the title" do
|
167
|
+
@pa.title.should == 'Lord Burns: Channel 4 chairman forced to step down by ministers amid privatisation fears'
|
168
|
+
end
|
169
|
+
|
170
|
+
it "should parse the date" do
|
171
|
+
@pa.date.should == DateTime.parse('28 September 2015 18:03:52 BST')
|
172
|
+
end
|
173
|
+
|
174
|
+
it "should calculate the hash correctly" do
|
175
|
+
@pa.hash.should == @valid_options[:valid_hash]
|
176
|
+
end
|
177
|
+
|
178
|
+
it "should parse the content" do
|
179
|
+
@pa.content[0].should == 'Ministers have forced Lord Burns, the chairman of Channel 4, to step down, fuelling speculation that the not-for-profit broadcaster is being prepared for privatisation.'
|
180
|
+
@pa.content.size.should == 6
|
181
|
+
end
|
182
|
+
end
|
183
|
+
|
184
|
+
describe 'when parsing the boris-johnson article' do
|
185
|
+
before do
|
186
|
+
@valid_options = {
|
187
|
+
:url => 'http://www.independent.co.uk/news/uk/politics/boris-johnson-warns-low-immigration-could-stall-economic-growth-a6693486.html',
|
188
|
+
:page => File.read('spec/fixtures/independent/boris-johnson.html'),
|
189
|
+
:valid_hash => 'e9fc5ef9502d3b167c00d5cefb308495'
|
190
|
+
}
|
191
|
+
@pa = IndependentPageParserV1.new(@valid_options)
|
192
|
+
end
|
193
|
+
|
194
|
+
it "should parse the title" do
|
195
|
+
@pa.title.should == 'Boris Johnson warns low immigration could stall economic growth'
|
196
|
+
end
|
197
|
+
|
198
|
+
it "should calculate the hash correctly" do
|
199
|
+
@pa.hash.should == @valid_options[:valid_hash]
|
200
|
+
end
|
201
|
+
|
202
|
+
it "should exclude the 'read more' and image widget captions" do
|
203
|
+
@pa.content.to_s.should_not =~ /Farage says/
|
204
|
+
@pa.content.to_s.should_not =~ /A butcher/
|
205
|
+
end
|
206
|
+
end
|
207
|
+
|
156
208
|
end
|
@@ -8,10 +8,14 @@ describe NewYorkTimesPageParserFactory do
|
|
8
8
|
@valid_urls = [
|
9
9
|
"http://www.nytimes.com/2012/01/28/us/politics/no-more-nice-guys-fans-love-nuclear-newt.html?_r=1&ref=us",
|
10
10
|
"http://www.nytimes.com/2012/01/29/business/global/greece-in-talks-with-creditors-on-debt-deal.html",
|
11
|
+
"https://www.nytimes.com/2014/01/12/world/europe/show-banned-french-comedian-has-new-one.html",
|
12
|
+
"https://www.nytimes.com/2018/06/12/world/asia/trump-kim-policy.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=span-ab-top-region&region=top-news&WT.nav=top-news"
|
11
13
|
]
|
12
14
|
@invalid_urls = [
|
13
15
|
"http://cityroom.blogs.nytimes.com/2012/01/27/the-week-in-pictures-for-jan-27/",
|
14
|
-
"http://www.nytimes.com/pages/world/asia/index.html"
|
16
|
+
"http://www.nytimes.com/pages/world/asia/index.html",
|
17
|
+
"https://www.nytimes.com/section/business?module=SectionsNav&action=click&version=BrowseTree&region=TopBar&contentCollection=Business&pgtype=sectionfront",
|
18
|
+
"https://www.nytimes.com/section/technology/personaltech"
|
15
19
|
]
|
16
20
|
end
|
17
21
|
|
@@ -151,35 +155,96 @@ describe NewYorkTimesPageParserV2 do
|
|
151
155
|
end
|
152
156
|
end
|
153
157
|
|
158
|
+
describe "when parsing the French comedian article with the 2018 formatting" do
|
159
|
+
before do
|
160
|
+
@valid_options = {
|
161
|
+
:url => 'https://www.nytimes.com/2014/01/12/world/europe/show-banned-french-comedian-has-new-one.html',
|
162
|
+
:page => File.read('spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one-2018.html'),
|
163
|
+
:valid_hash => 'ab9cafaac593c12b5b457a5bfdd3eda5'
|
164
|
+
}
|
165
|
+
@pa = NewYorkTimesPageParserV2.new(@valid_options)
|
166
|
+
end
|
167
|
+
|
168
|
+
it "should parse the title" do
|
169
|
+
@pa.title.should == 'Show Banned, French Comedian Has New One'
|
170
|
+
end
|
171
|
+
|
172
|
+
it "should parse the date" do
|
173
|
+
@pa.date.should == DateTime.parse("2014-01-12T05:35:51+00:00")
|
174
|
+
end
|
175
|
+
|
176
|
+
it "should calculate the hash correctly" do
|
177
|
+
@pa.hash.should == @valid_options[:valid_hash]
|
178
|
+
end
|
179
|
+
|
180
|
+
it "should parse the content" do
|
181
|
+
@pa.content[0].should == 'PARIS — A French comedian said Saturday that he had dropped a show banned for its anti-Semitic language and was planning one that would cause no objections.'
|
182
|
+
@pa.content[3].should == '“We live in a democratic country and I have to comply with the laws, despite the blatant political interference,” he said. “As a comedian, I have pushed the debate to the very edge of laughter.”'
|
183
|
+
@pa.content.size.should == 18
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
187
|
+
|
188
|
+
describe "when parsing trump kim policy article" do
|
189
|
+
before do
|
190
|
+
@valid_options = {
|
191
|
+
:url => 'https://www.nytimes.com/2018/06/12/world/asia/trump-kim-policy.html',
|
192
|
+
:page => File.read('spec/fixtures/new_york_times/trump-kim-policy.html'),
|
193
|
+
:valid_hash => 'ab62998617a2fb91552122a9ac845e4c'
|
194
|
+
}
|
195
|
+
@pa = NewYorkTimesPageParserV2.new(@valid_options)
|
196
|
+
end
|
197
|
+
|
198
|
+
it "should parse the title" do
|
199
|
+
@pa.title.should == 'Vague on Details, Trump Is Betting on ‘Special Bond’ With Kim to Deliver Deal'
|
200
|
+
end
|
201
|
+
|
202
|
+
it "should parse the date" do
|
203
|
+
@pa.date.should == DateTime.parse("2018-06-12T16:35:00+00:00")
|
204
|
+
end
|
205
|
+
|
206
|
+
it "should calculate the hash correctly" do
|
207
|
+
@pa.hash.should == @valid_options[:valid_hash]
|
208
|
+
end
|
209
|
+
|
210
|
+
it "should parse the content" do
|
211
|
+
@pa.content[0].should == 'SINGAPORE — On paper, there is nothing President Trump extracted from North Korea’s leader, Kim Jong-un, in their summit meeting that Mr. Kim’s father and grandfather had not already given to past American presidents.'
|
212
|
+
@pa.content[8].should == '“I don’t know that I’ll ever admit that,” he added, “but I’ll find some kind of an excuse.”'
|
213
|
+
@pa.content.last.should == "Whatever he gets, it will be judged by one standard: whether he has “solved” the North Korea problem, as he vowed he would, rather than passing it on to his successor."
|
214
|
+
@pa.content.size.should == 28
|
215
|
+
end
|
216
|
+
end
|
217
|
+
|
154
218
|
|
155
219
|
describe "retrieve_page" do
|
156
220
|
it "should retrieve the article from the nyt website" do
|
157
|
-
@pa =
|
221
|
+
@pa = NewYorkTimesPageParserV2.new(:url => "http://www.nytimes.com/2012/08/22/us/politics/ignoring-calls-to-quit-akin-appeals-to-voters-in-ad.html?hp")
|
158
222
|
@pa.title.should =~ /ignoring/i
|
159
223
|
end
|
160
224
|
|
161
225
|
it "should retrieve the full article from the nyt website when given a first page url" do
|
162
|
-
@pa =
|
226
|
+
@pa = NewYorkTimesPageParserV2.new(:url => "http://www.nytimes.com/2012/08/21/world/middleeast/syrian-rebels-coalesce-into-a-fighting-force.html?ref=world")
|
163
227
|
@pa.content.size.should > 40
|
164
|
-
@pa =
|
228
|
+
@pa = NewYorkTimesPageParserV2.new(:url => "http://www.nytimes.com/2012/08/21/world/middleeast/syrian-rebels-coalesce-into-a-fighting-force.html")
|
165
229
|
@pa.content.size.should > 40
|
166
230
|
end
|
167
231
|
|
168
232
|
it "should retrieve more than the paywall url limit" do
|
169
233
|
urls = []
|
170
234
|
[
|
171
|
-
"http://
|
235
|
+
"http://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",
|
172
236
|
"http://rss.nytimes.com/services/xml/rss/nyt/GlobalHome.xml",
|
173
|
-
"http://
|
237
|
+
"http://rss.nytimes.com/services/xml/rss/nyt/NYRegion.xml",
|
174
238
|
"http://www.nytimes.com/services/xml/rss/nyt/World.xml"
|
175
239
|
].each do |fu|
|
176
|
-
urls
|
240
|
+
next if urls.size > 25
|
241
|
+
urls += Net::HTTP.get(URI(fu)).scan(/https:\/\/www.nytimes.com\/[0-9]{4}\/[^<"?]+/)
|
242
|
+
urls.uniq!
|
177
243
|
end
|
178
244
|
|
179
|
-
urls.
|
180
|
-
pending("Failing spec but works in practise. Needs a looksee.") { urls.size.should > 25 }
|
245
|
+
urls.size.should > 25
|
181
246
|
urls[0..24].each_with_index do |u,i|
|
182
|
-
@pa =
|
247
|
+
@pa = NewYorkTimesPageParserV2.new(:url => u)
|
183
248
|
@pa.page.curl.header_str.to_s.scan(/^Location: .*/).grep(/myaccount.nytimes.com/).should be_empty
|
184
249
|
@pa.title.should_not =~ /^Log In/
|
185
250
|
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'spec_helper'
|
3
|
+
include WebPageParser
|
4
|
+
|
5
|
+
describe RTPageParserFactory do
|
6
|
+
before do
|
7
|
+
@valid_urls = ["https://www.rt.com/uk/338045-privatization-aid-money-dfid/",
|
8
|
+
"https://www.rt.com/business/338032-saudi-arabia-megafund-assets/",
|
9
|
+
"https://www.rt.com/usa/338131-candy-thief-life-sentence/"]
|
10
|
+
@invalid_urls = ["https://www.rt.com/politics/",
|
11
|
+
"https://www.rt.com/in-vision/337713-egyptair-hijacker-hostages-free/",
|
12
|
+
"https://www.rt.com/in-motion/338243-anti-refugee-rally-uk/"]
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should detect articles from the url" do
|
16
|
+
@valid_urls.each do |url|
|
17
|
+
RTPageParserFactory.can_parse?(:url => url).should be_true
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should ignore pages with the wrong url format" do
|
22
|
+
@invalid_urls.each do |url|
|
23
|
+
RTPageParserFactory.can_parse?(:url => url).should be_nil
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
29
|
+
describe RTPageParserV1 do
|
30
|
+
|
31
|
+
describe "when parsing the privatised aid article" do
|
32
|
+
before do
|
33
|
+
@valid_options = {
|
34
|
+
:url => 'https://www.rt.com/uk/338045-privatization-aid-money-dfid/',
|
35
|
+
:page => File.read("spec/fixtures/rt/338045.html"),
|
36
|
+
:valid_hash => '067d06dffae315daf6ab88026fdc7966'
|
37
|
+
}
|
38
|
+
@pa = RTPageParserV1.new(@valid_options)
|
39
|
+
end
|
40
|
+
|
41
|
+
it "should parse the title" do
|
42
|
+
@pa.title.should == "‘Scandal of privatized aid’: Free-market consultants cream off £450mn in UK govt funds"
|
43
|
+
end
|
44
|
+
|
45
|
+
it "should parse the date in UTC" do
|
46
|
+
@pa.date.should == DateTime.parse("2016-04-01 15:17")
|
47
|
+
@pa.date.zone.should == '+00:00'
|
48
|
+
end
|
49
|
+
|
50
|
+
it "should parse the content" do
|
51
|
+
@pa.content[0].should == "Free-market consultants in Britain are taking hundreds of millions of pounds ring-fenced to alleviate poverty in the developing world, as the government continues with its agenda of privatizing aid, a damning report has warned."
|
52
|
+
@pa.content[4].should == "The study examined how much of DfID’s work was geared towards supporting market-based development and the private sector in poor states. Recent projects included backing for a “business advocacy capacity development program” in Zimbabwe, and projects to increase private schooling in Kenya."
|
53
|
+
@pa.content[22].should == "ASI describes itself as a transparent, objective organization dedicated to making public services more robust. It also claims to support economic growth and civil society, while building “democratic and accountable institutions.”"
|
54
|
+
@pa.content.size.should == 23
|
55
|
+
@pa.hash.should == @valid_options[:valid_hash]
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
describe "when parsing the trump article" do
|
60
|
+
before do
|
61
|
+
@valid_options = {
|
62
|
+
:url => 'https://www.rt.com/usa/338237-trump-fine-nato-breakup-obsolete/',
|
63
|
+
:page => File.read("spec/fixtures/rt/338237.html"),
|
64
|
+
:valid_hash => '633221af5d9b9a7161cd40eaaf22253d'
|
65
|
+
}
|
66
|
+
@pa = RTPageParserV1.new(@valid_options)
|
67
|
+
end
|
68
|
+
|
69
|
+
it "should parse the title" do
|
70
|
+
@pa.title.should == "Trump sparks NATO debate: ‘Obsolete’ or ‘tripwire that could lead to World War III’?"
|
71
|
+
end
|
72
|
+
|
73
|
+
it "should parse the date in UTC" do
|
74
|
+
@pa.date.should == DateTime.parse("2016-04-03 11:47")
|
75
|
+
@pa.date.zone.should == '+00:00'
|
76
|
+
end
|
77
|
+
|
78
|
+
it "should parse the content" do
|
79
|
+
@pa.content[0].should == "Republican presidential candidate Donald Trump slammed NATO on the campaign trail this week, saying he can live with breaking up the military alliance, which he calls “obsolete.”"
|
80
|
+
@pa.content[4].should == "NATO"
|
81
|
+
@pa.content[11].should == "The now-28 member-strong organization has defied its purported promise on a number of occasions."
|
82
|
+
@pa.content[24].should == "The cost of NATO"
|
83
|
+
@pa.content.size.should == 39
|
84
|
+
@pa.hash.should == @valid_options[:valid_hash]
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
@@ -64,4 +64,34 @@ describe TheInterceptPageParserV1 do
|
|
64
64
|
|
65
65
|
end
|
66
66
|
|
67
|
+
describe 'when parsing the pentagon missionary article' do
|
68
|
+
before do
|
69
|
+
@valid_options = {
|
70
|
+
:url => 'https://theintercept.com/2015/10/26/pentagon-missionary-spies-christian-ngo-front-for-north-korea-espionage/',
|
71
|
+
:page => File.read('spec/fixtures/theintercept/pentagon-missionary.html'),
|
72
|
+
:valid_hash => 'aa8a59955cc0c783f782c5c13701c71d'
|
73
|
+
}
|
74
|
+
@pa = TheInterceptPageParserV1.new(@valid_options)
|
75
|
+
end
|
76
|
+
|
77
|
+
it 'should parse the title' do
|
78
|
+
@pa.title.should == "U.S. Military Used Christian NGO as Front for North Korea Espionage"
|
79
|
+
end
|
80
|
+
|
81
|
+
it 'should parse the content' do
|
82
|
+
@pa.content[0].should == 'ON MAY 10, 2007, in the East Room of the White House, President George W. Bush presided over a ceremony honoring the nation’s most accomplished community service leaders. Among those collecting a President’s Volunteer Service Award that afternoon was Kay Hiramine, the Colorado-based founder of a multimillion-dollar humanitarian organization.'
|
83
|
+
@pa.content[13].should == 'HISG WAS ESTABLISHED shortly after 9/11, when Hiramine led a group of three friends in creating a humanitarian organization that they hoped could provide disaster relief and sustainable development in poor and war-torn countries around the world, according to the organization’s incorporation documents.'
|
84
|
+
@pa.content[83].should == 'This report makes reference to a donation from Working Partners Foundation to Catholic Relief Services, based on Working Partners Foundation’s tax filings. Catholic Relief Services, which conducted a review after publication, said its own records contained no indication it received money from Working Partners Foundation or HISG.'
|
85
|
+
@pa.content.last.should == 'Top photo: U.S. President George W. Bush with Kay Hiramine prior to presenting him with a President’s Volunteer Service Award on May 10, 2007, in the East Room of the White House (photo flipped). '
|
86
|
+
@pa.content.size.should == 86
|
87
|
+
@pa.hash.should == @valid_options[:valid_hash]
|
88
|
+
end
|
89
|
+
|
90
|
+
it 'should parse the date in UTC' do
|
91
|
+
@pa.date.should == DateTime.parse('Oct. 26 2015 15:05:22')
|
92
|
+
@pa.date.zone.should == '+00:00'
|
93
|
+
end
|
94
|
+
|
95
|
+
end
|
96
|
+
|
67
97
|
end
|
@@ -6,7 +6,8 @@ describe WashingtonPostPageParserFactory do
|
|
6
6
|
before do
|
7
7
|
@valid_urls = [
|
8
8
|
'http://www.washingtonpost.com/world/will-a-bust-follow-the-boom-in-britain/2014/01/18/3677a6ae-7f9d-11e3-97d3-b9925ce2c57b_story.html?tid=hpModule_04941f10-8a79-11e2-98d9-3012c1cd8d1e&hpid=z16',
|
9
|
-
'http://www.washingtonpost.com/business/technology/nsa-program-defenders-question-snowdens-motives/2014/01/19/091fccaa-811d-11e3-bbe5-6a2a3141e3a9_story.html'
|
9
|
+
'http://www.washingtonpost.com/business/technology/nsa-program-defenders-question-snowdens-motives/2014/01/19/091fccaa-811d-11e3-bbe5-6a2a3141e3a9_story.html',
|
10
|
+
'https://www.washingtonpost.com/world/middle_east/israel-ambassador-to-us-sends-anti-boycott-message-with-gift/2015/12/23/652d639c-a99b-11e5-b596-113f59ee069a_story.html'
|
10
11
|
]
|
11
12
|
@invalid_urls = [
|
12
13
|
'http://www.washingtonpost.com/politics/',
|
@@ -119,4 +120,95 @@ describe WashingtonPostPageParserV1 do
|
|
119
120
|
|
120
121
|
end
|
121
122
|
|
123
|
+
describe 'when parsing the Israeli ambassador article' do
|
124
|
+
before do
|
125
|
+
@valid_options = {
|
126
|
+
:url => 'https://www.washingtonpost.com/world/middle_east/israel-ambassador-to-us-sends-anti-boycott-message-with-gift/2015/12/23/652d639c-a99b-11e5-b596-113f59ee069a_story.html',
|
127
|
+
:page => File.read('spec/fixtures/washingtonpost/israeli-ambassador.html'),
|
128
|
+
:valid_hash => 'c2e80bf1012949bf3a124576466b1b40'
|
129
|
+
}
|
130
|
+
@pa = WashingtonPostPageParserV1.new(@valid_options)
|
131
|
+
end
|
132
|
+
|
133
|
+
it "should parse the title" do
|
134
|
+
@pa.title.should eq 'Israel ambassador to US sends anti-boycott message with gift'
|
135
|
+
end
|
136
|
+
|
137
|
+
it 'should parse the date in UTC' do
|
138
|
+
@pa.date.should eq DateTime.parse("December 23 2015")
|
139
|
+
@pa.date.zone.should eq '+00:00'
|
140
|
+
end
|
141
|
+
|
142
|
+
it "should contain no javascript" do
|
143
|
+
@pa.content.join(' ').should_not =~ /function/
|
144
|
+
end
|
145
|
+
|
146
|
+
it "should parse the content" do
|
147
|
+
@pa.content[0].should eq 'JERUSALEM — Israel’s ambassador to the United States has dispatched a politically charged holiday gift.'
|
148
|
+
@pa.content.size.should eq 6
|
149
|
+
@pa.hash.should == @valid_options[:valid_hash]
|
150
|
+
end
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
describe WashingtonPostPageParserV2 do
|
155
|
+
describe 'when parsing the trump kim simmit article' do
|
156
|
+
before do
|
157
|
+
@valid_options = {
|
158
|
+
:url => 'https://www.washingtonpost.com/politics/trump-kim-summit-trump-says-we-have-developed-a-very-special-bond-at-end-of-historic-meeting/2018/06/12/ff43465a-6dba-11e8-bf86-a2351b5ece99_story.html?utm_term=.392b71a75a35',
|
159
|
+
:page => File.read('spec/fixtures/washingtonpost/trump-kim-summit.html'),
|
160
|
+
:valid_hash => '5b703096157e74b65fdf00fd9227ebbc'
|
161
|
+
}
|
162
|
+
@pa = WashingtonPostPageParserV2.new(@valid_options)
|
163
|
+
end
|
164
|
+
|
165
|
+
it "should parse the guid" do
|
166
|
+
@pa.guid.should eq "ff43465a-6dba-11e8-bf86-a2351b5ece99"
|
167
|
+
end
|
168
|
+
|
169
|
+
it "should parse the title" do
|
170
|
+
@pa.title.should eq 'Trump-Kim summit: Trump says after historic meeting, ‘We have developed a very special bond’'
|
171
|
+
end
|
172
|
+
|
173
|
+
it 'should parse the date in UTC' do
|
174
|
+
@pa.date.should eq DateTime.parse("2018-06-12T11:47:00-05:00")
|
175
|
+
end
|
176
|
+
|
177
|
+
it "should parse the content" do
|
178
|
+
@pa.content[0].should eq 'SINGAPORE — President Trump concluded a historic summit with North Korean leader Kim Jong Un here Tuesday by sketching a path to prosperity for the isolated nation. But it remained highly uncertain whether the young dictator would embrace the offer by agreeing to eliminate his nuclear arsenal.'
|
179
|
+
@pa.content.last.should eq 'Carol Morello in Washington and Brian Murphy in Seoul contributed to this report.'
|
180
|
+
@pa.content[9].should eq 'Trump said that aides would begin additional talks soon and that he would potentially invite Kim to the White House and be open to visiting Pyongyang “at the appropriate time.” Yet he also acknowledged that disarmament would not come quickly.'
|
181
|
+
@pa.content.size.should eq 45
|
182
|
+
@pa.hash.should eq @valid_options[:valid_hash]
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
186
|
+
describe 'when parsing the bust-boom article from 2018' do
|
187
|
+
before do
|
188
|
+
@valid_options = {
|
189
|
+
:url => 'https://www.washingtonpost.com/world/will-a-bust-follow-the-boom-in-britain/2014/01/18/3677a6ae-7f9d-11e3-97d3-b9925ce2c57b_story.html?tid=hpModule_04941f10-8a79-11e2-98d9-3012c1cd8d1e&hpid=z16',
|
190
|
+
:page => File.read('spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain-2018.html'),
|
191
|
+
:valid_hash => 'bbcdda8a8dffcabf71088039fb366e34'
|
192
|
+
}
|
193
|
+
@pa = WashingtonPostPageParserV2.new(@valid_options)
|
194
|
+
end
|
195
|
+
|
196
|
+
it "should parse the title" do
|
197
|
+
@pa.title.should == 'Will a bust follow the boom in Britain?'
|
198
|
+
end
|
199
|
+
|
200
|
+
it 'should parse the date in UTC' do
|
201
|
+
@pa.date.should eq DateTime.parse("2014-01-18T05:43:00-05:00")
|
202
|
+
@pa.date.zone.should eq '+00:00'
|
203
|
+
end
|
204
|
+
|
205
|
+
it "should parse the content" do
|
206
|
+
@pa.content[0].should eq 'LONDON — For decades, the modest two-bedroom apartment off Abbey Road was home to some of London’s neediest, a small, leaky outpost in this city’s vast constellation of public housing.'
|
207
|
+
@pa.content[12].should eq 'Crazy in the capital'
|
208
|
+
@pa.content.last.should eq '“It’s about time the government did something to help,” he said. “I don’t come from a rich family, so I don’t have parents who will give 15,000 pounds for a deposit. That’s not available to me. I’m genuinely pleased Cameron has done something for the working man, which is me.”'
|
209
|
+
@pa.content.size.should eq 25
|
210
|
+
@pa.hash.should eq @valid_options[:valid_hash]
|
211
|
+
end
|
212
|
+
end
|
213
|
+
|
122
214
|
end
|