web-page-parser 0.25 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +7 -0
  2. checksums.yaml.gz.sig +1 -0
  3. data.tar.gz.sig +0 -0
  4. data/README.rdoc +5 -0
  5. data/lib/web-page-parser.rb +31 -0
  6. data/lib/web-page-parser/base_parser.rb +92 -42
  7. data/lib/web-page-parser/http.rb +63 -0
  8. data/lib/web-page-parser/parser_factory.rb +0 -1
  9. data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +72 -9
  10. data/lib/web-page-parser/parsers/guardian_page_parser.rb +51 -11
  11. data/lib/web-page-parser/parsers/independent_page_parser.rb +56 -0
  12. data/lib/web-page-parser/parsers/new_york_times_page_parser.rb +108 -0
  13. data/lib/web-page-parser/parsers/washingtonpost_page_parser.rb +59 -0
  14. data/spec/base_parser_spec.rb +24 -8
  15. data/spec/fixtures/bbc_news/19957138.stm.html +1974 -0
  16. data/spec/fixtures/bbc_news/20230333.stm.html +2529 -0
  17. data/spec/fixtures/bbc_news/21528631.html +2021 -0
  18. data/spec/fixtures/bbc_news/8040164.stm.html +3095 -0
  19. data/spec/fixtures/cassette_library/BbcNewsPageParserV4.yml +1743 -0
  20. data/spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html +4713 -0
  21. data/spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html +4371 -0
  22. data/spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html +4150 -0
  23. data/spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html +4401 -0
  24. data/spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html +4454 -0
  25. data/spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html +4455 -0
  26. data/spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html +4368 -0
  27. data/spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html +919 -0
  28. data/spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html +328 -0
  29. data/spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html +1164 -0
  30. data/spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html +1 -0
  31. data/spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html +3664 -0
  32. data/spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html +3729 -0
  33. data/spec/parser_factory_spec.rb +3 -3
  34. data/spec/parsers/bbc_news_page_spec.rb +223 -3
  35. data/spec/parsers/guardian_page_spec.rb +157 -4
  36. data/spec/parsers/independent_page_parser_spec.rb +152 -0
  37. data/spec/parsers/new_york_times_page_parser_spec.rb +190 -0
  38. data/spec/parsers/washingtonpost_page_parser_spec.rb +114 -0
  39. data/spec/spec_helper.rb +5 -0
  40. metadata +167 -59
  41. metadata.gz.sig +2 -0
@@ -0,0 +1,152 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'spec_helper'
3
+ require 'net/http'
4
+ include WebPageParser
5
+
6
+ describe IndependentPageParserFactory do
7
+ before do
8
+ @valid_urls = [
9
+ 'http://www.independent.co.uk/news/world/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html?somequery=string',
10
+ 'http://www.independent.co.uk/news/media/tv-radio/the-vex-factor-bbc-produces-almost-identical-programmes-to-those-by-itv-mps-are-told--by-itv-9059695.html',
11
+ ]
12
+ @invalid_urls = [
13
+ 'http://www.independent.co.uk/sport/rugby/rugby-union/',
14
+ 'http://www.independent.co.uk/news/business/',
15
+ 'http://www.independent.co.uk/news/pictures/spencer-tunicks-nude-art-installations-9067645.html',
16
+ 'http://www.independent.co.uk/sport/rugby/rugby-union/international/chris-robshaw-flanker-answers-all-the-questions-about-red-rose-captaincy-9065632.html'
17
+ ]
18
+ end
19
+
20
+ it "should detect independent articles from the url" do
21
+ @valid_urls.each do |url|
22
+ IndependentPageParserFactory.can_parse?(:url => url).should be_true
23
+ end
24
+ end
25
+
26
+ it "should ignore pages with the wrong url format" do
27
+ @invalid_urls.each do |url|
28
+ IndependentPageParserFactory.can_parse?(:url => url).should be_nil
29
+ end
30
+ end
31
+
32
+ end
33
+
34
+
35
+ describe IndependentPageParserV1 do
36
+ describe "when parsing the Saudi authorities article" do
37
+ before do
38
+ @valid_options = {
39
+ :url => 'www.independent.co.uk/news/world/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html',
40
+ :page => File.read("spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html"),
41
+ :valid_hash => 'cfc2994d68b1c59e10bf3225ae557d31'
42
+ }
43
+ @pa = IndependentPageParserV1.new(@valid_options)
44
+ end
45
+
46
+ it "should parse the title" do
47
+ @pa.title.should == "Saudi authorities stop text-message tracking of women… for now"
48
+ end
49
+
50
+ it "should parse the date" do
51
+ @pa.date.should == DateTime.parse("Jan 16 2014")
52
+ end
53
+
54
+ it "should calculate the hash correctly" do
55
+ @pa.hash.should == @valid_options[:valid_hash]
56
+ end
57
+
58
+ it "should parse the content" do
59
+ @pa.content[0].should == "A monitoring system which sends text alerts to Saudi women’s male 'guardians' when they cross the border has been temporarily suspended."
60
+ @pa.content[3].should == 'Many Saudi women have welcomed the freeze of the measure, including Sabria S. Jawhar, a Saudi columnist and assistant professor of applied linguistics at King Saud bin Abdulaziz University for Health Sciences.'
61
+ @pa.content.size.should == 11
62
+ end
63
+ end
64
+
65
+ describe "when parsing the syria article" do
66
+ before do
67
+ @valid_options = {
68
+ :url => 'http://www.independent.co.uk/news/world/middle-east/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html',
69
+ :page => File.read('spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html'),
70
+ :valid_hash => 'bb552f44352d9807bc0299e5d12f6f0e'
71
+ }
72
+ @pa = IndependentPageParserV1.new(@valid_options)
73
+ end
74
+
75
+ it "should parse the title" do
76
+ @pa.title.should == "Innocent, starving, close to death: One victim of the siege that shames Syria"
77
+ end
78
+
79
+ it "should parse the date" do
80
+ @pa.date.should == DateTime.parse('Jan 16 2014')
81
+ end
82
+
83
+ it "should calculate the hash correctly" do
84
+ @pa.hash.should == @valid_options[:valid_hash]
85
+ end
86
+
87
+ it "should parse the content" do
88
+ @pa.content[0].should == "Israa al-Masri was still a toddler when she lost her battle to cling to life. But the image of her face, pictured just minutes before she finally succumbed to starvation, is becoming the symbol of a wider nightmare."
89
+ @pa.content.last.should == 'Given the weight of evidence of extreme hunger within Yarmouk, there is no particular reason to doubt the authenticity of other images such as the one above, which is also taken from footage supplied by an activist group, and featured by Al Jazeera. But despite rigorous checks, there is still no sure way of verifying them.'
90
+ @pa.content.size.should == 37
91
+ end
92
+ end
93
+
94
+ describe "when parsing the belgian-man article" do
95
+ before do
96
+ @valid_options = {
97
+ :url => 'http://www.independent.co.uk/news/world/europe/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html',
98
+ :page => File.read('spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html'),
99
+ :valid_hash => '8e7ca0b6a3c3de210c743a36cea05990'
100
+ }
101
+ @pa = IndependentPageParserV1.new(@valid_options)
102
+ end
103
+
104
+ it "should parse the title" do
105
+ @pa.title.should == 'Belgian man who skipped 100 restaurant bills is killed'
106
+ end
107
+
108
+ it "should parse the date" do
109
+ @pa.date.should == DateTime.parse('Jan 23 2014')
110
+ end
111
+
112
+ it "should calculate the hash correctly" do
113
+ @pa.hash.should == @valid_options[:valid_hash]
114
+ end
115
+
116
+ it "should parse the content" do
117
+ @pa.content[0].should == 'He was a “happy-go-lucky” guy who was notorious for spicing up life on benefits in the medieval Belgian town of Ghent by strolling into a restaurant, calmly ordering lobster washed down with the finest brandy or some other gastronomic delight and then walking out without paying the bill.'
118
+ @pa.content[1].should == 'But, after 100 or so incidents spread over a five-year spree, Titus Clarysse has turned up dead, prompting police to launch an investigation of “murder or manslaughter”.'
119
+ @pa.content.size.should == 19
120
+ end
121
+ end
122
+
123
+ describe 'when parsing the uk-sanctuary article' do
124
+ before do
125
+ @valid_options = {
126
+ :url => 'http://www.independent.co.uk/news/uk/politics/david-cameron-set-for-uturn-over-uk-sanctuary-for-most-vulnerable-syria-refugees-following-aid-agencies-plea-9077647.html',
127
+ :page => File.read('spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html'),
128
+ :valid_hash => '37a6522307cea4fbae37b8eb52191c1d'
129
+ }
130
+ @pa = IndependentPageParserV1.new(@valid_options)
131
+ end
132
+
133
+ it "should parse the title" do
134
+ @pa.title.should == 'David Cameron set for U-turn over UK sanctuary for most vulnerable Syria refugees following plea by aid agencies'
135
+ end
136
+
137
+ it "should parse the date" do
138
+ @pa.date.should == DateTime.parse('Jan 23 2014')
139
+ end
140
+
141
+ it "should calculate the hash correctly" do
142
+ @pa.hash.should == @valid_options[:valid_hash]
143
+ end
144
+
145
+ it "should parse the content" do
146
+ @pa.content[0].should == 'David Cameron opened the door yesterday for Britain to give sanctuary to some of the most vulnerable Syrian refugees trapped in appalling conditions in neighbouring countries.'
147
+ @pa.content.join(' ').should_not =~ /brightcove/
148
+ @pa.content.size.should == 16
149
+ end
150
+ end
151
+
152
+ end
@@ -0,0 +1,190 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'spec_helper'
3
+ require 'net/http'
4
+ include WebPageParser
5
+
6
+ describe NewYorkTimesPageParserFactory do
7
+ before do
8
+ @valid_urls = [
9
+ "http://www.nytimes.com/2012/01/28/us/politics/no-more-nice-guys-fans-love-nuclear-newt.html?_r=1&ref=us",
10
+ "http://www.nytimes.com/2012/01/29/business/global/greece-in-talks-with-creditors-on-debt-deal.html",
11
+ ]
12
+ @invalid_urls = [
13
+ "http://cityroom.blogs.nytimes.com/2012/01/27/the-week-in-pictures-for-jan-27/",
14
+ "http://www.nytimes.com/pages/world/asia/index.html"
15
+ ]
16
+ end
17
+
18
+ it "should detect new york times articles from the url" do
19
+ @valid_urls.each do |url|
20
+ NewYorkTimesPageParserFactory.can_parse?(:url => url).should be_true
21
+ end
22
+ end
23
+
24
+ it "should ignore pages with the wrong url format" do
25
+ @invalid_urls.each do |url|
26
+ NewYorkTimesPageParserFactory.can_parse?(:url => url).should be_nil
27
+ end
28
+ end
29
+
30
+ end
31
+
32
+
33
+ describe NewYorkTimesPageParserV1 do
34
+ describe "when parsing the Gingrich article" do
35
+ before do
36
+ @valid_options = {
37
+ :url => 'http://www.nytimes.com/2012/01/27/us/politics/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html?src=me&ref=general',
38
+ :page => File.read("spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html"),
39
+ :valid_hash => '7562feadc3db5c9a4c474cc0e9db421a'
40
+ }
41
+ @pa = NewYorkTimesPageParserV1.new(@valid_options)
42
+ end
43
+
44
+ it "should parse the title" do
45
+ @pa.title.should == "Gingrich Stuck to Caustic Path in Ethics Battles"
46
+ end
47
+
48
+ it "should parse the date" do
49
+ @pa.date.should == DateTime.parse("Sat Jan 26 2012")
50
+ end
51
+
52
+ it "should calculate the hash correctly" do
53
+ @pa.hash.should == @valid_options[:valid_hash]
54
+ end
55
+
56
+ it "should parse the content" do
57
+ @pa.content[0].should == "WASHINGTON — Newt Gingrich had an urgent warning for conservatives: Jim Wright, the Democratic speaker of the House, was out to destroy America."
58
+ @pa.content[4].should == "Mr. Gingrich, Democrats and Republicans here agree, emerged as one of Washington’s most aggressive practitioners of slash-and-burn politics; many fault him for erasing whatever civility once existed in the capital. He believed, and preached, that harsh language could win elections; in 1990, the political action committee he ran, Gopac, instructed Republican candidates to learn to “speak like Newt,” and offered a list of words to describe Democrats — like decay, traitors, radical, sick, destroy, pathetic, corrupt and shame."
59
+ @pa.content.size.should == 48
60
+ end
61
+ end
62
+
63
+ describe "when parsing the hamas-leader article" do
64
+ before do
65
+ @valid_options = {
66
+ :url => 'http://www.nytimes.com/2012/01/28/world/middleeast/khaled-meshal-the-leader-of-hamas-vacates-damascus.html',
67
+ :page => File.read("spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html"),
68
+ :valid_hash => '99ae48e19224402890b380019ca5fbda'
69
+ }
70
+ @pa = NewYorkTimesPageParserV1.new(@valid_options)
71
+ end
72
+
73
+ it "should parse the title" do
74
+ @pa.title.should == "Hamas Leader Abandons Longtime Base in Damascus"
75
+ end
76
+
77
+ it "should parse the date" do
78
+ @pa.date.should == DateTime.parse("Fri Jan 27 2012")
79
+ end
80
+
81
+ it "should calculate the hash correctly" do
82
+ @pa.hash.should == @valid_options[:valid_hash]
83
+ end
84
+
85
+ it "should parse the content" do
86
+ @pa.content[0].should == "GAZA — Khaled Meshal, the leader of the Palestinian Islamist movement Hamas, has effectively abandoned his longtime base in Syria, where a popular uprising has left thousands dead, and has no plans to return, Hamas sources in Gaza said Friday."
87
+ @pa.content[4].should == %Q{On Sunday, Mr. Meshal is scheduled to make his first official visit to Jordan since he was deported in 1999. Qatar, one of Mr. Assad’s most vocal Arab critics, played mediator in arranging for Mr. Meshal’s visit to Jordan, which is expected to include a meeting with King Abdullah II. Jordan was the first Arab country to urge Mr. Assad to step down.}
88
+ @pa.content.last.should == "Ethan Bronner contributed reporting from Jerusalem."
89
+ @pa.content.size.should == 7
90
+ @pa.hash.should == @valid_options[:valid_hash]
91
+ end
92
+ end
93
+ end
94
+
95
+ describe NewYorkTimesPageParserV2 do
96
+ describe "when parsing the Gingrich article" do
97
+ before do
98
+ @valid_options = {
99
+ :url => 'http://www.nytimes.com/2012/01/27/us/politics/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html?src=me&ref=general',
100
+ :page => File.read("spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html"),
101
+ :valid_hash => '7562feadc3db5c9a4c474cc0e9db421a'
102
+ }
103
+ @pa = NewYorkTimesPageParserV2.new(@valid_options)
104
+ end
105
+
106
+ it "should parse the title" do
107
+ @pa.title.should == "Gingrich Stuck to Caustic Path in Ethics Battles"
108
+ end
109
+
110
+ it "should parse the date" do
111
+ @pa.date.should == DateTime.parse("Sat Jan 26 2012")
112
+ end
113
+
114
+ it "should calculate the hash correctly" do
115
+ @pa.hash.should == @valid_options[:valid_hash]
116
+ end
117
+
118
+ it "should parse the content" do
119
+ @pa.content[0].should == "WASHINGTON — Newt Gingrich had an urgent warning for conservatives: Jim Wright, the Democratic speaker of the House, was out to destroy America."
120
+ @pa.content[4].should == "Mr. Gingrich, Democrats and Republicans here agree, emerged as one of Washington’s most aggressive practitioners of slash-and-burn politics; many fault him for erasing whatever civility once existed in the capital. He believed, and preached, that harsh language could win elections; in 1990, the political action committee he ran, Gopac, instructed Republican candidates to learn to “speak like Newt,” and offered a list of words to describe Democrats — like decay, traitors, radical, sick, destroy, pathetic, corrupt and shame."
121
+ @pa.content.size.should == 48
122
+ end
123
+ end
124
+
125
+ describe "when parsing the French comedian article" do
126
+ before do
127
+ @valid_options = {
128
+ :url => 'http://www.nytimes.com/2014/01/12/world/europe/show-banned-french-comedian-has-new-one.html',
129
+ :page => File.read('spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html'),
130
+ :valid_hash => 'ab9cafaac593c12b5b457a5bfdd3eda5'
131
+ }
132
+ @pa = NewYorkTimesPageParserV2.new(@valid_options)
133
+ end
134
+
135
+ it "should parse the title" do
136
+ @pa.title.should == 'Show Banned, French Comedian Has New One'
137
+ end
138
+
139
+ it "should parse the date" do
140
+ @pa.date.should == DateTime.parse("Jan 12th 2014")
141
+ end
142
+
143
+ it "should calculate the hash correctly" do
144
+ @pa.hash.should == @valid_options[:valid_hash]
145
+ end
146
+
147
+ it "should parse the content" do
148
+ @pa.content[0].should == 'PARIS — A French comedian said Saturday that he had dropped a show banned for its anti-Semitic language and was planning one that would cause no objections.'
149
+ @pa.content[3].should == '“We live in a democratic country and I have to comply with the laws, despite the blatant political interference,” he said. “As a comedian, I have pushed the debate to the very edge of laughter.”'
150
+ @pa.content.size.should == 18
151
+ end
152
+ end
153
+
154
+
155
+ describe "retrieve_page" do
156
+ it "should retrieve the article from the nyt website" do
157
+ @pa = NewYorkTimesPageParserV1.new(:url => "http://www.nytimes.com/2012/08/22/us/politics/ignoring-calls-to-quit-akin-appeals-to-voters-in-ad.html?hp")
158
+ @pa.title.should =~ /ignoring/i
159
+ end
160
+
161
+ it "should retrieve the full article from the nyt website when given a first page url" do
162
+ @pa = NewYorkTimesPageParserV1.new(:url => "http://www.nytimes.com/2012/08/21/world/middleeast/syrian-rebels-coalesce-into-a-fighting-force.html?ref=world")
163
+ @pa.content.size.should > 40
164
+ @pa = NewYorkTimesPageParserV1.new(:url => "http://www.nytimes.com/2012/08/21/world/middleeast/syrian-rebels-coalesce-into-a-fighting-force.html")
165
+ @pa.content.size.should > 40
166
+ end
167
+
168
+ it "should retrieve more than the paywall url limit" do
169
+ urls = []
170
+ [
171
+ "http://feeds.nytimes.com/nyt/rss/HomePage",
172
+ "http://rss.nytimes.com/services/xml/rss/nyt/GlobalHome.xml",
173
+ "http://feeds.nytimes.com/nyt/rss/NYRegion",
174
+ "http://www.nytimes.com/services/xml/rss/nyt/World.xml"
175
+ ].each do |fu|
176
+ urls += Net::HTTP.get(URI(fu)).scan(/http:\/\/www.nytimes.com\/[0-9]{4}\/[^<"?]+/)
177
+ end
178
+
179
+ urls.uniq!
180
+ pending("Failing spec but works in practise. Needs a looksee.") { urls.size.should > 25 }
181
+ urls[0..24].each_with_index do |u,i|
182
+ @pa = NewYorkTimesPageParserV1.new(:url => u)
183
+ @pa.page.curl.header_str.to_s.scan(/^Location: .*/).grep(/myaccount.nytimes.com/).should be_empty
184
+ @pa.title.should_not =~ /^Log In/
185
+ end
186
+ end
187
+
188
+ end
189
+
190
+ end
@@ -0,0 +1,114 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'spec_helper'
3
+ include WebPageParser
4
+
5
+ describe WashingtonPostPageParserFactory do
6
+ before do
7
+ @valid_urls = [
8
+ 'http://www.washingtonpost.com/world/will-a-bust-follow-the-boom-in-britain/2014/01/18/3677a6ae-7f9d-11e3-97d3-b9925ce2c57b_story.html?tid=hpModule_04941f10-8a79-11e2-98d9-3012c1cd8d1e&hpid=z16',
9
+ 'http://www.washingtonpost.com/business/technology/nsa-program-defenders-question-snowdens-motives/2014/01/19/091fccaa-811d-11e3-bbe5-6a2a3141e3a9_story.html'
10
+ ]
11
+ @invalid_urls = [
12
+ 'http://www.washingtonpost.com/politics/',
13
+ 'http://www.washingtonpost.com/local/',
14
+ 'http://www.washingtonpost.com/blogs/worldviews/',
15
+ 'http://www.washingtonpost.com/blogs/worldviews/wp/tag/iran/'
16
+ ]
17
+ end
18
+
19
+ it "should detect washpo articles from the url" do
20
+ @valid_urls.each do |url|
21
+ WashingtonPostPageParserFactory.can_parse?(:url => url).should be_true
22
+ end
23
+ end
24
+
25
+ it "should ignore pages with the wrong url format" do
26
+ @invalid_urls.each do |url|
27
+ WashingtonPostPageParserFactory.can_parse?(:url => url).should be_nil
28
+ end
29
+ end
30
+
31
+ end
32
+
33
+ describe WashingtonPostPageParserV1 do
34
+
35
+ describe 'when parsing the al-shabab article' do
36
+ before do
37
+ @valid_options = {
38
+ :url => 'http://www.washingtonpost.com/world/national-security/pentagon-confirms-al-shabab-leader-killed-in-airstrike-in-somalia/2014/09/05/fc9fee06-3512-11e4-9e92-0899b306bbea_story.html',
39
+ :page => File.read('spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html'),
40
+ :valid_hash => 'FIXME'
41
+ }
42
+ @pa = WashingtonPostPageParserV1.new(@valid_options)
43
+
44
+ end
45
+
46
+ it 'should parse the title' do
47
+ @pa.title.should == 'White House confirms al-Shabab leader killed in airstrike in Somalia'
48
+ end
49
+
50
+ it 'should parse the content' do
51
+ @pa.content[0].should == 'In a major setback for al-Qaeda’s affiliate in East Africa, the Obama administration said Friday it had confirmed the death of a key Somali militant leader who had been targeted in an airstrike earlier in the week.'
52
+ end
53
+ end
54
+
55
+ describe 'when parsing the bust-boom article' do
56
+ before do
57
+ @valid_options = {
58
+ :url => 'http://www.washingtonpost.com/world/will-a-bust-follow-the-boom-in-britain/2014/01/18/3677a6ae-7f9d-11e3-97d3-b9925ce2c57b_story.html?tid=hpModule_04941f10-8a79-11e2-98d9-3012c1cd8d1e&hpid=z16',
59
+ :page => File.read('spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html'),
60
+ :valid_hash => '86020be298247aaecfc53e3d66f8c6ee'
61
+ }
62
+ @pa = WashingtonPostPageParserV1.new(@valid_options)
63
+ end
64
+
65
+ it "should parse the title" do
66
+ @pa.title.should == 'Will a bust follow the boom in Britain?'
67
+ end
68
+
69
+ it 'should parse the date in UTC' do
70
+ @pa.date.should == DateTime.parse("January 18th 2014")
71
+ @pa.date.zone.should == '+00:00'
72
+ end
73
+
74
+ it "should parse the content" do
75
+ @pa.content[0].should == 'LONDON — For decades, the modest two-bedroom apartment off Abbey Road was home to some of London’s neediest, a small, leaky outpost in this city’s vast constellation of public housing.'
76
+ @pa.content[12].should == 'Crazy in the capital'
77
+ @pa.content.last.should == '“It’s about time the government did something to help,” he said. “I don’t come from a rich family, so I don’t have parents who will give 15,000 pounds for a deposit. That’s not available to me. I’m genuinely pleased Cameron has done something for the working man, which is me.”'
78
+ @pa.content.size.should == 25
79
+ @pa.hash.should == @valid_options[:valid_hash]
80
+ end
81
+ end
82
+
83
+ describe 'when parsing the sgt bowe article' do
84
+ before do
85
+ @valid_options = {
86
+ :url => 'http://www.washingtonpost.com/world/national-security/sgt-bowe-bergdahls-capture-remains-amystery/2014/01/15/4f8ef686-7e28-11e3-9556-4a4bf7bcbd84_story.html?wprss=rss_national-security',
87
+ :page => File.read('spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html'),
88
+ :valid_hash => '1fd07efe6bfbf5c4551e88d09a663e25'
89
+ }
90
+ @pa = WashingtonPostPageParserV1.new(@valid_options)
91
+ end
92
+
93
+ it "should parse the title" do
94
+ @pa.title.should == 'Sgt. Bowe Bergdahl’s capture remains a mystery'
95
+ end
96
+
97
+ it 'should parse the date in UTC' do
98
+ @pa.date.should == DateTime.parse("January 15th 2014")
99
+ @pa.date.zone.should == '+00:00'
100
+ end
101
+
102
+ it "should contain no javascript" do
103
+ @pa.content.join(' ').should_not =~ /function/
104
+ end
105
+
106
+ it "should parse the content" do
107
+ @pa.content[0].should == 'Correction: An earlier version of this article misspelled the name of Sgt. Bowe Bergdahl.'
108
+ @pa.content.size.should == 8 # The blockquote ends up as one big paragraph
109
+ @pa.hash.should == @valid_options[:valid_hash]
110
+ end
111
+
112
+ end
113
+
114
+ end