web-page-parser 1.1.0 → 1.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +56 -3
- data/lib/web-page-parser/parsers/guardian_page_parser.rb +33 -3
- data/lib/web-page-parser/parsers/independent_page_parser.rb +2 -2
- data/lib/web-page-parser/parsers/new_york_times_page_parser.rb +8 -3
- data/lib/web-page-parser/parsers/rt_page_parser.rb +49 -0
- data/lib/web-page-parser/parsers/the_intercept_page_parser.rb +6 -1
- data/lib/web-page-parser/parsers/washingtonpost_page_parser.rb +38 -2
- data/spec/fixtures/bbc_news/31014941.html +1123 -0
- data/spec/fixtures/bbc_news/32271505.html +1168 -0
- data/spec/fixtures/bbc_news/32275608.html +1142 -0
- data/spec/fixtures/guardian/duplicate-headline.html +2735 -0
- data/spec/fixtures/guardian/julian-assange-donald-trump-hillary-clinton-interview.html +1752 -0
- data/spec/fixtures/guardian/university-extremist-speakers.html +1590 -0
- data/spec/fixtures/independent/boris-johnson.html +1086 -0
- data/spec/fixtures/independent/lord-burns.html +726 -0
- data/spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one-2018.html +305 -0
- data/spec/fixtures/new_york_times/trump-kim-policy.html +305 -0
- data/spec/fixtures/rt/338045.html +682 -0
- data/spec/fixtures/rt/338237.html +682 -0
- data/spec/fixtures/theintercept/pentagon-missionary.html +211 -0
- data/spec/fixtures/washingtonpost/israeli-ambassador.html +747 -0
- data/spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery-2018.html +381 -0
- data/spec/fixtures/washingtonpost/trump-kim-summit.html +379 -0
- data/spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain-2018.html +386 -0
- data/spec/parsers/bbc_news_page_spec.rb +132 -11
- data/spec/parsers/guardian_page_spec.rb +100 -0
- data/spec/parsers/independent_page_parser_spec.rb +52 -0
- data/spec/parsers/new_york_times_page_parser_spec.rb +75 -10
- data/spec/parsers/rt_page_parser_spec.rb +87 -0
- data/spec/parsers/the_intercept_page_parser_spec.rb +30 -0
- data/spec/parsers/washingtonpost_page_parser_spec.rb +93 -1
- data/spec/web-page-parser +1 -0
- metadata +98 -56
- metadata.gz.sig +0 -0
@@ -0,0 +1 @@
|
|
1
|
+
spec/../spec/
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web-page-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Leach
|
@@ -10,28 +10,31 @@ bindir: bin
|
|
10
10
|
cert_chain:
|
11
11
|
- |
|
12
12
|
-----BEGIN CERTIFICATE-----
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
+
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
13
|
+
MIIERjCCAq6gAwIBAgIBATANBgkqhkiG9w0BAQsFADAoMSYwJAYDVQQDDB1qb2hu
|
14
|
+
L0RDPWpvaG5sZWFjaC9EQz1jby9EQz11azAeFw0xODA3MjUxMDQ2MTRaFw0yMDA3
|
15
|
+
MjQxMDQ2MTRaMCgxJjAkBgNVBAMMHWpvaG4vREM9am9obmxlYWNoL0RDPWNvL0RD
|
16
|
+
PXVrMIIBojANBgkqhkiG9w0BAQEFAAOCAY8AMIIBigKCAYEAxq6iwNdGb7lFZAHA
|
17
|
+
of8e2Akt6ziqAv9JC0GTPULqy474MfwykTOg6nharVTpSonUUWatbo4e3YDITMWB
|
18
|
+
wbiawE86ourm9onuMrDsdLJlyVXFPTJWLo8LBgpIIJ1jtD8TiqJgr+Qtd3yCgvYv
|
19
|
+
F1iXhEXBXs6x6WymJbCSlFFrAk/z6iOUGFjWIvkFX50UI7eL3Khym8APkBnyRlxz
|
20
|
+
/P3sySP/LdABk31H0dQp5DvcN8RHRg2UeKa+Ey+xPqg4TtMm8uY2uot/qO4jhW/Z
|
21
|
+
YbJNRFUr0C0rEgI4oGZQ0MIOurAhdkAaRSKbxfwIJoaDeER8Y2Yggnb61J5LOEgE
|
22
|
+
G8WtIAbp5LYcYwgJ2rRvc6X3E1tWM4d7Bo0FGTS0w6HmjtzLMXjiUgtWRFHF1I9S
|
23
|
+
V+nVZ1FL8/XSlAAuqD//6Dw20u+3Qoau3iw/PUdN6ODfAH6USBYqj2nH/m8VDtoZ
|
24
|
+
2my9UCZ+/xjsxn5aBKlXkQ3En8B61Es6vrgFHoQYpeKhBsUvAgMBAAGjezB5MAkG
|
25
|
+
A1UdEwQCMAAwCwYDVR0PBAQDAgSwMB0GA1UdDgQWBBQ4o+vdcZcr0kfV4LdYBog5
|
26
|
+
dCrJvzAfBgNVHREEGDAWgRRqb2huQGpvaG5sZWFjaC5jby51azAfBgNVHRIEGDAW
|
27
|
+
gRRqb2huQGpvaG5sZWFjaC5jby51azANBgkqhkiG9w0BAQsFAAOCAYEAtPF2fUxO
|
28
|
+
5iVE/AMac4KaEEZTF7anQFOldRiz0noI48OXN2bvtZtuq8QucIMMtGbtEWPOigGh
|
29
|
+
oeWv0K7ZWvUNixRoWp3enSSByFwMF7UYpTBG54mos8UMfBj4C/pjvdqnJ34li2Ur
|
30
|
+
mb2vxJQAGNzzbfvcnNin0wM/iPRKuvM7OTalNnEOOaOehHvqClBlRU71Dp3BngXA
|
31
|
+
CD+fSJptjeixJY2pzSsZPDui84zIkHJtWyY77mrkEyQkRAdThf9Xnn3UOZ0yF0m/
|
32
|
+
kbEN/MgvvoWBtfReYx/uMT1PzdiRrhaThEbmQV4JNFNUtgbewTNCVNvtMq9q0j2t
|
33
|
+
nCtWWXRiGW/lJMBIKFHoe1l0m/IlC1Adji1QjosLFDqz+K+OglsJG5F6fF6gCvhC
|
34
|
+
xBDK27vsuKyrp/Zcg0HyYzJMpDu9NErKki8MXzlpOfrvR15NuEL267bvpI5QA3m/
|
35
|
+
KzVFklk+44JKTYFrr6Nu+BoGS9jVpzdBFobVQZmJO4Cugqxl40zSpZqj
|
33
36
|
-----END CERTIFICATE-----
|
34
|
-
date:
|
37
|
+
date: 2018-06-17 00:00:00.000000000 Z
|
35
38
|
dependencies:
|
36
39
|
- !ruby/object:Gem::Dependency
|
37
40
|
name: htmlentities
|
@@ -104,8 +107,8 @@ dependencies:
|
|
104
107
|
- !ruby/object:Gem::Version
|
105
108
|
version: '0'
|
106
109
|
description: A Ruby library to parse the content out of web pages. Currently supports
|
107
|
-
BBC News pages, The Guardian, Independent, New York Times
|
108
|
-
Used by the News Sniffer project.
|
110
|
+
BBC News pages, The Guardian, Independent, New York Times, RT, Washington Post and
|
111
|
+
The Intercept articles. Used by the News Sniffer project. https://www.newssniffer.co.uk
|
109
112
|
email: john@johnleach.co.uk
|
110
113
|
executables: []
|
111
114
|
extensions: []
|
@@ -123,6 +126,7 @@ files:
|
|
123
126
|
- lib/web-page-parser/parsers/guardian_page_parser.rb
|
124
127
|
- lib/web-page-parser/parsers/independent_page_parser.rb
|
125
128
|
- lib/web-page-parser/parsers/new_york_times_page_parser.rb
|
129
|
+
- lib/web-page-parser/parsers/rt_page_parser.rb
|
126
130
|
- lib/web-page-parser/parsers/test_page_parser.rb
|
127
131
|
- lib/web-page-parser/parsers/the_intercept_page_parser.rb
|
128
132
|
- lib/web-page-parser/parsers/washingtonpost_page_parser.rb
|
@@ -135,6 +139,9 @@ files:
|
|
135
139
|
- spec/fixtures/bbc_news/19957138.stm.html
|
136
140
|
- spec/fixtures/bbc_news/20230333.stm.html
|
137
141
|
- spec/fixtures/bbc_news/21528631.html
|
142
|
+
- spec/fixtures/bbc_news/31014941.html
|
143
|
+
- spec/fixtures/bbc_news/32271505.html
|
144
|
+
- spec/fixtures/bbc_news/32275608.html
|
138
145
|
- spec/fixtures/bbc_news/6072486.stm.html
|
139
146
|
- spec/fixtures/bbc_news/7745137.stm.html
|
140
147
|
- spec/fixtures/bbc_news/8011268.stm.html
|
@@ -145,29 +152,45 @@ files:
|
|
145
152
|
- spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html
|
146
153
|
- spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
|
147
154
|
- spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html
|
155
|
+
- spec/fixtures/guardian/duplicate-headline.html
|
156
|
+
- spec/fixtures/guardian/julian-assange-donald-trump-hillary-clinton-interview.html
|
148
157
|
- spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html
|
149
158
|
- spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
|
159
|
+
- spec/fixtures/guardian/university-extremist-speakers.html
|
150
160
|
- spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html
|
161
|
+
- spec/fixtures/independent/boris-johnson.html
|
151
162
|
- spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html
|
152
163
|
- spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html
|
164
|
+
- spec/fixtures/independent/lord-burns.html
|
153
165
|
- spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html
|
154
166
|
- spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html
|
167
|
+
- spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one-2018.html
|
155
168
|
- spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html
|
156
169
|
- spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html
|
170
|
+
- spec/fixtures/new_york_times/trump-kim-policy.html
|
171
|
+
- spec/fixtures/rt/338045.html
|
172
|
+
- spec/fixtures/rt/338237.html
|
157
173
|
- spec/fixtures/theintercept/canada-proclaiming-war-12-years-shocked-someone-attacked-soldiers.html
|
174
|
+
- spec/fixtures/theintercept/pentagon-missionary.html
|
175
|
+
- spec/fixtures/washingtonpost/israeli-ambassador.html
|
158
176
|
- spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html
|
177
|
+
- spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery-2018.html
|
159
178
|
- spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html
|
179
|
+
- spec/fixtures/washingtonpost/trump-kim-summit.html
|
180
|
+
- spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain-2018.html
|
160
181
|
- spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html
|
161
182
|
- spec/parser_factory_spec.rb
|
162
183
|
- spec/parsers/bbc_news_page_spec.rb
|
163
184
|
- spec/parsers/guardian_page_spec.rb
|
164
185
|
- spec/parsers/independent_page_parser_spec.rb
|
165
186
|
- spec/parsers/new_york_times_page_parser_spec.rb
|
187
|
+
- spec/parsers/rt_page_parser_spec.rb
|
166
188
|
- spec/parsers/the_intercept_page_parser_spec.rb
|
167
189
|
- spec/parsers/washingtonpost_page_parser_spec.rb
|
168
190
|
- spec/spec.opts
|
169
191
|
- spec/spec_helper.rb
|
170
|
-
|
192
|
+
- spec/web-page-parser
|
193
|
+
homepage: https://github.com/johnl/web-page-parser
|
171
194
|
licenses:
|
172
195
|
- MIT
|
173
196
|
metadata: {}
|
@@ -187,49 +210,68 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
187
210
|
version: '0'
|
188
211
|
requirements: []
|
189
212
|
rubyforge_project:
|
190
|
-
rubygems_version: 2.
|
213
|
+
rubygems_version: 2.7.6
|
191
214
|
signing_key:
|
192
215
|
specification_version: 4
|
193
216
|
summary: A parser for various news organisation's web pages
|
194
217
|
test_files:
|
195
|
-
- spec/
|
196
|
-
- spec/
|
197
|
-
- spec/
|
198
|
-
- spec/fixtures/
|
199
|
-
- spec/fixtures/
|
200
|
-
- spec/fixtures/
|
201
|
-
- spec/fixtures/
|
202
|
-
- spec/fixtures/
|
203
|
-
- spec/fixtures/
|
204
|
-
- spec/fixtures/
|
205
|
-
- spec/fixtures/bbc_news/8011268.stm.html
|
206
|
-
- spec/fixtures/bbc_news/8029015.stm.html
|
207
|
-
- spec/fixtures/bbc_news/8040164.stm.html
|
208
|
-
- spec/fixtures/bbc_news/8063681.stm.html
|
209
|
-
- spec/fixtures/cassette_library/BbcNewsPageParserV4.yml
|
210
|
-
- spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html
|
211
|
-
- spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
|
218
|
+
- spec/base_parser_spec.rb
|
219
|
+
- spec/web-page-parser
|
220
|
+
- spec/parser_factory_spec.rb
|
221
|
+
- spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html
|
222
|
+
- spec/fixtures/new_york_times/trump-kim-policy.html
|
223
|
+
- spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html
|
224
|
+
- spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one-2018.html
|
225
|
+
- spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html
|
226
|
+
- spec/fixtures/rt/338237.html
|
227
|
+
- spec/fixtures/rt/338045.html
|
212
228
|
- spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html
|
213
|
-
- spec/fixtures/guardian/
|
229
|
+
- spec/fixtures/guardian/university-extremist-speakers.html
|
230
|
+
- spec/fixtures/guardian/julian-assange-donald-trump-hillary-clinton-interview.html
|
231
|
+
- spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
|
214
232
|
- spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
|
215
|
-
- spec/fixtures/
|
216
|
-
- spec/fixtures/
|
233
|
+
- spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html
|
234
|
+
- spec/fixtures/guardian/duplicate-headline.html
|
235
|
+
- spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html
|
217
236
|
- spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html
|
237
|
+
- spec/fixtures/independent/boris-johnson.html
|
218
238
|
- spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html
|
219
|
-
- spec/fixtures/
|
220
|
-
- spec/fixtures/
|
221
|
-
- spec/fixtures/
|
222
|
-
- spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html
|
239
|
+
- spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html
|
240
|
+
- spec/fixtures/independent/lord-burns.html
|
241
|
+
- spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html
|
223
242
|
- spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html
|
243
|
+
- spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain-2018.html
|
244
|
+
- spec/fixtures/washingtonpost/israeli-ambassador.html
|
245
|
+
- spec/fixtures/washingtonpost/trump-kim-summit.html
|
246
|
+
- spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html
|
247
|
+
- spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery-2018.html
|
224
248
|
- spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html
|
225
249
|
- spec/fixtures/theintercept/canada-proclaiming-war-12-years-shocked-someone-attacked-soldiers.html
|
250
|
+
- spec/fixtures/theintercept/pentagon-missionary.html
|
251
|
+
- spec/fixtures/bbc_news/31014941.html
|
252
|
+
- spec/fixtures/bbc_news/6072486.stm.html
|
253
|
+
- spec/fixtures/bbc_news/13293006.html
|
254
|
+
- spec/fixtures/bbc_news/10341015.stm.html
|
255
|
+
- spec/fixtures/bbc_news/21528631.html
|
256
|
+
- spec/fixtures/bbc_news/20230333.stm.html
|
257
|
+
- spec/fixtures/bbc_news/8011268.stm.html
|
258
|
+
- spec/fixtures/bbc_news/8040164.stm.html
|
259
|
+
- spec/fixtures/bbc_news/10249066.stm.html
|
260
|
+
- spec/fixtures/bbc_news/19957138.stm.html
|
261
|
+
- spec/fixtures/bbc_news/32275608.html
|
262
|
+
- spec/fixtures/bbc_news/8029015.stm.html
|
263
|
+
- spec/fixtures/bbc_news/32271505.html
|
264
|
+
- spec/fixtures/bbc_news/11125504.html
|
265
|
+
- spec/fixtures/bbc_news/7745137.stm.html
|
266
|
+
- spec/fixtures/bbc_news/12921632.html
|
267
|
+
- spec/fixtures/bbc_news/8063681.stm.html
|
268
|
+
- spec/fixtures/cassette_library/BbcNewsPageParserV4.yml
|
269
|
+
- spec/spec_helper.rb
|
270
|
+
- spec/spec.opts
|
271
|
+
- spec/parsers/new_york_times_page_parser_spec.rb
|
272
|
+
- spec/parsers/the_intercept_page_parser_spec.rb
|
273
|
+
- spec/parsers/rt_page_parser_spec.rb
|
226
274
|
- spec/parsers/bbc_news_page_spec.rb
|
227
275
|
- spec/parsers/guardian_page_spec.rb
|
228
|
-
- spec/parsers/new_york_times_page_parser_spec.rb
|
229
276
|
- spec/parsers/independent_page_parser_spec.rb
|
230
|
-
- spec/parsers/the_intercept_page_parser_spec.rb
|
231
277
|
- spec/parsers/washingtonpost_page_parser_spec.rb
|
232
|
-
- spec/parser_factory_spec.rb
|
233
|
-
- spec/spec.opts
|
234
|
-
- spec/spec_helper.rb
|
235
|
-
- spec/base_parser_spec.rb
|
metadata.gz.sig
CHANGED
Binary file
|