web-page-parser 1.1.0 → 1.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +5 -5
  2. checksums.yaml.gz.sig +0 -0
  3. data.tar.gz.sig +0 -0
  4. data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +56 -3
  5. data/lib/web-page-parser/parsers/guardian_page_parser.rb +33 -3
  6. data/lib/web-page-parser/parsers/independent_page_parser.rb +2 -2
  7. data/lib/web-page-parser/parsers/new_york_times_page_parser.rb +8 -3
  8. data/lib/web-page-parser/parsers/rt_page_parser.rb +49 -0
  9. data/lib/web-page-parser/parsers/the_intercept_page_parser.rb +6 -1
  10. data/lib/web-page-parser/parsers/washingtonpost_page_parser.rb +38 -2
  11. data/spec/fixtures/bbc_news/31014941.html +1123 -0
  12. data/spec/fixtures/bbc_news/32271505.html +1168 -0
  13. data/spec/fixtures/bbc_news/32275608.html +1142 -0
  14. data/spec/fixtures/guardian/duplicate-headline.html +2735 -0
  15. data/spec/fixtures/guardian/julian-assange-donald-trump-hillary-clinton-interview.html +1752 -0
  16. data/spec/fixtures/guardian/university-extremist-speakers.html +1590 -0
  17. data/spec/fixtures/independent/boris-johnson.html +1086 -0
  18. data/spec/fixtures/independent/lord-burns.html +726 -0
  19. data/spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one-2018.html +305 -0
  20. data/spec/fixtures/new_york_times/trump-kim-policy.html +305 -0
  21. data/spec/fixtures/rt/338045.html +682 -0
  22. data/spec/fixtures/rt/338237.html +682 -0
  23. data/spec/fixtures/theintercept/pentagon-missionary.html +211 -0
  24. data/spec/fixtures/washingtonpost/israeli-ambassador.html +747 -0
  25. data/spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery-2018.html +381 -0
  26. data/spec/fixtures/washingtonpost/trump-kim-summit.html +379 -0
  27. data/spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain-2018.html +386 -0
  28. data/spec/parsers/bbc_news_page_spec.rb +132 -11
  29. data/spec/parsers/guardian_page_spec.rb +100 -0
  30. data/spec/parsers/independent_page_parser_spec.rb +52 -0
  31. data/spec/parsers/new_york_times_page_parser_spec.rb +75 -10
  32. data/spec/parsers/rt_page_parser_spec.rb +87 -0
  33. data/spec/parsers/the_intercept_page_parser_spec.rb +30 -0
  34. data/spec/parsers/washingtonpost_page_parser_spec.rb +93 -1
  35. data/spec/web-page-parser +1 -0
  36. metadata +98 -56
  37. metadata.gz.sig +0 -0
@@ -0,0 +1 @@
1
+ spec/../spec/
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web-page-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Leach
@@ -10,28 +10,31 @@ bindir: bin
10
10
  cert_chain:
11
11
  - |
12
12
  -----BEGIN CERTIFICATE-----
13
- MIIDmjCCAoKgAwIBAgIBATANBgkqhkiG9w0BAQUFADBSMQ0wCwYDVQQDDARqb2hu
14
- MRkwFwYKCZImiZPyLGQBGRYJam9obmxlYWNoMRIwEAYKCZImiZPyLGQBGRYCY28x
15
- EjAQBgoJkiaJk/IsZAEZFgJ1azAeFw0xNDEwMjUxNzAyMDBaFw0xNTEwMjUxNzAy
16
- MDBaMFIxDTALBgNVBAMMBGpvaG4xGTAXBgoJkiaJk/IsZAEZFglqb2hubGVhY2gx
17
- EjAQBgoJkiaJk/IsZAEZFgJjbzESMBAGCgmSJomT8ixkARkWAnVrMIIBIjANBgkq
18
- hkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA49t9Dck0zPjDiEuRLTEsoXVQEZECcFtu
19
- FU2j6ZFGd/twm4dlZ2qHYs5GYtpSwlfJYhAOc2a9cHz1KKSevGaPRxpsFhKEZ+Yj
20
- 5R8y4VZCY1Rx7tyX2PYdWBNaeTNPIACdW4HNg2/n1bbhu4LkQ+PYBQb6bbeFnzTx
21
- dl2ZLvhwSRUbl7aIiYyENbpOmPKCL1ReJUkQn+1Kyq76ZMY6pG6iSeeZvDtKZKqd
22
- MX4bWAIBeT6mUv/jhIDkJgj+JO11v3wbhojAcVHInGnyCQ7dLn3hurlLfII4SiLT
23
- foOh2i2OY5ZTG5PoPEGMiagBWAUmQUA+Yc6gnfpjrX/aFG/aa6T2+wIDAQABo3sw
24
- eTAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUqd/WZnE4x+pZ+DS1
25
- +H3vH+/gvfkwHwYDVR0RBBgwFoEUam9obkBqb2hubGVhY2guY28udWswHwYDVR0S
26
- BBgwFoEUam9obkBqb2hubGVhY2guY28udWswDQYJKoZIhvcNAQEFBQADggEBABG0
27
- Avvj8mNb+0drmLLFLMLck8oEcpzONbVG6A3XWrkUkTsdsw8VB848QuFg3gR+3ReU
28
- C98Bm+8L1zYVkfhTzHNJ8Y9HGC+8eEXoMQw1C2jGBcN+i4G+eylOBv+PTJX3UU8r
29
- r7Tb7QqD7tPjy7TS91OvyImc7Jixt848nrrs9nWSiEIaVxQqBRRdKANsgFISvvA4
30
- CPFEkKZm3GcFRIVu9yQO1LWfsbvbVLhD5HSynklijwo2RroPXlNKi6RXsxKwgtqD
31
- MghEyBTNQa+QTUTKQMjYOO3kV+Wuv+iQGaMm/bu2SD+Ov0XUzzAsSfz0ZvrF3fbG
32
- jdD4CMQtJNDqDiWuUkg=
13
+ MIIERjCCAq6gAwIBAgIBATANBgkqhkiG9w0BAQsFADAoMSYwJAYDVQQDDB1qb2hu
14
+ L0RDPWpvaG5sZWFjaC9EQz1jby9EQz11azAeFw0xODA3MjUxMDQ2MTRaFw0yMDA3
15
+ MjQxMDQ2MTRaMCgxJjAkBgNVBAMMHWpvaG4vREM9am9obmxlYWNoL0RDPWNvL0RD
16
+ PXVrMIIBojANBgkqhkiG9w0BAQEFAAOCAY8AMIIBigKCAYEAxq6iwNdGb7lFZAHA
17
+ of8e2Akt6ziqAv9JC0GTPULqy474MfwykTOg6nharVTpSonUUWatbo4e3YDITMWB
18
+ wbiawE86ourm9onuMrDsdLJlyVXFPTJWLo8LBgpIIJ1jtD8TiqJgr+Qtd3yCgvYv
19
+ F1iXhEXBXs6x6WymJbCSlFFrAk/z6iOUGFjWIvkFX50UI7eL3Khym8APkBnyRlxz
20
+ /P3sySP/LdABk31H0dQp5DvcN8RHRg2UeKa+Ey+xPqg4TtMm8uY2uot/qO4jhW/Z
21
+ YbJNRFUr0C0rEgI4oGZQ0MIOurAhdkAaRSKbxfwIJoaDeER8Y2Yggnb61J5LOEgE
22
+ G8WtIAbp5LYcYwgJ2rRvc6X3E1tWM4d7Bo0FGTS0w6HmjtzLMXjiUgtWRFHF1I9S
23
+ V+nVZ1FL8/XSlAAuqD//6Dw20u+3Qoau3iw/PUdN6ODfAH6USBYqj2nH/m8VDtoZ
24
+ 2my9UCZ+/xjsxn5aBKlXkQ3En8B61Es6vrgFHoQYpeKhBsUvAgMBAAGjezB5MAkG
25
+ A1UdEwQCMAAwCwYDVR0PBAQDAgSwMB0GA1UdDgQWBBQ4o+vdcZcr0kfV4LdYBog5
26
+ dCrJvzAfBgNVHREEGDAWgRRqb2huQGpvaG5sZWFjaC5jby51azAfBgNVHRIEGDAW
27
+ gRRqb2huQGpvaG5sZWFjaC5jby51azANBgkqhkiG9w0BAQsFAAOCAYEAtPF2fUxO
28
+ 5iVE/AMac4KaEEZTF7anQFOldRiz0noI48OXN2bvtZtuq8QucIMMtGbtEWPOigGh
29
+ oeWv0K7ZWvUNixRoWp3enSSByFwMF7UYpTBG54mos8UMfBj4C/pjvdqnJ34li2Ur
30
+ mb2vxJQAGNzzbfvcnNin0wM/iPRKuvM7OTalNnEOOaOehHvqClBlRU71Dp3BngXA
31
+ CD+fSJptjeixJY2pzSsZPDui84zIkHJtWyY77mrkEyQkRAdThf9Xnn3UOZ0yF0m/
32
+ kbEN/MgvvoWBtfReYx/uMT1PzdiRrhaThEbmQV4JNFNUtgbewTNCVNvtMq9q0j2t
33
+ nCtWWXRiGW/lJMBIKFHoe1l0m/IlC1Adji1QjosLFDqz+K+OglsJG5F6fF6gCvhC
34
+ xBDK27vsuKyrp/Zcg0HyYzJMpDu9NErKki8MXzlpOfrvR15NuEL267bvpI5QA3m/
35
+ KzVFklk+44JKTYFrr6Nu+BoGS9jVpzdBFobVQZmJO4Cugqxl40zSpZqj
33
36
  -----END CERTIFICATE-----
34
- date: 2015-01-30 00:00:00.000000000 Z
37
+ date: 2018-06-17 00:00:00.000000000 Z
35
38
  dependencies:
36
39
  - !ruby/object:Gem::Dependency
37
40
  name: htmlentities
@@ -104,8 +107,8 @@ dependencies:
104
107
  - !ruby/object:Gem::Version
105
108
  version: '0'
106
109
  description: A Ruby library to parse the content out of web pages. Currently supports
107
- BBC News pages, The Guardian, Independent, New York Times and The Intercept articles.
108
- Used by the News Sniffer project. http://www.newssniffer.co.uk
110
+ BBC News pages, The Guardian, Independent, New York Times, RT, Washington Post and
111
+ The Intercept articles. Used by the News Sniffer project. https://www.newssniffer.co.uk
109
112
  email: john@johnleach.co.uk
110
113
  executables: []
111
114
  extensions: []
@@ -123,6 +126,7 @@ files:
123
126
  - lib/web-page-parser/parsers/guardian_page_parser.rb
124
127
  - lib/web-page-parser/parsers/independent_page_parser.rb
125
128
  - lib/web-page-parser/parsers/new_york_times_page_parser.rb
129
+ - lib/web-page-parser/parsers/rt_page_parser.rb
126
130
  - lib/web-page-parser/parsers/test_page_parser.rb
127
131
  - lib/web-page-parser/parsers/the_intercept_page_parser.rb
128
132
  - lib/web-page-parser/parsers/washingtonpost_page_parser.rb
@@ -135,6 +139,9 @@ files:
135
139
  - spec/fixtures/bbc_news/19957138.stm.html
136
140
  - spec/fixtures/bbc_news/20230333.stm.html
137
141
  - spec/fixtures/bbc_news/21528631.html
142
+ - spec/fixtures/bbc_news/31014941.html
143
+ - spec/fixtures/bbc_news/32271505.html
144
+ - spec/fixtures/bbc_news/32275608.html
138
145
  - spec/fixtures/bbc_news/6072486.stm.html
139
146
  - spec/fixtures/bbc_news/7745137.stm.html
140
147
  - spec/fixtures/bbc_news/8011268.stm.html
@@ -145,29 +152,45 @@ files:
145
152
  - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html
146
153
  - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
147
154
  - spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html
155
+ - spec/fixtures/guardian/duplicate-headline.html
156
+ - spec/fixtures/guardian/julian-assange-donald-trump-hillary-clinton-interview.html
148
157
  - spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html
149
158
  - spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
159
+ - spec/fixtures/guardian/university-extremist-speakers.html
150
160
  - spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html
161
+ - spec/fixtures/independent/boris-johnson.html
151
162
  - spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html
152
163
  - spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html
164
+ - spec/fixtures/independent/lord-burns.html
153
165
  - spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html
154
166
  - spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html
167
+ - spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one-2018.html
155
168
  - spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html
156
169
  - spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html
170
+ - spec/fixtures/new_york_times/trump-kim-policy.html
171
+ - spec/fixtures/rt/338045.html
172
+ - spec/fixtures/rt/338237.html
157
173
  - spec/fixtures/theintercept/canada-proclaiming-war-12-years-shocked-someone-attacked-soldiers.html
174
+ - spec/fixtures/theintercept/pentagon-missionary.html
175
+ - spec/fixtures/washingtonpost/israeli-ambassador.html
158
176
  - spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html
177
+ - spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery-2018.html
159
178
  - spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html
179
+ - spec/fixtures/washingtonpost/trump-kim-summit.html
180
+ - spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain-2018.html
160
181
  - spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html
161
182
  - spec/parser_factory_spec.rb
162
183
  - spec/parsers/bbc_news_page_spec.rb
163
184
  - spec/parsers/guardian_page_spec.rb
164
185
  - spec/parsers/independent_page_parser_spec.rb
165
186
  - spec/parsers/new_york_times_page_parser_spec.rb
187
+ - spec/parsers/rt_page_parser_spec.rb
166
188
  - spec/parsers/the_intercept_page_parser_spec.rb
167
189
  - spec/parsers/washingtonpost_page_parser_spec.rb
168
190
  - spec/spec.opts
169
191
  - spec/spec_helper.rb
170
- homepage: http://github.com/johnl/web-page-parser
192
+ - spec/web-page-parser
193
+ homepage: https://github.com/johnl/web-page-parser
171
194
  licenses:
172
195
  - MIT
173
196
  metadata: {}
@@ -187,49 +210,68 @@ required_rubygems_version: !ruby/object:Gem::Requirement
187
210
  version: '0'
188
211
  requirements: []
189
212
  rubyforge_project:
190
- rubygems_version: 2.2.2
213
+ rubygems_version: 2.7.6
191
214
  signing_key:
192
215
  specification_version: 4
193
216
  summary: A parser for various news organisation's web pages
194
217
  test_files:
195
- - spec/fixtures/bbc_news/10249066.stm.html
196
- - spec/fixtures/bbc_news/10341015.stm.html
197
- - spec/fixtures/bbc_news/11125504.html
198
- - spec/fixtures/bbc_news/12921632.html
199
- - spec/fixtures/bbc_news/13293006.html
200
- - spec/fixtures/bbc_news/19957138.stm.html
201
- - spec/fixtures/bbc_news/20230333.stm.html
202
- - spec/fixtures/bbc_news/21528631.html
203
- - spec/fixtures/bbc_news/6072486.stm.html
204
- - spec/fixtures/bbc_news/7745137.stm.html
205
- - spec/fixtures/bbc_news/8011268.stm.html
206
- - spec/fixtures/bbc_news/8029015.stm.html
207
- - spec/fixtures/bbc_news/8040164.stm.html
208
- - spec/fixtures/bbc_news/8063681.stm.html
209
- - spec/fixtures/cassette_library/BbcNewsPageParserV4.yml
210
- - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html
211
- - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
218
+ - spec/base_parser_spec.rb
219
+ - spec/web-page-parser
220
+ - spec/parser_factory_spec.rb
221
+ - spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html
222
+ - spec/fixtures/new_york_times/trump-kim-policy.html
223
+ - spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html
224
+ - spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one-2018.html
225
+ - spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html
226
+ - spec/fixtures/rt/338237.html
227
+ - spec/fixtures/rt/338045.html
212
228
  - spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html
213
- - spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html
229
+ - spec/fixtures/guardian/university-extremist-speakers.html
230
+ - spec/fixtures/guardian/julian-assange-donald-trump-hillary-clinton-interview.html
231
+ - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
214
232
  - spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
215
- - spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html
216
- - spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html
233
+ - spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html
234
+ - spec/fixtures/guardian/duplicate-headline.html
235
+ - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html
217
236
  - spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html
237
+ - spec/fixtures/independent/boris-johnson.html
218
238
  - spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html
219
- - spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html
220
- - spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html
221
- - spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html
222
- - spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html
239
+ - spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html
240
+ - spec/fixtures/independent/lord-burns.html
241
+ - spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html
223
242
  - spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html
243
+ - spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain-2018.html
244
+ - spec/fixtures/washingtonpost/israeli-ambassador.html
245
+ - spec/fixtures/washingtonpost/trump-kim-summit.html
246
+ - spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html
247
+ - spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery-2018.html
224
248
  - spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html
225
249
  - spec/fixtures/theintercept/canada-proclaiming-war-12-years-shocked-someone-attacked-soldiers.html
250
+ - spec/fixtures/theintercept/pentagon-missionary.html
251
+ - spec/fixtures/bbc_news/31014941.html
252
+ - spec/fixtures/bbc_news/6072486.stm.html
253
+ - spec/fixtures/bbc_news/13293006.html
254
+ - spec/fixtures/bbc_news/10341015.stm.html
255
+ - spec/fixtures/bbc_news/21528631.html
256
+ - spec/fixtures/bbc_news/20230333.stm.html
257
+ - spec/fixtures/bbc_news/8011268.stm.html
258
+ - spec/fixtures/bbc_news/8040164.stm.html
259
+ - spec/fixtures/bbc_news/10249066.stm.html
260
+ - spec/fixtures/bbc_news/19957138.stm.html
261
+ - spec/fixtures/bbc_news/32275608.html
262
+ - spec/fixtures/bbc_news/8029015.stm.html
263
+ - spec/fixtures/bbc_news/32271505.html
264
+ - spec/fixtures/bbc_news/11125504.html
265
+ - spec/fixtures/bbc_news/7745137.stm.html
266
+ - spec/fixtures/bbc_news/12921632.html
267
+ - spec/fixtures/bbc_news/8063681.stm.html
268
+ - spec/fixtures/cassette_library/BbcNewsPageParserV4.yml
269
+ - spec/spec_helper.rb
270
+ - spec/spec.opts
271
+ - spec/parsers/new_york_times_page_parser_spec.rb
272
+ - spec/parsers/the_intercept_page_parser_spec.rb
273
+ - spec/parsers/rt_page_parser_spec.rb
226
274
  - spec/parsers/bbc_news_page_spec.rb
227
275
  - spec/parsers/guardian_page_spec.rb
228
- - spec/parsers/new_york_times_page_parser_spec.rb
229
276
  - spec/parsers/independent_page_parser_spec.rb
230
- - spec/parsers/the_intercept_page_parser_spec.rb
231
277
  - spec/parsers/washingtonpost_page_parser_spec.rb
232
- - spec/parser_factory_spec.rb
233
- - spec/spec.opts
234
- - spec/spec_helper.rb
235
- - spec/base_parser_spec.rb
metadata.gz.sig CHANGED
Binary file