web-page-parser 1.1.0 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +5 -5
  2. checksums.yaml.gz.sig +0 -0
  3. data.tar.gz.sig +0 -0
  4. data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +56 -3
  5. data/lib/web-page-parser/parsers/guardian_page_parser.rb +33 -3
  6. data/lib/web-page-parser/parsers/independent_page_parser.rb +2 -2
  7. data/lib/web-page-parser/parsers/new_york_times_page_parser.rb +8 -3
  8. data/lib/web-page-parser/parsers/rt_page_parser.rb +49 -0
  9. data/lib/web-page-parser/parsers/the_intercept_page_parser.rb +6 -1
  10. data/lib/web-page-parser/parsers/washingtonpost_page_parser.rb +38 -2
  11. data/spec/fixtures/bbc_news/31014941.html +1123 -0
  12. data/spec/fixtures/bbc_news/32271505.html +1168 -0
  13. data/spec/fixtures/bbc_news/32275608.html +1142 -0
  14. data/spec/fixtures/guardian/duplicate-headline.html +2735 -0
  15. data/spec/fixtures/guardian/julian-assange-donald-trump-hillary-clinton-interview.html +1752 -0
  16. data/spec/fixtures/guardian/university-extremist-speakers.html +1590 -0
  17. data/spec/fixtures/independent/boris-johnson.html +1086 -0
  18. data/spec/fixtures/independent/lord-burns.html +726 -0
  19. data/spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one-2018.html +305 -0
  20. data/spec/fixtures/new_york_times/trump-kim-policy.html +305 -0
  21. data/spec/fixtures/rt/338045.html +682 -0
  22. data/spec/fixtures/rt/338237.html +682 -0
  23. data/spec/fixtures/theintercept/pentagon-missionary.html +211 -0
  24. data/spec/fixtures/washingtonpost/israeli-ambassador.html +747 -0
  25. data/spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery-2018.html +381 -0
  26. data/spec/fixtures/washingtonpost/trump-kim-summit.html +379 -0
  27. data/spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain-2018.html +386 -0
  28. data/spec/parsers/bbc_news_page_spec.rb +132 -11
  29. data/spec/parsers/guardian_page_spec.rb +100 -0
  30. data/spec/parsers/independent_page_parser_spec.rb +52 -0
  31. data/spec/parsers/new_york_times_page_parser_spec.rb +75 -10
  32. data/spec/parsers/rt_page_parser_spec.rb +87 -0
  33. data/spec/parsers/the_intercept_page_parser_spec.rb +30 -0
  34. data/spec/parsers/washingtonpost_page_parser_spec.rb +93 -1
  35. data/spec/web-page-parser +1 -0
  36. metadata +98 -56
  37. metadata.gz.sig +0 -0
@@ -0,0 +1 @@
1
+ spec/../spec/
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web-page-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Leach
@@ -10,28 +10,31 @@ bindir: bin
10
10
  cert_chain:
11
11
  - |
12
12
  -----BEGIN CERTIFICATE-----
13
- MIIDmjCCAoKgAwIBAgIBATANBgkqhkiG9w0BAQUFADBSMQ0wCwYDVQQDDARqb2hu
14
- MRkwFwYKCZImiZPyLGQBGRYJam9obmxlYWNoMRIwEAYKCZImiZPyLGQBGRYCY28x
15
- EjAQBgoJkiaJk/IsZAEZFgJ1azAeFw0xNDEwMjUxNzAyMDBaFw0xNTEwMjUxNzAy
16
- MDBaMFIxDTALBgNVBAMMBGpvaG4xGTAXBgoJkiaJk/IsZAEZFglqb2hubGVhY2gx
17
- EjAQBgoJkiaJk/IsZAEZFgJjbzESMBAGCgmSJomT8ixkARkWAnVrMIIBIjANBgkq
18
- hkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA49t9Dck0zPjDiEuRLTEsoXVQEZECcFtu
19
- FU2j6ZFGd/twm4dlZ2qHYs5GYtpSwlfJYhAOc2a9cHz1KKSevGaPRxpsFhKEZ+Yj
20
- 5R8y4VZCY1Rx7tyX2PYdWBNaeTNPIACdW4HNg2/n1bbhu4LkQ+PYBQb6bbeFnzTx
21
- dl2ZLvhwSRUbl7aIiYyENbpOmPKCL1ReJUkQn+1Kyq76ZMY6pG6iSeeZvDtKZKqd
22
- MX4bWAIBeT6mUv/jhIDkJgj+JO11v3wbhojAcVHInGnyCQ7dLn3hurlLfII4SiLT
23
- foOh2i2OY5ZTG5PoPEGMiagBWAUmQUA+Yc6gnfpjrX/aFG/aa6T2+wIDAQABo3sw
24
- eTAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUqd/WZnE4x+pZ+DS1
25
- +H3vH+/gvfkwHwYDVR0RBBgwFoEUam9obkBqb2hubGVhY2guY28udWswHwYDVR0S
26
- BBgwFoEUam9obkBqb2hubGVhY2guY28udWswDQYJKoZIhvcNAQEFBQADggEBABG0
27
- Avvj8mNb+0drmLLFLMLck8oEcpzONbVG6A3XWrkUkTsdsw8VB848QuFg3gR+3ReU
28
- C98Bm+8L1zYVkfhTzHNJ8Y9HGC+8eEXoMQw1C2jGBcN+i4G+eylOBv+PTJX3UU8r
29
- r7Tb7QqD7tPjy7TS91OvyImc7Jixt848nrrs9nWSiEIaVxQqBRRdKANsgFISvvA4
30
- CPFEkKZm3GcFRIVu9yQO1LWfsbvbVLhD5HSynklijwo2RroPXlNKi6RXsxKwgtqD
31
- MghEyBTNQa+QTUTKQMjYOO3kV+Wuv+iQGaMm/bu2SD+Ov0XUzzAsSfz0ZvrF3fbG
32
- jdD4CMQtJNDqDiWuUkg=
13
+ MIIERjCCAq6gAwIBAgIBATANBgkqhkiG9w0BAQsFADAoMSYwJAYDVQQDDB1qb2hu
14
+ L0RDPWpvaG5sZWFjaC9EQz1jby9EQz11azAeFw0xODA3MjUxMDQ2MTRaFw0yMDA3
15
+ MjQxMDQ2MTRaMCgxJjAkBgNVBAMMHWpvaG4vREM9am9obmxlYWNoL0RDPWNvL0RD
16
+ PXVrMIIBojANBgkqhkiG9w0BAQEFAAOCAY8AMIIBigKCAYEAxq6iwNdGb7lFZAHA
17
+ of8e2Akt6ziqAv9JC0GTPULqy474MfwykTOg6nharVTpSonUUWatbo4e3YDITMWB
18
+ wbiawE86ourm9onuMrDsdLJlyVXFPTJWLo8LBgpIIJ1jtD8TiqJgr+Qtd3yCgvYv
19
+ F1iXhEXBXs6x6WymJbCSlFFrAk/z6iOUGFjWIvkFX50UI7eL3Khym8APkBnyRlxz
20
+ /P3sySP/LdABk31H0dQp5DvcN8RHRg2UeKa+Ey+xPqg4TtMm8uY2uot/qO4jhW/Z
21
+ YbJNRFUr0C0rEgI4oGZQ0MIOurAhdkAaRSKbxfwIJoaDeER8Y2Yggnb61J5LOEgE
22
+ G8WtIAbp5LYcYwgJ2rRvc6X3E1tWM4d7Bo0FGTS0w6HmjtzLMXjiUgtWRFHF1I9S
23
+ V+nVZ1FL8/XSlAAuqD//6Dw20u+3Qoau3iw/PUdN6ODfAH6USBYqj2nH/m8VDtoZ
24
+ 2my9UCZ+/xjsxn5aBKlXkQ3En8B61Es6vrgFHoQYpeKhBsUvAgMBAAGjezB5MAkG
25
+ A1UdEwQCMAAwCwYDVR0PBAQDAgSwMB0GA1UdDgQWBBQ4o+vdcZcr0kfV4LdYBog5
26
+ dCrJvzAfBgNVHREEGDAWgRRqb2huQGpvaG5sZWFjaC5jby51azAfBgNVHRIEGDAW
27
+ gRRqb2huQGpvaG5sZWFjaC5jby51azANBgkqhkiG9w0BAQsFAAOCAYEAtPF2fUxO
28
+ 5iVE/AMac4KaEEZTF7anQFOldRiz0noI48OXN2bvtZtuq8QucIMMtGbtEWPOigGh
29
+ oeWv0K7ZWvUNixRoWp3enSSByFwMF7UYpTBG54mos8UMfBj4C/pjvdqnJ34li2Ur
30
+ mb2vxJQAGNzzbfvcnNin0wM/iPRKuvM7OTalNnEOOaOehHvqClBlRU71Dp3BngXA
31
+ CD+fSJptjeixJY2pzSsZPDui84zIkHJtWyY77mrkEyQkRAdThf9Xnn3UOZ0yF0m/
32
+ kbEN/MgvvoWBtfReYx/uMT1PzdiRrhaThEbmQV4JNFNUtgbewTNCVNvtMq9q0j2t
33
+ nCtWWXRiGW/lJMBIKFHoe1l0m/IlC1Adji1QjosLFDqz+K+OglsJG5F6fF6gCvhC
34
+ xBDK27vsuKyrp/Zcg0HyYzJMpDu9NErKki8MXzlpOfrvR15NuEL267bvpI5QA3m/
35
+ KzVFklk+44JKTYFrr6Nu+BoGS9jVpzdBFobVQZmJO4Cugqxl40zSpZqj
33
36
  -----END CERTIFICATE-----
34
- date: 2015-01-30 00:00:00.000000000 Z
37
+ date: 2018-06-17 00:00:00.000000000 Z
35
38
  dependencies:
36
39
  - !ruby/object:Gem::Dependency
37
40
  name: htmlentities
@@ -104,8 +107,8 @@ dependencies:
104
107
  - !ruby/object:Gem::Version
105
108
  version: '0'
106
109
  description: A Ruby library to parse the content out of web pages. Currently supports
107
- BBC News pages, The Guardian, Independent, New York Times and The Intercept articles.
108
- Used by the News Sniffer project. http://www.newssniffer.co.uk
110
+ BBC News pages, The Guardian, Independent, New York Times, RT, Washington Post and
111
+ The Intercept articles. Used by the News Sniffer project. https://www.newssniffer.co.uk
109
112
  email: john@johnleach.co.uk
110
113
  executables: []
111
114
  extensions: []
@@ -123,6 +126,7 @@ files:
123
126
  - lib/web-page-parser/parsers/guardian_page_parser.rb
124
127
  - lib/web-page-parser/parsers/independent_page_parser.rb
125
128
  - lib/web-page-parser/parsers/new_york_times_page_parser.rb
129
+ - lib/web-page-parser/parsers/rt_page_parser.rb
126
130
  - lib/web-page-parser/parsers/test_page_parser.rb
127
131
  - lib/web-page-parser/parsers/the_intercept_page_parser.rb
128
132
  - lib/web-page-parser/parsers/washingtonpost_page_parser.rb
@@ -135,6 +139,9 @@ files:
135
139
  - spec/fixtures/bbc_news/19957138.stm.html
136
140
  - spec/fixtures/bbc_news/20230333.stm.html
137
141
  - spec/fixtures/bbc_news/21528631.html
142
+ - spec/fixtures/bbc_news/31014941.html
143
+ - spec/fixtures/bbc_news/32271505.html
144
+ - spec/fixtures/bbc_news/32275608.html
138
145
  - spec/fixtures/bbc_news/6072486.stm.html
139
146
  - spec/fixtures/bbc_news/7745137.stm.html
140
147
  - spec/fixtures/bbc_news/8011268.stm.html
@@ -145,29 +152,45 @@ files:
145
152
  - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html
146
153
  - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
147
154
  - spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html
155
+ - spec/fixtures/guardian/duplicate-headline.html
156
+ - spec/fixtures/guardian/julian-assange-donald-trump-hillary-clinton-interview.html
148
157
  - spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html
149
158
  - spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
159
+ - spec/fixtures/guardian/university-extremist-speakers.html
150
160
  - spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html
161
+ - spec/fixtures/independent/boris-johnson.html
151
162
  - spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html
152
163
  - spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html
164
+ - spec/fixtures/independent/lord-burns.html
153
165
  - spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html
154
166
  - spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html
167
+ - spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one-2018.html
155
168
  - spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html
156
169
  - spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html
170
+ - spec/fixtures/new_york_times/trump-kim-policy.html
171
+ - spec/fixtures/rt/338045.html
172
+ - spec/fixtures/rt/338237.html
157
173
  - spec/fixtures/theintercept/canada-proclaiming-war-12-years-shocked-someone-attacked-soldiers.html
174
+ - spec/fixtures/theintercept/pentagon-missionary.html
175
+ - spec/fixtures/washingtonpost/israeli-ambassador.html
158
176
  - spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html
177
+ - spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery-2018.html
159
178
  - spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html
179
+ - spec/fixtures/washingtonpost/trump-kim-summit.html
180
+ - spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain-2018.html
160
181
  - spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html
161
182
  - spec/parser_factory_spec.rb
162
183
  - spec/parsers/bbc_news_page_spec.rb
163
184
  - spec/parsers/guardian_page_spec.rb
164
185
  - spec/parsers/independent_page_parser_spec.rb
165
186
  - spec/parsers/new_york_times_page_parser_spec.rb
187
+ - spec/parsers/rt_page_parser_spec.rb
166
188
  - spec/parsers/the_intercept_page_parser_spec.rb
167
189
  - spec/parsers/washingtonpost_page_parser_spec.rb
168
190
  - spec/spec.opts
169
191
  - spec/spec_helper.rb
170
- homepage: http://github.com/johnl/web-page-parser
192
+ - spec/web-page-parser
193
+ homepage: https://github.com/johnl/web-page-parser
171
194
  licenses:
172
195
  - MIT
173
196
  metadata: {}
@@ -187,49 +210,68 @@ required_rubygems_version: !ruby/object:Gem::Requirement
187
210
  version: '0'
188
211
  requirements: []
189
212
  rubyforge_project:
190
- rubygems_version: 2.2.2
213
+ rubygems_version: 2.7.6
191
214
  signing_key:
192
215
  specification_version: 4
193
216
  summary: A parser for various news organisation's web pages
194
217
  test_files:
195
- - spec/fixtures/bbc_news/10249066.stm.html
196
- - spec/fixtures/bbc_news/10341015.stm.html
197
- - spec/fixtures/bbc_news/11125504.html
198
- - spec/fixtures/bbc_news/12921632.html
199
- - spec/fixtures/bbc_news/13293006.html
200
- - spec/fixtures/bbc_news/19957138.stm.html
201
- - spec/fixtures/bbc_news/20230333.stm.html
202
- - spec/fixtures/bbc_news/21528631.html
203
- - spec/fixtures/bbc_news/6072486.stm.html
204
- - spec/fixtures/bbc_news/7745137.stm.html
205
- - spec/fixtures/bbc_news/8011268.stm.html
206
- - spec/fixtures/bbc_news/8029015.stm.html
207
- - spec/fixtures/bbc_news/8040164.stm.html
208
- - spec/fixtures/bbc_news/8063681.stm.html
209
- - spec/fixtures/cassette_library/BbcNewsPageParserV4.yml
210
- - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html
211
- - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
218
+ - spec/base_parser_spec.rb
219
+ - spec/web-page-parser
220
+ - spec/parser_factory_spec.rb
221
+ - spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html
222
+ - spec/fixtures/new_york_times/trump-kim-policy.html
223
+ - spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html
224
+ - spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one-2018.html
225
+ - spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html
226
+ - spec/fixtures/rt/338237.html
227
+ - spec/fixtures/rt/338045.html
212
228
  - spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html
213
- - spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html
229
+ - spec/fixtures/guardian/university-extremist-speakers.html
230
+ - spec/fixtures/guardian/julian-assange-donald-trump-hillary-clinton-interview.html
231
+ - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
214
232
  - spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
215
- - spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html
216
- - spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html
233
+ - spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html
234
+ - spec/fixtures/guardian/duplicate-headline.html
235
+ - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html
217
236
  - spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html
237
+ - spec/fixtures/independent/boris-johnson.html
218
238
  - spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html
219
- - spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html
220
- - spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html
221
- - spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html
222
- - spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html
239
+ - spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html
240
+ - spec/fixtures/independent/lord-burns.html
241
+ - spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html
223
242
  - spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html
243
+ - spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain-2018.html
244
+ - spec/fixtures/washingtonpost/israeli-ambassador.html
245
+ - spec/fixtures/washingtonpost/trump-kim-summit.html
246
+ - spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html
247
+ - spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery-2018.html
224
248
  - spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html
225
249
  - spec/fixtures/theintercept/canada-proclaiming-war-12-years-shocked-someone-attacked-soldiers.html
250
+ - spec/fixtures/theintercept/pentagon-missionary.html
251
+ - spec/fixtures/bbc_news/31014941.html
252
+ - spec/fixtures/bbc_news/6072486.stm.html
253
+ - spec/fixtures/bbc_news/13293006.html
254
+ - spec/fixtures/bbc_news/10341015.stm.html
255
+ - spec/fixtures/bbc_news/21528631.html
256
+ - spec/fixtures/bbc_news/20230333.stm.html
257
+ - spec/fixtures/bbc_news/8011268.stm.html
258
+ - spec/fixtures/bbc_news/8040164.stm.html
259
+ - spec/fixtures/bbc_news/10249066.stm.html
260
+ - spec/fixtures/bbc_news/19957138.stm.html
261
+ - spec/fixtures/bbc_news/32275608.html
262
+ - spec/fixtures/bbc_news/8029015.stm.html
263
+ - spec/fixtures/bbc_news/32271505.html
264
+ - spec/fixtures/bbc_news/11125504.html
265
+ - spec/fixtures/bbc_news/7745137.stm.html
266
+ - spec/fixtures/bbc_news/12921632.html
267
+ - spec/fixtures/bbc_news/8063681.stm.html
268
+ - spec/fixtures/cassette_library/BbcNewsPageParserV4.yml
269
+ - spec/spec_helper.rb
270
+ - spec/spec.opts
271
+ - spec/parsers/new_york_times_page_parser_spec.rb
272
+ - spec/parsers/the_intercept_page_parser_spec.rb
273
+ - spec/parsers/rt_page_parser_spec.rb
226
274
  - spec/parsers/bbc_news_page_spec.rb
227
275
  - spec/parsers/guardian_page_spec.rb
228
- - spec/parsers/new_york_times_page_parser_spec.rb
229
276
  - spec/parsers/independent_page_parser_spec.rb
230
- - spec/parsers/the_intercept_page_parser_spec.rb
231
277
  - spec/parsers/washingtonpost_page_parser_spec.rb
232
- - spec/parser_factory_spec.rb
233
- - spec/spec.opts
234
- - spec/spec_helper.rb
235
- - spec/base_parser_spec.rb
metadata.gz.sig CHANGED
Binary file