web-page-parser 0.25 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. checksums.yaml.gz.sig +1 -0
  3. data.tar.gz.sig +0 -0
  4. data/README.rdoc +5 -0
  5. data/lib/web-page-parser.rb +31 -0
  6. data/lib/web-page-parser/base_parser.rb +92 -42
  7. data/lib/web-page-parser/http.rb +63 -0
  8. data/lib/web-page-parser/parser_factory.rb +0 -1
  9. data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +72 -9
  10. data/lib/web-page-parser/parsers/guardian_page_parser.rb +51 -11
  11. data/lib/web-page-parser/parsers/independent_page_parser.rb +56 -0
  12. data/lib/web-page-parser/parsers/new_york_times_page_parser.rb +108 -0
  13. data/lib/web-page-parser/parsers/washingtonpost_page_parser.rb +59 -0
  14. data/spec/base_parser_spec.rb +24 -8
  15. data/spec/fixtures/bbc_news/19957138.stm.html +1974 -0
  16. data/spec/fixtures/bbc_news/20230333.stm.html +2529 -0
  17. data/spec/fixtures/bbc_news/21528631.html +2021 -0
  18. data/spec/fixtures/bbc_news/8040164.stm.html +3095 -0
  19. data/spec/fixtures/cassette_library/BbcNewsPageParserV4.yml +1743 -0
  20. data/spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html +4713 -0
  21. data/spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html +4371 -0
  22. data/spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html +4150 -0
  23. data/spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html +4401 -0
  24. data/spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html +4454 -0
  25. data/spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html +4455 -0
  26. data/spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html +4368 -0
  27. data/spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html +919 -0
  28. data/spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html +328 -0
  29. data/spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html +1164 -0
  30. data/spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html +1 -0
  31. data/spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html +3664 -0
  32. data/spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html +3729 -0
  33. data/spec/parser_factory_spec.rb +3 -3
  34. data/spec/parsers/bbc_news_page_spec.rb +223 -3
  35. data/spec/parsers/guardian_page_spec.rb +157 -4
  36. data/spec/parsers/independent_page_parser_spec.rb +152 -0
  37. data/spec/parsers/new_york_times_page_parser_spec.rb +190 -0
  38. data/spec/parsers/washingtonpost_page_parser_spec.rb +114 -0
  39. data/spec/spec_helper.rb +5 -0
  40. metadata +167 -59
  41. metadata.gz.sig +2 -0
@@ -0,0 +1,5 @@
1
+ $:.unshift File.join(File.dirname(__FILE__), '../lib')
2
+ $:.unshift File.join(File.dirname(__FILE__), '../spec')
3
+
4
+ require 'web-page-parser'
5
+
metadata CHANGED
@@ -1,50 +1,111 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web-page-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.25'
5
- prerelease:
4
+ version: 1.0.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - John Leach
9
8
  autorequire:
10
9
  bindir: bin
11
- cert_chain: []
12
- date: 2012-06-05 00:00:00.000000000 Z
10
+ cert_chain:
11
+ - |
12
+ -----BEGIN CERTIFICATE-----
13
+ MIIDmjCCAoKgAwIBAgIBATANBgkqhkiG9w0BAQUFADBSMQ0wCwYDVQQDDARqb2hu
14
+ MRkwFwYKCZImiZPyLGQBGRYJam9obmxlYWNoMRIwEAYKCZImiZPyLGQBGRYCY28x
15
+ EjAQBgoJkiaJk/IsZAEZFgJ1azAeFw0xNDEwMjUxNzAyMDBaFw0xNTEwMjUxNzAy
16
+ MDBaMFIxDTALBgNVBAMMBGpvaG4xGTAXBgoJkiaJk/IsZAEZFglqb2hubGVhY2gx
17
+ EjAQBgoJkiaJk/IsZAEZFgJjbzESMBAGCgmSJomT8ixkARkWAnVrMIIBIjANBgkq
18
+ hkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA49t9Dck0zPjDiEuRLTEsoXVQEZECcFtu
19
+ FU2j6ZFGd/twm4dlZ2qHYs5GYtpSwlfJYhAOc2a9cHz1KKSevGaPRxpsFhKEZ+Yj
20
+ 5R8y4VZCY1Rx7tyX2PYdWBNaeTNPIACdW4HNg2/n1bbhu4LkQ+PYBQb6bbeFnzTx
21
+ dl2ZLvhwSRUbl7aIiYyENbpOmPKCL1ReJUkQn+1Kyq76ZMY6pG6iSeeZvDtKZKqd
22
+ MX4bWAIBeT6mUv/jhIDkJgj+JO11v3wbhojAcVHInGnyCQ7dLn3hurlLfII4SiLT
23
+ foOh2i2OY5ZTG5PoPEGMiagBWAUmQUA+Yc6gnfpjrX/aFG/aa6T2+wIDAQABo3sw
24
+ eTAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUqd/WZnE4x+pZ+DS1
25
+ +H3vH+/gvfkwHwYDVR0RBBgwFoEUam9obkBqb2hubGVhY2guY28udWswHwYDVR0S
26
+ BBgwFoEUam9obkBqb2hubGVhY2guY28udWswDQYJKoZIhvcNAQEFBQADggEBABG0
27
+ Avvj8mNb+0drmLLFLMLck8oEcpzONbVG6A3XWrkUkTsdsw8VB848QuFg3gR+3ReU
28
+ C98Bm+8L1zYVkfhTzHNJ8Y9HGC+8eEXoMQw1C2jGBcN+i4G+eylOBv+PTJX3UU8r
29
+ r7Tb7QqD7tPjy7TS91OvyImc7Jixt848nrrs9nWSiEIaVxQqBRRdKANsgFISvvA4
30
+ CPFEkKZm3GcFRIVu9yQO1LWfsbvbVLhD5HSynklijwo2RroPXlNKi6RXsxKwgtqD
31
+ MghEyBTNQa+QTUTKQMjYOO3kV+Wuv+iQGaMm/bu2SD+Ov0XUzzAsSfz0ZvrF3fbG
32
+ jdD4CMQtJNDqDiWuUkg=
33
+ -----END CERTIFICATE-----
34
+ date: 2014-10-25 00:00:00.000000000 Z
13
35
  dependencies:
14
36
  - !ruby/object:Gem::Dependency
15
- name: oniguruma
37
+ name: htmlentities
16
38
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
39
  requirements:
19
- - - ! '>='
40
+ - - "~>"
20
41
  - !ruby/object:Gem::Version
21
- version: 1.1.0
42
+ version: '4.3'
22
43
  type: :runtime
23
44
  prerelease: false
24
45
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
46
  requirements:
27
- - - ! '>='
47
+ - - "~>"
28
48
  - !ruby/object:Gem::Version
29
- version: 1.1.0
49
+ version: '4.3'
30
50
  - !ruby/object:Gem::Dependency
31
- name: htmlentities
51
+ name: curb
52
+ requirement: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - "~>"
55
+ - !ruby/object:Gem::Version
56
+ version: '0.8'
57
+ type: :runtime
58
+ prerelease: false
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - "~>"
62
+ - !ruby/object:Gem::Version
63
+ version: '0.8'
64
+ - !ruby/object:Gem::Dependency
65
+ name: nokogiri
32
66
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
67
  requirements:
35
- - - ! '>='
68
+ - - "~>"
36
69
  - !ruby/object:Gem::Version
37
- version: 4.0.0
70
+ version: '1.6'
38
71
  type: :runtime
39
72
  prerelease: false
40
73
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
74
  requirements:
43
- - - ! '>='
75
+ - - "~>"
76
+ - !ruby/object:Gem::Version
77
+ version: '1.6'
78
+ - !ruby/object:Gem::Dependency
79
+ name: rspec
80
+ requirement: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - "~>"
83
+ - !ruby/object:Gem::Version
84
+ version: '2.11'
85
+ type: :development
86
+ prerelease: false
87
+ version_requirements: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - "~>"
90
+ - !ruby/object:Gem::Version
91
+ version: '2.11'
92
+ - !ruby/object:Gem::Dependency
93
+ name: rake
94
+ requirement: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ version: '0'
99
+ type: :development
100
+ prerelease: false
101
+ version_requirements: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - ">="
44
104
  - !ruby/object:Gem::Version
45
- version: 4.0.0
46
- description: A Ruby library to parse the content out of web pages, such as BBC News
47
- pages and Guardian articles. Used by the News Sniffer project.
105
+ version: '0'
106
+ description: A Ruby library to parse the content out of web pages. Currently supports
107
+ BBC News pages, The Guardian, Independent and New York Times articles. Used by the
108
+ News Sniffer project. http://www.newssniffer.co.uk
48
109
  email: john@johnleach.co.uk
49
110
  executables: []
50
111
  extensions: []
@@ -52,71 +113,118 @@ extra_rdoc_files:
52
113
  - README.rdoc
53
114
  - LICENSE
54
115
  files:
55
- - lib/web-page-parser/parser_factory.rb
116
+ - LICENSE
117
+ - README.rdoc
118
+ - lib/web-page-parser.rb
56
119
  - lib/web-page-parser/base_parser.rb
57
- - lib/web-page-parser/parsers/test_page_parser.rb
58
- - lib/web-page-parser/parsers/guardian_page_parser.rb
120
+ - lib/web-page-parser/http.rb
121
+ - lib/web-page-parser/parser_factory.rb
59
122
  - lib/web-page-parser/parsers/bbc_news_page_parser.rb
60
- - lib/web-page-parser.rb
61
- - README.rdoc
62
- - LICENSE
63
- - spec/parser_factory_spec.rb
123
+ - lib/web-page-parser/parsers/guardian_page_parser.rb
124
+ - lib/web-page-parser/parsers/independent_page_parser.rb
125
+ - lib/web-page-parser/parsers/new_york_times_page_parser.rb
126
+ - lib/web-page-parser/parsers/test_page_parser.rb
127
+ - lib/web-page-parser/parsers/washingtonpost_page_parser.rb
64
128
  - spec/base_parser_spec.rb
65
- - spec/parsers/guardian_page_spec.rb
66
- - spec/parsers/bbc_news_page_spec.rb
67
- - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
68
- - spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
69
- - spec/fixtures/bbc_news/10341015.stm.html
70
- - spec/fixtures/bbc_news/8029015.stm.html
71
- - spec/fixtures/bbc_news/7745137.stm.html
72
- - spec/fixtures/bbc_news/8063681.stm.html
73
129
  - spec/fixtures/bbc_news/10249066.stm.html
74
- - spec/fixtures/bbc_news/8011268.stm.html
130
+ - spec/fixtures/bbc_news/10341015.stm.html
131
+ - spec/fixtures/bbc_news/11125504.html
75
132
  - spec/fixtures/bbc_news/12921632.html
76
133
  - spec/fixtures/bbc_news/13293006.html
77
- - spec/fixtures/bbc_news/11125504.html
134
+ - spec/fixtures/bbc_news/19957138.stm.html
135
+ - spec/fixtures/bbc_news/20230333.stm.html
136
+ - spec/fixtures/bbc_news/21528631.html
78
137
  - spec/fixtures/bbc_news/6072486.stm.html
138
+ - spec/fixtures/bbc_news/7745137.stm.html
139
+ - spec/fixtures/bbc_news/8011268.stm.html
140
+ - spec/fixtures/bbc_news/8029015.stm.html
141
+ - spec/fixtures/bbc_news/8040164.stm.html
142
+ - spec/fixtures/bbc_news/8063681.stm.html
143
+ - spec/fixtures/cassette_library/BbcNewsPageParserV4.yml
144
+ - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html
145
+ - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
146
+ - spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html
147
+ - spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html
148
+ - spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
149
+ - spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html
150
+ - spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html
151
+ - spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html
152
+ - spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html
153
+ - spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html
154
+ - spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html
155
+ - spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html
156
+ - spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html
157
+ - spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html
158
+ - spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html
159
+ - spec/parser_factory_spec.rb
160
+ - spec/parsers/bbc_news_page_spec.rb
161
+ - spec/parsers/guardian_page_spec.rb
162
+ - spec/parsers/independent_page_parser_spec.rb
163
+ - spec/parsers/new_york_times_page_parser_spec.rb
164
+ - spec/parsers/washingtonpost_page_parser_spec.rb
79
165
  - spec/spec.opts
80
- homepage: http://github.com/johnl/web-page-parser/tree/master
81
- licenses: []
166
+ - spec/spec_helper.rb
167
+ homepage: http://github.com/johnl/web-page-parser
168
+ licenses:
169
+ - MIT
170
+ metadata: {}
82
171
  post_install_message:
83
172
  rdoc_options: []
84
173
  require_paths:
85
174
  - lib
86
175
  required_ruby_version: !ruby/object:Gem::Requirement
87
- none: false
88
176
  requirements:
89
- - - ! '>='
177
+ - - ">="
90
178
  - !ruby/object:Gem::Version
91
179
  version: '0'
92
180
  required_rubygems_version: !ruby/object:Gem::Requirement
93
- none: false
94
181
  requirements:
95
- - - ! '>='
182
+ - - ">="
96
183
  - !ruby/object:Gem::Version
97
184
  version: '0'
98
185
  requirements: []
99
- rubyforge_project: web-page-parser
100
- rubygems_version: 1.8.23
186
+ rubyforge_project:
187
+ rubygems_version: 2.2.2
101
188
  signing_key:
102
- specification_version: 3
103
- summary: A parser for web pages
189
+ specification_version: 4
190
+ summary: A parser for various news organisation's web pages
104
191
  test_files:
105
- - spec/parser_factory_spec.rb
106
- - spec/base_parser_spec.rb
107
- - spec/parsers/guardian_page_spec.rb
108
- - spec/parsers/bbc_news_page_spec.rb
109
- - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
192
+ - spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html
193
+ - spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html
194
+ - spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html
110
195
  - spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
111
- - spec/fixtures/bbc_news/10341015.stm.html
112
- - spec/fixtures/bbc_news/8029015.stm.html
113
- - spec/fixtures/bbc_news/7745137.stm.html
114
- - spec/fixtures/bbc_news/8063681.stm.html
196
+ - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
197
+ - spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html
198
+ - spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html
199
+ - spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html
200
+ - spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html
201
+ - spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html
202
+ - spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html
203
+ - spec/fixtures/cassette_library/BbcNewsPageParserV4.yml
204
+ - spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html
205
+ - spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html
206
+ - spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html
207
+ - spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html
208
+ - spec/fixtures/bbc_news/20230333.stm.html
115
209
  - spec/fixtures/bbc_news/10249066.stm.html
116
- - spec/fixtures/bbc_news/8011268.stm.html
117
- - spec/fixtures/bbc_news/12921632.html
118
210
  - spec/fixtures/bbc_news/13293006.html
211
+ - spec/fixtures/bbc_news/7745137.stm.html
212
+ - spec/fixtures/bbc_news/8029015.stm.html
119
213
  - spec/fixtures/bbc_news/11125504.html
214
+ - spec/fixtures/bbc_news/8040164.stm.html
215
+ - spec/fixtures/bbc_news/21528631.html
216
+ - spec/fixtures/bbc_news/10341015.stm.html
217
+ - spec/fixtures/bbc_news/8063681.stm.html
218
+ - spec/fixtures/bbc_news/19957138.stm.html
120
219
  - spec/fixtures/bbc_news/6072486.stm.html
220
+ - spec/fixtures/bbc_news/8011268.stm.html
221
+ - spec/fixtures/bbc_news/12921632.html
222
+ - spec/base_parser_spec.rb
223
+ - spec/parsers/washingtonpost_page_parser_spec.rb
224
+ - spec/parsers/bbc_news_page_spec.rb
225
+ - spec/parsers/guardian_page_spec.rb
226
+ - spec/parsers/independent_page_parser_spec.rb
227
+ - spec/parsers/new_york_times_page_parser_spec.rb
121
228
  - spec/spec.opts
122
- has_rdoc: true
229
+ - spec/spec_helper.rb
230
+ - spec/parser_factory_spec.rb
@@ -0,0 +1,2 @@
1
+ um��^~ c-6�q�!,+�[���ʷ�����̹���`Qݎ�5�a��l%�t�k�}5C
2
+ �,�l��z@��ؾ:�?C�%�G�x�]�+"�バ�����g֫_���{ě��$��t!z �l��Ҍ��=�no�Ui9����_W�IpH�&G��:��4��FLCM�X�}�}�Fdǩ�s�7w�zGcn��ʥ�j \���`��@ӝ��FC]߾s9�"��G�~z�