web-page-parser 0.25 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +1 -0
- data.tar.gz.sig +0 -0
- data/README.rdoc +5 -0
- data/lib/web-page-parser.rb +31 -0
- data/lib/web-page-parser/base_parser.rb +92 -42
- data/lib/web-page-parser/http.rb +63 -0
- data/lib/web-page-parser/parser_factory.rb +0 -1
- data/lib/web-page-parser/parsers/bbc_news_page_parser.rb +72 -9
- data/lib/web-page-parser/parsers/guardian_page_parser.rb +51 -11
- data/lib/web-page-parser/parsers/independent_page_parser.rb +56 -0
- data/lib/web-page-parser/parsers/new_york_times_page_parser.rb +108 -0
- data/lib/web-page-parser/parsers/washingtonpost_page_parser.rb +59 -0
- data/spec/base_parser_spec.rb +24 -8
- data/spec/fixtures/bbc_news/19957138.stm.html +1974 -0
- data/spec/fixtures/bbc_news/20230333.stm.html +2529 -0
- data/spec/fixtures/bbc_news/21528631.html +2021 -0
- data/spec/fixtures/bbc_news/8040164.stm.html +3095 -0
- data/spec/fixtures/cassette_library/BbcNewsPageParserV4.yml +1743 -0
- data/spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html +4713 -0
- data/spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html +4371 -0
- data/spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html +4150 -0
- data/spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html +4401 -0
- data/spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html +4454 -0
- data/spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html +4455 -0
- data/spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html +4368 -0
- data/spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html +919 -0
- data/spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html +328 -0
- data/spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html +1164 -0
- data/spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html +1 -0
- data/spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html +3664 -0
- data/spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html +3729 -0
- data/spec/parser_factory_spec.rb +3 -3
- data/spec/parsers/bbc_news_page_spec.rb +223 -3
- data/spec/parsers/guardian_page_spec.rb +157 -4
- data/spec/parsers/independent_page_parser_spec.rb +152 -0
- data/spec/parsers/new_york_times_page_parser_spec.rb +190 -0
- data/spec/parsers/washingtonpost_page_parser_spec.rb +114 -0
- data/spec/spec_helper.rb +5 -0
- metadata +167 -59
- metadata.gz.sig +2 -0
data/spec/spec_helper.rb
ADDED
metadata
CHANGED
@@ -1,50 +1,111 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web-page-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
5
|
-
prerelease:
|
4
|
+
version: 1.0.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- John Leach
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
|
-
cert_chain:
|
12
|
-
|
10
|
+
cert_chain:
|
11
|
+
- |
|
12
|
+
-----BEGIN CERTIFICATE-----
|
13
|
+
MIIDmjCCAoKgAwIBAgIBATANBgkqhkiG9w0BAQUFADBSMQ0wCwYDVQQDDARqb2hu
|
14
|
+
MRkwFwYKCZImiZPyLGQBGRYJam9obmxlYWNoMRIwEAYKCZImiZPyLGQBGRYCY28x
|
15
|
+
EjAQBgoJkiaJk/IsZAEZFgJ1azAeFw0xNDEwMjUxNzAyMDBaFw0xNTEwMjUxNzAy
|
16
|
+
MDBaMFIxDTALBgNVBAMMBGpvaG4xGTAXBgoJkiaJk/IsZAEZFglqb2hubGVhY2gx
|
17
|
+
EjAQBgoJkiaJk/IsZAEZFgJjbzESMBAGCgmSJomT8ixkARkWAnVrMIIBIjANBgkq
|
18
|
+
hkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA49t9Dck0zPjDiEuRLTEsoXVQEZECcFtu
|
19
|
+
FU2j6ZFGd/twm4dlZ2qHYs5GYtpSwlfJYhAOc2a9cHz1KKSevGaPRxpsFhKEZ+Yj
|
20
|
+
5R8y4VZCY1Rx7tyX2PYdWBNaeTNPIACdW4HNg2/n1bbhu4LkQ+PYBQb6bbeFnzTx
|
21
|
+
dl2ZLvhwSRUbl7aIiYyENbpOmPKCL1ReJUkQn+1Kyq76ZMY6pG6iSeeZvDtKZKqd
|
22
|
+
MX4bWAIBeT6mUv/jhIDkJgj+JO11v3wbhojAcVHInGnyCQ7dLn3hurlLfII4SiLT
|
23
|
+
foOh2i2OY5ZTG5PoPEGMiagBWAUmQUA+Yc6gnfpjrX/aFG/aa6T2+wIDAQABo3sw
|
24
|
+
eTAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUqd/WZnE4x+pZ+DS1
|
25
|
+
+H3vH+/gvfkwHwYDVR0RBBgwFoEUam9obkBqb2hubGVhY2guY28udWswHwYDVR0S
|
26
|
+
BBgwFoEUam9obkBqb2hubGVhY2guY28udWswDQYJKoZIhvcNAQEFBQADggEBABG0
|
27
|
+
Avvj8mNb+0drmLLFLMLck8oEcpzONbVG6A3XWrkUkTsdsw8VB848QuFg3gR+3ReU
|
28
|
+
C98Bm+8L1zYVkfhTzHNJ8Y9HGC+8eEXoMQw1C2jGBcN+i4G+eylOBv+PTJX3UU8r
|
29
|
+
r7Tb7QqD7tPjy7TS91OvyImc7Jixt848nrrs9nWSiEIaVxQqBRRdKANsgFISvvA4
|
30
|
+
CPFEkKZm3GcFRIVu9yQO1LWfsbvbVLhD5HSynklijwo2RroPXlNKi6RXsxKwgtqD
|
31
|
+
MghEyBTNQa+QTUTKQMjYOO3kV+Wuv+iQGaMm/bu2SD+Ov0XUzzAsSfz0ZvrF3fbG
|
32
|
+
jdD4CMQtJNDqDiWuUkg=
|
33
|
+
-----END CERTIFICATE-----
|
34
|
+
date: 2014-10-25 00:00:00.000000000 Z
|
13
35
|
dependencies:
|
14
36
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
37
|
+
name: htmlentities
|
16
38
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
39
|
requirements:
|
19
|
-
- -
|
40
|
+
- - "~>"
|
20
41
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
42
|
+
version: '4.3'
|
22
43
|
type: :runtime
|
23
44
|
prerelease: false
|
24
45
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
46
|
requirements:
|
27
|
-
- -
|
47
|
+
- - "~>"
|
28
48
|
- !ruby/object:Gem::Version
|
29
|
-
version:
|
49
|
+
version: '4.3'
|
30
50
|
- !ruby/object:Gem::Dependency
|
31
|
-
name:
|
51
|
+
name: curb
|
52
|
+
requirement: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - "~>"
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: '0.8'
|
57
|
+
type: :runtime
|
58
|
+
prerelease: false
|
59
|
+
version_requirements: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - "~>"
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0.8'
|
64
|
+
- !ruby/object:Gem::Dependency
|
65
|
+
name: nokogiri
|
32
66
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
67
|
requirements:
|
35
|
-
- -
|
68
|
+
- - "~>"
|
36
69
|
- !ruby/object:Gem::Version
|
37
|
-
version:
|
70
|
+
version: '1.6'
|
38
71
|
type: :runtime
|
39
72
|
prerelease: false
|
40
73
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
74
|
requirements:
|
43
|
-
- -
|
75
|
+
- - "~>"
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '1.6'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: rspec
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - "~>"
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: '2.11'
|
85
|
+
type: :development
|
86
|
+
prerelease: false
|
87
|
+
version_requirements: !ruby/object:Gem::Requirement
|
88
|
+
requirements:
|
89
|
+
- - "~>"
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: '2.11'
|
92
|
+
- !ruby/object:Gem::Dependency
|
93
|
+
name: rake
|
94
|
+
requirement: !ruby/object:Gem::Requirement
|
95
|
+
requirements:
|
96
|
+
- - ">="
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: '0'
|
99
|
+
type: :development
|
100
|
+
prerelease: false
|
101
|
+
version_requirements: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - ">="
|
44
104
|
- !ruby/object:Gem::Version
|
45
|
-
version:
|
46
|
-
description: A Ruby library to parse the content out of web pages
|
47
|
-
pages
|
105
|
+
version: '0'
|
106
|
+
description: A Ruby library to parse the content out of web pages. Currently supports
|
107
|
+
BBC News pages, The Guardian, Independent and New York Times articles. Used by the
|
108
|
+
News Sniffer project. http://www.newssniffer.co.uk
|
48
109
|
email: john@johnleach.co.uk
|
49
110
|
executables: []
|
50
111
|
extensions: []
|
@@ -52,71 +113,118 @@ extra_rdoc_files:
|
|
52
113
|
- README.rdoc
|
53
114
|
- LICENSE
|
54
115
|
files:
|
55
|
-
-
|
116
|
+
- LICENSE
|
117
|
+
- README.rdoc
|
118
|
+
- lib/web-page-parser.rb
|
56
119
|
- lib/web-page-parser/base_parser.rb
|
57
|
-
- lib/web-page-parser/
|
58
|
-
- lib/web-page-parser/
|
120
|
+
- lib/web-page-parser/http.rb
|
121
|
+
- lib/web-page-parser/parser_factory.rb
|
59
122
|
- lib/web-page-parser/parsers/bbc_news_page_parser.rb
|
60
|
-
- lib/web-page-parser.rb
|
61
|
-
-
|
62
|
-
-
|
63
|
-
-
|
123
|
+
- lib/web-page-parser/parsers/guardian_page_parser.rb
|
124
|
+
- lib/web-page-parser/parsers/independent_page_parser.rb
|
125
|
+
- lib/web-page-parser/parsers/new_york_times_page_parser.rb
|
126
|
+
- lib/web-page-parser/parsers/test_page_parser.rb
|
127
|
+
- lib/web-page-parser/parsers/washingtonpost_page_parser.rb
|
64
128
|
- spec/base_parser_spec.rb
|
65
|
-
- spec/parsers/guardian_page_spec.rb
|
66
|
-
- spec/parsers/bbc_news_page_spec.rb
|
67
|
-
- spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
|
68
|
-
- spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
|
69
|
-
- spec/fixtures/bbc_news/10341015.stm.html
|
70
|
-
- spec/fixtures/bbc_news/8029015.stm.html
|
71
|
-
- spec/fixtures/bbc_news/7745137.stm.html
|
72
|
-
- spec/fixtures/bbc_news/8063681.stm.html
|
73
129
|
- spec/fixtures/bbc_news/10249066.stm.html
|
74
|
-
- spec/fixtures/bbc_news/
|
130
|
+
- spec/fixtures/bbc_news/10341015.stm.html
|
131
|
+
- spec/fixtures/bbc_news/11125504.html
|
75
132
|
- spec/fixtures/bbc_news/12921632.html
|
76
133
|
- spec/fixtures/bbc_news/13293006.html
|
77
|
-
- spec/fixtures/bbc_news/
|
134
|
+
- spec/fixtures/bbc_news/19957138.stm.html
|
135
|
+
- spec/fixtures/bbc_news/20230333.stm.html
|
136
|
+
- spec/fixtures/bbc_news/21528631.html
|
78
137
|
- spec/fixtures/bbc_news/6072486.stm.html
|
138
|
+
- spec/fixtures/bbc_news/7745137.stm.html
|
139
|
+
- spec/fixtures/bbc_news/8011268.stm.html
|
140
|
+
- spec/fixtures/bbc_news/8029015.stm.html
|
141
|
+
- spec/fixtures/bbc_news/8040164.stm.html
|
142
|
+
- spec/fixtures/bbc_news/8063681.stm.html
|
143
|
+
- spec/fixtures/cassette_library/BbcNewsPageParserV4.yml
|
144
|
+
- spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html
|
145
|
+
- spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
|
146
|
+
- spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html
|
147
|
+
- spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html
|
148
|
+
- spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
|
149
|
+
- spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html
|
150
|
+
- spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html
|
151
|
+
- spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html
|
152
|
+
- spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html
|
153
|
+
- spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html
|
154
|
+
- spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html
|
155
|
+
- spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html
|
156
|
+
- spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html
|
157
|
+
- spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html
|
158
|
+
- spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html
|
159
|
+
- spec/parser_factory_spec.rb
|
160
|
+
- spec/parsers/bbc_news_page_spec.rb
|
161
|
+
- spec/parsers/guardian_page_spec.rb
|
162
|
+
- spec/parsers/independent_page_parser_spec.rb
|
163
|
+
- spec/parsers/new_york_times_page_parser_spec.rb
|
164
|
+
- spec/parsers/washingtonpost_page_parser_spec.rb
|
79
165
|
- spec/spec.opts
|
80
|
-
|
81
|
-
|
166
|
+
- spec/spec_helper.rb
|
167
|
+
homepage: http://github.com/johnl/web-page-parser
|
168
|
+
licenses:
|
169
|
+
- MIT
|
170
|
+
metadata: {}
|
82
171
|
post_install_message:
|
83
172
|
rdoc_options: []
|
84
173
|
require_paths:
|
85
174
|
- lib
|
86
175
|
required_ruby_version: !ruby/object:Gem::Requirement
|
87
|
-
none: false
|
88
176
|
requirements:
|
89
|
-
- -
|
177
|
+
- - ">="
|
90
178
|
- !ruby/object:Gem::Version
|
91
179
|
version: '0'
|
92
180
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
93
|
-
none: false
|
94
181
|
requirements:
|
95
|
-
- -
|
182
|
+
- - ">="
|
96
183
|
- !ruby/object:Gem::Version
|
97
184
|
version: '0'
|
98
185
|
requirements: []
|
99
|
-
rubyforge_project:
|
100
|
-
rubygems_version:
|
186
|
+
rubyforge_project:
|
187
|
+
rubygems_version: 2.2.2
|
101
188
|
signing_key:
|
102
|
-
specification_version:
|
103
|
-
summary: A parser for web pages
|
189
|
+
specification_version: 4
|
190
|
+
summary: A parser for various news organisation's web pages
|
104
191
|
test_files:
|
105
|
-
- spec/
|
106
|
-
- spec/
|
107
|
-
- spec/
|
108
|
-
- spec/parsers/bbc_news_page_spec.rb
|
109
|
-
- spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
|
192
|
+
- spec/fixtures/new_york_times/show-banned-french-comedian-has-new-one.html
|
193
|
+
- spec/fixtures/new_york_times/the-long-run-gingrich-stuck-to-caustic-path-in-ethics-battles.html
|
194
|
+
- spec/fixtures/new_york_times/khaled-meshal-the-leader-of-hamas-vacates-damascus.html
|
110
195
|
- spec/fixtures/guardian/syria-libya-middle-east-unrest-live.html
|
111
|
-
- spec/fixtures/
|
112
|
-
- spec/fixtures/
|
113
|
-
- spec/fixtures/
|
114
|
-
- spec/fixtures/
|
196
|
+
- spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus.html
|
197
|
+
- spec/fixtures/guardian/anger-grows-rbs-chiefs-bonus-with-explainer.html
|
198
|
+
- spec/fixtures/guardian/nhs-patient-data-available-companies-buy.html
|
199
|
+
- spec/fixtures/guardian/barack-obama-nicki-minaj-mariah-carey.html
|
200
|
+
- spec/fixtures/washingtonpost/sgt-bowe-bergdahls-capture-remains-amystery.html
|
201
|
+
- spec/fixtures/washingtonpost/will-a-bust-follow-the-boom-in-britain.html
|
202
|
+
- spec/fixtures/washingtonpost/pentagon-confirms-al-shabab-leader-killed.html
|
203
|
+
- spec/fixtures/cassette_library/BbcNewsPageParserV4.yml
|
204
|
+
- spec/fixtures/independent/innocent-starving-close-to-death-one-victim-of-the-siege-that-shames-syria-9065538.html
|
205
|
+
- spec/fixtures/independent/david-cameron-set-for-uturn-over-uk-sanctuary-9077647.html
|
206
|
+
- spec/fixtures/independent/belgian-man-who-skipped-100-restaurant-bills-is-killed-9081407.html
|
207
|
+
- spec/fixtures/independent/saudi-authorities-stop-textmessage-tracking-of-women-for-now-9065486.html
|
208
|
+
- spec/fixtures/bbc_news/20230333.stm.html
|
115
209
|
- spec/fixtures/bbc_news/10249066.stm.html
|
116
|
-
- spec/fixtures/bbc_news/8011268.stm.html
|
117
|
-
- spec/fixtures/bbc_news/12921632.html
|
118
210
|
- spec/fixtures/bbc_news/13293006.html
|
211
|
+
- spec/fixtures/bbc_news/7745137.stm.html
|
212
|
+
- spec/fixtures/bbc_news/8029015.stm.html
|
119
213
|
- spec/fixtures/bbc_news/11125504.html
|
214
|
+
- spec/fixtures/bbc_news/8040164.stm.html
|
215
|
+
- spec/fixtures/bbc_news/21528631.html
|
216
|
+
- spec/fixtures/bbc_news/10341015.stm.html
|
217
|
+
- spec/fixtures/bbc_news/8063681.stm.html
|
218
|
+
- spec/fixtures/bbc_news/19957138.stm.html
|
120
219
|
- spec/fixtures/bbc_news/6072486.stm.html
|
220
|
+
- spec/fixtures/bbc_news/8011268.stm.html
|
221
|
+
- spec/fixtures/bbc_news/12921632.html
|
222
|
+
- spec/base_parser_spec.rb
|
223
|
+
- spec/parsers/washingtonpost_page_parser_spec.rb
|
224
|
+
- spec/parsers/bbc_news_page_spec.rb
|
225
|
+
- spec/parsers/guardian_page_spec.rb
|
226
|
+
- spec/parsers/independent_page_parser_spec.rb
|
227
|
+
- spec/parsers/new_york_times_page_parser_spec.rb
|
121
228
|
- spec/spec.opts
|
122
|
-
|
229
|
+
- spec/spec_helper.rb
|
230
|
+
- spec/parser_factory_spec.rb
|
metadata.gz.sig
ADDED