GDNewsScraper 3.0.4 → 3.0.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 692d674f129613f9b5fa1b379abf86c588d7f3f1
-  data.tar.gz: 3653b5992703ae9e7027e75b07ef0ed7818313a9
+  metadata.gz: c37e3bbf8420be9d2bf182d091eb5b8713e46679
+  data.tar.gz: c20ce1f2ee2b57757bd0d9d87b3bdc34fd751cf7
 SHA512:
-  metadata.gz: 580d4967034bed31b74e80b72993fda91614f6b8bb91c2c3f924590448e6f1c22c490ee6292722528434b619ed9f8a8c0625eb24c8dcaa767c4176adebebe184
-  data.tar.gz: fe10ea48908f0e012a14a78de2b340bb6ac21956475d7efdb4235a6d3bf0fc7e39e7a8a812c9cbb0e4844242d3b8ae63cff358b989e4a4c2fa6811e19e0105c6
+  metadata.gz: 74639942afca8966f6602c642a89cb31979e896a4656c8ed6dbf20d5e8764a642ab62f6a05566265b75531f685c542d1ff0daaa8ce0f474e3b882355a1f9afc3
+  data.tar.gz: 1783aa15b5339131027ecf0aa0d2776d4959664157cee1305eca2acf206e43551f9079e4a60a322fe7a99f95f1513c302d8e13cf9529c95a97aa6e2476215b49

lib/GDNewsScraper/scrapers/polygon_com/news.rb CHANGED
@@ -7,49 +7,31 @@ module GDNewsScraper::Scrapers
   "User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
   }
 
- STREAM_URI ||= 'https://www.polygon.com'
+ URL ||= 'https://www.polygon.com'
 
  WHITELIST ||= {
  default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'figure', 'blockquote', 'ul', 'ol'],
  inner: ['strong', 'em', 'li']
  }
 
- DOM ||= {
- article: {
- wrapper: '.c-compact-river',
- container: '.c-compact-river__entry',
- inner_container: '.c-entry-box--compact',
- inner_container_video: '.c-entry-box--compact--video',
- title: '.c-entry-box--compact__title',
- cover: '.c-entry-box--compact__image',
- meta: '.c-byline'
- },
-
- pagination: {
- previous: '.c-pagination__prev',
- info: '.c-pagination__text',
- next: '.c-pagination__next'
- }
- }
-
  class News
  attr_accessor :stream
 
- def initialize(offset = 0)
+ def initialize(offset = nil)
  unless offset.nil?
- uri = "#{ GDNewsScraper::Scrapers::PolygonCOM::STREAM_URI }/news/archives/#{ offset }"
+ uri = "#{ GDNewsScraper::Scrapers::PolygonCOM::URL }/news/archives/#{ offset }"
 
  @page = Nokogiri::HTML(open(uri, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
  @stream = Hash.new
 
  stream[:stream] = Hash.new
- stream[:stream][:size] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:info]).text.split[0].gsub(/\D/, '').to_i
- stream[:stream][:pages] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:info]).text.split[6].gsub(/\D/, '').to_i
- stream[:stream][:prev] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:previous])&.first&.attr('href')&.split('/')&.last.to_i
- stream[:stream][:next] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:next])&.first&.attr('href')&.split('/')&.last.to_i
+ stream[:stream][:size] = @page.at('.c-pagination__text').text.split.first.to_num
+ stream[:stream][:pages] = @page.at('.c-pagination__text').text.split.last.to_num
+ stream[:stream][:prev] = @page.at('.c-pagination__prev')&.attr('href')&.split('/')&.last.to_i
+ stream[:stream][:next] = @page.at('.c-pagination__next')&.attr('href')&.split('/')&.last.to_i
 
  stream[:feed] = Hash.new
- stream[:feed][:url] = GDNewsScraper::Scrapers::PolygonCOM::STREAM_URI
+ stream[:feed][:url] = GDNewsScraper::Scrapers::PolygonCOM::URL
  stream[:feed][:source] = 'polygon'
  stream[:feed][:label] = 'Polygon'
 
@@ -60,8 +42,8 @@ module GDNewsScraper::Scrapers
  end
 
  def perform
- @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:container]).each do |article|
- stream[:articles] << parse(article)
+ @page.css('.c-compact-river__entry').first(2).each do |article|
+ stream[:articles].push(parse(article))
  end
  end
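
Note on the two hunks above: with the default offset now nil, News.new without an argument no longer performs any network call (the whole setup sits behind unless offset.nil?), while an integer offset still fetches the corresponding /news/archives page and fills the pagination metadata via the new String#to_num helper added further down. A minimal usage sketch, not taken from the gem's docs, assuming the gem is loaded and archive offset 1 exists:

    require 'GDNewsScraper'

    # Fetches https://www.polygon.com/news/archives/1 and fills stream[:stream] / stream[:feed]
    news = GDNewsScraper::Scrapers::PolygonCOM::News.new(1)
    news.perform                  # parses the first two index entries into stream[:articles]
    news.stream[:stream][:pages]  # total page count, extracted with String#to_num

    # Skips the network call entirely; useful together with the URL-based News#parse shown below
    bare = GDNewsScraper::Scrapers::PolygonCOM::News.new
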
@@ -72,38 +54,74 @@ module GDNewsScraper::Scrapers
  def parse(article)
  pulse = Hash.new
 
- is_a_video = !article.at(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:inner_container_video]).nil?
-
- key = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:inner_container]).first.attr('data-chorus-optimize-id').to_i
- url = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:title]).children.first.attr('href')
- title = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:title]).children.first.text
-
- pulse[:id] = key
- pulse[:hash] = ::Base64.encode64("#{ title } - #{ key }")
-
- begin
- pulse[:cover] = article.children.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:cover]).children.children.first.attr('src')
- rescue
- pulse[:cover] = nil
+ # This allows the Parser to get its data from the Index page, when the
+ # article is a Nokogiri::XML::Element, or from the Article page when the
+ # article is a URL.
+ #
+ # Passing a URL is mainly for debugging in case an Article fails to
+ # parse and should only be used as such.
+ #
+ if article.is_a?(String)
+ begin
+ article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
+
+ is_a_video = article_page.at('.c-video-embed').nil?
+
+ key = article_page.at('span[data-content-admin-id]').attr('data-content-admin-id').to_i
+ url = article
+ title = strip(article_page.css('.c-entry-hero').at('.c-page-title'))
+ cover = (is_a_video ? nil : article_page.css('.l-col__main').at('.e-image__image').attr('data-original'))
+ author = strip(article_page.css('.c-entry-hero').at('.c-byline').css('.c-byline__item > a').children[0])
+
+ begin
+ article_date = strip(article_page.css('.c-entry-hero').at('.c-byline').css('time.c-byline__item'))
+ parsed_date = DateTime.parse(article_date)
+
+ date = parsed_date.to_time.to_i
+
+ # Never failed so not entirely sure what to rescue from, but with
+ # dates it's always risky not to rescue
+ #
+ # TODO: When it fails, find out why and rescue from that instead
+ # of rescuing from 'everything'.
+ #
+ rescue
+ date = nil
+ end
+ rescue TypeError
+ raise ArgumentError.new('Invalid URL')
+ end
+ elsif article.is_a?(Nokogiri::XML::Element)
+ is_a_video = !article.at('.c-entry-box--compact--video').nil?
+
+ key = article.at('.c-entry-box--compact--article').attr('data-chorus-optimize-id').to_i
+ url = article.at('.c-entry-box--compact__title').at('> a').attr('href')
+ title = strip(article.at('.c-entry-box--compact__title'))
+ cover = (article.at('.c-entry-box--compact__image').at('noscript').at('img').attr('src') rescue nil)
+ author = strip(article.at('.c-byline').css('.c-byline__item > a').children[0])
+ date = JSON.parse(article.at('.c-byline').attr('data-cdata'))['timestamp'].to_i
+ else
+ raise ArgumentError.new("Make sure the 'article' argument is either a Hash containing the article's initial metadata or a String which is the article's URL")
  end
-
+
+ pulse[:id] = key
+ pulse[:hash] = ::Base64.encode64("#{ title } - #{ key }")
+ pulse[:cover] = cover
  pulse[:url] = url
  pulse[:title] = title
- pulse[:author] = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:meta]).first.children[1].children[1]&.text
- pulse[:date] = JSON.parse(article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:meta]).first.attr('data-cdata'))['timestamp'].to_i
+ pulse[:author] = author
+ pulse[:date] = date
  pulse[:content] = parse_article_body(url, is_a_video)
  pulse[:tags] = title.downcase.split
 
  return pulse
  rescue => e
- "There was a problem while parsing Article for '#{ title }' => #{ e }"
+ "There was a problem while parsing this Article: #{ e }"
  end
 
- private
-
  def parse_article_body(article_url, is_a_video = false)
  article_page = Nokogiri::HTML(open(article_url, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
- article_container = article_page.css('.c-entry-content')
+ article_container = article_page.at('.c-entry-content')
 
  article_body = {
  galleries: { },
@@ -118,7 +136,7 @@ module GDNewsScraper::Scrapers
 
  if is_a_video
  iframe = article_page.at('.c-video-embed--media').at('iframe')
- iframe_id = random_string
+ iframe_id = unique_id
 
  article_body[:videos][iframe_id] = {}
  article_body[:videos][iframe_id][:url] = iframe.attr('src')
@@ -127,148 +145,189 @@ module GDNewsScraper::Scrapers
  end
 
  article_container.children.each do |node|
- if node.name == 'div'
+ content = node.content.strip.empty?
+ text = node.text.strip.empty?
+ attributes = node.attributes.empty?
+ children = node.children.empty?
 
- # Check to see if the div contains a embeded video
- #
- iframe = node.at('iframe')
+ if content && text && attributes && children
+ node.remove
+ else
+ if node.name == 'div'
 
- if iframe # YouTube videos
- iframe_id = random_string
+ # Check to see if the div contains an embedded video
+ #
+ iframe = node.at('iframe')
 
- article_body[:videos][iframe_id] = {}
- article_body[:videos][iframe_id][:url] = iframe.attr('src')
+ if iframe # YouTube videos
+ iframe_id = unique_id
 
- article_body[:body] << iframe.replace("{{video:#{ iframe_id }}}").to_html
- end
+ article_body[:videos][iframe_id] = {}
+ article_body[:videos][iframe_id][:url] = iframe.attr('src')
 
- # Check to see if the div contains a gallery
- #
- gallery = node.at('.c-image-gallery')
+ article_body[:body] << iframe.replace("{{video:#{ iframe_id }}}").to_html
+ end
 
- if gallery
- gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')
+ # Check to see if the Article has a video by Polygon, which is
+ # embedded differently than a YouTube video.
+ #
+ polygon_video = node.attributes['data-volume-uuid']
+
+ unless polygon_video.nil?
+ id = unique_id
+
+ article_body[:videos][id] = {}
+ article_body[:videos][id][:label] = node.attr('data-analytics-label').split('|').first.strip
+ article_body[:videos][id][:url] = "https://volume.vox-cdn.com/embed/#{ node.attr('data-volume-uuid') }"
+
+ article_body[:body] << node.replace("{{video:#{ id }}}").to_html
+ end
+
+ # Check to see if the div contains a gallery
+ #
+ gallery = node.at('.c-image-gallery')
+
+ if gallery
+ gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')
 
- gallery_id = random_string
- article_body[:galleries][gallery_id] = []
+ gallery_id = unique_id
+ article_body[:galleries][gallery_id] = []
 
- gallery_container.children.children.each do |image_container|
- image = image_container.at('a')
+ gallery_container.children.children.each do |image_container|
+ image = image_container.at('a')
 
- if image
- article_body[:galleries][gallery_id] << image.attr('href')
+ if image
+ article_body[:galleries][gallery_id] << image.attr('href')
+ end
  end
+
+ article_body[:body] << gallery.replace("{{gallery:#{ gallery_id }}}").to_html
  end
 
- article_body[:body] << gallery.replace("{{gallery:#{ gallery_id }}}").to_html
- end
+ twitdget = node.at('.twitter-tweet')
 
- twitdget = node.at('.twitter-tweet')
+ if twitdget
+ article_body[:body] << twitdget.to_html
+ end
 
- if twitdget
- article_body[:body] << twitdget.to_html
+ redditget = node.at('.reddit-card')
+
+ if redditget
+ article_body[:body] << redditget.to_html
+ end
  end
 
- redditget = node.at('.reddit-card')
+ # First ensure the node is an actual element. This filters out stray non-element nodes
+ #
+ # => node.element?
+ #
+ # Secondly, ensure the node is what we actually want. We don't want <div>'s
+ # which are usually used for placing inline advertisements or content specific
+ # only to that website
+ #
+ # => WHITELIST[:default].include?(node.name)
+ #
+ if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
+ case node.name
+ when 'figure'
 
- if redditget
- article_body[:body] << redditget.to_html
- end
- end
+ image = node.at('.e-image__image')
+ image_url = image.attr('data-original')
 
- # First ensure the node is an actual element. This removes random HTML elements
- #
- # => node.element?
- #
- # Secondly, ensure the node is what we actual want. We don't want <div>'s
- # which are usualy used for placing inline advertisments or content specific
- # only to that website
- #
- # => WHITELIST[:default].include?(node.name)
- #
- if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
- case node.name
- when 'figure'
-
- image = node.css('.e-image__image').first
- image_url = image.attr('data-original')
-
- begin
  if image_url.split('.').last == 'gif'
- image_id = random_string
+ id = unique_id
 
- article_body[:images][image_id] = {}
- article_body[:images][image_id][:url] = image_url
+ article_body[:images][id] = { }
+ article_body[:images][id][:url] = image_url
 
- article_body[:body] << node.replace("{{image:#{ image_id }}}").to_html
+ article_body[:body] << node.replace("{{image:#{ id }}}").to_html
  else
- image_alt = image.children.at('img').attr('alt')
- image_title = image.children.at('img').attr('title')
+ id = unique_id
+
+ figure(article_body, id, node, image, image_url)
+
+ article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
+ end
+
+ else
 
- image_meta = node.css('.e-image__meta')
+ node.children.each do |inner_node|
+ case inner_node.name
+ when 'a'
+ id = unique_id
+
+ article_body[:anchors][id] = {
+ text: inner_node.children.text,
+ url: inner_node.attr('href')
+ }
 
- figure_id = random_string
+ inner_node.replace("{{anchor:#{ id }}}")
+ when 'figure'
+ id = unique_id
 
- article_body[:figures][figure_id] = {}
+ image = node.at('.e-image__image')
+ image_url = image.attr('data-original')
 
- article_body[:figures][figure_id][:image] = image_url
- article_body[:figures][figure_id][:title] = image_title
- article_body[:figures][figure_id][:alt] = image_alt
+ figure(article_body, id, node, image, image_url)
 
- unless image_meta.empty?
- article_body[:figures][figure_id][:caption] = image_meta.first.at('figcaption')&.text
- article_body[:figures][figure_id][:cite] = image_meta.first.at('cite')&.text
+ node = node.replace("{{figure:#{ id }}}").to_html
+ article_body[:body] << node
  end
-
- article_body[:body] << node.replace("{{figure:#{ figure_id }}}").to_html
  end
- rescue
- raise 'Unknown format, please review.'
- end
- else
 
- node.children.each do |url|
  begin
- if url.name == 'a'
- url_id = random_string
-
- article_body[:anchors][url_id] = {
- text: url.children.text,
- url: url.attributes['href'].value
- }
 
- url.replace("{{anchor:#{ url_id }}}")
- end
+ # Remove all attributes
+ #
+ parsed_node = node.xpath('.//@*').remove
+
+ # Return clean HTML, including HTML elements and text
+ #
+ parsed_node = node.to_html
+
  rescue
- raise 'Unknown format, please review.'
+
  end
  end
 
- # Remove all attributes
- #
- parsed_node = node.xpath('.//@*').remove
-
- # Return clean HTML, including HTML elements and text
- #
- parsed_node = node.to_html
+ article_body[:body] << parsed_node unless parsed_node.nil?
  end
-
- article_body[:body] << parsed_node
  end
  end
 
  return article_body
  rescue => e
- "There was a problem while parsing the Article body for '#{ title }' => #{ e }"
+ "There was a problem while parsing this Article: #{ e }"
  end
 
+ def figure(article_body, id, node, image, image_url)
+ article_body[:figures][id] = { }
+
+ article_body[:figures][id][:image] = image_url
+ article_body[:figures][id][:title] = image.at('img').attr('title')
+ article_body[:figures][id][:alt] = image.at('img').attr('alt')
+
+ image_meta = node.at('.e-image__meta')
+
+ unless image_meta.nil?
+ article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
+ article_body[:figures][id][:cite] = strip(image_meta.at('cite'))
+ end
+ end
+
+ private
+
  def attr(attribute)
  attributes&.fetch(attribute, nil)&.value
  end
 
- def random_string
+ def strip(string)
+ string&.text&.strip
+ end
+
+ def unique_id
  (0...50).map { (65 + rand(25)).chr }.join.to_sym
  end
  end # News
  end # PolygonCOM
- end # GDNewsScraper::Scrapers
+ end # GDNewsScraper::Scrapers
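
For orientation after this refactor: parse_article_body still returns the same kind of hash; only the figure, strip and unique_id helpers were extracted. A rough, hypothetical sketch of that structure, based on the keys used in the hunks above (the IDs here are made-up placeholders, the real ones are 50-character random symbols from unique_id, and :body is assumed to be a collection of HTML fragments, which the visible hunks do not spell out):

    {
      galleries: { GALLERY_ID: ['https://example.com/gallery-image-1.jpg'] },
      videos:    { VIDEO_ID: { label: 'Some video', url: 'https://volume.vox-cdn.com/embed/some-uuid' } },
      images:    { IMAGE_ID: { url: 'https://example.com/animated.gif' } },
      figures:   { FIGURE_ID: { image: 'https://example.com/photo.jpg', title: 'Title', alt: 'Alt text',
                                caption: 'Caption text', cite: 'Photo credit' } },
      anchors:   { ANCHOR_ID: { text: 'link text', url: 'https://example.com/linked-page' } },
      body:      ['<p>Paragraph with an {{anchor:ANCHOR_ID}} inline.</p>', '{{figure:FIGURE_ID}}', '{{video:VIDEO_ID}}']
    }
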

lib/GDNewsScraper/string.rb ADDED
@@ -0,0 +1,5 @@
+ class String
+   def to_num
+     gsub(/\D/, '').to_i
+   end
+ end
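
The helper simply strips every non-digit before converting, so digits from separate runs are concatenated, which is presumably why News#initialize splits the pagination text first and only then calls to_num. A quick illustration (assuming the gem's lib directory is on the load path):

    require 'GDNewsScraper/string'

    '1,234'.to_num          # => 1234
    'page 3'.to_num         # => 3
    'page 3 of 618'.to_num  # => 3618, all digit runs are joined together
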
lib/GDNewsScraper/version.rb CHANGED
@@ -1,5 +1,5 @@
 module GDNewsScraper
- VERSION ||= '3.0.4'
+ VERSION ||= '3.0.6'
 
  # => major: A new Source has been added or removed
  # => minor: A Source code has changed drastically to a point where it's not
@@ -31,5 +31,9 @@ module GDNewsScraper
  # v3.0.3 - Added a new method which will refresh the content of an Article
  # v3.0.4 - Fixed an issue caused by Featured Articles which have a different
  #          DOM structure
- #
+ # v3.0.5 - Adds the possibility to parse an article from its URL rather than
+ #          having to go through the index page to get its metadata
+ # v3.0.6 - Small refactor of the code which also improved parsing speed by
+ #          about 10% on average! :)
+ #
 end
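
To make the v3.0.5 entry concrete: News#parse is a public method and, as of that release, accepts either a Nokogiri::XML::Element taken from the index page or a plain URL string, the latter being intended for debugging per the comment in the diff above. A hedged sketch, with a made-up article URL:

    require 'GDNewsScraper'

    scraper = GDNewsScraper::Scrapers::PolygonCOM::News.new   # nil offset, so nothing is fetched yet

    # Hypothetical URL, for illustration only
    pulse = scraper.parse('https://www.polygon.com/2017/11/30/some-article-slug')

    pulse[:title]    # headline text
    pulse[:content]  # hash with :galleries, :videos, :images, :figures, :anchors and :body
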
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: GDNewsScraper
 version: !ruby/object:Gem::Version
-  version: 3.0.4
+  version: 3.0.6
 platform: ruby
 authors:
 - Vlad Radulescu
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-11-28 00:00:00.000000000 Z
+date: 2017-11-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -74,6 +74,7 @@ files:
 - lib/GDNewsScraper.rb
 - lib/GDNewsScraper/scrapers/polygon_com/news.rb
 - lib/GDNewsScraper/scrapers/polygon_com/reviews.rb
+- lib/GDNewsScraper/string.rb
 - lib/GDNewsScraper/version.rb
 homepage: https://github.com/games-directory/scraper
 licenses: