GDNewsScraper 3.0.4 → 3.0.6

This diff shows the changes between publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 692d674f129613f9b5fa1b379abf86c588d7f3f1
-  data.tar.gz: 3653b5992703ae9e7027e75b07ef0ed7818313a9
+  metadata.gz: c37e3bbf8420be9d2bf182d091eb5b8713e46679
+  data.tar.gz: c20ce1f2ee2b57757bd0d9d87b3bdc34fd751cf7
 SHA512:
-  metadata.gz: 580d4967034bed31b74e80b72993fda91614f6b8bb91c2c3f924590448e6f1c22c490ee6292722528434b619ed9f8a8c0625eb24c8dcaa767c4176adebebe184
-  data.tar.gz: fe10ea48908f0e012a14a78de2b340bb6ac21956475d7efdb4235a6d3bf0fc7e39e7a8a812c9cbb0e4844242d3b8ae63cff358b989e4a4c2fa6811e19e0105c6
+  metadata.gz: 74639942afca8966f6602c642a89cb31979e896a4656c8ed6dbf20d5e8764a642ab62f6a05566265b75531f685c542d1ff0daaa8ce0f474e3b882355a1f9afc3
+  data.tar.gz: 1783aa15b5339131027ecf0aa0d2776d4959664157cee1305eca2acf206e43551f9079e4a60a322fe7a99f95f1513c302d8e13cf9529c95a97aa6e2476215b49
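
To verify a downloaded copy of the gem against the new checksums above, the two hashed files can be pulled out of the .gem archive (it is a plain tar) and re-digested. A minimal Ruby sketch, assuming GDNewsScraper-3.0.6.gem has already been unpacked into the current directory:

require 'digest'

# metadata.gz and data.tar.gz are the standard members of a .gem archive;
# the output should match the SHA512 entries in checksums.yaml above.
%w[metadata.gz data.tar.gz].each do |file|
  puts "#{ file }: #{ Digest::SHA512.file(file).hexdigest }"
end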
lib/GDNewsScraper/scrapers/polygon_com/news.rb CHANGED
@@ -7,49 +7,31 @@ module GDNewsScraper::Scrapers
       "User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
     }
 
-    STREAM_URI ||= 'https://www.polygon.com'
+    URL ||= 'https://www.polygon.com'
 
     WHITELIST ||= {
       default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'figure', 'blockquote', 'ul', 'ol'],
       inner: ['strong', 'em', 'li']
     }
 
-    DOM ||= {
-      article: {
-        wrapper: '.c-compact-river',
-        container: '.c-compact-river__entry',
-        inner_container: '.c-entry-box--compact',
-        inner_container_video: '.c-entry-box--compact--video',
-        title: '.c-entry-box--compact__title',
-        cover: '.c-entry-box--compact__image',
-        meta: '.c-byline'
-      },
-
-      pagination: {
-        previous: '.c-pagination__prev',
-        info: '.c-pagination__text',
-        next: '.c-pagination__next'
-      }
-    }
-
     class News
       attr_accessor :stream
 
-      def initialize(offset = 0)
+      def initialize(offset = nil)
        unless offset.nil?
-          uri = "#{ GDNewsScraper::Scrapers::PolygonCOM::STREAM_URI }/news/archives/#{ offset }"
+          uri = "#{ GDNewsScraper::Scrapers::PolygonCOM::URL }/news/archives/#{ offset }"
 
          @page = Nokogiri::HTML(open(uri, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
          @stream = Hash.new
 
          stream[:stream] = Hash.new
-          stream[:stream][:size] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:info]).text.split[0].gsub(/\D/, '').to_i
-          stream[:stream][:pages] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:info]).text.split[6].gsub(/\D/, '').to_i
-          stream[:stream][:prev] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:previous])&.first&.attr('href')&.split('/')&.last.to_i
-          stream[:stream][:next] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:next])&.first&.attr('href')&.split('/')&.last.to_i
+          stream[:stream][:size] = @page.at('.c-pagination__text').text.split.first.to_num
+          stream[:stream][:pages] = @page.at('.c-pagination__text').text.split.last.to_num
+          stream[:stream][:prev] = @page.at('.c-pagination__prev')&.attr('href')&.split('/')&.last.to_i
+          stream[:stream][:next] = @page.at('.c-pagination__next')&.attr('href')&.split('/')&.last.to_i
 
          stream[:feed] = Hash.new
-          stream[:feed][:url] = GDNewsScraper::Scrapers::PolygonCOM::STREAM_URI
+          stream[:feed][:url] = GDNewsScraper::Scrapers::PolygonCOM::URL
          stream[:feed][:source] = 'polygon'
          stream[:feed][:label] = 'Polygon'
 
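As a rough usage sketch of the rewritten constructor (the pagination numbers come from whatever Polygon serves at the time, so the values shown are illustrative only):

require 'GDNewsScraper'

scraper = GDNewsScraper::Scrapers::PolygonCOM::News.new(1)

# offset = nil skips the network call entirely; with an offset, the stream
# hash is populated from the archive page:
scraper.stream[:stream] # => { size: 35797, pages: 894, prev: 0, next: 2 } (illustrative)
scraper.stream[:feed]   # => { url: 'https://www.polygon.com', source: 'polygon', label: 'Polygon' }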
@@ -60,8 +42,8 @@ module GDNewsScraper::Scrapers
       end
 
       def perform
-        @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:container]).each do |article|
-          stream[:articles] << parse(article)
+        @page.css('.c-compact-river__entry').first(2).each do |article|
+          stream[:articles].push(parse(article))
        end
      end
 
@@ -72,38 +54,74 @@ module GDNewsScraper::Scrapers
       def parse(article)
        pulse = Hash.new
 
-        is_a_video = !article.at(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:inner_container_video]).nil?
-
-        key = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:inner_container]).first.attr('data-chorus-optimize-id').to_i
-        url = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:title]).children.first.attr('href')
-        title = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:title]).children.first.text
-
-        pulse[:id] = key
-        pulse[:hash] = ::Base64.encode64("#{ title } - #{ key }")
-
-        begin
-          pulse[:cover] = article.children.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:cover]).children.children.first.attr('src')
-        rescue
-          pulse[:cover] = nil
+        # This allows the Parser to get its data from the Index page, when the
+        # article is a Nokogiri::XML::Element, or from the Article page when
+        # the article is a URL.
+        #
+        # Passing a URL is mainly for debugging in case an Article fails to
+        # parse, and it should only be used as such.
+        #
+        if article.is_a?(String)
+          begin
+            article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
+
+            is_a_video = article_page.at('.c-video-embed').nil?
+
+            key = article_page.at('span[data-content-admin-id]').attr('data-content-admin-id').to_i
+            url = article
+            title = strip(article_page.css('.c-entry-hero').at('.c-page-title'))
+            cover = (is_a_video ? nil : article_page.css('.l-col__main').at('.e-image__image').attr('data-original'))
+            author = strip(article_page.css('.c-entry-hero').at('.c-byline').css('.c-byline__item > a').children[0])
+
+            begin
+              article_date = strip(article_page.css('.c-entry-hero').at('.c-byline').css('time.c-byline__item'))
+              parsed_date = DateTime.parse(article_date)
+
+              date = parsed_date.to_time.to_i
+
+            # This has never failed so far, so it is not entirely clear what
+            # to rescue from, but with dates it is always risky not to rescue.
+            #
+            # TODO: When it fails, find out why and rescue from that instead
+            #       of rescuing from 'everything'.
+            #
+            rescue
+              date = nil
+            end
+          rescue TypeError
+            raise ArgumentError.new('Invalid URL')
+          end
+        elsif article.is_a?(Nokogiri::XML::Element)
+          is_a_video = !article.at('.c-entry-box--compact--video').nil?
+
+          key = article.at('.c-entry-box--compact--article').attr('data-chorus-optimize-id').to_i
+          url = article.at('.c-entry-box--compact__title').at('> a').attr('href')
+          title = strip(article.at('.c-entry-box--compact__title'))
+          cover = (article.at('.c-entry-box--compact__image').at('noscript').at('img').attr('src') rescue nil)
+          author = strip(article.at('.c-byline').css('.c-byline__item > a').children[0])
+          date = JSON.parse(article.at('.c-byline').attr('data-cdata'))['timestamp'].to_i
+        else
+          raise ArgumentError.new("Make sure the 'article' argument is either a Nokogiri::XML::Element containing the article's initial metadata or a String which is the article's URL")
        end
-
+
+        pulse[:id] = key
+        pulse[:hash] = ::Base64.encode64("#{ title } - #{ key }")
+        pulse[:cover] = cover
        pulse[:url] = url
        pulse[:title] = title
-        pulse[:author] = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:meta]).first.children[1].children[1]&.text
-        pulse[:date] = JSON.parse(article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:meta]).first.attr('data-cdata'))['timestamp'].to_i
+        pulse[:author] = author
+        pulse[:date] = date
        pulse[:content] = parse_article_body(url, is_a_video)
        pulse[:tags] = title.downcase.split
 
        return pulse
      rescue => e
-        "There was a problem while parsing Article for '#{ title }' => #{ e }"
+        "There was a problem while parsing this Article: #{ e }"
      end
 
-      private
-
      def parse_article_body(article_url, is_a_video = false)
        article_page = Nokogiri::HTML(open(article_url, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
-        article_container = article_page.css('.c-entry-content')
+        article_container = article_page.at('.c-entry-content')
 
        article_body = {
          galleries: { },
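
The comment block in the hunk above describes the two inputs parse now accepts; a short sketch of the URL-based debugging path (the article URL is a placeholder):

scraper = GDNewsScraper::Scrapers::PolygonCOM::News.new(1)

# Normal path: perform feeds Nokogiri::XML::Element nodes from the index
# page into parse.
scraper.perform

# Debugging path: hand parse an article URL directly when an entry fails
# to parse from the index (placeholder URL).
scraper.parse('https://www.polygon.com/news/example-article')
# => a Hash with :id, :title, :content, etc., or an error String when
#    parsing failed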
@@ -118,7 +136,7 @@ module GDNewsScraper::Scrapers
 
        if is_a_video
          iframe = article_page.at('.c-video-embed--media').at('iframe')
-          iframe_id = random_string
+          iframe_id = unique_id
 
          article_body[:videos][iframe_id] = {}
          article_body[:videos][iframe_id][:url] = iframe.attr('src')
@@ -127,148 +145,189 @@ module GDNewsScraper::Scrapers
        end
 
        article_container.children.each do |node|
-          if node.name == 'div'
+          content = node.content.strip.empty?
+          text = node.text.strip.empty?
+          attributes = node.attributes.empty?
+          children = node.children.empty?
 
-            # Check to see if the div contains a embeded video
-            #
-            iframe = node.at('iframe')
+          if content && text && attributes && children
+            node.remove
+          else
+            if node.name == 'div'
 
-            if iframe # YouTube videos
-              iframe_id = random_string
+              # Check to see if the div contains an embedded video
+              #
+              iframe = node.at('iframe')
 
-              article_body[:videos][iframe_id] = {}
-              article_body[:videos][iframe_id][:url] = iframe.attr('src')
+              if iframe # YouTube videos
+                iframe_id = unique_id
 
-              article_body[:body] << iframe.replace("{{video:#{ iframe_id }}}").to_html
-            end
+                article_body[:videos][iframe_id] = {}
+                article_body[:videos][iframe_id][:url] = iframe.attr('src')
 
-            # Check to see if the div contains a gallery
-            #
-            gallery = node.at('.c-image-gallery')
+                article_body[:body] << iframe.replace("{{video:#{ iframe_id }}}").to_html
+              end
 
-            if gallery
-              gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')
+              # Check to see if the Article has a video by Polygon, which is
+              # embedded differently from a YouTube video.
+              #
+              polygon_video = node.attributes['data-volume-uuid']
+
+              unless polygon_video.nil?
+                id = unique_id
+
+                article_body[:videos][id] = {}
+                article_body[:videos][id][:label] = node.attr('data-analytics-label').split('|').first.strip
+                article_body[:videos][id][:url] = "https://volume.vox-cdn.com/embed/#{ node.attr('data-volume-uuid') }"
+
+                article_body[:body] << node.replace("{{video:#{ id }}}").to_html
+              end
+
+              # Check to see if the div contains a gallery
+              #
+              gallery = node.at('.c-image-gallery')
+
+              if gallery
+                gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')
 
-              gallery_id = random_string
-              article_body[:galleries][gallery_id] = []
+                gallery_id = unique_id
+                article_body[:galleries][gallery_id] = []
 
-              gallery_container.children.children.each do |image_container|
-                image = image_container.at('a')
+                gallery_container.children.children.each do |image_container|
+                  image = image_container.at('a')
 
-                if image
-                  article_body[:galleries][gallery_id] << image.attr('href')
+                  if image
+                    article_body[:galleries][gallery_id] << image.attr('href')
+                  end
                end
+
+                article_body[:body] << gallery.replace("{{gallery:#{ gallery_id }}}").to_html
              end
 
-              article_body[:body] << gallery.replace("{{gallery:#{ gallery_id }}}").to_html
-            end
+              twitdget = node.at('.twitter-tweet')
 
-            twitdget = node.at('.twitter-tweet')
+              if twitdget
+                article_body[:body] << twitdget.to_html
+              end
 
-            if twitdget
-              article_body[:body] << twitdget.to_html
+              redditget = node.at('.reddit-card')
+
+              if redditget
+                article_body[:body] << redditget.to_html
+              end
            end
 
-            redditget = node.at('.reddit-card')
+            # First ensure the node is an actual element. This removes random HTML elements
+            #
+            # => node.element?
+            #
+            # Secondly, ensure the node is what we actually want. We don't want <div>s,
+            # which are usually used for placing inline advertisements or content specific
+            # only to that website.
+            #
+            # => WHITELIST[:default].include?(node.name)
+            #
+            if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
+              case node.name
+              when 'figure'
 
-            if redditget
-              article_body[:body] << redditget.to_html
-            end
-          end
+                image = node.at('.e-image__image')
+                image_url = image.attr('data-original')
 
-          # First ensure the node is an actual element. This removes random HTML elements
-          #
-          # => node.element?
-          #
-          # Secondly, ensure the node is what we actual want. We don't want <div>'s
-          # which are usualy used for placing inline advertisments or content specific
-          # only to that website
-          #
-          # => WHITELIST[:default].include?(node.name)
-          #
-          if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
-            case node.name
-            when 'figure'
-
-              image = node.css('.e-image__image').first
-              image_url = image.attr('data-original')
-
-              begin
                if image_url.split('.').last == 'gif'
-                  image_id = random_string
+                  id = unique_id
 
-                  article_body[:images][image_id] = {}
-                  article_body[:images][image_id][:url] = image_url
+                  article_body[:images][id] = { }
+                  article_body[:images][id][:url] = image_url
 
-                  article_body[:body] << node.replace("{{image:#{ image_id }}}").to_html
+                  article_body[:body] << node.replace("{{image:#{ id }}}").to_html
                else
-                  image_alt = image.children.at('img').attr('alt')
-                  image_title = image.children.at('img').attr('title')
+                  id = unique_id
+
+                  figure(article_body, id, node, image, image_url)
+
+                  article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
+                end
+
+              else
 
-                  image_meta = node.css('.e-image__meta')
+                node.children.each do |inner_node|
+                  case inner_node.name
+                  when 'a'
+                    id = unique_id
+
+                    article_body[:anchors][id] = {
+                      text: inner_node.children.text,
+                      url: inner_node.attr('href')
+                    }
 
-                  figure_id = random_string
+                    inner_node.replace("{{anchor:#{ id }}}")
+                  when 'figure'
+                    id = unique_id
 
-                  article_body[:figures][figure_id] = {}
+                    image = node.at('.e-image__image')
+                    image_url = image.attr('data-original')
 
-                  article_body[:figures][figure_id][:image] = image_url
-                  article_body[:figures][figure_id][:title] = image_title
-                  article_body[:figures][figure_id][:alt] = image_alt
+                    figure(article_body, id, node, image, image_url)
 
-                  unless image_meta.empty?
-                    article_body[:figures][figure_id][:caption] = image_meta.first.at('figcaption')&.text
-                    article_body[:figures][figure_id][:cite] = image_meta.first.at('cite')&.text
+                    node = node.replace("{{figure:#{ id }}}").to_html
+                    article_body[:body] << node
                  end
-
-                  article_body[:body] << node.replace("{{figure:#{ figure_id }}}").to_html
                end
-              rescue
-                raise 'Unknown format, please review.'
-              end
-            else
 
-              node.children.each do |url|
                begin
-                  if url.name == 'a'
-                    url_id = random_string
-
-                    article_body[:anchors][url_id] = {
-                      text: url.children.text,
-                      url: url.attributes['href'].value
-                    }
 
-                    url.replace("{{anchor:#{ url_id }}}")
-                  end
+                  # Remove all attributes
+                  #
+                  parsed_node = node.xpath('.//@*').remove
+
+                  # Return clean HTML, including HTML elements and text
+                  #
+                  parsed_node = node.to_html
+
                rescue
-                  raise 'Unknown format, please review.'
+
                end
              end
 
-              # Remove all attributes
-              #
-              parsed_node = node.xpath('.//@*').remove
-
-              # Return clean HTML, including HTML elements and text
-              #
-              parsed_node = node.to_html
+              article_body[:body] << parsed_node unless parsed_node.nil?
            end
-
-            article_body[:body] << parsed_node
          end
        end
 
        return article_body
      rescue => e
-        "There was a problem while parsing the Article body for '#{ title }' => #{ e }"
+        "There was a problem while parsing this Article: #{ e }"
      end
 
+      def figure(article_body, id, node, image, image_url)
+        article_body[:figures][id] = { }
+
+        article_body[:figures][id][:image] = image_url
+        article_body[:figures][id][:title] = image.at('img').attr('title')
+        article_body[:figures][id][:alt] = image.at('img').attr('alt')
+
+        image_meta = node.at('.e-image__meta')
+
+        unless image_meta.nil?
+          article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
+          article_body[:figures][id][:cite] = strip(image_meta.at('cite'))
+        end
+      end
+
+      private
+
      def attr(attribute)
        attributes&.fetch(attribute, nil)&.value
      end
 
-      def random_string
+      def strip(string)
+        string&.text&.strip
+      end
+
+      def unique_id
        (0...50).map { (65 + rand(25)).chr }.join.to_sym
      end
    end # News
  end # PolygonCOM
-end # GDNewsScraper::Scrapers
+end # GDNewsScraper::Scrapers
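
Throughout parse_article_body, embeds are lifted out of the HTML and replaced with {{video:ID}}, {{gallery:ID}}, {{figure:ID}} and {{anchor:ID}} tokens whose data lands in the side tables of article_body. A minimal consumer-side sketch of re-hydrating those tokens (hydrate and the render_* helpers are hypothetical, not part of the gem):

def hydrate(article_body)
  article_body[:body].map do |fragment|
    # IDs generated by unique_id are Symbol keys, hence .to_sym here.
    fragment.gsub(/\{\{(\w+):(\w+)\}\}/) do
      kind = Regexp.last_match(1)
      id   = Regexp.last_match(2).to_sym

      case kind
      when 'figure'  then render_figure(article_body[:figures][id])
      when 'video'   then render_video(article_body[:videos][id])
      when 'gallery' then render_gallery(article_body[:galleries][id])
      when 'anchor'  then article_body[:anchors][id][:text]
      end
    end
  end.join
end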
lib/GDNewsScraper/string.rb ADDED
@@ -0,0 +1,5 @@
+class String
+  def to_num
+    gsub(/\D/, '').to_i
+  end
+end
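
The new String#to_num core extension backs the pagination parsing in news.rb; roughly:

require 'GDNewsScraper/string'

'35,797 stories'.split.first.to_num # => 35797
'894'.to_num                        # => 894
'no digits'.to_num                  # => 0 (gsub leaves '', and ''.to_i is 0)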
lib/GDNewsScraper/version.rb CHANGED
@@ -1,5 +1,5 @@
 module GDNewsScraper
-  VERSION ||= '3.0.4'
+  VERSION ||= '3.0.6'
 
  # => major: A new Source has been added or removed
  # => minor: A Source code has changed drastically to a point where it's not
@@ -31,5 +31,9 @@ module GDNewsScraper
  # v3.0.3 - Added a new method which will refresh the content of an Article
  # v3.0.4 - Fixed an issue caused by Featured Articles which have a different
  #          DOM structure
-  #
+  # v3.0.5 - Adds the possibility to parse an article from its URL rather than
+  #          having to go through the index page to get its metadata
+  # v3.0.6 - Small refactor of the code which also improved parsing speed by
+  #          about 10% on average! :)
+  #
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: GDNewsScraper
 version: !ruby/object:Gem::Version
-  version: 3.0.4
+  version: 3.0.6
 platform: ruby
 authors:
 - Vlad Radulescu
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-11-28 00:00:00.000000000 Z
+date: 2017-11-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -74,6 +74,7 @@ files:
 - lib/GDNewsScraper.rb
 - lib/GDNewsScraper/scrapers/polygon_com/news.rb
 - lib/GDNewsScraper/scrapers/polygon_com/reviews.rb
+- lib/GDNewsScraper/string.rb
 - lib/GDNewsScraper/version.rb
 homepage: https://github.com/games-directory/scraper
 licenses: