GDNewsScraper 2.0.2 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 165229c71b29b8ea97c5a08c981421798d7a8d53
4
- data.tar.gz: 86e96cc7035010913d220632bacbece653bb75df
3
+ metadata.gz: 300b4b0c79a91907c9031699c874cd2339dda2e7
4
+ data.tar.gz: d452da316c641f609d3cb83c5d88601d5abca271
5
5
  SHA512:
6
- metadata.gz: 931a89d894e137d571e27162f8c2e3cab9358e8ca8ea0de46569a68c7ce39dcbb4ef70d4bcc7f1c7c17d8d3e7f473c95929218f216b5aef42024b319a5c24fd0
7
- data.tar.gz: a26324d325f91bdeb60c57ae42b67a82ce4df74a86cc8ab30d5451219a3b0c50f000e7542af36b5d230220c7ebeb2b51d8e571a74e40a14cd34b057d8b64bdc6
6
+ metadata.gz: 7023abbec8ca015b9152737cd72385cf01aec37b2de5dc6e53b6d94ae5fd433e5c27940eaecbe37f1ceb4e6549312740a9d422fc6873144f8d877a2739d8469e
7
+ data.tar.gz: 72260107eab5febed2a934117869e815ff46cf721cc5bc7a830d4c78e754cb260aa0df173ac5349862bc44528e00c2d596b2a81ea13c734899863b32944991e9
@@ -4,27 +4,23 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
  require 'GDNewsScraper/version'
5
5
 
6
6
  Gem::Specification.new do |spec|
7
- spec.name = "GDNewsScraper"
7
+ spec.name = 'GDNewsScraper'
8
8
  spec.version = GDNewsScraper::VERSION
9
- spec.authors = ["Vlad Radulescu"]
10
- spec.email = ["pacMakaveli90@gmail.co.uk"]
9
+ spec.authors = ['Vlad Radulescu']
10
+ spec.email = ['pacMakaveli90@gmail.co.uk']
11
11
 
12
12
  spec.summary = %q{A Ruby web scraper for gaming News and Reviews}
13
13
  spec.description = %q{A Ruby Scraper created for games.directory to crawl the web for gaming News and Reviews.}
14
- spec.homepage = "https://github.com/games-directory/scraper"
15
- spec.license = "MIT"
14
+ spec.homepage = 'https://github.com/games-directory/scraper'
15
+ spec.license = 'MIT'
16
16
 
17
17
  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
- spec.bindir = "exe"
18
+ spec.bindir = 'exe'
19
19
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
- spec.require_paths = ["lib"]
20
+ spec.require_paths = ['lib']
21
21
 
22
- spec.add_dependency "nokogiri"
23
- spec.add_dependency "httparty"
24
- spec.add_dependency "activesupport"
22
+ spec.add_dependency 'nokogiri'
25
23
 
26
- spec.add_development_dependency "bundler", "~> 1.12"
27
- spec.add_development_dependency "rake", "~> 10.0"
28
- spec.add_development_dependency "rspec", "~> 3.0"
29
- spec.add_development_dependency "pry"
24
+ spec.add_development_dependency 'bundler', '~> 1.12'
25
+ spec.add_development_dependency 'rake', '~> 10.0'
30
26
  end
data/Gemfile CHANGED
@@ -1,10 +1,3 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
- gem 'nokogiri'
4
- gem 'httparty'
5
- gem 'activesupport'
6
- gem 'sanitize'
7
-
8
- gem 'pry'
9
-
10
3
  gemspec
@@ -1,72 +1,269 @@
1
- require 'active_support/hash_with_indifferent_access'
1
+ require 'pry'
2
+ require 'base64'
3
+ require 'json'
2
4
 
3
- module GDNewsScraper
4
- module Scrapers
5
- module PolygonCOM
5
+ module GDNewsScraper::Scrapers
6
+
7
+ HEADERS ||= {
8
+ "User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
9
+ }
6
10
 
7
- class News
8
- attr_accessor :page, :articles
11
+ STREAM_URI ||= 'https://www.polygon.com'
9
12
 
10
- def initialize(offset = 0)
11
- headers = { "User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36" }
12
- uri = "http://www.polygon.com/news/#{offset}"
13
+ WHITELIST ||= {
14
+ default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'figure', 'blockquote', 'ul', 'ol'],
15
+ inner: ['strong', 'em', 'li']
16
+ }
13
17
 
14
- @page ||= Nokogiri::HTML(HTTParty.get(uri, headers: headers))
15
- @articles = HashWithIndifferentAccess.new
18
+ DOM = {
19
+ article: {
20
+ wrapper: '.c-compact-river',
21
+ container: '.c-compact-river__entry',
22
+ inner_container: '.c-entry-box--compact',
23
+ inner_container_video: '.c-entry-box--compact--video',
24
+ title: '.c-entry-box--compact__title',
25
+ cover: '.c-entry-box--compact__image',
26
+ meta: '.c-byline'
27
+ },
16
28
 
17
- pagination = container.css('.pagination').children[3].children.text.split
29
+ pagination: {
30
+ previous: '.c-pagination__prev',
31
+ info: '.c-pagination__text',
32
+ next: '.c-pagination__next'
33
+ }
34
+ }
18
35
 
19
- articles[:stream_size] = pagination.last.to_i
20
- articles[:offset] = pagination[3].to_i
36
+ module PolygonCOM
37
+ class News
38
+ attr_accessor :stream
21
39
 
22
- articles[:feed] = HashWithIndifferentAccess.new
23
- articles[:feed][:source] = 'polygon'
24
- articles[:feed][:label] = 'Polygon'
40
+ def initialize(offset = 0)
41
+ uri = "#{ STREAM_URI }/news/archives/#{ offset }"
25
42
 
26
- articles[:stream] = get_all_news
43
+ @page ||= Nokogiri::HTML(open(uri, HEADERS))
44
+ @stream = Hash.new
45
+
46
+ stream[:stream] = Hash.new
47
+ stream[:stream][:size] = @page.css(DOM[:pagination][:info]).text.split[0].gsub(/\D/, '').to_i
48
+ stream[:stream][:pages] = @page.css(DOM[:pagination][:info]).text.split[6].gsub(/\D/, '').to_i
49
+ stream[:stream][:prev] = @page.css(DOM[:pagination][:previous])&.first&.attr('href')&.split('/')&.last.to_i
50
+ stream[:stream][:next] = @page.css(DOM[:pagination][:next])&.first&.attr('href')&.split('/')&.last.to_i
51
+
52
+ stream[:feed] = Hash.new
53
+ stream[:feed][:url] = STREAM_URI
54
+ stream[:feed][:source] = 'polygon'
55
+ stream[:feed][:label] = 'Polygon'
56
+
57
+ stream[:articles] = Array.new
58
+
59
+ perform
60
+ rescue
61
+ return 'There was a problem initializing the PolygonCOM::News Service'
62
+ end
63
+
64
+ def perform
65
+ @page.css(DOM[:article][:container]).each do |article|
66
+ stream[:articles] << parse(article)
67
+ end
68
+ rescue
69
+ return 'There was a problem performing the initial task in the PolygonCOM::News Service'
70
+ end
71
+
72
+ def parse(article)
73
+ pulse = Hash.new
74
+
75
+ is_a_video = !article.at(DOM[:article][:inner_container_video]).nil?
76
+
77
+ key = article.css(DOM[:article][:inner_container]).first.attr('data-chorus-optimize-id').to_i
78
+ url = article.css(DOM[:article][:title]).children.first.attr('href')
79
+ title = article.css(DOM[:article][:title]).children.first.text
80
+
81
+ pulse[:id] = key
82
+ pulse[:hash] = ::Base64.encode64("#{ title } - #{ key }")
83
+
84
+ begin
85
+ pulse[:cover] = article.children.css(DOM[:article][:cover]).children.children.first.attr('src')
86
+ rescue
87
+ pulse[:cover] = nil
88
+ end
89
+
90
+ pulse[:url] = url
91
+ pulse[:title] = title
92
+ pulse[:author] = article.css(DOM[:article][:meta]).first.children[1].children[1].text
93
+ pulse[:date] = JSON.parse(article.css(DOM[:article][:meta]).first.attr('data-cdata'))['timestamp'].to_i
94
+ pulse[:content] = parse_article_body(url, is_a_video)
95
+ pulse[:tags] = title.downcase.split
96
+
97
+
98
+ return pulse
99
+ rescue
100
+ return 'There was a problem creating the article in the PolygonCOM::News Service'
101
+ end
102
+
103
+ private
104
+
105
+ def parse_article_body(article_url, is_a_video = false)
106
+ article_page = Nokogiri::HTML(open(article_url, HEADERS))
107
+ article_container = article_page.css('.c-entry-content')
108
+
109
+ article_body = {
110
+ galleries: { },
111
+ videos: { },
112
+ images: { },
113
+
114
+ anchors: { },
115
+ figures: { },
116
+
117
+ body: [ ]
118
+ }
119
+
120
+ if is_a_video
121
+ iframe = article_page.at('.c-video-embed--media').at('iframe')
122
+ iframe_id = random_string
123
+
124
+ article_body[:videos][iframe_id] = {}
125
+ article_body[:videos][iframe_id][:url] = iframe.attr('src')
126
+
127
+ article_body[:body] << iframe.replace("[video]#{ iframe_id }").to_html
27
128
  end
28
129
 
29
- def get_all_news
30
- news = HashWithIndifferentAccess.new
31
-
32
- container.css('.m-block').each do |article|
33
- body = article.css('.pinned_wrapper').css('.m-block__body')
34
- article_id = article.attributes['data-entry-id'].value.to_i
35
-
36
- news[article_id] = HashWithIndifferentAccess.new
37
- news[article_id][:id] = article_id
38
- news[article_id][:hash] = Base64.encode64(body.children[3].children[1].children.children.text)
39
- news[article_id][:url] = body.children[1].attributes['href'].value
40
- news[article_id][:title] = body.children[3].children[1].children.children.text
41
- news[article_id][:author] = body.children[3].children[3].children[1].children.text
42
- news[article_id][:content] = body.children[3].css('.copy').text
43
- news[article_id][:tags] = body.children[3].children[1].children.children.text.split('/').last.scan(/[[:alpha:]]{4,}/).uniq
44
- news[article_id][:date] = body.children[3].children[3].css('.long_date').children.text.strip
45
-
46
- begin
47
- cover = body.children[1].attributes['data-original']
48
-
49
- if cover.nil?
50
- news[article_id][:cover] = "https://cdn#{body.children[1].children[1].attributes['data-original'].value.split('/cdn').last}"
51
- else
52
- news[article_id][:cover] = "https://cdn#{body.children[1].attributes['data-original'].value.split('/cdn').last}"
130
+ article_container.children.each do |node|
131
+ if node.name == 'div'
132
+
133
+ # Check to see if the div contains an embedded video
134
+ #
135
+ iframe = node.at('iframe')
136
+
137
+ if iframe # YouTube videos
138
+ iframe_id = random_string
139
+
140
+ article_body[:videos][iframe_id] = {}
141
+ article_body[:videos][iframe_id][:url] = iframe.attr('src')
142
+
143
+ article_body[:body] << iframe.replace("[video]#{ iframe_id }").to_html
144
+ end
145
+
146
+ # Check to see if the div contains a gallery
147
+ #
148
+ gallery = node.at('.c-image-gallery')
149
+
150
+ if gallery
151
+ gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')
152
+
153
+ gallery_id = random_string
154
+ article_body[:galleries][gallery_id] = []
155
+
156
+ gallery_container.children.children.each do |image_container|
157
+ image = image_container.at('a')
158
+
159
+ if image
160
+ article_body[:galleries][gallery_id] << image.attr('href')
161
+ end
53
162
  end
54
- rescue
55
- news[article_id][:cover] = nil
163
+
164
+ article_body[:body] << gallery.replace("[gallery]#{ gallery_id }").to_html
165
+ end
166
+
167
+ twitdget = node.at('.twitter-tweet')
168
+
169
+ if twitdget
170
+ article_body[:body] << twitdget.to_html
56
171
  end
57
172
  end
58
173
 
59
- return news
60
- end
174
+ # First ensure the node is an actual element. This removes random HTML elements
175
+ #
176
+ # => node.element?
177
+ #
178
+ # Secondly, ensure the node is what we actually want. We don't want <div>'s
179
+ # which are usually used for placing inline advertisements or content specific
180
+ # only to that website
181
+ #
182
+ # => WHITELIST[:default].include?(node.name)
183
+ #
184
+ if node.element? && WHITELIST[:default].include?(node.name)
185
+ case node.name
186
+ when 'figure'
187
+
188
+ image = node.css('.e-image__image').first
189
+ image_url = image.attr('data-original')
190
+
191
+ begin
192
+ if image_url.split('.').last == 'gif'
193
+ image_id = random_string
194
+
195
+ article_body[:images][image_id] = {}
196
+ article_body[:images][image_id][:url] = image_url
197
+
198
+ article_body[:body] << node.replace("[image]#{ image_id }").to_html
199
+ else
200
+ image_alt = image.children.at('img').attr('alt')
201
+ image_title = image.children.at('img').attr('title')
61
202
 
62
- private
203
+ image_meta = node.css('.e-image__meta')
63
204
 
64
- def container
65
- page.css('.m-grouptown')
205
+ figure_id = random_string
206
+
207
+ article_body[:figures][figure_id] = {}
208
+
209
+ article_body[:figures][figure_id][:image] = image_url
210
+ article_body[:figures][figure_id][:title] = image_title
211
+ article_body[:figures][figure_id][:alt] = image_alt
212
+
213
+ unless image_meta.empty?
214
+ article_body[:figures][figure_id][:caption] = image_meta.first.at('figcaption')&.text
215
+ article_body[:figures][figure_id][:cite] = image_meta.first.at('cite')&.text
216
+ end
217
+
218
+ article_body[:body] << node.replace("[figure]#{ figure_id }").to_html
219
+ end
220
+ rescue
221
+ raise 'Unknown format, please review.'
222
+ end
223
+ else
224
+
225
+ node.children.each do |url|
226
+ begin
227
+ if url.name == 'a'
228
+ url_id = random_string
229
+
230
+ article_body[:anchors][url_id.to_sym] = {
231
+ text: url.children.text,
232
+ url: url.attributes['href'].value
233
+ }
234
+
235
+ url.replace("[anchor]#{ url_id }")
236
+ end
237
+ rescue
238
+ raise 'Unknown format, please review.'
239
+ end
240
+ end
241
+
242
+ # Remove all attributes
243
+ #
244
+ parsed_node = node.xpath('.//@*').remove
245
+
246
+ # Return clean HTML, including HTML elements and text
247
+ #
248
+ parsed_node = node.to_html
249
+ end
250
+
251
+ article_body[:body] << parsed_node
252
+ end
66
253
  end
67
254
 
68
- end # News
255
+ return article_body
256
+ rescue
257
+ return 'There was a problem parsing the article body in the PolygonCOM::News Service'
258
+ end
259
+
260
+ def attr(attribute)
261
+ attributes&.fetch(attribute, nil)&.value
262
+ end
69
263
 
70
- end # PolygonCOM
71
- end # Scrapers
72
- end # GDNewsScraper
264
+ def random_string
265
+ (0...50).map { (65 + rand(25)).chr }.join
266
+ end
267
+ end # News
268
+ end # PolygonCOM
269
+ end # GDNewsScraper::Scrapers
@@ -1,5 +1,3 @@
1
- require 'active_support/hash_with_indifferent_access'
2
-
3
1
  module GDNewsScraper
4
2
  module Scrapers
5
3
  module PolygonCOM
@@ -12,7 +10,7 @@ module GDNewsScraper
12
10
  uri = "https://www.polygon.com/games/reviewed/#{offset}"
13
11
 
14
12
  @page ||= Nokogiri::HTML(HTTParty.get(uri, headers: headers))
15
- @articles = HashWithIndifferentAccess.new
13
+ @articles = Hash.new
16
14
 
17
15
  pagination = container.css('.pagination').children[3].children.text.split
18
16
 
@@ -20,7 +18,7 @@ module GDNewsScraper
20
18
  articles[:offset] = pagination[3].to_i
21
19
  articles[:total] = get_all_reviews.size
22
20
 
23
- articles[:feed] = HashWithIndifferentAccess.new
21
+ articles[:feed] = Hash.new
24
22
  articles[:feed][:source] = 'polygon'
25
23
  articles[:feed][:label] = 'Polygon'
26
24
 
@@ -28,7 +26,7 @@ module GDNewsScraper
28
26
  end
29
27
 
30
28
  def get_all_reviews
31
- reviews = HashWithIndifferentAccess.new
29
+ reviews = Hash.new
32
30
 
33
31
  page.css('.m-game--index__list').children.each do |review|
34
32
 
@@ -1,6 +1,12 @@
1
1
  module GDNewsScraper
2
- VERSION = "2.0.2"
2
+ VERSION ||= '3.0.0'
3
3
 
4
+ # => major: A new Source has been added or removed
5
+ # => minor: A Source code has changed drastically to a point where it's not
6
+ # backwards compatible anymore
7
+ # => patch: Small addition to an existing Source. The new code shouldn't break
8
+ # any existing implementation. If it does, it needs backwards compatibility
9
+ #
4
10
  # CHANGELOG
5
11
  #
6
12
  # v1.0.0 - Initial Gem Setup
@@ -13,5 +19,10 @@ module GDNewsScraper
13
19
  # would cause the script to fail when requesting the photo for the
14
20
  # Article
15
21
  # v2.0.2 - Fix minor bug in PolygonCOM News scraper
16
-
22
+ # v2.1.0 - Updated PolygonCOM to reflect their new UI changes
23
+ # v2.1.1 - Fixed PolygonCOM Scraper to include videos if there are any
24
+ # v3.0.0 - Major overhaul to how an Article is parsed and returned to the User
25
+ # In a 'Wordpress' kind of style, various html elements are now
26
+ # returned in a way that an app can recognize and translate it into a
27
+ # widget
17
28
  end
data/lib/GDNewsScraper.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  require 'GDNewsScraper/version'
2
- require 'httparty'
3
2
  require 'nokogiri'
3
+ require 'open-uri'
4
4
 
5
5
  module GDNewsScraper
6
6
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: GDNewsScraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.2
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vlad Radulescu
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-06-22 00:00:00.000000000 Z
11
+ date: 2017-11-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,34 +24,6 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
- - !ruby/object:Gem::Dependency
28
- name: httparty
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: activesupport
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :runtime
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
27
  - !ruby/object:Gem::Dependency
56
28
  name: bundler
57
29
  requirement: !ruby/object:Gem::Requirement
@@ -80,34 +52,6 @@ dependencies:
80
52
  - - "~>"
81
53
  - !ruby/object:Gem::Version
82
54
  version: '10.0'
83
- - !ruby/object:Gem::Dependency
84
- name: rspec
85
- requirement: !ruby/object:Gem::Requirement
86
- requirements:
87
- - - "~>"
88
- - !ruby/object:Gem::Version
89
- version: '3.0'
90
- type: :development
91
- prerelease: false
92
- version_requirements: !ruby/object:Gem::Requirement
93
- requirements:
94
- - - "~>"
95
- - !ruby/object:Gem::Version
96
- version: '3.0'
97
- - !ruby/object:Gem::Dependency
98
- name: pry
99
- requirement: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - ">="
102
- - !ruby/object:Gem::Version
103
- version: '0'
104
- type: :development
105
- prerelease: false
106
- version_requirements: !ruby/object:Gem::Requirement
107
- requirements:
108
- - - ">="
109
- - !ruby/object:Gem::Version
110
- version: '0'
111
55
  description: A Ruby Scraper created for games.directory to crawl the web for gaming
112
56
  News and Reviews.
113
57
  email:
@@ -151,8 +95,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
151
95
  version: '0'
152
96
  requirements: []
153
97
  rubyforge_project:
154
- rubygems_version: 2.5.1
98
+ rubygems_version: 2.6.12
155
99
  signing_key:
156
100
  specification_version: 4
157
101
  summary: A Ruby web scraper for gaming News and Reviews
158
102
  test_files: []
103
+ has_rdoc: