GDNewsScraper 2.0.2 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 165229c71b29b8ea97c5a08c981421798d7a8d53
4
- data.tar.gz: 86e96cc7035010913d220632bacbece653bb75df
3
+ metadata.gz: 300b4b0c79a91907c9031699c874cd2339dda2e7
4
+ data.tar.gz: d452da316c641f609d3cb83c5d88601d5abca271
5
5
  SHA512:
6
- metadata.gz: 931a89d894e137d571e27162f8c2e3cab9358e8ca8ea0de46569a68c7ce39dcbb4ef70d4bcc7f1c7c17d8d3e7f473c95929218f216b5aef42024b319a5c24fd0
7
- data.tar.gz: a26324d325f91bdeb60c57ae42b67a82ce4df74a86cc8ab30d5451219a3b0c50f000e7542af36b5d230220c7ebeb2b51d8e571a74e40a14cd34b057d8b64bdc6
6
+ metadata.gz: 7023abbec8ca015b9152737cd72385cf01aec37b2de5dc6e53b6d94ae5fd433e5c27940eaecbe37f1ceb4e6549312740a9d422fc6873144f8d877a2739d8469e
7
+ data.tar.gz: 72260107eab5febed2a934117869e815ff46cf721cc5bc7a830d4c78e754cb260aa0df173ac5349862bc44528e00c2d596b2a81ea13c734899863b32944991e9
@@ -4,27 +4,23 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
  require 'GDNewsScraper/version'
5
5
 
6
6
  Gem::Specification.new do |spec|
7
- spec.name = "GDNewsScraper"
7
+ spec.name = 'GDNewsScraper'
8
8
  spec.version = GDNewsScraper::VERSION
9
- spec.authors = ["Vlad Radulescu"]
10
- spec.email = ["pacMakaveli90@gmail.co.uk"]
9
+ spec.authors = ['Vlad Radulescu']
10
+ spec.email = ['pacMakaveli90@gmail.co.uk']
11
11
 
12
12
  spec.summary = %q{A Ruby web scraper for gaming News and Reviews}
13
13
  spec.description = %q{A Ruby Scraper created for games.directory to crawl the web for gaming News and Reviews.}
14
- spec.homepage = "https://github.com/games-directory/scraper"
15
- spec.license = "MIT"
14
+ spec.homepage = 'https://github.com/games-directory/scraper'
15
+ spec.license = 'MIT'
16
16
 
17
17
  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
- spec.bindir = "exe"
18
+ spec.bindir = 'exe'
19
19
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
- spec.require_paths = ["lib"]
20
+ spec.require_paths = ['lib']
21
21
 
22
- spec.add_dependency "nokogiri"
23
- spec.add_dependency "httparty"
24
- spec.add_dependency "activesupport"
22
+ spec.add_dependency 'nokogiri'
25
23
 
26
- spec.add_development_dependency "bundler", "~> 1.12"
27
- spec.add_development_dependency "rake", "~> 10.0"
28
- spec.add_development_dependency "rspec", "~> 3.0"
29
- spec.add_development_dependency "pry"
24
+ spec.add_development_dependency 'bundler', '~> 1.12'
25
+ spec.add_development_dependency 'rake', '~> 10.0'
30
26
  end
data/Gemfile CHANGED
@@ -1,10 +1,3 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
- gem 'nokogiri'
4
- gem 'httparty'
5
- gem 'activesupport'
6
- gem 'sanitize'
7
-
8
- gem 'pry'
9
-
10
3
  gemspec
@@ -1,72 +1,269 @@
1
- require 'active_support/hash_with_indifferent_access'
1
+ require 'pry'
2
+ require 'base64'
3
+ require 'json'
2
4
 
3
- module GDNewsScraper
4
- module Scrapers
5
- module PolygonCOM
5
+ module GDNewsScraper::Scrapers
6
+
7
+ HEADERS ||= {
8
+ "User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
9
+ }
6
10
 
7
- class News
8
- attr_accessor :page, :articles
11
+ STREAM_URI ||= 'https://www.polygon.com'
9
12
 
10
- def initialize(offset = 0)
11
- headers = { "User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36" }
12
- uri = "http://www.polygon.com/news/#{offset}"
13
+ WHITELIST ||= {
14
+ default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'figure', 'blockquote', 'ul', 'ol'],
15
+ inner: ['strong', 'em', 'li']
16
+ }
13
17
 
14
- @page ||= Nokogiri::HTML(HTTParty.get(uri, headers: headers))
15
- @articles = HashWithIndifferentAccess.new
18
+ DOM = {
19
+ article: {
20
+ wrapper: '.c-compact-river',
21
+ container: '.c-compact-river__entry',
22
+ inner_container: '.c-entry-box--compact',
23
+ inner_container_video: '.c-entry-box--compact--video',
24
+ title: '.c-entry-box--compact__title',
25
+ cover: '.c-entry-box--compact__image',
26
+ meta: '.c-byline'
27
+ },
16
28
 
17
- pagination = container.css('.pagination').children[3].children.text.split
29
+ pagination: {
30
+ previous: '.c-pagination__prev',
31
+ info: '.c-pagination__text',
32
+ next: '.c-pagination__next'
33
+ }
34
+ }
18
35
 
19
- articles[:stream_size] = pagination.last.to_i
20
- articles[:offset] = pagination[3].to_i
36
+ module PolygonCOM
37
+ class News
38
+ attr_accessor :stream
21
39
 
22
- articles[:feed] = HashWithIndifferentAccess.new
23
- articles[:feed][:source] = 'polygon'
24
- articles[:feed][:label] = 'Polygon'
40
+ def initialize(offset = 0)
41
+ uri = "#{ STREAM_URI }/news/archives/#{ offset }"
25
42
 
26
- articles[:stream] = get_all_news
43
+ @page ||= Nokogiri::HTML(open(uri, HEADERS))
44
+ @stream = Hash.new
45
+
46
+ stream[:stream] = Hash.new
47
+ stream[:stream][:size] = @page.css(DOM[:pagination][:info]).text.split[0].gsub(/\D/, '').to_i
48
+ stream[:stream][:pages] = @page.css(DOM[:pagination][:info]).text.split[6].gsub(/\D/, '').to_i
49
+ stream[:stream][:prev] = @page.css(DOM[:pagination][:previous])&.first&.attr('href')&.split('/')&.last.to_i
50
+ stream[:stream][:next] = @page.css(DOM[:pagination][:next])&.first&.attr('href')&.split('/')&.last.to_i
51
+
52
+ stream[:feed] = Hash.new
53
+ stream[:feed][:url] = STREAM_URI
54
+ stream[:feed][:source] = 'polygon'
55
+ stream[:feed][:label] = 'Polygon'
56
+
57
+ stream[:articles] = Array.new
58
+
59
+ perform
60
+ rescue
61
+ return 'There was a problem initializing the PolygonCOM::News Service'
62
+ end
63
+
64
+ def perform
65
+ @page.css(DOM[:article][:container]).each do |article|
66
+ stream[:articles] << parse(article)
67
+ end
68
+ rescue
69
+ return 'There was a problem performing the initial task in the PolygonCOM::News Service'
70
+ end
71
+
72
+ def parse(article)
73
+ pulse = Hash.new
74
+
75
+ is_a_video = !article.at(DOM[:article][:inner_container_video]).nil?
76
+
77
+ key = article.css(DOM[:article][:inner_container]).first.attr('data-chorus-optimize-id').to_i
78
+ url = article.css(DOM[:article][:title]).children.first.attr('href')
79
+ title = article.css(DOM[:article][:title]).children.first.text
80
+
81
+ pulse[:id] = key
82
+ pulse[:hash] = ::Base64.encode64("#{ title } - #{ key }")
83
+
84
+ begin
85
+ pulse[:cover] = article.children.css(DOM[:article][:cover]).children.children.first.attr('src')
86
+ rescue
87
+ pulse[:cover] = nil
88
+ end
89
+
90
+ pulse[:url] = url
91
+ pulse[:title] = title
92
+ pulse[:author] = article.css(DOM[:article][:meta]).first.children[1].children[1].text
93
+ pulse[:date] = JSON.parse(article.css(DOM[:article][:meta]).first.attr('data-cdata'))['timestamp'].to_i
94
+ pulse[:content] = parse_article_body(url, is_a_video)
95
+ pulse[:tags] = title.downcase.split
96
+
97
+
98
+ return pulse
99
+ rescue
100
+ return 'There was a problem creating the article in the PolygonCOM::News Service'
101
+ end
102
+
103
+ private
104
+
105
+ def parse_article_body(article_url, is_a_video = false)
106
+ article_page = Nokogiri::HTML(open(article_url, HEADERS))
107
+ article_container = article_page.css('.c-entry-content')
108
+
109
+ article_body = {
110
+ galleries: { },
111
+ videos: { },
112
+ images: { },
113
+
114
+ anchors: { },
115
+ figures: { },
116
+
117
+ body: [ ]
118
+ }
119
+
120
+ if is_a_video
121
+ iframe = article_page.at('.c-video-embed--media').at('iframe')
122
+ iframe_id = random_string
123
+
124
+ article_body[:videos][iframe_id] = {}
125
+ article_body[:videos][iframe_id][:url] = iframe.attr('src')
126
+
127
+ article_body[:body] << iframe.replace("[video]#{ iframe_id }").to_html
27
128
  end
28
129
 
29
- def get_all_news
30
- news = HashWithIndifferentAccess.new
31
-
32
- container.css('.m-block').each do |article|
33
- body = article.css('.pinned_wrapper').css('.m-block__body')
34
- article_id = article.attributes['data-entry-id'].value.to_i
35
-
36
- news[article_id] = HashWithIndifferentAccess.new
37
- news[article_id][:id] = article_id
38
- news[article_id][:hash] = Base64.encode64(body.children[3].children[1].children.children.text)
39
- news[article_id][:url] = body.children[1].attributes['href'].value
40
- news[article_id][:title] = body.children[3].children[1].children.children.text
41
- news[article_id][:author] = body.children[3].children[3].children[1].children.text
42
- news[article_id][:content] = body.children[3].css('.copy').text
43
- news[article_id][:tags] = body.children[3].children[1].children.children.text.split('/').last.scan(/[[:alpha:]]{4,}/).uniq
44
- news[article_id][:date] = body.children[3].children[3].css('.long_date').children.text.strip
45
-
46
- begin
47
- cover = body.children[1].attributes['data-original']
48
-
49
- if cover.nil?
50
- news[article_id][:cover] = "https://cdn#{body.children[1].children[1].attributes['data-original'].value.split('/cdn').last}"
51
- else
52
- news[article_id][:cover] = "https://cdn#{body.children[1].attributes['data-original'].value.split('/cdn').last}"
130
+ article_container.children.each do |node|
131
+ if node.name == 'div'
132
+
133
+ # Check to see if the div contains an embedded video
134
+ #
135
+ iframe = node.at('iframe')
136
+
137
+ if iframe # YouTube videos
138
+ iframe_id = random_string
139
+
140
+ article_body[:videos][iframe_id] = {}
141
+ article_body[:videos][iframe_id][:url] = iframe.attr('src')
142
+
143
+ article_body[:body] << iframe.replace("[video]#{ iframe_id }").to_html
144
+ end
145
+
146
+ # Check to see if the div contains a gallery
147
+ #
148
+ gallery = node.at('.c-image-gallery')
149
+
150
+ if gallery
151
+ gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')
152
+
153
+ gallery_id = random_string
154
+ article_body[:galleries][gallery_id] = []
155
+
156
+ gallery_container.children.children.each do |image_container|
157
+ image = image_container.at('a')
158
+
159
+ if image
160
+ article_body[:galleries][gallery_id] << image.attr('href')
161
+ end
53
162
  end
54
- rescue
55
- news[article_id][:cover] = nil
163
+
164
+ article_body[:body] << gallery.replace("[gallery]#{ gallery_id }").to_html
165
+ end
166
+
167
+ twitdget = node.at('.twitter-tweet')
168
+
169
+ if twitdget
170
+ article_body[:body] << twitdget.to_html
56
171
  end
57
172
  end
58
173
 
59
- return news
60
- end
174
+ # First ensure the node is an actual element. This removes random HTML elements
175
+ #
176
+ # => node.element?
177
+ #
178
+ # Secondly, ensure the node is what we actually want. We don't want <div>'s
179
+ # which are usually used for placing inline advertisements or content specific
180
+ # only to that website
181
+ #
182
+ # => WHITELIST[:default].include?(node.name)
183
+ #
184
+ if node.element? && WHITELIST[:default].include?(node.name)
185
+ case node.name
186
+ when 'figure'
187
+
188
+ image = node.css('.e-image__image').first
189
+ image_url = image.attr('data-original')
190
+
191
+ begin
192
+ if image_url.split('.').last == 'gif'
193
+ image_id = random_string
194
+
195
+ article_body[:images][image_id] = {}
196
+ article_body[:images][image_id][:url] = image_url
197
+
198
+ article_body[:body] << node.replace("[image]#{ image_id }").to_html
199
+ else
200
+ image_alt = image.children.at('img').attr('alt')
201
+ image_title = image.children.at('img').attr('title')
61
202
 
62
- private
203
+ image_meta = node.css('.e-image__meta')
63
204
 
64
- def container
65
- page.css('.m-grouptown')
205
+ figure_id = random_string
206
+
207
+ article_body[:figures][figure_id] = {}
208
+
209
+ article_body[:figures][figure_id][:image] = image_url
210
+ article_body[:figures][figure_id][:title] = image_title
211
+ article_body[:figures][figure_id][:alt] = image_alt
212
+
213
+ unless image_meta.empty?
214
+ article_body[:figures][figure_id][:caption] = image_meta.first.at('figcaption')&.text
215
+ article_body[:figures][figure_id][:cite] = image_meta.first.at('cite')&.text
216
+ end
217
+
218
+ article_body[:body] << node.replace("[figure]#{ figure_id }").to_html
219
+ end
220
+ rescue
221
+ raise 'Unknown format, please review.'
222
+ end
223
+ else
224
+
225
+ node.children.each do |url|
226
+ begin
227
+ if url.name == 'a'
228
+ url_id = random_string
229
+
230
+ article_body[:anchors][url_id.to_sym] = {
231
+ text: url.children.text,
232
+ url: url.attributes['href'].value
233
+ }
234
+
235
+ url.replace("[anchor]#{ url_id }")
236
+ end
237
+ rescue
238
+ raise 'Unknown format, please review.'
239
+ end
240
+ end
241
+
242
+ # Remove all attributes
243
+ #
244
+ parsed_node = node.xpath('.//@*').remove
245
+
246
+ # Return clean HTML, including HTML elements and text
247
+ #
248
+ parsed_node = node.to_html
249
+ end
250
+
251
+ article_body[:body] << parsed_node
252
+ end
66
253
  end
67
254
 
68
- end # News
255
+ return article_body
256
+ rescue
257
+ return 'There was a problem parsing the article body in the PolygonCOM::News Service'
258
+ end
259
+
260
+ def attr(attribute)
261
+ attributes&.fetch(attribute, nil)&.value
262
+ end
69
263
 
70
- end # PolygonCOM
71
- end # Scrapers
72
- end # GDNewsScraper
264
+ def random_string
265
+ (0...50).map { (65 + rand(25)).chr }.join
266
+ end
267
+ end # News
268
+ end # PolygonCOM
269
+ end # GDNewsScraper::Scrapers
@@ -1,5 +1,3 @@
1
- require 'active_support/hash_with_indifferent_access'
2
-
3
1
  module GDNewsScraper
4
2
  module Scrapers
5
3
  module PolygonCOM
@@ -12,7 +10,7 @@ module GDNewsScraper
12
10
  uri = "https://www.polygon.com/games/reviewed/#{offset}"
13
11
 
14
12
  @page ||= Nokogiri::HTML(HTTParty.get(uri, headers: headers))
15
- @articles = HashWithIndifferentAccess.new
13
+ @articles = Hash.new
16
14
 
17
15
  pagination = container.css('.pagination').children[3].children.text.split
18
16
 
@@ -20,7 +18,7 @@ module GDNewsScraper
20
18
  articles[:offset] = pagination[3].to_i
21
19
  articles[:total] = get_all_reviews.size
22
20
 
23
- articles[:feed] = HashWithIndifferentAccess.new
21
+ articles[:feed] = Hash.new
24
22
  articles[:feed][:source] = 'polygon'
25
23
  articles[:feed][:label] = 'Polygon'
26
24
 
@@ -28,7 +26,7 @@ module GDNewsScraper
28
26
  end
29
27
 
30
28
  def get_all_reviews
31
- reviews = HashWithIndifferentAccess.new
29
+ reviews = Hash.new
32
30
 
33
31
  page.css('.m-game--index__list').children.each do |review|
34
32
 
@@ -1,6 +1,12 @@
1
1
  module GDNewsScraper
2
- VERSION = "2.0.2"
2
+ VERSION ||= '3.0.0'
3
3
 
4
+ # => major: A new Source has been added or removed
5
+ # => minor: A Source code has changed drastically to a point where it's not
6
+ # backwards compatible anymore
7
+ # => patch: Small addition to an existing Source. The new code shouldn't break
8
+ # any existing implementation. If it does, it needs backwards compatibility
9
+ #
4
10
  # CHANGELOG
5
11
  #
6
12
  # v1.0.0 - Initial Gem Setup
@@ -13,5 +19,10 @@ module GDNewsScraper
13
19
  # would cause the script to fail when requesting the photo for the
14
20
  # Article
15
21
  # v2.0.2 - Fix minor bug in PolygonCOM News scraper
16
-
22
+ # v2.1.0 - Updated PolygonCOM to reflect their new UI changes
23
+ # v2.1.1 - Fixed PolygonCOM Scraper to include videos if there are any
24
+ # v3.0.0 - Major overhaul to how an Article is parsed and returned to the User
25
+ # In a 'Wordpress' kind of style, various html elements are now
26
+ # returned in a way that an app can recognize and translate it into a
27
+ # widget
17
28
  end
data/lib/GDNewsScraper.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  require 'GDNewsScraper/version'
2
- require 'httparty'
3
2
  require 'nokogiri'
3
+ require 'open-uri'
4
4
 
5
5
  module GDNewsScraper
6
6
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: GDNewsScraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.2
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vlad Radulescu
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-06-22 00:00:00.000000000 Z
11
+ date: 2017-11-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,34 +24,6 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
- - !ruby/object:Gem::Dependency
28
- name: httparty
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: activesupport
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :runtime
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
27
  - !ruby/object:Gem::Dependency
56
28
  name: bundler
57
29
  requirement: !ruby/object:Gem::Requirement
@@ -80,34 +52,6 @@ dependencies:
80
52
  - - "~>"
81
53
  - !ruby/object:Gem::Version
82
54
  version: '10.0'
83
- - !ruby/object:Gem::Dependency
84
- name: rspec
85
- requirement: !ruby/object:Gem::Requirement
86
- requirements:
87
- - - "~>"
88
- - !ruby/object:Gem::Version
89
- version: '3.0'
90
- type: :development
91
- prerelease: false
92
- version_requirements: !ruby/object:Gem::Requirement
93
- requirements:
94
- - - "~>"
95
- - !ruby/object:Gem::Version
96
- version: '3.0'
97
- - !ruby/object:Gem::Dependency
98
- name: pry
99
- requirement: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - ">="
102
- - !ruby/object:Gem::Version
103
- version: '0'
104
- type: :development
105
- prerelease: false
106
- version_requirements: !ruby/object:Gem::Requirement
107
- requirements:
108
- - - ">="
109
- - !ruby/object:Gem::Version
110
- version: '0'
111
55
  description: A Ruby Scraper created for games.directory to crawl the web for gaming
112
56
  News and Reviews.
113
57
  email:
@@ -151,8 +95,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
151
95
  version: '0'
152
96
  requirements: []
153
97
  rubyforge_project:
154
- rubygems_version: 2.5.1
98
+ rubygems_version: 2.6.12
155
99
  signing_key:
156
100
  specification_version: 4
157
101
  summary: A Ruby web scraper for gaming News and Reviews
158
102
  test_files: []
103
+ has_rdoc: