GDNewsScraper 3.0.1 → 3.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 06e468f8771feccf3609fa31369c22b865c5645b
4
- data.tar.gz: 1936f2409c7f4c7bfb33cc58d3feaa5243e47203
3
+ metadata.gz: 692d674f129613f9b5fa1b379abf86c588d7f3f1
4
+ data.tar.gz: 3653b5992703ae9e7027e75b07ef0ed7818313a9
5
5
  SHA512:
6
- metadata.gz: 5a90733cf5b4403b607154fcc354153ae26aa4ca9cd57334258e59a88e9ffdf7c913cc9dddb7e4e5cb643a6481cb24ad874bdb1aad91b583923a45507ffb1686
7
- data.tar.gz: 310056309d60efe507b8e5ca92025f55945cc8dc4d17f934444f77e94281c20f129ab29242275ace4de86a0ddbd381b665ada074937c793b7dd0b0b592ff52db
6
+ metadata.gz: 580d4967034bed31b74e80b72993fda91614f6b8bb91c2c3f924590448e6f1c22c490ee6292722528434b619ed9f8a8c0625eb24c8dcaa767c4176adebebe184
7
+ data.tar.gz: fe10ea48908f0e012a14a78de2b340bb6ac21956475d7efdb4235a6d3bf0fc7e39e7a8a812c9cbb0e4844242d3b8ae63cff358b989e4a4c2fa6811e19e0105c6
@@ -1,109 +1,108 @@
1
- require 'pry'
2
1
  require 'base64'
3
2
  require 'json'
4
3
 
5
4
  module GDNewsScraper::Scrapers
6
-
7
- HEADERS ||= {
8
- "User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
9
- }
10
-
11
- STREAM_URI ||= 'https://www.polygon.com'
12
-
13
- WHITELIST ||= {
14
- default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'figure', 'blockquote', 'ul', 'ol'],
15
- inner: ['strong', 'em', 'li']
16
- }
17
-
18
- DOM = {
19
- article: {
20
- wrapper: '.c-compact-river',
21
- container: '.c-compact-river__entry',
22
- inner_container: '.c-entry-box--compact',
23
- inner_container_video: '.c-entry-box--compact--video',
24
- title: '.c-entry-box--compact__title',
25
- cover: '.c-entry-box--compact__image',
26
- meta: '.c-byline'
27
- },
28
-
29
- pagination: {
30
- previous: '.c-pagination__prev',
31
- info: '.c-pagination__text',
32
- next: '.c-pagination__next'
5
+ module PolygonCOM
6
+ HEADERS ||= {
7
+ "User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
8
+ }
9
+
10
+ STREAM_URI ||= 'https://www.polygon.com'
11
+
12
+ WHITELIST ||= {
13
+ default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'figure', 'blockquote', 'ul', 'ol'],
14
+ inner: ['strong', 'em', 'li']
15
+ }
16
+
17
+ DOM ||= {
18
+ article: {
19
+ wrapper: '.c-compact-river',
20
+ container: '.c-compact-river__entry',
21
+ inner_container: '.c-entry-box--compact',
22
+ inner_container_video: '.c-entry-box--compact--video',
23
+ title: '.c-entry-box--compact__title',
24
+ cover: '.c-entry-box--compact__image',
25
+ meta: '.c-byline'
26
+ },
27
+
28
+ pagination: {
29
+ previous: '.c-pagination__prev',
30
+ info: '.c-pagination__text',
31
+ next: '.c-pagination__next'
32
+ }
33
33
  }
34
- }
35
34
 
36
- module PolygonCOM
37
35
  class News
38
36
  attr_accessor :stream
39
37
 
40
38
  def initialize(offset = 0)
41
- uri = "#{ STREAM_URI }/news/archives/#{ offset }"
39
+ unless offset.nil?
40
+ uri = "#{ GDNewsScraper::Scrapers::PolygonCOM::STREAM_URI }/news/archives/#{ offset }"
42
41
 
43
- @page ||= Nokogiri::HTML(open(uri, HEADERS))
44
- @stream = Hash.new
42
+ @page = Nokogiri::HTML(open(uri, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
43
+ @stream = Hash.new
45
44
 
46
- stream[:stream] = Hash.new
47
- stream[:stream][:size] = @page.css(DOM[:pagination][:info]).text.split[0].gsub(/\D/, '').to_i
48
- stream[:stream][:pages] = @page.css(DOM[:pagination][:info]).text.split[6].gsub(/\D/, '').to_i
49
- stream[:stream][:prev] = @page.css(DOM[:pagination][:previous])&.first&.attr('href')&.split('/')&.last.to_i
50
- stream[:stream][:next] = @page.css(DOM[:pagination][:next])&.first&.attr('href')&.split('/')&.last.to_i
45
+ stream[:stream] = Hash.new
46
+ stream[:stream][:size] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:info]).text.split[0].gsub(/\D/, '').to_i
47
+ stream[:stream][:pages] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:info]).text.split[6].gsub(/\D/, '').to_i
48
+ stream[:stream][:prev] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:previous])&.first&.attr('href')&.split('/')&.last.to_i
49
+ stream[:stream][:next] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:next])&.first&.attr('href')&.split('/')&.last.to_i
51
50
 
52
- stream[:feed] = Hash.new
53
- stream[:feed][:url] = STREAM_URI
54
- stream[:feed][:source] = 'polygon'
55
- stream[:feed][:label] = 'Polygon'
51
+ stream[:feed] = Hash.new
52
+ stream[:feed][:url] = GDNewsScraper::Scrapers::PolygonCOM::STREAM_URI
53
+ stream[:feed][:source] = 'polygon'
54
+ stream[:feed][:label] = 'Polygon'
56
55
 
57
- stream[:articles] = Array.new
56
+ stream[:articles] = Array.new
58
57
 
59
- perform
60
- rescue
61
- return 'There was a problem initializing the PolygonCOM::News Service'
58
+ perform
59
+ end
62
60
  end
63
61
 
64
62
  def perform
65
- @page.css(DOM[:article][:container]).each do |article|
63
+ @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:container]).each do |article|
66
64
  stream[:articles] << parse(article)
67
65
  end
68
- rescue
69
- return 'There was a problem performing the initial task in the PolygonCOM::News Service'
66
+ end
67
+
68
+ def refresh(article_url)
69
+ parse_article_body(article_url)
70
70
  end
71
71
 
72
72
  def parse(article)
73
73
  pulse = Hash.new
74
74
 
75
- is_a_video = !article.at(DOM[:article][:inner_container_video]).nil?
75
+ is_a_video = !article.at(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:inner_container_video]).nil?
76
76
 
77
- key = article.css(DOM[:article][:inner_container]).first.attr('data-chorus-optimize-id').to_i
78
- url = article.css(DOM[:article][:title]).children.first.attr('href')
79
- title = article.css(DOM[:article][:title]).children.first.text
77
+ key = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:inner_container]).first.attr('data-chorus-optimize-id').to_i
78
+ url = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:title]).children.first.attr('href')
79
+ title = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:title]).children.first.text
80
80
 
81
81
  pulse[:id] = key
82
82
  pulse[:hash] = ::Base64.encode64("#{ title } - #{ key }")
83
83
 
84
84
  begin
85
- pulse[:cover] = article.children.css(DOM[:article][:cover]).children.children.first.attr('src')
85
+ pulse[:cover] = article.children.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:cover]).children.children.first.attr('src')
86
86
  rescue
87
87
  pulse[:cover] = nil
88
88
  end
89
89
 
90
90
  pulse[:url] = url
91
91
  pulse[:title] = title
92
- pulse[:author] = article.css(DOM[:article][:meta]).first.children[1].children[1].text
93
- pulse[:date] = JSON.parse(article.css(DOM[:article][:meta]).first.attr('data-cdata'))['timestamp'].to_i
92
+ pulse[:author] = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:meta]).first.children[1].children[1]&.text
93
+ pulse[:date] = JSON.parse(article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:meta]).first.attr('data-cdata'))['timestamp'].to_i
94
94
  pulse[:content] = parse_article_body(url, is_a_video)
95
95
  pulse[:tags] = title.downcase.split
96
96
 
97
-
98
97
  return pulse
99
- rescue
100
- return 'There was a problem creating the article in the PolygonCOM::News Service'
98
+ rescue => e
99
+ "There was a problem while parsing Article for '#{ title }' => #{ e }"
101
100
  end
102
101
 
103
102
  private
104
103
 
105
104
  def parse_article_body(article_url, is_a_video = false)
106
- article_page = Nokogiri::HTML(open(article_url, HEADERS))
105
+ article_page = Nokogiri::HTML(open(article_url, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
107
106
  article_container = article_page.css('.c-entry-content')
108
107
 
109
108
  article_body = {
@@ -169,6 +168,12 @@ module GDNewsScraper::Scrapers
169
168
  if twitdget
170
169
  article_body[:body] << twitdget.to_html
171
170
  end
171
+
172
+ redditget = node.at('.reddit-card')
173
+
174
+ if redditget
175
+ article_body[:body] << redditget.to_html
176
+ end
172
177
  end
173
178
 
174
179
  # First ensure the node is an actual element. This removes random HTML elements
@@ -181,7 +186,7 @@ module GDNewsScraper::Scrapers
181
186
  #
182
187
  # => WHITELIST[:default].include?(node.name)
183
188
  #
184
- if node.element? && WHITELIST[:default].include?(node.name)
189
+ if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
185
190
  case node.name
186
191
  when 'figure'
187
192
 
@@ -253,8 +258,8 @@ module GDNewsScraper::Scrapers
253
258
  end
254
259
 
255
260
  return article_body
256
- rescue
257
- return 'There was a problem parsing the article body in the PolygonCOM::News Service'
261
+ rescue => e
262
+ "There was a problem while parsing the Article body for '#{ title }' => #{ e }"
258
263
  end
259
264
 
260
265
  def attr(attribute)
@@ -1,5 +1,5 @@
1
1
  module GDNewsScraper
2
- VERSION ||= '3.0.1'
2
+ VERSION ||= '3.0.4'
3
3
 
4
4
  # => major: A new Source has been added or removed
5
5
  # => minor: A Source code has changed drastically to a point where it's not
@@ -27,4 +27,9 @@ module GDNewsScraper
27
27
  # widget
28
28
  # v3.0.1 - Change the placeholder from [] to {{:}} which makes it a lot easier
29
29
  # to scan and replace with Regex using scan(/\{{(.*?)\}}/)
30
+ # v3.0.2 - Parse Reddit inline widgets in the same way as Twitter widget
31
+ # v3.0.3 - Added a new method which will refresh the content of an Article
32
+ # v3.0.4 - Fixed an issue caused by Featured Articles which have a different
33
+ # DOM structure
34
+ #
30
35
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: GDNewsScraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.1
4
+ version: 3.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vlad Radulescu
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-11-27 00:00:00.000000000 Z
11
+ date: 2017-11-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri