GDNewsScraper 3.0.1 → 3.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 06e468f8771feccf3609fa31369c22b865c5645b
4
- data.tar.gz: 1936f2409c7f4c7bfb33cc58d3feaa5243e47203
3
+ metadata.gz: 692d674f129613f9b5fa1b379abf86c588d7f3f1
4
+ data.tar.gz: 3653b5992703ae9e7027e75b07ef0ed7818313a9
5
5
  SHA512:
6
- metadata.gz: 5a90733cf5b4403b607154fcc354153ae26aa4ca9cd57334258e59a88e9ffdf7c913cc9dddb7e4e5cb643a6481cb24ad874bdb1aad91b583923a45507ffb1686
7
- data.tar.gz: 310056309d60efe507b8e5ca92025f55945cc8dc4d17f934444f77e94281c20f129ab29242275ace4de86a0ddbd381b665ada074937c793b7dd0b0b592ff52db
6
+ metadata.gz: 580d4967034bed31b74e80b72993fda91614f6b8bb91c2c3f924590448e6f1c22c490ee6292722528434b619ed9f8a8c0625eb24c8dcaa767c4176adebebe184
7
+ data.tar.gz: fe10ea48908f0e012a14a78de2b340bb6ac21956475d7efdb4235a6d3bf0fc7e39e7a8a812c9cbb0e4844242d3b8ae63cff358b989e4a4c2fa6811e19e0105c6
@@ -1,109 +1,108 @@
1
- require 'pry'
2
1
  require 'base64'
3
2
  require 'json'
4
3
 
5
4
  module GDNewsScraper::Scrapers
6
-
7
- HEADERS ||= {
8
- "User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
9
- }
10
-
11
- STREAM_URI ||= 'https://www.polygon.com'
12
-
13
- WHITELIST ||= {
14
- default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'figure', 'blockquote', 'ul', 'ol'],
15
- inner: ['strong', 'em', 'li']
16
- }
17
-
18
- DOM = {
19
- article: {
20
- wrapper: '.c-compact-river',
21
- container: '.c-compact-river__entry',
22
- inner_container: '.c-entry-box--compact',
23
- inner_container_video: '.c-entry-box--compact--video',
24
- title: '.c-entry-box--compact__title',
25
- cover: '.c-entry-box--compact__image',
26
- meta: '.c-byline'
27
- },
28
-
29
- pagination: {
30
- previous: '.c-pagination__prev',
31
- info: '.c-pagination__text',
32
- next: '.c-pagination__next'
5
+ module PolygonCOM
6
+ HEADERS ||= {
7
+ "User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
8
+ }
9
+
10
+ STREAM_URI ||= 'https://www.polygon.com'
11
+
12
+ WHITELIST ||= {
13
+ default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'figure', 'blockquote', 'ul', 'ol'],
14
+ inner: ['strong', 'em', 'li']
15
+ }
16
+
17
+ DOM ||= {
18
+ article: {
19
+ wrapper: '.c-compact-river',
20
+ container: '.c-compact-river__entry',
21
+ inner_container: '.c-entry-box--compact',
22
+ inner_container_video: '.c-entry-box--compact--video',
23
+ title: '.c-entry-box--compact__title',
24
+ cover: '.c-entry-box--compact__image',
25
+ meta: '.c-byline'
26
+ },
27
+
28
+ pagination: {
29
+ previous: '.c-pagination__prev',
30
+ info: '.c-pagination__text',
31
+ next: '.c-pagination__next'
32
+ }
33
33
  }
34
- }
35
34
 
36
- module PolygonCOM
37
35
  class News
38
36
  attr_accessor :stream
39
37
 
40
38
  def initialize(offset = 0)
41
- uri = "#{ STREAM_URI }/news/archives/#{ offset }"
39
+ unless offset.nil?
40
+ uri = "#{ GDNewsScraper::Scrapers::PolygonCOM::STREAM_URI }/news/archives/#{ offset }"
42
41
 
43
- @page ||= Nokogiri::HTML(open(uri, HEADERS))
44
- @stream = Hash.new
42
+ @page = Nokogiri::HTML(open(uri, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
43
+ @stream = Hash.new
45
44
 
46
- stream[:stream] = Hash.new
47
- stream[:stream][:size] = @page.css(DOM[:pagination][:info]).text.split[0].gsub(/\D/, '').to_i
48
- stream[:stream][:pages] = @page.css(DOM[:pagination][:info]).text.split[6].gsub(/\D/, '').to_i
49
- stream[:stream][:prev] = @page.css(DOM[:pagination][:previous])&.first&.attr('href')&.split('/')&.last.to_i
50
- stream[:stream][:next] = @page.css(DOM[:pagination][:next])&.first&.attr('href')&.split('/')&.last.to_i
45
+ stream[:stream] = Hash.new
46
+ stream[:stream][:size] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:info]).text.split[0].gsub(/\D/, '').to_i
47
+ stream[:stream][:pages] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:info]).text.split[6].gsub(/\D/, '').to_i
48
+ stream[:stream][:prev] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:previous])&.first&.attr('href')&.split('/')&.last.to_i
49
+ stream[:stream][:next] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:next])&.first&.attr('href')&.split('/')&.last.to_i
51
50
 
52
- stream[:feed] = Hash.new
53
- stream[:feed][:url] = STREAM_URI
54
- stream[:feed][:source] = 'polygon'
55
- stream[:feed][:label] = 'Polygon'
51
+ stream[:feed] = Hash.new
52
+ stream[:feed][:url] = GDNewsScraper::Scrapers::PolygonCOM::STREAM_URI
53
+ stream[:feed][:source] = 'polygon'
54
+ stream[:feed][:label] = 'Polygon'
56
55
 
57
- stream[:articles] = Array.new
56
+ stream[:articles] = Array.new
58
57
 
59
- perform
60
- rescue
61
- return 'There was a problem initializing the PolygonCOM::News Service'
58
+ perform
59
+ end
62
60
  end
63
61
 
64
62
  def perform
65
- @page.css(DOM[:article][:container]).each do |article|
63
+ @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:container]).each do |article|
66
64
  stream[:articles] << parse(article)
67
65
  end
68
- rescue
69
- return 'There was a problem performing the initial task in the PolygonCOM::News Service'
66
+ end
67
+
68
+ def refresh(article_url)
69
+ parse_article_body(article_url)
70
70
  end
71
71
 
72
72
  def parse(article)
73
73
  pulse = Hash.new
74
74
 
75
- is_a_video = !article.at(DOM[:article][:inner_container_video]).nil?
75
+ is_a_video = !article.at(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:inner_container_video]).nil?
76
76
 
77
- key = article.css(DOM[:article][:inner_container]).first.attr('data-chorus-optimize-id').to_i
78
- url = article.css(DOM[:article][:title]).children.first.attr('href')
79
- title = article.css(DOM[:article][:title]).children.first.text
77
+ key = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:inner_container]).first.attr('data-chorus-optimize-id').to_i
78
+ url = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:title]).children.first.attr('href')
79
+ title = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:title]).children.first.text
80
80
 
81
81
  pulse[:id] = key
82
82
  pulse[:hash] = ::Base64.encode64("#{ title } - #{ key }")
83
83
 
84
84
  begin
85
- pulse[:cover] = article.children.css(DOM[:article][:cover]).children.children.first.attr('src')
85
+ pulse[:cover] = article.children.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:cover]).children.children.first.attr('src')
86
86
  rescue
87
87
  pulse[:cover] = nil
88
88
  end
89
89
 
90
90
  pulse[:url] = url
91
91
  pulse[:title] = title
92
- pulse[:author] = article.css(DOM[:article][:meta]).first.children[1].children[1].text
93
- pulse[:date] = JSON.parse(article.css(DOM[:article][:meta]).first.attr('data-cdata'))['timestamp'].to_i
92
+ pulse[:author] = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:meta]).first.children[1].children[1]&.text
93
+ pulse[:date] = JSON.parse(article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:meta]).first.attr('data-cdata'))['timestamp'].to_i
94
94
  pulse[:content] = parse_article_body(url, is_a_video)
95
95
  pulse[:tags] = title.downcase.split
96
96
 
97
-
98
97
  return pulse
99
- rescue
100
- return 'There was a problem creating the article in the PolygonCOM::News Service'
98
+ rescue => e
99
+ "There was a problem while parsing Article for '#{ title }' => #{ e }"
101
100
  end
102
101
 
103
102
  private
104
103
 
105
104
  def parse_article_body(article_url, is_a_video = false)
106
- article_page = Nokogiri::HTML(open(article_url, HEADERS))
105
+ article_page = Nokogiri::HTML(open(article_url, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
107
106
  article_container = article_page.css('.c-entry-content')
108
107
 
109
108
  article_body = {
@@ -169,6 +168,12 @@ module GDNewsScraper::Scrapers
169
168
  if twitdget
170
169
  article_body[:body] << twitdget.to_html
171
170
  end
171
+
172
+ redditget = node.at('.reddit-card')
173
+
174
+ if redditget
175
+ article_body[:body] << redditget.to_html
176
+ end
172
177
  end
173
178
 
174
179
  # First ensure the node is an actual element. This removes random HTML elements
@@ -181,7 +186,7 @@ module GDNewsScraper::Scrapers
181
186
  #
182
187
  # => WHITELIST[:default].include?(node.name)
183
188
  #
184
- if node.element? && WHITELIST[:default].include?(node.name)
189
+ if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
185
190
  case node.name
186
191
  when 'figure'
187
192
 
@@ -253,8 +258,8 @@ module GDNewsScraper::Scrapers
253
258
  end
254
259
 
255
260
  return article_body
256
- rescue
257
- return 'There was a problem parsing the article body in the PolygonCOM::News Service'
261
+ rescue => e
262
+ "There was a problem while parsing the Article body for '#{ title }' => #{ e }"
258
263
  end
259
264
 
260
265
  def attr(attribute)
@@ -1,5 +1,5 @@
1
1
  module GDNewsScraper
2
- VERSION ||= '3.0.1'
2
+ VERSION ||= '3.0.4'
3
3
 
4
4
  # => major: A new Source has been added or removed
5
5
  # => minor: A Source code has changed drastically to a point where it's not
@@ -27,4 +27,9 @@ module GDNewsScraper
27
27
  # widget
28
28
  # v3.0.1 - Change the placeholder from [] to {{:}} which makes it a lot easier
29
29
  # to scan and replace with Regex using scan(/\{{(.*?)\}}/)
30
+ # v3.0.2 - Parse Reddit inline widgets in the same way as Twitter widget
31
+ # v3.0.3 - Added a new method which will refresh the content of an Article
32
+ # v3.0.4 - Fixed an issue caused by Featured Articles which have a different
33
+ # DOM structure
34
+ #
30
35
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: GDNewsScraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.1
4
+ version: 3.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vlad Radulescu
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-11-27 00:00:00.000000000 Z
11
+ date: 2017-11-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri