GDNewsScraper 3.0.1 → 3.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/GDNewsScraper/scrapers/polygon_com/news.rb +68 -63
- data/lib/GDNewsScraper/version.rb +6 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 692d674f129613f9b5fa1b379abf86c588d7f3f1
|
|
4
|
+
data.tar.gz: 3653b5992703ae9e7027e75b07ef0ed7818313a9
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 580d4967034bed31b74e80b72993fda91614f6b8bb91c2c3f924590448e6f1c22c490ee6292722528434b619ed9f8a8c0625eb24c8dcaa767c4176adebebe184
|
|
7
|
+
data.tar.gz: fe10ea48908f0e012a14a78de2b340bb6ac21956475d7efdb4235a6d3bf0fc7e39e7a8a812c9cbb0e4844242d3b8ae63cff358b989e4a4c2fa6811e19e0105c6
|
|
@@ -1,109 +1,108 @@
|
|
|
1
|
-
require 'pry'
|
|
2
1
|
require 'base64'
|
|
3
2
|
require 'json'
|
|
4
3
|
|
|
5
4
|
module GDNewsScraper::Scrapers
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
5
|
+
module PolygonCOM
|
|
6
|
+
HEADERS ||= {
|
|
7
|
+
"User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
|
|
8
|
+
}
|
|
9
|
+
|
|
10
|
+
STREAM_URI ||= 'https://www.polygon.com'
|
|
11
|
+
|
|
12
|
+
WHITELIST ||= {
|
|
13
|
+
default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'figure', 'blockquote', 'ul', 'ol'],
|
|
14
|
+
inner: ['strong', 'em', 'li']
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
DOM ||= {
|
|
18
|
+
article: {
|
|
19
|
+
wrapper: '.c-compact-river',
|
|
20
|
+
container: '.c-compact-river__entry',
|
|
21
|
+
inner_container: '.c-entry-box--compact',
|
|
22
|
+
inner_container_video: '.c-entry-box--compact--video',
|
|
23
|
+
title: '.c-entry-box--compact__title',
|
|
24
|
+
cover: '.c-entry-box--compact__image',
|
|
25
|
+
meta: '.c-byline'
|
|
26
|
+
},
|
|
27
|
+
|
|
28
|
+
pagination: {
|
|
29
|
+
previous: '.c-pagination__prev',
|
|
30
|
+
info: '.c-pagination__text',
|
|
31
|
+
next: '.c-pagination__next'
|
|
32
|
+
}
|
|
33
33
|
}
|
|
34
|
-
}
|
|
35
34
|
|
|
36
|
-
module PolygonCOM
|
|
37
35
|
class News
|
|
38
36
|
attr_accessor :stream
|
|
39
37
|
|
|
40
38
|
def initialize(offset = 0)
|
|
41
|
-
|
|
39
|
+
unless offset.nil?
|
|
40
|
+
uri = "#{ GDNewsScraper::Scrapers::PolygonCOM::STREAM_URI }/news/archives/#{ offset }"
|
|
42
41
|
|
|
43
|
-
|
|
44
|
-
|
|
42
|
+
@page = Nokogiri::HTML(open(uri, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
|
|
43
|
+
@stream = Hash.new
|
|
45
44
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
45
|
+
stream[:stream] = Hash.new
|
|
46
|
+
stream[:stream][:size] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:info]).text.split[0].gsub(/\D/, '').to_i
|
|
47
|
+
stream[:stream][:pages] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:info]).text.split[6].gsub(/\D/, '').to_i
|
|
48
|
+
stream[:stream][:prev] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:previous])&.first&.attr('href')&.split('/')&.last.to_i
|
|
49
|
+
stream[:stream][:next] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:next])&.first&.attr('href')&.split('/')&.last.to_i
|
|
51
50
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
51
|
+
stream[:feed] = Hash.new
|
|
52
|
+
stream[:feed][:url] = GDNewsScraper::Scrapers::PolygonCOM::STREAM_URI
|
|
53
|
+
stream[:feed][:source] = 'polygon'
|
|
54
|
+
stream[:feed][:label] = 'Polygon'
|
|
56
55
|
|
|
57
|
-
|
|
56
|
+
stream[:articles] = Array.new
|
|
58
57
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
return 'There was a problem initializing the PolygonCOM::News Service'
|
|
58
|
+
perform
|
|
59
|
+
end
|
|
62
60
|
end
|
|
63
61
|
|
|
64
62
|
def perform
|
|
65
|
-
@page.css(DOM[:article][:container]).each do |article|
|
|
63
|
+
@page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:container]).each do |article|
|
|
66
64
|
stream[:articles] << parse(article)
|
|
67
65
|
end
|
|
68
|
-
|
|
69
|
-
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
def refresh(article_url)
|
|
69
|
+
parse_article_body(article_url)
|
|
70
70
|
end
|
|
71
71
|
|
|
72
72
|
def parse(article)
|
|
73
73
|
pulse = Hash.new
|
|
74
74
|
|
|
75
|
-
is_a_video = !article.at(DOM[:article][:inner_container_video]).nil?
|
|
75
|
+
is_a_video = !article.at(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:inner_container_video]).nil?
|
|
76
76
|
|
|
77
|
-
key = article.css(DOM[:article][:inner_container]).first.attr('data-chorus-optimize-id').to_i
|
|
78
|
-
url = article.css(DOM[:article][:title]).children.first.attr('href')
|
|
79
|
-
title = article.css(DOM[:article][:title]).children.first.text
|
|
77
|
+
key = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:inner_container]).first.attr('data-chorus-optimize-id').to_i
|
|
78
|
+
url = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:title]).children.first.attr('href')
|
|
79
|
+
title = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:title]).children.first.text
|
|
80
80
|
|
|
81
81
|
pulse[:id] = key
|
|
82
82
|
pulse[:hash] = ::Base64.encode64("#{ title } - #{ key }")
|
|
83
83
|
|
|
84
84
|
begin
|
|
85
|
-
pulse[:cover] = article.children.css(DOM[:article][:cover]).children.children.first.attr('src')
|
|
85
|
+
pulse[:cover] = article.children.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:cover]).children.children.first.attr('src')
|
|
86
86
|
rescue
|
|
87
87
|
pulse[:cover] = nil
|
|
88
88
|
end
|
|
89
89
|
|
|
90
90
|
pulse[:url] = url
|
|
91
91
|
pulse[:title] = title
|
|
92
|
-
pulse[:author] = article.css(DOM[:article][:meta]).first.children[1].children[1]
|
|
93
|
-
pulse[:date] = JSON.parse(article.css(DOM[:article][:meta]).first.attr('data-cdata'))['timestamp'].to_i
|
|
92
|
+
pulse[:author] = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:meta]).first.children[1].children[1]&.text
|
|
93
|
+
pulse[:date] = JSON.parse(article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:meta]).first.attr('data-cdata'))['timestamp'].to_i
|
|
94
94
|
pulse[:content] = parse_article_body(url, is_a_video)
|
|
95
95
|
pulse[:tags] = title.downcase.split
|
|
96
96
|
|
|
97
|
-
|
|
98
97
|
return pulse
|
|
99
|
-
rescue
|
|
100
|
-
|
|
98
|
+
rescue => e
|
|
99
|
+
"There was a problem while parsing Article for '#{ title }' => #{ e }"
|
|
101
100
|
end
|
|
102
101
|
|
|
103
102
|
private
|
|
104
103
|
|
|
105
104
|
def parse_article_body(article_url, is_a_video = false)
|
|
106
|
-
article_page = Nokogiri::HTML(open(article_url, HEADERS))
|
|
105
|
+
article_page = Nokogiri::HTML(open(article_url, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
|
|
107
106
|
article_container = article_page.css('.c-entry-content')
|
|
108
107
|
|
|
109
108
|
article_body = {
|
|
@@ -169,6 +168,12 @@ module GDNewsScraper::Scrapers
|
|
|
169
168
|
if twitdget
|
|
170
169
|
article_body[:body] << twitdget.to_html
|
|
171
170
|
end
|
|
171
|
+
|
|
172
|
+
redditget = node.at('.reddit-card')
|
|
173
|
+
|
|
174
|
+
if redditget
|
|
175
|
+
article_body[:body] << redditget.to_html
|
|
176
|
+
end
|
|
172
177
|
end
|
|
173
178
|
|
|
174
179
|
# First ensure the node is an actual element. This removes random HTML elements
|
|
@@ -181,7 +186,7 @@ module GDNewsScraper::Scrapers
|
|
|
181
186
|
#
|
|
182
187
|
# => WHITELIST[:default].include?(node.name)
|
|
183
188
|
#
|
|
184
|
-
if node.element? && WHITELIST[:default].include?(node.name)
|
|
189
|
+
if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
|
|
185
190
|
case node.name
|
|
186
191
|
when 'figure'
|
|
187
192
|
|
|
@@ -253,8 +258,8 @@ module GDNewsScraper::Scrapers
|
|
|
253
258
|
end
|
|
254
259
|
|
|
255
260
|
return article_body
|
|
256
|
-
rescue
|
|
257
|
-
|
|
261
|
+
rescue => e
|
|
262
|
+
"There was a problem while parsing the Article body for '#{ title }' => #{ e }"
|
|
258
263
|
end
|
|
259
264
|
|
|
260
265
|
def attr(attribute)
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
module GDNewsScraper
|
|
2
|
-
VERSION ||= '3.0.1'
|
|
2
|
+
VERSION ||= '3.0.4'
|
|
3
3
|
|
|
4
4
|
# => major: A new Source has been added or removed
|
|
5
5
|
# => minor: A Source code has changed drastically to a point where it's not
|
|
@@ -27,4 +27,9 @@ module GDNewsScraper
|
|
|
27
27
|
# widget
|
|
28
28
|
# v3.0.1 - Change the placeholder from [] to {{:}} which makes it a lot easier
|
|
29
29
|
# to scan and replace with Regex using scan(/\{{(.*?)\}}/)
|
|
30
|
+
# v3.0.2 - Parse Reddit inline widgets in the same way as Twitter widget
|
|
31
|
+
# v3.0.3 - Added a new method which will refresh the content of an Article
|
|
32
|
+
# v3.0.4 - Fixed an issue caused by Featured Articles which have a different
|
|
33
|
+
# DOM structure
|
|
34
|
+
#
|
|
30
35
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: GDNewsScraper
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.0.1
|
|
4
|
+
version: 3.0.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Vlad Radulescu
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2017-11-
|
|
11
|
+
date: 2017-11-28 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: nokogiri
|