GDNewsScraper 3.0.1 → 3.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/GDNewsScraper/scrapers/polygon_com/news.rb +68 -63
- data/lib/GDNewsScraper/version.rb +6 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 692d674f129613f9b5fa1b379abf86c588d7f3f1
|
4
|
+
data.tar.gz: 3653b5992703ae9e7027e75b07ef0ed7818313a9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 580d4967034bed31b74e80b72993fda91614f6b8bb91c2c3f924590448e6f1c22c490ee6292722528434b619ed9f8a8c0625eb24c8dcaa767c4176adebebe184
|
7
|
+
data.tar.gz: fe10ea48908f0e012a14a78de2b340bb6ac21956475d7efdb4235a6d3bf0fc7e39e7a8a812c9cbb0e4844242d3b8ae63cff358b989e4a4c2fa6811e19e0105c6
|
@@ -1,109 +1,108 @@
|
|
1
|
-
require 'pry'
|
2
1
|
require 'base64'
|
3
2
|
require 'json'
|
4
3
|
|
5
4
|
module GDNewsScraper::Scrapers
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
5
|
+
module PolygonCOM
|
6
|
+
HEADERS ||= {
|
7
|
+
"User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
|
8
|
+
}
|
9
|
+
|
10
|
+
STREAM_URI ||= 'https://www.polygon.com'
|
11
|
+
|
12
|
+
WHITELIST ||= {
|
13
|
+
default: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'figure', 'blockquote', 'ul', 'ol'],
|
14
|
+
inner: ['strong', 'em', 'li']
|
15
|
+
}
|
16
|
+
|
17
|
+
DOM ||= {
|
18
|
+
article: {
|
19
|
+
wrapper: '.c-compact-river',
|
20
|
+
container: '.c-compact-river__entry',
|
21
|
+
inner_container: '.c-entry-box--compact',
|
22
|
+
inner_container_video: '.c-entry-box--compact--video',
|
23
|
+
title: '.c-entry-box--compact__title',
|
24
|
+
cover: '.c-entry-box--compact__image',
|
25
|
+
meta: '.c-byline'
|
26
|
+
},
|
27
|
+
|
28
|
+
pagination: {
|
29
|
+
previous: '.c-pagination__prev',
|
30
|
+
info: '.c-pagination__text',
|
31
|
+
next: '.c-pagination__next'
|
32
|
+
}
|
33
33
|
}
|
34
|
-
}
|
35
34
|
|
36
|
-
module PolygonCOM
|
37
35
|
class News
|
38
36
|
attr_accessor :stream
|
39
37
|
|
40
38
|
def initialize(offset = 0)
|
41
|
-
|
39
|
+
unless offset.nil?
|
40
|
+
uri = "#{ GDNewsScraper::Scrapers::PolygonCOM::STREAM_URI }/news/archives/#{ offset }"
|
42
41
|
|
43
|
-
|
44
|
-
|
42
|
+
@page = Nokogiri::HTML(open(uri, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
|
43
|
+
@stream = Hash.new
|
45
44
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
45
|
+
stream[:stream] = Hash.new
|
46
|
+
stream[:stream][:size] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:info]).text.split[0].gsub(/\D/, '').to_i
|
47
|
+
stream[:stream][:pages] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:info]).text.split[6].gsub(/\D/, '').to_i
|
48
|
+
stream[:stream][:prev] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:previous])&.first&.attr('href')&.split('/')&.last.to_i
|
49
|
+
stream[:stream][:next] = @page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:pagination][:next])&.first&.attr('href')&.split('/')&.last.to_i
|
51
50
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
51
|
+
stream[:feed] = Hash.new
|
52
|
+
stream[:feed][:url] = GDNewsScraper::Scrapers::PolygonCOM::STREAM_URI
|
53
|
+
stream[:feed][:source] = 'polygon'
|
54
|
+
stream[:feed][:label] = 'Polygon'
|
56
55
|
|
57
|
-
|
56
|
+
stream[:articles] = Array.new
|
58
57
|
|
59
|
-
|
60
|
-
|
61
|
-
return 'There was a problem initializing the PolygonCOM::News Service'
|
58
|
+
perform
|
59
|
+
end
|
62
60
|
end
|
63
61
|
|
64
62
|
def perform
|
65
|
-
@page.css(DOM[:article][:container]).each do |article|
|
63
|
+
@page.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:container]).each do |article|
|
66
64
|
stream[:articles] << parse(article)
|
67
65
|
end
|
68
|
-
|
69
|
-
|
66
|
+
end
|
67
|
+
|
68
|
+
def refresh(article_url)
|
69
|
+
parse_article_body(article_url)
|
70
70
|
end
|
71
71
|
|
72
72
|
def parse(article)
|
73
73
|
pulse = Hash.new
|
74
74
|
|
75
|
-
is_a_video = !article.at(DOM[:article][:inner_container_video]).nil?
|
75
|
+
is_a_video = !article.at(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:inner_container_video]).nil?
|
76
76
|
|
77
|
-
key = article.css(DOM[:article][:inner_container]).first.attr('data-chorus-optimize-id').to_i
|
78
|
-
url = article.css(DOM[:article][:title]).children.first.attr('href')
|
79
|
-
title = article.css(DOM[:article][:title]).children.first.text
|
77
|
+
key = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:inner_container]).first.attr('data-chorus-optimize-id').to_i
|
78
|
+
url = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:title]).children.first.attr('href')
|
79
|
+
title = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:title]).children.first.text
|
80
80
|
|
81
81
|
pulse[:id] = key
|
82
82
|
pulse[:hash] = ::Base64.encode64("#{ title } - #{ key }")
|
83
83
|
|
84
84
|
begin
|
85
|
-
pulse[:cover] = article.children.css(DOM[:article][:cover]).children.children.first.attr('src')
|
85
|
+
pulse[:cover] = article.children.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:cover]).children.children.first.attr('src')
|
86
86
|
rescue
|
87
87
|
pulse[:cover] = nil
|
88
88
|
end
|
89
89
|
|
90
90
|
pulse[:url] = url
|
91
91
|
pulse[:title] = title
|
92
|
-
pulse[:author] = article.css(DOM[:article][:meta]).first.children[1].children[1]
|
93
|
-
pulse[:date] = JSON.parse(article.css(DOM[:article][:meta]).first.attr('data-cdata'))['timestamp'].to_i
|
92
|
+
pulse[:author] = article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:meta]).first.children[1].children[1]&.text
|
93
|
+
pulse[:date] = JSON.parse(article.css(GDNewsScraper::Scrapers::PolygonCOM::DOM[:article][:meta]).first.attr('data-cdata'))['timestamp'].to_i
|
94
94
|
pulse[:content] = parse_article_body(url, is_a_video)
|
95
95
|
pulse[:tags] = title.downcase.split
|
96
96
|
|
97
|
-
|
98
97
|
return pulse
|
99
|
-
rescue
|
100
|
-
|
98
|
+
rescue => e
|
99
|
+
"There was a problem while parsing Article for '#{ title }' => #{ e }"
|
101
100
|
end
|
102
101
|
|
103
102
|
private
|
104
103
|
|
105
104
|
def parse_article_body(article_url, is_a_video = false)
|
106
|
-
article_page = Nokogiri::HTML(open(article_url, HEADERS))
|
105
|
+
article_page = Nokogiri::HTML(open(article_url, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
|
107
106
|
article_container = article_page.css('.c-entry-content')
|
108
107
|
|
109
108
|
article_body = {
|
@@ -169,6 +168,12 @@ module GDNewsScraper::Scrapers
|
|
169
168
|
if twitdget
|
170
169
|
article_body[:body] << twitdget.to_html
|
171
170
|
end
|
171
|
+
|
172
|
+
redditget = node.at('.reddit-card')
|
173
|
+
|
174
|
+
if redditget
|
175
|
+
article_body[:body] << redditget.to_html
|
176
|
+
end
|
172
177
|
end
|
173
178
|
|
174
179
|
# First ensure the node is an actual element. This removes random HTML elements
|
@@ -181,7 +186,7 @@ module GDNewsScraper::Scrapers
|
|
181
186
|
#
|
182
187
|
# => WHITELIST[:default].include?(node.name)
|
183
188
|
#
|
184
|
-
if node.element? && WHITELIST[:default].include?(node.name)
|
189
|
+
if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
|
185
190
|
case node.name
|
186
191
|
when 'figure'
|
187
192
|
|
@@ -253,8 +258,8 @@ module GDNewsScraper::Scrapers
|
|
253
258
|
end
|
254
259
|
|
255
260
|
return article_body
|
256
|
-
rescue
|
257
|
-
|
261
|
+
rescue => e
|
262
|
+
"There was a problem while parsing the Article body for '#{ title }' => #{ e }"
|
258
263
|
end
|
259
264
|
|
260
265
|
def attr(attribute)
|
@@ -1,5 +1,5 @@
|
|
1
1
|
module GDNewsScraper
|
2
|
-
VERSION ||= '3.0.1'
|
2
|
+
VERSION ||= '3.0.4'
|
3
3
|
|
4
4
|
# => major: A new Source has been added or removed
|
5
5
|
# => minor: A Source code has changed drastically to a point where it's not
|
@@ -27,4 +27,9 @@ module GDNewsScraper
|
|
27
27
|
# widget
|
28
28
|
# v3.0.1 - Change the placeholder from [] to {{:}} which makes it a lot easier
|
29
29
|
# to scan and replace with Regex using scan(/\{{(.*?)\}}/)
|
30
|
+
# v3.0.2 - Parse Reddit inline widgets in the same way as Twitter widget
|
31
|
+
# v3.0.3 - Added a new method which will refresh the content of an Article
|
32
|
+
# v3.0.4 - Fixed an issue caused by Featured Articles which have a different
|
33
|
+
# DOM structure
|
34
|
+
#
|
30
35
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: GDNewsScraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.1
|
4
|
+
version: 3.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Vlad Radulescu
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-11-
|
11
|
+
date: 2017-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|