CLI_Headline_Scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.rspec +2 -0
- data/.travis.yml +5 -0
- data/CLI_Headline_Scraper.gemspec +42 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +31 -0
- data/Rakefile +6 -0
- data/bin/console +20 -0
- data/bin/headline_scraper +14 -0
- data/bin/setup +8 -0
- data/lib/CLI_Headline_Scraper/Article.rb +48 -0
- data/lib/CLI_Headline_Scraper/CLI.rb +213 -0
- data/lib/CLI_Headline_Scraper/Network.rb +46 -0
- data/lib/CLI_Headline_Scraper/Scraper.rb +140 -0
- data/lib/CLI_Headline_Scraper/version.rb +3 -0
- data/lib/environment.rb +5 -0
- data/selection +26 -0
- data/spec.md +14 -0
- metadata +182 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 16e63056768bdb20ef402930ff101d595357db3c
|
4
|
+
data.tar.gz: 73f6ac8498d9934ce7e68d4a4b7eb9bf3c99c5e3
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3a2eeb32652cb2a35c9ce0941d8b0fb4a344c1aebc2bcd1888e72bfb3e34cdf236a9377169452b639ed5872f26099771402b4540524472d5571e86b6ac1dd409
|
7
|
+
data.tar.gz: 1dec8df00d36ee2dcac4793a633523785496aae439175721a441a15975d732d818cb0cd661c930b717f46bc73b055fb3d45e4b9b41b24b2c78b3910d22c1c8d8
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path("../lib", __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require "CLI_Headline_Scraper/version"
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "CLI_Headline_Scraper"
|
8
|
+
spec.version = CLIHeadlineScraper::VERSION
|
9
|
+
spec.authors = ["Jim Stricker"]
|
10
|
+
spec.email = ["jmstricker93@gmail.com"]
|
11
|
+
|
12
|
+
spec.summary = %q{A headline-scraping CLI program.}
|
13
|
+
spec.description = %q{This program will automatically compile a list of the top three headlines from a wide range of online news sources. It is intended less as a way to get your news every day, and more as an interesting tool to get a quick snapshot of how the day's top stories are being portrayed across the media ecosystem at a given moment. For instance, if you wanted to quickly see what is being emphasized in media outlets consisting of different political inclinations or based in different countries, this would allow you to do so quickly, rather than going to all of these websites' homepages manually.}
|
14
|
+
spec.homepage = "https://github.com/jmstrick93/CLI_Headline_Scraper"
|
15
|
+
spec.license = "MIT"
|
16
|
+
|
17
|
+
# # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
|
18
|
+
# # to allow pushing to a single host or delete this section to allow pushing to any host.
|
19
|
+
# if spec.respond_to?(:metadata)
|
20
|
+
# spec.metadata["allowed_push_host"] = 'http://mygemserver.com'
|
21
|
+
# else
|
22
|
+
# raise "RubyGems 2.0 or newer is required to protect against " \
|
23
|
+
# "public gem pushes."
|
24
|
+
# end
|
25
|
+
|
26
|
+
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
27
|
+
f.match(%r{^(test|spec|features)/})
|
28
|
+
end
|
29
|
+
spec.bindir = "exe"
|
30
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
31
|
+
spec.require_paths = ["lib"]
|
32
|
+
|
33
|
+
spec.add_development_dependency "bundler", "~> 1.15"
|
34
|
+
spec.add_development_dependency "rake", "~> 10.0"
|
35
|
+
spec.add_development_dependency "rspec", "~> 3.0"
|
36
|
+
spec.add_development_dependency "pry", "~> 0.10"
|
37
|
+
spec.add_development_dependency "timecop", "~>0.9"
|
38
|
+
|
39
|
+
spec.add_dependency "require_all", "~> 1.4"
|
40
|
+
spec.add_dependency "launchy", "~> 2.4"
|
41
|
+
spec.add_dependency "nokogiri", "~> 1.8"
|
42
|
+
end
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2017 TODO: Write your name
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# CLIHeadlineScraper
|
2
|
+
|
3
|
+
|
4
|
+
## Installation
|
5
|
+
|
6
|
+
You can install this gem via `gem install CLI_Headline_Scraper`.
|
7
|
+
|
8
|
+
## Usage
|
9
|
+
|
10
|
+
You can run this gem using the `headline_scraper` CLI command. You will then be given a list of headlines and news networks from which to select (instructions for doing so will be provided onscreen).
|
11
|
+
|
12
|
+
|
13
|
+
## Development
|
14
|
+
|
15
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
16
|
+
|
17
|
+
To install this gem onto your local machine, run `bundle exec rake install`.
|
18
|
+
|
19
|
+
## Contributing
|
20
|
+
|
21
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/jmstrick93/CLI_Headline_Scraper.
|
22
|
+
|
23
|
+
Contributions adding functionality for additional news networks are encouraged.
|
24
|
+
|
25
|
+
Contributions adding a more robust RSpec suite are also welcome.
|
26
|
+
|
27
|
+
The scraper object will likely need to be occasionally maintained as the HTML formats of networks' websites changes. If you encounter an error while using this software, please report it at the GitHub repo above.
|
28
|
+
|
29
|
+
## License
|
30
|
+
|
31
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "environment"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
require "irb"
|
13
|
+
|
14
|
+
###Testing Conditions###
|
15
|
+
|
16
|
+
|
17
|
+
|
18
|
+
|
19
|
+
###end testing conditions###
|
20
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
class Article
|
2
|
+
|
3
|
+
attr_accessor :network_name, :network, :headline, :url, :authors, :html, :date, :number_of_comments, :summary
|
4
|
+
|
5
|
+
@@all = []
|
6
|
+
|
7
|
+
def initialize(headline, network_name) #headline will eventually be input as a scraper object.
|
8
|
+
self.class.all << self
|
9
|
+
@network_name = network_name
|
10
|
+
@network = Network.find_or_create_by_name(network_name)
|
11
|
+
@network.articles << self
|
12
|
+
#belongs to network
|
13
|
+
@headline = headline
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.all
|
17
|
+
@@all
|
18
|
+
end
|
19
|
+
|
20
|
+
def self.create_with_url(headline, network_name, url)
|
21
|
+
article = Article.new(headline, network_name)
|
22
|
+
article.url = url
|
23
|
+
article
|
24
|
+
end
|
25
|
+
|
26
|
+
|
27
|
+
def self.find_by_headline(headline)
|
28
|
+
self.all.detect{|item| item.headline == headline}
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.find_by_network_name(network_name)
|
32
|
+
self.all.select{|item| item.network_name == network_name}
|
33
|
+
end
|
34
|
+
|
35
|
+
|
36
|
+
def populate_metadata()
|
37
|
+
#retrieves metadata of Reuters article -- right now just time/date.
|
38
|
+
#1. Scrapes data from the selected article's url.(separate)
|
39
|
+
#3. Uses that data to populate article.authors, article.date_posted, article.text.
|
40
|
+
|
41
|
+
Scraper.reuters_article(self)
|
42
|
+
|
43
|
+
|
44
|
+
article = Article.find_by_headline(headline)
|
45
|
+
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
@@ -0,0 +1,213 @@
|
|
1
|
+
#Our CLI Controller
|
2
|
+
class CLI
|
3
|
+
|
4
|
+
attr_reader :time
|
5
|
+
attr_accessor :current_item
|
6
|
+
|
7
|
+
def initialize
|
8
|
+
end
|
9
|
+
|
10
|
+
def call
|
11
|
+
|
12
|
+
self.greet
|
13
|
+
self.display_menu #initial menu selection of what you want to see
|
14
|
+
self.respond_to_selection(self.select_item)
|
15
|
+
|
16
|
+
end
|
17
|
+
|
18
|
+
def greet
|
19
|
+
puts "Welcome to Headline Scraper"
|
20
|
+
sleep(1)
|
21
|
+
puts "Please select which of the following articles you would like to view:"
|
22
|
+
sleep(2)
|
23
|
+
puts ""
|
24
|
+
end
|
25
|
+
|
26
|
+
def display_menu
|
27
|
+
self.class.display_time
|
28
|
+
puts ""
|
29
|
+
self.print_group_headlines
|
30
|
+
end
|
31
|
+
|
32
|
+
|
33
|
+
def print_group_headlines
|
34
|
+
Network.all.each do |network|
|
35
|
+
puts network.name #prints network name once
|
36
|
+
network.print_headlines # prints network headlines in numbered list
|
37
|
+
puts "" #for spacing
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
def valid_selection?(selection) #pre-screens nonsensical entries. DOES NOT check whether the item entered exists
|
42
|
+
if selection == nil #
|
43
|
+
false
|
44
|
+
elsif selection.length == 0
|
45
|
+
false
|
46
|
+
elsif selection.length == 1
|
47
|
+
if selection[0].to_i != 0 #makes sure first item isn't an Integer
|
48
|
+
false
|
49
|
+
else
|
50
|
+
true
|
51
|
+
end
|
52
|
+
elsif selection.length == 2
|
53
|
+
if selection[0].to_i != 0 #makes sure first item isnt Integer
|
54
|
+
false
|
55
|
+
else
|
56
|
+
if selection[1].to_i == 0 #makes sure second item IS integer
|
57
|
+
false
|
58
|
+
elsif selection[1].to_i > 3 #makes sure there are not >3 entries
|
59
|
+
false
|
60
|
+
else
|
61
|
+
true
|
62
|
+
end
|
63
|
+
end
|
64
|
+
elsif selection.length > 2 #makes sure entry isnt longer than 3
|
65
|
+
false
|
66
|
+
else
|
67
|
+
true
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
|
72
|
+
def select_item #returns an array where arr[0] is the network name and arr[1] is the article number.
|
73
|
+
#currently accepts all entries that do not contain a colon. Later make it so it checks whether the network entered exists.
|
74
|
+
selection = nil
|
75
|
+
until selection_exists?(selection) || selection == 'EXIT'
|
76
|
+
puts "To go to a network homepage, just type the name of that network."
|
77
|
+
puts "To go to a specific story, type the network name and then the article number, separated by a colon (e.g., BBC : 2)"
|
78
|
+
puts "To exit at any time, type 'exit'."
|
79
|
+
|
80
|
+
selection = gets.strip
|
81
|
+
selection = selection.split(":") if selection != nil #turns the entered data into an array so it can be processed
|
82
|
+
if valid_selection?(selection)
|
83
|
+
selection[0].strip!
|
84
|
+
selection[0] = selection[0].upcase
|
85
|
+
if selection.length == 1
|
86
|
+
if selection[0] == 'EXIT'
|
87
|
+
self.exit_CLI
|
88
|
+
end
|
89
|
+
elsif selection.length == 2
|
90
|
+
selection[1].strip!
|
91
|
+
selection[1] = selection[1].to_i
|
92
|
+
end
|
93
|
+
|
94
|
+
if !selection_exists?(selection)
|
95
|
+
puts "Selection not found"
|
96
|
+
end
|
97
|
+
else
|
98
|
+
puts "Invalid Entry"
|
99
|
+
end
|
100
|
+
|
101
|
+
end
|
102
|
+
selection
|
103
|
+
end
|
104
|
+
|
105
|
+
def exit_CLI
|
106
|
+
puts "Goodbye!"
|
107
|
+
exit
|
108
|
+
end
|
109
|
+
|
110
|
+
def respond_to_selection(selection)
|
111
|
+
if selection.length == 1
|
112
|
+
the_network = Network.find_by_name(selection[0])
|
113
|
+
the_network.go_to_homepage
|
114
|
+
elsif selection.length == 2
|
115
|
+
the_network = Network.find_by_name(selection[0])
|
116
|
+
the_article = the_network.articles[selection[1]-1]
|
117
|
+
self.article_options_menu(the_article)
|
118
|
+
end
|
119
|
+
|
120
|
+
|
121
|
+
end
|
122
|
+
|
123
|
+
def selection_exists?(selection) #post-screens entries to make sure the valid entry actually refers to an existing item
|
124
|
+
if self.valid_selection?(selection)
|
125
|
+
if selection.length == 1
|
126
|
+
if Network.find_by_name(selection[0])
|
127
|
+
true
|
128
|
+
else
|
129
|
+
false
|
130
|
+
end
|
131
|
+
elsif selection.length == 2
|
132
|
+
if Network.find_by_name(selection[0])
|
133
|
+
if selection[1] > Network.find_by_name(selection[0]).articles.length || selection[1] <= 0
|
134
|
+
false
|
135
|
+
else
|
136
|
+
true
|
137
|
+
end
|
138
|
+
else
|
139
|
+
false
|
140
|
+
end
|
141
|
+
end
|
142
|
+
else
|
143
|
+
false
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
|
148
|
+
|
149
|
+
def select_scrape_method(article)
|
150
|
+
|
151
|
+
case article.network_name
|
152
|
+
|
153
|
+
when "REUTERS"
|
154
|
+
Scraper.reuters_article(article)
|
155
|
+
when "FOX NEWS"
|
156
|
+
Scraper.fox_article(article)
|
157
|
+
when "MSNBC"
|
158
|
+
Scraper.msnbc_article(article)
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
162
|
+
|
163
|
+
|
164
|
+
|
165
|
+
|
166
|
+
|
167
|
+
def article_options_menu(article)
|
168
|
+
#takes article object as an argument
|
169
|
+
#automatically displays article headline, network name, and article metadata (i.e. author, date & time posted, number of comments, tags etc.)
|
170
|
+
|
171
|
+
#gives the option for the user to either go to the article in browser or scrape the contents of the article
|
172
|
+
self.select_scrape_method(article)
|
173
|
+
|
174
|
+
puts "_____________________________________"
|
175
|
+
puts ""
|
176
|
+
puts article.network_name
|
177
|
+
puts article.headline
|
178
|
+
puts article.date
|
179
|
+
puts ""
|
180
|
+
puts article.summary
|
181
|
+
puts ""
|
182
|
+
puts "---------------"
|
183
|
+
puts ""
|
184
|
+
|
185
|
+
puts "What would you like to do? Enter a number."
|
186
|
+
puts "1. View article in browser."
|
187
|
+
puts "2. Return to previous menu."
|
188
|
+
puts "Or type 'exit'."
|
189
|
+
|
190
|
+
input = gets.strip.upcase
|
191
|
+
case input
|
192
|
+
when "1"
|
193
|
+
Launchy.open(article.url)
|
194
|
+
when "2"
|
195
|
+
self.display_menu
|
196
|
+
self.respond_to_selection(self.select_item)
|
197
|
+
when "EXIT"
|
198
|
+
self.exit_CLI
|
199
|
+
else
|
200
|
+
puts "Invalid Selection"
|
201
|
+
self.article_options_menu(article)
|
202
|
+
end
|
203
|
+
end
|
204
|
+
|
205
|
+
|
206
|
+
def retrieve_article
|
207
|
+
end
|
208
|
+
|
209
|
+
def self.display_time
|
210
|
+
puts Time.new
|
211
|
+
end
|
212
|
+
|
213
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
class Network
|
2
|
+
|
3
|
+
attr_accessor :articles, :url, :home_html
|
4
|
+
attr_reader :name
|
5
|
+
|
6
|
+
@@all = []
|
7
|
+
|
8
|
+
def initialize(name)
|
9
|
+
@name = name
|
10
|
+
@articles = [] #network has many articles
|
11
|
+
self.class.all << self
|
12
|
+
end
|
13
|
+
|
14
|
+
def self.all
|
15
|
+
@@all
|
16
|
+
end
|
17
|
+
|
18
|
+
def self.create_with_url(name, url)
|
19
|
+
network = self.new(name)
|
20
|
+
network.url = url
|
21
|
+
network
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.find_by_name(name)
|
25
|
+
self.all.detect{|item| item.name == name}
|
26
|
+
end
|
27
|
+
|
28
|
+
def self.find_or_create_by_name(name)
|
29
|
+
self.find_by_name(name) || self.new(name)
|
30
|
+
end
|
31
|
+
|
32
|
+
|
33
|
+
def print_headlines
|
34
|
+
self.articles.each.with_index(1) do |article, i|
|
35
|
+
puts "#{i}. #{article.headline}"
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
|
40
|
+
def go_to_homepage
|
41
|
+
Launchy.open(self.url)
|
42
|
+
end
|
43
|
+
|
44
|
+
|
45
|
+
|
46
|
+
end
|
@@ -0,0 +1,140 @@
|
|
1
|
+
class Scraper
|
2
|
+
|
3
|
+
def self.get_page(url)
|
4
|
+
doc = Nokogiri::HTML(open(url))
|
5
|
+
end
|
6
|
+
|
7
|
+
#<<<<<<<<<<<<<<<<<<REUTERS SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
|
8
|
+
|
9
|
+
def self.reuters_homepage
|
10
|
+
|
11
|
+
url = "https://www.reuters.com"
|
12
|
+
homepage = self.get_page(url)
|
13
|
+
reuters = Network.create_with_url("REUTERS", url)
|
14
|
+
reuters.home_html = homepage
|
15
|
+
self.scrape_reuters_articles.each{|article| article = Article.create_with_url(article[0],"REUTERS", article[1])}
|
16
|
+
|
17
|
+
end
|
18
|
+
|
19
|
+
|
20
|
+
def self.scrape_reuters_articles
|
21
|
+
|
22
|
+
html = Network.find_by_name("REUTERS").home_html
|
23
|
+
leader = [html.css("section.right-now-module h2.story-title a").text, html.css("section.right-now-module h2.story-title a").attribute("href").value]
|
24
|
+
second = [html.css("section#hp-top-news-top article.story div.story-content a h3.story-title").first.text.strip, html.css("section#hp-top-news-top article.story div.story-content a").first.attribute("href").value]
|
25
|
+
third = [html.css("section#hp-top-news-top article.story div.story-content a h3.story-title")[1].text.strip, html.css("section#hp-top-news-top article.story div.story-content a")[1].attribute("href").value]
|
26
|
+
articles = [leader, second, third]
|
27
|
+
|
28
|
+
self.check_reuters_urls(articles)
|
29
|
+
|
30
|
+
articles
|
31
|
+
|
32
|
+
end
|
33
|
+
|
34
|
+
|
35
|
+
def self.check_reuters_urls(articles)
|
36
|
+
#checks for and corrects common issue where Reuters uses partial urls for internal links
|
37
|
+
|
38
|
+
articles.each do |article|
|
39
|
+
if !article[1].include?("www")
|
40
|
+
article[1] = "https://www.reuters.com" + article[1]
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
def self.reuters_article(article)
|
46
|
+
|
47
|
+
article.html = self.get_page(article.url)
|
48
|
+
article.summary = article.html.css("meta[name='description']").attribute("content").value
|
49
|
+
|
50
|
+
article.date = article.html.css("meta[name='REVISION_DATE']").attribute("content").value
|
51
|
+
|
52
|
+
# article.authors = article.html.css("meta[name='Author']").attribute("content").value
|
53
|
+
|
54
|
+
end
|
55
|
+
|
56
|
+
|
57
|
+
|
58
|
+
|
59
|
+
#<<<<<<<<<<<<<<<<<<FOX SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
|
60
|
+
|
61
|
+
def self.fox_homepage
|
62
|
+
url = "http://www.foxnews.com"
|
63
|
+
homepage = self.get_page(url)
|
64
|
+
fox = Network.create_with_url("FOX NEWS", url)
|
65
|
+
fox.home_html = homepage
|
66
|
+
self.scrape_fox_articles.each{|article| article = Article.create_with_url(article[0],"FOX NEWS", article[1])}
|
67
|
+
|
68
|
+
end
|
69
|
+
|
70
|
+
def self.scrape_fox_articles
|
71
|
+
|
72
|
+
html = Network.find_by_name("FOX NEWS").home_html
|
73
|
+
|
74
|
+
leader = [html.css("div.primary h1 a").text, html.css("div.primary h1 a").attribute("href").value]
|
75
|
+
second = [html.css("div.top-stories a h3").first.text, html.css("div.top-stories li").first.css("a").attribute("href").value]
|
76
|
+
|
77
|
+
third = [html.css("div.top-stories a h3")[1].text, html.css("div.top-stories li[data-vr-contentbox = ''] a")[4].attribute("href").value]
|
78
|
+
|
79
|
+
articles = [leader, second, third]
|
80
|
+
|
81
|
+
end
|
82
|
+
|
83
|
+
def self.fox_article(article)
|
84
|
+
article.html = self.get_page(article.url)
|
85
|
+
article.summary = article.html.css("meta[name='description']").attribute("content").value
|
86
|
+
|
87
|
+
article.date = article.html.css("meta[name='dc.date']").attribute("content").value
|
88
|
+
|
89
|
+
end
|
90
|
+
|
91
|
+
|
92
|
+
#<<<<<<<<<<<<<<<MSNBC SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
|
93
|
+
|
94
|
+
def self.msnbc_homepage
|
95
|
+
url = "http://www.msnbc.com"
|
96
|
+
homepage = self.get_page(url)
|
97
|
+
msnbc = Network.create_with_url("MSNBC", url)
|
98
|
+
msnbc.home_html = homepage
|
99
|
+
self.scrape_msnbc_articles.each{|article| article = Article.create_with_url(article[0],"MSNBC", article[1])}
|
100
|
+
|
101
|
+
end
|
102
|
+
|
103
|
+
def self.scrape_msnbc_articles
|
104
|
+
|
105
|
+
html = Network.find_by_name("MSNBC").home_html
|
106
|
+
leader = [html.css("a[data-fragment = '#homepage-item-1'] span.featured-slider-menu__item__link__title").text, html.css("a[data-fragment = '#homepage-item-1']").attribute("href").value]
|
107
|
+
second = [html.css("a[data-fragment = '#homepage-item-2'] span.featured-slider-menu__item__link__title").text, html.css("a[data-fragment = '#homepage-item-2']").attribute("href").value]
|
108
|
+
third = [html.css("a[data-fragment = '#homepage-item-3'] span.featured-slider-menu__item__link__title").text, html.css("a[data-fragment = '#homepage-item-3']").attribute("href").value]
|
109
|
+
|
110
|
+
articles = [leader, second, third]
|
111
|
+
self.check_msnbc_urls(articles)
|
112
|
+
|
113
|
+
articles
|
114
|
+
end
|
115
|
+
|
116
|
+
def self.check_msnbc_urls(articles)
|
117
|
+
#checks for and corrects common issue where MSNBC uses partial urls for internal links
|
118
|
+
|
119
|
+
articles.each do |article|
|
120
|
+
if !article[1].include?("www")
|
121
|
+
article[1] = "http://www.msnbc.com" + article[1]
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
def self.msnbc_article(article)
|
127
|
+
|
128
|
+
article.html = self.get_page(article.url)
|
129
|
+
article.summary = article.html.css("meta[name='description']").attribute("content").value
|
130
|
+
|
131
|
+
if !!article.html.css("meta[property='nv:date']")[0]
|
132
|
+
article.date = article.html.css("meta[property='nv:date']").attribute("content").value
|
133
|
+
else
|
134
|
+
article.date = article.html.css("meta[name = 'DC.date.issued']").attribute("content").value
|
135
|
+
end
|
136
|
+
|
137
|
+
end
|
138
|
+
|
139
|
+
|
140
|
+
end
|
data/lib/environment.rb
ADDED
data/selection
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
|
2
|
+
[1mFrom:[0m /home/jim/code/projects/CLI_Headline_Scraper/lib/CLI_Headline_Scraper/CLI.rb @ line 137 CLIHeadlineScraper::CLI#select_item:
|
3
|
+
|
4
|
+
[1;34m118[0m: [32mdef[0m [1;34mselect_item[0m [1;34m#returns an array where arr[0] is the network name and arr[1] is the article number.[0m
|
5
|
+
[1;34m119[0m: [1;34m#currently accepts all entries that do not contain a colon. Later make it so it checks whether the network entered exists.[0m
|
6
|
+
[1;34m120[0m: selection = [1;36mnil[0m
|
7
|
+
[1;34m121[0m: [32muntil[0m valid_selection?(selection)
|
8
|
+
[1;34m122[0m: puts [31m[1;31m"[0m[31mTo go to a story, type the network name and then the article number, separated by a colon (e.g., BBC : 2)[1;31m"[0m[31m[0m
|
9
|
+
[1;34m123[0m: selection = gets.strip
|
10
|
+
[1;34m124[0m:
|
11
|
+
[1;34m125[0m: selection = selection.split([31m[1;31m"[0m[31m:[1;31m"[0m[31m[0m)
|
12
|
+
[1;34m126[0m: selection[[1;34m0[0m].strip!
|
13
|
+
[1;34m127[0m: selection[[1;34m0[0m] = selection[[1;34m0[0m].upcase
|
14
|
+
[1;34m128[0m: [32mif[0m selection.length == [1;34m2[0m
|
15
|
+
[1;34m129[0m: selection[[1;34m1[0m].strip!
|
16
|
+
[1;34m130[0m: selection[[1;34m1[0m] = selection[[1;34m1[0m].to_i
|
17
|
+
[1;34m131[0m: [32mend[0m
|
18
|
+
[1;34m132[0m:
|
19
|
+
[1;34m133[0m: [32mif[0m !valid_selection?(selection)
|
20
|
+
[1;34m134[0m: puts [31m[1;31m"[0m[31mInvalid Entry[1;31m"[0m[31m[0m
|
21
|
+
[1;34m135[0m: [32mend[0m
|
22
|
+
[1;34m136[0m: [32mend[0m
|
23
|
+
=> [1;34m137[0m: binding.pry
|
24
|
+
[1;34m138[0m: selection
|
25
|
+
[1;34m139[0m: [32mend[0m
|
26
|
+
|
data/spec.md
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Specifications for the CLI Assessment
|
2
|
+
|
3
|
+
Specs:
|
4
|
+
- [x] Have a CLI for interfacing with the application
|
5
|
+
|
6
|
+
The program is launched using an executable and its processes are organized using a CLI object. It responds to command line inputs and produces cleanly-printed outputs.
|
7
|
+
|
8
|
+
- [X] Pull data from an external source
|
9
|
+
|
10
|
+
The program pulls data from the websites of three different news networks, and can be easily expanded to do so with more.
|
11
|
+
|
12
|
+
- [X] Implement both list and detail views
|
13
|
+
|
14
|
+
The program allows the user to look at lists of headlines and then select one for detail-view that includes publishing date, article summary, and gives the user the option to view it in their web browser.
|
metadata
ADDED
@@ -0,0 +1,182 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: CLI_Headline_Scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jim Stricker
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-09-05 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.15'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.15'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: pry
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0.10'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0.10'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: timecop
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0.9'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0.9'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: require_all
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '1.4'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '1.4'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: launchy
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '2.4'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '2.4'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: nokogiri
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '1.8'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '1.8'
|
125
|
+
description: This program will automatically compile a list of the top three headlines
|
126
|
+
from a wide range of online news sources. It is intended less as a way to get your
|
127
|
+
news every day, and more as an interesting tool to get a quick snapshot of how the
|
128
|
+
day's top stories are being portrayed across the media ecosystem at a given moment. For
|
129
|
+
instance, if you wanted to quickly see what is being emphasized in media outlets
|
130
|
+
consisting of different political inclinations or based in different countries,
|
131
|
+
this would allow you to do so quickly, rather than going to all of these websites'
|
132
|
+
homepages manually.
|
133
|
+
email:
|
134
|
+
- jmstricker93@gmail.com
|
135
|
+
executables: []
|
136
|
+
extensions: []
|
137
|
+
extra_rdoc_files: []
|
138
|
+
files:
|
139
|
+
- ".gitignore"
|
140
|
+
- ".rspec"
|
141
|
+
- ".travis.yml"
|
142
|
+
- CLI_Headline_Scraper.gemspec
|
143
|
+
- Gemfile
|
144
|
+
- LICENSE.txt
|
145
|
+
- README.md
|
146
|
+
- Rakefile
|
147
|
+
- bin/console
|
148
|
+
- bin/headline_scraper
|
149
|
+
- bin/setup
|
150
|
+
- lib/CLI_Headline_Scraper/Article.rb
|
151
|
+
- lib/CLI_Headline_Scraper/CLI.rb
|
152
|
+
- lib/CLI_Headline_Scraper/Network.rb
|
153
|
+
- lib/CLI_Headline_Scraper/Scraper.rb
|
154
|
+
- lib/CLI_Headline_Scraper/version.rb
|
155
|
+
- lib/environment.rb
|
156
|
+
- selection
|
157
|
+
- spec.md
|
158
|
+
homepage: https://github.com/jmstrick93/CLI_Headline_Scraper
|
159
|
+
licenses:
|
160
|
+
- MIT
|
161
|
+
metadata: {}
|
162
|
+
post_install_message:
|
163
|
+
rdoc_options: []
|
164
|
+
require_paths:
|
165
|
+
- lib
|
166
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
167
|
+
requirements:
|
168
|
+
- - ">="
|
169
|
+
- !ruby/object:Gem::Version
|
170
|
+
version: '0'
|
171
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
172
|
+
requirements:
|
173
|
+
- - ">="
|
174
|
+
- !ruby/object:Gem::Version
|
175
|
+
version: '0'
|
176
|
+
requirements: []
|
177
|
+
rubyforge_project:
|
178
|
+
rubygems_version: 2.4.8
|
179
|
+
signing_key:
|
180
|
+
specification_version: 4
|
181
|
+
summary: A headline-scraping CLI program.
|
182
|
+
test_files: []
|