CLI_Headline_Scraper 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLI_Headline_Scraper-0.1.6.gem +0 -0
- data/bin/console +0 -6
- data/bin/headline_scraper +2 -10
- data/lib/CLI_Headline_Scraper/Article.rb +11 -3
- data/lib/CLI_Headline_Scraper/CLI.rb +9 -11
- data/lib/CLI_Headline_Scraper/Scraper.rb +9 -5
- data/lib/CLI_Headline_Scraper/version.rb +1 -1
- data/lib/environment.rb +0 -1
- data/refactor_notes.txt +4 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4575df135e79c6ac587405d19faf26ae4bd0ba62
|
4
|
+
data.tar.gz: e249f38021d93f7e95d454b614bed0ae99cd5bd8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e1e0a9f6e8612f3073561bae8e2fcb45235b78af5eff98299c8c2865bfda2f1982b9f3b047e3035f47184a0d001b6e8cf5b46ef37e6b01a80fa11afbe9fd14f5
|
7
|
+
data.tar.gz: f16967e39c2a8020912c1b94adc75cec15c866714a6e420f02a414c6aea277c3867c33e494186fc84b2c3171177d0f10ed89316ea44fab37b354c9efcaceffdb
|
Binary file
|
data/bin/console
CHANGED
data/bin/headline_scraper
CHANGED
@@ -1,13 +1,5 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
|
4
|
-
###testing conditions###
|
5
|
-
|
6
|
-
|
7
|
-
###end testing conditions###
|
8
|
-
|
9
|
-
Scraper.msnbc_homepage
|
10
|
-
Scraper.fox_homepage
|
11
|
-
Scraper.reuters_homepage
|
2
|
+
puts "starting"
|
3
|
+
require_relative '../lib/environment'
|
12
4
|
|
13
5
|
CLI.new.call
|
@@ -32,15 +32,23 @@ class Article
|
|
32
32
|
self.all.select{|item| item.network_name == network_name}
|
33
33
|
end
|
34
34
|
|
35
|
+
def self.find_by_summary(word)
|
36
|
+
|
37
|
+
#cycle through all articles.
|
38
|
+
#look at each article's summary
|
39
|
+
#if summary contains word, add summary to a new array.
|
40
|
+
#after finished with all articles, display array.
|
41
|
+
self.all.select { |article| article.summary.downcase.include?(word.downcase) }
|
42
|
+
|
43
|
+
|
44
|
+
end
|
45
|
+
|
35
46
|
|
36
47
|
def populate_metadata()
|
37
48
|
#retrieves metadata of reuters article -- right now just time/date.
|
38
49
|
#1. Scrapes data from the selected article's url.(separate)
|
39
50
|
#3. Uses that data to populate article.authors, article.date_posted, article.text.
|
40
|
-
|
41
51
|
Scraper.reuters_article(self)
|
42
|
-
|
43
|
-
|
44
52
|
article = Article.find_by_headline(headline)
|
45
53
|
|
46
54
|
end
|
@@ -1,4 +1,6 @@
|
|
1
1
|
#Our CLI Controller
|
2
|
+
|
3
|
+
require 'pry'
|
2
4
|
class CLI
|
3
5
|
|
4
6
|
attr_reader :time
|
@@ -8,6 +10,12 @@ class CLI
|
|
8
10
|
end
|
9
11
|
|
10
12
|
def call
|
13
|
+
puts "Initializing..."
|
14
|
+
Scraper.msnbc_homepage
|
15
|
+
Scraper.fox_homepage
|
16
|
+
Scraper.reuters_homepage
|
17
|
+
puts("done")
|
18
|
+
puts("")
|
11
19
|
|
12
20
|
self.greet
|
13
21
|
self.display_menu #initial menu selection of what you want to see
|
@@ -19,7 +27,7 @@ class CLI
|
|
19
27
|
puts "Welcome to Headline Scraper"
|
20
28
|
sleep(1)
|
21
29
|
puts "Please select which of the following articles you would like to view:"
|
22
|
-
sleep(1.5)
|
30
|
+
sleep(1.5)
|
23
31
|
puts ""
|
24
32
|
end
|
25
33
|
|
@@ -116,8 +124,6 @@ class CLI
|
|
116
124
|
the_article = the_network.articles[selection[1]-1]
|
117
125
|
self.article_options_menu(the_article)
|
118
126
|
end
|
119
|
-
|
120
|
-
|
121
127
|
end
|
122
128
|
|
123
129
|
def selection_exists?(selection) #post-screens entries to make sure the valid entry actually refers to an existing item
|
@@ -144,8 +150,6 @@ class CLI
|
|
144
150
|
end
|
145
151
|
end
|
146
152
|
|
147
|
-
|
148
|
-
|
149
153
|
def select_scrape_method(article)
|
150
154
|
|
151
155
|
case article.network_name
|
@@ -159,11 +163,6 @@ class CLI
|
|
159
163
|
end
|
160
164
|
end
|
161
165
|
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
166
|
def article_options_menu(article)
|
168
167
|
#takes article object as an argument
|
169
168
|
#automatically displays article headline, network name, and article metadata (i.e. author, date & time posted, number of comments, tags etc.)
|
@@ -186,7 +185,6 @@ class CLI
|
|
186
185
|
puts "1. View article in browser."
|
187
186
|
puts "2. Return to previous menu."
|
188
187
|
puts "Or type 'exit'."
|
189
|
-
|
190
188
|
input = gets.strip.upcase
|
191
189
|
case input
|
192
190
|
when "1"
|
@@ -7,13 +7,14 @@ class Scraper
|
|
7
7
|
#<<<<<<<<<<<<<<<<<<REUTERS SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
|
8
8
|
|
9
9
|
def self.reuters_homepage
|
10
|
-
|
10
|
+
puts "scraping Reuters homepage"
|
11
11
|
url = "https://www.reuters.com"
|
12
12
|
homepage = self.get_page(url)
|
13
13
|
reuters = Network.create_with_url("REUTERS", url)
|
14
14
|
reuters.home_html = homepage
|
15
15
|
self.scrape_reuters_articles.each{|article| article = Article.create_with_url(article[0],"REUTERS", article[1])}
|
16
16
|
|
17
|
+
|
17
18
|
end
|
18
19
|
|
19
20
|
|
@@ -33,7 +34,7 @@ end
|
|
33
34
|
|
34
35
|
|
35
36
|
def self.check_reuters_urls(articles)
|
36
|
-
#checks for and corrects common issue where
|
37
|
+
#checks for and corrects common issue where a website uses partial urls for internal links
|
37
38
|
|
38
39
|
articles.each do |article|
|
39
40
|
if !article[1].include?("www")
|
@@ -59,6 +60,7 @@ end
|
|
59
60
|
#<<<<<<<<<<<<<<<<<<FOX SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
|
60
61
|
|
61
62
|
def self.fox_homepage
|
63
|
+
puts "scraping Fox homepage"
|
62
64
|
url = "http://www.foxnews.com"
|
63
65
|
homepage = self.get_page(url)
|
64
66
|
fox = Network.create_with_url("FOX NEWS", url)
|
@@ -70,16 +72,17 @@ end
|
|
70
72
|
def self.scrape_fox_articles
|
71
73
|
|
72
74
|
html = Network.find_by_name("FOX NEWS").home_html
|
75
|
+
leader = [html.css("div.collection.collection-spotlight article.article.story-1 header a").text.strip, html.css("div.collection.collection-spotlight article.article.story-1 header a").attribute("href")]
|
73
76
|
|
74
|
-
|
75
|
-
second = [html.css("div.top-stories a h3").first.text, html.css("div.top-stories li").first.css("a").attribute("href").value]
|
77
|
+
second = [html.css("div.main.main-secondary article.article.story-1 h2.title a").text, html.css("div.main.main-secondary article.article.story-1 h2.title a").attribute("href").value]
|
76
78
|
|
77
|
-
|
79
|
+
third = [html.css("div.main.main-secondary article.article.story-2 h2.title a").text, html.css("div.main.main-secondary article.article.story-2 h2.title a").attribute("href").value]
|
78
80
|
|
79
81
|
articles = [leader, second, third]
|
80
82
|
|
81
83
|
end
|
82
84
|
|
85
|
+
|
83
86
|
def self.fox_article(article)
|
84
87
|
article.html = self.get_page(article.url)
|
85
88
|
article.summary = article.html.css("meta[name='description']").attribute("content").value
|
@@ -92,6 +95,7 @@ end
|
|
92
95
|
#<<<<<<<<<<<<<<<MSNBC SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
|
93
96
|
|
94
97
|
def self.msnbc_homepage
|
98
|
+
puts "scraping MSNBC homepage"
|
95
99
|
url = "http://www.msnbc.com"
|
96
100
|
homepage = self.get_page(url)
|
97
101
|
msnbc = Network.create_with_url("MSNBC", url)
|
data/lib/environment.rb
CHANGED
data/refactor_notes.txt
ADDED
@@ -0,0 +1,4 @@
|
|
1
|
+
1. Make separate scraper classes for each publication (supported by a scraper module)
|
2
|
+
2. Make it so that each Object only knows about a max of one other object.
|
3
|
+
3. Eliminate magic numbers.
|
4
|
+
4. Don't use 'singleton' classes. Make a new instance of each scraper object (one for each website) and store things like its html in instance variables like @doc.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: CLI_Headline_Scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jim Stricker
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -126,6 +126,7 @@ files:
|
|
126
126
|
- ".gitignore"
|
127
127
|
- ".rspec"
|
128
128
|
- ".travis.yml"
|
129
|
+
- CLI_Headline_Scraper-0.1.6.gem
|
129
130
|
- CLI_Headline_Scraper.gemspec
|
130
131
|
- Gemfile
|
131
132
|
- LICENSE.txt
|
@@ -140,6 +141,7 @@ files:
|
|
140
141
|
- lib/CLI_Headline_Scraper/Scraper.rb
|
141
142
|
- lib/CLI_Headline_Scraper/version.rb
|
142
143
|
- lib/environment.rb
|
144
|
+
- refactor_notes.txt
|
143
145
|
- selection
|
144
146
|
- spec.md
|
145
147
|
homepage: https://github.com/jmstrick93/CLI_Headline_Scraper
|