CLI_Headline_Scraper 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c531dc5cb0962a2a016687894ad6212e6b7bf34b
4
- data.tar.gz: 14e4d1939afadf7e741ecad229e6710b55d809d4
3
+ metadata.gz: 4575df135e79c6ac587405d19faf26ae4bd0ba62
4
+ data.tar.gz: e249f38021d93f7e95d454b614bed0ae99cd5bd8
5
5
  SHA512:
6
- metadata.gz: 91ea48e86a9152864abcf1f187c76eb034a9a87464dfc23002b6b774b904a9fe7fcee952ae58145c38c289609e040e541ee89f7efda50eb184d26c9b74690360
7
- data.tar.gz: bb58536699d79b58ffb88fa06a63d43aab7392cb6edba027ffe0f86eefc0c9b354a778515069e983cd4f175e45d72f20f033bcf94825801abdf9aedc25f1b44b
6
+ metadata.gz: e1e0a9f6e8612f3073561bae8e2fcb45235b78af5eff98299c8c2865bfda2f1982b9f3b047e3035f47184a0d001b6e8cf5b46ef37e6b01a80fa11afbe9fd14f5
7
+ data.tar.gz: f16967e39c2a8020912c1b94adc75cec15c866714a6e420f02a414c6aea277c3867c33e494186fc84b2c3171177d0f10ed89316ea44fab37b354c9efcaceffdb
Binary file
data/bin/console CHANGED
@@ -11,10 +11,4 @@ require "environment"
11
11
  # Pry.start
12
12
  require "irb"
13
13
 
14
- ###Testing Conditions###
15
-
16
-
17
-
18
-
19
- ###end testing conditions###
20
14
  IRB.start(__FILE__)
data/bin/headline_scraper CHANGED
@@ -1,13 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
- require 'environment'
3
-
4
- ###testing conditions###
5
-
6
-
7
- ###end testing conditions###
8
-
9
- Scraper.msnbc_homepage
10
- Scraper.fox_homepage
11
- Scraper.reuters_homepage
2
+ puts "starting"
3
+ require_relative '../lib/environment'
12
4
 
13
5
  CLI.new.call
@@ -32,15 +32,23 @@ class Article
32
32
  self.all.select{|item| item.network_name == network_name}
33
33
  end
34
34
 
35
+ def self.find_by_summary(word)
36
+
37
+ #cycle through all articles.
38
+ #look at each article's summary
39
+ #if summary contains word, add summary to a new array.
40
+ #after finished with all articles, display array.
41
+ self.all.select { |article| article.summary.downcase.include?(word.downcase) }
42
+
43
+
44
+ end
45
+
35
46
 
36
47
  def populate_metadata()
37
48
  #retreives metadata of reuters article -- right now just time/date.
38
49
  #1. Scrapes data from the selected article's url.(separate)
39
50
  #3. Uses that data to populate article.authors, article.date_posted, article.text.
40
-
41
51
  Scraper.reuters_article(self)
42
-
43
-
44
52
  article = Article.find_by_headline(headline)
45
53
 
46
54
  end
@@ -1,4 +1,6 @@
1
1
  #Our CLI Controller
2
+
3
+ require 'pry'
2
4
  class CLI
3
5
 
4
6
  attr_reader :time
@@ -8,6 +10,12 @@ class CLI
8
10
  end
9
11
 
10
12
  def call
13
+ puts "Initializing..."
14
+ Scraper.msnbc_homepage
15
+ Scraper.fox_homepage
16
+ Scraper.reuters_homepage
17
+ puts("done")
18
+ puts("")
11
19
 
12
20
  self.greet
13
21
  self.display_menu #initial menu selection of what you want to see
@@ -19,7 +27,7 @@ class CLI
19
27
  puts "Welcome to Headline Scraper"
20
28
  sleep(1)
21
29
  puts "Please select which of the following articles you would like to view:"
22
- sleep(1.5)
30
+ sleep(1.5)
23
31
  puts ""
24
32
  end
25
33
 
@@ -116,8 +124,6 @@ class CLI
116
124
  the_article = the_network.articles[selection[1]-1]
117
125
  self.article_options_menu(the_article)
118
126
  end
119
-
120
-
121
127
  end
122
128
 
123
129
  def selection_exists?(selection) #post-screens entries to make sure the valid entry actually refers to an existing item
@@ -144,8 +150,6 @@ class CLI
144
150
  end
145
151
  end
146
152
 
147
-
148
-
149
153
  def select_scrape_method(article)
150
154
 
151
155
  case article.network_name
@@ -159,11 +163,6 @@ class CLI
159
163
  end
160
164
  end
161
165
 
162
-
163
-
164
-
165
-
166
-
167
166
  def article_options_menu(article)
168
167
  #takes article object as an argument
169
168
  #automatically displays article headline, network name, and article metadata (i.e. author, date & time posted, number of comments, tags etc.)
@@ -186,7 +185,6 @@ class CLI
186
185
  puts "1. View article in browser."
187
186
  puts "2. Return to previous menu."
188
187
  puts "Or type 'exit'."
189
-
190
188
  input = gets.strip.upcase
191
189
  case input
192
190
  when "1"
@@ -7,13 +7,14 @@ class Scraper
7
7
  #<<<<<<<<<<<<<<<<<<REUTERS SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
8
8
 
9
9
  def self.reuters_homepage
10
-
10
+ puts "scraping Reuters homepage"
11
11
  url = "https://www.reuters.com"
12
12
  homepage = self.get_page(url)
13
13
  reuters = Network.create_with_url("REUTERS", url)
14
14
  reuters.home_html = homepage
15
15
  self.scrape_reuters_articles.each{|article| article = Article.create_with_url(article[0],"REUTERS", article[1])}
16
16
 
17
+
17
18
  end
18
19
 
19
20
 
@@ -33,7 +34,7 @@ end
33
34
 
34
35
 
35
36
  def self.check_reuters_urls(articles)
36
- #checks for and corrects common issue where MSNBC uses partial urls for internal links
37
+ #checks for and corrects common issue where a website uses partial urls for internal links
37
38
 
38
39
  articles.each do |article|
39
40
  if !article[1].include?("www")
@@ -59,6 +60,7 @@ end
59
60
  #<<<<<<<<<<<<<<<<<<FOX SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
60
61
 
61
62
  def self.fox_homepage
63
+ puts "scraping Fox homepage"
62
64
  url = "http://www.foxnews.com"
63
65
  homepage = self.get_page(url)
64
66
  fox = Network.create_with_url("FOX NEWS", url)
@@ -70,16 +72,17 @@ end
70
72
  def self.scrape_fox_articles
71
73
 
72
74
  html = Network.find_by_name("FOX NEWS").home_html
75
+ leader = [html.css("div.collection.collection-spotlight article.article.story-1 header a").text.strip, html.css("div.collection.collection-spotlight article.article.story-1 header a").attribute("href")]
73
76
 
74
- leader = [html.css("div.primary h1 a").text, html.css("div.primary h1 a").attribute("href").value]
75
- second = [html.css("div.top-stories a h3").first.text, html.css("div.top-stories li").first.css("a").attribute("href").value]
77
+ second = [html.css("div.main.main-secondary article.article.story-1 h2.title a").text, html.css("div.main.main-secondary article.article.story-1 h2.title a").attribute("href").value]
76
78
 
77
- third = [html.css("div.top-stories a h3")[1].text, html.css("div.top-stories li[data-vr-contentbox = ''] a")[4].attribute("href").value]
79
+ third = [html.css("div.main.main-secondary article.article.story-2 h2.title a").text, html.css("div.main.main-secondary article.article.story-2 h2.title a").attribute("href").value]
78
80
 
79
81
  articles = [leader, second, third]
80
82
 
81
83
  end
82
84
 
85
+
83
86
  def self.fox_article(article)
84
87
  article.html = self.get_page(article.url)
85
88
  article.summary = article.html.css("meta[name='description']").attribute("content").value
@@ -92,6 +95,7 @@ end
92
95
  #<<<<<<<<<<<<<<<MSNBC SCRAPING METHODS>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
93
96
 
94
97
  def self.msnbc_homepage
98
+ puts "scraping MSNBC homepage"
95
99
  url = "http://www.msnbc.com"
96
100
  homepage = self.get_page(url)
97
101
  msnbc = Network.create_with_url("MSNBC", url)
@@ -1,3 +1,3 @@
1
1
  module CLIHeadlineScraper
2
- VERSION = "0.1.6"
2
+ VERSION = "0.1.7"
3
3
  end
data/lib/environment.rb CHANGED
@@ -1,4 +1,3 @@
1
- require 'require_all'
2
1
  require 'launchy'
3
2
  require 'open-uri'
4
3
  require 'nokogiri'
@@ -0,0 +1,4 @@
1
+ 1. Make separate scraper classes for each publication (supported by a scraper module)
2
+ 2. Make it so that each Object only knows about a max of one other object.
3
+ 3. Eliminate magic numbers.
4
+ 4. Don't use 'singleton' classes. Make a new instance of each scraper object(one for each website) and store things like its html in instance variables like @doc.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: CLI_Headline_Scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jim Stricker
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-09-06 00:00:00.000000000 Z
11
+ date: 2017-11-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -126,6 +126,7 @@ files:
126
126
  - ".gitignore"
127
127
  - ".rspec"
128
128
  - ".travis.yml"
129
+ - CLI_Headline_Scraper-0.1.6.gem
129
130
  - CLI_Headline_Scraper.gemspec
130
131
  - Gemfile
131
132
  - LICENSE.txt
@@ -140,6 +141,7 @@ files:
140
141
  - lib/CLI_Headline_Scraper/Scraper.rb
141
142
  - lib/CLI_Headline_Scraper/version.rb
142
143
  - lib/environment.rb
144
+ - refactor_notes.txt
143
145
  - selection
144
146
  - spec.md
145
147
  homepage: https://github.com/jmstrick93/CLI_Headline_Scraper