top-headlines 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3 -2
- data/lib/top-headlines/cli.rb +2 -1
- data/lib/top-headlines/source.rb +62 -8
- data/lib/top-headlines/version.rb +1 -1
- data/top-headlines-0.1.1.gem +0 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 95addc3e40b6793ecd773848f2394e2b2903d032
|
4
|
+
data.tar.gz: 63e758e38c0b5d7017bbe6ae76f8fb0621ed4d41
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cd46edc49f87d1b58a3cda6e4416ce6f15c56e387260ad2d2ce1d24e3de0c7452ecfda7bd61c2a3521ad6e0d3136c833c13a30951bc876a2f7052d5dab4ef5cc
|
7
|
+
data.tar.gz: e487345aa1328e64da21b8ac2d2bbb994a502ed7574aeba167dbe84dbde072126e305506f60c1572951ccff9ddfafa0b6ee65e40844b4abec0d6a733ff54240e
|
data/README.md
CHANGED
@@ -20,13 +20,14 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
-
In lib/top-headlines/source.rb, users will find a SOURCES hash. Users may add their own favorite news sources to the hash, so that
|
24
|
-
|
23
|
+
In lib/top-headlines/source.rb, users will find a SOURCES hash. Users may add their own favorite news sources to the hash, so that the gem dynamically scrapes even more headlines.
|
24
|
+
```
|
25
25
|
If you'd like to view how I made the gem, here's a set of videos that captures almost all of it:
|
26
26
|
1) https://drive.google.com/file/d/0B-xsMiWmDyyzcGk3MmlTc0xQOXM/view?usp=sharing
|
27
27
|
2) https://drive.google.com/file/d/0B-xsMiWmDyyzNDFyS01icFMtams/view?usp=sharing
|
28
28
|
3) https://drive.google.com/file/d/0B-xsMiWmDyyzU0VGNGJ5QkpaOUU/view?usp=sharing
|
29
29
|
4) https://drive.google.com/file/d/0B-xsMiWmDyyzbEdzX0ZlOVcwM2M/view?usp=sharing
|
30
|
+
```
|
30
31
|
|
31
32
|
## Development
|
32
33
|
|
data/lib/top-headlines/cli.rb
CHANGED
@@ -99,8 +99,9 @@ class TopHeadlines::CLI
|
|
99
99
|
puts "\nSelect another headline number to open full article in the browser."
|
100
100
|
print "YOUR SELECTION: "
|
101
101
|
@num = gets.strip.upcase
|
102
|
+
@input = @num if @num == "EXIT"
|
102
103
|
end
|
103
|
-
invalid_entry
|
104
|
+
invalid_entry unless @input == "EXIT"
|
104
105
|
end
|
105
106
|
|
106
107
|
def invalid_entry
|
data/lib/top-headlines/source.rb
CHANGED
@@ -4,28 +4,81 @@ class TopHeadlines::Source
|
|
4
4
|
"CNN" => {
|
5
5
|
url: "http://www.cnn.com/",
|
6
6
|
headlines_selector: "div.column.zn__column--idx-1 span.cd__headline-text",
|
7
|
-
urls_selector: "div.column.zn__column--idx-1"
|
7
|
+
urls_selector: "div.column.zn__column--idx-1",
|
8
|
+
child_selector: "a"
|
8
9
|
},
|
9
10
|
"MSNBC" => {
|
10
11
|
url: "http://www.msnbc.com/",
|
11
12
|
headlines_selector: "span.featured-slider-menu__item__link__title",
|
12
|
-
urls_selector: "ul.featured-slider-menu"
|
13
|
+
urls_selector: "ul.featured-slider-menu",
|
14
|
+
child_selector: "a"
|
13
15
|
},
|
14
16
|
"FOX" => {
|
15
17
|
url: "http://www.foxnews.com/",
|
16
18
|
headlines_selector: "section#trending li a",
|
17
|
-
urls_selector: "section#trending li"
|
19
|
+
urls_selector: "section#trending li",
|
20
|
+
child_selector: "a"
|
18
21
|
},
|
19
22
|
"NYTIMES" => {
|
20
23
|
url: "http://www.nytimes.com/",
|
21
24
|
headlines_selector: "section#top-news h2.story-heading a",
|
22
|
-
urls_selector: "section#top-news h2.story-heading"
|
25
|
+
urls_selector: "section#top-news h2.story-heading",
|
26
|
+
child_selector: "a"
|
23
27
|
},
|
24
28
|
"BLOOMBERG" => {
|
25
29
|
url: "http://www.bloomberg.com/",
|
26
30
|
headlines_selector: "section.top-news-v3 h1 a",
|
27
|
-
urls_selector: "section.top-news-v3 h1"
|
28
|
-
|
31
|
+
urls_selector: "section.top-news-v3 h1",
|
32
|
+
child_selector: "a"
|
33
|
+
},
|
34
|
+
"GUARDIAN" => {
|
35
|
+
url: "http://www.theguardian.com/us",
|
36
|
+
headlines_selector: "section#headlines div.fc-container__inner div.fc-item__container a.u-faux-block-link__overlay.js-headline-text",
|
37
|
+
urls_selector: "section#headlines div.fc-container__inner div.fc-item__container",
|
38
|
+
child_selector: "a.u-faux-block-link__overlay.js-headline-text"
|
39
|
+
},
|
40
|
+
"HUFF POST" => {
|
41
|
+
url: "http://www.huffingtonpost.com/",
|
42
|
+
headlines_selector: "div#center_entries_container h2 a",
|
43
|
+
urls_selector: "div#center_entries_container h2",
|
44
|
+
child_selector: "a"
|
45
|
+
},
|
46
|
+
"FORBES" => {
|
47
|
+
url: "http://www.forbes.com/",
|
48
|
+
headlines_selector: "h4",
|
49
|
+
urls_selector: "h4",
|
50
|
+
child_selector: "a"
|
51
|
+
},
|
52
|
+
"WSJ" => {
|
53
|
+
url: "http://www.wsj.com/",
|
54
|
+
headlines_selector: "a.wsj-headline-link",
|
55
|
+
urls_selector: "div.cb-col",
|
56
|
+
child_selector: "a.wsj-headline-link"
|
57
|
+
},
|
58
|
+
# "REDDIT" => { ## 429 Error
|
59
|
+
# url: "https://www.reddit.com/r/news/",
|
60
|
+
# headlines_selector: "p.title a.title.may-blank",
|
61
|
+
# urls_selector: "p.title",
|
62
|
+
# child_selector: "a"
|
63
|
+
# },
|
64
|
+
# "BBC" => {
|
65
|
+
# url: "http://www.bbc.com/news",
|
66
|
+
# headlines_selector: "div.column--primary span.title-link__title-text",
|
67
|
+
# urls_selector: "div.column--primary",
|
68
|
+
# child_selector: "a.title-link" ## NEEDS WORK returns e.g. /news/world-middle-east-36180184
|
69
|
+
# },
|
70
|
+
# "CBS" => {
|
71
|
+
# url: "http://www.cbsnews.com/",
|
72
|
+
# headlines_selector: "div.col-5.nocontent h3.title",
|
73
|
+
# urls_selector: "div.col-5.nocontent",
|
74
|
+
# child_selector: "a" ## NEEDS WORK – only select a child of parent h3.title
|
75
|
+
# },
|
76
|
+
# "YAHOO" => {
|
77
|
+
# url: "https://www.yahoo.com/news/",
|
78
|
+
# headlines_selector: "div#mrt-node-Col1-1-WideHero h3",
|
79
|
+
# urls_selector: "div#mrt-node-Col1-1-WideHero",
|
80
|
+
# child_selector: "a" ## NEEDS WORK – only select a child of parent h3
|
81
|
+
# },
|
29
82
|
}
|
30
83
|
|
31
84
|
def self.all
|
@@ -46,15 +99,16 @@ class TopHeadlines::Source
|
|
46
99
|
headlines_selector = source[:headlines_selector]
|
47
100
|
|
48
101
|
doc = Nokogiri::HTML(open(page_url))
|
49
|
-
headlines = doc.css(headlines_selector).map {|headline| headline.text}
|
102
|
+
headlines = doc.css(headlines_selector).map {|headline| headline.text.gsub("â", "'").gsub(/\n/,"").gsub(/\t/,"").strip}
|
50
103
|
end
|
51
104
|
|
52
105
|
def self.scrape_urls(source)
|
53
106
|
source = SOURCES[source]
|
54
107
|
page_url = source[:url]
|
55
108
|
urls_selector = source[:urls_selector]
|
109
|
+
child_selector = source[:child_selector]
|
56
110
|
|
57
111
|
doc = Nokogiri::HTML(open(page_url))
|
58
|
-
urls = doc.css(urls_selector).children.css(
|
112
|
+
urls = doc.css(urls_selector).children.css(child_selector).map {|url| url.attribute('href').value[0] == 'h' ? url.attribute('href').value : page_url + url.attribute('href').value}
|
59
113
|
end
|
60
114
|
end
|
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: top-headlines
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- zachnewburgh
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-05-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -104,6 +104,7 @@ files:
|
|
104
104
|
- lib/top-headlines/source.rb
|
105
105
|
- lib/top-headlines/version.rb
|
106
106
|
- top-headlines-0.1.0.gem
|
107
|
+
- top-headlines-0.1.1.gem
|
107
108
|
- top-headlines.gemspec
|
108
109
|
homepage: https://github.com/zachnewburgh/top-headlines-cli-gem
|
109
110
|
licenses:
|