top-headlines 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -2
- data/lib/top-headlines/cli.rb +2 -1
- data/lib/top-headlines/source.rb +62 -8
- data/lib/top-headlines/version.rb +1 -1
- data/top-headlines-0.1.1.gem +0 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 95addc3e40b6793ecd773848f2394e2b2903d032
|
4
|
+
data.tar.gz: 63e758e38c0b5d7017bbe6ae76f8fb0621ed4d41
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cd46edc49f87d1b58a3cda6e4416ce6f15c56e387260ad2d2ce1d24e3de0c7452ecfda7bd61c2a3521ad6e0d3136c833c13a30951bc876a2f7052d5dab4ef5cc
|
7
|
+
data.tar.gz: e487345aa1328e64da21b8ac2d2bbb994a502ed7574aeba167dbe84dbde072126e305506f60c1572951ccff9ddfafa0b6ee65e40844b4abec0d6a733ff54240e
|
data/README.md
CHANGED
@@ -20,13 +20,14 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
-
In lib/top-headlines/source.rb, users will find a SOURCES hash. Users may add their own favorite news sources to the hash, so that
|
24
|
-
|
23
|
+
In lib/top-headlines/source.rb, users will find a SOURCES hash. Users may add their own favorite news sources to the hash, so that the gem dynamically scrapes even more headlines.
|
24
|
+
```
|
25
25
|
If you'd like to view how I made the gem, here's a set of videos that captures almost all of it:
|
26
26
|
1) https://drive.google.com/file/d/0B-xsMiWmDyyzcGk3MmlTc0xQOXM/view?usp=sharing
|
27
27
|
2) https://drive.google.com/file/d/0B-xsMiWmDyyzNDFyS01icFMtams/view?usp=sharing
|
28
28
|
3) https://drive.google.com/file/d/0B-xsMiWmDyyzU0VGNGJ5QkpaOUU/view?usp=sharing
|
29
29
|
4) https://drive.google.com/file/d/0B-xsMiWmDyyzbEdzX0ZlOVcwM2M/view?usp=sharing
|
30
|
+
```
|
30
31
|
|
31
32
|
## Development
|
32
33
|
|
data/lib/top-headlines/cli.rb
CHANGED
@@ -99,8 +99,9 @@ class TopHeadlines::CLI
|
|
99
99
|
puts "\nSelect another headline number to open full article in the browser."
|
100
100
|
print "YOUR SELECTION: "
|
101
101
|
@num = gets.strip.upcase
|
102
|
+
@input = @num if @num == "EXIT"
|
102
103
|
end
|
103
|
-
invalid_entry
|
104
|
+
invalid_entry unless @input == "EXIT"
|
104
105
|
end
|
105
106
|
|
106
107
|
def invalid_entry
|
data/lib/top-headlines/source.rb
CHANGED
@@ -4,28 +4,81 @@ class TopHeadlines::Source
|
|
4
4
|
"CNN" => {
|
5
5
|
url: "http://www.cnn.com/",
|
6
6
|
headlines_selector: "div.column.zn__column--idx-1 span.cd__headline-text",
|
7
|
-
urls_selector: "div.column.zn__column--idx-1"
|
7
|
+
urls_selector: "div.column.zn__column--idx-1",
|
8
|
+
child_selector: "a"
|
8
9
|
},
|
9
10
|
"MSNBC" => {
|
10
11
|
url: "http://www.msnbc.com/",
|
11
12
|
headlines_selector: "span.featured-slider-menu__item__link__title",
|
12
|
-
urls_selector: "ul.featured-slider-menu"
|
13
|
+
urls_selector: "ul.featured-slider-menu",
|
14
|
+
child_selector: "a"
|
13
15
|
},
|
14
16
|
"FOX" => {
|
15
17
|
url: "http://www.foxnews.com/",
|
16
18
|
headlines_selector: "section#trending li a",
|
17
|
-
urls_selector: "section#trending li"
|
19
|
+
urls_selector: "section#trending li",
|
20
|
+
child_selector: "a"
|
18
21
|
},
|
19
22
|
"NYTIMES" => {
|
20
23
|
url: "http://www.nytimes.com/",
|
21
24
|
headlines_selector: "section#top-news h2.story-heading a",
|
22
|
-
urls_selector: "section#top-news h2.story-heading"
|
25
|
+
urls_selector: "section#top-news h2.story-heading",
|
26
|
+
child_selector: "a"
|
23
27
|
},
|
24
28
|
"BLOOMBERG" => {
|
25
29
|
url: "http://www.bloomberg.com/",
|
26
30
|
headlines_selector: "section.top-news-v3 h1 a",
|
27
|
-
urls_selector: "section.top-news-v3 h1"
|
28
|
-
|
31
|
+
urls_selector: "section.top-news-v3 h1",
|
32
|
+
child_selector: "a"
|
33
|
+
},
|
34
|
+
"GUARDIAN" => {
|
35
|
+
url: "http://www.theguardian.com/us",
|
36
|
+
headlines_selector: "section#headlines div.fc-container__inner div.fc-item__container a.u-faux-block-link__overlay.js-headline-text",
|
37
|
+
urls_selector: "section#headlines div.fc-container__inner div.fc-item__container",
|
38
|
+
child_selector: "a.u-faux-block-link__overlay.js-headline-text"
|
39
|
+
},
|
40
|
+
"HUFF POST" => {
|
41
|
+
url: "http://www.huffingtonpost.com/",
|
42
|
+
headlines_selector: "div#center_entries_container h2 a",
|
43
|
+
urls_selector: "div#center_entries_container h2",
|
44
|
+
child_selector: "a"
|
45
|
+
},
|
46
|
+
"FORBES" => {
|
47
|
+
url: "http://www.forbes.com/",
|
48
|
+
headlines_selector: "h4",
|
49
|
+
urls_selector: "h4",
|
50
|
+
child_selector: "a"
|
51
|
+
},
|
52
|
+
"WSJ" => {
|
53
|
+
url: "http://www.wsj.com/",
|
54
|
+
headlines_selector: "a.wsj-headline-link",
|
55
|
+
urls_selector: "div.cb-col",
|
56
|
+
child_selector: "a.wsj-headline-link"
|
57
|
+
},
|
58
|
+
# "REDDIT" => { ## 429 Error
|
59
|
+
# url: "https://www.reddit.com/r/news/",
|
60
|
+
# headlines_selector: "p.title a.title.may-blank",
|
61
|
+
# urls_selector: "p.title",
|
62
|
+
# child_selector: "a"
|
63
|
+
# },
|
64
|
+
# "BBC" => {
|
65
|
+
# url: "http://www.bbc.com/news",
|
66
|
+
# headlines_selector: "div.column--primary span.title-link__title-text",
|
67
|
+
# urls_selector: "div.column--primary",
|
68
|
+
# child_selector: "a.title-link" ## NEEDS WORK returns e.g. /news/world-middle-east-36180184
|
69
|
+
# },
|
70
|
+
# "CBS" => {
|
71
|
+
# url: "http://www.cbsnews.com/",
|
72
|
+
# headlines_selector: "div.col-5.nocontent h3.title",
|
73
|
+
# urls_selector: "div.col-5.nocontent",
|
74
|
+
# child_selector: "a" ## NEEDS WORK – only select a child of parent h3.title
|
75
|
+
# },
|
76
|
+
# "YAHOO" => {
|
77
|
+
# url: "https://www.yahoo.com/news/",
|
78
|
+
# headlines_selector: "div#mrt-node-Col1-1-WideHero h3",
|
79
|
+
# urls_selector: "div#mrt-node-Col1-1-WideHero",
|
80
|
+
# child_selector: "a" ## NEEDS WORK – only select a child of parent h3
|
81
|
+
# },
|
29
82
|
}
|
30
83
|
|
31
84
|
def self.all
|
@@ -46,15 +99,16 @@ class TopHeadlines::Source
|
|
46
99
|
headlines_selector = source[:headlines_selector]
|
47
100
|
|
48
101
|
doc = Nokogiri::HTML(open(page_url))
|
49
|
-
headlines = doc.css(headlines_selector).map {|headline| headline.text}
|
102
|
+
headlines = doc.css(headlines_selector).map {|headline| headline.text.gsub("â", "'").gsub(/\n/,"").gsub(/\t/,"").strip}
|
50
103
|
end
|
51
104
|
|
52
105
|
def self.scrape_urls(source)
|
53
106
|
source = SOURCES[source]
|
54
107
|
page_url = source[:url]
|
55
108
|
urls_selector = source[:urls_selector]
|
109
|
+
child_selector = source[:child_selector]
|
56
110
|
|
57
111
|
doc = Nokogiri::HTML(open(page_url))
|
58
|
-
urls = doc.css(urls_selector).children.css(
|
112
|
+
urls = doc.css(urls_selector).children.css(child_selector).map {|url| url.attribute('href').value[0] == 'h' ? url.attribute('href').value : page_url + url.attribute('href').value}
|
59
113
|
end
|
60
114
|
end
|
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: top-headlines
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- zachnewburgh
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-05-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -104,6 +104,7 @@ files:
|
|
104
104
|
- lib/top-headlines/source.rb
|
105
105
|
- lib/top-headlines/version.rb
|
106
106
|
- top-headlines-0.1.0.gem
|
107
|
+
- top-headlines-0.1.1.gem
|
107
108
|
- top-headlines.gemspec
|
108
109
|
homepage: https://github.com/zachnewburgh/top-headlines-cli-gem
|
109
110
|
licenses:
|