newly 1.1.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -11
- data/Gemfile.lock +53 -26
- data/README.md +38 -0
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/newly/feed.rb +19 -0
- data/lib/newly/news.rb +16 -0
- data/lib/newly/news_crawler.rb +42 -0
- data/lib/newly/page_crawler.rb +51 -0
- data/lib/newly/selector.rb +17 -0
- data/lib/newly.rb +2 -34
- data/newly.gemspec +26 -30
- data/spec/html/page_spec.html +51 -0
- data/spec/newly/news_crawler_spec.rb +99 -0
- data/spec/newly/page_crawler_spec.rb +52 -0
- data/spec/spec_helper.rb +5 -9
- metadata +42 -42
- data/README.rdoc +0 -39
- data/lib/news.rb +0 -12
- data/spec/html/ecbahia.html +0 -780
- data/spec/html/g1.html +0 -4988
- data/spec/html/g1_bahia.html +0 -4481
- data/spec/html/metro1_cidade.html +0 -2404
- data/spec/newly_spec.rb +0 -73
data/Gemfile
CHANGED
@@ -1,16 +1,9 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
2
|
|
3
|
-
|
4
|
-
# Example:
|
5
|
-
# gem "activesupport", ">= 2.3.5"
|
6
|
-
gem 'nokogiri'
|
3
|
+
gem 'nokogiri', '~> 1.5'
|
7
4
|
|
8
|
-
# Add dependencies to develop your gem here.
|
9
|
-
# Include everything needed to run rake, tests, features, etc.
|
10
5
|
group :development do
|
11
|
-
gem
|
12
|
-
gem
|
13
|
-
gem
|
14
|
-
gem "jeweler", "~> 1.8.4"
|
15
|
-
gem "simplecov"
|
6
|
+
gem 'rspec', '~> 3.0'
|
7
|
+
gem 'rspec-collection_matchers', '~> 1.0'
|
8
|
+
gem 'jeweler', '~> 1.8'
|
16
9
|
end
|
data/Gemfile.lock
CHANGED
@@ -1,39 +1,66 @@
|
|
1
1
|
GEM
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
addressable (2.3.6)
|
5
|
+
builder (3.2.2)
|
6
|
+
diff-lcs (1.2.5)
|
7
|
+
faraday (0.8.9)
|
8
|
+
multipart-post (~> 1.2.0)
|
9
|
+
git (1.2.7)
|
10
|
+
github_api (0.10.1)
|
11
|
+
addressable
|
12
|
+
faraday (~> 0.8.1)
|
13
|
+
hashie (>= 1.2)
|
14
|
+
multi_json (~> 1.4)
|
15
|
+
nokogiri (~> 1.5.2)
|
16
|
+
oauth2
|
17
|
+
hashie (3.2.0)
|
18
|
+
highline (1.6.21)
|
19
|
+
jeweler (1.8.8)
|
20
|
+
builder
|
7
21
|
bundler (~> 1.0)
|
8
22
|
git (>= 1.2.5)
|
23
|
+
github_api (= 0.10.1)
|
24
|
+
highline (>= 1.6.15)
|
25
|
+
nokogiri (= 1.5.10)
|
9
26
|
rake
|
10
27
|
rdoc
|
11
|
-
json (1.
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
28
|
+
json (1.8.1)
|
29
|
+
jwt (1.0.0)
|
30
|
+
multi_json (1.10.1)
|
31
|
+
multi_xml (0.5.5)
|
32
|
+
multipart-post (1.2.0)
|
33
|
+
nokogiri (1.5.10)
|
34
|
+
oauth2 (1.0.0)
|
35
|
+
faraday (>= 0.8, < 0.10)
|
36
|
+
jwt (~> 1.0)
|
37
|
+
multi_json (~> 1.3)
|
38
|
+
multi_xml (~> 0.5)
|
39
|
+
rack (~> 1.2)
|
40
|
+
rack (1.5.2)
|
41
|
+
rake (10.3.2)
|
42
|
+
rdoc (3.12.2)
|
16
43
|
json (~> 1.4)
|
17
|
-
rspec (
|
18
|
-
rspec-core (~>
|
19
|
-
rspec-expectations (~>
|
20
|
-
rspec-mocks (~>
|
21
|
-
rspec-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
44
|
+
rspec (3.0.0)
|
45
|
+
rspec-core (~> 3.0.0)
|
46
|
+
rspec-expectations (~> 3.0.0)
|
47
|
+
rspec-mocks (~> 3.0.0)
|
48
|
+
rspec-collection_matchers (1.0.0)
|
49
|
+
rspec-expectations (>= 2.99.0.beta1)
|
50
|
+
rspec-core (3.0.2)
|
51
|
+
rspec-support (~> 3.0.0)
|
52
|
+
rspec-expectations (3.0.2)
|
53
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
54
|
+
rspec-support (~> 3.0.0)
|
55
|
+
rspec-mocks (3.0.2)
|
56
|
+
rspec-support (~> 3.0.0)
|
57
|
+
rspec-support (3.0.2)
|
29
58
|
|
30
59
|
PLATFORMS
|
31
60
|
ruby
|
32
61
|
|
33
62
|
DEPENDENCIES
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
rspec (~> 2.8.0)
|
39
|
-
simplecov
|
63
|
+
jeweler (~> 1.8)
|
64
|
+
nokogiri (~> 1.5)
|
65
|
+
rspec (~> 3.0)
|
66
|
+
rspec-collection_matchers (~> 1.0)
|
data/README.md
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# newly
|
2
|
+
|
3
|
+
## DSL that helps scraping news given a feed definition with url and selectors
|
4
|
+
|
5
|
+
## SYNOPSIS:
|
6
|
+
|
7
|
+
``` ruby
|
8
|
+
# Fetching breaking news from some website
|
9
|
+
require 'newly'
|
10
|
+
|
11
|
+
# Fetching breaking news from some website
|
12
|
+
my_feed = Newly::Feed.new(
|
13
|
+
container: '#ultimas-regiao div, #ultimas-regiao ul li',
|
14
|
+
href: 'a',
|
15
|
+
title: '.titulo',
|
16
|
+
subtitle: '.subtitulo',
|
17
|
+
image_source: 'img')
|
18
|
+
|
19
|
+
news = Newly::NewsCrawler.new(url: 'http://g1.globo.com/bahia/', feed: my_feed).fetch
|
20
|
+
news.each do |n|
|
21
|
+
puts n.url # news href url
|
22
|
+
puts n.title # news title
|
23
|
+
puts n.subtitle # news subtitle
|
24
|
+
puts n.image # news image src
|
25
|
+
end
|
26
|
+
```
|
27
|
+
|
28
|
+
## Contributing to newly
|
29
|
+
|
30
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
|
31
|
+
* Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it.
|
32
|
+
* Fork the project.
|
33
|
+
* Start a feature/bugfix branch.
|
34
|
+
* Commit and push until you are happy with your contribution.
|
35
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
36
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
|
37
|
+
|
38
|
+
|
data/Rakefile
CHANGED
@@ -18,7 +18,7 @@ Jeweler::Tasks.new do |gem|
|
|
18
18
|
gem.homepage = "http://github.com/alabeduarte/newly"
|
19
19
|
gem.license = "MIT"
|
20
20
|
gem.summary = %Q{Fetching breaking news from websites}
|
21
|
-
gem.description = %Q{
|
21
|
+
gem.description = %Q{DSL that helps scrapping news given a feed definition with url and selectors}
|
22
22
|
gem.email = "alabeduarte@gmail.com"
|
23
23
|
gem.authors = ["Alabê Duarte"]
|
24
24
|
# dependencies defined in Gemfile
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
2.0.0
|
data/lib/newly/feed.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
module Newly
  # Value object describing how news items are laid out on a page:
  # a container CSS selector plus per-field selectors and options.
  # All attributes are optional; missing keys default to nil.
  class Feed
    ATTRIBUTES = [:container, :url_pattern, :title, :subtitle,
                  :image_source, :favicon, :host, :limit].freeze

    attr_reader(*ATTRIBUTES)

    # args - Hash of selector options keyed by the symbols in ATTRIBUTES.
    def initialize(args)
      ATTRIBUTES.each do |attribute|
        instance_variable_set("@#{attribute}", args[attribute])
      end
    end
  end
end
|
data/lib/newly/news.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
module Newly
  # A single scraped news item. All fields are resolved eagerly at
  # construction time by querying the given page crawler with the
  # feed's per-field selectors.
  class News
    attr_reader :url, :url_pattern, :title, :subtitle, :image, :feed_url

    # args - Hash with :page_crawler (a Newly::PageCrawler), :feed
    #        (a Newly::Feed) and :feed_url (the page the item came from).
    def initialize(args)
      crawler = args[:page_crawler]
      feed    = args[:feed]

      @feed_url = args[:feed_url]
      @url      = crawler.link(feed.url_pattern)
      @title    = crawler.titleize(feed.title)
      @subtitle = crawler.titleize(feed.subtitle)
      @image    = crawler.image(feed.image_source)
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'newly/selector'
|
3
|
+
require 'newly/page_crawler'
|
4
|
+
require 'newly/news'
|
5
|
+
|
6
|
+
module Newly
  # Crawls a page and turns every element matching the feed's container
  # selector into a Newly::News. The selector (parsed document wrapper)
  # can be injected for testing; by default the URL is downloaded and
  # parsed with Nokogiri.
  class NewsCrawler
    attr_reader :title, :selector, :url

    # args - Hash with :url (required), :feed (a Newly::Feed) and an
    #        optional :selector (a Newly::Selector-like object).
    #
    # Raises a RuntimeError when :url is missing.
    def initialize(args)
      @feed = args[:feed]
      @url = args[:url]
      raise "The url is required" unless @url

      @selector = args[:selector] || Newly::Selector.new(Nokogiri::HTML(open @url))
    end

    # Fetches every matching item (honouring the feed's :limit) and
    # returns a de-duplicated Array of Newly::News.
    def fetch
      items = @selector.all(container: @feed.container, max: @feed.limit)

      collected = items.each_with_object(Set.new) do |item, acc|
        built = build_news_by(item)
        acc << built if built
      end

      collected.to_a
    end

    private

    # Wraps one matched element in a PageCrawler and builds a News from
    # it; nil elements produce nil.
    def build_news_by(item)
      return unless item

      crawler = Newly::PageCrawler.new(@feed.host, item)
      Newly::News.new(page_crawler: crawler, feed: @feed, feed_url: @url)
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Newly
  # Extracts text, links and images out of a parsed HTML fragment
  # (anything responding to #css, e.g. a Nokogiri node or document).
  class PageCrawler
    # host     - base URL used to absolutize relative hrefs/srcs.
    # document - parsed HTML node to query with CSS selectors.
    def initialize(host, document)
      @host = host
      @document = document
    end

    # Text of the matched element with its first character upcased
    # (the rest is left untouched); nil when absent or blank.
    def titleize(element)
      title = text(element)
      title[0] = title.capitalize[0] if title

      title
    end

    # Text of the matched element, or nil for blank/nil selectors and
    # for elements with empty text.
    def text(element)
      if valid?(element)
        text = get(element).text
        text if valid?(text)
      end
    end

    # href of the first matched element; relative hrefs are prefixed
    # with @host and any '../' segments are stripped.
    def link(element)
      href = find(element, 'href')
      # BUG FIX: was `!href.include?('http')` — a relative href merely
      # containing "http" (e.g. "/go?to=http://x") was wrongly treated
      # as already absolute. Only a leading "http" marks an absolute URL.
      href = "#{@host}/#{href}".gsub('../', '') if href && !href.start_with?('http')
      href
    end

    # src of the first matched image. Sources containing "==/" are
    # proxied/encoded URLs (see the glbimg fixtures): the last "==/"
    # segment is the real image path. Parent-relative sources are then
    # absolutized against @host.
    def image(element)
      image = find(element, 'src')
      if (image && image.include?("==/"))
        image = "http://#{image.split("==/").last}"
      end
      image = "#{@host}/#{image}".gsub('../', '') if image && image.include?('../')
      image
    end

    private

    # True for non-nil, non-empty strings (used for both selectors and
    # extracted text).
    def valid?(str)
      str && !str.empty?
    end

    def get(element)
      @document.css(element)
    end

    # Attribute `type` of the first matched node, or nil when the
    # selector is blank or nothing matches.
    def find(element, type)
      get(element).map { |doc| doc[type] }.first if valid?(element)
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Newly
  # Thin wrapper around a parsed document (anything responding to #css
  # and #at_css, e.g. a Nokogiri document) exposing just the queries
  # the crawler needs.
  class Selector
    def initialize(selector)
      @selector = selector
    end

    # Nodes matching args[:container]; when args[:max] is set, only the
    # first args[:max] matches are returned.
    def all(args)
      matches = @selector.css(args[:container])
      limit = args[:max]

      if limit
        matches.first(limit)
      else
        matches
      end
    end

    # Text of the document's <title> element.
    def title
      @selector.at_css("title").text
    end
  end
end
|
data/lib/newly.rb
CHANGED
@@ -1,34 +1,2 @@
|
|
1
|
-
require '
|
2
|
-
require '
|
3
|
-
require 'news'
|
4
|
-
|
5
|
-
class Newly
|
6
|
-
|
7
|
-
attr_reader :title, :selector, :url
|
8
|
-
|
9
|
-
def initialize(url, selector=Nokogiri::HTML(open(url)))
|
10
|
-
@url = url
|
11
|
-
@selector = selector
|
12
|
-
@title = @selector.at_css("title").text
|
13
|
-
end
|
14
|
-
|
15
|
-
def highlights(args)
|
16
|
-
news = Array.new
|
17
|
-
@selector.css(args[:selector]).each do |item|
|
18
|
-
if (item)
|
19
|
-
href = item.css(args[:href]).map { |doc| doc['href'] }.first if args[:href]
|
20
|
-
date = item.css(args[:date]).text if args[:date]
|
21
|
-
title = item.css(args[:title]).text if args[:title]
|
22
|
-
subtitle = item.css(args[:subtitle]).text if args[:subtitle]
|
23
|
-
img = item.css(args[:img]).map { |doc| doc['src'] }.first if args[:img]
|
24
|
-
if (args[:host])
|
25
|
-
host = args[:host]
|
26
|
-
url = "#{host}/#{url}".gsub('../', '') if url
|
27
|
-
image = "#{host}/#{image}".gsub('../', '') if image && image.include?('../')
|
28
|
-
end
|
29
|
-
news << News.new(url: href, keywords: keywords, date: date, title: title, subtitle: subtitle, image: img)
|
30
|
-
end
|
31
|
-
end
|
32
|
-
news
|
33
|
-
end
|
34
|
-
end
|
1
|
+
require 'newly/feed'
|
2
|
+
require 'newly/news_crawler'
|
data/newly.gemspec
CHANGED
@@ -5,16 +5,16 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "newly"
|
8
|
-
s.version = "
|
8
|
+
s.version = "2.0.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Alab\u{ea} Duarte"]
|
12
|
-
s.date = "
|
13
|
-
s.description = "
|
12
|
+
s.date = "2014-07-22"
|
13
|
+
s.description = "DSL that helps scrapping news given a feed definition with url and selectors"
|
14
14
|
s.email = "alabeduarte@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
16
16
|
"LICENSE.txt",
|
17
|
-
"README.
|
17
|
+
"README.md"
|
18
18
|
]
|
19
19
|
s.files = [
|
20
20
|
".DS_Store",
|
@@ -23,51 +23,47 @@ Gem::Specification.new do |s|
|
|
23
23
|
"Gemfile",
|
24
24
|
"Gemfile.lock",
|
25
25
|
"LICENSE.txt",
|
26
|
-
"README.
|
26
|
+
"README.md",
|
27
27
|
"Rakefile",
|
28
28
|
"VERSION",
|
29
29
|
"lib/newly.rb",
|
30
|
-
"lib/
|
30
|
+
"lib/newly/feed.rb",
|
31
|
+
"lib/newly/news.rb",
|
32
|
+
"lib/newly/news_crawler.rb",
|
33
|
+
"lib/newly/page_crawler.rb",
|
34
|
+
"lib/newly/selector.rb",
|
31
35
|
"newly.gemspec",
|
32
36
|
"spec/.DS_Store",
|
33
|
-
"spec/html/
|
34
|
-
"spec/
|
35
|
-
"spec/
|
36
|
-
"spec/html/metro1_cidade.html",
|
37
|
-
"spec/newly_spec.rb",
|
37
|
+
"spec/html/page_spec.html",
|
38
|
+
"spec/newly/news_crawler_spec.rb",
|
39
|
+
"spec/newly/page_crawler_spec.rb",
|
38
40
|
"spec/spec_helper.rb"
|
39
41
|
]
|
40
42
|
s.homepage = "http://github.com/alabeduarte/newly"
|
41
43
|
s.licenses = ["MIT"]
|
42
44
|
s.require_paths = ["lib"]
|
43
|
-
s.rubygems_version = "1.8.
|
45
|
+
s.rubygems_version = "1.8.21"
|
44
46
|
s.summary = "Fetching breaking news from websites"
|
45
47
|
|
46
48
|
if s.respond_to? :specification_version then
|
47
49
|
s.specification_version = 3
|
48
50
|
|
49
51
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
50
|
-
s.add_runtime_dependency(%q<nokogiri>, ["
|
51
|
-
s.add_development_dependency(%q<rspec>, ["~>
|
52
|
-
s.add_development_dependency(%q<
|
53
|
-
s.add_development_dependency(%q<
|
54
|
-
s.add_development_dependency(%q<jeweler>, ["~> 1.8.4"])
|
55
|
-
s.add_development_dependency(%q<simplecov>, [">= 0"])
|
52
|
+
s.add_runtime_dependency(%q<nokogiri>, ["~> 1.5"])
|
53
|
+
s.add_development_dependency(%q<rspec>, ["~> 3.0"])
|
54
|
+
s.add_development_dependency(%q<rspec-collection_matchers>, ["~> 1.0"])
|
55
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.8"])
|
56
56
|
else
|
57
|
-
s.add_dependency(%q<nokogiri>, ["
|
58
|
-
s.add_dependency(%q<rspec>, ["~>
|
59
|
-
s.add_dependency(%q<
|
60
|
-
s.add_dependency(%q<
|
61
|
-
s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
|
62
|
-
s.add_dependency(%q<simplecov>, [">= 0"])
|
57
|
+
s.add_dependency(%q<nokogiri>, ["~> 1.5"])
|
58
|
+
s.add_dependency(%q<rspec>, ["~> 3.0"])
|
59
|
+
s.add_dependency(%q<rspec-collection_matchers>, ["~> 1.0"])
|
60
|
+
s.add_dependency(%q<jeweler>, ["~> 1.8"])
|
63
61
|
end
|
64
62
|
else
|
65
|
-
s.add_dependency(%q<nokogiri>, ["
|
66
|
-
s.add_dependency(%q<rspec>, ["~>
|
67
|
-
s.add_dependency(%q<
|
68
|
-
s.add_dependency(%q<
|
69
|
-
s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
|
70
|
-
s.add_dependency(%q<simplecov>, [">= 0"])
|
63
|
+
s.add_dependency(%q<nokogiri>, ["~> 1.5"])
|
64
|
+
s.add_dependency(%q<rspec>, ["~> 3.0"])
|
65
|
+
s.add_dependency(%q<rspec-collection_matchers>, ["~> 1.0"])
|
66
|
+
s.add_dependency(%q<jeweler>, ["~> 1.8"])
|
71
67
|
end
|
72
68
|
end
|
73
69
|
|
@@ -0,0 +1,51 @@
|
|
1
|
+
<a class="a" href="http://atualidadesweb.com.br">I'm a Example Page</a>
|
2
|
+
<a class="b" href="http://atualidadesweb.com.br/sports">I'm a another Example Page</a>
|
3
|
+
<a class="c" href="http://atualidadesweb.com.br/economy"></a>
|
4
|
+
<a class="d" href="/economy">
|
5
|
+
<img class="d-img" src="http://atualidadesweb.com.br/images/logo3.png">
|
6
|
+
</a>
|
7
|
+
<a class="e" href="../economy">Test</a>
|
8
|
+
|
9
|
+
<img class="a-img" src="http://atualidadesweb.com.br/images/logo.png">
|
10
|
+
<img class="b-img" src="http://atualidadesweb.com.br/images/logo2.png">
|
11
|
+
<img class="c-img" src="http://atualidadesweb.com.br/images/logo4__.png==/atualidadesweb.com.br/images/logo4_.png==/atualidadesweb.com.br/images/logo4.png">
|
12
|
+
<img class="e-img" src="../images/logo5.png">
|
13
|
+
|
14
|
+
<div class="chamada chamada-principal">
|
15
|
+
<a href="http://g1.globo.com/bemestar/VC-no-Bem-Estar/noticia/2012/09/com-exercicio-fisico-e-dieta-saudavel-jovem-do-df-perde-83-kg-em-um-ano.html" class="foto" title="veja a transformacao do jovem que perdeu 83kg em apenas um ano (globo.com)" rel="bookmark">
|
16
|
+
<span class="borda-foto">
|
17
|
+
<img width="300" src="http://s2.glbimg.com/yq2Ruxgo6XPF6dMbJKNc5bXjxp0Hopt7xjMom4BO0BMlqexs4Crm0zfq9SXLeJQPRlKshWNRGcI1UffEKpSViw==/s2.glbimg.com/CR190ZvQOP9jxwmN0kT_CunYRF-Z8ZlT5vANqY5-UdKDu3DgEL3hOr3yojy7lLoS_EaKD0QT1y94uK8RcPde4A==/s.glbimg.com/en/ho/f/original/2012/09/29/exobeso.jpg" alt="veja a transformação do jovem que perdeu 83kg em apenas um ano (globo.com)" title="veja a transformacao do jovem que perdeu 83kg em apenas um ano (globo.com)" data-url-smart="DZuxxitB76ctspkSsETBLYY-a8oI3HZE2LAzjf4AHsKTMbXIn83Qq-5Zee3rsy8M/s2.glbimg.com/CR190ZvQOP9jxwmN0kT_CunYRF-Z8ZlT5vANqY5-UdKDu3DgEL3hOr3yojy7lLoS_EaKD0QT1y94uK8RcPde4A==/s.glbimg.com/en/ho/f/original/2012/09/29/exobeso.jpg" data-url-tablet="A4bt7aUjdYQalUJRpYfMX1duzejTqryhzIcdFf2-tmcHu3pYJZxWnLYWZrYYmG1r/s2.glbimg.com/CR190ZvQOP9jxwmN0kT_CunYRF-Z8ZlT5vANqY5-UdKDu3DgEL3hOr3yojy7lLoS_EaKD0QT1y94uK8RcPde4A==/s.glbimg.com/en/ho/f/original/2012/09/29/exobeso.jpg" data-url-desktop="gYRzgHhc1WrILA76XHKVHVduzejTqryhzIcdFf2-tmcHu3pYJZxWnLYWZrYYmG1r/s2.glbimg.com/CR190ZvQOP9jxwmN0kT_CunYRF-Z8ZlT5vANqY5-UdKDu3DgEL3hOr3yojy7lLoS_EaKD0QT1y94uK8RcPde4A==/s.glbimg.com/en/ho/f/original/2012/09/29/exobeso.jpg" />
|
18
|
+
</span>
|
19
|
+
<span class="conteudo"><p>fenomeno assustador</p></span>
|
20
|
+
</a>
|
21
|
+
</div>
|
22
|
+
<div class="chamada chamada-principal">
|
23
|
+
<a>
|
24
|
+
<span class="conteudo"><p>A</p></span>
|
25
|
+
</a>
|
26
|
+
</div>
|
27
|
+
<div class="chamada chamada-principal">
|
28
|
+
<a>
|
29
|
+
<span class="conteudo"><p>B</p></span>
|
30
|
+
</a>
|
31
|
+
</div>
|
32
|
+
<div class="chamada chamada-principal">
|
33
|
+
<a>
|
34
|
+
<span class="conteudo"><p>C</p></span>
|
35
|
+
</a>
|
36
|
+
</div>
|
37
|
+
|
38
|
+
<div class="itens-indice ultnot geral ">
|
39
|
+
<section>
|
40
|
+
<article class="col-1 linha-1 news">
|
41
|
+
<time datetime="2012-09-08T18:32">08/09</time>
|
42
|
+
<time datetime="2012-09-08T18:32" pubdate>18h32</time>
|
43
|
+
<h1>
|
44
|
+
<a href="http://esporte.uol.com.br/ultimas-noticias/reuters/2012/09/08/jackie-stewart-aconselha-hamilton-a-continuar-na-mclaren.htm">
|
45
|
+
<span>Jackie Stewart aconselha Hamilton a continuar na McLaren</span>
|
46
|
+
</a>
|
47
|
+
</h1>
|
48
|
+
<p>MONZA, 8 Set (Reuters) - Tricampeao de Formula 1, Jackie Stewart aconselhou Lewis Hamilton neste sabado a...</p>
|
49
|
+
</article>
|
50
|
+
</section>
|
51
|
+
</div>
|
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'spec_helper'

# Exercises Newly::NewsCrawler against the static fixture page
# (spec/html/page_spec.html): every reader is built with an injected
# Newly::Selector (see fake_selector below), so no HTTP request is made.
describe Newly::NewsCrawler do

  describe "fetching news" do

    it "should fetch news with limit" do
      first_feed_with_limit = Newly::Feed.new(container: ".chamada-principal", limit: 2)
      first_reader = build_reader_with 'http://bla.x', first_feed_with_limit

      expect(first_reader).to have(2).fetch
    end

    it "should fetch news without limit" do
      first_feed_without_limit = Newly::Feed.new(
        container: ".chamada-principal",
        url_pattern: "a",
        title: ".conteudo p",
        image_source: "img"
      )
      first_reader = build_reader_with 'http://bla.x', first_feed_without_limit

      expect(first_reader).to have(4).fetch
    end

    describe "when news has content" do
      context "first feed" do
        let(:first_feed) do
          Newly::Feed.new(
            container: ".chamada-principal",
            url_pattern: "a",
            title: ".conteudo p",
            image_source: "img"
          )
        end
        let(:first_reader) { build_reader_with 'http://bla.x', first_feed }

        # The fixture's img src uses the "==/"-encoded proxy format;
        # PageCrawler#image must unwrap it to the last segment.
        it "should fetch high quality images" do
          a_news = first_reader.fetch.first
          expect(a_news.image).to eq "http://s.glbimg.com/en/ho/f/original/2012/09/29/exobeso.jpg"
        end
        # Fixture text is "fenomeno assustador"; titleize upcases only
        # the first character.
        it "should capitalize the title field" do
          a_news = first_reader.fetch.first
          expect(a_news.title).to eq "Fenomeno assustador"
        end
      end

      context "second feed" do
        let(:second_feed) do
          Newly::Feed.new(
            container: "div.geral section article.news",
            url_pattern: "h1 a",
            title: "h1 a span",
            subtitle: "p"
          )
        end
        let(:second_reader) { build_reader_with 'http://noticias.uol.com.br/noticias', second_feed }

        context "fetching news valid fields" do
          let(:a_news) { second_reader.fetch.first }

          it { expect(a_news.url).to eq 'http://esporte.uol.com.br/ultimas-noticias/reuters/2012/09/08/jackie-stewart-aconselha-hamilton-a-continuar-na-mclaren.htm' }
          it { expect(a_news.title).to eq 'Jackie Stewart aconselha Hamilton a continuar na McLaren' }
          it { expect(a_news.subtitle).to eq 'MONZA, 8 Set (Reuters) - Tricampeao de Formula 1, Jackie Stewart aconselhou Lewis Hamilton neste sabado a...' }
          it { expect(a_news.feed_url).to eq "http://noticias.uol.com.br/noticias" }
        end
      end

      context "when reader has some invalid field" do
        it "should not return news from invalid container" do
          # NOTE(review): Newly::Feed's initializer reads no :url key, so
          # the :url entry here is silently ignored — presumably meant for
          # NewsCrawler; verify intent.
          invalid_feed = Newly::Feed.new(
            url: "http://bla.x",
            container: "invalid"
          )
          invalid_reader = build_reader_with 'http://bla.x', invalid_feed

          expect(invalid_reader).to have(0).fetch
        end

        it "should not allow build readers without url" do
          invalid_feed = Newly::Feed.new(container: "div.geral section article.news")

          expect { Newly::NewsCrawler.new(selector: fake_selector, feed: invalid_feed) }.to raise_error "The url is required"
        end
      end

    end

  end

  # NOTE(review): `private` inside an RSpec example group applies to the
  # generated group class; helpers below stay callable from examples.
  private
  # Builds a crawler for the given feed, wired to the fixture-backed
  # selector so no network access happens.
  def build_reader_with(url, feed)
    Newly::NewsCrawler.new(selector: fake_selector, url: url, feed: feed)
  end
  # Parses the static fixture page and wraps it in a Newly::Selector.
  def fake_selector
    parsed_html = Nokogiri::HTML.parse(File.read 'spec/html/page_spec.html')
    Newly::Selector.new parsed_html
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'spec_helper'
# Specs for Newly::PageCrawler run against the static fixture page
# spec/html/page_spec.html; `host` matches the absolute URLs used in
# the fixture so absolute and absolutized links read the same.
describe Newly::PageCrawler do
  let(:selector) { Nokogiri::HTML }
  let(:host) { 'http://atualidadesweb.com.br' }
  # NOTE(review): overriding `subject` via let(:subject) shadows RSpec's
  # implicit subject; consider `subject { ... }` instead.
  let(:subject) { Newly::PageCrawler.new(host, parse('spec/html/page_spec.html')) }

  describe "#text" do
    context "when is valid input" do
      it { expect(subject.text(".a")).to eq "I'm a Example Page" }
      it { expect(subject.text(".b")).to eq "I'm a another Example Page" }
    end
    context "when is invalid input" do
      # .c exists in the fixture but has empty text.
      it { expect(subject.text(".c")).to be_nil }
      it { expect(subject.text("")).to be_nil }
      it { expect(subject.text(nil)).to be_nil }
    end
  end

  describe "#link" do
    context "when is valid input" do
      # .a/.b/.c carry absolute hrefs; .d ("/economy") and .e
      # ("../economy") are relative and get prefixed with host.
      it { expect(subject.link(".a")).to eq "#{host}" }
      it { expect(subject.link(".b")).to eq "#{host}/sports" }
      it { expect(subject.link(".c")).to eq "#{host}/economy" }
      it { expect(subject.link(".d")).to eq "#{host}//economy" }
      it { expect(subject.link(".e")).to eq "#{host}/economy" }
    end
    context "when is invalid input" do
      it { expect(subject.link(".absent")).to be_nil }
      it { expect(subject.link("")).to be_nil }
      it { expect(subject.link(nil)).to be_nil }
    end
  end

  describe "#image" do
    context "when is valid input" do
      # c-img uses the "==/"-encoded src: the last segment is the real
      # image; e-img is parent-relative and gets absolutized.
      it { expect(subject.image("img.a-img")).to eq "#{host}/images/logo.png" }
      it { expect(subject.image("img.b-img")).to eq "#{host}/images/logo2.png" }
      it { expect(subject.image("img.d-img")).to eq "#{host}/images/logo3.png" }
      it { expect(subject.image("img.c-img")).to eq "#{host}/images/logo4.png" }
      it { expect(subject.image("img.e-img")).to eq "#{host}/images/logo5.png" }
    end
    context "when is invalid input" do
      it { expect(subject.image("img.absent")).to be_nil }
      it { expect(subject.image("")).to be_nil }
      it { expect(subject.image(nil)).to be_nil }
    end
  end

  # Reads and parses the fixture file with Nokogiri::HTML.
  def parse(path)
    selector.parse(File.read(path))
  end
end
|