newly 1.1.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -1,16 +1,9 @@
1
1
  source "http://rubygems.org"
2
2
 
3
- # Add dependencies required to use your gem here.
4
- # Example:
5
- # gem "activesupport", ">= 2.3.5"
6
- gem 'nokogiri'
3
+ gem 'nokogiri', '~> 1.5'
7
4
 
8
- # Add dependencies to develop your gem here.
9
- # Include everything needed to run rake, tests, features, etc.
10
5
  group :development do
11
- gem "rspec", "~> 2.8.0"
12
- gem "rdoc", "~> 3.12"
13
- gem "bundler", "~> 1.1.5"
14
- gem "jeweler", "~> 1.8.4"
15
- gem "simplecov"
6
+ gem 'rspec', '~> 3.0'
7
+ gem 'rspec-collection_matchers', '~> 1.0'
8
+ gem 'jeweler', '~> 1.8'
16
9
  end
data/Gemfile.lock CHANGED
@@ -1,39 +1,66 @@
1
1
  GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
- diff-lcs (1.1.3)
5
- git (1.2.5)
6
- jeweler (1.8.4)
4
+ addressable (2.3.6)
5
+ builder (3.2.2)
6
+ diff-lcs (1.2.5)
7
+ faraday (0.8.9)
8
+ multipart-post (~> 1.2.0)
9
+ git (1.2.7)
10
+ github_api (0.10.1)
11
+ addressable
12
+ faraday (~> 0.8.1)
13
+ hashie (>= 1.2)
14
+ multi_json (~> 1.4)
15
+ nokogiri (~> 1.5.2)
16
+ oauth2
17
+ hashie (3.2.0)
18
+ highline (1.6.21)
19
+ jeweler (1.8.8)
20
+ builder
7
21
  bundler (~> 1.0)
8
22
  git (>= 1.2.5)
23
+ github_api (= 0.10.1)
24
+ highline (>= 1.6.15)
25
+ nokogiri (= 1.5.10)
9
26
  rake
10
27
  rdoc
11
- json (1.7.4)
12
- multi_json (1.3.6)
13
- nokogiri (1.5.5)
14
- rake (0.9.2.2)
15
- rdoc (3.12)
28
+ json (1.8.1)
29
+ jwt (1.0.0)
30
+ multi_json (1.10.1)
31
+ multi_xml (0.5.5)
32
+ multipart-post (1.2.0)
33
+ nokogiri (1.5.10)
34
+ oauth2 (1.0.0)
35
+ faraday (>= 0.8, < 0.10)
36
+ jwt (~> 1.0)
37
+ multi_json (~> 1.3)
38
+ multi_xml (~> 0.5)
39
+ rack (~> 1.2)
40
+ rack (1.5.2)
41
+ rake (10.3.2)
42
+ rdoc (3.12.2)
16
43
  json (~> 1.4)
17
- rspec (2.8.0)
18
- rspec-core (~> 2.8.0)
19
- rspec-expectations (~> 2.8.0)
20
- rspec-mocks (~> 2.8.0)
21
- rspec-core (2.8.0)
22
- rspec-expectations (2.8.0)
23
- diff-lcs (~> 1.1.2)
24
- rspec-mocks (2.8.0)
25
- simplecov (0.6.4)
26
- multi_json (~> 1.0)
27
- simplecov-html (~> 0.5.3)
28
- simplecov-html (0.5.3)
44
+ rspec (3.0.0)
45
+ rspec-core (~> 3.0.0)
46
+ rspec-expectations (~> 3.0.0)
47
+ rspec-mocks (~> 3.0.0)
48
+ rspec-collection_matchers (1.0.0)
49
+ rspec-expectations (>= 2.99.0.beta1)
50
+ rspec-core (3.0.2)
51
+ rspec-support (~> 3.0.0)
52
+ rspec-expectations (3.0.2)
53
+ diff-lcs (>= 1.2.0, < 2.0)
54
+ rspec-support (~> 3.0.0)
55
+ rspec-mocks (3.0.2)
56
+ rspec-support (~> 3.0.0)
57
+ rspec-support (3.0.2)
29
58
 
30
59
  PLATFORMS
31
60
  ruby
32
61
 
33
62
  DEPENDENCIES
34
- bundler (~> 1.1.5)
35
- jeweler (~> 1.8.4)
36
- nokogiri
37
- rdoc (~> 3.12)
38
- rspec (~> 2.8.0)
39
- simplecov
63
+ jeweler (~> 1.8)
64
+ nokogiri (~> 1.5)
65
+ rspec (~> 3.0)
66
+ rspec-collection_matchers (~> 1.0)
data/README.md ADDED
@@ -0,0 +1,38 @@
1
+ # newly
2
+
3
+ ## DSL that helps with scraping news, given a feed definition with a URL and selectors
4
+
5
+ ## SYNOPSIS:
6
+
7
+ ``` ruby
8
+ # Fetching breaking news from some website
9
+ require 'newly'
10
+
11
+ # Fetching breaking news from some website
12
+ my_feed = Newly::Feed.new(
13
+ container: '#ultimas-regiao div, #ultimas-regiao ul li',
14
+ url_pattern: 'a',
15
+ title: '.titulo',
16
+ subtitle: '.subtitulo',
17
+ image_source: 'img')
18
+
19
+ news = Newly::NewsCrawler.new(url: 'http://g1.globo.com/bahia/', feed: my_feed).fetch
20
+ news.each do |n|
21
+ puts n.url # news href url
22
+ puts n.title # news title
23
+ puts n.subtitle # news subtitle
24
+ puts n.image # news image src
25
+ end
26
+ ```
27
+
28
+ ## Contributing to newly
29
+
30
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
31
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it.
32
+ * Fork the project.
33
+ * Start a feature/bugfix branch.
34
+ * Commit and push until you are happy with your contribution.
35
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
36
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
37
+
38
+
data/Rakefile CHANGED
@@ -18,7 +18,7 @@ Jeweler::Tasks.new do |gem|
18
18
  gem.homepage = "http://github.com/alabeduarte/newly"
19
19
  gem.license = "MIT"
20
20
  gem.summary = %Q{Fetching breaking news from websites}
21
- gem.description = %Q{Fetching breaking news from websites}
21
+ gem.description = %Q{DSL that helps scrapping news given a feed definition with url and selectors}
22
22
  gem.email = "alabeduarte@gmail.com"
23
23
  gem.authors = ["Alabê Duarte"]
24
24
  # dependencies defined in Gemfile
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.1.0
1
+ 2.0.0
data/lib/newly/feed.rb ADDED
@@ -0,0 +1,19 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+
4
+ module Newly
5
+ class Feed
6
+ attr_reader :container, :url_pattern, :title, :subtitle, :image_source, :favicon, :host, :limit
7
+
8
+ def initialize(args)
9
+ @container = args[:container]
10
+ @url_pattern = args[:url_pattern]
11
+ @title = args[:title]
12
+ @subtitle = args[:subtitle]
13
+ @image_source = args[:image_source]
14
+ @favicon = args[:favicon]
15
+ @host = args[:host]
16
+ @limit = args[:limit]
17
+ end
18
+ end
19
+ end
data/lib/newly/news.rb ADDED
@@ -0,0 +1,16 @@
1
+ module Newly
2
+ class News
3
+ attr_reader :url, :url_pattern, :title, :subtitle, :image, :feed_url
4
+
5
+ def initialize(args)
6
+ page_crawler = args[:page_crawler]
7
+ feed = args[:feed]
8
+
9
+ @feed_url = args[:feed_url]
10
+ @url = page_crawler.link feed.url_pattern
11
+ @title = page_crawler.titleize feed.title
12
+ @subtitle = page_crawler.titleize feed.subtitle
13
+ @image = page_crawler.image feed.image_source
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,42 @@
1
+ require 'set'
2
+ require 'newly/selector'
3
+ require 'newly/page_crawler'
4
+ require 'newly/news'
5
+
6
+ module Newly
7
+ class NewsCrawler
8
+ attr_reader :title, :selector, :url
9
+
10
+ def initialize(args)
11
+ @feed = args[:feed]
12
+ @url = args[:url]
13
+ raise "The url is required" unless @url
14
+
15
+ @selector = args[:selector] || Newly::Selector.new(Nokogiri::HTML(open @url))
16
+ end
17
+
18
+ def fetch
19
+ news_fetched = Set.new
20
+ all_news = @selector.all(container: @feed.container, max: @feed.limit)
21
+
22
+ all_news.each do |item|
23
+ news = build_news_by(item)
24
+ if news
25
+ news_fetched << news
26
+ end
27
+ end
28
+
29
+ news_fetched.to_a
30
+ end
31
+
32
+ private
33
+ def build_news_by(item)
34
+ if (item)
35
+ page_crawler = Newly::PageCrawler.new(@feed.host, item)
36
+
37
+ Newly::News.new(page_crawler: page_crawler, feed: @feed, feed_url: @url)
38
+ end
39
+ end
40
+
41
+ end
42
+ end
@@ -0,0 +1,51 @@
1
+ module Newly
2
+ class PageCrawler
3
+ def initialize(host, document)
4
+ @host = host
5
+ @document = document
6
+ end
7
+
8
+ def titleize(element)
9
+ title = text(element)
10
+ title[0] = title.capitalize[0] if title
11
+
12
+ title
13
+ end
14
+
15
+ def text(element)
16
+ if valid?(element)
17
+ text = get(element).text
18
+ text if valid?(text)
19
+ end
20
+ end
21
+
22
+ def link(element)
23
+ href = find(element, 'href')
24
+ href = "#{@host}/#{href}".gsub('../', '') if href && !href.include?('http')
25
+ href
26
+ end
27
+
28
+ def image(element)
29
+ image = find(element, 'src')
30
+ if (image && image.include?("==/"))
31
+ image = "http://#{image.split("==/").last}"
32
+ end
33
+ image = "#{@host}/#{image}".gsub('../', '') if image && image.include?('../')
34
+ image
35
+ end
36
+
37
+ private
38
+ def valid?(str)
39
+ str && !str.empty?
40
+ end
41
+
42
+ def get(element)
43
+ @document.css(element)
44
+ end
45
+
46
+ def find(element, type)
47
+ get(element).map { |doc| doc[type] }.first if valid?(element)
48
+ end
49
+
50
+ end
51
+ end
@@ -0,0 +1,17 @@
1
+ module Newly
2
+ class Selector
3
+ def initialize(selector)
4
+ @selector = selector
5
+ end
6
+
7
+ def all(args)
8
+ args[:max] ?
9
+ @selector.css(args[:container]).first(args[:max]) :
10
+ @selector.css(args[:container])
11
+ end
12
+
13
+ def title
14
+ @selector.at_css("title").text
15
+ end
16
+ end
17
+ end
data/lib/newly.rb CHANGED
@@ -1,34 +1,2 @@
1
- require 'nokogiri'
2
- require 'open-uri'
3
- require 'news'
4
-
5
- class Newly
6
-
7
- attr_reader :title, :selector, :url
8
-
9
- def initialize(url, selector=Nokogiri::HTML(open(url)))
10
- @url = url
11
- @selector = selector
12
- @title = @selector.at_css("title").text
13
- end
14
-
15
- def highlights(args)
16
- news = Array.new
17
- @selector.css(args[:selector]).each do |item|
18
- if (item)
19
- href = item.css(args[:href]).map { |doc| doc['href'] }.first if args[:href]
20
- date = item.css(args[:date]).text if args[:date]
21
- title = item.css(args[:title]).text if args[:title]
22
- subtitle = item.css(args[:subtitle]).text if args[:subtitle]
23
- img = item.css(args[:img]).map { |doc| doc['src'] }.first if args[:img]
24
- if (args[:host])
25
- host = args[:host]
26
- url = "#{host}/#{url}".gsub('../', '') if url
27
- image = "#{host}/#{image}".gsub('../', '') if image && image.include?('../')
28
- end
29
- news << News.new(url: href, keywords: keywords, date: date, title: title, subtitle: subtitle, image: img)
30
- end
31
- end
32
- news
33
- end
34
- end
1
+ require 'newly/feed'
2
+ require 'newly/news_crawler'
data/newly.gemspec CHANGED
@@ -5,16 +5,16 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "newly"
8
- s.version = "1.1.0"
8
+ s.version = "2.0.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Alab\u{ea} Duarte"]
12
- s.date = "2012-08-14"
13
- s.description = "Fetching breaking news from websites"
12
+ s.date = "2014-07-22"
13
+ s.description = "DSL that helps scrapping news given a feed definition with url and selectors"
14
14
  s.email = "alabeduarte@gmail.com"
15
15
  s.extra_rdoc_files = [
16
16
  "LICENSE.txt",
17
- "README.rdoc"
17
+ "README.md"
18
18
  ]
19
19
  s.files = [
20
20
  ".DS_Store",
@@ -23,51 +23,47 @@ Gem::Specification.new do |s|
23
23
  "Gemfile",
24
24
  "Gemfile.lock",
25
25
  "LICENSE.txt",
26
- "README.rdoc",
26
+ "README.md",
27
27
  "Rakefile",
28
28
  "VERSION",
29
29
  "lib/newly.rb",
30
- "lib/news.rb",
30
+ "lib/newly/feed.rb",
31
+ "lib/newly/news.rb",
32
+ "lib/newly/news_crawler.rb",
33
+ "lib/newly/page_crawler.rb",
34
+ "lib/newly/selector.rb",
31
35
  "newly.gemspec",
32
36
  "spec/.DS_Store",
33
- "spec/html/ecbahia.html",
34
- "spec/html/g1.html",
35
- "spec/html/g1_bahia.html",
36
- "spec/html/metro1_cidade.html",
37
- "spec/newly_spec.rb",
37
+ "spec/html/page_spec.html",
38
+ "spec/newly/news_crawler_spec.rb",
39
+ "spec/newly/page_crawler_spec.rb",
38
40
  "spec/spec_helper.rb"
39
41
  ]
40
42
  s.homepage = "http://github.com/alabeduarte/newly"
41
43
  s.licenses = ["MIT"]
42
44
  s.require_paths = ["lib"]
43
- s.rubygems_version = "1.8.10"
45
+ s.rubygems_version = "1.8.21"
44
46
  s.summary = "Fetching breaking news from websites"
45
47
 
46
48
  if s.respond_to? :specification_version then
47
49
  s.specification_version = 3
48
50
 
49
51
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
50
- s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
51
- s.add_development_dependency(%q<rspec>, ["~> 2.8.0"])
52
- s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
53
- s.add_development_dependency(%q<bundler>, ["~> 1.1.5"])
54
- s.add_development_dependency(%q<jeweler>, ["~> 1.8.4"])
55
- s.add_development_dependency(%q<simplecov>, [">= 0"])
52
+ s.add_runtime_dependency(%q<nokogiri>, ["~> 1.5"])
53
+ s.add_development_dependency(%q<rspec>, ["~> 3.0"])
54
+ s.add_development_dependency(%q<rspec-collection_matchers>, ["~> 1.0"])
55
+ s.add_development_dependency(%q<jeweler>, ["~> 1.8"])
56
56
  else
57
- s.add_dependency(%q<nokogiri>, [">= 0"])
58
- s.add_dependency(%q<rspec>, ["~> 2.8.0"])
59
- s.add_dependency(%q<rdoc>, ["~> 3.12"])
60
- s.add_dependency(%q<bundler>, ["~> 1.1.5"])
61
- s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
62
- s.add_dependency(%q<simplecov>, [">= 0"])
57
+ s.add_dependency(%q<nokogiri>, ["~> 1.5"])
58
+ s.add_dependency(%q<rspec>, ["~> 3.0"])
59
+ s.add_dependency(%q<rspec-collection_matchers>, ["~> 1.0"])
60
+ s.add_dependency(%q<jeweler>, ["~> 1.8"])
63
61
  end
64
62
  else
65
- s.add_dependency(%q<nokogiri>, [">= 0"])
66
- s.add_dependency(%q<rspec>, ["~> 2.8.0"])
67
- s.add_dependency(%q<rdoc>, ["~> 3.12"])
68
- s.add_dependency(%q<bundler>, ["~> 1.1.5"])
69
- s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
70
- s.add_dependency(%q<simplecov>, [">= 0"])
63
+ s.add_dependency(%q<nokogiri>, ["~> 1.5"])
64
+ s.add_dependency(%q<rspec>, ["~> 3.0"])
65
+ s.add_dependency(%q<rspec-collection_matchers>, ["~> 1.0"])
66
+ s.add_dependency(%q<jeweler>, ["~> 1.8"])
71
67
  end
72
68
  end
73
69
 
@@ -0,0 +1,51 @@
1
+ <a class="a" href="http://atualidadesweb.com.br">I'm a Example Page</a>
2
+ <a class="b" href="http://atualidadesweb.com.br/sports">I'm a another Example Page</a>
3
+ <a class="c" href="http://atualidadesweb.com.br/economy"></a>
4
+ <a class="d" href="/economy">
5
+ <img class="d-img" src="http://atualidadesweb.com.br/images/logo3.png">
6
+ </a>
7
+ <a class="e" href="../economy">Test</a>
8
+
9
+ <img class="a-img" src="http://atualidadesweb.com.br/images/logo.png">
10
+ <img class="b-img" src="http://atualidadesweb.com.br/images/logo2.png">
11
+ <img class="c-img" src="http://atualidadesweb.com.br/images/logo4__.png==/atualidadesweb.com.br/images/logo4_.png==/atualidadesweb.com.br/images/logo4.png">
12
+ <img class="e-img" src="../images/logo5.png">
13
+
14
+ <div class="chamada chamada-principal">
15
+ <a href="http://g1.globo.com/bemestar/VC-no-Bem-Estar/noticia/2012/09/com-exercicio-fisico-e-dieta-saudavel-jovem-do-df-perde-83-kg-em-um-ano.html" class="foto" title="veja a transformacao do jovem que perdeu 83kg em apenas um ano (globo.com)" rel="bookmark">
16
+ <span class="borda-foto">
17
+ <img width="300" src="http://s2.glbimg.com/yq2Ruxgo6XPF6dMbJKNc5bXjxp0Hopt7xjMom4BO0BMlqexs4Crm0zfq9SXLeJQPRlKshWNRGcI1UffEKpSViw==/s2.glbimg.com/CR190ZvQOP9jxwmN0kT_CunYRF-Z8ZlT5vANqY5-UdKDu3DgEL3hOr3yojy7lLoS_EaKD0QT1y94uK8RcPde4A==/s.glbimg.com/en/ho/f/original/2012/09/29/exobeso.jpg" alt="veja a transformação do jovem que perdeu 83kg em apenas um ano (globo.com)" title="veja a transformacao do jovem que perdeu 83kg em apenas um ano (globo.com)" data-url-smart="DZuxxitB76ctspkSsETBLYY-a8oI3HZE2LAzjf4AHsKTMbXIn83Qq-5Zee3rsy8M/s2.glbimg.com/CR190ZvQOP9jxwmN0kT_CunYRF-Z8ZlT5vANqY5-UdKDu3DgEL3hOr3yojy7lLoS_EaKD0QT1y94uK8RcPde4A==/s.glbimg.com/en/ho/f/original/2012/09/29/exobeso.jpg" data-url-tablet="A4bt7aUjdYQalUJRpYfMX1duzejTqryhzIcdFf2-tmcHu3pYJZxWnLYWZrYYmG1r/s2.glbimg.com/CR190ZvQOP9jxwmN0kT_CunYRF-Z8ZlT5vANqY5-UdKDu3DgEL3hOr3yojy7lLoS_EaKD0QT1y94uK8RcPde4A==/s.glbimg.com/en/ho/f/original/2012/09/29/exobeso.jpg" data-url-desktop="gYRzgHhc1WrILA76XHKVHVduzejTqryhzIcdFf2-tmcHu3pYJZxWnLYWZrYYmG1r/s2.glbimg.com/CR190ZvQOP9jxwmN0kT_CunYRF-Z8ZlT5vANqY5-UdKDu3DgEL3hOr3yojy7lLoS_EaKD0QT1y94uK8RcPde4A==/s.glbimg.com/en/ho/f/original/2012/09/29/exobeso.jpg" />
18
+ </span>
19
+ <span class="conteudo"><p>fenomeno assustador</p></span>
20
+ </a>
21
+ </div>
22
+ <div class="chamada chamada-principal">
23
+ <a>
24
+ <span class="conteudo"><p>A</p></span>
25
+ </a>
26
+ </div>
27
+ <div class="chamada chamada-principal">
28
+ <a>
29
+ <span class="conteudo"><p>B</p></span>
30
+ </a>
31
+ </div>
32
+ <div class="chamada chamada-principal">
33
+ <a>
34
+ <span class="conteudo"><p>C</p></span>
35
+ </a>
36
+ </div>
37
+
38
+ <div class="itens-indice ultnot geral ">
39
+ <section>
40
+ <article class="col-1 linha-1 news">
41
+ <time datetime="2012-09-08T18:32">08/09</time>
42
+ <time datetime="2012-09-08T18:32" pubdate>18h32</time>
43
+ <h1>
44
+ <a href="http://esporte.uol.com.br/ultimas-noticias/reuters/2012/09/08/jackie-stewart-aconselha-hamilton-a-continuar-na-mclaren.htm">
45
+ <span>Jackie Stewart aconselha Hamilton a continuar na McLaren</span>
46
+ </a>
47
+ </h1>
48
+ <p>MONZA, 8 Set (Reuters) - Tricampeao de Formula 1, Jackie Stewart aconselhou Lewis Hamilton neste sabado a...</p>
49
+ </article>
50
+ </section>
51
+ </div>
@@ -0,0 +1,99 @@
1
+ require 'spec_helper'
2
+
3
+ describe Newly::NewsCrawler do
4
+
5
+ describe "fetching news" do
6
+
7
+ it "should fetch news with limit" do
8
+ first_feed_with_limit = Newly::Feed.new(container: ".chamada-principal", limit: 2)
9
+ first_reader = build_reader_with 'http://bla.x', first_feed_with_limit
10
+
11
+ expect(first_reader).to have(2).fetch
12
+ end
13
+
14
+ it "should fetch news without limit" do
15
+ first_feed_without_limit = Newly::Feed.new(
16
+ container: ".chamada-principal",
17
+ url_pattern: "a",
18
+ title: ".conteudo p",
19
+ image_source: "img"
20
+ )
21
+ first_reader = build_reader_with 'http://bla.x', first_feed_without_limit
22
+
23
+ expect(first_reader).to have(4).fetch
24
+ end
25
+
26
+ describe "when news has content" do
27
+ context "first feed" do
28
+ let(:first_feed) do
29
+ Newly::Feed.new(
30
+ container: ".chamada-principal",
31
+ url_pattern: "a",
32
+ title: ".conteudo p",
33
+ image_source: "img"
34
+ )
35
+ end
36
+ let(:first_reader) { build_reader_with 'http://bla.x', first_feed }
37
+
38
+ it "should fetch high quality images" do
39
+ a_news = first_reader.fetch.first
40
+ expect(a_news.image).to eq "http://s.glbimg.com/en/ho/f/original/2012/09/29/exobeso.jpg"
41
+ end
42
+ it "should capitalize the title field" do
43
+ a_news = first_reader.fetch.first
44
+ expect(a_news.title).to eq "Fenomeno assustador"
45
+ end
46
+ end
47
+
48
+ context "second feed" do
49
+ let(:second_feed) do
50
+ Newly::Feed.new(
51
+ container: "div.geral section article.news",
52
+ url_pattern: "h1 a",
53
+ title: "h1 a span",
54
+ subtitle: "p"
55
+ )
56
+ end
57
+ let(:second_reader) { build_reader_with 'http://noticias.uol.com.br/noticias', second_feed }
58
+
59
+ context "fetching news valid fields" do
60
+ let(:a_news) { second_reader.fetch.first }
61
+
62
+ it { expect(a_news.url).to eq 'http://esporte.uol.com.br/ultimas-noticias/reuters/2012/09/08/jackie-stewart-aconselha-hamilton-a-continuar-na-mclaren.htm' }
63
+ it { expect(a_news.title).to eq 'Jackie Stewart aconselha Hamilton a continuar na McLaren' }
64
+ it { expect(a_news.subtitle).to eq 'MONZA, 8 Set (Reuters) - Tricampeao de Formula 1, Jackie Stewart aconselhou Lewis Hamilton neste sabado a...' }
65
+ it { expect(a_news.feed_url).to eq "http://noticias.uol.com.br/noticias" }
66
+ end
67
+ end
68
+
69
+ context "when reader has some invalid field" do
70
+ it "should not return news from invalid container" do
71
+ invalid_feed = Newly::Feed.new(
72
+ url: "http://bla.x",
73
+ container: "invalid"
74
+ )
75
+ invalid_reader = build_reader_with 'http://bla.x', invalid_feed
76
+
77
+ expect(invalid_reader).to have(0).fetch
78
+ end
79
+
80
+ it "should not allow build readers without url" do
81
+ invalid_feed = Newly::Feed.new(container: "div.geral section article.news")
82
+
83
+ expect { Newly::NewsCrawler.new(selector: fake_selector, feed: invalid_feed) }.to raise_error "The url is required"
84
+ end
85
+ end
86
+
87
+ end
88
+
89
+ end
90
+
91
+ private
92
+ def build_reader_with(url, feed)
93
+ Newly::NewsCrawler.new(selector: fake_selector, url: url, feed: feed)
94
+ end
95
+ def fake_selector
96
+ parsed_html = Nokogiri::HTML.parse(File.read 'spec/html/page_spec.html')
97
+ Newly::Selector.new parsed_html
98
+ end
99
+ end
@@ -0,0 +1,52 @@
1
+ require 'spec_helper'
2
+ describe Newly::PageCrawler do
3
+ let(:selector) { Nokogiri::HTML }
4
+ let(:host) { 'http://atualidadesweb.com.br' }
5
+ let(:subject) { Newly::PageCrawler.new(host, parse('spec/html/page_spec.html')) }
6
+
7
+ describe "#text" do
8
+ context "when is valid input" do
9
+ it { expect(subject.text(".a")).to eq "I'm a Example Page" }
10
+ it { expect(subject.text(".b")).to eq "I'm a another Example Page" }
11
+ end
12
+ context "when is invalid input" do
13
+ it { expect(subject.text(".c")).to be_nil }
14
+ it { expect(subject.text("")).to be_nil }
15
+ it { expect(subject.text(nil)).to be_nil }
16
+ end
17
+ end
18
+
19
+ describe "#link" do
20
+ context "when is valid input" do
21
+ it { expect(subject.link(".a")).to eq "#{host}" }
22
+ it { expect(subject.link(".b")).to eq "#{host}/sports" }
23
+ it { expect(subject.link(".c")).to eq "#{host}/economy" }
24
+ it { expect(subject.link(".d")).to eq "#{host}//economy" }
25
+ it { expect(subject.link(".e")).to eq "#{host}/economy" }
26
+ end
27
+ context "when is invalid input" do
28
+ it { expect(subject.link(".absent")).to be_nil }
29
+ it { expect(subject.link("")).to be_nil }
30
+ it { expect(subject.link(nil)).to be_nil }
31
+ end
32
+ end
33
+
34
+ describe "#image" do
35
+ context "when is valid input" do
36
+ it { expect(subject.image("img.a-img")).to eq "#{host}/images/logo.png" }
37
+ it { expect(subject.image("img.b-img")).to eq "#{host}/images/logo2.png" }
38
+ it { expect(subject.image("img.d-img")).to eq "#{host}/images/logo3.png" }
39
+ it { expect(subject.image("img.c-img")).to eq "#{host}/images/logo4.png" }
40
+ it { expect(subject.image("img.e-img")).to eq "#{host}/images/logo5.png" }
41
+ end
42
+ context "when is invalid input" do
43
+ it { expect(subject.image("img.absent")).to be_nil }
44
+ it { expect(subject.image("")).to be_nil }
45
+ it { expect(subject.image(nil)).to be_nil }
46
+ end
47
+ end
48
+
49
+ def parse(path)
50
+ selector.parse(File.read(path))
51
+ end
52
+ end