preadly-bulbasaur 0.7.2 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a3585babf7ec241a14a1bb08d933cb648216658e
4
- data.tar.gz: e283f5b067e6a8d7642e28e747df9a36e49877fd
3
+ metadata.gz: 975df2c7ace7a0947cd98b55bfcf896a970e5660
4
+ data.tar.gz: 5a5b086ba797209a5493d4d1db2026631f2091a2
5
5
  SHA512:
6
- metadata.gz: 3ec4836696c83dd0eef9e32ca7cf9375ecc37bc4ed973f5761357702a23a3c5eeaa7198b195ebab39a4e11e3c505a2b6d50ad37034a95cf45a06a680e3d0b048
7
- data.tar.gz: e1324f9d951aaa744e91b4a1113a3f2cc9b2e8222217e3a2ecc9303fb6455ec838819b218a52d64ffbf28a16a5dae7bff6ea4a1c56e9b06f86d464b66e52a4ee
6
+ metadata.gz: a62e0c0d111f954d147003c25d37ede0ee09c1717ac3a15f8194d07d847e2cedf474667b03bf9c2a84f52a93d53920f6d5162c44b399ebbc9551b4db88bf65c9
7
+ data.tar.gz: fc8dd0769358ef5e52da253d0a412065667920a401c78a07eb52b8f14bb5368542db7098874987386b9ddd9952df0847d2c6c6acd43ef4a21765e596b75ec3f2
@@ -11,6 +11,7 @@ module Bulbasaur
11
11
  images = images + extract_images_html(@html)
12
12
  images = images + extract_images_youtube(@html)
13
13
  images = images + extract_images_vimeo(@html)
14
+ images = images + extract_images_meta(@html)
14
15
  images
15
16
  end
16
17
 
@@ -40,5 +41,13 @@ module Bulbasaur
40
41
  end
41
42
  end
42
43
 
44
# Collects the images declared through meta tags in the given markup.
#
# Extraction is best-effort: when the meta extractor fails with a
# StandardError the failure is swallowed and an empty list is returned,
# so callers can always concatenate the result.
#
# @param html [String] markup handed to Bulbasaur::ExtractImagesFromMeta
# @return [Array] the extractor's result, or [] when extraction fails
def extract_images_meta(html)
  Bulbasaur::ExtractImagesFromMeta.new(html).call
rescue StandardError
  # Deliberately not `rescue Exception`: that would also trap Interrupt,
  # SystemExit and NoMemoryError, which must keep propagating.
  []
end
51
+
43
52
  end
44
53
  end
@@ -24,7 +24,7 @@ module Bulbasaur
24
24
  Nokogiri::HTML(@html).xpath("//img").each do |item|
25
25
  url = item.xpath("@src").text
26
26
  alt = item.xpath("@alt").text
27
- images << create_struct(url, alt)
27
+ images << create_struct(url, 'img', alt)
28
28
  end
29
29
  images
30
30
  end
@@ -32,7 +32,7 @@ module Bulbasaur
32
32
  def extract_images_by_tag_style
33
33
  images = Array.new
34
34
  @html.scan(CSS_IMPORT_URL_REGEX).each do |url|
35
- images << create_struct(url)
35
+ images << create_struct(url, 'style')
36
36
  end
37
37
  images
38
38
  end
@@ -41,13 +41,13 @@ module Bulbasaur
41
41
  images = Array.new
42
42
  Nokogiri::HTML(@html).xpath('//a').each do |link|
43
43
  url = link.xpath('@href').text
44
- images << create_struct(url) if url =~ IMG_CANDIDATE_URL_REGEX
44
+ images << create_struct(url, 'link') if url =~ IMG_CANDIDATE_URL_REGEX
45
45
  end
46
46
  images
47
47
  end
48
48
 
49
- def create_struct(url, alt=nil)
50
- {url: url, alt: alt }
49
+ def create_struct(url, source, alt = nil)
50
+ { url: url, alt: alt, source: source }
51
51
  end
52
52
  end
53
53
  end
@@ -0,0 +1,21 @@
1
module Bulbasaur
  # Picks image URLs out of the meta information extracted from an HTML
  # document, keeping only the entries named `og:image`.
  class ExtractImagesFromMeta
    # Meta name that marks an entry as an image. Frozen so the shared
    # constant cannot be mutated by accident.
    PROPERTY = 'og:image'.freeze

    # @param html [String] markup to inspect
    def initialize(html)
      @html = html
    end

    # Runs the meta extractor over the markup and maps every image entry
    # to the image hash shape.
    #
    # @return [Array<Hash>] one `{ url: <entry value>, source: 'meta' }`
    #   hash per entry whose name equals PROPERTY; empty when none match
    def call
      meta_informations = Bulbasaur::ExtractMetaInformationsFromHTML.new(@html).call
      image_meta_tags(meta_informations).map do |meta|
        { url: meta[:value], source: 'meta' }
      end
    end

    private

    # Keeps only the entries whose :name equals PROPERTY.
    def image_meta_tags(meta_informations)
      meta_informations.select { |meta| meta[:name] == PROPERTY }
    end
  end
end
@@ -16,7 +16,7 @@ module Bulbasaur
16
16
  @html.scan(EXTRACT_URL_PATTERN).each do |video|
17
17
  vid = get_vid(video)
18
18
  url_image = image_url_for(vid)
19
- images << { url: url_image }
19
+ images << { url: url_image, video_url: video, source: 'vimeo' }
20
20
  end
21
21
  images
22
22
  end
@@ -13,7 +13,7 @@ module Bulbasaur
13
13
  images = Array.new
14
14
  @html.scan(EXTRACT_URL_PATTERN).each do |video|
15
15
  vid = get_vid(video)
16
- images << { url: image_url(vid) }
16
+ images << { url: image_url(vid), video_url: video, source: 'youtube' }
17
17
  end
18
18
  images
19
19
  end
@@ -2,8 +2,8 @@ module Bulbasaur
2
2
 
3
3
  module Version
4
4
  MAJOR = 0
5
- MINOR = 7
6
- PATCH = 2
5
+ MINOR = 8
6
+ PATCH = 0
7
7
  STRING = "#{MAJOR}.#{MINOR}.#{PATCH}"
8
8
  end
9
9
 
data/lib/bulbasaur.rb CHANGED
@@ -2,6 +2,7 @@ require "nokogiri"
2
2
  require "bulbasaur/extracts/extract_images_from_youtube"
3
3
  require "bulbasaur/extracts/extract_images_from_vimeo"
4
4
  require "bulbasaur/extracts/extract_images_from_html"
5
+ require "bulbasaur/extracts/extract_images_from_meta"
5
6
  require "bulbasaur/extracts/extract_images_from_all_resources"
6
7
  require "bulbasaur/extracts/extract_text_from_html.rb"
7
8
  require "bulbasaur/extracts/extract_meta_informations_from_html.rb"
@@ -14,6 +14,7 @@ RSpec.describe Bulbasaur::ExtractImagesFromAllResources do
14
14
 
15
15
  let(:html) do
16
16
  %Q(
17
+ <meta property="og:image" content="http://somewhere.to/get/an_image.jpg" />
17
18
  <p>
18
19
  <iframe width="560" height="315" src="https://www.youtube.com/embed/video0" frameborder="0" allowfullscreen></iframe>
19
20
  </p>
@@ -39,8 +40,8 @@ RSpec.describe Bulbasaur::ExtractImagesFromAllResources do
39
40
  )
40
41
  end
41
42
 
42
- it "Does return 15 itens" do
43
- expect(subject.size).to eq 15
43
+ it "Does return 16 items" do
44
+ expect(subject.size).to eq 16
44
45
  end
45
46
  end
46
47
  end
@@ -37,6 +37,10 @@ RSpec.describe Bulbasaur::ExtractImagesFromHTML do
37
37
  it "Does return the image alt" do
38
38
  expect(subject.first[:alt]).to eq "image alt test"
39
39
  end
40
+
41
+ it 'Does return the image source' do
42
+ expect(subject.first[:source]).to eq 'img'
43
+ end
40
44
  end
41
45
 
42
46
  context "When send html with a image style inline" do
@@ -59,6 +63,10 @@ RSpec.describe Bulbasaur::ExtractImagesFromHTML do
59
63
  it "Does return the image alt" do
60
64
  expect(subject.first[:alt]).to be_nil
61
65
  end
66
+
67
+ it 'Does return the image source' do
68
+ expect(subject.first[:source]).to eq 'style'
69
+ end
62
70
  end
63
71
 
64
72
  context 'When sending HTML with a link pointing to an image' do
@@ -90,25 +98,29 @@ RSpec.describe Bulbasaur::ExtractImagesFromHTML do
90
98
  end
91
99
 
92
100
  it 'Does return the image URL with parameters' do
93
- expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg?width=400&height=400', alt: nil
101
+ expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg?width=400&height=400', alt: nil, source: 'link'
94
102
  end
95
103
 
96
104
  it 'Does return the image URL without parameters' do
97
- expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg', alt: nil
105
+ expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg', alt: nil, source: 'link'
98
106
  end
99
107
 
100
108
  it 'Does return the image URL with tilde parameters' do
101
- expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg~original', alt: nil
109
+ expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg~original', alt: nil, source: 'link'
102
110
  end
103
111
 
104
112
  it 'Does return the image URL with upcased and special characters' do
105
- expect(subject).to include Hash url: 'http://somewhere.to/get/The_Original_Image%C3%A7a_3.JPG', alt: nil
113
+ expect(subject).to include Hash url: 'http://somewhere.to/get/The_Original_Image%C3%A7a_3.JPG', alt: nil, source: 'link'
106
114
  end
107
115
 
108
116
  it 'Does return the image alt' do
109
117
  expect(subject.first[:alt]).to be_nil
110
118
  end
111
119
 
120
+ it 'Does return the image source' do
121
+ expect(subject.first[:source]).to eq 'link'
122
+ end
123
+
112
124
  it 'Does not include links other than for images' do
113
125
  expect(subject).not_to include Hash url: 'http://somewhere.to/get/the_original_image.jpg.exe', alt: nil
114
126
  expect(subject).not_to include Hash url: 'http://somewhere.to/go/to/another_page.html', alt: nil
@@ -0,0 +1,45 @@
1
require 'spec_helper'

# Covers Bulbasaur::ExtractImagesFromMeta#call for documents with zero,
# one and several og:image meta tags.
RSpec.describe Bulbasaur::ExtractImagesFromMeta do
  subject { described_class.new(html).call }

  describe '#call' do
    context 'when there are no image meta tags' do
      let(:html) { %Q(<meta property="og:description" content="Just a RSpec test." />) }

      it 'returns an empty array' do
        expect(subject).to be_empty
      end
    end

    context 'when there is one image meta tag' do
      let :html do
        %Q(
          <meta property="og:image" content="http://somewhere.to/get/an_image.jpg" />
          <meta property="og:description" content="Just a RSpec test." />
        )
      end

      it 'returns the image URL found' do
        # Exact match: asserts both the single-element size and its content.
        expect(subject).to eq [{ url: 'http://somewhere.to/get/an_image.jpg', source: 'meta' }]
      end
    end

    context 'when there are multiple image meta tags' do
      let :html do
        %Q(
          <meta property="og:image" content="http://somewhere.to/get/an_image.jpg" />
          <meta property="og:image" content="http://somewhere.to/get/another_image.jpg" />
          <meta property="og:image" content="http://somewhere.to/get/a_third_image.jpg" />
          <meta property="og:description" content="Just a RSpec test." />
        )
      end

      it 'returns the image URLs found' do
        # Order-independent, but requires exactly these three URLs.
        expect(subject.map { |meta| meta[:url] }).to contain_exactly(
          'http://somewhere.to/get/an_image.jpg',
          'http://somewhere.to/get/another_image.jpg',
          'http://somewhere.to/get/a_third_image.jpg'
        )
      end
    end
  end
end
@@ -31,6 +31,8 @@ RSpec.describe Bulbasaur::ExtractImagesFromVimeo do
31
31
 
32
32
  it "Does return vime url" do
33
33
  expect(subject.first[:url]).to eq "https://i.vimeocdn.com/video/123456789_640.webp"
34
+ expect(subject.first[:source]).to eq 'vimeo'
35
+ expect(subject.first[:video_url]).to eq 'player.vimeo.com/video/123456789'
34
36
  end
35
37
  end
36
38
 
@@ -35,6 +35,8 @@ RSpec.describe Bulbasaur::ExtractImagesFromYoutube do
35
35
 
36
36
  it "Does return youtube url" do
37
37
  expect(subject.first[:url]).to eq "http://img.youtube.com/vi/123idfake321/maxresdefault.jpg"
38
+ expect(subject.first[:source]).to eq 'youtube'
39
+ expect(subject.first[:video_url]).to eq 'www.youtube.com/embed/123idfake321'
38
40
  end
39
41
  end
40
42
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: preadly-bulbasaur
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.2
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Magno Costa
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-13 00:00:00.000000000 Z
11
+ date: 2015-08-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -92,6 +92,7 @@ files:
92
92
  - lib/bulbasaur.rb
93
93
  - lib/bulbasaur/extracts/extract_images_from_all_resources.rb
94
94
  - lib/bulbasaur/extracts/extract_images_from_html.rb
95
+ - lib/bulbasaur/extracts/extract_images_from_meta.rb
95
96
  - lib/bulbasaur/extracts/extract_images_from_vimeo.rb
96
97
  - lib/bulbasaur/extracts/extract_images_from_youtube.rb
97
98
  - lib/bulbasaur/extracts/extract_meta_informations_from_html.rb
@@ -104,6 +105,7 @@ files:
104
105
  - lib/bulbasaur/version.rb
105
106
  - spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb
106
107
  - spec/bulbasaur/extracts/extract_images_from_html_spec.rb
108
+ - spec/bulbasaur/extracts/extract_images_from_meta_spec.rb
107
109
  - spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb
108
110
  - spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb
109
111
  - spec/bulbasaur/extracts/extract_inner_text_from_html_spec.rb
@@ -140,6 +142,7 @@ summary: Bulbasaur is a helper for crawler operations used in Pread.ly
140
142
  test_files:
141
143
  - spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb
142
144
  - spec/bulbasaur/extracts/extract_images_from_html_spec.rb
145
+ - spec/bulbasaur/extracts/extract_images_from_meta_spec.rb
143
146
  - spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb
144
147
  - spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb
145
148
  - spec/bulbasaur/extracts/extract_inner_text_from_html_spec.rb