preadly-bulbasaur 0.7.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a3585babf7ec241a14a1bb08d933cb648216658e
4
- data.tar.gz: e283f5b067e6a8d7642e28e747df9a36e49877fd
3
+ metadata.gz: 975df2c7ace7a0947cd98b55bfcf896a970e5660
4
+ data.tar.gz: 5a5b086ba797209a5493d4d1db2026631f2091a2
5
5
  SHA512:
6
- metadata.gz: 3ec4836696c83dd0eef9e32ca7cf9375ecc37bc4ed973f5761357702a23a3c5eeaa7198b195ebab39a4e11e3c505a2b6d50ad37034a95cf45a06a680e3d0b048
7
- data.tar.gz: e1324f9d951aaa744e91b4a1113a3f2cc9b2e8222217e3a2ecc9303fb6455ec838819b218a52d64ffbf28a16a5dae7bff6ea4a1c56e9b06f86d464b66e52a4ee
6
+ metadata.gz: a62e0c0d111f954d147003c25d37ede0ee09c1717ac3a15f8194d07d847e2cedf474667b03bf9c2a84f52a93d53920f6d5162c44b399ebbc9551b4db88bf65c9
7
+ data.tar.gz: fc8dd0769358ef5e52da253d0a412065667920a401c78a07eb52b8f14bb5368542db7098874987386b9ddd9952df0847d2c6c6acd43ef4a21765e596b75ec3f2
@@ -11,6 +11,7 @@ module Bulbasaur
11
11
  images = images + extract_images_html(@html)
12
12
  images = images + extract_images_youtube(@html)
13
13
  images = images + extract_images_vimeo(@html)
14
+ images = images + extract_images_meta(@html)
14
15
  images
15
16
  end
16
17
 
@@ -40,5 +41,13 @@ module Bulbasaur
40
41
  end
41
42
  end
42
43
 
44
+ def extract_images_meta(html)
45
+ begin
46
+ Bulbasaur::ExtractImagesFromMeta.new(html).call
47
+ rescue Exception => e
48
+ []
49
+ end
50
+ end
51
+
43
52
  end
44
53
  end
@@ -24,7 +24,7 @@ module Bulbasaur
24
24
  Nokogiri::HTML(@html).xpath("//img").each do |item|
25
25
  url = item.xpath("@src").text
26
26
  alt = item.xpath("@alt").text
27
- images << create_struct(url, alt)
27
+ images << create_struct(url, 'img', alt)
28
28
  end
29
29
  images
30
30
  end
@@ -32,7 +32,7 @@ module Bulbasaur
32
32
  def extract_images_by_tag_style
33
33
  images = Array.new
34
34
  @html.scan(CSS_IMPORT_URL_REGEX).each do |url|
35
- images << create_struct(url)
35
+ images << create_struct(url, 'style')
36
36
  end
37
37
  images
38
38
  end
@@ -41,13 +41,13 @@ module Bulbasaur
41
41
  images = Array.new
42
42
  Nokogiri::HTML(@html).xpath('//a').each do |link|
43
43
  url = link.xpath('@href').text
44
- images << create_struct(url) if url =~ IMG_CANDIDATE_URL_REGEX
44
+ images << create_struct(url, 'link') if url =~ IMG_CANDIDATE_URL_REGEX
45
45
  end
46
46
  images
47
47
  end
48
48
 
49
- def create_struct(url, alt=nil)
50
- {url: url, alt: alt }
49
+ def create_struct(url, source, alt = nil)
50
+ { url: url, alt: alt, source: source }
51
51
  end
52
52
  end
53
53
  end
@@ -0,0 +1,21 @@
1
+ module Bulbasaur
2
+ class ExtractImagesFromMeta
3
+ PROPERTY = 'og:image'
4
+
5
+ def initialize(html)
6
+ @html = html
7
+ end
8
+
9
+ def call
10
+ meta_informations = Bulbasaur::ExtractMetaInformationsFromHTML.new(@html).call
11
+ image_urls = image_meta_tags(meta_informations).map { |meta| { url: meta[:value], source: 'meta' } }
12
+ image_urls
13
+ end
14
+
15
+ private
16
+
17
+ def image_meta_tags(meta_informations)
18
+ meta_informations.select { |meta| meta[:name] == PROPERTY }
19
+ end
20
+ end
21
+ end
@@ -16,7 +16,7 @@ module Bulbasaur
16
16
  @html.scan(EXTRACT_URL_PATTERN).each do |video|
17
17
  vid = get_vid(video)
18
18
  url_image = image_url_for(vid)
19
- images << { url: url_image }
19
+ images << { url: url_image, video_url: video, source: 'vimeo' }
20
20
  end
21
21
  images
22
22
  end
@@ -13,7 +13,7 @@ module Bulbasaur
13
13
  images = Array.new
14
14
  @html.scan(EXTRACT_URL_PATTERN).each do |video|
15
15
  vid = get_vid(video)
16
- images << { url: image_url(vid) }
16
+ images << { url: image_url(vid), video_url: video, source: 'youtube' }
17
17
  end
18
18
  images
19
19
  end
@@ -2,8 +2,8 @@ module Bulbasaur
2
2
 
3
3
  module Version
4
4
  MAJOR = 0
5
- MINOR = 7
6
- PATCH = 2
5
+ MINOR = 8
6
+ PATCH = 0
7
7
  STRING = "#{MAJOR}.#{MINOR}.#{PATCH}"
8
8
  end
9
9
 
data/lib/bulbasaur.rb CHANGED
@@ -2,6 +2,7 @@ require "nokogiri"
2
2
  require "bulbasaur/extracts/extract_images_from_youtube"
3
3
  require "bulbasaur/extracts/extract_images_from_vimeo"
4
4
  require "bulbasaur/extracts/extract_images_from_html"
5
+ require "bulbasaur/extracts/extract_images_from_meta"
5
6
  require "bulbasaur/extracts/extract_images_from_all_resources"
6
7
  require "bulbasaur/extracts/extract_text_from_html.rb"
7
8
  require "bulbasaur/extracts/extract_meta_informations_from_html.rb"
@@ -14,6 +14,7 @@ RSpec.describe Bulbasaur::ExtractImagesFromAllResources do
14
14
 
15
15
  let(:html) do
16
16
  %Q(
17
+ <meta property="og:image" content="http://somewhere.to/get/an_image.jpg" />
17
18
  <p>
18
19
  <iframe width="560" height="315" src="https://www.youtube.com/embed/video0" frameborder="0" allowfullscreen></iframe>
19
20
  </p>
@@ -39,8 +40,8 @@ RSpec.describe Bulbasaur::ExtractImagesFromAllResources do
39
40
  )
40
41
  end
41
42
 
42
- it "Does return 15 itens" do
43
- expect(subject.size).to eq 15
43
+ it "Does return 16 items" do
44
+ expect(subject.size).to eq 16
44
45
  end
45
46
  end
46
47
  end
@@ -37,6 +37,10 @@ RSpec.describe Bulbasaur::ExtractImagesFromHTML do
37
37
  it "Does return the image alt" do
38
38
  expect(subject.first[:alt]).to eq "image alt test"
39
39
  end
40
+
41
+ it 'Does return the image source' do
42
+ expect(subject.first[:source]).to eq 'img'
43
+ end
40
44
  end
41
45
 
42
46
  context "When send html with a image style inline" do
@@ -59,6 +63,10 @@ RSpec.describe Bulbasaur::ExtractImagesFromHTML do
59
63
  it "Does return the image alt" do
60
64
  expect(subject.first[:alt]).to be_nil
61
65
  end
66
+
67
+ it 'Does return the image source' do
68
+ expect(subject.first[:source]).to eq 'style'
69
+ end
62
70
  end
63
71
 
64
72
  context 'When sending HTML with a link pointing to an image' do
@@ -90,25 +98,29 @@ RSpec.describe Bulbasaur::ExtractImagesFromHTML do
90
98
  end
91
99
 
92
100
  it 'Does return the image URL with parameters' do
93
- expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg?width=400&height=400', alt: nil
101
+ expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg?width=400&height=400', alt: nil, source: 'link'
94
102
  end
95
103
 
96
104
  it 'Does return the image URL without parameters' do
97
- expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg', alt: nil
105
+ expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg', alt: nil, source: 'link'
98
106
  end
99
107
 
100
108
  it 'Does return the image URL with tilde parameters' do
101
- expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg~original', alt: nil
109
+ expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg~original', alt: nil, source: 'link'
102
110
  end
103
111
 
104
112
  it 'Does return the image URL with upcased and special characters' do
105
- expect(subject).to include Hash url: 'http://somewhere.to/get/The_Original_Image%C3%A7a_3.JPG', alt: nil
113
+ expect(subject).to include Hash url: 'http://somewhere.to/get/The_Original_Image%C3%A7a_3.JPG', alt: nil, source: 'link'
106
114
  end
107
115
 
108
116
  it 'Does return the image alt' do
109
117
  expect(subject.first[:alt]).to be_nil
110
118
  end
111
119
 
120
+ it 'Does return the image source' do
121
+ expect(subject.first[:source]).to eq 'link'
122
+ end
123
+
112
124
  it 'Does not include links other than for images' do
113
125
  expect(subject).not_to include Hash url: 'http://somewhere.to/get/the_original_image.jpg.exe', alt: nil
114
126
  expect(subject).not_to include Hash url: 'http://somewhere.to/go/to/another_page.html', alt: nil
@@ -0,0 +1,45 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec.describe Bulbasaur::ExtractImagesFromMeta do
4
+ subject { described_class.new(html).call }
5
+
6
+ describe '#call' do
7
+ context 'when there are no image meta tags' do
8
+ let(:html) { %Q(<meta property="og:description" content="Just a RSpec test." />) }
9
+
10
+ it 'returns an empty array' do
11
+ expect(subject).to be_empty
12
+ end
13
+ end
14
+
15
+ context 'when there is one image meta tag' do
16
+ let :html do
17
+ %Q(
18
+ <meta property="og:image" content="http://somewhere.to/get/an_image.jpg" />
19
+ <meta property="og:description" content="Just a RSpec test." />
20
+ )
21
+ end
22
+
23
+ it 'returns the image URL found' do
24
+ expect(subject.count).to be 1
25
+ expect(subject).to include Hash url: 'http://somewhere.to/get/an_image.jpg', source: 'meta'
26
+ end
27
+ end
28
+
29
+ context 'when there are multiple image meta tags' do
30
+ let :html do
31
+ %Q(
32
+ <meta property="og:image" content="http://somewhere.to/get/an_image.jpg" />
33
+ <meta property="og:image" content="http://somewhere.to/get/another_image.jpg" />
34
+ <meta property="og:image" content="http://somewhere.to/get/a_third_image.jpg" />
35
+ <meta property="og:description" content="Just a RSpec test." />
36
+ )
37
+ end
38
+
39
+ it 'returns the image URLs found' do
40
+ expect(subject.count).to be 3
41
+ expect(subject.map { |meta| meta[:url] }).to include 'http://somewhere.to/get/an_image.jpg', 'http://somewhere.to/get/another_image.jpg', 'http://somewhere.to/get/a_third_image.jpg'
42
+ end
43
+ end
44
+ end
45
+ end
@@ -31,6 +31,8 @@ RSpec.describe Bulbasaur::ExtractImagesFromVimeo do
31
31
 
32
32
  it "Does return vime url" do
33
33
  expect(subject.first[:url]).to eq "https://i.vimeocdn.com/video/123456789_640.webp"
34
+ expect(subject.first[:source]).to eq 'vimeo'
35
+ expect(subject.first[:video_url]).to eq 'player.vimeo.com/video/123456789'
34
36
  end
35
37
  end
36
38
 
@@ -35,6 +35,8 @@ RSpec.describe Bulbasaur::ExtractImagesFromYoutube do
35
35
 
36
36
  it "Does return youtube url" do
37
37
  expect(subject.first[:url]).to eq "http://img.youtube.com/vi/123idfake321/maxresdefault.jpg"
38
+ expect(subject.first[:source]).to eq 'youtube'
39
+ expect(subject.first[:video_url]).to eq 'www.youtube.com/embed/123idfake321'
38
40
  end
39
41
  end
40
42
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: preadly-bulbasaur
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.2
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Magno Costa
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-13 00:00:00.000000000 Z
11
+ date: 2015-08-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -92,6 +92,7 @@ files:
92
92
  - lib/bulbasaur.rb
93
93
  - lib/bulbasaur/extracts/extract_images_from_all_resources.rb
94
94
  - lib/bulbasaur/extracts/extract_images_from_html.rb
95
+ - lib/bulbasaur/extracts/extract_images_from_meta.rb
95
96
  - lib/bulbasaur/extracts/extract_images_from_vimeo.rb
96
97
  - lib/bulbasaur/extracts/extract_images_from_youtube.rb
97
98
  - lib/bulbasaur/extracts/extract_meta_informations_from_html.rb
@@ -104,6 +105,7 @@ files:
104
105
  - lib/bulbasaur/version.rb
105
106
  - spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb
106
107
  - spec/bulbasaur/extracts/extract_images_from_html_spec.rb
108
+ - spec/bulbasaur/extracts/extract_images_from_meta_spec.rb
107
109
  - spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb
108
110
  - spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb
109
111
  - spec/bulbasaur/extracts/extract_inner_text_from_html_spec.rb
@@ -140,6 +142,7 @@ summary: Bulbasaur is a helper for crawler operations used in Pread.ly
140
142
  test_files:
141
143
  - spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb
142
144
  - spec/bulbasaur/extracts/extract_images_from_html_spec.rb
145
+ - spec/bulbasaur/extracts/extract_images_from_meta_spec.rb
143
146
  - spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb
144
147
  - spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb
145
148
  - spec/bulbasaur/extracts/extract_inner_text_from_html_spec.rb