preadly-bulbasaur 0.7.2 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/bulbasaur/extracts/extract_images_from_all_resources.rb +9 -0
- data/lib/bulbasaur/extracts/extract_images_from_html.rb +5 -5
- data/lib/bulbasaur/extracts/extract_images_from_meta.rb +21 -0
- data/lib/bulbasaur/extracts/extract_images_from_vimeo.rb +1 -1
- data/lib/bulbasaur/extracts/extract_images_from_youtube.rb +1 -1
- data/lib/bulbasaur/version.rb +2 -2
- data/lib/bulbasaur.rb +1 -0
- data/spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb +3 -2
- data/spec/bulbasaur/extracts/extract_images_from_html_spec.rb +16 -4
- data/spec/bulbasaur/extracts/extract_images_from_meta_spec.rb +45 -0
- data/spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb +2 -0
- data/spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb +2 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 975df2c7ace7a0947cd98b55bfcf896a970e5660
|
4
|
+
data.tar.gz: 5a5b086ba797209a5493d4d1db2026631f2091a2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a62e0c0d111f954d147003c25d37ede0ee09c1717ac3a15f8194d07d847e2cedf474667b03bf9c2a84f52a93d53920f6d5162c44b399ebbc9551b4db88bf65c9
|
7
|
+
data.tar.gz: fc8dd0769358ef5e52da253d0a412065667920a401c78a07eb52b8f14bb5368542db7098874987386b9ddd9952df0847d2c6c6acd43ef4a21765e596b75ec3f2
|
@@ -11,6 +11,7 @@ module Bulbasaur
|
|
11
11
|
images = images + extract_images_html(@html)
|
12
12
|
images = images + extract_images_youtube(@html)
|
13
13
|
images = images + extract_images_vimeo(@html)
|
14
|
+
images = images + extract_images_meta(@html)
|
14
15
|
images
|
15
16
|
end
|
16
17
|
|
@@ -40,5 +41,13 @@ module Bulbasaur
|
|
40
41
|
end
|
41
42
|
end
|
42
43
|
|
44
|
+
def extract_images_meta(html)
|
45
|
+
begin
|
46
|
+
Bulbasaur::ExtractImagesFromMeta.new(html).call
|
47
|
+
rescue Exception => e
|
48
|
+
[]
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
43
52
|
end
|
44
53
|
end
|
@@ -24,7 +24,7 @@ module Bulbasaur
|
|
24
24
|
Nokogiri::HTML(@html).xpath("//img").each do |item|
|
25
25
|
url = item.xpath("@src").text
|
26
26
|
alt = item.xpath("@alt").text
|
27
|
-
images << create_struct(url, alt)
|
27
|
+
images << create_struct(url, 'img', alt)
|
28
28
|
end
|
29
29
|
images
|
30
30
|
end
|
@@ -32,7 +32,7 @@ module Bulbasaur
|
|
32
32
|
def extract_images_by_tag_style
|
33
33
|
images = Array.new
|
34
34
|
@html.scan(CSS_IMPORT_URL_REGEX).each do |url|
|
35
|
-
images << create_struct(url)
|
35
|
+
images << create_struct(url, 'style')
|
36
36
|
end
|
37
37
|
images
|
38
38
|
end
|
@@ -41,13 +41,13 @@ module Bulbasaur
|
|
41
41
|
images = Array.new
|
42
42
|
Nokogiri::HTML(@html).xpath('//a').each do |link|
|
43
43
|
url = link.xpath('@href').text
|
44
|
-
images << create_struct(url) if url =~ IMG_CANDIDATE_URL_REGEX
|
44
|
+
images << create_struct(url, 'link') if url =~ IMG_CANDIDATE_URL_REGEX
|
45
45
|
end
|
46
46
|
images
|
47
47
|
end
|
48
48
|
|
49
|
-
def create_struct(url, alt=nil)
|
50
|
-
{url: url, alt: alt }
|
49
|
+
def create_struct(url, source, alt = nil)
|
50
|
+
{ url: url, alt: alt, source: source }
|
51
51
|
end
|
52
52
|
end
|
53
53
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Bulbasaur
|
2
|
+
class ExtractImagesFromMeta
|
3
|
+
PROPERTY = 'og:image'
|
4
|
+
|
5
|
+
def initialize(html)
|
6
|
+
@html = html
|
7
|
+
end
|
8
|
+
|
9
|
+
def call
|
10
|
+
meta_informations = Bulbasaur::ExtractMetaInformationsFromHTML.new(@html).call
|
11
|
+
image_urls = image_meta_tags(meta_informations).map { |meta| { url: meta[:value], source: 'meta' } }
|
12
|
+
image_urls
|
13
|
+
end
|
14
|
+
|
15
|
+
private
|
16
|
+
|
17
|
+
def image_meta_tags(meta_informations)
|
18
|
+
meta_informations.select { |meta| meta[:name] == PROPERTY }
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/lib/bulbasaur/version.rb
CHANGED
data/lib/bulbasaur.rb
CHANGED
@@ -2,6 +2,7 @@ require "nokogiri"
|
|
2
2
|
require "bulbasaur/extracts/extract_images_from_youtube"
|
3
3
|
require "bulbasaur/extracts/extract_images_from_vimeo"
|
4
4
|
require "bulbasaur/extracts/extract_images_from_html"
|
5
|
+
require "bulbasaur/extracts/extract_images_from_meta"
|
5
6
|
require "bulbasaur/extracts/extract_images_from_all_resources"
|
6
7
|
require "bulbasaur/extracts/extract_text_from_html.rb"
|
7
8
|
require "bulbasaur/extracts/extract_meta_informations_from_html.rb"
|
@@ -14,6 +14,7 @@ RSpec.describe Bulbasaur::ExtractImagesFromAllResources do
|
|
14
14
|
|
15
15
|
let(:html) do
|
16
16
|
%Q(
|
17
|
+
<meta property="og:image" content="http://somewhere.to/get/an_image.jpg" />
|
17
18
|
<p>
|
18
19
|
<iframe width="560" height="315" src="https://www.youtube.com/embed/video0" frameborder="0" allowfullscreen></iframe>
|
19
20
|
</p>
|
@@ -39,8 +40,8 @@ RSpec.describe Bulbasaur::ExtractImagesFromAllResources do
|
|
39
40
|
)
|
40
41
|
end
|
41
42
|
|
42
|
-
it "Does return 15 items" do
|
43
|
-
expect(subject.size).to eq 15
|
43
|
+
it "Does return 16 items" do
|
44
|
+
expect(subject.size).to eq 16
|
44
45
|
end
|
45
46
|
end
|
46
47
|
end
|
@@ -37,6 +37,10 @@ RSpec.describe Bulbasaur::ExtractImagesFromHTML do
|
|
37
37
|
it "Does return the image alt" do
|
38
38
|
expect(subject.first[:alt]).to eq "image alt test"
|
39
39
|
end
|
40
|
+
|
41
|
+
it 'Does return the image source' do
|
42
|
+
expect(subject.first[:source]).to eq 'img'
|
43
|
+
end
|
40
44
|
end
|
41
45
|
|
42
46
|
context "When send html with a image style inline" do
|
@@ -59,6 +63,10 @@ RSpec.describe Bulbasaur::ExtractImagesFromHTML do
|
|
59
63
|
it "Does return the image alt" do
|
60
64
|
expect(subject.first[:alt]).to be_nil
|
61
65
|
end
|
66
|
+
|
67
|
+
it 'Does return the image source' do
|
68
|
+
expect(subject.first[:source]).to eq 'style'
|
69
|
+
end
|
62
70
|
end
|
63
71
|
|
64
72
|
context 'When sending HTML with a link pointing to an image' do
|
@@ -90,25 +98,29 @@ RSpec.describe Bulbasaur::ExtractImagesFromHTML do
|
|
90
98
|
end
|
91
99
|
|
92
100
|
it 'Does return the image URL with parameters' do
|
93
|
-
expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg?width=400&height=400', alt: nil
|
101
|
+
expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg?width=400&height=400', alt: nil, source: 'link'
|
94
102
|
end
|
95
103
|
|
96
104
|
it 'Does return the image URL without parameters' do
|
97
|
-
expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg', alt: nil
|
105
|
+
expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg', alt: nil, source: 'link'
|
98
106
|
end
|
99
107
|
|
100
108
|
it 'Does return the image URL with tilde parameters' do
|
101
|
-
expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg~original', alt: nil
|
109
|
+
expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg~original', alt: nil, source: 'link'
|
102
110
|
end
|
103
111
|
|
104
112
|
it 'Does return the image URL with upcased and special characters' do
|
105
|
-
expect(subject).to include Hash url: 'http://somewhere.to/get/The_Original_Image%C3%A7a_3.JPG', alt: nil
|
113
|
+
expect(subject).to include Hash url: 'http://somewhere.to/get/The_Original_Image%C3%A7a_3.JPG', alt: nil, source: 'link'
|
106
114
|
end
|
107
115
|
|
108
116
|
it 'Does return the image alt' do
|
109
117
|
expect(subject.first[:alt]).to be_nil
|
110
118
|
end
|
111
119
|
|
120
|
+
it 'Does return the image source' do
|
121
|
+
expect(subject.first[:source]).to eq 'link'
|
122
|
+
end
|
123
|
+
|
112
124
|
it 'Does not include links other than for images' do
|
113
125
|
expect(subject).not_to include Hash url: 'http://somewhere.to/get/the_original_image.jpg.exe', alt: nil
|
114
126
|
expect(subject).not_to include Hash url: 'http://somewhere.to/go/to/another_page.html', alt: nil
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
RSpec.describe Bulbasaur::ExtractImagesFromMeta do
|
4
|
+
subject { described_class.new(html).call }
|
5
|
+
|
6
|
+
describe '#call' do
|
7
|
+
context 'when there are no image meta tags' do
|
8
|
+
let(:html) { %Q(<meta property="og:description" content="Just a RSpec test." />) }
|
9
|
+
|
10
|
+
it 'returns an empty array' do
|
11
|
+
expect(subject).to be_empty
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
context 'when there is one image meta tag' do
|
16
|
+
let :html do
|
17
|
+
%Q(
|
18
|
+
<meta property="og:image" content="http://somewhere.to/get/an_image.jpg" />
|
19
|
+
<meta property="og:description" content="Just a RSpec test." />
|
20
|
+
)
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'returns the image URL found' do
|
24
|
+
expect(subject.count).to be 1
|
25
|
+
expect(subject).to include Hash url: 'http://somewhere.to/get/an_image.jpg', source: 'meta'
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
context 'when there are multiple image meta tags' do
|
30
|
+
let :html do
|
31
|
+
%Q(
|
32
|
+
<meta property="og:image" content="http://somewhere.to/get/an_image.jpg" />
|
33
|
+
<meta property="og:image" content="http://somewhere.to/get/another_image.jpg" />
|
34
|
+
<meta property="og:image" content="http://somewhere.to/get/a_third_image.jpg" />
|
35
|
+
<meta property="og:description" content="Just a RSpec test." />
|
36
|
+
)
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'returns the image URLs found' do
|
40
|
+
expect(subject.count).to be 3
|
41
|
+
expect(subject.map { |meta| meta[:url] }).to include 'http://somewhere.to/get/an_image.jpg', 'http://somewhere.to/get/another_image.jpg', 'http://somewhere.to/get/a_third_image.jpg'
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -31,6 +31,8 @@ RSpec.describe Bulbasaur::ExtractImagesFromVimeo do
|
|
31
31
|
|
32
32
|
it "Does return vime url" do
|
33
33
|
expect(subject.first[:url]).to eq "https://i.vimeocdn.com/video/123456789_640.webp"
|
34
|
+
expect(subject.first[:source]).to eq 'vimeo'
|
35
|
+
expect(subject.first[:video_url]).to eq 'player.vimeo.com/video/123456789'
|
34
36
|
end
|
35
37
|
end
|
36
38
|
|
@@ -35,6 +35,8 @@ RSpec.describe Bulbasaur::ExtractImagesFromYoutube do
|
|
35
35
|
|
36
36
|
it "Does return youtube url" do
|
37
37
|
expect(subject.first[:url]).to eq "http://img.youtube.com/vi/123idfake321/maxresdefault.jpg"
|
38
|
+
expect(subject.first[:source]).to eq 'youtube'
|
39
|
+
expect(subject.first[:video_url]).to eq 'www.youtube.com/embed/123idfake321'
|
38
40
|
end
|
39
41
|
end
|
40
42
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: preadly-bulbasaur
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.2
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Magno Costa
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-08-
|
11
|
+
date: 2015-08-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -92,6 +92,7 @@ files:
|
|
92
92
|
- lib/bulbasaur.rb
|
93
93
|
- lib/bulbasaur/extracts/extract_images_from_all_resources.rb
|
94
94
|
- lib/bulbasaur/extracts/extract_images_from_html.rb
|
95
|
+
- lib/bulbasaur/extracts/extract_images_from_meta.rb
|
95
96
|
- lib/bulbasaur/extracts/extract_images_from_vimeo.rb
|
96
97
|
- lib/bulbasaur/extracts/extract_images_from_youtube.rb
|
97
98
|
- lib/bulbasaur/extracts/extract_meta_informations_from_html.rb
|
@@ -104,6 +105,7 @@ files:
|
|
104
105
|
- lib/bulbasaur/version.rb
|
105
106
|
- spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb
|
106
107
|
- spec/bulbasaur/extracts/extract_images_from_html_spec.rb
|
108
|
+
- spec/bulbasaur/extracts/extract_images_from_meta_spec.rb
|
107
109
|
- spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb
|
108
110
|
- spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb
|
109
111
|
- spec/bulbasaur/extracts/extract_inner_text_from_html_spec.rb
|
@@ -140,6 +142,7 @@ summary: Bulbasaur is a helper for crawler operations used in Pread.ly
|
|
140
142
|
test_files:
|
141
143
|
- spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb
|
142
144
|
- spec/bulbasaur/extracts/extract_images_from_html_spec.rb
|
145
|
+
- spec/bulbasaur/extracts/extract_images_from_meta_spec.rb
|
143
146
|
- spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb
|
144
147
|
- spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb
|
145
148
|
- spec/bulbasaur/extracts/extract_inner_text_from_html_spec.rb
|