preadly-bulbasaur 0.7.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/bulbasaur/extracts/extract_images_from_all_resources.rb +9 -0
- data/lib/bulbasaur/extracts/extract_images_from_html.rb +5 -5
- data/lib/bulbasaur/extracts/extract_images_from_meta.rb +21 -0
- data/lib/bulbasaur/extracts/extract_images_from_vimeo.rb +1 -1
- data/lib/bulbasaur/extracts/extract_images_from_youtube.rb +1 -1
- data/lib/bulbasaur/version.rb +2 -2
- data/lib/bulbasaur.rb +1 -0
- data/spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb +3 -2
- data/spec/bulbasaur/extracts/extract_images_from_html_spec.rb +16 -4
- data/spec/bulbasaur/extracts/extract_images_from_meta_spec.rb +45 -0
- data/spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb +2 -0
- data/spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb +2 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 975df2c7ace7a0947cd98b55bfcf896a970e5660
|
4
|
+
data.tar.gz: 5a5b086ba797209a5493d4d1db2026631f2091a2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a62e0c0d111f954d147003c25d37ede0ee09c1717ac3a15f8194d07d847e2cedf474667b03bf9c2a84f52a93d53920f6d5162c44b399ebbc9551b4db88bf65c9
|
7
|
+
data.tar.gz: fc8dd0769358ef5e52da253d0a412065667920a401c78a07eb52b8f14bb5368542db7098874987386b9ddd9952df0847d2c6c6acd43ef4a21765e596b75ec3f2
|
@@ -11,6 +11,7 @@ module Bulbasaur
|
|
11
11
|
images = images + extract_images_html(@html)
|
12
12
|
images = images + extract_images_youtube(@html)
|
13
13
|
images = images + extract_images_vimeo(@html)
|
14
|
+
images = images + extract_images_meta(@html)
|
14
15
|
images
|
15
16
|
end
|
16
17
|
|
@@ -40,5 +41,13 @@ module Bulbasaur
|
|
40
41
|
end
|
41
42
|
end
|
42
43
|
|
44
|
+
def extract_images_meta(html)
|
45
|
+
begin
|
46
|
+
Bulbasaur::ExtractImagesFromMeta.new(html).call
|
47
|
+
rescue Exception => e
|
48
|
+
[]
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
43
52
|
end
|
44
53
|
end
|
@@ -24,7 +24,7 @@ module Bulbasaur
|
|
24
24
|
Nokogiri::HTML(@html).xpath("//img").each do |item|
|
25
25
|
url = item.xpath("@src").text
|
26
26
|
alt = item.xpath("@alt").text
|
27
|
-
images << create_struct(url, alt)
|
27
|
+
images << create_struct(url, 'img', alt)
|
28
28
|
end
|
29
29
|
images
|
30
30
|
end
|
@@ -32,7 +32,7 @@ module Bulbasaur
|
|
32
32
|
def extract_images_by_tag_style
|
33
33
|
images = Array.new
|
34
34
|
@html.scan(CSS_IMPORT_URL_REGEX).each do |url|
|
35
|
-
images << create_struct(url)
|
35
|
+
images << create_struct(url, 'style')
|
36
36
|
end
|
37
37
|
images
|
38
38
|
end
|
@@ -41,13 +41,13 @@ module Bulbasaur
|
|
41
41
|
images = Array.new
|
42
42
|
Nokogiri::HTML(@html).xpath('//a').each do |link|
|
43
43
|
url = link.xpath('@href').text
|
44
|
-
images << create_struct(url) if url =~ IMG_CANDIDATE_URL_REGEX
|
44
|
+
images << create_struct(url, 'link') if url =~ IMG_CANDIDATE_URL_REGEX
|
45
45
|
end
|
46
46
|
images
|
47
47
|
end
|
48
48
|
|
49
|
-
def create_struct(url, alt=nil)
|
50
|
-
{url: url, alt: alt }
|
49
|
+
def create_struct(url, source, alt = nil)
|
50
|
+
{ url: url, alt: alt, source: source }
|
51
51
|
end
|
52
52
|
end
|
53
53
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Bulbasaur
|
2
|
+
class ExtractImagesFromMeta
|
3
|
+
PROPERTY = 'og:image'
|
4
|
+
|
5
|
+
def initialize(html)
|
6
|
+
@html = html
|
7
|
+
end
|
8
|
+
|
9
|
+
def call
|
10
|
+
meta_informations = Bulbasaur::ExtractMetaInformationsFromHTML.new(@html).call
|
11
|
+
image_urls = image_meta_tags(meta_informations).map { |meta| { url: meta[:value], source: 'meta' } }
|
12
|
+
image_urls
|
13
|
+
end
|
14
|
+
|
15
|
+
private
|
16
|
+
|
17
|
+
def image_meta_tags(meta_informations)
|
18
|
+
meta_informations.select { |meta| meta[:name] == PROPERTY }
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/lib/bulbasaur/version.rb
CHANGED
data/lib/bulbasaur.rb
CHANGED
@@ -2,6 +2,7 @@ require "nokogiri"
|
|
2
2
|
require "bulbasaur/extracts/extract_images_from_youtube"
|
3
3
|
require "bulbasaur/extracts/extract_images_from_vimeo"
|
4
4
|
require "bulbasaur/extracts/extract_images_from_html"
|
5
|
+
require "bulbasaur/extracts/extract_images_from_meta"
|
5
6
|
require "bulbasaur/extracts/extract_images_from_all_resources"
|
6
7
|
require "bulbasaur/extracts/extract_text_from_html.rb"
|
7
8
|
require "bulbasaur/extracts/extract_meta_informations_from_html.rb"
|
@@ -14,6 +14,7 @@ RSpec.describe Bulbasaur::ExtractImagesFromAllResources do
|
|
14
14
|
|
15
15
|
let(:html) do
|
16
16
|
%Q(
|
17
|
+
<meta property="og:image" content="http://somewhere.to/get/an_image.jpg" />
|
17
18
|
<p>
|
18
19
|
<iframe width="560" height="315" src="https://www.youtube.com/embed/video0" frameborder="0" allowfullscreen></iframe>
|
19
20
|
</p>
|
@@ -39,8 +40,8 @@ RSpec.describe Bulbasaur::ExtractImagesFromAllResources do
|
|
39
40
|
)
|
40
41
|
end
|
41
42
|
|
42
|
-
it "Does return
|
43
|
-
expect(subject.size).to eq
|
43
|
+
it "Does return 16 items" do
|
44
|
+
expect(subject.size).to eq 16
|
44
45
|
end
|
45
46
|
end
|
46
47
|
end
|
@@ -37,6 +37,10 @@ RSpec.describe Bulbasaur::ExtractImagesFromHTML do
|
|
37
37
|
it "Does return the image alt" do
|
38
38
|
expect(subject.first[:alt]).to eq "image alt test"
|
39
39
|
end
|
40
|
+
|
41
|
+
it 'Does return the image source' do
|
42
|
+
expect(subject.first[:source]).to eq 'img'
|
43
|
+
end
|
40
44
|
end
|
41
45
|
|
42
46
|
context "When send html with a image style inline" do
|
@@ -59,6 +63,10 @@ RSpec.describe Bulbasaur::ExtractImagesFromHTML do
|
|
59
63
|
it "Does return the image alt" do
|
60
64
|
expect(subject.first[:alt]).to be_nil
|
61
65
|
end
|
66
|
+
|
67
|
+
it 'Does return the image source' do
|
68
|
+
expect(subject.first[:source]).to eq 'style'
|
69
|
+
end
|
62
70
|
end
|
63
71
|
|
64
72
|
context 'When sending HTML with a link pointing to an image' do
|
@@ -90,25 +98,29 @@ RSpec.describe Bulbasaur::ExtractImagesFromHTML do
|
|
90
98
|
end
|
91
99
|
|
92
100
|
it 'Does return the image URL with parameters' do
|
93
|
-
expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg?width=400&height=400', alt: nil
|
101
|
+
expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg?width=400&height=400', alt: nil, source: 'link'
|
94
102
|
end
|
95
103
|
|
96
104
|
it 'Does return the image URL without parameters' do
|
97
|
-
expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg', alt: nil
|
105
|
+
expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg', alt: nil, source: 'link'
|
98
106
|
end
|
99
107
|
|
100
108
|
it 'Does return the image URL with tilde parameters' do
|
101
|
-
expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg~original', alt: nil
|
109
|
+
expect(subject).to include Hash url: 'http://somewhere.to/get/the_original_image.jpg~original', alt: nil, source: 'link'
|
102
110
|
end
|
103
111
|
|
104
112
|
it 'Does return the image URL with upcased and special characters' do
|
105
|
-
expect(subject).to include Hash url: 'http://somewhere.to/get/The_Original_Image%C3%A7a_3.JPG', alt: nil
|
113
|
+
expect(subject).to include Hash url: 'http://somewhere.to/get/The_Original_Image%C3%A7a_3.JPG', alt: nil, source: 'link'
|
106
114
|
end
|
107
115
|
|
108
116
|
it 'Does return the image alt' do
|
109
117
|
expect(subject.first[:alt]).to be_nil
|
110
118
|
end
|
111
119
|
|
120
|
+
it 'Does return the image source' do
|
121
|
+
expect(subject.first[:source]).to eq 'link'
|
122
|
+
end
|
123
|
+
|
112
124
|
it 'Does not include links other than for images' do
|
113
125
|
expect(subject).not_to include Hash url: 'http://somewhere.to/get/the_original_image.jpg.exe', alt: nil
|
114
126
|
expect(subject).not_to include Hash url: 'http://somewhere.to/go/to/another_page.html', alt: nil
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
RSpec.describe Bulbasaur::ExtractImagesFromMeta do
|
4
|
+
subject { described_class.new(html).call }
|
5
|
+
|
6
|
+
describe '#call' do
|
7
|
+
context 'when there are no image meta tags' do
|
8
|
+
let(:html) { %Q(<meta property="og:description" content="Just a RSpec test." />) }
|
9
|
+
|
10
|
+
it 'returns an empty array' do
|
11
|
+
expect(subject).to be_empty
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
context 'when there is one image meta tag' do
|
16
|
+
let :html do
|
17
|
+
%Q(
|
18
|
+
<meta property="og:image" content="http://somewhere.to/get/an_image.jpg" />
|
19
|
+
<meta property="og:description" content="Just a RSpec test." />
|
20
|
+
)
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'returns the image URL found' do
|
24
|
+
expect(subject.count).to be 1
|
25
|
+
expect(subject).to include Hash url: 'http://somewhere.to/get/an_image.jpg', source: 'meta'
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
context 'when there are multiple image meta tags' do
|
30
|
+
let :html do
|
31
|
+
%Q(
|
32
|
+
<meta property="og:image" content="http://somewhere.to/get/an_image.jpg" />
|
33
|
+
<meta property="og:image" content="http://somewhere.to/get/another_image.jpg" />
|
34
|
+
<meta property="og:image" content="http://somewhere.to/get/a_third_image.jpg" />
|
35
|
+
<meta property="og:description" content="Just a RSpec test." />
|
36
|
+
)
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'returns the image URLs found' do
|
40
|
+
expect(subject.count).to be 3
|
41
|
+
expect(subject.map { |meta| meta[:url] }).to include 'http://somewhere.to/get/an_image.jpg', 'http://somewhere.to/get/another_image.jpg', 'http://somewhere.to/get/a_third_image.jpg'
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -31,6 +31,8 @@ RSpec.describe Bulbasaur::ExtractImagesFromVimeo do
|
|
31
31
|
|
32
32
|
it "Does return vime url" do
|
33
33
|
expect(subject.first[:url]).to eq "https://i.vimeocdn.com/video/123456789_640.webp"
|
34
|
+
expect(subject.first[:source]).to eq 'vimeo'
|
35
|
+
expect(subject.first[:video_url]).to eq 'player.vimeo.com/video/123456789'
|
34
36
|
end
|
35
37
|
end
|
36
38
|
|
@@ -35,6 +35,8 @@ RSpec.describe Bulbasaur::ExtractImagesFromYoutube do
|
|
35
35
|
|
36
36
|
it "Does return youtube url" do
|
37
37
|
expect(subject.first[:url]).to eq "http://img.youtube.com/vi/123idfake321/maxresdefault.jpg"
|
38
|
+
expect(subject.first[:source]).to eq 'youtube'
|
39
|
+
expect(subject.first[:video_url]).to eq 'www.youtube.com/embed/123idfake321'
|
38
40
|
end
|
39
41
|
end
|
40
42
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: preadly-bulbasaur
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.2
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Magno Costa
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-08-
|
11
|
+
date: 2015-08-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -92,6 +92,7 @@ files:
|
|
92
92
|
- lib/bulbasaur.rb
|
93
93
|
- lib/bulbasaur/extracts/extract_images_from_all_resources.rb
|
94
94
|
- lib/bulbasaur/extracts/extract_images_from_html.rb
|
95
|
+
- lib/bulbasaur/extracts/extract_images_from_meta.rb
|
95
96
|
- lib/bulbasaur/extracts/extract_images_from_vimeo.rb
|
96
97
|
- lib/bulbasaur/extracts/extract_images_from_youtube.rb
|
97
98
|
- lib/bulbasaur/extracts/extract_meta_informations_from_html.rb
|
@@ -104,6 +105,7 @@ files:
|
|
104
105
|
- lib/bulbasaur/version.rb
|
105
106
|
- spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb
|
106
107
|
- spec/bulbasaur/extracts/extract_images_from_html_spec.rb
|
108
|
+
- spec/bulbasaur/extracts/extract_images_from_meta_spec.rb
|
107
109
|
- spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb
|
108
110
|
- spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb
|
109
111
|
- spec/bulbasaur/extracts/extract_inner_text_from_html_spec.rb
|
@@ -140,6 +142,7 @@ summary: Bulbasaur is a helper for crawler operations used in Pread.ly
|
|
140
142
|
test_files:
|
141
143
|
- spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb
|
142
144
|
- spec/bulbasaur/extracts/extract_images_from_html_spec.rb
|
145
|
+
- spec/bulbasaur/extracts/extract_images_from_meta_spec.rb
|
143
146
|
- spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb
|
144
147
|
- spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb
|
145
148
|
- spec/bulbasaur/extracts/extract_inner_text_from_html_spec.rb
|