preadly-bulbasaur 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Rakefile +11 -0
- data/bulbasaur.gemspec +23 -0
- data/lib/bulbasaur/extract_text_from_html.rb +7 -0
- data/lib/bulbasaur/extracts/extract_images_from_all_resources.rb +44 -0
- data/lib/bulbasaur/extracts/extract_images_from_html.rb +43 -0
- data/lib/bulbasaur/extracts/extract_images_from_vimeo.rb +34 -0
- data/lib/bulbasaur/extracts/extract_images_from_youtube.rb +32 -0
- data/lib/bulbasaur/replaces/replace_by_tag_image.rb +21 -0
- data/lib/bulbasaur/utils/normalize_url.rb +21 -0
- data/lib/bulbasaur/version.rb +10 -0
- data/lib/bulbasaur.rb +13 -0
- data/spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb +42 -0
- data/spec/bulbasaur/extracts/extract_images_from_html_spec.rb +91 -0
- data/spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb +60 -0
- data/spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb +59 -0
- data/spec/bulbasaur/replaces/replace_by_tag_image_spec.rb +58 -0
- data/spec/bulbasaur/utils/normalize_url_spec.rb +79 -0
- data/spec/bulbasaur_spec.rb +13 -0
- data/spec/spec_helper.rb +14 -0
- metadata +126 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 18d9a3fbb6070fcd08bef48ec132e2aad608844a
|
4
|
+
data.tar.gz: 76a06d3388fb891a8ce067f9e8305a5ef6c317af
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 347b7a826abc28ddedeb50985b09794f5199a9361c15fce10833831ff4b24adde43b83a7f23921a90aa284e77151761ce23ae4e7dcb40b3e84c5102e013f14a9
|
7
|
+
data.tar.gz: eb94820268a8855a67bbfec08a47e4de513cc5d5cbd3967885dfa96e76b45396a8381fdfdd4f36aa670ef4804327da26272dbb8d9583b6121a0271d23b7b4cba
|
data/Rakefile
ADDED
data/bulbasaur.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for preadly-bulbasaur.

# Put this gem's lib/ directory on the load path so the version file can be
# required by name; a "./lib/..." path would only resolve when the current
# working directory happens to be the gem root.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'bulbasaur/version'

Gem::Specification.new do |spec|
  spec.name          = "preadly-bulbasaur"
  spec.version       = Bulbasaur::Version::STRING
  spec.authors       = ["Magno Costa"]
  spec.email         = ["magnocosta.br@gmail.com"]
  # The same sentence is used for both the description and the summary.
  spec.description   = spec.summary = %q(Bulbasaur is a helper for crawler operations used in Pread.ly)
  spec.homepage      = "https://github.com/preadly/bulbasaur"

  # Packaged files: library sources, specs, Rakefile, README and the gemspec.
  spec.files         = Dir["{lib/**/*.rb,README.rdoc,spec/**/*.rb,Rakefile,*.gemspec}"]
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.6"
  spec.add_development_dependency "rake", "~> 10.4"
  spec.add_development_dependency "rspec", "~> 3.3"

  spec.add_dependency "nokogiri", "~> 1.6"
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module Bulbasaur

  # Runs every image extractor (plain HTML, YouTube, Vimeo) over the same
  # markup and concatenates their results.
  class ExtractImagesFromAllResources

    # @param html [String] markup handed unchanged to each extractor
    def initialize(html)
      @html = html
    end

    # @return [Array<Hash>] HTML images first, then YouTube thumbnails, then
    #   Vimeo thumbnails. An extractor that fails contributes no entries.
    def call
      extract_images_html(@html) +
        extract_images_youtube(@html) +
        extract_images_vimeo(@html)
    end

    private

    def extract_images_youtube(html)
      best_effort { Bulbasaur::ExtractImagesFromYoutube.new(html).call }
    end

    def extract_images_html(html)
      best_effort { Bulbasaur::ExtractImagesFromHTML.new(html).call }
    end

    def extract_images_vimeo(html)
      best_effort { Bulbasaur::ExtractImagesFromVimeo.new(html).call }
    end

    # Yields and returns the block's value, or [] when it raises.
    # Only StandardError is swallowed: rescuing Exception would also hide
    # interrupts, SystemExit and out-of-memory conditions.
    def best_effort
      yield
    rescue StandardError
      []
    end

  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Bulbasaur

  # Extracts image references from <img> tags and from CSS url(...) values
  # found anywhere in the markup.
  class ExtractImagesFromHTML

    # Matches the raw text between "url(" and the next ")"; surrounding
    # quotes, when present, are part of the match and are removed afterwards.
    CSS_IMPORT_URL_REGEX = /(?<=url\()['"]?.+?['"]?.+?(?=\))/
    # Not referenced inside this class; kept because it is a public constant.
    IMG_CANDIDATE_URL_REGEX = /https?:\/\/\S*\.(?:png|jpg|jpeg)(?!\.\S)/i

    # @param html [String] markup to inspect
    def initialize(html)
      @html = html
    end

    # @return [Array<Hash>] one {url:, alt:} hash per image; <img> tags come
    #   first, CSS url(...) references after them (their alt is nil).
    def call
      extract_images_by_tag_image + extract_images_by_tag_style
    end

    private

    # One entry per <img> element; url/alt are "" when the attribute is absent.
    def extract_images_by_tag_image
      Nokogiri::HTML(@html).xpath("//img").map do |item|
        create_struct(item.xpath("@src").text, item.xpath("@alt").text)
      end
    end

    # One entry per CSS url(...) occurrence in the raw markup.
    def extract_images_by_tag_style
      @html.scan(CSS_IMPORT_URL_REGEX).map do |raw|
        create_struct(unquote_css_url(raw))
      end
    end

    # url('a.jpg') and url("a.jpg") are captured with their quotes; strip
    # them (and surrounding blanks) so callers receive the bare URL.
    def unquote_css_url(raw)
      raw.strip.sub(/\A['"]/, "").sub(/['"]\z/, "").strip
    end

    def create_struct(url, alt=nil)
      { url: url, alt: alt }
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Bulbasaur

  # Finds Vimeo player URLs in markup and builds a thumbnail URL for each.
  class ExtractImagesFromVimeo

    # Sizes available: small '100x75', medium '200x150', large '640'.
    DEFAULT_SIZE = '640'
    # player.vimeo.com/v/<id>, /video/<id> or /<anything>&v=<id>. The free-form
    # part may not contain blanks, quotes or angle brackets, so a match can
    # never run from one URL into the next one on the same line.
    EXTRACT_URL_PATTERN = %r{player\.vimeo\.com/(?:v/|video/|[^\s"'<>]+?&(?:amp;)?v=)\w+}i
    # The id is the trailing word of a matched URL, whichever prefix led to it.
    EXTRACT_VID_PATTERN = %r{(?<=v/|v=|video/)(?<vid>\w+)\z}i

    # @param html [String] markup to scan
    def initialize(html)
      @html = html
    end

    # @return [Array<Hash>] one {url:} hash per player URL, in document order.
    def call
      @html.scan(EXTRACT_URL_PATTERN).each_with_object([]) do |video, images|
        vid = get_vid(video)
        # Skip a URL whose id cannot be parsed instead of raising on nil.
        images << { url: image_url_for(vid) } if vid
      end
    end

    private

    # @return [String, nil] the video id, or nil when none can be extracted
    def get_vid(video)
      found = EXTRACT_VID_PATTERN.match(video)
      found && found[:vid]
    end

    # NOTE(review): assumes the CDN thumbnail id equals the video id; confirm
    # against Vimeo's API before relying on these URLs.
    def image_url_for(vid)
      "https://i.vimeocdn.com/video/#{vid}_#{DEFAULT_SIZE}.webp"
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Bulbasaur

  # Finds YouTube video URLs in markup and builds a thumbnail URL for each.
  class ExtractImagesFromYoutube

    # www.youtube.com/v/<id>, /embed/<id> or /<anything>?v=<id> / &v=<id>.
    # The free-form part may not contain blanks, quotes or angle brackets, so
    # a match can never run from one URL into the next one on the same line.
    # Ids may contain "-" as well as word characters.
    EXTRACT_URL_PATTERN = %r{www\.youtube\.com/(?:v/|embed/|[^\s"'<>]+?(?:\?|&(?:amp;)?)v=)[\w\-]+}i
    # The id is the trailing token of a matched URL, whichever prefix led to it.
    EXTRACT_VID_PATTERN = %r{(?<=v/|v=|embed/)(?<vid>[\w\-]+)\z}i

    # @param html [String] markup to scan
    def initialize(html)
      @html = html
    end

    # @return [Array<Hash>] one {url:} hash per video URL, in document order
    #   (a video referenced twice yields two entries).
    def call
      @html.scan(EXTRACT_URL_PATTERN).each_with_object([]) do |video, images|
        vid = get_vid(video)
        # Skip a URL whose id cannot be parsed instead of raising on nil.
        images << { url: image_url_for(vid) } if vid
      end
    end

    private

    # @return [String, nil] the video id, or nil when none can be extracted
    def get_vid(video)
      found = EXTRACT_VID_PATTERN.match(video)
      found && found[:vid]
    end

    def image_url_for(vid)
      "http://img.youtube.com/vi/#{vid}/0.jpg"
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Bulbasaur

  # Rewrites the src attribute of <img> tags according to a replacement list.
  class ReplaceByTagImage

    # @param html [String] markup fragment to rewrite
    # @param image_replaces [Array<Hash>] entries shaped like
    #   {original_image_url: "old.jpg", url: "new.png"}
    def initialize(html, image_replaces = [])
      @html = html
      @image_replaces = image_replaces
    end

    # @return [String] the serialized fragment; images whose src has no
    #   matching entry are left untouched.
    def call
      fragment = Nokogiri::HTML::DocumentFragment.parse(@html)
      replacements = replacements_by_original_url
      fragment.css('img').each do |item|
        # A missing src is treated as "" (same as reading the attribute text).
        replace = replacements[item["src"].to_s]
        item.set_attribute("src", replace[:url]) if replace
      end
      fragment.to_s
    end

    private

    # Index built once per call so each <img> is a hash lookup rather than a
    # scan of the whole list. The first entry for a given original URL wins.
    def replacements_by_original_url
      @image_replaces.each_with_object({}) do |entry, index|
        key = entry[:original_image_url]
        index[key] = entry unless index.key?(key)
      end
    end

  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require "uri"

module Bulbasaur

  # Turns a possibly relative URL into an absolute, escaped one.
  class NormalizeURL

    # \A (not ^) so only a URL that *starts* with the scheme counts as absolute.
    ABSOLUTE_URL_PATTERN = %r{\Ahttps?://}

    # @param base_url [String] absolute URL that relative references resolve against
    # @param context_url [String] the URL to normalize, absolute or relative
    def initialize(base_url, context_url)
      @base_url = base_url
      @context_url = context_url
    end

    # @return [String] context_url escaped when it is already an http(s) URL,
    #   otherwise context_url joined onto base_url.
    # @raise [ArgumentError] when the URL cannot be escaped or joined
    def call
      if @context_url =~ ABSOLUTE_URL_PATTERN
        escape(@context_url)
      else
        URI.join(@base_url, @context_url).to_s
      end
    rescue StandardError
      raise ArgumentError, "Not possible normalize url, check the params [base_url: #{@base_url}, context_url: #{@context_url}]"
    end

    private

    # URI.encode / URI.escape no longer exist on Ruby 3.0+; the RFC 2396 parser
    # provides the same escaping. Older Rubies only know it as URI::Parser.
    def escape(url)
      parser = defined?(URI::RFC2396_Parser) ? URI::RFC2396_Parser.new : URI::Parser.new
      parser.escape(url)
    end

  end
end
|
data/lib/bulbasaur.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
require "bulbasaur/extracts/extract_images_from_youtube"
|
3
|
+
require "bulbasaur/extracts/extract_images_from_vimeo"
|
4
|
+
require "bulbasaur/extracts/extract_images_from_html"
|
5
|
+
require "bulbasaur/extracts/extract_images_from_all_resources"
|
6
|
+
require "bulbasaur/replaces/replace_by_tag_image"
|
7
|
+
require "bulbasaur/utils/normalize_url"
|
8
|
+
require "bulbasaur/version"
|
9
|
+
|
10
|
+
|
11
|
+
# Top-level namespace of the gem. The body is intentionally empty here: the
# classes living in this namespace are defined in the files required above.
module Bulbasaur
|
12
|
+
|
13
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require "spec_helper"

# Integration-style spec: one document mixing YouTube iframes, Vimeo iframes,
# <img> tags and inline CSS url(...) references.
RSpec.describe Bulbasaur::ExtractImagesFromAllResources do

  subject do
    described_class.new(html).call
  end

  describe "#call" do

    # 5 YouTube iframes (video0 appears twice), 4 Vimeo iframes,
    # 4 <img> tags and 2 CSS url(...) references: 15 entries in total.
    let(:html) do
      %Q(
        <p>
          <iframe width="560" height="315" src="https://www.youtube.com/embed/video0" frameborder="0" allowfullscreen></iframe>
        </p>
        <iframe width="560" height="315" src="https://www.youtube.com/embed/video1" frameborder="0" allowfullscreen></iframe>
        <iframe width="560" height="315" src="https://www.youtube.com/embed/video2" frameborder="0" allowfullscreen></iframe>
        <iframe width="560" height="315" src="https://www.youtube.com/embed/video3" frameborder="0" allowfullscreen></iframe>
        <p>
          <iframe src="https://player.vimeo.com/video/test0" width="500" height="281" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
          <iframe width="560" height="315" src="https://www.youtube.com/embed/video0" frameborder="0" allowfullscreen></iframe>
        </p>
        <iframe src="https://player.vimeo.com/video/test1" width="500" height="281" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
        <iframe src="https://player.vimeo.com/video/test2" width="500" height="281" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
        <iframe src="https://player.vimeo.com/video/test3" width="500" height="281" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
        <p>Hello world</p>
        <img src='image-0.jpg' alt='image-0' />
        <img src='image-1.png' alt='image-1' />
        <div style='backgroung-image: url(image-2.jpg)'>
          hello Ruby
          <img src='image-3.png' alt='image-3' />
        </div>
        <div style='background: url(image-4.png)'></div>
        <img src='image-5.png' alt='image-5' />
      )
    end

    it "Does return 15 items" do
      expect(subject.size).to eq 15
    end

    it "Does return images from every resource type" do
      expect(subject.map { |item| item[:url] }).to include(
        "image-0.jpg",
        "image-2.jpg",
        "http://img.youtube.com/vi/video0/0.jpg",
        "https://i.vimeocdn.com/video/test0_640.webp"
      )
    end
  end
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require "spec_helper"

# Covers <img> extraction, inline CSS url(...) extraction and both combined.
RSpec.describe Bulbasaur::ExtractImagesFromHTML do

  subject do
    described_class.new(html).call
  end

  describe "#call" do

    context "When send html without images" do

      let(:html) do
        "<p>Hello world</p>"
      end

      # The result is an empty array, not nil.
      it "Does return empty array" do
        expect(subject.size).to be_zero
      end
    end

    context "When send html with a image tag" do

      let(:html) do
        "<p>Hello world</p>
         <img src='image-name.jpg' alt='image alt test' />"
      end

      it "Does return a image array with 1 item" do
        expect(subject.size).to eq 1
      end

      it "Does return the image url" do
        expect(subject.first[:url]).to eq "image-name.jpg"
      end

      it "Does return the image alt" do
        expect(subject.first[:alt]).to eq "image alt test"
      end
    end

    context "When send html with a image style inline" do

      let(:html) do
        "<p>Hello world</p>
         <div style='backgroung-image: url(inline-image.jpg)'>
           hello Ruby
         </div>"
      end

      it "Does return a image array with 1 item" do
        expect(subject.size).to eq 1
      end

      it "Does return the image url" do
        expect(subject.first[:url]).to eq "inline-image.jpg"
      end

      it "Does return the image alt" do
        expect(subject.first[:alt]).to be_nil
      end
    end

    context "When send html with many images" do

      let(:html) do
        "<p>Hello world</p>
         <img src='image-0.jpg' alt='image-0' />
         <img src='image-1.png' alt='image-1' />
         <div style='backgroung-image: url(image-2.jpg)'>
           hello Ruby
           <img src='image-3.png' alt='image-3' />
         </div>
         <div style='background: url(image-4.png)'></div>
         <img src='image-5.png' alt='image-5' />"
      end

      it "Does return a image array with 6 items" do
        expect(subject.size).to eq 6
      end

      # contain_exactly also fails on unexpected extra entries, unlike include.
      it "Does return the image url of 6 items" do
        expect(subject.map { |item| item[:url] }).to contain_exactly(
          "image-0.jpg", "image-1.png", "image-2.jpg", "image-3.png", "image-4.png", "image-5.png"
        )
      end

      # The two CSS references carry no alt text, hence the two nils.
      it "Does return the image alt of 4 items" do
        expect(subject.map { |item| item[:alt] }).to contain_exactly(
          "image-0", "image-1", "image-3", "image-5", nil, nil
        )
      end
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'spec_helper'

# Covers documents with no, one and several Vimeo player iframes.
RSpec.describe Bulbasaur::ExtractImagesFromVimeo do

  subject do
    described_class.new(html).call
  end

  describe "#call" do

    context "When there is not vimeo images" do

      let(:html) do
        "<p>Lorem ipsum dolor sit amet</p>"
      end

      it "Does return empty array" do
        expect(subject.size).to be_zero
      end
    end

    context "When has one vimeo video" do

      let(:html) do
        %Q(<iframe src="https://player.vimeo.com/video/123456789" width="500" height="281" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>)
      end

      it "Does return array with 1 image" do
        expect(subject.size).to eq 1
      end

      it "Does return vimeo url" do
        expect(subject.first[:url]).to eq "https://i.vimeocdn.com/video/123456789_640.webp"
      end
    end

    context "When many vimeo videos" do

      # The YouTube iframe is deliberate noise: it must not be picked up.
      let(:html) do
        %Q(
          <p>
            <iframe src="https://player.vimeo.com/video/test0" width="500" height="281" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
            <iframe width="560" height="315" src="https://www.youtube.com/embed/video0" frameborder="0" allowfullscreen></iframe>
          </p>
          <iframe src="https://player.vimeo.com/video/test1" width="500" height="281" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
          <iframe src="https://player.vimeo.com/video/test2" width="500" height="281" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
          <iframe src="https://player.vimeo.com/video/test3" width="500" height="281" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
        )
      end

      it "Does return array with 4 images" do
        expect(subject.size).to eq 4
      end

      # contain_exactly also fails on unexpected extra entries, unlike include.
      it "Does return vimeo urls" do
        expect(subject.map { |video| video[:url] }).to contain_exactly(
          "https://i.vimeocdn.com/video/test0_640.webp",
          "https://i.vimeocdn.com/video/test1_640.webp",
          "https://i.vimeocdn.com/video/test2_640.webp",
          "https://i.vimeocdn.com/video/test3_640.webp"
        )
      end
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'spec_helper'

# Covers documents with no, one and several YouTube embed iframes.
RSpec.describe Bulbasaur::ExtractImagesFromYoutube do

  subject do
    described_class.new(html).call
  end

  describe "#call" do

    context "When there is not youtube images" do

      let(:html) do
        "<p>Lorem ipsum dolor sit amet</p>"
      end

      it "Does return empty array" do
        expect(subject.size).to be_zero
      end
    end

    context "When has one youtube video" do

      let(:html) do
        %Q(<iframe width="560" height="315" src="https://www.youtube.com/embed/123idfake321" frameborder="0" allowfullscreen></iframe>)
      end

      it "Does return array with 1 image" do
        expect(subject.size).to eq 1
      end

      it "Does return youtube url" do
        expect(subject.first[:url]).to eq "http://img.youtube.com/vi/123idfake321/0.jpg"
      end
    end

    context "When many youtube video" do

      let(:html) do
        %Q(
          <p>
            <iframe width="560" height="315" src="https://www.youtube.com/embed/video0" frameborder="0" allowfullscreen></iframe>
          </p>
          <iframe width="560" height="315" src="https://www.youtube.com/embed/video1" frameborder="0" allowfullscreen></iframe>
          <iframe width="560" height="315" src="https://www.youtube.com/embed/video2" frameborder="0" allowfullscreen></iframe>
          <iframe width="560" height="315" src="https://www.youtube.com/embed/video3" frameborder="0" allowfullscreen></iframe>
        )
      end

      it "Does return array with 4 images" do
        expect(subject.size).to eq 4
      end

      # contain_exactly also fails on unexpected extra entries, unlike include.
      it "Does return youtube urls" do
        expect(subject.map { |video| video[:url] }).to contain_exactly(
          "http://img.youtube.com/vi/video0/0.jpg",
          "http://img.youtube.com/vi/video1/0.jpg",
          "http://img.youtube.com/vi/video2/0.jpg",
          "http://img.youtube.com/vi/video3/0.jpg"
        )
      end
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require "spec_helper"

# Covers markup without <img> tags and markup where only some images have a
# replacement entry.
RSpec.describe Bulbasaur::ReplaceByTagImage do

  subject do
    described_class.new(html, image_replaces).call
  end

  describe "#call" do

    context "When there is not tag img" do

      let(:html) do
        "<p>Hello</p><div> Welcome </div>"
      end

      let(:image_replaces) do
        [{original_image_url:"test.jpg", url: "new-image.png"}]
      end

      it "Does return html" do
        expect(subject).to eq "<p>Hello</p><div> Welcome </div>"
      end
    end

    context "When there are many img tags" do

      # Fixture and expectation are both joined with a single space so the
      # comparison does not depend on how this file happens to be indented.
      let(:html) do
        [
          "<p>Lorem inpsu</p>",
          "<img src='test-0.jpg' alt='hello'>",
          "<img src='test-1.jpg' alt='hello'>",
          "<img src='test-1.jpg' alt='hello'>",
          "<img src='test-3.jpg' alt='hello'>",
          "<img src='test-2.jpg' alt='hello'>"
        ].join(" ")
      end

      let(:image_replaces) do
        [
          {original_image_url:"test-0.jpg", url: "new-image-0.png"},
          {original_image_url:"test-1.jpg", url: "new-image-1.png"},
          {original_image_url:"test-2.jpg", url: "new-image-2.png"}
        ]
      end

      # test-3.jpg has no replacement entry and must be left untouched.
      it "Does return html parsed" do
        expect(subject).to eq([
          '<p>Lorem inpsu</p>',
          '<img src="new-image-0.png" alt="hello">',
          '<img src="new-image-1.png" alt="hello">',
          '<img src="new-image-1.png" alt="hello">',
          '<img src="test-3.jpg" alt="hello">',
          '<img src="new-image-2.png" alt="hello">'
        ].join(" "))
      end
    end

  end

end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require "spec_helper"

# Covers absolute URLs (returned escaped), relative URLs (joined onto the
# base URL) and the error raised for an unusable base URL.
RSpec.describe Bulbasaur::NormalizeURL do

  subject do
    described_class.new(base_url, context_url).call
  end

  let(:base_url) do
    "http://pread.ly"
  end

  let(:context_url) do
    "http://www.test.com/hello.jpg"
  end

  describe "#call" do

    context "When use url normalized url: http://www.test.com/hello.jpg" do

      it "Does return url normalized: http://www.test.com/hello.jpg" do
        expect(subject).to eq "http://www.test.com/hello.jpg"
      end
    end

    context "When use url unnormalized url: test.jpg" do

      let(:context_url) do
        "test.jpg"
      end

      it "Does return url normalized: http://pread.ly/test.jpg" do
        expect(subject).to eq "http://pread.ly/test.jpg"
      end
    end

    context "When use url https normalized: https://www.test.com/hello.jpg" do

      let(:context_url) do
        "https://www.test.com/hello.jpg"
      end

      it "Does return url https normalized: https://www.test.com/hello.jpg" do
        expect(subject).to eq "https://www.test.com/hello.jpg"
      end
    end

    # Exercises the escaping branch, which the other examples never trigger.
    context "When use absolute url with a space: http://www.test.com/hello world.jpg" do

      let(:context_url) do
        "http://www.test.com/hello world.jpg"
      end

      it "Does return url escaped: http://www.test.com/hello%20world.jpg" do
        expect(subject).to eq "http://www.test.com/hello%20world.jpg"
      end
    end

    context "When use url not normalized with slash on base: hello.jpg" do

      let(:base_url) do
        "https://www.test.com/"
      end

      let(:context_url) do
        "hello.jpg"
      end

      it "Does return url normalized: https://www.test.com/hello.jpg" do
        expect(subject).to eq "https://www.test.com/hello.jpg"
      end
    end

    context "When base url not valid" do

      let(:base_url) do
        "test/httml"
      end

      let(:context_url) do
        "hello.html"
      end

      it "Does raise an ArgumentError" do
        expect{subject}.to raise_error ArgumentError
      end
    end

  end
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: preadly-bulbasaur
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Magno Costa
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-07-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.6'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.6'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.4'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.4'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.3'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.3'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: nokogiri
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.6'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '1.6'
|
69
|
+
description: Bulbasaur is a helper for crawler operations used in Pread.ly
|
70
|
+
email:
|
71
|
+
- magnocosta.br@gmail.com
|
72
|
+
executables: []
|
73
|
+
extensions: []
|
74
|
+
extra_rdoc_files: []
|
75
|
+
files:
|
76
|
+
- Rakefile
|
77
|
+
- bulbasaur.gemspec
|
78
|
+
- lib/bulbasaur.rb
|
79
|
+
- lib/bulbasaur/extract_text_from_html.rb
|
80
|
+
- lib/bulbasaur/extracts/extract_images_from_all_resources.rb
|
81
|
+
- lib/bulbasaur/extracts/extract_images_from_html.rb
|
82
|
+
- lib/bulbasaur/extracts/extract_images_from_vimeo.rb
|
83
|
+
- lib/bulbasaur/extracts/extract_images_from_youtube.rb
|
84
|
+
- lib/bulbasaur/replaces/replace_by_tag_image.rb
|
85
|
+
- lib/bulbasaur/utils/normalize_url.rb
|
86
|
+
- lib/bulbasaur/version.rb
|
87
|
+
- spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb
|
88
|
+
- spec/bulbasaur/extracts/extract_images_from_html_spec.rb
|
89
|
+
- spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb
|
90
|
+
- spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb
|
91
|
+
- spec/bulbasaur/replaces/replace_by_tag_image_spec.rb
|
92
|
+
- spec/bulbasaur/utils/normalize_url_spec.rb
|
93
|
+
- spec/bulbasaur_spec.rb
|
94
|
+
- spec/spec_helper.rb
|
95
|
+
homepage: https://github.com/preadly/bulbasaur
|
96
|
+
licenses: []
|
97
|
+
metadata: {}
|
98
|
+
post_install_message:
|
99
|
+
rdoc_options: []
|
100
|
+
require_paths:
|
101
|
+
- lib
|
102
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
103
|
+
requirements:
|
104
|
+
- - ">="
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '0'
|
107
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
108
|
+
requirements:
|
109
|
+
- - ">="
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: '0'
|
112
|
+
requirements: []
|
113
|
+
rubyforge_project:
|
114
|
+
rubygems_version: 2.2.2
|
115
|
+
signing_key:
|
116
|
+
specification_version: 4
|
117
|
+
summary: Bulbasaur is a helper for crawler operations used in Pread.ly
|
118
|
+
test_files:
|
119
|
+
- spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb
|
120
|
+
- spec/bulbasaur/extracts/extract_images_from_html_spec.rb
|
121
|
+
- spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb
|
122
|
+
- spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb
|
123
|
+
- spec/bulbasaur/replaces/replace_by_tag_image_spec.rb
|
124
|
+
- spec/bulbasaur/utils/normalize_url_spec.rb
|
125
|
+
- spec/bulbasaur_spec.rb
|
126
|
+
- spec/spec_helper.rb
|