preadly-bulbasaur 0.8.2 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/bulbasaur.rb +1 -0
- data/lib/bulbasaur/removals/remove_attributes.rb +4 -2
- data/lib/bulbasaur/removals/remove_tags.rb +18 -4
- data/lib/bulbasaur/utils/normalize_image_sources.rb +5 -4
- data/lib/bulbasaur/utils/normalize_image_src_set.rb +68 -0
- data/lib/bulbasaur/version.rb +2 -2
- data/spec/bulbasaur/removals/remove_tags_spec.rb +61 -5
- data/spec/bulbasaur/utils/normalize_image_srcset_spec.rb +122 -0
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a87289e0e17c9e9ae9d8f489c3704a14dff5d5ac
|
4
|
+
data.tar.gz: 64d2ec14ed814afe1997c2472678f1af7c3b9166
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d74058dc3e6928672b8fb1342e2d7b81db562110cf7bfa491b470838de5f19bd6cbb8ee36e8d929cdb5e4a6e31c6c56b0dda4de758b18efc71214e0a1ae9671a
|
7
|
+
data.tar.gz: 63c9260e6f33a3a114d46c8ea119f7d026a6e5cebd655b31131af8a775a86dd6e63c6bab260bcc3f7fd55aae58eee05a063fd49a925a2e096c98ce908c144347
|
data/lib/bulbasaur.rb
CHANGED
@@ -11,6 +11,7 @@ require "bulbasaur/removals/remove_attributes"
|
|
11
11
|
require "bulbasaur/replaces/replace_by_tag_image"
|
12
12
|
require "bulbasaur/utils/normalize_url"
|
13
13
|
require "bulbasaur/utils/normalize_image_sources"
|
14
|
+
require "bulbasaur/utils/normalize_image_src_set"
|
14
15
|
require "bulbasaur/version"
|
15
16
|
|
16
17
|
|
@@ -1,14 +1,16 @@
|
|
1
1
|
module Bulbasaur
|
2
2
|
class RemoveAttributes
|
3
|
+
|
3
4
|
def initialize(html, banned_attrs)
|
4
5
|
@html = html
|
5
6
|
@banned_attrs = banned_attrs
|
6
7
|
end
|
7
8
|
|
8
9
|
def call
|
9
|
-
parsed_html = Nokogiri::HTML::DocumentFragment.parse
|
10
|
+
parsed_html = Nokogiri::HTML::DocumentFragment.parse(@html)
|
10
11
|
@banned_attrs.each { |attr| parsed_html.xpath(".//@#{attr}").remove }
|
11
12
|
parsed_html.to_s
|
12
13
|
end
|
14
|
+
|
13
15
|
end
|
14
|
-
end
|
16
|
+
end
|
@@ -1,14 +1,28 @@
|
|
1
1
|
module Bulbasaur
|
2
2
|
class RemoveTags
|
3
|
-
|
3
|
+
|
4
|
+
SELECTORS_EMPTY_ANALYSE = "div, p"
|
5
|
+
|
6
|
+
def initialize(html, banned_tags, remove_empty_tags = false)
|
4
7
|
@html = html
|
5
8
|
@banned_tags = banned_tags
|
9
|
+
@remove_empty_tags = remove_empty_tags
|
6
10
|
end
|
7
|
-
|
11
|
+
|
8
12
|
def call
|
9
|
-
parsed_html = Nokogiri::HTML::DocumentFragment.parse
|
13
|
+
parsed_html = Nokogiri::HTML::DocumentFragment.parse(@html)
|
14
|
+
remove_empty_tags(parsed_html) if @remove_empty_tags
|
10
15
|
@banned_tags.each { |tag| parsed_html.css(tag).remove }
|
11
16
|
parsed_html.to_s
|
12
17
|
end
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
def remove_empty_tags(parsed_html)
|
22
|
+
parsed_html.css(SELECTORS_EMPTY_ANALYSE).each do |tag|
|
23
|
+
tag.remove if tag.content.strip.empty? && tag.children.select{ |c| c.name != "text" }.length == 0
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
13
27
|
end
|
14
|
-
end
|
28
|
+
end
|
@@ -1,5 +1,6 @@
|
|
1
1
|
module Bulbasaur
|
2
2
|
class NormalizeImageSources
|
3
|
+
|
3
4
|
def initialize(html, target_attrs)
|
4
5
|
@html = html
|
5
6
|
@target_attrs = target_attrs
|
@@ -8,7 +9,7 @@ module Bulbasaur
|
|
8
9
|
def call
|
9
10
|
parsed_html = Nokogiri::HTML::DocumentFragment.parse @html
|
10
11
|
parsed_html.css('img').each do |element|
|
11
|
-
check_for_attrs
|
12
|
+
check_for_attrs(element)
|
12
13
|
end
|
13
14
|
parsed_html.to_s
|
14
15
|
end
|
@@ -25,12 +26,12 @@ module Bulbasaur
|
|
25
26
|
end
|
26
27
|
|
27
28
|
def adjust(element, attr)
|
28
|
-
element.set_attribute
|
29
|
-
remove_target_attrs_from
|
29
|
+
element.set_attribute('src', element.xpath(attr).text)
|
30
|
+
remove_target_attrs_from(element)
|
30
31
|
end
|
31
32
|
|
32
33
|
def remove_target_attrs_from(element)
|
33
34
|
@target_attrs.each { |attr| element.xpath("@#{attr}").remove }
|
34
35
|
end
|
35
36
|
end
|
36
|
-
end
|
37
|
+
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module Bulbasaur
|
2
|
+
class NormalizeImageSrcSet
|
3
|
+
|
4
|
+
REGEX_FIND_WIDTH = /(\d+)w/i
|
5
|
+
REGEX_FIND_HEIGHT = /(\d+)h/i
|
6
|
+
|
7
|
+
def initialize(html)
|
8
|
+
@html = html
|
9
|
+
end
|
10
|
+
|
11
|
+
def call
|
12
|
+
parsed_html = Nokogiri::HTML::DocumentFragment.parse(@html)
|
13
|
+
parsed_html.css("img[srcset]").each do |img|
|
14
|
+
image = extract_src_set_attribute(img)
|
15
|
+
img.set_attribute("src", image[:url])
|
16
|
+
img.set_attribute("width", image[:width]) if image[:width]
|
17
|
+
img.set_attribute("height", image[:height]) if image[:height]
|
18
|
+
end
|
19
|
+
parsed_html.to_s
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
def extract_src_set_attribute(img)
|
25
|
+
itens = img.get_attribute("srcset").split(",")
|
26
|
+
images = []
|
27
|
+
itens.each do |item|
|
28
|
+
srcset_item = item.split(" ")
|
29
|
+
image_object = {
|
30
|
+
url: extract_url(srcset_item),
|
31
|
+
width: extract_width(srcset_item),
|
32
|
+
height: extract_height(srcset_item)
|
33
|
+
}
|
34
|
+
images << image_object
|
35
|
+
end
|
36
|
+
get_better_image(images)
|
37
|
+
end
|
38
|
+
|
39
|
+
def extract_width(itens)
|
40
|
+
extract_by_regex(itens, REGEX_FIND_WIDTH)
|
41
|
+
end
|
42
|
+
|
43
|
+
def extract_height(itens)
|
44
|
+
extract_by_regex(itens, REGEX_FIND_HEIGHT)
|
45
|
+
end
|
46
|
+
|
47
|
+
def extract_url(srcset_item)
|
48
|
+
srcset_item[0]
|
49
|
+
end
|
50
|
+
|
51
|
+
def extract_by_regex(itens, regex)
|
52
|
+
value = itens.select{ |item| item =~ regex }.first
|
53
|
+
value = value.match(regex).captures.first if value
|
54
|
+
value
|
55
|
+
end
|
56
|
+
|
57
|
+
def get_better_image(itens)
|
58
|
+
images = Array.new
|
59
|
+
images.concat itens.select{ |img| !img[:width] && !img[:height] }.each{ |img| img[:area] = 0 }
|
60
|
+
images.concat itens.select{ |img| !img[:width] && img[:height] }.each{ |img| img[:area] = img[:height] }
|
61
|
+
images.concat itens.select{ |img| img[:width] && !img[:height] }.each{ |img| img[:area] = img[:width] }
|
62
|
+
images.concat itens.select{ |img| img[:width] && img[:height] }.each{ |img| img[:area] = img[:width].to_i * img[:height].to_i }
|
63
|
+
images = images.sort { |a, b| b[:area] <=> a[:area] }
|
64
|
+
images.first
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
68
|
+
end
|
data/lib/bulbasaur/version.rb
CHANGED
@@ -3,11 +3,15 @@ require 'spec_helper'
|
|
3
3
|
RSpec.describe Bulbasaur::RemoveTags do
|
4
4
|
|
5
5
|
subject do
|
6
|
-
described_class.new(html, banned_tags).call
|
6
|
+
described_class.new(html, banned_tags, empty_tags).call
|
7
7
|
end
|
8
8
|
|
9
9
|
describe '#call' do
|
10
10
|
|
11
|
+
let(:empty_tags) do
|
12
|
+
false
|
13
|
+
end
|
14
|
+
|
11
15
|
let(:html) do
|
12
16
|
%[
|
13
17
|
<style>
|
@@ -18,6 +22,12 @@ RSpec.describe Bulbasaur::RemoveTags do
|
|
18
22
|
<input type="text">
|
19
23
|
</form>
|
20
24
|
<p>hello!</p>
|
25
|
+
<div class="inner top">
|
26
|
+
<p></p>
|
27
|
+
<div> </div>
|
28
|
+
</div>
|
29
|
+
<div></div>
|
30
|
+
<p></p>
|
21
31
|
]
|
22
32
|
end
|
23
33
|
|
@@ -26,25 +36,71 @@ RSpec.describe Bulbasaur::RemoveTags do
|
|
26
36
|
let(:banned_tags) do
|
27
37
|
[]
|
28
38
|
end
|
29
|
-
|
39
|
+
|
30
40
|
it 'returns the HTML code as it was before' do
|
31
41
|
expect(subject).to eq html
|
32
42
|
end
|
33
43
|
end
|
34
|
-
|
44
|
+
|
35
45
|
context 'when there are banned tags' do
|
36
46
|
|
37
47
|
let(:banned_tags) do
|
38
48
|
%w(form style)
|
39
49
|
end
|
40
|
-
|
50
|
+
|
41
51
|
it 'returns the HTML code without the banned tags' do
|
42
52
|
expect(subject.strip.gsub(/\n/, '').squeeze ' ').to eq %[
|
43
53
|
<div style="height: 100px; width: 100px;"></div>
|
44
54
|
<p>hello!</p>
|
55
|
+
<div class="inner top">
|
56
|
+
<p></p>
|
57
|
+
<div> </div>
|
58
|
+
</div>
|
59
|
+
<div></div>
|
60
|
+
<p></p>
|
45
61
|
].strip.gsub(/\n/, '').squeeze ' '
|
46
62
|
end
|
47
63
|
end
|
48
|
-
|
64
|
+
|
65
|
+
context 'when the HTML code with the banned tags and defined no empty tags' do
|
66
|
+
|
67
|
+
let(:html) do
|
68
|
+
%[
|
69
|
+
<style>
|
70
|
+
div { color: green; width: 1024px; }
|
71
|
+
</style>
|
72
|
+
<div style="height: 100px; width: 100px;"></div>
|
73
|
+
<form>
|
74
|
+
<input type="text">
|
75
|
+
</form>
|
76
|
+
<p>hello!</p>
|
77
|
+
<div class="inner top">
|
78
|
+
<p></p>
|
79
|
+
<div><img src='test.jpg'></div>
|
80
|
+
</div>
|
81
|
+
<div></div>
|
82
|
+
<p> </p>
|
83
|
+
<div class="helo"> </div>
|
84
|
+
<p></p>
|
85
|
+
]
|
86
|
+
end
|
87
|
+
|
88
|
+
let(:banned_tags) do
|
89
|
+
%w(form style)
|
90
|
+
end
|
49
91
|
|
92
|
+
let(:empty_tags) do
|
93
|
+
true
|
94
|
+
end
|
95
|
+
|
96
|
+
it 'returns the HTML code without the banned tags and without empty tags' do
|
97
|
+
expect(subject.strip.gsub(/\n/, '').squeeze ' ').to eq %[
|
98
|
+
<p>hello!</p>
|
99
|
+
<div class="inner top">
|
100
|
+
<div><img src="test.jpg"></div>
|
101
|
+
</div>
|
102
|
+
].strip.gsub(/\n/, '').squeeze ' '
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
50
106
|
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Bulbasaur::NormalizeImageSrcSet do
|
4
|
+
|
5
|
+
subject do
|
6
|
+
described_class.new(html).call
|
7
|
+
end
|
8
|
+
|
9
|
+
describe "#call" do
|
10
|
+
|
11
|
+
context "When there are no srcset attribute" do
|
12
|
+
|
13
|
+
let(:html) do
|
14
|
+
<<-HTML
|
15
|
+
<img src="http://somewhere.to/get/a-pixel.gif" data-lazy-src="http://somewhere.to/get/the-real-image.jpg" alt="Image" width="800" height="1200">
|
16
|
+
<img lazy-data="https://place.where/an-image/has/no-src-tag.jpg">
|
17
|
+
HTML
|
18
|
+
end
|
19
|
+
|
20
|
+
it "Does returns the HTML code as it was before" do
|
21
|
+
expect(subject).to eq(html)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
context "When there are srcset attribute without size defined" do
|
26
|
+
|
27
|
+
let(:html) do
|
28
|
+
<<-HTML
|
29
|
+
<img src="http://somewhere.to/get/a-pixel.gif" srcset="http://bulbasaur.com/imageA.jpg" alt="Image" width="800" height="1200">
|
30
|
+
<img srcset="http://bulbasaur.com/imageB.jpg">
|
31
|
+
<img srcset="http://bulbasaur.com/imageC.jpg,http://bulbasaur.com/imageC.jpg,http://bulbasaur.com/imageD.jpg">
|
32
|
+
HTML
|
33
|
+
end
|
34
|
+
|
35
|
+
let(:html_normalized) do
|
36
|
+
<<-HTML
|
37
|
+
<img src="http://bulbasaur.com/imageA.jpg" srcset="http://bulbasaur.com/imageA.jpg" alt="Image" width="800" height="1200">
|
38
|
+
<img srcset="http://bulbasaur.com/imageB.jpg" src="http://bulbasaur.com/imageB.jpg">
|
39
|
+
<img srcset="http://bulbasaur.com/imageC.jpg,http://bulbasaur.com/imageC.jpg,http://bulbasaur.com/imageD.jpg" src="http://bulbasaur.com/imageC.jpg">
|
40
|
+
HTML
|
41
|
+
end
|
42
|
+
|
43
|
+
it "Does returns the HTML code with src replaced by srcset attribute" do
|
44
|
+
expect(subject).to eq(html_normalized)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
context "When there are srcset attribute with size defined" do
|
49
|
+
|
50
|
+
let(:html) do
|
51
|
+
<<-HTML
|
52
|
+
<img src="http://somewhere.to/get/a-pixel.gif" srcset="http://bulbasaur.com/imageA.jpg 100w" alt="Image" width="800" height="1200">
|
53
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 200w">
|
54
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 200h">
|
55
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 20w 30h">
|
56
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 2x">
|
57
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 200W 100H">
|
58
|
+
<img srcset="http://bulbasaur.com/imageC.jpg 100w,http://bulbasaur.com/imageC.jpg 200w,http://bulbasaur.com/imageD.jpg 300w">
|
59
|
+
HTML
|
60
|
+
end
|
61
|
+
|
62
|
+
let(:html_normalized) do
|
63
|
+
<<-HTML
|
64
|
+
<img src="http://bulbasaur.com/imageA.jpg" srcset="http://bulbasaur.com/imageA.jpg 100w" alt="Image" width="100" height="1200">
|
65
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 200w" src="http://bulbasaur.com/imageB.jpg" width="200">
|
66
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 200h" src="http://bulbasaur.com/imageB.jpg" height="200">
|
67
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 20w 30h" src="http://bulbasaur.com/imageB.jpg" width="20" height="30">
|
68
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 2x" src="http://bulbasaur.com/imageB.jpg">
|
69
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 200W 100H" src="http://bulbasaur.com/imageB.jpg" width="200" height="100">
|
70
|
+
<img srcset="http://bulbasaur.com/imageC.jpg 100w,http://bulbasaur.com/imageC.jpg 200w,http://bulbasaur.com/imageD.jpg 300w" src="http://bulbasaur.com/imageD.jpg" width="300">
|
71
|
+
HTML
|
72
|
+
end
|
73
|
+
|
74
|
+
it "Does return the HTML code with src and with definitions" do
|
75
|
+
expect(subject).to eq(html_normalized)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
context "When there are no valid params on srcset" do
|
80
|
+
|
81
|
+
let(:html) do
|
82
|
+
<<-HTML
|
83
|
+
<img srcset="http://bulbasaur.com/imageA.jpg 100w 200H 12h asd hausd uahsd 12813nusda" alt="Image">
|
84
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 200w 1x 10x">
|
85
|
+
<img srcset="http://bulbasaur.com/imageC.jpg 100w 50h,http://bulbasaur.com/imageC.jpg 600w 500h,http://bulbasaur.com/imageD.jpg 300w 400h">
|
86
|
+
HTML
|
87
|
+
end
|
88
|
+
|
89
|
+
let(:html_normalized) do
|
90
|
+
<<-HTML
|
91
|
+
<img srcset="http://bulbasaur.com/imageA.jpg 100w 200H 12h asd hausd uahsd 12813nusda" alt="Image" src="http://bulbasaur.com/imageA.jpg" width="100" height="200">
|
92
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 200w 1x 10x" src="http://bulbasaur.com/imageB.jpg" width="200">
|
93
|
+
<img srcset="http://bulbasaur.com/imageC.jpg 100w 50h,http://bulbasaur.com/imageC.jpg 600w 500h,http://bulbasaur.com/imageD.jpg 300w 400h" src="http://bulbasaur.com/imageC.jpg" width="600" height="500">
|
94
|
+
HTML
|
95
|
+
end
|
96
|
+
|
97
|
+
it "Does returns the HTML code and ignores invalid params" do
|
98
|
+
expect(subject).to eq(html_normalized)
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
context "When there are diferents sizes to the same image" do
|
103
|
+
|
104
|
+
let(:html) do
|
105
|
+
<<-HTML
|
106
|
+
<img srcset="http://bulbasaur.com/imageC.jpg 100w 50h,http://bulbasaur.com/imageC.jpg 200w 300h,http://bulbasaur.com/imageD.jpg 300w 400h">
|
107
|
+
HTML
|
108
|
+
end
|
109
|
+
|
110
|
+
let(:html_normalized_) do
|
111
|
+
<<-HTML
|
112
|
+
<img srcset="http://bulbasaur.com/imageC.jpg 100w 50h,http://bulbasaur.com/imageC.jpg 200w 300h,http://bulbasaur.com/imageD.jpg 300w 400h" src="http://bulbasaur.com/imageD.jpg" width="300" height="400">
|
113
|
+
HTML
|
114
|
+
end
|
115
|
+
|
116
|
+
it "Does returns the HTML code with better image" do
|
117
|
+
expect(subject).to eq(html_normalized_)
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
end
|
122
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: preadly-bulbasaur
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Magno Costa
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-12-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -101,6 +101,7 @@ files:
|
|
101
101
|
- lib/bulbasaur/removals/remove_tags.rb
|
102
102
|
- lib/bulbasaur/replaces/replace_by_tag_image.rb
|
103
103
|
- lib/bulbasaur/utils/normalize_image_sources.rb
|
104
|
+
- lib/bulbasaur/utils/normalize_image_src_set.rb
|
104
105
|
- lib/bulbasaur/utils/normalize_url.rb
|
105
106
|
- lib/bulbasaur/version.rb
|
106
107
|
- spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb
|
@@ -114,6 +115,7 @@ files:
|
|
114
115
|
- spec/bulbasaur/removals/remove_tags_spec.rb
|
115
116
|
- spec/bulbasaur/replaces/replace_by_tag_image_spec.rb
|
116
117
|
- spec/bulbasaur/utils/normalize_image_sources_spec.rb
|
118
|
+
- spec/bulbasaur/utils/normalize_image_srcset_spec.rb
|
117
119
|
- spec/bulbasaur/utils/normalize_url_spec.rb
|
118
120
|
- spec/spec_helper.rb
|
119
121
|
homepage: https://github.com/preadly/bulbasaur
|
@@ -135,7 +137,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
135
137
|
version: '0'
|
136
138
|
requirements: []
|
137
139
|
rubyforge_project:
|
138
|
-
rubygems_version: 2.
|
140
|
+
rubygems_version: 2.4.8
|
139
141
|
signing_key:
|
140
142
|
specification_version: 4
|
141
143
|
summary: Bulbasaur is a helper for crawler operations used in Pread.ly
|
@@ -151,5 +153,6 @@ test_files:
|
|
151
153
|
- spec/bulbasaur/removals/remove_tags_spec.rb
|
152
154
|
- spec/bulbasaur/replaces/replace_by_tag_image_spec.rb
|
153
155
|
- spec/bulbasaur/utils/normalize_image_sources_spec.rb
|
156
|
+
- spec/bulbasaur/utils/normalize_image_srcset_spec.rb
|
154
157
|
- spec/bulbasaur/utils/normalize_url_spec.rb
|
155
158
|
- spec/spec_helper.rb
|