preadly-bulbasaur 0.8.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/bulbasaur.rb +1 -0
- data/lib/bulbasaur/removals/remove_attributes.rb +4 -2
- data/lib/bulbasaur/removals/remove_tags.rb +18 -4
- data/lib/bulbasaur/utils/normalize_image_sources.rb +5 -4
- data/lib/bulbasaur/utils/normalize_image_src_set.rb +68 -0
- data/lib/bulbasaur/version.rb +2 -2
- data/spec/bulbasaur/removals/remove_tags_spec.rb +61 -5
- data/spec/bulbasaur/utils/normalize_image_srcset_spec.rb +122 -0
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a87289e0e17c9e9ae9d8f489c3704a14dff5d5ac
|
4
|
+
data.tar.gz: 64d2ec14ed814afe1997c2472678f1af7c3b9166
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d74058dc3e6928672b8fb1342e2d7b81db562110cf7bfa491b470838de5f19bd6cbb8ee36e8d929cdb5e4a6e31c6c56b0dda4de758b18efc71214e0a1ae9671a
|
7
|
+
data.tar.gz: 63c9260e6f33a3a114d46c8ea119f7d026a6e5cebd655b31131af8a775a86dd6e63c6bab260bcc3f7fd55aae58eee05a063fd49a925a2e096c98ce908c144347
|
data/lib/bulbasaur.rb
CHANGED
@@ -11,6 +11,7 @@ require "bulbasaur/removals/remove_attributes"
|
|
11
11
|
require "bulbasaur/replaces/replace_by_tag_image"
|
12
12
|
require "bulbasaur/utils/normalize_url"
|
13
13
|
require "bulbasaur/utils/normalize_image_sources"
|
14
|
+
require "bulbasaur/utils/normalize_image_src_set"
|
14
15
|
require "bulbasaur/version"
|
15
16
|
|
16
17
|
|
@@ -1,14 +1,16 @@
|
|
1
1
|
module Bulbasaur
|
2
2
|
class RemoveAttributes
|
3
|
+
|
3
4
|
def initialize(html, banned_attrs)
|
4
5
|
@html = html
|
5
6
|
@banned_attrs = banned_attrs
|
6
7
|
end
|
7
8
|
|
8
9
|
def call
|
9
|
-
parsed_html = Nokogiri::HTML::DocumentFragment.parse
|
10
|
+
parsed_html = Nokogiri::HTML::DocumentFragment.parse(@html)
|
10
11
|
@banned_attrs.each { |attr| parsed_html.xpath(".//@#{attr}").remove }
|
11
12
|
parsed_html.to_s
|
12
13
|
end
|
14
|
+
|
13
15
|
end
|
14
|
-
end
|
16
|
+
end
|
@@ -1,14 +1,28 @@
|
|
1
1
|
module Bulbasaur
|
2
2
|
class RemoveTags
|
3
|
-
|
3
|
+
|
4
|
+
SELECTORS_EMPTY_ANALYSE = "div, p"
|
5
|
+
|
6
|
+
def initialize(html, banned_tags, remove_empty_tags = false)
|
4
7
|
@html = html
|
5
8
|
@banned_tags = banned_tags
|
9
|
+
@remove_empty_tags = remove_empty_tags
|
6
10
|
end
|
7
|
-
|
11
|
+
|
8
12
|
def call
|
9
|
-
parsed_html = Nokogiri::HTML::DocumentFragment.parse
|
13
|
+
parsed_html = Nokogiri::HTML::DocumentFragment.parse(@html)
|
14
|
+
remove_empty_tags(parsed_html) if @remove_empty_tags
|
10
15
|
@banned_tags.each { |tag| parsed_html.css(tag).remove }
|
11
16
|
parsed_html.to_s
|
12
17
|
end
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
def remove_empty_tags(parsed_html)
|
22
|
+
parsed_html.css(SELECTORS_EMPTY_ANALYSE).each do |tag|
|
23
|
+
tag.remove if tag.content.strip.empty? && tag.children.select{ |c| c.name != "text" }.length == 0
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
13
27
|
end
|
14
|
-
end
|
28
|
+
end
|
@@ -1,5 +1,6 @@
|
|
1
1
|
module Bulbasaur
|
2
2
|
class NormalizeImageSources
|
3
|
+
|
3
4
|
def initialize(html, target_attrs)
|
4
5
|
@html = html
|
5
6
|
@target_attrs = target_attrs
|
@@ -8,7 +9,7 @@ module Bulbasaur
|
|
8
9
|
def call
|
9
10
|
parsed_html = Nokogiri::HTML::DocumentFragment.parse @html
|
10
11
|
parsed_html.css('img').each do |element|
|
11
|
-
check_for_attrs
|
12
|
+
check_for_attrs(element)
|
12
13
|
end
|
13
14
|
parsed_html.to_s
|
14
15
|
end
|
@@ -25,12 +26,12 @@ module Bulbasaur
|
|
25
26
|
end
|
26
27
|
|
27
28
|
def adjust(element, attr)
|
28
|
-
element.set_attribute
|
29
|
-
remove_target_attrs_from
|
29
|
+
element.set_attribute('src', element.xpath(attr).text)
|
30
|
+
remove_target_attrs_from(element)
|
30
31
|
end
|
31
32
|
|
32
33
|
def remove_target_attrs_from(element)
|
33
34
|
@target_attrs.each { |attr| element.xpath("@#{attr}").remove }
|
34
35
|
end
|
35
36
|
end
|
36
|
-
end
|
37
|
+
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module Bulbasaur
|
2
|
+
class NormalizeImageSrcSet
|
3
|
+
|
4
|
+
REGEX_FIND_WIDTH = /(\d+)w/i
|
5
|
+
REGEX_FIND_HEIGHT = /(\d+)h/i
|
6
|
+
|
7
|
+
def initialize(html)
|
8
|
+
@html = html
|
9
|
+
end
|
10
|
+
|
11
|
+
def call
|
12
|
+
parsed_html = Nokogiri::HTML::DocumentFragment.parse(@html)
|
13
|
+
parsed_html.css("img[srcset]").each do |img|
|
14
|
+
image = extract_src_set_attribute(img)
|
15
|
+
img.set_attribute("src", image[:url])
|
16
|
+
img.set_attribute("width", image[:width]) if image[:width]
|
17
|
+
img.set_attribute("height", image[:height]) if image[:height]
|
18
|
+
end
|
19
|
+
parsed_html.to_s
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
def extract_src_set_attribute(img)
|
25
|
+
itens = img.get_attribute("srcset").split(",")
|
26
|
+
images = []
|
27
|
+
itens.each do |item|
|
28
|
+
srcset_item = item.split(" ")
|
29
|
+
image_object = {
|
30
|
+
url: extract_url(srcset_item),
|
31
|
+
width: extract_width(srcset_item),
|
32
|
+
height: extract_height(srcset_item)
|
33
|
+
}
|
34
|
+
images << image_object
|
35
|
+
end
|
36
|
+
get_better_image(images)
|
37
|
+
end
|
38
|
+
|
39
|
+
def extract_width(itens)
|
40
|
+
extract_by_regex(itens, REGEX_FIND_WIDTH)
|
41
|
+
end
|
42
|
+
|
43
|
+
def extract_height(itens)
|
44
|
+
extract_by_regex(itens, REGEX_FIND_HEIGHT)
|
45
|
+
end
|
46
|
+
|
47
|
+
def extract_url(srcset_item)
|
48
|
+
srcset_item[0]
|
49
|
+
end
|
50
|
+
|
51
|
+
def extract_by_regex(itens, regex)
|
52
|
+
value = itens.select{ |item| item =~ regex }.first
|
53
|
+
value = value.match(regex).captures.first if value
|
54
|
+
value
|
55
|
+
end
|
56
|
+
|
57
|
+
def get_better_image(itens)
|
58
|
+
images = Array.new
|
59
|
+
images.concat itens.select{ |img| !img[:width] && !img[:height] }.each{ |img| img[:area] = 0 }
|
60
|
+
images.concat itens.select{ |img| !img[:width] && img[:height] }.each{ |img| img[:area] = img[:height] }
|
61
|
+
images.concat itens.select{ |img| img[:width] && !img[:height] }.each{ |img| img[:area] = img[:width] }
|
62
|
+
images.concat itens.select{ |img| img[:width] && img[:height] }.each{ |img| img[:area] = img[:width].to_i * img[:height].to_i }
|
63
|
+
images = images.sort { |a, b| b[:area] <=> a[:area] }
|
64
|
+
images.first
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
68
|
+
end
|
data/lib/bulbasaur/version.rb
CHANGED
@@ -3,11 +3,15 @@ require 'spec_helper'
|
|
3
3
|
RSpec.describe Bulbasaur::RemoveTags do
|
4
4
|
|
5
5
|
subject do
|
6
|
-
described_class.new(html, banned_tags).call
|
6
|
+
described_class.new(html, banned_tags, empty_tags).call
|
7
7
|
end
|
8
8
|
|
9
9
|
describe '#call' do
|
10
10
|
|
11
|
+
let(:empty_tags) do
|
12
|
+
false
|
13
|
+
end
|
14
|
+
|
11
15
|
let(:html) do
|
12
16
|
%[
|
13
17
|
<style>
|
@@ -18,6 +22,12 @@ RSpec.describe Bulbasaur::RemoveTags do
|
|
18
22
|
<input type="text">
|
19
23
|
</form>
|
20
24
|
<p>hello!</p>
|
25
|
+
<div class="inner top">
|
26
|
+
<p></p>
|
27
|
+
<div> </div>
|
28
|
+
</div>
|
29
|
+
<div></div>
|
30
|
+
<p></p>
|
21
31
|
]
|
22
32
|
end
|
23
33
|
|
@@ -26,25 +36,71 @@ RSpec.describe Bulbasaur::RemoveTags do
|
|
26
36
|
let(:banned_tags) do
|
27
37
|
[]
|
28
38
|
end
|
29
|
-
|
39
|
+
|
30
40
|
it 'returns the HTML code as it was before' do
|
31
41
|
expect(subject).to eq html
|
32
42
|
end
|
33
43
|
end
|
34
|
-
|
44
|
+
|
35
45
|
context 'when there are banned tags' do
|
36
46
|
|
37
47
|
let(:banned_tags) do
|
38
48
|
%w(form style)
|
39
49
|
end
|
40
|
-
|
50
|
+
|
41
51
|
it 'returns the HTML code without the banned tags' do
|
42
52
|
expect(subject.strip.gsub(/\n/, '').squeeze ' ').to eq %[
|
43
53
|
<div style="height: 100px; width: 100px;"></div>
|
44
54
|
<p>hello!</p>
|
55
|
+
<div class="inner top">
|
56
|
+
<p></p>
|
57
|
+
<div> </div>
|
58
|
+
</div>
|
59
|
+
<div></div>
|
60
|
+
<p></p>
|
45
61
|
].strip.gsub(/\n/, '').squeeze ' '
|
46
62
|
end
|
47
63
|
end
|
48
|
-
|
64
|
+
|
65
|
+
context 'when the HTML code with the banned tags and defined no empty tags' do
|
66
|
+
|
67
|
+
let(:html) do
|
68
|
+
%[
|
69
|
+
<style>
|
70
|
+
div { color: green; width: 1024px; }
|
71
|
+
</style>
|
72
|
+
<div style="height: 100px; width: 100px;"></div>
|
73
|
+
<form>
|
74
|
+
<input type="text">
|
75
|
+
</form>
|
76
|
+
<p>hello!</p>
|
77
|
+
<div class="inner top">
|
78
|
+
<p></p>
|
79
|
+
<div><img src='test.jpg'></div>
|
80
|
+
</div>
|
81
|
+
<div></div>
|
82
|
+
<p> </p>
|
83
|
+
<div class="helo"> </div>
|
84
|
+
<p></p>
|
85
|
+
]
|
86
|
+
end
|
87
|
+
|
88
|
+
let(:banned_tags) do
|
89
|
+
%w(form style)
|
90
|
+
end
|
49
91
|
|
92
|
+
let(:empty_tags) do
|
93
|
+
true
|
94
|
+
end
|
95
|
+
|
96
|
+
it 'returns the HTML code without the banned tags and without empty tags' do
|
97
|
+
expect(subject.strip.gsub(/\n/, '').squeeze ' ').to eq %[
|
98
|
+
<p>hello!</p>
|
99
|
+
<div class="inner top">
|
100
|
+
<div><img src="test.jpg"></div>
|
101
|
+
</div>
|
102
|
+
].strip.gsub(/\n/, '').squeeze ' '
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
50
106
|
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Bulbasaur::NormalizeImageSrcSet do
|
4
|
+
|
5
|
+
subject do
|
6
|
+
described_class.new(html).call
|
7
|
+
end
|
8
|
+
|
9
|
+
describe "#call" do
|
10
|
+
|
11
|
+
context "When there are no srcset attribute" do
|
12
|
+
|
13
|
+
let(:html) do
|
14
|
+
<<-HTML
|
15
|
+
<img src="http://somewhere.to/get/a-pixel.gif" data-lazy-src="http://somewhere.to/get/the-real-image.jpg" alt="Image" width="800" height="1200">
|
16
|
+
<img lazy-data="https://place.where/an-image/has/no-src-tag.jpg">
|
17
|
+
HTML
|
18
|
+
end
|
19
|
+
|
20
|
+
it "Does returns the HTML code as it was before" do
|
21
|
+
expect(subject).to eq(html)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
context "When there are srcset attribute without size defined" do
|
26
|
+
|
27
|
+
let(:html) do
|
28
|
+
<<-HTML
|
29
|
+
<img src="http://somewhere.to/get/a-pixel.gif" srcset="http://bulbasaur.com/imageA.jpg" alt="Image" width="800" height="1200">
|
30
|
+
<img srcset="http://bulbasaur.com/imageB.jpg">
|
31
|
+
<img srcset="http://bulbasaur.com/imageC.jpg,http://bulbasaur.com/imageC.jpg,http://bulbasaur.com/imageD.jpg">
|
32
|
+
HTML
|
33
|
+
end
|
34
|
+
|
35
|
+
let(:html_normalized) do
|
36
|
+
<<-HTML
|
37
|
+
<img src="http://bulbasaur.com/imageA.jpg" srcset="http://bulbasaur.com/imageA.jpg" alt="Image" width="800" height="1200">
|
38
|
+
<img srcset="http://bulbasaur.com/imageB.jpg" src="http://bulbasaur.com/imageB.jpg">
|
39
|
+
<img srcset="http://bulbasaur.com/imageC.jpg,http://bulbasaur.com/imageC.jpg,http://bulbasaur.com/imageD.jpg" src="http://bulbasaur.com/imageC.jpg">
|
40
|
+
HTML
|
41
|
+
end
|
42
|
+
|
43
|
+
it "Does returns the HTML code with src replaced by srcset attribute" do
|
44
|
+
expect(subject).to eq(html_normalized)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
context "When there are srcset attribute with size defined" do
|
49
|
+
|
50
|
+
let(:html) do
|
51
|
+
<<-HTML
|
52
|
+
<img src="http://somewhere.to/get/a-pixel.gif" srcset="http://bulbasaur.com/imageA.jpg 100w" alt="Image" width="800" height="1200">
|
53
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 200w">
|
54
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 200h">
|
55
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 20w 30h">
|
56
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 2x">
|
57
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 200W 100H">
|
58
|
+
<img srcset="http://bulbasaur.com/imageC.jpg 100w,http://bulbasaur.com/imageC.jpg 200w,http://bulbasaur.com/imageD.jpg 300w">
|
59
|
+
HTML
|
60
|
+
end
|
61
|
+
|
62
|
+
let(:html_normalized) do
|
63
|
+
<<-HTML
|
64
|
+
<img src="http://bulbasaur.com/imageA.jpg" srcset="http://bulbasaur.com/imageA.jpg 100w" alt="Image" width="100" height="1200">
|
65
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 200w" src="http://bulbasaur.com/imageB.jpg" width="200">
|
66
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 200h" src="http://bulbasaur.com/imageB.jpg" height="200">
|
67
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 20w 30h" src="http://bulbasaur.com/imageB.jpg" width="20" height="30">
|
68
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 2x" src="http://bulbasaur.com/imageB.jpg">
|
69
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 200W 100H" src="http://bulbasaur.com/imageB.jpg" width="200" height="100">
|
70
|
+
<img srcset="http://bulbasaur.com/imageC.jpg 100w,http://bulbasaur.com/imageC.jpg 200w,http://bulbasaur.com/imageD.jpg 300w" src="http://bulbasaur.com/imageD.jpg" width="300">
|
71
|
+
HTML
|
72
|
+
end
|
73
|
+
|
74
|
+
it "Does return the HTML code with src and with definitions" do
|
75
|
+
expect(subject).to eq(html_normalized)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
context "When there are no valid params on srcset" do
|
80
|
+
|
81
|
+
let(:html) do
|
82
|
+
<<-HTML
|
83
|
+
<img srcset="http://bulbasaur.com/imageA.jpg 100w 200H 12h asd hausd uahsd 12813nusda" alt="Image">
|
84
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 200w 1x 10x">
|
85
|
+
<img srcset="http://bulbasaur.com/imageC.jpg 100w 50h,http://bulbasaur.com/imageC.jpg 600w 500h,http://bulbasaur.com/imageD.jpg 300w 400h">
|
86
|
+
HTML
|
87
|
+
end
|
88
|
+
|
89
|
+
let(:html_normalized) do
|
90
|
+
<<-HTML
|
91
|
+
<img srcset="http://bulbasaur.com/imageA.jpg 100w 200H 12h asd hausd uahsd 12813nusda" alt="Image" src="http://bulbasaur.com/imageA.jpg" width="100" height="200">
|
92
|
+
<img srcset="http://bulbasaur.com/imageB.jpg 200w 1x 10x" src="http://bulbasaur.com/imageB.jpg" width="200">
|
93
|
+
<img srcset="http://bulbasaur.com/imageC.jpg 100w 50h,http://bulbasaur.com/imageC.jpg 600w 500h,http://bulbasaur.com/imageD.jpg 300w 400h" src="http://bulbasaur.com/imageC.jpg" width="600" height="500">
|
94
|
+
HTML
|
95
|
+
end
|
96
|
+
|
97
|
+
it "Does returns the HTML code and ignores invalid params" do
|
98
|
+
expect(subject).to eq(html_normalized)
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
context "When there are diferents sizes to the same image" do
|
103
|
+
|
104
|
+
let(:html) do
|
105
|
+
<<-HTML
|
106
|
+
<img srcset="http://bulbasaur.com/imageC.jpg 100w 50h,http://bulbasaur.com/imageC.jpg 200w 300h,http://bulbasaur.com/imageD.jpg 300w 400h">
|
107
|
+
HTML
|
108
|
+
end
|
109
|
+
|
110
|
+
let(:html_normalized_) do
|
111
|
+
<<-HTML
|
112
|
+
<img srcset="http://bulbasaur.com/imageC.jpg 100w 50h,http://bulbasaur.com/imageC.jpg 200w 300h,http://bulbasaur.com/imageD.jpg 300w 400h" src="http://bulbasaur.com/imageD.jpg" width="300" height="400">
|
113
|
+
HTML
|
114
|
+
end
|
115
|
+
|
116
|
+
it "Does returns the HTML code with better image" do
|
117
|
+
expect(subject).to eq(html_normalized_)
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
end
|
122
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: preadly-bulbasaur
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Magno Costa
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-12-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -101,6 +101,7 @@ files:
|
|
101
101
|
- lib/bulbasaur/removals/remove_tags.rb
|
102
102
|
- lib/bulbasaur/replaces/replace_by_tag_image.rb
|
103
103
|
- lib/bulbasaur/utils/normalize_image_sources.rb
|
104
|
+
- lib/bulbasaur/utils/normalize_image_src_set.rb
|
104
105
|
- lib/bulbasaur/utils/normalize_url.rb
|
105
106
|
- lib/bulbasaur/version.rb
|
106
107
|
- spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb
|
@@ -114,6 +115,7 @@ files:
|
|
114
115
|
- spec/bulbasaur/removals/remove_tags_spec.rb
|
115
116
|
- spec/bulbasaur/replaces/replace_by_tag_image_spec.rb
|
116
117
|
- spec/bulbasaur/utils/normalize_image_sources_spec.rb
|
118
|
+
- spec/bulbasaur/utils/normalize_image_srcset_spec.rb
|
117
119
|
- spec/bulbasaur/utils/normalize_url_spec.rb
|
118
120
|
- spec/spec_helper.rb
|
119
121
|
homepage: https://github.com/preadly/bulbasaur
|
@@ -135,7 +137,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
135
137
|
version: '0'
|
136
138
|
requirements: []
|
137
139
|
rubyforge_project:
|
138
|
-
rubygems_version: 2.
|
140
|
+
rubygems_version: 2.4.8
|
139
141
|
signing_key:
|
140
142
|
specification_version: 4
|
141
143
|
summary: Bulbasaur is a helper for crawler operations used in Pread.ly
|
@@ -151,5 +153,6 @@ test_files:
|
|
151
153
|
- spec/bulbasaur/removals/remove_tags_spec.rb
|
152
154
|
- spec/bulbasaur/replaces/replace_by_tag_image_spec.rb
|
153
155
|
- spec/bulbasaur/utils/normalize_image_sources_spec.rb
|
156
|
+
- spec/bulbasaur/utils/normalize_image_srcset_spec.rb
|
154
157
|
- spec/bulbasaur/utils/normalize_url_spec.rb
|
155
158
|
- spec/spec_helper.rb
|