preadly-bulbasaur 0.8.2 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8260cd6a3e468ab1139c0096ee1cbb5086fe7ff1
4
- data.tar.gz: 146d9e064d05db1eb3c19c7cdc7290c6027fa79a
3
+ metadata.gz: a87289e0e17c9e9ae9d8f489c3704a14dff5d5ac
4
+ data.tar.gz: 64d2ec14ed814afe1997c2472678f1af7c3b9166
5
5
  SHA512:
6
- metadata.gz: 61721cf5b99c6edba03b754fe98b863bd9747ab0bd279d4b3023becfc160309a591d931a2b8b814aab69884c940e026c72d8dc52022769101ef9f79f4da90b49
7
- data.tar.gz: f8da5d762adf404102767f71c0c68ea99a3f49fbbb716612f01e95f9fda6ab8a5c982068d35d14232bdb8d033bfdcefab69d628c36a3fbdeb54dab390ab490d7
6
+ metadata.gz: d74058dc3e6928672b8fb1342e2d7b81db562110cf7bfa491b470838de5f19bd6cbb8ee36e8d929cdb5e4a6e31c6c56b0dda4de758b18efc71214e0a1ae9671a
7
+ data.tar.gz: 63c9260e6f33a3a114d46c8ea119f7d026a6e5cebd655b31131af8a775a86dd6e63c6bab260bcc3f7fd55aae58eee05a063fd49a925a2e096c98ce908c144347
data/lib/bulbasaur.rb CHANGED
@@ -11,6 +11,7 @@ require "bulbasaur/removals/remove_attributes"
11
11
  require "bulbasaur/replaces/replace_by_tag_image"
12
12
  require "bulbasaur/utils/normalize_url"
13
13
  require "bulbasaur/utils/normalize_image_sources"
14
+ require "bulbasaur/utils/normalize_image_src_set"
14
15
  require "bulbasaur/version"
15
16
 
16
17
 
data/lib/bulbasaur/removals/remove_attributes.rb CHANGED
@@ -1,14 +1,16 @@
1
1
  module Bulbasaur
2
2
  class RemoveAttributes
3
+
3
4
  def initialize(html, banned_attrs)
4
5
  @html = html
5
6
  @banned_attrs = banned_attrs
6
7
  end
7
8
 
8
9
  def call
9
- parsed_html = Nokogiri::HTML::DocumentFragment.parse @html
10
+ parsed_html = Nokogiri::HTML::DocumentFragment.parse(@html)
10
11
  @banned_attrs.each { |attr| parsed_html.xpath(".//@#{attr}").remove }
11
12
  parsed_html.to_s
12
13
  end
14
+
13
15
  end
14
- end
16
+ end
data/lib/bulbasaur/removals/remove_tags.rb CHANGED
@@ -1,14 +1,28 @@
1
1
  module Bulbasaur
2
2
  class RemoveTags
3
- def initialize(html, banned_tags)
3
+
4
+ SELECTORS_EMPTY_ANALYSE = "div, p"
5
+
6
+ def initialize(html, banned_tags, remove_empty_tags = false)
4
7
  @html = html
5
8
  @banned_tags = banned_tags
9
+ @remove_empty_tags = remove_empty_tags
6
10
  end
7
-
11
+
8
12
  def call
9
- parsed_html = Nokogiri::HTML::DocumentFragment.parse @html
13
+ parsed_html = Nokogiri::HTML::DocumentFragment.parse(@html)
14
+ remove_empty_tags(parsed_html) if @remove_empty_tags
10
15
  @banned_tags.each { |tag| parsed_html.css(tag).remove }
11
16
  parsed_html.to_s
12
17
  end
18
+
19
+ private
20
+
21
+ def remove_empty_tags(parsed_html)
22
+ parsed_html.css(SELECTORS_EMPTY_ANALYSE).each do |tag|
23
+ tag.remove if tag.content.strip.empty? && tag.children.select{ |c| c.name != "text" }.length == 0
24
+ end
25
+ end
26
+
13
27
  end
14
- end
28
+ end
data/lib/bulbasaur/utils/normalize_image_sources.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  module Bulbasaur
2
2
  class NormalizeImageSources
3
+
3
4
  def initialize(html, target_attrs)
4
5
  @html = html
5
6
  @target_attrs = target_attrs
@@ -8,7 +9,7 @@ module Bulbasaur
8
9
  def call
9
10
  parsed_html = Nokogiri::HTML::DocumentFragment.parse @html
10
11
  parsed_html.css('img').each do |element|
11
- check_for_attrs element
12
+ check_for_attrs(element)
12
13
  end
13
14
  parsed_html.to_s
14
15
  end
@@ -25,12 +26,12 @@ module Bulbasaur
25
26
  end
26
27
 
27
28
  def adjust(element, attr)
28
- element.set_attribute 'src', element.xpath(attr).text
29
- remove_target_attrs_from element
29
+ element.set_attribute('src', element.xpath(attr).text)
30
+ remove_target_attrs_from(element)
30
31
  end
31
32
 
32
33
  def remove_target_attrs_from(element)
33
34
  @target_attrs.each { |attr| element.xpath("@#{attr}").remove }
34
35
  end
35
36
  end
36
- end
37
+ end
data/lib/bulbasaur/utils/normalize_image_src_set.rb ADDED
@@ -0,0 +1,68 @@
1
+ module Bulbasaur
2
+ class NormalizeImageSrcSet
3
+
4
+ REGEX_FIND_WIDTH = /(\d+)w/i
5
+ REGEX_FIND_HEIGHT = /(\d+)h/i
6
+
7
+ def initialize(html)
8
+ @html = html
9
+ end
10
+
11
+ def call
12
+ parsed_html = Nokogiri::HTML::DocumentFragment.parse(@html)
13
+ parsed_html.css("img[srcset]").each do |img|
14
+ image = extract_src_set_attribute(img)
15
+ img.set_attribute("src", image[:url])
16
+ img.set_attribute("width", image[:width]) if image[:width]
17
+ img.set_attribute("height", image[:height]) if image[:height]
18
+ end
19
+ parsed_html.to_s
20
+ end
21
+
22
+ private
23
+
24
+ def extract_src_set_attribute(img)
25
+ itens = img.get_attribute("srcset").split(",")
26
+ images = []
27
+ itens.each do |item|
28
+ srcset_item = item.split(" ")
29
+ image_object = {
30
+ url: extract_url(srcset_item),
31
+ width: extract_width(srcset_item),
32
+ height: extract_height(srcset_item)
33
+ }
34
+ images << image_object
35
+ end
36
+ get_better_image(images)
37
+ end
38
+
39
+ def extract_width(itens)
40
+ extract_by_regex(itens, REGEX_FIND_WIDTH)
41
+ end
42
+
43
+ def extract_height(itens)
44
+ extract_by_regex(itens, REGEX_FIND_HEIGHT)
45
+ end
46
+
47
+ def extract_url(srcset_item)
48
+ srcset_item[0]
49
+ end
50
+
51
+ def extract_by_regex(itens, regex)
52
+ value = itens.select{ |item| item =~ regex }.first
53
+ value = value.match(regex).captures.first if value
54
+ value
55
+ end
56
+
57
+ def get_better_image(itens)
58
+ images = Array.new
59
+ images.concat itens.select{ |img| !img[:width] && !img[:height] }.each{ |img| img[:area] = 0 }
60
+ images.concat itens.select{ |img| !img[:width] && img[:height] }.each{ |img| img[:area] = img[:height] }
61
+ images.concat itens.select{ |img| img[:width] && !img[:height] }.each{ |img| img[:area] = img[:width] }
62
+ images.concat itens.select{ |img| img[:width] && img[:height] }.each{ |img| img[:area] = img[:width].to_i * img[:height].to_i }
63
+ images = images.sort { |a, b| b[:area] <=> a[:area] }
64
+ images.first
65
+ end
66
+
67
+ end
68
+ end
data/lib/bulbasaur/version.rb CHANGED
@@ -2,8 +2,8 @@ module Bulbasaur
2
2
 
3
3
  module Version
4
4
  MAJOR = 0
5
- MINOR = 8
6
- PATCH = 2
5
+ MINOR = 9
6
+ PATCH = 0
7
7
  STRING = "#{MAJOR}.#{MINOR}.#{PATCH}"
8
8
  end
9
9
 
data/spec/bulbasaur/removals/remove_tags_spec.rb CHANGED
@@ -3,11 +3,15 @@ require 'spec_helper'
3
3
  RSpec.describe Bulbasaur::RemoveTags do
4
4
 
5
5
  subject do
6
- described_class.new(html, banned_tags).call
6
+ described_class.new(html, banned_tags, empty_tags).call
7
7
  end
8
8
 
9
9
  describe '#call' do
10
10
 
11
+ let(:empty_tags) do
12
+ false
13
+ end
14
+
11
15
  let(:html) do
12
16
  %[
13
17
  <style>
@@ -18,6 +22,12 @@ RSpec.describe Bulbasaur::RemoveTags do
18
22
  <input type="text">
19
23
  </form>
20
24
  <p>hello!</p>
25
+ <div class="inner top">
26
+ <p></p>
27
+ <div> </div>
28
+ </div>
29
+ <div></div>
30
+ <p></p>
21
31
  ]
22
32
  end
23
33
 
@@ -26,25 +36,71 @@ RSpec.describe Bulbasaur::RemoveTags do
26
36
  let(:banned_tags) do
27
37
  []
28
38
  end
29
-
39
+
30
40
  it 'returns the HTML code as it was before' do
31
41
  expect(subject).to eq html
32
42
  end
33
43
  end
34
-
44
+
35
45
  context 'when there are banned tags' do
36
46
 
37
47
  let(:banned_tags) do
38
48
  %w(form style)
39
49
  end
40
-
50
+
41
51
  it 'returns the HTML code without the banned tags' do
42
52
  expect(subject.strip.gsub(/\n/, '').squeeze ' ').to eq %[
43
53
  <div style="height: 100px; width: 100px;"></div>
44
54
  <p>hello!</p>
55
+ <div class="inner top">
56
+ <p></p>
57
+ <div> </div>
58
+ </div>
59
+ <div></div>
60
+ <p></p>
45
61
  ].strip.gsub(/\n/, '').squeeze ' '
46
62
  end
47
63
  end
48
- end
64
+
65
+ context 'when the HTML code with the banned tags and defined no empty tags' do
66
+
67
+ let(:html) do
68
+ %[
69
+ <style>
70
+ div { color: green; width: 1024px; }
71
+ </style>
72
+ <div style="height: 100px; width: 100px;"></div>
73
+ <form>
74
+ <input type="text">
75
+ </form>
76
+ <p>hello!</p>
77
+ <div class="inner top">
78
+ <p></p>
79
+ <div><img src='test.jpg'></div>
80
+ </div>
81
+ <div></div>
82
+ <p> </p>
83
+ <div class="helo"> </div>
84
+ <p></p>
85
+ ]
86
+ end
87
+
88
+ let(:banned_tags) do
89
+ %w(form style)
90
+ end
49
91
 
92
+ let(:empty_tags) do
93
+ true
94
+ end
95
+
96
+ it 'returns the HTML code without the banned tags and without empty tags' do
97
+ expect(subject.strip.gsub(/\n/, '').squeeze ' ').to eq %[
98
+ <p>hello!</p>
99
+ <div class="inner top">
100
+ <div><img src="test.jpg"></div>
101
+ </div>
102
+ ].strip.gsub(/\n/, '').squeeze ' '
103
+ end
104
+ end
105
+ end
50
106
  end
data/spec/bulbasaur/utils/normalize_image_srcset_spec.rb ADDED
@@ -0,0 +1,122 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Bulbasaur::NormalizeImageSrcSet do
4
+
5
+ subject do
6
+ described_class.new(html).call
7
+ end
8
+
9
+ describe "#call" do
10
+
11
+ context "When there are no srcset attribute" do
12
+
13
+ let(:html) do
14
+ <<-HTML
15
+ <img src="http://somewhere.to/get/a-pixel.gif" data-lazy-src="http://somewhere.to/get/the-real-image.jpg" alt="Image" width="800" height="1200">
16
+ <img lazy-data="https://place.where/an-image/has/no-src-tag.jpg">
17
+ HTML
18
+ end
19
+
20
+ it "Does returns the HTML code as it was before" do
21
+ expect(subject).to eq(html)
22
+ end
23
+ end
24
+
25
+ context "When there are srcset attribute without size defined" do
26
+
27
+ let(:html) do
28
+ <<-HTML
29
+ <img src="http://somewhere.to/get/a-pixel.gif" srcset="http://bulbasaur.com/imageA.jpg" alt="Image" width="800" height="1200">
30
+ <img srcset="http://bulbasaur.com/imageB.jpg">
31
+ <img srcset="http://bulbasaur.com/imageC.jpg,http://bulbasaur.com/imageC.jpg,http://bulbasaur.com/imageD.jpg">
32
+ HTML
33
+ end
34
+
35
+ let(:html_normalized) do
36
+ <<-HTML
37
+ <img src="http://bulbasaur.com/imageA.jpg" srcset="http://bulbasaur.com/imageA.jpg" alt="Image" width="800" height="1200">
38
+ <img srcset="http://bulbasaur.com/imageB.jpg" src="http://bulbasaur.com/imageB.jpg">
39
+ <img srcset="http://bulbasaur.com/imageC.jpg,http://bulbasaur.com/imageC.jpg,http://bulbasaur.com/imageD.jpg" src="http://bulbasaur.com/imageC.jpg">
40
+ HTML
41
+ end
42
+
43
+ it "Does returns the HTML code with src replaced by srcset attribute" do
44
+ expect(subject).to eq(html_normalized)
45
+ end
46
+ end
47
+
48
+ context "When there are srcset attribute with size defined" do
49
+
50
+ let(:html) do
51
+ <<-HTML
52
+ <img src="http://somewhere.to/get/a-pixel.gif" srcset="http://bulbasaur.com/imageA.jpg 100w" alt="Image" width="800" height="1200">
53
+ <img srcset="http://bulbasaur.com/imageB.jpg 200w">
54
+ <img srcset="http://bulbasaur.com/imageB.jpg 200h">
55
+ <img srcset="http://bulbasaur.com/imageB.jpg 20w 30h">
56
+ <img srcset="http://bulbasaur.com/imageB.jpg 2x">
57
+ <img srcset="http://bulbasaur.com/imageB.jpg 200W 100H">
58
+ <img srcset="http://bulbasaur.com/imageC.jpg 100w,http://bulbasaur.com/imageC.jpg 200w,http://bulbasaur.com/imageD.jpg 300w">
59
+ HTML
60
+ end
61
+
62
+ let(:html_normalized) do
63
+ <<-HTML
64
+ <img src="http://bulbasaur.com/imageA.jpg" srcset="http://bulbasaur.com/imageA.jpg 100w" alt="Image" width="100" height="1200">
65
+ <img srcset="http://bulbasaur.com/imageB.jpg 200w" src="http://bulbasaur.com/imageB.jpg" width="200">
66
+ <img srcset="http://bulbasaur.com/imageB.jpg 200h" src="http://bulbasaur.com/imageB.jpg" height="200">
67
+ <img srcset="http://bulbasaur.com/imageB.jpg 20w 30h" src="http://bulbasaur.com/imageB.jpg" width="20" height="30">
68
+ <img srcset="http://bulbasaur.com/imageB.jpg 2x" src="http://bulbasaur.com/imageB.jpg">
69
+ <img srcset="http://bulbasaur.com/imageB.jpg 200W 100H" src="http://bulbasaur.com/imageB.jpg" width="200" height="100">
70
+ <img srcset="http://bulbasaur.com/imageC.jpg 100w,http://bulbasaur.com/imageC.jpg 200w,http://bulbasaur.com/imageD.jpg 300w" src="http://bulbasaur.com/imageD.jpg" width="300">
71
+ HTML
72
+ end
73
+
74
+ it "Does return the HTML code with src and with definitions" do
75
+ expect(subject).to eq(html_normalized)
76
+ end
77
+ end
78
+
79
+ context "When there are no valid params on srcset" do
80
+
81
+ let(:html) do
82
+ <<-HTML
83
+ <img srcset="http://bulbasaur.com/imageA.jpg 100w 200H 12h asd hausd uahsd 12813nusda" alt="Image">
84
+ <img srcset="http://bulbasaur.com/imageB.jpg 200w 1x 10x">
85
+ <img srcset="http://bulbasaur.com/imageC.jpg 100w 50h,http://bulbasaur.com/imageC.jpg 600w 500h,http://bulbasaur.com/imageD.jpg 300w 400h">
86
+ HTML
87
+ end
88
+
89
+ let(:html_normalized) do
90
+ <<-HTML
91
+ <img srcset="http://bulbasaur.com/imageA.jpg 100w 200H 12h asd hausd uahsd 12813nusda" alt="Image" src="http://bulbasaur.com/imageA.jpg" width="100" height="200">
92
+ <img srcset="http://bulbasaur.com/imageB.jpg 200w 1x 10x" src="http://bulbasaur.com/imageB.jpg" width="200">
93
+ <img srcset="http://bulbasaur.com/imageC.jpg 100w 50h,http://bulbasaur.com/imageC.jpg 600w 500h,http://bulbasaur.com/imageD.jpg 300w 400h" src="http://bulbasaur.com/imageC.jpg" width="600" height="500">
94
+ HTML
95
+ end
96
+
97
+ it "Does returns the HTML code and ignores invalid params" do
98
+ expect(subject).to eq(html_normalized)
99
+ end
100
+ end
101
+
102
+ context "When there are diferents sizes to the same image" do
103
+
104
+ let(:html) do
105
+ <<-HTML
106
+ <img srcset="http://bulbasaur.com/imageC.jpg 100w 50h,http://bulbasaur.com/imageC.jpg 200w 300h,http://bulbasaur.com/imageD.jpg 300w 400h">
107
+ HTML
108
+ end
109
+
110
+ let(:html_normalized_) do
111
+ <<-HTML
112
+ <img srcset="http://bulbasaur.com/imageC.jpg 100w 50h,http://bulbasaur.com/imageC.jpg 200w 300h,http://bulbasaur.com/imageD.jpg 300w 400h" src="http://bulbasaur.com/imageD.jpg" width="300" height="400">
113
+ HTML
114
+ end
115
+
116
+ it "Does returns the HTML code with better image" do
117
+ expect(subject).to eq(html_normalized_)
118
+ end
119
+ end
120
+
121
+ end
122
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: preadly-bulbasaur
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.2
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Magno Costa
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-09-15 00:00:00.000000000 Z
11
+ date: 2015-12-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -101,6 +101,7 @@ files:
101
101
  - lib/bulbasaur/removals/remove_tags.rb
102
102
  - lib/bulbasaur/replaces/replace_by_tag_image.rb
103
103
  - lib/bulbasaur/utils/normalize_image_sources.rb
104
+ - lib/bulbasaur/utils/normalize_image_src_set.rb
104
105
  - lib/bulbasaur/utils/normalize_url.rb
105
106
  - lib/bulbasaur/version.rb
106
107
  - spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb
@@ -114,6 +115,7 @@ files:
114
115
  - spec/bulbasaur/removals/remove_tags_spec.rb
115
116
  - spec/bulbasaur/replaces/replace_by_tag_image_spec.rb
116
117
  - spec/bulbasaur/utils/normalize_image_sources_spec.rb
118
+ - spec/bulbasaur/utils/normalize_image_srcset_spec.rb
117
119
  - spec/bulbasaur/utils/normalize_url_spec.rb
118
120
  - spec/spec_helper.rb
119
121
  homepage: https://github.com/preadly/bulbasaur
@@ -135,7 +137,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
135
137
  version: '0'
136
138
  requirements: []
137
139
  rubyforge_project:
138
- rubygems_version: 2.2.2
140
+ rubygems_version: 2.4.8
139
141
  signing_key:
140
142
  specification_version: 4
141
143
  summary: Bulbasaur is a helper for crawler operations used in Pread.ly
@@ -151,5 +153,6 @@ test_files:
151
153
  - spec/bulbasaur/removals/remove_tags_spec.rb
152
154
  - spec/bulbasaur/replaces/replace_by_tag_image_spec.rb
153
155
  - spec/bulbasaur/utils/normalize_image_sources_spec.rb
156
+ - spec/bulbasaur/utils/normalize_image_srcset_spec.rb
154
157
  - spec/bulbasaur/utils/normalize_url_spec.rb
155
158
  - spec/spec_helper.rb