preadly-bulbasaur 0.8.2 → 0.9.0

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8260cd6a3e468ab1139c0096ee1cbb5086fe7ff1
4
- data.tar.gz: 146d9e064d05db1eb3c19c7cdc7290c6027fa79a
3
+ metadata.gz: a87289e0e17c9e9ae9d8f489c3704a14dff5d5ac
4
+ data.tar.gz: 64d2ec14ed814afe1997c2472678f1af7c3b9166
5
5
  SHA512:
6
- metadata.gz: 61721cf5b99c6edba03b754fe98b863bd9747ab0bd279d4b3023becfc160309a591d931a2b8b814aab69884c940e026c72d8dc52022769101ef9f79f4da90b49
7
- data.tar.gz: f8da5d762adf404102767f71c0c68ea99a3f49fbbb716612f01e95f9fda6ab8a5c982068d35d14232bdb8d033bfdcefab69d628c36a3fbdeb54dab390ab490d7
6
+ metadata.gz: d74058dc3e6928672b8fb1342e2d7b81db562110cf7bfa491b470838de5f19bd6cbb8ee36e8d929cdb5e4a6e31c6c56b0dda4de758b18efc71214e0a1ae9671a
7
+ data.tar.gz: 63c9260e6f33a3a114d46c8ea119f7d026a6e5cebd655b31131af8a775a86dd6e63c6bab260bcc3f7fd55aae58eee05a063fd49a925a2e096c98ce908c144347
data/lib/bulbasaur.rb CHANGED
@@ -11,6 +11,7 @@ require "bulbasaur/removals/remove_attributes"
11
11
  require "bulbasaur/replaces/replace_by_tag_image"
12
12
  require "bulbasaur/utils/normalize_url"
13
13
  require "bulbasaur/utils/normalize_image_sources"
14
+ require "bulbasaur/utils/normalize_image_src_set"
14
15
  require "bulbasaur/version"
15
16
 
16
17
 
@@ -1,14 +1,16 @@
1
1
  module Bulbasaur
2
2
  class RemoveAttributes
3
+
3
4
  def initialize(html, banned_attrs)
4
5
  @html = html
5
6
  @banned_attrs = banned_attrs
6
7
  end
7
8
 
8
9
  def call
9
- parsed_html = Nokogiri::HTML::DocumentFragment.parse @html
10
+ parsed_html = Nokogiri::HTML::DocumentFragment.parse(@html)
10
11
  @banned_attrs.each { |attr| parsed_html.xpath(".//@#{attr}").remove }
11
12
  parsed_html.to_s
12
13
  end
14
+
13
15
  end
14
- end
16
+ end
@@ -1,14 +1,28 @@
1
1
  module Bulbasaur
2
2
  class RemoveTags
3
- def initialize(html, banned_tags)
3
+
4
+ SELECTORS_EMPTY_ANALYSE = "div, p"
5
+
6
+ def initialize(html, banned_tags, remove_empty_tags = false)
4
7
  @html = html
5
8
  @banned_tags = banned_tags
9
+ @remove_empty_tags = remove_empty_tags
6
10
  end
7
-
11
+
8
12
  def call
9
- parsed_html = Nokogiri::HTML::DocumentFragment.parse @html
13
+ parsed_html = Nokogiri::HTML::DocumentFragment.parse(@html)
14
+ remove_empty_tags(parsed_html) if @remove_empty_tags
10
15
  @banned_tags.each { |tag| parsed_html.css(tag).remove }
11
16
  parsed_html.to_s
12
17
  end
18
+
19
+ private
20
+
21
+ def remove_empty_tags(parsed_html)
22
+ parsed_html.css(SELECTORS_EMPTY_ANALYSE).each do |tag|
23
+ tag.remove if tag.content.strip.empty? && tag.children.select{ |c| c.name != "text" }.length == 0
24
+ end
25
+ end
26
+
13
27
  end
14
- end
28
+ end
@@ -1,5 +1,6 @@
1
1
  module Bulbasaur
2
2
  class NormalizeImageSources
3
+
3
4
  def initialize(html, target_attrs)
4
5
  @html = html
5
6
  @target_attrs = target_attrs
@@ -8,7 +9,7 @@ module Bulbasaur
8
9
  def call
9
10
  parsed_html = Nokogiri::HTML::DocumentFragment.parse @html
10
11
  parsed_html.css('img').each do |element|
11
- check_for_attrs element
12
+ check_for_attrs(element)
12
13
  end
13
14
  parsed_html.to_s
14
15
  end
@@ -25,12 +26,12 @@ module Bulbasaur
25
26
  end
26
27
 
27
28
  def adjust(element, attr)
28
- element.set_attribute 'src', element.xpath(attr).text
29
- remove_target_attrs_from element
29
+ element.set_attribute('src', element.xpath(attr).text)
30
+ remove_target_attrs_from(element)
30
31
  end
31
32
 
32
33
  def remove_target_attrs_from(element)
33
34
  @target_attrs.each { |attr| element.xpath("@#{attr}").remove }
34
35
  end
35
36
  end
36
- end
37
+ end
@@ -0,0 +1,68 @@
1
+ module Bulbasaur
2
+ class NormalizeImageSrcSet
3
+
4
+ REGEX_FIND_WIDTH = /(\d+)w/i
5
+ REGEX_FIND_HEIGHT = /(\d+)h/i
6
+
7
+ def initialize(html)
8
+ @html = html
9
+ end
10
+
11
+ def call
12
+ parsed_html = Nokogiri::HTML::DocumentFragment.parse(@html)
13
+ parsed_html.css("img[srcset]").each do |img|
14
+ image = extract_src_set_attribute(img)
15
+ img.set_attribute("src", image[:url])
16
+ img.set_attribute("width", image[:width]) if image[:width]
17
+ img.set_attribute("height", image[:height]) if image[:height]
18
+ end
19
+ parsed_html.to_s
20
+ end
21
+
22
+ private
23
+
24
+ def extract_src_set_attribute(img)
25
+ itens = img.get_attribute("srcset").split(",")
26
+ images = []
27
+ itens.each do |item|
28
+ srcset_item = item.split(" ")
29
+ image_object = {
30
+ url: extract_url(srcset_item),
31
+ width: extract_width(srcset_item),
32
+ height: extract_height(srcset_item)
33
+ }
34
+ images << image_object
35
+ end
36
+ get_better_image(images)
37
+ end
38
+
39
+ def extract_width(itens)
40
+ extract_by_regex(itens, REGEX_FIND_WIDTH)
41
+ end
42
+
43
+ def extract_height(itens)
44
+ extract_by_regex(itens, REGEX_FIND_HEIGHT)
45
+ end
46
+
47
+ def extract_url(srcset_item)
48
+ srcset_item[0]
49
+ end
50
+
51
+ def extract_by_regex(itens, regex)
52
+ value = itens.select{ |item| item =~ regex }.first
53
+ value = value.match(regex).captures.first if value
54
+ value
55
+ end
56
+
57
+ def get_better_image(itens)
58
+ images = Array.new
59
+ images.concat itens.select{ |img| !img[:width] && !img[:height] }.each{ |img| img[:area] = 0 }
60
+ images.concat itens.select{ |img| !img[:width] && img[:height] }.each{ |img| img[:area] = img[:height] }
61
+ images.concat itens.select{ |img| img[:width] && !img[:height] }.each{ |img| img[:area] = img[:width] }
62
+ images.concat itens.select{ |img| img[:width] && img[:height] }.each{ |img| img[:area] = img[:width].to_i * img[:height].to_i }
63
+ images = images.sort { |a, b| b[:area] <=> a[:area] }
64
+ images.first
65
+ end
66
+
67
+ end
68
+ end
@@ -2,8 +2,8 @@ module Bulbasaur
2
2
 
3
3
  module Version
4
4
  MAJOR = 0
5
- MINOR = 8
6
- PATCH = 2
5
+ MINOR = 9
6
+ PATCH = 0
7
7
  STRING = "#{MAJOR}.#{MINOR}.#{PATCH}"
8
8
  end
9
9
 
@@ -3,11 +3,15 @@ require 'spec_helper'
3
3
  RSpec.describe Bulbasaur::RemoveTags do
4
4
 
5
5
  subject do
6
- described_class.new(html, banned_tags).call
6
+ described_class.new(html, banned_tags, empty_tags).call
7
7
  end
8
8
 
9
9
  describe '#call' do
10
10
 
11
+ let(:empty_tags) do
12
+ false
13
+ end
14
+
11
15
  let(:html) do
12
16
  %[
13
17
  <style>
@@ -18,6 +22,12 @@ RSpec.describe Bulbasaur::RemoveTags do
18
22
  <input type="text">
19
23
  </form>
20
24
  <p>hello!</p>
25
+ <div class="inner top">
26
+ <p></p>
27
+ <div> </div>
28
+ </div>
29
+ <div></div>
30
+ <p></p>
21
31
  ]
22
32
  end
23
33
 
@@ -26,25 +36,71 @@ RSpec.describe Bulbasaur::RemoveTags do
26
36
  let(:banned_tags) do
27
37
  []
28
38
  end
29
-
39
+
30
40
  it 'returns the HTML code as it was before' do
31
41
  expect(subject).to eq html
32
42
  end
33
43
  end
34
-
44
+
35
45
  context 'when there are banned tags' do
36
46
 
37
47
  let(:banned_tags) do
38
48
  %w(form style)
39
49
  end
40
-
50
+
41
51
  it 'returns the HTML code without the banned tags' do
42
52
  expect(subject.strip.gsub(/\n/, '').squeeze ' ').to eq %[
43
53
  <div style="height: 100px; width: 100px;"></div>
44
54
  <p>hello!</p>
55
+ <div class="inner top">
56
+ <p></p>
57
+ <div> </div>
58
+ </div>
59
+ <div></div>
60
+ <p></p>
45
61
  ].strip.gsub(/\n/, '').squeeze ' '
46
62
  end
47
63
  end
48
- end
64
+
65
+ context 'when the HTML code with the banned tags and defined no empty tags' do
66
+
67
+ let(:html) do
68
+ %[
69
+ <style>
70
+ div { color: green; width: 1024px; }
71
+ </style>
72
+ <div style="height: 100px; width: 100px;"></div>
73
+ <form>
74
+ <input type="text">
75
+ </form>
76
+ <p>hello!</p>
77
+ <div class="inner top">
78
+ <p></p>
79
+ <div><img src='test.jpg'></div>
80
+ </div>
81
+ <div></div>
82
+ <p> </p>
83
+ <div class="helo"> </div>
84
+ <p></p>
85
+ ]
86
+ end
87
+
88
+ let(:banned_tags) do
89
+ %w(form style)
90
+ end
49
91
 
92
+ let(:empty_tags) do
93
+ true
94
+ end
95
+
96
+ it 'returns the HTML code without the banned tags and without empty tags' do
97
+ expect(subject.strip.gsub(/\n/, '').squeeze ' ').to eq %[
98
+ <p>hello!</p>
99
+ <div class="inner top">
100
+ <div><img src="test.jpg"></div>
101
+ </div>
102
+ ].strip.gsub(/\n/, '').squeeze ' '
103
+ end
104
+ end
105
+ end
50
106
  end
@@ -0,0 +1,122 @@
1
+ require "spec_helper"
2
+
3
+ RSpec.describe Bulbasaur::NormalizeImageSrcSet do
4
+
5
+ subject do
6
+ described_class.new(html).call
7
+ end
8
+
9
+ describe "#call" do
10
+
11
+ context "When there are no srcset attribute" do
12
+
13
+ let(:html) do
14
+ <<-HTML
15
+ <img src="http://somewhere.to/get/a-pixel.gif" data-lazy-src="http://somewhere.to/get/the-real-image.jpg" alt="Image" width="800" height="1200">
16
+ <img lazy-data="https://place.where/an-image/has/no-src-tag.jpg">
17
+ HTML
18
+ end
19
+
20
+ it "Does returns the HTML code as it was before" do
21
+ expect(subject).to eq(html)
22
+ end
23
+ end
24
+
25
+ context "When there are srcset attribute without size defined" do
26
+
27
+ let(:html) do
28
+ <<-HTML
29
+ <img src="http://somewhere.to/get/a-pixel.gif" srcset="http://bulbasaur.com/imageA.jpg" alt="Image" width="800" height="1200">
30
+ <img srcset="http://bulbasaur.com/imageB.jpg">
31
+ <img srcset="http://bulbasaur.com/imageC.jpg,http://bulbasaur.com/imageC.jpg,http://bulbasaur.com/imageD.jpg">
32
+ HTML
33
+ end
34
+
35
+ let(:html_normalized) do
36
+ <<-HTML
37
+ <img src="http://bulbasaur.com/imageA.jpg" srcset="http://bulbasaur.com/imageA.jpg" alt="Image" width="800" height="1200">
38
+ <img srcset="http://bulbasaur.com/imageB.jpg" src="http://bulbasaur.com/imageB.jpg">
39
+ <img srcset="http://bulbasaur.com/imageC.jpg,http://bulbasaur.com/imageC.jpg,http://bulbasaur.com/imageD.jpg" src="http://bulbasaur.com/imageC.jpg">
40
+ HTML
41
+ end
42
+
43
+ it "Does returns the HTML code with src replaced by srcset attribute" do
44
+ expect(subject).to eq(html_normalized)
45
+ end
46
+ end
47
+
48
+ context "When there are srcset attribute with size defined" do
49
+
50
+ let(:html) do
51
+ <<-HTML
52
+ <img src="http://somewhere.to/get/a-pixel.gif" srcset="http://bulbasaur.com/imageA.jpg 100w" alt="Image" width="800" height="1200">
53
+ <img srcset="http://bulbasaur.com/imageB.jpg 200w">
54
+ <img srcset="http://bulbasaur.com/imageB.jpg 200h">
55
+ <img srcset="http://bulbasaur.com/imageB.jpg 20w 30h">
56
+ <img srcset="http://bulbasaur.com/imageB.jpg 2x">
57
+ <img srcset="http://bulbasaur.com/imageB.jpg 200W 100H">
58
+ <img srcset="http://bulbasaur.com/imageC.jpg 100w,http://bulbasaur.com/imageC.jpg 200w,http://bulbasaur.com/imageD.jpg 300w">
59
+ HTML
60
+ end
61
+
62
+ let(:html_normalized) do
63
+ <<-HTML
64
+ <img src="http://bulbasaur.com/imageA.jpg" srcset="http://bulbasaur.com/imageA.jpg 100w" alt="Image" width="100" height="1200">
65
+ <img srcset="http://bulbasaur.com/imageB.jpg 200w" src="http://bulbasaur.com/imageB.jpg" width="200">
66
+ <img srcset="http://bulbasaur.com/imageB.jpg 200h" src="http://bulbasaur.com/imageB.jpg" height="200">
67
+ <img srcset="http://bulbasaur.com/imageB.jpg 20w 30h" src="http://bulbasaur.com/imageB.jpg" width="20" height="30">
68
+ <img srcset="http://bulbasaur.com/imageB.jpg 2x" src="http://bulbasaur.com/imageB.jpg">
69
+ <img srcset="http://bulbasaur.com/imageB.jpg 200W 100H" src="http://bulbasaur.com/imageB.jpg" width="200" height="100">
70
+ <img srcset="http://bulbasaur.com/imageC.jpg 100w,http://bulbasaur.com/imageC.jpg 200w,http://bulbasaur.com/imageD.jpg 300w" src="http://bulbasaur.com/imageD.jpg" width="300">
71
+ HTML
72
+ end
73
+
74
+ it "Does return the HTML code with src and with definitions" do
75
+ expect(subject).to eq(html_normalized)
76
+ end
77
+ end
78
+
79
+ context "When there are no valid params on srcset" do
80
+
81
+ let(:html) do
82
+ <<-HTML
83
+ <img srcset="http://bulbasaur.com/imageA.jpg 100w 200H 12h asd hausd uahsd 12813nusda" alt="Image">
84
+ <img srcset="http://bulbasaur.com/imageB.jpg 200w 1x 10x">
85
+ <img srcset="http://bulbasaur.com/imageC.jpg 100w 50h,http://bulbasaur.com/imageC.jpg 600w 500h,http://bulbasaur.com/imageD.jpg 300w 400h">
86
+ HTML
87
+ end
88
+
89
+ let(:html_normalized) do
90
+ <<-HTML
91
+ <img srcset="http://bulbasaur.com/imageA.jpg 100w 200H 12h asd hausd uahsd 12813nusda" alt="Image" src="http://bulbasaur.com/imageA.jpg" width="100" height="200">
92
+ <img srcset="http://bulbasaur.com/imageB.jpg 200w 1x 10x" src="http://bulbasaur.com/imageB.jpg" width="200">
93
+ <img srcset="http://bulbasaur.com/imageC.jpg 100w 50h,http://bulbasaur.com/imageC.jpg 600w 500h,http://bulbasaur.com/imageD.jpg 300w 400h" src="http://bulbasaur.com/imageC.jpg" width="600" height="500">
94
+ HTML
95
+ end
96
+
97
+ it "Does returns the HTML code and ignores invalid params" do
98
+ expect(subject).to eq(html_normalized)
99
+ end
100
+ end
101
+
102
+ context "When there are diferents sizes to the same image" do
103
+
104
+ let(:html) do
105
+ <<-HTML
106
+ <img srcset="http://bulbasaur.com/imageC.jpg 100w 50h,http://bulbasaur.com/imageC.jpg 200w 300h,http://bulbasaur.com/imageD.jpg 300w 400h">
107
+ HTML
108
+ end
109
+
110
+ let(:html_normalized_) do
111
+ <<-HTML
112
+ <img srcset="http://bulbasaur.com/imageC.jpg 100w 50h,http://bulbasaur.com/imageC.jpg 200w 300h,http://bulbasaur.com/imageD.jpg 300w 400h" src="http://bulbasaur.com/imageD.jpg" width="300" height="400">
113
+ HTML
114
+ end
115
+
116
+ it "Does returns the HTML code with better image" do
117
+ expect(subject).to eq(html_normalized_)
118
+ end
119
+ end
120
+
121
+ end
122
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: preadly-bulbasaur
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.2
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Magno Costa
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-09-15 00:00:00.000000000 Z
11
+ date: 2015-12-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -101,6 +101,7 @@ files:
101
101
  - lib/bulbasaur/removals/remove_tags.rb
102
102
  - lib/bulbasaur/replaces/replace_by_tag_image.rb
103
103
  - lib/bulbasaur/utils/normalize_image_sources.rb
104
+ - lib/bulbasaur/utils/normalize_image_src_set.rb
104
105
  - lib/bulbasaur/utils/normalize_url.rb
105
106
  - lib/bulbasaur/version.rb
106
107
  - spec/bulbasaur/extracts/extract_images_from_all_resources_spec.rb
@@ -114,6 +115,7 @@ files:
114
115
  - spec/bulbasaur/removals/remove_tags_spec.rb
115
116
  - spec/bulbasaur/replaces/replace_by_tag_image_spec.rb
116
117
  - spec/bulbasaur/utils/normalize_image_sources_spec.rb
118
+ - spec/bulbasaur/utils/normalize_image_srcset_spec.rb
117
119
  - spec/bulbasaur/utils/normalize_url_spec.rb
118
120
  - spec/spec_helper.rb
119
121
  homepage: https://github.com/preadly/bulbasaur
@@ -135,7 +137,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
135
137
  version: '0'
136
138
  requirements: []
137
139
  rubyforge_project:
138
- rubygems_version: 2.2.2
140
+ rubygems_version: 2.4.8
139
141
  signing_key:
140
142
  specification_version: 4
141
143
  summary: Bulbasaur is a helper for crawler operations used in Pread.ly
@@ -151,5 +153,6 @@ test_files:
151
153
  - spec/bulbasaur/removals/remove_tags_spec.rb
152
154
  - spec/bulbasaur/replaces/replace_by_tag_image_spec.rb
153
155
  - spec/bulbasaur/utils/normalize_image_sources_spec.rb
156
+ - spec/bulbasaur/utils/normalize_image_srcset_spec.rb
154
157
  - spec/bulbasaur/utils/normalize_url_spec.rb
155
158
  - spec/spec_helper.rb