preadly-bulbasaur 0.7.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/bulbasaur/extracts/extract_meta_informations_from_html.rb +42 -0
- data/lib/bulbasaur/utils/normalize_image_sources.rb +9 -1
- data/lib/bulbasaur/version.rb +1 -1
- data/lib/bulbasaur.rb +1 -0
- data/spec/bulbasaur/extracts/extract_meta_informations_from_html_spec.rb +38 -0
- data/spec/bulbasaur/removals/remove_attributes_spec.rb +15 -4
- data/spec/bulbasaur/removals/remove_tags_spec.rb +15 -4
- data/spec/bulbasaur/utils/normalize_image_sources_spec.rb +59 -12
- metadata +5 -4
- data/spec/bulbasaur_spec.rb +0 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a3585babf7ec241a14a1bb08d933cb648216658e
|
4
|
+
data.tar.gz: e283f5b067e6a8d7642e28e747df9a36e49877fd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3ec4836696c83dd0eef9e32ca7cf9375ecc37bc4ed973f5761357702a23a3c5eeaa7198b195ebab39a4e11e3c505a2b6d50ad37034a95cf45a06a680e3d0b048
|
7
|
+
data.tar.gz: e1324f9d951aaa744e91b4a1113a3f2cc9b2e8222217e3a2ecc9303fb6455ec838819b218a52d64ffbf28a16a5dae7bff6ea4a1c56e9b06f86d464b66e52a4ee
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module Bulbasaur
  # Collects name/value pairs from the <meta> and <link> tags of an HTML
  # document.
  #
  # For every tag, the name is the value of the first attribute present among
  # NAME_ATTRIBUTES and the value is the value of the first attribute present
  # among VALUE_ATTRIBUTES. Tags missing either one are skipped.
  #
  # Requires Nokogiri to be loaded by the caller.
  class ExtractMetaInformationsFromHTML
    # Attributes that may hold the name of a tag, in lookup order.
    NAME_ATTRIBUTES = %w(name property rel).freeze

    # Attributes that may hold the value of a tag, in lookup order.
    VALUE_ATTRIBUTES = %w(value content href).freeze

    # html - String with the HTML code to inspect; nil is accepted.
    def initialize(html)
      @html = html
    end

    # Returns an Array of Hashes shaped like { name: String, value: String },
    # listing every <meta> tag first and every <link> tag afterwards.
    # Returns an empty Array when the HTML is nil.
    def call
      meta_informations = []
      for_each_meta_information do |meta_information|
        name = name_of(meta_information)
        value = value_of(meta_information)
        next if name.nil? || value.nil?

        meta_informations << { name: name, value: value }
      end
      meta_informations
    end

    private

    # Yields every <meta> tag and then every <link> tag of the document.
    def for_each_meta_information(&block)
      return unless @html

      # Parse once and reuse the document for both queries; two separate
      # queries keep the "metas first, links after" ordering of the result.
      document = Nokogiri::HTML(@html)
      document.xpath('//meta').each(&block)
      document.xpath('//link').each(&block)
    end

    def name_of(meta_information)
      first_attribute_value(NAME_ATTRIBUTES, meta_information)
    end

    def value_of(meta_information)
      first_attribute_value(VALUE_ATTRIBUTES, meta_information)
    end

    # Returns the value of the first attribute of +tag+ listed in
    # +attributes+, or nil when the tag has none of them. An attribute that is
    # present but empty counts as found and yields an empty String.
    def first_attribute_value(attributes, tag)
      attributes.each do |attr|
        value = tag[attr]
        return value unless value.nil?
      end
      nil
    end
  end
end
|
@@ -1,5 +1,7 @@
|
|
1
1
|
module Bulbasaur
|
2
2
|
class NormalizeImageSources
|
3
|
+
DOMAIN_REGEX = /^(?:https?:\/\/)?(?:[^@\n]+@)?(?:www\.)?([^:\/\n]+)/im
|
4
|
+
|
3
5
|
def initialize(html, target_attrs)
|
4
6
|
@html = html
|
5
7
|
@target_attrs = target_attrs
|
@@ -25,8 +27,14 @@ module Bulbasaur
|
|
25
27
|
end
|
26
28
|
|
27
29
|
def adjust(element, attr)
|
28
|
-
element.set_attribute 'src', element.xpath(attr).text
|
30
|
+
element.set_attribute 'src', lazy_load_url(element, element.xpath(attr).text)
|
29
31
|
element.xpath(attr).remove
|
30
32
|
end
|
33
|
+
|
34
|
+
# Builds the URL to store in the image's src from the lazy-load attribute
# value.
#
# element - node answering css('@src') with the current (placeholder) src.
# text    - String value read from the lazy-load attribute.
#
# Returns +text+ unchanged when it is already absolute ("scheme://...") or
# protocol-relative ("//..."), when the element has no usable src to borrow an
# origin from, or when both values start with the same DOMAIN_REGEX match.
# Otherwise returns +text+ prefixed with the DOMAIN_REGEX match of the
# current src.
def lazy_load_url(element, text)
  # An absolute URL may legitimately live on another host (e.g. a CDN);
  # prefixing it with the placeholder origin would corrupt it.
  return text if text =~ %r{\A(?:[a-z][a-z0-9+.\-]*:)?//}i

  src_origin = element.css('@src').text.match(DOMAIN_REGEX).to_s
  # Without an origin, prefixing would only turn "a.jpg" into "/a.jpg".
  return text if src_origin.empty?

  text_origin = text.match(DOMAIN_REGEX).to_s
  return text if text_origin == src_origin

  # Drop leading slashes so root-relative paths do not yield "origin//path".
  "#{src_origin}/#{text.sub(%r{\A/+}, '')}"
end
|
31
39
|
end
|
32
40
|
end
|
data/lib/bulbasaur/version.rb
CHANGED
data/lib/bulbasaur.rb
CHANGED
@@ -4,6 +4,7 @@ require "bulbasaur/extracts/extract_images_from_vimeo"
|
|
4
4
|
require "bulbasaur/extracts/extract_images_from_html"
|
5
5
|
require "bulbasaur/extracts/extract_images_from_all_resources"
|
6
6
|
require "bulbasaur/extracts/extract_text_from_html.rb"
|
7
|
+
require "bulbasaur/extracts/extract_meta_informations_from_html.rb"
|
7
8
|
require "bulbasaur/removals/remove_tags"
|
8
9
|
require "bulbasaur/removals/remove_attributes"
|
9
10
|
require "bulbasaur/replaces/replace_by_tag_image"
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require "spec_helper"

# Checks that the extractor reads <meta> and <link> tags, accepts the
# alternative attribute spellings (name/property/rel, content/value/href),
# keeps every name paired with its own value and copes with missing HTML.
RSpec.describe Bulbasaur::ExtractMetaInformationsFromHTML do

  subject do
    described_class.new(html).call
  end

  let(:html) do
    %Q(
      <head>
        <meta name="description" content="test-description">
        <meta property="keywords" value="test-keywords">
        <meta NAME="author" VALUE="test-author">
        <link rel="canonical" href="test-canonical">
      </head>
    )
  end

  describe "#call" do

    let(:meta_names) do
      subject.map { |h| h[:name] }
    end

    let(:meta_values) do
      subject.map { |h| h[:value] }
    end

    it "Does extract meta names informations from html" do
      expect(meta_names).to include "description", "keywords", "author", "canonical"
    end

    it "Does extract meta values informations from html" do
      expect(meta_values).to include "test-description", "test-keywords", "test-author", "test-canonical"
    end

    # The two examples above look at names and values separately, so they
    # would still pass if a name were attached to the wrong value.
    it "Does pair each meta name with its own value" do
      expect(subject).to include(
        { name: "description", value: "test-description" },
        { name: "keywords", value: "test-keywords" },
        { name: "author", value: "test-author" },
        { name: "canonical", value: "test-canonical" }
      )
    end

    context "When there is no html" do

      let(:html) do
        nil
      end

      it "Returns an empty list" do
        expect(subject).to eq []
      end
    end
  end
end
|
@@ -1,9 +1,13 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe Bulbasaur::RemoveAttributes do
|
4
|
-
|
4
|
+
|
5
|
+
subject do
|
6
|
+
described_class.new(html, banned_attrs).call
|
7
|
+
end
|
5
8
|
|
6
9
|
describe '#call' do
|
10
|
+
|
7
11
|
let(:html) do
|
8
12
|
%[
|
9
13
|
<style>
|
@@ -18,7 +22,10 @@ RSpec.describe Bulbasaur::RemoveAttributes do
|
|
18
22
|
end
|
19
23
|
|
20
24
|
context 'when there are no banned attributes' do
|
21
|
-
|
25
|
+
|
26
|
+
let(:banned_attrs) do
|
27
|
+
[]
|
28
|
+
end
|
22
29
|
|
23
30
|
it 'returns the HTML code as it was before' do
|
24
31
|
expect(subject).to eq html
|
@@ -26,7 +33,10 @@ RSpec.describe Bulbasaur::RemoveAttributes do
|
|
26
33
|
end
|
27
34
|
|
28
35
|
context 'when there are banned attributes' do
|
29
|
-
|
36
|
+
|
37
|
+
let(:banned_attrs) do
|
38
|
+
%w(style)
|
39
|
+
end
|
30
40
|
|
31
41
|
it 'returns the HTML code without the banned attributes' do
|
32
42
|
expect(subject.strip.gsub(/\n/, '').squeeze ' ').to eq %[
|
@@ -42,4 +52,5 @@ RSpec.describe Bulbasaur::RemoveAttributes do
|
|
42
52
|
end
|
43
53
|
end
|
44
54
|
end
|
45
|
-
|
55
|
+
|
56
|
+
end
|
@@ -1,9 +1,13 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe Bulbasaur::RemoveTags do
|
4
|
-
|
4
|
+
|
5
|
+
subject do
|
6
|
+
described_class.new(html, banned_tags).call
|
7
|
+
end
|
5
8
|
|
6
9
|
describe '#call' do
|
10
|
+
|
7
11
|
let(:html) do
|
8
12
|
%[
|
9
13
|
<style>
|
@@ -18,7 +22,10 @@ RSpec.describe Bulbasaur::RemoveTags do
|
|
18
22
|
end
|
19
23
|
|
20
24
|
context 'when there are no banned tags' do
|
21
|
-
|
25
|
+
|
26
|
+
let(:banned_tags) do
|
27
|
+
[]
|
28
|
+
end
|
22
29
|
|
23
30
|
it 'returns the HTML code as it was before' do
|
24
31
|
expect(subject).to eq html
|
@@ -26,7 +33,10 @@ RSpec.describe Bulbasaur::RemoveTags do
|
|
26
33
|
end
|
27
34
|
|
28
35
|
context 'when there are banned tags' do
|
29
|
-
|
36
|
+
|
37
|
+
let(:banned_tags) do
|
38
|
+
%w(form style)
|
39
|
+
end
|
30
40
|
|
31
41
|
it 'returns the HTML code without the banned tags' do
|
32
42
|
expect(subject.strip.gsub(/\n/, '').squeeze ' ').to eq %[
|
@@ -36,4 +46,5 @@ RSpec.describe Bulbasaur::RemoveTags do
|
|
36
46
|
end
|
37
47
|
end
|
38
48
|
end
|
39
|
-
|
49
|
+
|
50
|
+
end
|
@@ -1,25 +1,72 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe Bulbasaur::NormalizeImageSources do
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
|
5
|
+
subject do
|
6
|
+
described_class.new(html, target_attrs).call
|
7
|
+
end
|
8
8
|
|
9
|
-
|
10
|
-
|
9
|
+
describe "#call" do
|
10
|
+
|
11
|
+
let(:html) do
|
12
|
+
%Q(
|
13
|
+
<img src="http://somewhere.to/get/a-pixel.gif" data-lazy-src="http://somewhere.to/get/the-real-image.jpg" alt="Image" width="800" height="1200">
|
14
|
+
<img src="http://somewhere.to/get/another-pixel.gif" data-image="http://somewhere.to/get/the-other-real-image.jpg">
|
15
|
+
<img src="http://somewhere.to/get/a-third-pixel.gif" data-src="get/the-third-real-image.jpg">
|
16
|
+
<img src="otherplace.to/load/a-fourth-pixel.gif" lazy-data="otherplace.to/load/the-fourth-real-image.jpg">
|
17
|
+
)
|
18
|
+
end
|
19
|
+
|
20
|
+
context "When there are no target attributes" do
|
21
|
+
|
22
|
+
let(:target_attrs) do
|
23
|
+
[]
|
24
|
+
end
|
11
25
|
|
12
|
-
it
|
26
|
+
it "Returns the HTML code as it was before" do
|
13
27
|
expect(subject).to eq html
|
14
28
|
end
|
15
29
|
end
|
16
30
|
|
17
|
-
context
|
18
|
-
|
31
|
+
context "When there are target attributes" do
|
32
|
+
|
33
|
+
let(:target_attrs) do
|
34
|
+
%w(data-lazy-src data-image)
|
35
|
+
end
|
36
|
+
|
37
|
+
let(:html_parsed) do
|
38
|
+
%Q(
|
39
|
+
<img src="http://somewhere.to/get/the-real-image.jpg" alt="Image" width="800" height="1200">
|
40
|
+
<img src="http://somewhere.to/get/the-other-real-image.jpg">
|
41
|
+
<img src="http://somewhere.to/get/a-third-pixel.gif" data-src="get/the-third-real-image.jpg">
|
42
|
+
<img src="otherplace.to/load/a-fourth-pixel.gif" lazy-data="otherplace.to/load/the-fourth-real-image.jpg">
|
43
|
+
)
|
44
|
+
end
|
19
45
|
|
20
|
-
it
|
21
|
-
expect(subject
|
46
|
+
it "Returns the HTML code with the specified image tags adjusted" do
|
47
|
+
expect(subject.delete(" ")).to eq html_parsed.delete(" ")
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
context "When there are target attributes with relative path" do
|
52
|
+
|
53
|
+
let(:target_attrs) do
|
54
|
+
%w(data-lazy-src data-image data-src lazy-data)
|
55
|
+
end
|
56
|
+
|
57
|
+
let(:html_parsed) do
|
58
|
+
%Q(
|
59
|
+
<img src="http://somewhere.to/get/the-real-image.jpg" alt="Image" width="800" height="1200">
|
60
|
+
<img src="http://somewhere.to/get/the-other-real-image.jpg">
|
61
|
+
<img src="http://somewhere.to/get/the-third-real-image.jpg">
|
62
|
+
<img src="otherplace.to/load/the-fourth-real-image.jpg">
|
63
|
+
)
|
64
|
+
end
|
65
|
+
|
66
|
+
it "Returns the HTML code with the involved image tags fixed with domain and path" do
|
67
|
+
expect(subject.delete(" ")).to eq html_parsed.delete(" ")
|
22
68
|
end
|
23
69
|
end
|
24
70
|
end
|
25
|
-
|
71
|
+
|
72
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: preadly-bulbasaur
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.1
|
4
|
+
version: 0.7.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Magno Costa
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-08-
|
11
|
+
date: 2015-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -94,6 +94,7 @@ files:
|
|
94
94
|
- lib/bulbasaur/extracts/extract_images_from_html.rb
|
95
95
|
- lib/bulbasaur/extracts/extract_images_from_vimeo.rb
|
96
96
|
- lib/bulbasaur/extracts/extract_images_from_youtube.rb
|
97
|
+
- lib/bulbasaur/extracts/extract_meta_informations_from_html.rb
|
97
98
|
- lib/bulbasaur/extracts/extract_text_from_html.rb
|
98
99
|
- lib/bulbasaur/removals/remove_attributes.rb
|
99
100
|
- lib/bulbasaur/removals/remove_tags.rb
|
@@ -106,12 +107,12 @@ files:
|
|
106
107
|
- spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb
|
107
108
|
- spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb
|
108
109
|
- spec/bulbasaur/extracts/extract_inner_text_from_html_spec.rb
|
110
|
+
- spec/bulbasaur/extracts/extract_meta_informations_from_html_spec.rb
|
109
111
|
- spec/bulbasaur/removals/remove_attributes_spec.rb
|
110
112
|
- spec/bulbasaur/removals/remove_tags_spec.rb
|
111
113
|
- spec/bulbasaur/replaces/replace_by_tag_image_spec.rb
|
112
114
|
- spec/bulbasaur/utils/normalize_image_sources_spec.rb
|
113
115
|
- spec/bulbasaur/utils/normalize_url_spec.rb
|
114
|
-
- spec/bulbasaur_spec.rb
|
115
116
|
- spec/spec_helper.rb
|
116
117
|
homepage: https://github.com/preadly/bulbasaur
|
117
118
|
licenses: []
|
@@ -142,10 +143,10 @@ test_files:
|
|
142
143
|
- spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb
|
143
144
|
- spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb
|
144
145
|
- spec/bulbasaur/extracts/extract_inner_text_from_html_spec.rb
|
146
|
+
- spec/bulbasaur/extracts/extract_meta_informations_from_html_spec.rb
|
145
147
|
- spec/bulbasaur/removals/remove_attributes_spec.rb
|
146
148
|
- spec/bulbasaur/removals/remove_tags_spec.rb
|
147
149
|
- spec/bulbasaur/replaces/replace_by_tag_image_spec.rb
|
148
150
|
- spec/bulbasaur/utils/normalize_image_sources_spec.rb
|
149
151
|
- spec/bulbasaur/utils/normalize_url_spec.rb
|
150
|
-
- spec/bulbasaur_spec.rb
|
151
152
|
- spec/spec_helper.rb
|