preadly-bulbasaur 0.7.1 → 0.7.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/bulbasaur/extracts/extract_meta_informations_from_html.rb +42 -0
- data/lib/bulbasaur/utils/normalize_image_sources.rb +9 -1
- data/lib/bulbasaur/version.rb +1 -1
- data/lib/bulbasaur.rb +1 -0
- data/spec/bulbasaur/extracts/extract_meta_informations_from_html_spec.rb +38 -0
- data/spec/bulbasaur/removals/remove_attributes_spec.rb +15 -4
- data/spec/bulbasaur/removals/remove_tags_spec.rb +15 -4
- data/spec/bulbasaur/utils/normalize_image_sources_spec.rb +59 -12
- metadata +5 -4
- data/spec/bulbasaur_spec.rb +0 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a3585babf7ec241a14a1bb08d933cb648216658e
|
4
|
+
data.tar.gz: e283f5b067e6a8d7642e28e747df9a36e49877fd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3ec4836696c83dd0eef9e32ca7cf9375ecc37bc4ed973f5761357702a23a3c5eeaa7198b195ebab39a4e11e3c505a2b6d50ad37034a95cf45a06a680e3d0b048
|
7
|
+
data.tar.gz: e1324f9d951aaa744e91b4a1113a3f2cc9b2e8222217e3a2ecc9303fb6455ec838819b218a52d64ffbf28a16a5dae7bff6ea4a1c56e9b06f86d464b66e52a4ee
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module Bulbasaur
|
2
|
+
class ExtractMetaInformationsFromHTML
|
3
|
+
|
4
|
+
def initialize(html)
|
5
|
+
@html = html
|
6
|
+
end
|
7
|
+
|
8
|
+
def call
|
9
|
+
meta_informations = []
|
10
|
+
for_each_meta_information do |meta_information|
|
11
|
+
name = name_of meta_information
|
12
|
+
value = value_of meta_information
|
13
|
+
meta_informations << { name: name, value: value } unless name.nil? || value.nil?
|
14
|
+
end
|
15
|
+
meta_informations
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
def for_each_meta_information(&block)
|
21
|
+
if @html
|
22
|
+
Nokogiri::HTML(@html).xpath('//meta').each &block
|
23
|
+
Nokogiri::HTML(@html).xpath('//link').each &block
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def name_of(meta_information)
|
28
|
+
include_attribute? %w(name property rel), meta_information
|
29
|
+
end
|
30
|
+
|
31
|
+
def value_of(meta_information)
|
32
|
+
include_attribute? %w(value content href), meta_information
|
33
|
+
end
|
34
|
+
|
35
|
+
def include_attribute?(attributes, tag)
|
36
|
+
attributes.each do |attr|
|
37
|
+
return tag.attribute(attr).to_s if tag.attributes.include?(attr)
|
38
|
+
end
|
39
|
+
nil
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -1,5 +1,7 @@
|
|
1
1
|
module Bulbasaur
|
2
2
|
class NormalizeImageSources
|
3
|
+
DOMAIN_REGEX = /^(?:https?:\/\/)?(?:[^@\n]+@)?(?:www\.)?([^:\/\n]+)/im
|
4
|
+
|
3
5
|
def initialize(html, target_attrs)
|
4
6
|
@html = html
|
5
7
|
@target_attrs = target_attrs
|
@@ -25,8 +27,14 @@ module Bulbasaur
|
|
25
27
|
end
|
26
28
|
|
27
29
|
def adjust(element, attr)
|
28
|
-
element.set_attribute 'src', element.xpath(attr).text
|
30
|
+
element.set_attribute 'src', lazy_load_url(element, element.xpath(attr).text)
|
29
31
|
element.xpath(attr).remove
|
30
32
|
end
|
33
|
+
|
34
|
+
def lazy_load_url(element, text)
|
35
|
+
text_match = text.match(DOMAIN_REGEX).to_s
|
36
|
+
element_match = element.css('@src').text.match(DOMAIN_REGEX).to_s
|
37
|
+
(text_match == element_match) ? text : "#{element_match}/#{text}"
|
38
|
+
end
|
31
39
|
end
|
32
40
|
end
|
data/lib/bulbasaur/version.rb
CHANGED
data/lib/bulbasaur.rb
CHANGED
@@ -4,6 +4,7 @@ require "bulbasaur/extracts/extract_images_from_vimeo"
|
|
4
4
|
require "bulbasaur/extracts/extract_images_from_html"
|
5
5
|
require "bulbasaur/extracts/extract_images_from_all_resources"
|
6
6
|
require "bulbasaur/extracts/extract_text_from_html.rb"
|
7
|
+
require "bulbasaur/extracts/extract_meta_informations_from_html.rb"
|
7
8
|
require "bulbasaur/removals/remove_tags"
|
8
9
|
require "bulbasaur/removals/remove_attributes"
|
9
10
|
require "bulbasaur/replaces/replace_by_tag_image"
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Bulbasaur::ExtractMetaInformationsFromHTML do
|
4
|
+
|
5
|
+
subject do
|
6
|
+
described_class.new(html).call
|
7
|
+
end
|
8
|
+
|
9
|
+
let(:html) do
|
10
|
+
%Q(
|
11
|
+
<head>
|
12
|
+
<meta name="description" content="test-description">
|
13
|
+
<meta property="keywords" value="test-keywords">
|
14
|
+
<meta NAME="author" VALUE="test-author">
|
15
|
+
<link rel="canonical" href="test-canonical">
|
16
|
+
</head>
|
17
|
+
)
|
18
|
+
end
|
19
|
+
|
20
|
+
describe "#call" do
|
21
|
+
|
22
|
+
let(:meta_names) do
|
23
|
+
subject.map {|h| h[:name]}
|
24
|
+
end
|
25
|
+
|
26
|
+
let(:meta_values) do
|
27
|
+
subject.map {|h| h[:value]}
|
28
|
+
end
|
29
|
+
|
30
|
+
it "Does extract meta names informations from html" do
|
31
|
+
expect(meta_names).to include "description", "keywords", "author", "canonical"
|
32
|
+
end
|
33
|
+
|
34
|
+
it "Does extract meta values informations from html" do
|
35
|
+
expect(meta_values).to include "test-description", "test-keywords", "test-author", "test-canonical"
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -1,9 +1,13 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe Bulbasaur::RemoveAttributes do
|
4
|
-
|
4
|
+
|
5
|
+
subject do
|
6
|
+
described_class.new(html, banned_attrs).call
|
7
|
+
end
|
5
8
|
|
6
9
|
describe '#call' do
|
10
|
+
|
7
11
|
let(:html) do
|
8
12
|
%[
|
9
13
|
<style>
|
@@ -18,7 +22,10 @@ RSpec.describe Bulbasaur::RemoveAttributes do
|
|
18
22
|
end
|
19
23
|
|
20
24
|
context 'when there are no banned attributes' do
|
21
|
-
|
25
|
+
|
26
|
+
let(:banned_attrs) do
|
27
|
+
[]
|
28
|
+
end
|
22
29
|
|
23
30
|
it 'returns the HTML code as it was before' do
|
24
31
|
expect(subject).to eq html
|
@@ -26,7 +33,10 @@ RSpec.describe Bulbasaur::RemoveAttributes do
|
|
26
33
|
end
|
27
34
|
|
28
35
|
context 'when there are banned attributes' do
|
29
|
-
|
36
|
+
|
37
|
+
let(:banned_attrs) do
|
38
|
+
%w(style)
|
39
|
+
end
|
30
40
|
|
31
41
|
it 'returns the HTML code without the banned attributes' do
|
32
42
|
expect(subject.strip.gsub(/\n/, '').squeeze ' ').to eq %[
|
@@ -42,4 +52,5 @@ RSpec.describe Bulbasaur::RemoveAttributes do
|
|
42
52
|
end
|
43
53
|
end
|
44
54
|
end
|
45
|
-
|
55
|
+
|
56
|
+
end
|
@@ -1,9 +1,13 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe Bulbasaur::RemoveTags do
|
4
|
-
|
4
|
+
|
5
|
+
subject do
|
6
|
+
described_class.new(html, banned_tags).call
|
7
|
+
end
|
5
8
|
|
6
9
|
describe '#call' do
|
10
|
+
|
7
11
|
let(:html) do
|
8
12
|
%[
|
9
13
|
<style>
|
@@ -18,7 +22,10 @@ RSpec.describe Bulbasaur::RemoveTags do
|
|
18
22
|
end
|
19
23
|
|
20
24
|
context 'when there are no banned tags' do
|
21
|
-
|
25
|
+
|
26
|
+
let(:banned_tags) do
|
27
|
+
[]
|
28
|
+
end
|
22
29
|
|
23
30
|
it 'returns the HTML code as it was before' do
|
24
31
|
expect(subject).to eq html
|
@@ -26,7 +33,10 @@ RSpec.describe Bulbasaur::RemoveTags do
|
|
26
33
|
end
|
27
34
|
|
28
35
|
context 'when there are banned tags' do
|
29
|
-
|
36
|
+
|
37
|
+
let(:banned_tags) do
|
38
|
+
%w(form style)
|
39
|
+
end
|
30
40
|
|
31
41
|
it 'returns the HTML code without the banned tags' do
|
32
42
|
expect(subject.strip.gsub(/\n/, '').squeeze ' ').to eq %[
|
@@ -36,4 +46,5 @@ RSpec.describe Bulbasaur::RemoveTags do
|
|
36
46
|
end
|
37
47
|
end
|
38
48
|
end
|
39
|
-
|
49
|
+
|
50
|
+
end
|
@@ -1,25 +1,72 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe Bulbasaur::NormalizeImageSources do
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
|
5
|
+
subject do
|
6
|
+
described_class.new(html, target_attrs).call
|
7
|
+
end
|
8
8
|
|
9
|
-
|
10
|
-
|
9
|
+
describe "#call" do
|
10
|
+
|
11
|
+
let(:html) do
|
12
|
+
%Q(
|
13
|
+
<img src="http://somewhere.to/get/a-pixel.gif" data-lazy-src="http://somewhere.to/get/the-real-image.jpg" alt="Image" width="800" height="1200">
|
14
|
+
<img src="http://somewhere.to/get/another-pixel.gif" data-image="http://somewhere.to/get/the-other-real-image.jpg">
|
15
|
+
<img src="http://somewhere.to/get/a-third-pixel.gif" data-src="get/the-third-real-image.jpg">
|
16
|
+
<img src="otherplace.to/load/a-fourth-pixel.gif" lazy-data="otherplace.to/load/the-fourth-real-image.jpg">
|
17
|
+
)
|
18
|
+
end
|
19
|
+
|
20
|
+
context "When there are no target attributes" do
|
21
|
+
|
22
|
+
let(:target_attrs) do
|
23
|
+
[]
|
24
|
+
end
|
11
25
|
|
12
|
-
it
|
26
|
+
it "Returns the HTML code as it was before" do
|
13
27
|
expect(subject).to eq html
|
14
28
|
end
|
15
29
|
end
|
16
30
|
|
17
|
-
context
|
18
|
-
|
31
|
+
context "When there are target attributes" do
|
32
|
+
|
33
|
+
let(:target_attrs) do
|
34
|
+
%w(data-lazy-src data-image)
|
35
|
+
end
|
36
|
+
|
37
|
+
let(:html_parsed) do
|
38
|
+
%Q(
|
39
|
+
<img src="http://somewhere.to/get/the-real-image.jpg" alt="Image" width="800" height="1200">
|
40
|
+
<img src="http://somewhere.to/get/the-other-real-image.jpg">
|
41
|
+
<img src="http://somewhere.to/get/a-third-pixel.gif" data-src="get/the-third-real-image.jpg">
|
42
|
+
<img src="otherplace.to/load/a-fourth-pixel.gif" lazy-data="otherplace.to/load/the-fourth-real-image.jpg">
|
43
|
+
)
|
44
|
+
end
|
19
45
|
|
20
|
-
it
|
21
|
-
expect(subject
|
46
|
+
it "Returns the HTML code with the specified image tags adjusted" do
|
47
|
+
expect(subject.delete(" ")).to eq html_parsed.delete(" ")
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
context "When there are target attributes with relative path" do
|
52
|
+
|
53
|
+
let(:target_attrs) do
|
54
|
+
%w(data-lazy-src data-image data-src lazy-data)
|
55
|
+
end
|
56
|
+
|
57
|
+
let(:html_parsed) do
|
58
|
+
%Q(
|
59
|
+
<img src="http://somewhere.to/get/the-real-image.jpg" alt="Image" width="800" height="1200">
|
60
|
+
<img src="http://somewhere.to/get/the-other-real-image.jpg">
|
61
|
+
<img src="http://somewhere.to/get/the-third-real-image.jpg">
|
62
|
+
<img src="otherplace.to/load/the-fourth-real-image.jpg">
|
63
|
+
)
|
64
|
+
end
|
65
|
+
|
66
|
+
it "Returns the HTML code with the involved image tags fixed with domain and path" do
|
67
|
+
expect(subject.delete(" ")).to eq html_parsed.delete(" ")
|
22
68
|
end
|
23
69
|
end
|
24
70
|
end
|
25
|
-
|
71
|
+
|
72
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: preadly-bulbasaur
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.1
|
4
|
+
version: 0.7.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Magno Costa
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-08-
|
11
|
+
date: 2015-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -94,6 +94,7 @@ files:
|
|
94
94
|
- lib/bulbasaur/extracts/extract_images_from_html.rb
|
95
95
|
- lib/bulbasaur/extracts/extract_images_from_vimeo.rb
|
96
96
|
- lib/bulbasaur/extracts/extract_images_from_youtube.rb
|
97
|
+
- lib/bulbasaur/extracts/extract_meta_informations_from_html.rb
|
97
98
|
- lib/bulbasaur/extracts/extract_text_from_html.rb
|
98
99
|
- lib/bulbasaur/removals/remove_attributes.rb
|
99
100
|
- lib/bulbasaur/removals/remove_tags.rb
|
@@ -106,12 +107,12 @@ files:
|
|
106
107
|
- spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb
|
107
108
|
- spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb
|
108
109
|
- spec/bulbasaur/extracts/extract_inner_text_from_html_spec.rb
|
110
|
+
- spec/bulbasaur/extracts/extract_meta_informations_from_html_spec.rb
|
109
111
|
- spec/bulbasaur/removals/remove_attributes_spec.rb
|
110
112
|
- spec/bulbasaur/removals/remove_tags_spec.rb
|
111
113
|
- spec/bulbasaur/replaces/replace_by_tag_image_spec.rb
|
112
114
|
- spec/bulbasaur/utils/normalize_image_sources_spec.rb
|
113
115
|
- spec/bulbasaur/utils/normalize_url_spec.rb
|
114
|
-
- spec/bulbasaur_spec.rb
|
115
116
|
- spec/spec_helper.rb
|
116
117
|
homepage: https://github.com/preadly/bulbasaur
|
117
118
|
licenses: []
|
@@ -142,10 +143,10 @@ test_files:
|
|
142
143
|
- spec/bulbasaur/extracts/extract_images_from_vimeo_spec.rb
|
143
144
|
- spec/bulbasaur/extracts/extract_images_from_youtube_spec.rb
|
144
145
|
- spec/bulbasaur/extracts/extract_inner_text_from_html_spec.rb
|
146
|
+
- spec/bulbasaur/extracts/extract_meta_informations_from_html_spec.rb
|
145
147
|
- spec/bulbasaur/removals/remove_attributes_spec.rb
|
146
148
|
- spec/bulbasaur/removals/remove_tags_spec.rb
|
147
149
|
- spec/bulbasaur/replaces/replace_by_tag_image_spec.rb
|
148
150
|
- spec/bulbasaur/utils/normalize_image_sources_spec.rb
|
149
151
|
- spec/bulbasaur/utils/normalize_url_spec.rb
|
150
|
-
- spec/bulbasaur_spec.rb
|
151
152
|
- spec/spec_helper.rb
|