preadly-bulbasaur 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 014f0ead8b757383f291bc31f15134e1c325032d
|
4
|
+
data.tar.gz: e3299a0a0b906620a99240065456189bebc08aee
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7b18fbb448369cafdce3789bd9a4c61ec87a198118908fddfdef685c828c65331da6972701d3b713a86abcffde31ce5af0b446ee3abef285aa3df41bfe88affe
|
7
|
+
data.tar.gz: 37ae03db04175312915da5cd8e9511f3db4f9c56dcebf87d3c3385fb2573b90dd832a73f1cb1a43dec3d3a9b25397a7ffebf4f1bc06898be209fc8a30e14caad
|
@@ -3,7 +3,7 @@ module Bulbasaur
|
|
3
3
|
class ExtractImagesFromHTML
|
4
4
|
|
5
5
|
CSS_IMPORT_URL_REGEX = /(?<=url\()['"]?.+?['"]?.+?(?=\))/
|
6
|
-
IMG_CANDIDATE_URL_REGEX = /https?:\/\/\S*\.(?:png|jpg|jpeg)(?!\.\S)/i
|
6
|
+
IMG_CANDIDATE_URL_REGEX = /https?:\/\/\S*\.(?:png|jpg|jpeg|gif)(?!\.\S)/i
|
7
7
|
|
8
8
|
def initialize(html)
|
9
9
|
@html = html
|
@@ -13,6 +13,7 @@ module Bulbasaur
|
|
13
13
|
images = Array.new
|
14
14
|
images = images + extract_images_by_tag_image
|
15
15
|
images = images + extract_images_by_tag_style
|
16
|
+
images = images + extract_images_by_link
|
16
17
|
images
|
17
18
|
end
|
18
19
|
|
@@ -36,6 +37,15 @@ module Bulbasaur
|
|
36
37
|
images
|
37
38
|
end
|
38
39
|
|
40
|
+
def extract_images_by_link
|
41
|
+
images = Array.new
|
42
|
+
Nokogiri::HTML(@html).xpath('//a').each do |link|
|
43
|
+
url = link.xpath('@href').text
|
44
|
+
images << create_struct(url) if url =~ IMG_CANDIDATE_URL_REGEX
|
45
|
+
end
|
46
|
+
images
|
47
|
+
end
|
48
|
+
|
39
49
|
def create_struct(url, alt=nil)
|
40
50
|
{url: url, alt: alt }
|
41
51
|
end
|
data/lib/bulbasaur/version.rb
CHANGED
@@ -60,6 +60,44 @@ RSpec.describe Bulbasaur::ExtractImagesFromHTML do
|
|
60
60
|
expect(subject.first[:alt]).to be_nil
|
61
61
|
end
|
62
62
|
end
|
63
|
+
|
64
|
+
context 'When sending HTML with a link pointing to an image' do
|
65
|
+
let(:html) do
|
66
|
+
'<p>
|
67
|
+
<a href="http://somewhere.to/get/the_original_image.jpg">
|
68
|
+
Click here to see the original image.
|
69
|
+
</a>
|
70
|
+
<a href="http://somewhere.to/get/the_original_image.jpg?width=400&height=400">
|
71
|
+
Click here to see the original image.
|
72
|
+
</a>
|
73
|
+
<a href="http://somewhere.to/go/to/another_page.html">
|
74
|
+
Click here to go to another page.
|
75
|
+
</a>
|
76
|
+
</p>'
|
77
|
+
end
|
78
|
+
|
79
|
+
it 'Does return an image array with 2 items' do
|
80
|
+
expect(subject.size).to eq 2
|
81
|
+
end
|
82
|
+
|
83
|
+
it 'Does return the image URL with parameters' do
|
84
|
+
expect(subject.last[:url]).to eq 'http://somewhere.to/get/the_original_image.jpg?width=400&height=400'
|
85
|
+
end
|
86
|
+
|
87
|
+
it 'Does return the image URL without parameters' do
|
88
|
+
expect(subject.first[:url]).to eq 'http://somewhere.to/get/the_original_image.jpg'
|
89
|
+
end
|
90
|
+
|
91
|
+
it 'Does return the image alt' do
|
92
|
+
expect(subject.first[:alt]).to be_nil
|
93
|
+
end
|
94
|
+
|
95
|
+
it 'Does not include links other than for images' do
|
96
|
+
expect(subject).to include Hash(url: 'http://somewhere.to/get/the_original_image.jpg', alt: nil)
|
97
|
+
expect(subject).to include Hash(url: 'http://somewhere.to/get/the_original_image.jpg?width=400&height=400', alt: nil)
|
98
|
+
expect(subject).not_to include Hash(url: 'http://somewhere.to/go/to/another_page.html', alt: nil)
|
99
|
+
end
|
100
|
+
end
|
63
101
|
|
64
102
|
context "When send html with many images" do
|
65
103
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: preadly-bulbasaur
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Magno Costa
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-08-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -126,7 +126,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
126
126
|
version: '0'
|
127
127
|
requirements: []
|
128
128
|
rubyforge_project:
|
129
|
-
rubygems_version: 2.
|
129
|
+
rubygems_version: 2.2.2
|
130
130
|
signing_key:
|
131
131
|
specification_version: 4
|
132
132
|
summary: Bulbasaur is a helper for crawler operations used in Pread.ly
|
@@ -140,4 +140,3 @@ test_files:
|
|
140
140
|
- spec/bulbasaur/utils/normalize_url_spec.rb
|
141
141
|
- spec/bulbasaur_spec.rb
|
142
142
|
- spec/spec_helper.rb
|
143
|
-
has_rdoc:
|