images_from_link 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/handler_link.rb +71 -0
- data/lib/images_from_link.rb +4 -3
- data/lib/images_from_link/version.rb +1 -1
- data/lib/{ImagesLink.rb → images_link.rb} +32 -21
- metadata +4 -4
- data/lib/HandlerLink.rb +0 -70
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 522fc31690af948adc4b51612c4d25d38b0c12f3
|
4
|
+
data.tar.gz: b26d5ef7ab8aa05bc43d00cf202d0d07e000987b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fbcd60da05ca7a14e0cd8de84b42cdb6fbed090e90533c81cd871e6c7eac54990f17f90b89baff426a41392bf0ee97bad1e79873c2d1ded1beb80266a1ce316a
|
7
|
+
data.tar.gz: 358f80bb52a5129d91947eead0cd68dab285f569d487b7b3752f311c8440f9b7c4640b1ca5a6c52badd6239898e139944e9d88e5869a84345b3a40bd9a06af80
|
data/README.md
CHANGED
@@ -28,7 +28,7 @@ ImagesFromLink.get_images('https://www.google.com')
|
|
28
28
|
=>
|
29
29
|
[
|
30
30
|
"https://www.google.com/textinputassistant/tia.png",
|
31
|
-
|
31
|
+
"https://www.google.com/images/nav_logo229.png",
|
32
32
|
"https://www.google.com/images/branding/googlelogo/1x/googlelogo_white_background_color_272x92dp.png"
|
33
33
|
]
|
34
34
|
```
|
data/lib/handler_link.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
module HandlerLink
|
2
|
+
FORMAT_IMG = ["jpg", "jpeg", "png", "gif"]
|
3
|
+
|
4
|
+
# gets the url, returns domain
|
5
|
+
def self.get_host_link(link)
|
6
|
+
uri = URI.parse(link)
|
7
|
+
"#{uri.scheme}://#{uri.host}"
|
8
|
+
end
|
9
|
+
|
10
|
+
def self.handler_links(array_links, link)
|
11
|
+
array_links.each do |url|
|
12
|
+
if url.include?("(/")
|
13
|
+
uri = get_host_link(link)
|
14
|
+
|
15
|
+
position = url.index("(")
|
16
|
+
url[position] += uri.to_s
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
# adds scheme if this href
|
22
|
+
def self.handler_prefix_link(host_link, link)
|
23
|
+
abort 'expect strings params' unless host_link.is_a?(String) || link.is_a?(String)
|
24
|
+
|
25
|
+
if link[0] == '/' && link[1] != '/'
|
26
|
+
host_link + link
|
27
|
+
elsif link[0..1] == '//'
|
28
|
+
uri = URI.parse(host_link)
|
29
|
+
"#{uri.scheme}:#{link}"
|
30
|
+
else
|
31
|
+
link
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def self.remove_unless_symbols(array_images_links)
|
36
|
+
array_images_links.each do |image_url|
|
37
|
+
if (image_url[0..3] != "http" || image_url[0..3] != "www.") && image_url.include?("(")
|
38
|
+
position = image_url.index("(")
|
39
|
+
image_url.reverse!
|
40
|
+
position.times { image_url.chop! }
|
41
|
+
image_url.reverse!
|
42
|
+
image_url.delete!("(,;'')")
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def self.remove_global_unless_symbols(array_images_links)
|
48
|
+
array_images_links.each { |link| link.delete!("(,;'')") }
|
49
|
+
end
|
50
|
+
|
51
|
+
# remove link if link not valid
|
52
|
+
def self.remove_unless_link(array_links)
|
53
|
+
|
54
|
+
array_links.each_with_index do |link, index|
|
55
|
+
array_links[index] = "" if link[0..3] != "http"
|
56
|
+
|
57
|
+
index_ending = nil
|
58
|
+
|
59
|
+
FORMAT_IMG.each do |i|
|
60
|
+
index_ending = i if link.include?(i)
|
61
|
+
end
|
62
|
+
|
63
|
+
unless index_ending == nil
|
64
|
+
position = link.index(index_ending)
|
65
|
+
array_links[index] = "" if (link[position + index_ending.size] =~ /[a-z]/)
|
66
|
+
end
|
67
|
+
|
68
|
+
end
|
69
|
+
array_links.delete("")
|
70
|
+
end
|
71
|
+
end
|
data/lib/images_from_link.rb
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
require "images_from_link/version"
|
2
|
-
require 'HandlerLink'
|
3
|
-
require 'ImagesLink'
|
4
2
|
|
5
3
|
module ImagesFromLink
|
6
|
-
|
4
|
+
require_relative 'handler_link'
|
5
|
+
require_relative 'images_link'
|
6
|
+
|
7
|
+
# extract images from got url
|
7
8
|
def self.get_images(link)
|
8
9
|
images_link = ImagesLink.new(link)
|
9
10
|
images_link.get_images_from_url
|
@@ -1,67 +1,78 @@
|
|
1
1
|
require 'httparty'
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'uri'
|
4
|
-
require_relative 'HandlerLink'
|
4
|
+
require_relative 'handler_link'
|
5
5
|
|
6
6
|
class ImagesLink
|
7
7
|
|
8
|
+
# expect valid url
|
8
9
|
def initialize(link)
|
9
10
|
@link = link
|
10
11
|
|
11
12
|
begin
|
12
13
|
@response = HTTParty.get(@link)
|
13
14
|
rescue Errno::ECONNREFUSED => e
|
14
|
-
puts "
|
15
|
+
puts "not valid url"
|
15
16
|
abort e.message
|
16
17
|
end
|
17
18
|
|
18
19
|
@doc = Nokogiri::HTML(@response.body)
|
19
20
|
@doc.search('//noscript').each { |node| node.remove } # убираю мешающие ноды
|
20
|
-
@
|
21
|
-
@handler_link = HandlerLink
|
21
|
+
@links = [] # сдесь будут храниться все урлы картинок
|
22
|
+
@handler_link = HandlerLink # обработчик урлов
|
22
23
|
@link_host_name = @handler_link.get_host_link(@link) # беру имя домена
|
23
24
|
end
|
24
25
|
|
25
|
-
#
|
26
|
+
# returns all found images url
|
26
27
|
def get_images_from_url
|
27
|
-
@
|
28
|
-
@handler_link.remove_global_unless_symbols(@
|
29
|
-
@
|
28
|
+
@links = (images_from_img_tag + images_from_link_tag + images_from_extension).uniq
|
29
|
+
@handler_link.remove_global_unless_symbols(@links)
|
30
|
+
@links
|
30
31
|
end
|
31
32
|
|
32
|
-
|
33
|
-
|
34
|
-
|
33
|
+
# returns all images url with tags img['src']
|
34
|
+
def images_from_img_tag
|
35
|
+
img_array = []
|
36
|
+
# пробегаю по тегам img (meta og:images...), хватаю урл и закидываю в @links
|
35
37
|
@doc.xpath('//img').each do |img|
|
36
38
|
if img['src'] != nil
|
37
|
-
|
38
|
-
src = @handler_link.remove_unless_symbols(
|
39
|
+
array = [img['src'].to_s]
|
40
|
+
src = @handler_link.remove_unless_symbols(array)
|
39
41
|
got_link = @handler_link.handler_prefix_link(@link_host_name, src.to_s.delete!("[\"]"))
|
40
42
|
|
41
|
-
|
43
|
+
img_array << got_link
|
42
44
|
end
|
43
45
|
end
|
44
46
|
|
47
|
+
img_array.uniq!
|
48
|
+
@handler_link.remove_unless_link(img_array)
|
49
|
+
img_array
|
50
|
+
end
|
51
|
+
|
52
|
+
# returns all images url with tags link['href']
|
53
|
+
def images_from_link_tag
|
54
|
+
img_array = []
|
45
55
|
@doc.xpath('//link').each do |link|
|
46
56
|
if link['href'] != nil && link['type'] != nil
|
47
57
|
if link['type'].include?("image")
|
48
58
|
got_link = @handler_link.handler_prefix_link(@link_host_name, link['href'])
|
49
|
-
|
59
|
+
img_array << got_link
|
50
60
|
end
|
51
61
|
end
|
52
62
|
end
|
53
63
|
|
54
|
-
|
55
|
-
@handler_link.remove_unless_link(
|
56
|
-
|
64
|
+
img_array.uniq!
|
65
|
+
@handler_link.remove_unless_link(img_array)
|
66
|
+
img_array
|
57
67
|
end
|
58
68
|
|
59
|
-
|
60
|
-
|
69
|
+
# returns all images url with jpg, png, gif...
|
70
|
+
def images_from_extension
|
71
|
+
# нахожу все урлы с jpg, png, gif...
|
61
72
|
@images_links = URI.extract(@doc.to_s.encode("UTF-16be", :invalid => :replace, :replace => "?").encode('UTF-8')).select { |l| l[/\.(?:gif|png|jpe?g)\b/] }
|
62
73
|
@handler_link.handler_links(@images_links, @link) # обрабатываю урлы
|
63
74
|
@handler_link.remove_unless_symbols(@images_links)
|
64
75
|
@handler_link.remove_unless_link(@images_links)
|
65
|
-
@images_links
|
76
|
+
@images_links.uniq
|
66
77
|
end
|
67
78
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: images_from_link
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- zerocool
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-03-
|
11
|
+
date: 2017-03-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -96,10 +96,10 @@ files:
|
|
96
96
|
- bin/console
|
97
97
|
- bin/setup
|
98
98
|
- images_from_link.gemspec
|
99
|
-
- lib/HandlerLink.rb
|
100
|
-
- lib/ImagesLink.rb
|
99
|
+
- lib/handler_link.rb
|
101
100
|
- lib/images_from_link.rb
|
102
101
|
- lib/images_from_link/version.rb
|
102
|
+
- lib/images_link.rb
|
103
103
|
homepage: https://github.com/exwarvlad/images_from_link
|
104
104
|
licenses:
|
105
105
|
- MIT
|
data/lib/HandlerLink.rb
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
class HandlerLink
|
2
|
-
FORMAT_IMG = ["jpg", "jpeg", "png", "gif"]
|
3
|
-
|
4
|
-
# получает урл - отдает домен
|
5
|
-
def get_host_link(link)
|
6
|
-
uri = URI.parse(link)
|
7
|
-
"#{uri.scheme}://" + uri.host
|
8
|
-
end
|
9
|
-
|
10
|
-
def handler_links(arra_links, link)
|
11
|
-
arra_links.each do |url|
|
12
|
-
if url.include?("(/")
|
13
|
-
uri = get_host_link(link)
|
14
|
-
|
15
|
-
position = url.index("(")
|
16
|
-
url[position] += uri.to_s
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
# добавляет scheme, если это href
|
22
|
-
def handler_prefix_link(host_link, link)
|
23
|
-
abort 'в качестве аргументов передайте строки' unless host_link.is_a?(String) || link.is_a?(String)
|
24
|
-
|
25
|
-
if link[0] == '/' && link[1] != '/'
|
26
|
-
host_link + link
|
27
|
-
elsif link[0..1] == '//'
|
28
|
-
uri = URI.parse(host_link)
|
29
|
-
"#{uri.scheme}:#{link}"
|
30
|
-
else
|
31
|
-
link
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
def remove_unless_symbols(arra_images_links)
|
36
|
-
arra_images_links.each do |image_url|
|
37
|
-
if (image_url[0..3] != "http" || image_url[0..3] != "www.") && image_url.include?("(")
|
38
|
-
position = image_url.index("(")
|
39
|
-
image_url.reverse!
|
40
|
-
position.times { image_url.chop! }
|
41
|
-
image_url.reverse!
|
42
|
-
image_url.delete!("(,;'')")
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
def remove_global_unless_symbols(arra_images_links)
|
48
|
-
arra_images_links.each { |link| link.delete!("(,;'')") }
|
49
|
-
end
|
50
|
-
|
51
|
-
def remove_unless_link(arra_link)
|
52
|
-
|
53
|
-
arra_link.each_with_index do |link, index|
|
54
|
-
arra_link[index] = "" if link[0..3] != "http"
|
55
|
-
|
56
|
-
index_ending = nil
|
57
|
-
|
58
|
-
FORMAT_IMG.each do |i|
|
59
|
-
index_ending = i if link.include?(i)
|
60
|
-
end
|
61
|
-
|
62
|
-
unless index_ending == nil
|
63
|
-
position = link.index(index_ending)
|
64
|
-
arra_link[index] = "" if (link[position + index_ending.size] =~ /[a-z]/)
|
65
|
-
end
|
66
|
-
|
67
|
-
end
|
68
|
-
arra_link.delete("")
|
69
|
-
end
|
70
|
-
end
|