images_from_link 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/handler_link.rb +71 -0
- data/lib/images_from_link.rb +4 -3
- data/lib/images_from_link/version.rb +1 -1
- data/lib/{ImagesLink.rb → images_link.rb} +32 -21
- metadata +4 -4
- data/lib/HandlerLink.rb +0 -70
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 522fc31690af948adc4b51612c4d25d38b0c12f3
|
4
|
+
data.tar.gz: b26d5ef7ab8aa05bc43d00cf202d0d07e000987b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fbcd60da05ca7a14e0cd8de84b42cdb6fbed090e90533c81cd871e6c7eac54990f17f90b89baff426a41392bf0ee97bad1e79873c2d1ded1beb80266a1ce316a
|
7
|
+
data.tar.gz: 358f80bb52a5129d91947eead0cd68dab285f569d487b7b3752f311c8440f9b7c4640b1ca5a6c52badd6239898e139944e9d88e5869a84345b3a40bd9a06af80
|
data/README.md
CHANGED
@@ -28,7 +28,7 @@ ImagesFromLink.get_images('https://www.google.com')
|
|
28
28
|
=>
|
29
29
|
[
|
30
30
|
"https://www.google.com/textinputassistant/tia.png",
|
31
|
-
|
31
|
+
"https://www.google.com/images/nav_logo229.png",
|
32
32
|
"https://www.google.com/images/branding/googlelogo/1x/googlelogo_white_background_color_272x92dp.png"
|
33
33
|
]
|
34
34
|
```
|
data/lib/handler_link.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
module HandlerLink
|
2
|
+
FORMAT_IMG = ["jpg", "jpeg", "png", "gif"]
|
3
|
+
|
4
|
+
# gets the url, returns domain
|
5
|
+
def self.get_host_link(link)
|
6
|
+
uri = URI.parse(link)
|
7
|
+
"#{uri.scheme}://#{uri.host}"
|
8
|
+
end
|
9
|
+
|
10
|
+
def self.handler_links(array_links, link)
|
11
|
+
array_links.each do |url|
|
12
|
+
if url.include?("(/")
|
13
|
+
uri = get_host_link(link)
|
14
|
+
|
15
|
+
position = url.index("(")
|
16
|
+
url[position] += uri.to_s
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
# adds scheme if this href
|
22
|
+
def self.handler_prefix_link(host_link, link)
|
23
|
+
abort 'expect strings params' unless host_link.is_a?(String) || link.is_a?(String)
|
24
|
+
|
25
|
+
if link[0] == '/' && link[1] != '/'
|
26
|
+
host_link + link
|
27
|
+
elsif link[0..1] == '//'
|
28
|
+
uri = URI.parse(host_link)
|
29
|
+
"#{uri.scheme}:#{link}"
|
30
|
+
else
|
31
|
+
link
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def self.remove_unless_symbols(array_images_links)
|
36
|
+
array_images_links.each do |image_url|
|
37
|
+
if (image_url[0..3] != "http" || image_url[0..3] != "www.") && image_url.include?("(")
|
38
|
+
position = image_url.index("(")
|
39
|
+
image_url.reverse!
|
40
|
+
position.times { image_url.chop! }
|
41
|
+
image_url.reverse!
|
42
|
+
image_url.delete!("(,;'')")
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def self.remove_global_unless_symbols(array_images_links)
|
48
|
+
array_images_links.each { |link| link.delete!("(,;'')") }
|
49
|
+
end
|
50
|
+
|
51
|
+
# remove link if link not valid
|
52
|
+
def self.remove_unless_link(array_links)
|
53
|
+
|
54
|
+
array_links.each_with_index do |link, index|
|
55
|
+
array_links[index] = "" if link[0..3] != "http"
|
56
|
+
|
57
|
+
index_ending = nil
|
58
|
+
|
59
|
+
FORMAT_IMG.each do |i|
|
60
|
+
index_ending = i if link.include?(i)
|
61
|
+
end
|
62
|
+
|
63
|
+
unless index_ending == nil
|
64
|
+
position = link.index(index_ending)
|
65
|
+
array_links[index] = "" if (link[position + index_ending.size] =~ /[a-z]/)
|
66
|
+
end
|
67
|
+
|
68
|
+
end
|
69
|
+
array_links.delete("")
|
70
|
+
end
|
71
|
+
end
|
data/lib/images_from_link.rb
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
require "images_from_link/version"
|
2
|
-
require 'HandlerLink'
|
3
|
-
require 'ImagesLink'
|
4
2
|
|
5
3
|
module ImagesFromLink
|
6
|
-
|
4
|
+
require_relative 'handler_link'
|
5
|
+
require_relative 'images_link'
|
6
|
+
|
7
|
+
# extract images from got url
|
7
8
|
def self.get_images(link)
|
8
9
|
images_link = ImagesLink.new(link)
|
9
10
|
images_link.get_images_from_url
|
@@ -1,67 +1,78 @@
|
|
1
1
|
require 'httparty'
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'uri'
|
4
|
-
require_relative '
|
4
|
+
require_relative 'handler_link'
|
5
5
|
|
6
6
|
class ImagesLink
|
7
7
|
|
8
|
+
# expect valid url
|
8
9
|
def initialize(link)
|
9
10
|
@link = link
|
10
11
|
|
11
12
|
begin
|
12
13
|
@response = HTTParty.get(@link)
|
13
14
|
rescue Errno::ECONNREFUSED => e
|
14
|
-
puts "
|
15
|
+
puts "not valid url"
|
15
16
|
abort e.message
|
16
17
|
end
|
17
18
|
|
18
19
|
@doc = Nokogiri::HTML(@response.body)
|
19
20
|
@doc.search('//noscript').each { |node| node.remove } # убираю мешающие ноды
|
20
|
-
@
|
21
|
-
@handler_link = HandlerLink
|
21
|
+
@links = [] # сдесь будут храниться все урлы картинок
|
22
|
+
@handler_link = HandlerLink # обработчик урлов
|
22
23
|
@link_host_name = @handler_link.get_host_link(@link) # беру имя домена
|
23
24
|
end
|
24
25
|
|
25
|
-
#
|
26
|
+
# returns all found images url
|
26
27
|
def get_images_from_url
|
27
|
-
@
|
28
|
-
@handler_link.remove_global_unless_symbols(@
|
29
|
-
@
|
28
|
+
@links = (images_from_img_tag + images_from_link_tag + images_from_extension).uniq
|
29
|
+
@handler_link.remove_global_unless_symbols(@links)
|
30
|
+
@links
|
30
31
|
end
|
31
32
|
|
32
|
-
|
33
|
-
|
34
|
-
|
33
|
+
# returns all images url with tags img['src']
|
34
|
+
def images_from_img_tag
|
35
|
+
img_array = []
|
36
|
+
# пробегаю по тегам img (meta og:images...), хватаю урл и закидываю в @links
|
35
37
|
@doc.xpath('//img').each do |img|
|
36
38
|
if img['src'] != nil
|
37
|
-
|
38
|
-
src = @handler_link.remove_unless_symbols(
|
39
|
+
array = [img['src'].to_s]
|
40
|
+
src = @handler_link.remove_unless_symbols(array)
|
39
41
|
got_link = @handler_link.handler_prefix_link(@link_host_name, src.to_s.delete!("[\"]"))
|
40
42
|
|
41
|
-
|
43
|
+
img_array << got_link
|
42
44
|
end
|
43
45
|
end
|
44
46
|
|
47
|
+
img_array.uniq!
|
48
|
+
@handler_link.remove_unless_link(img_array)
|
49
|
+
img_array
|
50
|
+
end
|
51
|
+
|
52
|
+
# returns all images url with tags link['href']
|
53
|
+
def images_from_link_tag
|
54
|
+
img_array = []
|
45
55
|
@doc.xpath('//link').each do |link|
|
46
56
|
if link['href'] != nil && link['type'] != nil
|
47
57
|
if link['type'].include?("image")
|
48
58
|
got_link = @handler_link.handler_prefix_link(@link_host_name, link['href'])
|
49
|
-
|
59
|
+
img_array << got_link
|
50
60
|
end
|
51
61
|
end
|
52
62
|
end
|
53
63
|
|
54
|
-
|
55
|
-
@handler_link.remove_unless_link(
|
56
|
-
|
64
|
+
img_array.uniq!
|
65
|
+
@handler_link.remove_unless_link(img_array)
|
66
|
+
img_array
|
57
67
|
end
|
58
68
|
|
59
|
-
|
60
|
-
|
69
|
+
# returns all images url with jpg, png, gif...
|
70
|
+
def images_from_extension
|
71
|
+
# нахожу все урлы с jpg, png, gif...
|
61
72
|
@images_links = URI.extract(@doc.to_s.encode("UTF-16be", :invalid => :replace, :replace => "?").encode('UTF-8')).select { |l| l[/\.(?:gif|png|jpe?g)\b/] }
|
62
73
|
@handler_link.handler_links(@images_links, @link) # обрабатываю урлы
|
63
74
|
@handler_link.remove_unless_symbols(@images_links)
|
64
75
|
@handler_link.remove_unless_link(@images_links)
|
65
|
-
@images_links
|
76
|
+
@images_links.uniq
|
66
77
|
end
|
67
78
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: images_from_link
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- zerocool
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-03-
|
11
|
+
date: 2017-03-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -96,10 +96,10 @@ files:
|
|
96
96
|
- bin/console
|
97
97
|
- bin/setup
|
98
98
|
- images_from_link.gemspec
|
99
|
-
- lib/
|
100
|
-
- lib/ImagesLink.rb
|
99
|
+
- lib/handler_link.rb
|
101
100
|
- lib/images_from_link.rb
|
102
101
|
- lib/images_from_link/version.rb
|
102
|
+
- lib/images_link.rb
|
103
103
|
homepage: https://github.com/exwarvlad/images_from_link
|
104
104
|
licenses:
|
105
105
|
- MIT
|
data/lib/HandlerLink.rb
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
class HandlerLink
|
2
|
-
FORMAT_IMG = ["jpg", "jpeg", "png", "gif"]
|
3
|
-
|
4
|
-
# получает урл - отдает домен
|
5
|
-
def get_host_link(link)
|
6
|
-
uri = URI.parse(link)
|
7
|
-
"#{uri.scheme}://" + uri.host
|
8
|
-
end
|
9
|
-
|
10
|
-
def handler_links(arra_links, link)
|
11
|
-
arra_links.each do |url|
|
12
|
-
if url.include?("(/")
|
13
|
-
uri = get_host_link(link)
|
14
|
-
|
15
|
-
position = url.index("(")
|
16
|
-
url[position] += uri.to_s
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
# добавляет scheme, если это href
|
22
|
-
def handler_prefix_link(host_link, link)
|
23
|
-
abort 'в качестве аргументов передайте строки' unless host_link.is_a?(String) || link.is_a?(String)
|
24
|
-
|
25
|
-
if link[0] == '/' && link[1] != '/'
|
26
|
-
host_link + link
|
27
|
-
elsif link[0..1] == '//'
|
28
|
-
uri = URI.parse(host_link)
|
29
|
-
"#{uri.scheme}:#{link}"
|
30
|
-
else
|
31
|
-
link
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
def remove_unless_symbols(arra_images_links)
|
36
|
-
arra_images_links.each do |image_url|
|
37
|
-
if (image_url[0..3] != "http" || image_url[0..3] != "www.") && image_url.include?("(")
|
38
|
-
position = image_url.index("(")
|
39
|
-
image_url.reverse!
|
40
|
-
position.times { image_url.chop! }
|
41
|
-
image_url.reverse!
|
42
|
-
image_url.delete!("(,;'')")
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
def remove_global_unless_symbols(arra_images_links)
|
48
|
-
arra_images_links.each { |link| link.delete!("(,;'')") }
|
49
|
-
end
|
50
|
-
|
51
|
-
def remove_unless_link(arra_link)
|
52
|
-
|
53
|
-
arra_link.each_with_index do |link, index|
|
54
|
-
arra_link[index] = "" if link[0..3] != "http"
|
55
|
-
|
56
|
-
index_ending = nil
|
57
|
-
|
58
|
-
FORMAT_IMG.each do |i|
|
59
|
-
index_ending = i if link.include?(i)
|
60
|
-
end
|
61
|
-
|
62
|
-
unless index_ending == nil
|
63
|
-
position = link.index(index_ending)
|
64
|
-
arra_link[index] = "" if (link[position + index_ending.size] =~ /[a-z]/)
|
65
|
-
end
|
66
|
-
|
67
|
-
end
|
68
|
-
arra_link.delete("")
|
69
|
-
end
|
70
|
-
end
|