link_thumbnailer 3.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/.travis.yml +5 -0
- data/CHANGELOG.md +334 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +22 -0
- data/README.md +210 -0
- data/Rakefile +9 -0
- data/lib/generators/link_thumbnailer/install_generator.rb +17 -0
- data/lib/generators/templates/initializer.rb +89 -0
- data/lib/link_thumbnailer.rb +38 -0
- data/lib/link_thumbnailer/configuration.rb +72 -0
- data/lib/link_thumbnailer/exceptions.rb +11 -0
- data/lib/link_thumbnailer/grader.rb +43 -0
- data/lib/link_thumbnailer/graders/base.rb +39 -0
- data/lib/link_thumbnailer/graders/html_attribute.rb +48 -0
- data/lib/link_thumbnailer/graders/length.rb +37 -0
- data/lib/link_thumbnailer/graders/link_density.rb +20 -0
- data/lib/link_thumbnailer/graders/position.rb +13 -0
- data/lib/link_thumbnailer/image_comparator.rb +26 -0
- data/lib/link_thumbnailer/image_comparators/base.rb +19 -0
- data/lib/link_thumbnailer/image_comparators/size.rb +13 -0
- data/lib/link_thumbnailer/image_parser.rb +62 -0
- data/lib/link_thumbnailer/image_validator.rb +32 -0
- data/lib/link_thumbnailer/model.rb +20 -0
- data/lib/link_thumbnailer/models/description.rb +37 -0
- data/lib/link_thumbnailer/models/favicon.rb +27 -0
- data/lib/link_thumbnailer/models/image.rb +56 -0
- data/lib/link_thumbnailer/models/title.rb +22 -0
- data/lib/link_thumbnailer/models/video.rb +44 -0
- data/lib/link_thumbnailer/models/website.rb +54 -0
- data/lib/link_thumbnailer/page.rb +43 -0
- data/lib/link_thumbnailer/parser.rb +15 -0
- data/lib/link_thumbnailer/processor.rb +128 -0
- data/lib/link_thumbnailer/railtie.rb +6 -0
- data/lib/link_thumbnailer/response.rb +39 -0
- data/lib/link_thumbnailer/scraper.rb +62 -0
- data/lib/link_thumbnailer/scrapers/base.rb +69 -0
- data/lib/link_thumbnailer/scrapers/default/base.rb +12 -0
- data/lib/link_thumbnailer/scrapers/default/description.rb +49 -0
- data/lib/link_thumbnailer/scrapers/default/favicon.rb +38 -0
- data/lib/link_thumbnailer/scrapers/default/images.rb +78 -0
- data/lib/link_thumbnailer/scrapers/default/title.rb +27 -0
- data/lib/link_thumbnailer/scrapers/default/videos.rb +18 -0
- data/lib/link_thumbnailer/scrapers/opengraph/base.rb +45 -0
- data/lib/link_thumbnailer/scrapers/opengraph/description.rb +12 -0
- data/lib/link_thumbnailer/scrapers/opengraph/favicon.rb +17 -0
- data/lib/link_thumbnailer/scrapers/opengraph/image.rb +107 -0
- data/lib/link_thumbnailer/scrapers/opengraph/images.rb +18 -0
- data/lib/link_thumbnailer/scrapers/opengraph/title.rb +12 -0
- data/lib/link_thumbnailer/scrapers/opengraph/video.rb +115 -0
- data/lib/link_thumbnailer/scrapers/opengraph/videos.rb +18 -0
- data/lib/link_thumbnailer/uri.rb +20 -0
- data/lib/link_thumbnailer/version.rb +5 -0
- data/lib/link_thumbnailer/video_parser.rb +47 -0
- data/link_thumbnailer.gemspec +29 -0
- data/spec/configuration_spec.rb +61 -0
- data/spec/fixture_spec.rb +114 -0
- data/spec/fixtures/bar.png +2907 -0
- data/spec/fixtures/default_from_body.html +13 -0
- data/spec/fixtures/default_from_meta.html +12 -0
- data/spec/fixtures/foo.png +0 -0
- data/spec/fixtures/google_shift_jis.html +6 -0
- data/spec/fixtures/google_utf8.html +6 -0
- data/spec/fixtures/og_not_valid_example.html +12 -0
- data/spec/fixtures/og_valid_example.html +18 -0
- data/spec/fixtures/og_valid_multi_image_example.html +13 -0
- data/spec/fixtures/og_valid_multi_video_example.html +13 -0
- data/spec/grader_spec.rb +27 -0
- data/spec/graders/base_spec.rb +14 -0
- data/spec/graders/html_attribute_spec.rb +50 -0
- data/spec/graders/length_spec.rb +93 -0
- data/spec/graders/link_density_spec.rb +52 -0
- data/spec/graders/position_spec.rb +49 -0
- data/spec/image_comparators/size_spec.rb +58 -0
- data/spec/image_validator_spec.rb +37 -0
- data/spec/model_spec.rb +27 -0
- data/spec/models/description_spec.rb +66 -0
- data/spec/models/favicon_spec.rb +12 -0
- data/spec/models/image_spec.rb +95 -0
- data/spec/models/title_spec.rb +26 -0
- data/spec/models/video_spec.rb +49 -0
- data/spec/models/website_spec.rb +51 -0
- data/spec/page_spec.rb +28 -0
- data/spec/processor_spec.rb +410 -0
- data/spec/response_spec.rb +62 -0
- data/spec/scraper_spec.rb +70 -0
- data/spec/scrapers/base_spec.rb +69 -0
- data/spec/scrapers/opengraph/base_spec.rb +96 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/uri_spec.rb +44 -0
- data/spec/video_parser_spec.rb +148 -0
- metadata +271 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'nokogiri'
|
|
4
|
+
|
|
5
|
+
module LinkThumbnailer
  # Builds an HTML document from raw page source using Nokogiri.
  class Parser
    # Parses +source+ with Nokogiri::HTML, passing the encoding read from
    # the current page configuration.
    #
    # @param source the markup handed to Nokogiri::HTML
    # @return the value returned by Nokogiri::HTML for +source+
    # @raise [LinkThumbnailer::SyntaxError] when Nokogiri raises
    #   Nokogiri::XML::SyntaxError; the original message is preserved
    def call(source)
      encoding = LinkThumbnailer.page.config.encoding

      ::Nokogiri::HTML(source, nil, encoding)
    rescue ::Nokogiri::XML::SyntaxError => error
      raise ::LinkThumbnailer::SyntaxError, error.message
    end
  end
end
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'delegate'
|
|
4
|
+
require 'uri'
|
|
5
|
+
require 'net/http/persistent'
|
|
6
|
+
|
|
7
|
+
module LinkThumbnailer
  # Fetches the body of a URL over HTTP, following redirects up to the
  # configured limit. Methods not defined here are delegated to the
  # configuration object (see SimpleDelegator).
  class Processor < ::SimpleDelegator

    # Content types accepted when `raise_on_invalid_format` is enabled.
    # Media types are case-insensitive, hence the `i` flag.
    SUPPORTED_CONTENT_TYPES = %r{
      text/html | application/html | application/xhtml\+xml |
      application/xml | text/xml | text/plain
    }xi.freeze

    # `url=` is defined as a private writer at the bottom of the class, so
    # only readers are declared here.
    attr_reader :url, :config, :http, :redirect_count

    def initialize
      @config = ::LinkThumbnailer.page.config
      @http   = ::Net::HTTP::Persistent.new

      super(config)
    end

    # Requests +url+ and returns the response body.
    #
    # @param url target URL; it is parsed with URI.parse(url.to_s)
    # @param redirect_count number of redirects followed so far
    # @param headers extra request headers copied onto the HTTP client
    # @return the body produced by LinkThumbnailer::Response for a success
    # @raise [LinkThumbnailer::RedirectLimit] when +redirect_count+ exceeds
    #   the configured limit
    # @raise [LinkThumbnailer::BadUriFormat] when the URL is not a URI::HTTP
    # @raise [LinkThumbnailer::FormatNotSupported] when the content type is
    #   rejected by #valid_response_format?
    # @raise [LinkThumbnailer::HTTPError] wrapping Net::HTTPExceptions,
    #   SocketError and Timeout::Error
    def call(url = '', redirect_count = 0, headers = {})
      self.url        = url
      @redirect_count = redirect_count

      raise ::LinkThumbnailer::RedirectLimit if too_many_redirections?

      with_valid_url do
        set_http_headers(headers)
        set_http_options
        perform_request
      end
    rescue ::Net::HTTPExceptions, ::SocketError, ::Timeout::Error => e
      raise ::LinkThumbnailer::HTTPError, e.message
    end

    private

    # Yields only when the current URL is a URI::HTTP instance.
    def with_valid_url
      raise ::LinkThumbnailer::BadUriFormat unless valid_url_format?
      yield if block_given?
    end

    # Copies per-call headers, then applies the configured User-Agent and
    # any configured override headers.
    def set_http_headers(headers = {})
      headers.each { |k, v| http.headers[k] = v }
      http.override_headers['User-Agent'] = user_agent
      config.http_override_headers.each { |k, v| http.override_headers[k] = v }
    end

    def set_http_options
      http.verify_mode  = ::OpenSSL::SSL::VERIFY_NONE unless ssl_required?
      http.open_timeout = http_open_timeout
      http.read_timeout = http_read_timeout
      http.proxy = :ENV
    end

    # Performs the request for the current URL. A success returns the body,
    # a redirection re-enters #call with the resolved Location, and any
    # other response raises through `response.error!`.
    def perform_request
      response          = http.request(url)
      headers           = {}
      # The Set-Cookie value is forwarded as the Cookie header of the next hop.
      headers['Cookie'] = response['Set-Cookie'] if response['Set-Cookie'].present?

      raise ::LinkThumbnailer::FormatNotSupported, response['Content-Type'] unless valid_response_format?(response)

      case response
      when ::Net::HTTPSuccess
        Response.new(response).body
      when ::Net::HTTPRedirection
        call(
          resolve_relative_url(response['location'].to_s),
          redirect_count + 1,
          headers
        )
      else
        response.error!
      end
    end

    # Returns +location+ untouched when it already starts with "http",
    # otherwise resolves it against the URL that was just requested.
    def resolve_relative_url(location)
      location.start_with?('http') ? location : build_absolute_url_for(location)
    end

    # Resolves +relative_url+ against the current URL so that the port is
    # kept and path-relative references ("foo" as well as "/foo") work.
    #
    # @return [URI] the absolute redirect target
    def build_absolute_url_for(relative_url)
      # An empty Location keeps pointing at the site root, as it always did,
      # rather than at the page that issued the redirect.
      relative_url = '/' if relative_url.empty?

      url.merge(relative_url)
    end

    def redirect_limit
      config.redirect_limit
    end

    def user_agent
      config.user_agent
    end

    def http_open_timeout
      config.http_open_timeout
    end

    def http_read_timeout
      config.http_read_timeout
    end

    def ssl_required?
      config.verify_ssl
    end

    def too_many_redirections?
      redirect_count > redirect_limit
    end

    # URI::HTTPS inherits from URI::HTTP, so both schemes pass.
    def valid_url_format?
      url.is_a?(::URI::HTTP)
    end

    # Always true unless `raise_on_invalid_format` is enabled; otherwise
    # true only when the Content-Type header names a supported type.
    def valid_response_format?(response)
      return true unless config.raise_on_invalid_format

      content_type = response['Content-Type'].to_s
      !(content_type =~ SUPPORTED_CONTENT_TYPES).nil?
    end

    def url=(url)
      @url = ::URI.parse(url.to_s)
    end

  end
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LinkThumbnailer
  # Wraps an HTTP response and exposes its body transcoded to UTF-8 when the
  # Content-Type header announces another charset.
  class Response
    # Captures the charset parameter of a Content-Type header. Charset names
    # routinely contain "-" (utf-8, iso-8859-1), so a plain \w+ is not
    # enough; the value may also be quoted.
    CHARSET_PATTERN = /charset\s*=\s*["']?([\w.:+-]+)/i.freeze

    # @param response an object answering `[]` with header values and `body`
    def initialize(response)
      @response = response
    end

    # @return [String] the charset exactly as written in the Content-Type
    #   header, or '' when the header or the parameter is missing
    def charset
      @charset ||= extract_charset
    end

    # @return the response body, converted to UTF-8 when a non UTF-8 charset
    #   was announced and the conversion succeeds; the raw body otherwise
    def body
      @body ||= extract_body
    end

    private

    def extract_charset
      content_type = @response['Content-Type'] || ''
      match = content_type.match(CHARSET_PATTERN)
      match ? match[1] : ''
    end

    def extract_body
      should_convert_body_to_utf8? ? convert_encoding_to_utf8(@response.body, charset) : @response.body
    end

    # Charset names are case-insensitive, so "UTF-8" needs no conversion.
    def should_convert_body_to_utf8?
      !charset.empty? && charset.downcase != 'utf-8'
    end

    # Best effort: unknown charsets and invalid byte sequences (all
    # EncodingError subclasses) fall back to the untouched body.
    def convert_encoding_to_utf8(body, from)
      Encoding::Converter.new(from, 'utf-8').convert(body)
    rescue EncodingError
      body
    end
  end
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'delegate'
|
|
4
|
+
require 'active_support/core_ext/object/blank'
|
|
5
|
+
require 'active_support/inflector'
|
|
6
|
+
|
|
7
|
+
require 'link_thumbnailer/parser'
|
|
8
|
+
require 'link_thumbnailer/models/website'
|
|
9
|
+
require 'link_thumbnailer/scrapers/default/title'
|
|
10
|
+
require 'link_thumbnailer/scrapers/opengraph/title'
|
|
11
|
+
require 'link_thumbnailer/scrapers/default/description'
|
|
12
|
+
require 'link_thumbnailer/scrapers/opengraph/description'
|
|
13
|
+
require 'link_thumbnailer/scrapers/default/images'
|
|
14
|
+
require 'link_thumbnailer/scrapers/opengraph/images'
|
|
15
|
+
require 'link_thumbnailer/scrapers/default/videos'
|
|
16
|
+
require 'link_thumbnailer/scrapers/opengraph/videos'
|
|
17
|
+
require 'link_thumbnailer/scrapers/default/favicon'
|
|
18
|
+
require 'link_thumbnailer/scrapers/opengraph/favicon'
|
|
19
|
+
|
|
20
|
+
module LinkThumbnailer
  # Runs the configured scrapers over a parsed page and collects the results
  # in a Models::Website. Unknown methods are delegated to the configuration.
  class Scraper < ::SimpleDelegator

    attr_reader :document, :source, :url, :config, :website

    # @param source raw page source, handed to LinkThumbnailer::Parser
    # @param url the page URL, stored on the resulting website model
    def initialize(source, url)
      @source   = source
      @url      = url
      @config   = ::LinkThumbnailer.page.config
      @document = parser.call(source)
      @website  = ::LinkThumbnailer::Models::Website.new
      website.url = url

      super(config)
    end

    # Fills every configured attribute of the website model.
    #
    # @return the populated website model
    # @raise [LinkThumbnailer::ScraperInvalid] when a configured scraper
    #   class cannot be resolved
    def call
      config.attributes.each { |attribute| scrape_attribute(attribute) }

      website
    end

    private

    # Tries the configured scrapers in order for one attribute and stops at
    # the first one that leaves a non-blank value on the website.
    def scrape_attribute(attribute)
      config.scrapers.each do |scraper_prefix|
        scraper = scraper_class(scraper_prefix, attribute).new(document, website)
        scraper.call(attribute.to_s)

        return unless website.send(attribute).blank?
      end
    end

    # Resolves "::LinkThumbnailer::Scrapers::<Prefix>::<Name>" to a class.
    def scraper_class(prefix, name)
      prefix = "::LinkThumbnailer::Scrapers::#{prefix.to_s.camelize}"
      name   = name.to_s.camelize
      [prefix, name].join('::').constantize
    rescue NameError
      raise ::LinkThumbnailer::ScraperInvalid, "scraper named '#{prefix}::#{name}' does not exists."
    end

    def parser
      ::LinkThumbnailer::Parser.new
    end

  end
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'delegate'
|
|
4
|
+
require 'link_thumbnailer/models/title'
|
|
5
|
+
require 'link_thumbnailer/models/description'
|
|
6
|
+
require 'link_thumbnailer/models/image'
|
|
7
|
+
require 'link_thumbnailer/models/video'
|
|
8
|
+
|
|
9
|
+
module LinkThumbnailer
  module Scrapers
    # Common behaviour for attribute scrapers: subclasses provide #value
    # (and optionally #applicable?) and #call stores the result on the
    # website model. Unknown methods are delegated to the configuration.
    class Base < ::SimpleDelegator

      attr_reader :config, :document, :website, :attribute_name

      # @param document the parsed page queried by the scraper
      # @param website the model receiving the scraped value (may be nil)
      def initialize(document, website = nil)
        @config   = ::LinkThumbnailer.page.config
        @document = document
        @website  = website

        super(config)
      end

      # Assigns #value to the `<attribute_name>=` writer of the website.
      #
      # @param attribute_name name of the website attribute to fill
      # @return the website, or false when there is no website or the
      #   scraper is not applicable
      def call(attribute_name)
        return false unless website.present? && applicable?

        @attribute_name = attribute_name
        website.send("#{attribute_name}=", value)

        website
      end

      # Hook for subclasses; the base scraper always applies.
      def applicable?
        true
      end

      # Must be implemented by subclasses.
      #
      # @raise [NotImplementedError] always, in the base class
      def value
        raise NotImplementedError
      end

      private

      # First node returned by #meta_xpaths for the same options.
      def meta_xpath(options = {})
        meta_xpaths(options).first
      end

      # Finds <meta> nodes whose key attribute, lower-cased through XPath
      # translate(), equals the wanted attribute name and whose value
      # attribute is not empty.
      #
      # Options: :key (default :property), :value (default :content) and
      # :attribute (default: the current attribute name).
      def meta_xpaths(options = {})
        key_attr     = options.fetch(:key, :property)
        content_attr = options.fetch(:value, :content)
        wanted       = options.fetch(:attribute, attribute_name)
        lowercased   = "translate(@#{key_attr},'#{abc.upcase}','#{abc}')"

        document.xpath("//meta[#{lowercased} = '#{wanted}' and string-length(@#{content_attr}) > 0]")
      end

      # Lower-case ASCII alphabet used to build the translate() call.
      def abc
        'abcdefghijklmnopqrstuvwxyz'
      end

      # Model class named after the attribute being scraped.
      def model_class
        "::LinkThumbnailer::Models::#{attribute_name.to_s.camelize}".constantize
      end

      def modelize(node, text = nil)
        model_class.new(node, text)
      end

    end
  end
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'link_thumbnailer/scrapers/default/base'
|
|
4
|
+
|
|
5
|
+
module LinkThumbnailer
  module Scrapers
    module Default
      # Extracts a description from the `description` meta tag or, failing
      # that, from the <p>/<td> nodes of the document.
      class Description < ::LinkThumbnailer::Scrapers::Default::Base

        # @return [String, nil] the string form of the meta model when a
        #   meta node exists, otherwise of the last body model after
        #   sorting, or nil when neither is available
        def value
          # Build each model once; the body is only inspected when the meta
          # tag yields nothing.
          model = model_from_meta || model_from_body
          model.to_s if model
        end

        private

        def model_from_meta
          meta = node_from_meta
          modelize(meta, meta.attributes['content'].value) if meta
        end

        # Models every candidate node together with its position and keeps
        # the one that sorts last.
        def model_from_body
          nodes_from_body.each_with_index.map { |node, i| modelize(node, node.text, i) }.sort.last
        end

        def node_from_meta
          @node_from_meta ||= meta_xpath(key: :name)
        end

        # Memoized: #modelize needs the candidate count for every node, and
        # re-running the CSS query each time made scraping quadratic in the
        # number of candidates.
        def nodes_from_body
          @nodes_from_body ||= candidates.select { |node| valid_paragraph?(node) }
        end

        # Accepts every candidate node.
        def valid_paragraph?(node)
          true
        end

        def candidates
          document.css('p,td')
        end

        # @param node the source node
        # @param text the text handed to the model
        # @param i position of the node among the body candidates
        def modelize(node, text, i = 0)
          model_class.new(node, text, i, nodes_from_body.count)
        end

      end
    end
  end
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'link_thumbnailer/scrapers/default/base'
|
|
4
|
+
require 'link_thumbnailer/models/favicon'
|
|
5
|
+
|
|
6
|
+
module LinkThumbnailer
  module Scrapers
    module Default
      # Extracts the favicon URL from the first <link> element whose `rel`
      # attribute contains "icon".
      class Favicon < ::LinkThumbnailer::Scrapers::Default::Base

        # @return [String] string form of the favicon model built from the
        #   parsed href (the model receives nil when there is no usable href)
        def value
          modelize(to_uri(href)).to_s
        end

        private

        # @return [URI, nil] the parsed href, or nil when it is missing or
        #   not a valid URI
        def to_uri(href)
          return nil if href.nil?

          ::URI.parse(href)
        rescue ::URI::InvalidURIError
          nil
        end

        # Reads the href of the icon link. The document is queried once, and
        # a link without an href attribute yields nil instead of raising.
        def href
          link = node
          return unless link

          attribute = link.attributes['href']
          attribute.value.to_s if attribute
        end

        def node
          document.xpath("//link[contains(@rel, 'icon')]").first
        end

        def modelize(uri)
          model_class.new(uri)
        end

      end
    end
  end
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'link_thumbnailer/scrapers/default/base'
|
|
4
|
+
require 'link_thumbnailer/models/image'
|
|
5
|
+
|
|
6
|
+
module LinkThumbnailer
  module Scrapers
    module Default
      # Collects the <img> sources of the document, makes them absolute and
      # turns the ones returned by ImageParser into image models.
      class Images < ::LinkThumbnailer::Scrapers::Default::Base

        # @return [Array] one image model per image returned by ImageParser
        def value
          images.map do |image|
            modelize(image.uri, image.size, image.type)
          end
        end

        private

        def images
          ::LinkThumbnailer::ImageParser.new(allowed_urls).images
        end

        # At most `config.image_limit` URLs, in document order.
        def allowed_urls
          abs_urls.first(config.image_limit)
        end

        # Raw `src` values of every <img>; images without a src are skipped.
        def urls
          document.search('//img').map { |i| i['src'] }.compact
        end

        # Parsed, absolute URIs for every usable src. Unparseable sources
        # are dropped here so they neither reach ImageParser as nil nor use
        # up a slot of the image limit.
        def abs_urls
          absolute = urls.map do |url|
            uri = validate_url(url)

            next unless uri

            needs_prefix?(uri) ? prefix_uri(uri) : uri
          end

          absolute.compact
        end

        # @return [URI, nil] the parsed URL, or nil when it is not a valid URI
        def validate_url(url)
          ::URI.parse(url.to_s)
        rescue ::URI::InvalidURIError
          nil
        end

        # A URI without a host is treated as relative to the page.
        def needs_prefix?(uri)
          !uri.host
        end

        def prefix_uri(uri)
          ::URI.join(prefix_url, uri)
        end

        # Base used for relative sources: the <base href> of the document
        # when it carries a host, the website URL otherwise. Memoized
        # because it is the same for every image of the page.
        def prefix_url
          @prefix_url ||= base_href || website.url
        end

        # @return [String, nil] the href of <head><base> when it parses to a
        #   URI with a host, nil otherwise
        def base_href
          base = document.at('//head/base')
          base['href'] if base && ::URI.parse(base['href']).host
        rescue ::URI::InvalidURIError
          nil
        end

        def model_class
          ::LinkThumbnailer::Models::Image
        end

        def modelize(uri, size = nil, type = nil)
          model_class.new(uri, size, type)
        end

      end
    end
  end
end
|