link_thumbnailer 3.2.0 → 3.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.ruby-version +1 -0
- data/.travis.yml +2 -4
- data/CHANGELOG.md +252 -75
- data/Gemfile +5 -3
- data/README.md +4 -0
- data/Rakefile +2 -0
- data/lib/generators/link_thumbnailer/install_generator.rb +2 -0
- data/lib/generators/templates/initializer.rb +15 -0
- data/lib/link_thumbnailer.rb +2 -0
- data/lib/link_thumbnailer/configuration.rb +74 -68
- data/lib/link_thumbnailer/exceptions.rb +3 -0
- data/lib/link_thumbnailer/grader.rb +2 -0
- data/lib/link_thumbnailer/graders/base.rb +2 -0
- data/lib/link_thumbnailer/graders/html_attribute.rb +2 -0
- data/lib/link_thumbnailer/graders/length.rb +2 -0
- data/lib/link_thumbnailer/graders/link_density.rb +2 -0
- data/lib/link_thumbnailer/graders/position.rb +2 -0
- data/lib/link_thumbnailer/image_comparator.rb +2 -0
- data/lib/link_thumbnailer/image_comparators/base.rb +2 -0
- data/lib/link_thumbnailer/image_comparators/size.rb +2 -0
- data/lib/link_thumbnailer/image_parser.rb +13 -1
- data/lib/link_thumbnailer/image_validator.rb +2 -0
- data/lib/link_thumbnailer/model.rb +20 -17
- data/lib/link_thumbnailer/models/description.rb +2 -0
- data/lib/link_thumbnailer/models/favicon.rb +2 -0
- data/lib/link_thumbnailer/models/image.rb +56 -54
- data/lib/link_thumbnailer/models/title.rb +2 -0
- data/lib/link_thumbnailer/models/video.rb +2 -0
- data/lib/link_thumbnailer/models/website.rb +54 -52
- data/lib/link_thumbnailer/page.rb +4 -1
- data/lib/link_thumbnailer/parser.rb +3 -1
- data/lib/link_thumbnailer/processor.rb +38 -5
- data/lib/link_thumbnailer/railtie.rb +2 -0
- data/lib/link_thumbnailer/response.rb +39 -0
- data/lib/link_thumbnailer/scraper.rb +62 -60
- data/lib/link_thumbnailer/scrapers/base.rb +69 -67
- data/lib/link_thumbnailer/scrapers/default/base.rb +2 -0
- data/lib/link_thumbnailer/scrapers/default/description.rb +2 -0
- data/lib/link_thumbnailer/scrapers/default/favicon.rb +16 -2
- data/lib/link_thumbnailer/scrapers/default/images.rb +5 -1
- data/lib/link_thumbnailer/scrapers/default/title.rb +2 -0
- data/lib/link_thumbnailer/scrapers/default/videos.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/base.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/description.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/favicon.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/image.rb +7 -1
- data/lib/link_thumbnailer/scrapers/opengraph/images.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/title.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/video.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/videos.rb +2 -0
- data/lib/link_thumbnailer/uri.rb +20 -0
- data/lib/link_thumbnailer/version.rb +3 -1
- data/lib/link_thumbnailer/video_parser.rb +3 -1
- data/link_thumbnailer.gemspec +8 -6
- data/spec/configuration_spec.rb +4 -2
- data/spec/fixture_spec.rb +21 -0
- data/spec/fixtures/default_with_few_favicons.html +15 -0
- data/spec/fixtures/google_shift_jis.html +6 -0
- data/spec/fixtures/google_utf8.html +6 -0
- data/spec/fixtures/google_utf8_no_meta_charset.html +6 -0
- data/spec/fixtures/with_related_path_in_href.html +13 -0
- data/spec/fixtures/with_root_path_in_href.html +13 -0
- data/spec/grader_spec.rb +3 -1
- data/spec/graders/base_spec.rb +2 -0
- data/spec/graders/html_attribute_spec.rb +9 -7
- data/spec/graders/length_spec.rb +10 -6
- data/spec/graders/link_density_spec.rb +4 -2
- data/spec/graders/position_spec.rb +8 -6
- data/spec/image_comparators/size_spec.rb +2 -0
- data/spec/image_validator_spec.rb +3 -1
- data/spec/model_spec.rb +2 -0
- data/spec/models/description_spec.rb +3 -1
- data/spec/models/favicon_spec.rb +2 -0
- data/spec/models/image_spec.rb +6 -4
- data/spec/models/title_spec.rb +2 -0
- data/spec/models/video_spec.rb +7 -5
- data/spec/models/website_spec.rb +5 -3
- data/spec/page_spec.rb +2 -0
- data/spec/processor_spec.rb +74 -23
- data/spec/response_spec.rb +84 -0
- data/spec/scraper_spec.rb +6 -4
- data/spec/scrapers/base_spec.rb +6 -4
- data/spec/scrapers/opengraph/base_spec.rb +8 -6
- data/spec/spec_helper.rb +2 -0
- data/spec/uri_spec.rb +44 -0
- data/spec/video_parser_spec.rb +15 -13
- metadata +37 -19
@@ -1,52 +1,54 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
end
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'link_thumbnailer/model'
|
4
|
+
|
5
|
+
module LinkThumbnailer
|
6
|
+
module Models
|
7
|
+
class Website < ::LinkThumbnailer::Model
|
8
|
+
|
9
|
+
attr_accessor :url, :title, :description, :images, :videos, :favicon
|
10
|
+
|
11
|
+
def initialize
|
12
|
+
@images = []
|
13
|
+
@videos = []
|
14
|
+
end
|
15
|
+
|
16
|
+
def video=(video)
|
17
|
+
self.videos = video
|
18
|
+
end
|
19
|
+
|
20
|
+
def videos=(videos)
|
21
|
+
Array(videos).each do |video|
|
22
|
+
@videos << video
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
def image=(image)
|
27
|
+
self.images = image
|
28
|
+
end
|
29
|
+
|
30
|
+
def images=(images)
|
31
|
+
Array(images).each do |image|
|
32
|
+
next unless image.valid?
|
33
|
+
@images << image
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def images
|
38
|
+
@images.sort!
|
39
|
+
end
|
40
|
+
|
41
|
+
def as_json(*)
|
42
|
+
{
|
43
|
+
url: url.to_s,
|
44
|
+
favicon: favicon,
|
45
|
+
title: title,
|
46
|
+
description: description,
|
47
|
+
images: images.map(&:as_json),
|
48
|
+
videos: videos.map(&:as_json)
|
49
|
+
}
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
@@ -1,3 +1,6 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'link_thumbnailer/response'
|
1
4
|
require 'link_thumbnailer/processor'
|
2
5
|
require 'link_thumbnailer/scraper'
|
3
6
|
|
@@ -14,7 +17,7 @@ module LinkThumbnailer
|
|
14
17
|
end
|
15
18
|
|
16
19
|
def generate
|
17
|
-
@source = processor.
|
20
|
+
@source = processor.start(url)
|
18
21
|
scraper.call
|
19
22
|
end
|
20
23
|
|
@@ -1,10 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'nokogiri'
|
2
4
|
|
3
5
|
module LinkThumbnailer
|
4
6
|
class Parser
|
5
7
|
|
6
8
|
def call(source)
|
7
|
-
::Nokogiri::HTML(source)
|
9
|
+
::Nokogiri::HTML(source, nil, LinkThumbnailer.page.config.encoding)
|
8
10
|
rescue ::Nokogiri::XML::SyntaxError => e
|
9
11
|
raise ::LinkThumbnailer::SyntaxError.new(e.message)
|
10
12
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'delegate'
|
2
4
|
require 'uri'
|
3
5
|
require 'net/http/persistent'
|
@@ -15,6 +17,12 @@ module LinkThumbnailer
|
|
15
17
|
super(config)
|
16
18
|
end
|
17
19
|
|
20
|
+
def start(url)
|
21
|
+
result = call(url)
|
22
|
+
shutdown
|
23
|
+
result
|
24
|
+
end
|
25
|
+
|
18
26
|
def call(url = '', redirect_count = 0, headers = {})
|
19
27
|
self.url = url
|
20
28
|
@redirect_count = redirect_count
|
@@ -26,12 +34,16 @@ module LinkThumbnailer
|
|
26
34
|
set_http_options
|
27
35
|
perform_request
|
28
36
|
end
|
29
|
-
rescue ::Net::HTTPExceptions, ::SocketError, ::Timeout::Error => e
|
37
|
+
rescue ::Net::HTTPExceptions, ::SocketError, ::Timeout::Error, ::Net::HTTP::Persistent::Error => e
|
30
38
|
raise ::LinkThumbnailer::HTTPError.new(e.message)
|
31
39
|
end
|
32
40
|
|
33
41
|
private
|
34
42
|
|
43
|
+
def shutdown
|
44
|
+
http.shutdown
|
45
|
+
end
|
46
|
+
|
35
47
|
def with_valid_url
|
36
48
|
raise ::LinkThumbnailer::BadUriFormat unless valid_url_format?
|
37
49
|
yield if block_given?
|
@@ -39,8 +51,8 @@ module LinkThumbnailer
|
|
39
51
|
|
40
52
|
def set_http_headers(headers = {})
|
41
53
|
headers.each { |k, v| http.headers[k] = v }
|
42
|
-
http.
|
43
|
-
http.override_headers[
|
54
|
+
http.override_headers['User-Agent'] = user_agent
|
55
|
+
config.http_override_headers.each { |k, v| http.override_headers[k] = v }
|
44
56
|
end
|
45
57
|
|
46
58
|
def set_http_options
|
@@ -51,7 +63,7 @@ module LinkThumbnailer
|
|
51
63
|
end
|
52
64
|
|
53
65
|
def perform_request
|
54
|
-
response =
|
66
|
+
response = request_in_chunks
|
55
67
|
headers = {}
|
56
68
|
headers['Cookie'] = response['Set-Cookie'] if response['Set-Cookie'].present?
|
57
69
|
|
@@ -59,7 +71,7 @@ module LinkThumbnailer
|
|
59
71
|
|
60
72
|
case response
|
61
73
|
when ::Net::HTTPSuccess
|
62
|
-
response.body
|
74
|
+
Response.new(response).body
|
63
75
|
when ::Net::HTTPRedirection
|
64
76
|
call(
|
65
77
|
resolve_relative_url(response['location'].to_s),
|
@@ -71,6 +83,19 @@ module LinkThumbnailer
|
|
71
83
|
end
|
72
84
|
end
|
73
85
|
|
86
|
+
def request_in_chunks
|
87
|
+
body = String.new
|
88
|
+
response = http.request(url) do |resp|
|
89
|
+
raise ::LinkThumbnailer::DownloadSizeLimit if too_big_download_size?(resp.content_length)
|
90
|
+
resp.read_body do |chunk|
|
91
|
+
body.concat(chunk)
|
92
|
+
raise ::LinkThumbnailer::DownloadSizeLimit if too_big_download_size?(body.length)
|
93
|
+
end
|
94
|
+
end
|
95
|
+
response.body = body
|
96
|
+
response
|
97
|
+
end
|
98
|
+
|
74
99
|
def resolve_relative_url(location)
|
75
100
|
location.start_with?('http') ? location : build_absolute_url_for(location)
|
76
101
|
end
|
@@ -99,6 +124,10 @@ module LinkThumbnailer
|
|
99
124
|
config.verify_ssl
|
100
125
|
end
|
101
126
|
|
127
|
+
def download_size_limit
|
128
|
+
config.download_size_limit
|
129
|
+
end
|
130
|
+
|
102
131
|
def too_many_redirections?
|
103
132
|
redirect_count > redirect_limit
|
104
133
|
end
|
@@ -118,6 +147,10 @@ module LinkThumbnailer
|
|
118
147
|
false
|
119
148
|
end
|
120
149
|
|
150
|
+
def too_big_download_size?(size)
|
151
|
+
size.to_i > download_size_limit.to_i
|
152
|
+
end
|
153
|
+
|
121
154
|
def url=(url)
|
122
155
|
@url = ::URI.parse(url.to_s)
|
123
156
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module LinkThumbnailer
|
4
|
+
class Response
|
5
|
+
def initialize(response)
|
6
|
+
@response = response
|
7
|
+
end
|
8
|
+
|
9
|
+
def charset
|
10
|
+
@charset ||= extract_charset
|
11
|
+
end
|
12
|
+
|
13
|
+
def body
|
14
|
+
@body ||= extract_body
|
15
|
+
end
|
16
|
+
|
17
|
+
private
|
18
|
+
|
19
|
+
def extract_charset
|
20
|
+
content_type = @response['Content-Type'] || ''
|
21
|
+
m = content_type.match(/charset=([\w-]+)/)
|
22
|
+
(m && m[1]) || @response.body.scrub =~ /<meta[^>]*charset\s*=\s*["']?(.+?)["' >]/i && $1 || ''
|
23
|
+
end
|
24
|
+
|
25
|
+
def extract_body
|
26
|
+
should_convert_body_to_utf8? ? convert_encoding_to_utf8(@response.body, charset) : @response.body
|
27
|
+
end
|
28
|
+
|
29
|
+
def should_convert_body_to_utf8?
|
30
|
+
charset != '' && charset != 'utf-8'
|
31
|
+
end
|
32
|
+
|
33
|
+
def convert_encoding_to_utf8(body, from)
|
34
|
+
Encoding::Converter.new(from, 'utf-8').convert(body)
|
35
|
+
rescue EncodingError
|
36
|
+
body
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -1,60 +1,62 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require '
|
4
|
-
|
5
|
-
require '
|
6
|
-
|
7
|
-
require 'link_thumbnailer/
|
8
|
-
require 'link_thumbnailer/
|
9
|
-
require 'link_thumbnailer/scrapers/default/
|
10
|
-
require 'link_thumbnailer/scrapers/opengraph/
|
11
|
-
require 'link_thumbnailer/scrapers/default/
|
12
|
-
require 'link_thumbnailer/scrapers/opengraph/
|
13
|
-
require 'link_thumbnailer/scrapers/default/
|
14
|
-
require 'link_thumbnailer/scrapers/opengraph/
|
15
|
-
require 'link_thumbnailer/scrapers/default/
|
16
|
-
require 'link_thumbnailer/scrapers/opengraph/
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
@
|
27
|
-
@
|
28
|
-
@
|
29
|
-
@
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
"
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'delegate'
|
4
|
+
require 'active_support/core_ext/object/blank'
|
5
|
+
require 'active_support/inflector'
|
6
|
+
|
7
|
+
require 'link_thumbnailer/parser'
|
8
|
+
require 'link_thumbnailer/models/website'
|
9
|
+
require 'link_thumbnailer/scrapers/default/title'
|
10
|
+
require 'link_thumbnailer/scrapers/opengraph/title'
|
11
|
+
require 'link_thumbnailer/scrapers/default/description'
|
12
|
+
require 'link_thumbnailer/scrapers/opengraph/description'
|
13
|
+
require 'link_thumbnailer/scrapers/default/images'
|
14
|
+
require 'link_thumbnailer/scrapers/opengraph/images'
|
15
|
+
require 'link_thumbnailer/scrapers/default/videos'
|
16
|
+
require 'link_thumbnailer/scrapers/opengraph/videos'
|
17
|
+
require 'link_thumbnailer/scrapers/default/favicon'
|
18
|
+
require 'link_thumbnailer/scrapers/opengraph/favicon'
|
19
|
+
|
20
|
+
module LinkThumbnailer
|
21
|
+
class Scraper < ::SimpleDelegator
|
22
|
+
|
23
|
+
attr_reader :document, :source, :url, :config, :website
|
24
|
+
|
25
|
+
def initialize(source, url)
|
26
|
+
@source = source
|
27
|
+
@url = url
|
28
|
+
@config = ::LinkThumbnailer.page.config
|
29
|
+
@document = parser.call(source)
|
30
|
+
@website = ::LinkThumbnailer::Models::Website.new
|
31
|
+
@website.url = url
|
32
|
+
|
33
|
+
super(config)
|
34
|
+
end
|
35
|
+
|
36
|
+
def call
|
37
|
+
config.attributes.each do |name|
|
38
|
+
config.scrapers.each do |scraper_prefix|
|
39
|
+
scraper_class(scraper_prefix, name).new(document, website).call(name.to_s)
|
40
|
+
break unless website.send(name).blank?
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
website
|
45
|
+
end
|
46
|
+
|
47
|
+
private
|
48
|
+
|
49
|
+
def scraper_class(prefix, name)
|
50
|
+
prefix = "::LinkThumbnailer::Scrapers::#{prefix.to_s.camelize}"
|
51
|
+
name = name.to_s.camelize
|
52
|
+
"#{prefix}::#{name}".constantize
|
53
|
+
rescue NameError
|
54
|
+
raise ::LinkThumbnailer::ScraperInvalid, "scraper named '#{prefix}::#{name}' does not exists."
|
55
|
+
end
|
56
|
+
|
57
|
+
def parser
|
58
|
+
::LinkThumbnailer::Parser.new
|
59
|
+
end
|
60
|
+
|
61
|
+
end
|
62
|
+
end
|
@@ -1,67 +1,69 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require '
|
4
|
-
require 'link_thumbnailer/models/
|
5
|
-
require 'link_thumbnailer/models/
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
@
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
end
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'delegate'
|
4
|
+
require 'link_thumbnailer/models/title'
|
5
|
+
require 'link_thumbnailer/models/description'
|
6
|
+
require 'link_thumbnailer/models/image'
|
7
|
+
require 'link_thumbnailer/models/video'
|
8
|
+
|
9
|
+
module LinkThumbnailer
|
10
|
+
module Scrapers
|
11
|
+
class Base < ::SimpleDelegator
|
12
|
+
|
13
|
+
attr_reader :config, :document, :website, :attribute_name
|
14
|
+
|
15
|
+
def initialize(document, website = nil)
|
16
|
+
@config = ::LinkThumbnailer.page.config
|
17
|
+
@document = document
|
18
|
+
@website = website
|
19
|
+
|
20
|
+
super(config)
|
21
|
+
end
|
22
|
+
|
23
|
+
def call(attribute_name)
|
24
|
+
return false unless website.present?
|
25
|
+
return false unless applicable?
|
26
|
+
|
27
|
+
@attribute_name = attribute_name
|
28
|
+
|
29
|
+
website.send("#{attribute_name}=", value)
|
30
|
+
website
|
31
|
+
end
|
32
|
+
|
33
|
+
def applicable?
|
34
|
+
true
|
35
|
+
end
|
36
|
+
|
37
|
+
def value
|
38
|
+
fail NotImplementedError
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
def meta_xpath(options = {})
|
44
|
+
meta_xpaths(options).first
|
45
|
+
end
|
46
|
+
|
47
|
+
def meta_xpaths(options = {})
|
48
|
+
key = options.fetch(:key, :property)
|
49
|
+
value = options.fetch(:value, :content)
|
50
|
+
attribute = options.fetch(:attribute, attribute_name)
|
51
|
+
|
52
|
+
document.xpath("//meta[translate(@#{key},'#{abc.upcase}','#{abc}') = '#{attribute}' and string-length(@#{value}) > 0]")
|
53
|
+
end
|
54
|
+
|
55
|
+
def abc
|
56
|
+
'abcdefghijklmnopqrstuvwxyz'
|
57
|
+
end
|
58
|
+
|
59
|
+
def model_class
|
60
|
+
"::LinkThumbnailer::Models::#{attribute_name.to_s.camelize}".constantize
|
61
|
+
end
|
62
|
+
|
63
|
+
def modelize(node, text = nil)
|
64
|
+
model_class.new(node, text)
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|