link_thumbnailer 3.2.0 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.ruby-version +1 -0
- data/.travis.yml +2 -4
- data/CHANGELOG.md +252 -75
- data/Gemfile +5 -3
- data/README.md +4 -0
- data/Rakefile +2 -0
- data/lib/generators/link_thumbnailer/install_generator.rb +2 -0
- data/lib/generators/templates/initializer.rb +15 -0
- data/lib/link_thumbnailer.rb +2 -0
- data/lib/link_thumbnailer/configuration.rb +74 -68
- data/lib/link_thumbnailer/exceptions.rb +3 -0
- data/lib/link_thumbnailer/grader.rb +2 -0
- data/lib/link_thumbnailer/graders/base.rb +2 -0
- data/lib/link_thumbnailer/graders/html_attribute.rb +2 -0
- data/lib/link_thumbnailer/graders/length.rb +2 -0
- data/lib/link_thumbnailer/graders/link_density.rb +2 -0
- data/lib/link_thumbnailer/graders/position.rb +2 -0
- data/lib/link_thumbnailer/image_comparator.rb +2 -0
- data/lib/link_thumbnailer/image_comparators/base.rb +2 -0
- data/lib/link_thumbnailer/image_comparators/size.rb +2 -0
- data/lib/link_thumbnailer/image_parser.rb +13 -1
- data/lib/link_thumbnailer/image_validator.rb +2 -0
- data/lib/link_thumbnailer/model.rb +20 -17
- data/lib/link_thumbnailer/models/description.rb +2 -0
- data/lib/link_thumbnailer/models/favicon.rb +2 -0
- data/lib/link_thumbnailer/models/image.rb +56 -54
- data/lib/link_thumbnailer/models/title.rb +2 -0
- data/lib/link_thumbnailer/models/video.rb +2 -0
- data/lib/link_thumbnailer/models/website.rb +54 -52
- data/lib/link_thumbnailer/page.rb +4 -1
- data/lib/link_thumbnailer/parser.rb +3 -1
- data/lib/link_thumbnailer/processor.rb +38 -5
- data/lib/link_thumbnailer/railtie.rb +2 -0
- data/lib/link_thumbnailer/response.rb +39 -0
- data/lib/link_thumbnailer/scraper.rb +62 -60
- data/lib/link_thumbnailer/scrapers/base.rb +69 -67
- data/lib/link_thumbnailer/scrapers/default/base.rb +2 -0
- data/lib/link_thumbnailer/scrapers/default/description.rb +2 -0
- data/lib/link_thumbnailer/scrapers/default/favicon.rb +16 -2
- data/lib/link_thumbnailer/scrapers/default/images.rb +5 -1
- data/lib/link_thumbnailer/scrapers/default/title.rb +2 -0
- data/lib/link_thumbnailer/scrapers/default/videos.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/base.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/description.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/favicon.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/image.rb +7 -1
- data/lib/link_thumbnailer/scrapers/opengraph/images.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/title.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/video.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/videos.rb +2 -0
- data/lib/link_thumbnailer/uri.rb +20 -0
- data/lib/link_thumbnailer/version.rb +3 -1
- data/lib/link_thumbnailer/video_parser.rb +3 -1
- data/link_thumbnailer.gemspec +8 -6
- data/spec/configuration_spec.rb +4 -2
- data/spec/fixture_spec.rb +21 -0
- data/spec/fixtures/default_with_few_favicons.html +15 -0
- data/spec/fixtures/google_shift_jis.html +6 -0
- data/spec/fixtures/google_utf8.html +6 -0
- data/spec/fixtures/google_utf8_no_meta_charset.html +6 -0
- data/spec/fixtures/with_related_path_in_href.html +13 -0
- data/spec/fixtures/with_root_path_in_href.html +13 -0
- data/spec/grader_spec.rb +3 -1
- data/spec/graders/base_spec.rb +2 -0
- data/spec/graders/html_attribute_spec.rb +9 -7
- data/spec/graders/length_spec.rb +10 -6
- data/spec/graders/link_density_spec.rb +4 -2
- data/spec/graders/position_spec.rb +8 -6
- data/spec/image_comparators/size_spec.rb +2 -0
- data/spec/image_validator_spec.rb +3 -1
- data/spec/model_spec.rb +2 -0
- data/spec/models/description_spec.rb +3 -1
- data/spec/models/favicon_spec.rb +2 -0
- data/spec/models/image_spec.rb +6 -4
- data/spec/models/title_spec.rb +2 -0
- data/spec/models/video_spec.rb +7 -5
- data/spec/models/website_spec.rb +5 -3
- data/spec/page_spec.rb +2 -0
- data/spec/processor_spec.rb +74 -23
- data/spec/response_spec.rb +84 -0
- data/spec/scraper_spec.rb +6 -4
- data/spec/scrapers/base_spec.rb +6 -4
- data/spec/scrapers/opengraph/base_spec.rb +8 -6
- data/spec/spec_helper.rb +2 -0
- data/spec/uri_spec.rb +44 -0
- data/spec/video_parser_spec.rb +15 -13
- metadata +37 -19
data/Gemfile
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
source 'https://rubygems.org'
|
|
2
4
|
|
|
3
5
|
# Specify your gem's dependencies in link_thumbnailer.gemspec
|
|
4
6
|
gemspec
|
|
5
7
|
|
|
6
8
|
group :development, :test do
|
|
7
|
-
gem 'rspec', '
|
|
8
|
-
gem 'webmock', '
|
|
9
|
-
gem 'pry', '
|
|
9
|
+
gem 'rspec', '>= 2.14'
|
|
10
|
+
gem 'webmock', '>= 1.14'
|
|
11
|
+
gem 'pry', '>= 0.9'
|
|
10
12
|
end
|
data/README.md
CHANGED
|
@@ -165,6 +165,10 @@ LinkThumbnailer.configure do |config|
|
|
|
165
165
|
# Sets the number of concurrent HTTP connections that can be opened to fetch image information such as size and type.
|
|
166
166
|
#
|
|
167
167
|
# config.max_concurrency = 20
|
|
168
|
+
|
|
169
|
+
# Sets the default encoding.
|
|
170
|
+
#
|
|
171
|
+
# config.encoding = 'utf-8'
|
|
168
172
|
end
|
|
169
173
|
```
|
|
170
174
|
|
data/Rakefile
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
# Use this hook to configure LinkThumbnailer behaviors.
|
|
2
4
|
LinkThumbnailer.configure do |config|
|
|
3
5
|
# Number of redirects allowed before raising an exception when trying to parse the given url.
|
|
@@ -33,6 +35,11 @@ LinkThumbnailer.configure do |config|
|
|
|
33
35
|
#
|
|
34
36
|
# config.attributes = [:title, :images, :description, :videos, :favicon]
|
|
35
37
|
|
|
38
|
+
# Preferred favicon size. If the website doesn't provide that size, the first favicon is returned.
|
|
39
|
+
# Value should be like '32x32' or '16x16'. Default value is nil.
|
|
40
|
+
#
|
|
41
|
+
# config.favicon_size = nil
|
|
42
|
+
|
|
36
43
|
# List of procedures used to rate the website description. Add your custom class
|
|
37
44
|
# here. See wiki for more details on how to build your own graders.
|
|
38
45
|
#
|
|
@@ -80,4 +87,12 @@ LinkThumbnailer.configure do |config|
|
|
|
80
87
|
# Defines the strategies to use to scrape the website. See the [Open Graph Protocol](http://ogp.me/) for more information.
|
|
81
88
|
#
|
|
82
89
|
# config.scrapers = [:opengraph, :default]
|
|
90
|
+
|
|
91
|
+
# Limit for download size in bytes. When using ActiveSupport, you can also use values like 10.megabytes
|
|
92
|
+
#
|
|
93
|
+
# config.download_size_limit = 10 * 1024 * 1024
|
|
94
|
+
|
|
95
|
+
# Sets the default encoding.
|
|
96
|
+
#
|
|
97
|
+
# config.encoding = 'utf-8'
|
|
83
98
|
end
|
data/lib/link_thumbnailer.rb
CHANGED
|
@@ -1,68 +1,74 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
#
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
#
|
|
15
|
-
#
|
|
16
|
-
#
|
|
17
|
-
#
|
|
18
|
-
#
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
:
|
|
29
|
-
:
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
@
|
|
42
|
-
@
|
|
43
|
-
@
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
%r{^http://
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
->(description) { ::LinkThumbnailer::Graders::
|
|
55
|
-
->(description) { ::LinkThumbnailer::Graders::
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
@
|
|
61
|
-
@
|
|
62
|
-
@
|
|
63
|
-
@
|
|
64
|
-
@
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LinkThumbnailer
|
|
4
|
+
|
|
5
|
+
# Access point for the gem configurations.
|
|
6
|
+
#
|
|
7
|
+
# @return [LinkThumbnailer::Configuration] a configuration instance.
|
|
8
|
+
def self.config
|
|
9
|
+
@config ||= Configuration.new
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
# Configure hook used in the gem initializer. A convenient way to set all the
|
|
13
|
+
# gem configurations.
|
|
14
|
+
#
|
|
15
|
+
# @example inside config/initializers/link_thumbnailer.rb
|
|
16
|
+
# LinkThumbnailer.configure do |config|
|
|
17
|
+
# config.user_agent = 'link_thumbnailer'
|
|
18
|
+
# end
|
|
19
|
+
#
|
|
20
|
+
# @return [void]
|
|
21
|
+
def self.configure
|
|
22
|
+
yield config if block_given?
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
class Configuration
|
|
26
|
+
|
|
27
|
+
attr_accessor :redirect_limit, :blacklist_urls, :user_agent,
|
|
28
|
+
:verify_ssl, :http_open_timeout, :http_read_timeout, :attributes,
|
|
29
|
+
:graders, :description_min_length, :positive_regex, :negative_regex,
|
|
30
|
+
:image_limit, :image_stats, :raise_on_invalid_format, :max_concurrency,
|
|
31
|
+
:scrapers, :http_override_headers, :download_size_limit, :encoding,
|
|
32
|
+
:favicon_size
|
|
33
|
+
|
|
34
|
+
alias_method :http_timeout, :http_open_timeout
|
|
35
|
+
alias_method :http_timeout=, :http_open_timeout=
|
|
36
|
+
|
|
37
|
+
# Create a new instance.
|
|
38
|
+
#
|
|
39
|
+
# @return [LinkThumbnailer::Configuration]
|
|
40
|
+
def initialize
|
|
41
|
+
@redirect_limit = 3
|
|
42
|
+
@user_agent = 'link_thumbnailer'
|
|
43
|
+
@verify_ssl = true
|
|
44
|
+
@http_open_timeout = 5
|
|
45
|
+
@http_read_timeout = 5
|
|
46
|
+
@blacklist_urls = [
|
|
47
|
+
%r{^http://ad\.doubleclick\.net/},
|
|
48
|
+
%r{^http://b\.scorecardresearch\.com/},
|
|
49
|
+
%r{^http://pixel\.quantserve\.com/},
|
|
50
|
+
%r{^http://s7\.addthis\.com/}
|
|
51
|
+
]
|
|
52
|
+
@attributes = [:title, :images, :description, :videos, :favicon]
|
|
53
|
+
@graders = [
|
|
54
|
+
->(description) { ::LinkThumbnailer::Graders::Length.new(description) },
|
|
55
|
+
->(description) { ::LinkThumbnailer::Graders::HtmlAttribute.new(description, :class) },
|
|
56
|
+
->(description) { ::LinkThumbnailer::Graders::HtmlAttribute.new(description, :id) },
|
|
57
|
+
->(description) { ::LinkThumbnailer::Graders::Position.new(description, weigth: 3) },
|
|
58
|
+
->(description) { ::LinkThumbnailer::Graders::LinkDensity.new(description) },
|
|
59
|
+
]
|
|
60
|
+
@description_min_length = 50
|
|
61
|
+
@positive_regex = /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i
|
|
62
|
+
@negative_regex = /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget|modal/i
|
|
63
|
+
@image_limit = 5
|
|
64
|
+
@image_stats = true
|
|
65
|
+
@raise_on_invalid_format = false
|
|
66
|
+
@max_concurrency = 20
|
|
67
|
+
@scrapers = [:opengraph, :default]
|
|
68
|
+
@http_override_headers = { 'Accept-Encoding' => 'none' }
|
|
69
|
+
@download_size_limit = 10 * 1024 * 1024
|
|
70
|
+
@encoding = 'utf-8'
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
end
|
|
74
|
+
end
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module LinkThumbnailer
|
|
2
4
|
Exceptions = Class.new(StandardError)
|
|
3
5
|
RedirectLimit = Class.new(Exceptions)
|
|
@@ -6,4 +8,5 @@ module LinkThumbnailer
|
|
|
6
8
|
ScraperInvalid = Class.new(Exceptions)
|
|
7
9
|
HTTPError = Class.new(Exceptions)
|
|
8
10
|
SyntaxError = Class.new(Exceptions)
|
|
11
|
+
DownloadSizeLimit = Class.new(Exceptions)
|
|
9
12
|
end
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require 'image_info'
|
|
2
4
|
|
|
3
5
|
module LinkThumbnailer
|
|
@@ -6,7 +8,7 @@ module LinkThumbnailer
|
|
|
6
8
|
attr_reader :images
|
|
7
9
|
|
|
8
10
|
def initialize(urls)
|
|
9
|
-
@images = perform? ?
|
|
11
|
+
@images = perform? ? image_info(urls) : default_images(urls)
|
|
10
12
|
end
|
|
11
13
|
|
|
12
14
|
def size
|
|
@@ -19,6 +21,10 @@ module LinkThumbnailer
|
|
|
19
21
|
|
|
20
22
|
private
|
|
21
23
|
|
|
24
|
+
def default_images(urls)
|
|
25
|
+
Array(urls).compact.map(&method(:build_default_image))
|
|
26
|
+
end
|
|
27
|
+
|
|
22
28
|
def build_default_image(uri)
|
|
23
29
|
NullImage.new(uri)
|
|
24
30
|
end
|
|
@@ -31,6 +37,12 @@ module LinkThumbnailer
|
|
|
31
37
|
::LinkThumbnailer.page.config.max_concurrency
|
|
32
38
|
end
|
|
33
39
|
|
|
40
|
+
def image_info(urls)
|
|
41
|
+
::ImageInfo.from(urls, max_concurrency: max_concurrency)
|
|
42
|
+
rescue
|
|
43
|
+
default_images(urls)
|
|
44
|
+
end
|
|
45
|
+
|
|
34
46
|
class NullImage
|
|
35
47
|
attr_reader :uri
|
|
36
48
|
|
|
@@ -1,17 +1,20 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LinkThumbnailer
|
|
4
|
+
class Model
|
|
5
|
+
|
|
6
|
+
def to_json(*args)
|
|
7
|
+
as_json.to_json(*args)
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
private
|
|
11
|
+
|
|
12
|
+
def sanitize(str)
|
|
13
|
+
return unless str
|
|
14
|
+
|
|
15
|
+
str = str.encode("UTF-16", "UTF-8", invalid: :replace, undef: :replace, replace: "")
|
|
16
|
+
str = str.encode("UTF-8", "UTF-16").strip.gsub(/[\r\n\f]+/, "\n")
|
|
17
|
+
str
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
@@ -1,54 +1,56 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
require 'link_thumbnailer/
|
|
4
|
-
require 'link_thumbnailer/
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
@
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'link_thumbnailer/model'
|
|
4
|
+
require 'link_thumbnailer/image_parser'
|
|
5
|
+
require 'link_thumbnailer/image_comparator'
|
|
6
|
+
require 'link_thumbnailer/image_validator'
|
|
7
|
+
|
|
8
|
+
module LinkThumbnailer
|
|
9
|
+
module Models
|
|
10
|
+
class Image < ::LinkThumbnailer::Model
|
|
11
|
+
|
|
12
|
+
attr_reader :src, :type, :size
|
|
13
|
+
|
|
14
|
+
def initialize(src, size = nil, type = nil)
|
|
15
|
+
@src = src
|
|
16
|
+
@size = size || parser.size
|
|
17
|
+
@type = type || parser.type
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def to_s
|
|
21
|
+
src.to_s
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def <=>(other)
|
|
25
|
+
comparator.call(other)
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def valid?
|
|
29
|
+
validator.call
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def as_json(*)
|
|
33
|
+
{
|
|
34
|
+
src: src.to_s,
|
|
35
|
+
size: size,
|
|
36
|
+
type: type
|
|
37
|
+
}
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
private
|
|
41
|
+
|
|
42
|
+
def parser
|
|
43
|
+
@parser ||= ::LinkThumbnailer::ImageParser.new(src)
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def validator
|
|
47
|
+
::LinkThumbnailer::ImageValidator.new(self)
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def comparator
|
|
51
|
+
::LinkThumbnailer::ImageComparator.new(self)
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
end
|