link_thumbnailer 3.2.0 → 3.4.0

This diff shows the differences between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +5 -5
  2. data/.ruby-version +1 -0
  3. data/.travis.yml +2 -4
  4. data/CHANGELOG.md +252 -75
  5. data/Gemfile +5 -3
  6. data/README.md +4 -0
  7. data/Rakefile +2 -0
  8. data/lib/generators/link_thumbnailer/install_generator.rb +2 -0
  9. data/lib/generators/templates/initializer.rb +15 -0
  10. data/lib/link_thumbnailer.rb +2 -0
  11. data/lib/link_thumbnailer/configuration.rb +74 -68
  12. data/lib/link_thumbnailer/exceptions.rb +3 -0
  13. data/lib/link_thumbnailer/grader.rb +2 -0
  14. data/lib/link_thumbnailer/graders/base.rb +2 -0
  15. data/lib/link_thumbnailer/graders/html_attribute.rb +2 -0
  16. data/lib/link_thumbnailer/graders/length.rb +2 -0
  17. data/lib/link_thumbnailer/graders/link_density.rb +2 -0
  18. data/lib/link_thumbnailer/graders/position.rb +2 -0
  19. data/lib/link_thumbnailer/image_comparator.rb +2 -0
  20. data/lib/link_thumbnailer/image_comparators/base.rb +2 -0
  21. data/lib/link_thumbnailer/image_comparators/size.rb +2 -0
  22. data/lib/link_thumbnailer/image_parser.rb +13 -1
  23. data/lib/link_thumbnailer/image_validator.rb +2 -0
  24. data/lib/link_thumbnailer/model.rb +20 -17
  25. data/lib/link_thumbnailer/models/description.rb +2 -0
  26. data/lib/link_thumbnailer/models/favicon.rb +2 -0
  27. data/lib/link_thumbnailer/models/image.rb +56 -54
  28. data/lib/link_thumbnailer/models/title.rb +2 -0
  29. data/lib/link_thumbnailer/models/video.rb +2 -0
  30. data/lib/link_thumbnailer/models/website.rb +54 -52
  31. data/lib/link_thumbnailer/page.rb +4 -1
  32. data/lib/link_thumbnailer/parser.rb +3 -1
  33. data/lib/link_thumbnailer/processor.rb +38 -5
  34. data/lib/link_thumbnailer/railtie.rb +2 -0
  35. data/lib/link_thumbnailer/response.rb +39 -0
  36. data/lib/link_thumbnailer/scraper.rb +62 -60
  37. data/lib/link_thumbnailer/scrapers/base.rb +69 -67
  38. data/lib/link_thumbnailer/scrapers/default/base.rb +2 -0
  39. data/lib/link_thumbnailer/scrapers/default/description.rb +2 -0
  40. data/lib/link_thumbnailer/scrapers/default/favicon.rb +16 -2
  41. data/lib/link_thumbnailer/scrapers/default/images.rb +5 -1
  42. data/lib/link_thumbnailer/scrapers/default/title.rb +2 -0
  43. data/lib/link_thumbnailer/scrapers/default/videos.rb +2 -0
  44. data/lib/link_thumbnailer/scrapers/opengraph/base.rb +2 -0
  45. data/lib/link_thumbnailer/scrapers/opengraph/description.rb +2 -0
  46. data/lib/link_thumbnailer/scrapers/opengraph/favicon.rb +2 -0
  47. data/lib/link_thumbnailer/scrapers/opengraph/image.rb +7 -1
  48. data/lib/link_thumbnailer/scrapers/opengraph/images.rb +2 -0
  49. data/lib/link_thumbnailer/scrapers/opengraph/title.rb +2 -0
  50. data/lib/link_thumbnailer/scrapers/opengraph/video.rb +2 -0
  51. data/lib/link_thumbnailer/scrapers/opengraph/videos.rb +2 -0
  52. data/lib/link_thumbnailer/uri.rb +20 -0
  53. data/lib/link_thumbnailer/version.rb +3 -1
  54. data/lib/link_thumbnailer/video_parser.rb +3 -1
  55. data/link_thumbnailer.gemspec +8 -6
  56. data/spec/configuration_spec.rb +4 -2
  57. data/spec/fixture_spec.rb +21 -0
  58. data/spec/fixtures/default_with_few_favicons.html +15 -0
  59. data/spec/fixtures/google_shift_jis.html +6 -0
  60. data/spec/fixtures/google_utf8.html +6 -0
  61. data/spec/fixtures/google_utf8_no_meta_charset.html +6 -0
  62. data/spec/fixtures/with_related_path_in_href.html +13 -0
  63. data/spec/fixtures/with_root_path_in_href.html +13 -0
  64. data/spec/grader_spec.rb +3 -1
  65. data/spec/graders/base_spec.rb +2 -0
  66. data/spec/graders/html_attribute_spec.rb +9 -7
  67. data/spec/graders/length_spec.rb +10 -6
  68. data/spec/graders/link_density_spec.rb +4 -2
  69. data/spec/graders/position_spec.rb +8 -6
  70. data/spec/image_comparators/size_spec.rb +2 -0
  71. data/spec/image_validator_spec.rb +3 -1
  72. data/spec/model_spec.rb +2 -0
  73. data/spec/models/description_spec.rb +3 -1
  74. data/spec/models/favicon_spec.rb +2 -0
  75. data/spec/models/image_spec.rb +6 -4
  76. data/spec/models/title_spec.rb +2 -0
  77. data/spec/models/video_spec.rb +7 -5
  78. data/spec/models/website_spec.rb +5 -3
  79. data/spec/page_spec.rb +2 -0
  80. data/spec/processor_spec.rb +74 -23
  81. data/spec/response_spec.rb +84 -0
  82. data/spec/scraper_spec.rb +6 -4
  83. data/spec/scrapers/base_spec.rb +6 -4
  84. data/spec/scrapers/opengraph/base_spec.rb +8 -6
  85. data/spec/spec_helper.rb +2 -0
  86. data/spec/uri_spec.rb +44 -0
  87. data/spec/video_parser_spec.rb +15 -13
  88. metadata +37 -19
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'link_thumbnailer/model'
2
4
 
3
5
  module LinkThumbnailer
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'link_thumbnailer/model'
2
4
  require 'link_thumbnailer/video_parser'
3
5
 
@@ -1,52 +1,54 @@
1
- require 'link_thumbnailer/model'
2
-
3
- module LinkThumbnailer
4
- module Models
5
- class Website < ::LinkThumbnailer::Model
6
-
7
- attr_accessor :url, :title, :description, :images, :videos, :favicon
8
-
9
- def initialize
10
- @images = []
11
- @videos = []
12
- end
13
-
14
- def video=(video)
15
- self.videos = video
16
- end
17
-
18
- def videos=(videos)
19
- Array(videos).each do |video|
20
- @videos << video
21
- end
22
- end
23
-
24
- def image=(image)
25
- self.images = image
26
- end
27
-
28
- def images=(images)
29
- Array(images).each do |image|
30
- next unless image.valid?
31
- @images << image
32
- end
33
- end
34
-
35
- def images
36
- @images.sort!
37
- end
38
-
39
- def as_json(*)
40
- {
41
- url: url.to_s,
42
- favicon: favicon,
43
- title: title,
44
- description: description,
45
- images: images.map(&:as_json),
46
- videos: videos.map(&:as_json)
47
- }
48
- end
49
-
50
- end
51
- end
52
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'link_thumbnailer/model'
4
+
5
+ module LinkThumbnailer
6
+ module Models
7
+ class Website < ::LinkThumbnailer::Model
8
+
9
+ attr_accessor :url, :title, :description, :images, :videos, :favicon
10
+
11
+ def initialize
12
+ @images = []
13
+ @videos = []
14
+ end
15
+
16
+ def video=(video)
17
+ self.videos = video
18
+ end
19
+
20
+ def videos=(videos)
21
+ Array(videos).each do |video|
22
+ @videos << video
23
+ end
24
+ end
25
+
26
+ def image=(image)
27
+ self.images = image
28
+ end
29
+
30
+ def images=(images)
31
+ Array(images).each do |image|
32
+ next unless image.valid?
33
+ @images << image
34
+ end
35
+ end
36
+
37
+ def images
38
+ @images.sort!
39
+ end
40
+
41
+ def as_json(*)
42
+ {
43
+ url: url.to_s,
44
+ favicon: favicon,
45
+ title: title,
46
+ description: description,
47
+ images: images.map(&:as_json),
48
+ videos: videos.map(&:as_json)
49
+ }
50
+ end
51
+
52
+ end
53
+ end
54
+ end
@@ -1,3 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'link_thumbnailer/response'
1
4
  require 'link_thumbnailer/processor'
2
5
  require 'link_thumbnailer/scraper'
3
6
 
@@ -14,7 +17,7 @@ module LinkThumbnailer
14
17
  end
15
18
 
16
19
  def generate
17
- @source = processor.call(url)
20
+ @source = processor.start(url)
18
21
  scraper.call
19
22
  end
20
23
 
@@ -1,10 +1,12 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'nokogiri'
2
4
 
3
5
  module LinkThumbnailer
4
6
  class Parser
5
7
 
6
8
  def call(source)
7
- ::Nokogiri::HTML(source)
9
+ ::Nokogiri::HTML(source, nil, LinkThumbnailer.page.config.encoding)
8
10
  rescue ::Nokogiri::XML::SyntaxError => e
9
11
  raise ::LinkThumbnailer::SyntaxError.new(e.message)
10
12
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'delegate'
2
4
  require 'uri'
3
5
  require 'net/http/persistent'
@@ -15,6 +17,12 @@ module LinkThumbnailer
15
17
  super(config)
16
18
  end
17
19
 
20
+ def start(url)
21
+ result = call(url)
22
+ shutdown
23
+ result
24
+ end
25
+
18
26
  def call(url = '', redirect_count = 0, headers = {})
19
27
  self.url = url
20
28
  @redirect_count = redirect_count
@@ -26,12 +34,16 @@ module LinkThumbnailer
26
34
  set_http_options
27
35
  perform_request
28
36
  end
29
- rescue ::Net::HTTPExceptions, ::SocketError, ::Timeout::Error => e
37
+ rescue ::Net::HTTPExceptions, ::SocketError, ::Timeout::Error, ::Net::HTTP::Persistent::Error => e
30
38
  raise ::LinkThumbnailer::HTTPError.new(e.message)
31
39
  end
32
40
 
33
41
  private
34
42
 
43
+ def shutdown
44
+ http.shutdown
45
+ end
46
+
35
47
  def with_valid_url
36
48
  raise ::LinkThumbnailer::BadUriFormat unless valid_url_format?
37
49
  yield if block_given?
@@ -39,8 +51,8 @@ module LinkThumbnailer
39
51
 
40
52
  def set_http_headers(headers = {})
41
53
  headers.each { |k, v| http.headers[k] = v }
42
- http.headers['User-Agent'] = user_agent
43
- http.override_headers['Accept-Encoding'] = 'none'
54
+ http.override_headers['User-Agent'] = user_agent
55
+ config.http_override_headers.each { |k, v| http.override_headers[k] = v }
44
56
  end
45
57
 
46
58
  def set_http_options
@@ -51,7 +63,7 @@ module LinkThumbnailer
51
63
  end
52
64
 
53
65
  def perform_request
54
- response = http.request(url)
66
+ response = request_in_chunks
55
67
  headers = {}
56
68
  headers['Cookie'] = response['Set-Cookie'] if response['Set-Cookie'].present?
57
69
 
@@ -59,7 +71,7 @@ module LinkThumbnailer
59
71
 
60
72
  case response
61
73
  when ::Net::HTTPSuccess
62
- response.body
74
+ Response.new(response).body
63
75
  when ::Net::HTTPRedirection
64
76
  call(
65
77
  resolve_relative_url(response['location'].to_s),
@@ -71,6 +83,19 @@ module LinkThumbnailer
71
83
  end
72
84
  end
73
85
 
86
+ def request_in_chunks
87
+ body = String.new
88
+ response = http.request(url) do |resp|
89
+ raise ::LinkThumbnailer::DownloadSizeLimit if too_big_download_size?(resp.content_length)
90
+ resp.read_body do |chunk|
91
+ body.concat(chunk)
92
+ raise ::LinkThumbnailer::DownloadSizeLimit if too_big_download_size?(body.length)
93
+ end
94
+ end
95
+ response.body = body
96
+ response
97
+ end
98
+
74
99
  def resolve_relative_url(location)
75
100
  location.start_with?('http') ? location : build_absolute_url_for(location)
76
101
  end
@@ -99,6 +124,10 @@ module LinkThumbnailer
99
124
  config.verify_ssl
100
125
  end
101
126
 
127
+ def download_size_limit
128
+ config.download_size_limit
129
+ end
130
+
102
131
  def too_many_redirections?
103
132
  redirect_count > redirect_limit
104
133
  end
@@ -118,6 +147,10 @@ module LinkThumbnailer
118
147
  false
119
148
  end
120
149
 
150
+ def too_big_download_size?(size)
151
+ size.to_i > download_size_limit.to_i
152
+ end
153
+
121
154
  def url=(url)
122
155
  @url = ::URI.parse(url.to_s)
123
156
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module LinkThumbnailer
2
4
  class Railtie < ::Rails::Railtie
3
5
  end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LinkThumbnailer
4
+ class Response
5
+ def initialize(response)
6
+ @response = response
7
+ end
8
+
9
+ def charset
10
+ @charset ||= extract_charset
11
+ end
12
+
13
+ def body
14
+ @body ||= extract_body
15
+ end
16
+
17
+ private
18
+
19
+ def extract_charset
20
+ content_type = @response['Content-Type'] || ''
21
+ m = content_type.match(/charset=([\w-]+)/)
22
+ (m && m[1]) || @response.body.scrub =~ /<meta[^>]*charset\s*=\s*["']?(.+?)["' >]/i && $1 || ''
23
+ end
24
+
25
+ def extract_body
26
+ should_convert_body_to_utf8? ? convert_encoding_to_utf8(@response.body, charset) : @response.body
27
+ end
28
+
29
+ def should_convert_body_to_utf8?
30
+ charset != '' && charset != 'utf-8'
31
+ end
32
+
33
+ def convert_encoding_to_utf8(body, from)
34
+ Encoding::Converter.new(from, 'utf-8').convert(body)
35
+ rescue EncodingError
36
+ body
37
+ end
38
+ end
39
+ end
@@ -1,60 +1,62 @@
1
- require 'delegate'
2
- require 'active_support/core_ext/object/blank'
3
- require 'active_support/inflector'
4
-
5
- require 'link_thumbnailer/parser'
6
- require 'link_thumbnailer/models/website'
7
- require 'link_thumbnailer/scrapers/default/title'
8
- require 'link_thumbnailer/scrapers/opengraph/title'
9
- require 'link_thumbnailer/scrapers/default/description'
10
- require 'link_thumbnailer/scrapers/opengraph/description'
11
- require 'link_thumbnailer/scrapers/default/images'
12
- require 'link_thumbnailer/scrapers/opengraph/images'
13
- require 'link_thumbnailer/scrapers/default/videos'
14
- require 'link_thumbnailer/scrapers/opengraph/videos'
15
- require 'link_thumbnailer/scrapers/default/favicon'
16
- require 'link_thumbnailer/scrapers/opengraph/favicon'
17
-
18
- module LinkThumbnailer
19
- class Scraper < ::SimpleDelegator
20
-
21
- attr_reader :document, :source, :url, :config, :website
22
-
23
- def initialize(source, url)
24
- @source = source
25
- @url = url
26
- @config = ::LinkThumbnailer.page.config
27
- @document = parser.call(source)
28
- @website = ::LinkThumbnailer::Models::Website.new
29
- @website.url = url
30
-
31
- super(config)
32
- end
33
-
34
- def call
35
- config.attributes.each do |name|
36
- config.scrapers.each do |scraper_prefix|
37
- scraper_class(scraper_prefix, name).new(document, website).call(name.to_s)
38
- break unless website.send(name).blank?
39
- end
40
- end
41
-
42
- website
43
- end
44
-
45
- private
46
-
47
- def scraper_class(prefix, name)
48
- prefix = "::LinkThumbnailer::Scrapers::#{prefix.to_s.camelize}"
49
- name = name.to_s.camelize
50
- "#{prefix}::#{name}".constantize
51
- rescue NameError
52
- raise ::LinkThumbnailer::ScraperInvalid, "scraper named '#{prefix}::#{name}' does not exists."
53
- end
54
-
55
- def parser
56
- ::LinkThumbnailer::Parser.new
57
- end
58
-
59
- end
60
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'delegate'
4
+ require 'active_support/core_ext/object/blank'
5
+ require 'active_support/inflector'
6
+
7
+ require 'link_thumbnailer/parser'
8
+ require 'link_thumbnailer/models/website'
9
+ require 'link_thumbnailer/scrapers/default/title'
10
+ require 'link_thumbnailer/scrapers/opengraph/title'
11
+ require 'link_thumbnailer/scrapers/default/description'
12
+ require 'link_thumbnailer/scrapers/opengraph/description'
13
+ require 'link_thumbnailer/scrapers/default/images'
14
+ require 'link_thumbnailer/scrapers/opengraph/images'
15
+ require 'link_thumbnailer/scrapers/default/videos'
16
+ require 'link_thumbnailer/scrapers/opengraph/videos'
17
+ require 'link_thumbnailer/scrapers/default/favicon'
18
+ require 'link_thumbnailer/scrapers/opengraph/favicon'
19
+
20
+ module LinkThumbnailer
21
+ class Scraper < ::SimpleDelegator
22
+
23
+ attr_reader :document, :source, :url, :config, :website
24
+
25
+ def initialize(source, url)
26
+ @source = source
27
+ @url = url
28
+ @config = ::LinkThumbnailer.page.config
29
+ @document = parser.call(source)
30
+ @website = ::LinkThumbnailer::Models::Website.new
31
+ @website.url = url
32
+
33
+ super(config)
34
+ end
35
+
36
+ def call
37
+ config.attributes.each do |name|
38
+ config.scrapers.each do |scraper_prefix|
39
+ scraper_class(scraper_prefix, name).new(document, website).call(name.to_s)
40
+ break unless website.send(name).blank?
41
+ end
42
+ end
43
+
44
+ website
45
+ end
46
+
47
+ private
48
+
49
+ def scraper_class(prefix, name)
50
+ prefix = "::LinkThumbnailer::Scrapers::#{prefix.to_s.camelize}"
51
+ name = name.to_s.camelize
52
+ "#{prefix}::#{name}".constantize
53
+ rescue NameError
54
+ raise ::LinkThumbnailer::ScraperInvalid, "scraper named '#{prefix}::#{name}' does not exists."
55
+ end
56
+
57
+ def parser
58
+ ::LinkThumbnailer::Parser.new
59
+ end
60
+
61
+ end
62
+ end
@@ -1,67 +1,69 @@
1
- require 'delegate'
2
- require 'link_thumbnailer/models/title'
3
- require 'link_thumbnailer/models/description'
4
- require 'link_thumbnailer/models/image'
5
- require 'link_thumbnailer/models/video'
6
-
7
- module LinkThumbnailer
8
- module Scrapers
9
- class Base < ::SimpleDelegator
10
-
11
- attr_reader :config, :document, :website, :attribute_name
12
-
13
- def initialize(document, website = nil)
14
- @config = ::LinkThumbnailer.page.config
15
- @document = document
16
- @website = website
17
-
18
- super(config)
19
- end
20
-
21
- def call(attribute_name)
22
- return false unless website.present?
23
- return false unless applicable?
24
-
25
- @attribute_name = attribute_name
26
-
27
- website.send("#{attribute_name}=", value)
28
- website
29
- end
30
-
31
- def applicable?
32
- true
33
- end
34
-
35
- def value
36
- fail NotImplementedError
37
- end
38
-
39
- private
40
-
41
- def meta_xpath(options = {})
42
- meta_xpaths(options).first
43
- end
44
-
45
- def meta_xpaths(options = {})
46
- key = options.fetch(:key, :property)
47
- value = options.fetch(:value, :content)
48
- attribute = options.fetch(:attribute, attribute_name)
49
-
50
- document.xpath("//meta[translate(@#{key},'#{abc.upcase}','#{abc}') = '#{attribute}' and @#{value}]")
51
- end
52
-
53
- def abc
54
- 'abcdefghijklmnopqrstuvwxyz'
55
- end
56
-
57
- def model_class
58
- "::LinkThumbnailer::Models::#{attribute_name.to_s.camelize}".constantize
59
- end
60
-
61
- def modelize(node, text = nil)
62
- model_class.new(node, text)
63
- end
64
-
65
- end
66
- end
67
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'delegate'
4
+ require 'link_thumbnailer/models/title'
5
+ require 'link_thumbnailer/models/description'
6
+ require 'link_thumbnailer/models/image'
7
+ require 'link_thumbnailer/models/video'
8
+
9
+ module LinkThumbnailer
10
+ module Scrapers
11
+ class Base < ::SimpleDelegator
12
+
13
+ attr_reader :config, :document, :website, :attribute_name
14
+
15
+ def initialize(document, website = nil)
16
+ @config = ::LinkThumbnailer.page.config
17
+ @document = document
18
+ @website = website
19
+
20
+ super(config)
21
+ end
22
+
23
+ def call(attribute_name)
24
+ return false unless website.present?
25
+ return false unless applicable?
26
+
27
+ @attribute_name = attribute_name
28
+
29
+ website.send("#{attribute_name}=", value)
30
+ website
31
+ end
32
+
33
+ def applicable?
34
+ true
35
+ end
36
+
37
+ def value
38
+ fail NotImplementedError
39
+ end
40
+
41
+ private
42
+
43
+ def meta_xpath(options = {})
44
+ meta_xpaths(options).first
45
+ end
46
+
47
+ def meta_xpaths(options = {})
48
+ key = options.fetch(:key, :property)
49
+ value = options.fetch(:value, :content)
50
+ attribute = options.fetch(:attribute, attribute_name)
51
+
52
+ document.xpath("//meta[translate(@#{key},'#{abc.upcase}','#{abc}') = '#{attribute}' and string-length(@#{value}) > 0]")
53
+ end
54
+
55
+ def abc
56
+ 'abcdefghijklmnopqrstuvwxyz'
57
+ end
58
+
59
+ def model_class
60
+ "::LinkThumbnailer::Models::#{attribute_name.to_s.camelize}".constantize
61
+ end
62
+
63
+ def modelize(node, text = nil)
64
+ model_class.new(node, text)
65
+ end
66
+
67
+ end
68
+ end
69
+ end