link_thumbnailer 3.2.0 → 3.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88)
  1. checksums.yaml +5 -5
  2. data/.ruby-version +1 -0
  3. data/.travis.yml +2 -4
  4. data/CHANGELOG.md +252 -75
  5. data/Gemfile +5 -3
  6. data/README.md +4 -0
  7. data/Rakefile +2 -0
  8. data/lib/generators/link_thumbnailer/install_generator.rb +2 -0
  9. data/lib/generators/templates/initializer.rb +15 -0
  10. data/lib/link_thumbnailer.rb +2 -0
  11. data/lib/link_thumbnailer/configuration.rb +74 -68
  12. data/lib/link_thumbnailer/exceptions.rb +3 -0
  13. data/lib/link_thumbnailer/grader.rb +2 -0
  14. data/lib/link_thumbnailer/graders/base.rb +2 -0
  15. data/lib/link_thumbnailer/graders/html_attribute.rb +2 -0
  16. data/lib/link_thumbnailer/graders/length.rb +2 -0
  17. data/lib/link_thumbnailer/graders/link_density.rb +2 -0
  18. data/lib/link_thumbnailer/graders/position.rb +2 -0
  19. data/lib/link_thumbnailer/image_comparator.rb +2 -0
  20. data/lib/link_thumbnailer/image_comparators/base.rb +2 -0
  21. data/lib/link_thumbnailer/image_comparators/size.rb +2 -0
  22. data/lib/link_thumbnailer/image_parser.rb +13 -1
  23. data/lib/link_thumbnailer/image_validator.rb +2 -0
  24. data/lib/link_thumbnailer/model.rb +20 -17
  25. data/lib/link_thumbnailer/models/description.rb +2 -0
  26. data/lib/link_thumbnailer/models/favicon.rb +2 -0
  27. data/lib/link_thumbnailer/models/image.rb +56 -54
  28. data/lib/link_thumbnailer/models/title.rb +2 -0
  29. data/lib/link_thumbnailer/models/video.rb +2 -0
  30. data/lib/link_thumbnailer/models/website.rb +54 -52
  31. data/lib/link_thumbnailer/page.rb +4 -1
  32. data/lib/link_thumbnailer/parser.rb +3 -1
  33. data/lib/link_thumbnailer/processor.rb +38 -5
  34. data/lib/link_thumbnailer/railtie.rb +2 -0
  35. data/lib/link_thumbnailer/response.rb +39 -0
  36. data/lib/link_thumbnailer/scraper.rb +62 -60
  37. data/lib/link_thumbnailer/scrapers/base.rb +69 -67
  38. data/lib/link_thumbnailer/scrapers/default/base.rb +2 -0
  39. data/lib/link_thumbnailer/scrapers/default/description.rb +2 -0
  40. data/lib/link_thumbnailer/scrapers/default/favicon.rb +16 -2
  41. data/lib/link_thumbnailer/scrapers/default/images.rb +5 -1
  42. data/lib/link_thumbnailer/scrapers/default/title.rb +2 -0
  43. data/lib/link_thumbnailer/scrapers/default/videos.rb +2 -0
  44. data/lib/link_thumbnailer/scrapers/opengraph/base.rb +2 -0
  45. data/lib/link_thumbnailer/scrapers/opengraph/description.rb +2 -0
  46. data/lib/link_thumbnailer/scrapers/opengraph/favicon.rb +2 -0
  47. data/lib/link_thumbnailer/scrapers/opengraph/image.rb +7 -1
  48. data/lib/link_thumbnailer/scrapers/opengraph/images.rb +2 -0
  49. data/lib/link_thumbnailer/scrapers/opengraph/title.rb +2 -0
  50. data/lib/link_thumbnailer/scrapers/opengraph/video.rb +2 -0
  51. data/lib/link_thumbnailer/scrapers/opengraph/videos.rb +2 -0
  52. data/lib/link_thumbnailer/uri.rb +20 -0
  53. data/lib/link_thumbnailer/version.rb +3 -1
  54. data/lib/link_thumbnailer/video_parser.rb +3 -1
  55. data/link_thumbnailer.gemspec +8 -6
  56. data/spec/configuration_spec.rb +4 -2
  57. data/spec/fixture_spec.rb +21 -0
  58. data/spec/fixtures/default_with_few_favicons.html +15 -0
  59. data/spec/fixtures/google_shift_jis.html +6 -0
  60. data/spec/fixtures/google_utf8.html +6 -0
  61. data/spec/fixtures/google_utf8_no_meta_charset.html +6 -0
  62. data/spec/fixtures/with_related_path_in_href.html +13 -0
  63. data/spec/fixtures/with_root_path_in_href.html +13 -0
  64. data/spec/grader_spec.rb +3 -1
  65. data/spec/graders/base_spec.rb +2 -0
  66. data/spec/graders/html_attribute_spec.rb +9 -7
  67. data/spec/graders/length_spec.rb +10 -6
  68. data/spec/graders/link_density_spec.rb +4 -2
  69. data/spec/graders/position_spec.rb +8 -6
  70. data/spec/image_comparators/size_spec.rb +2 -0
  71. data/spec/image_validator_spec.rb +3 -1
  72. data/spec/model_spec.rb +2 -0
  73. data/spec/models/description_spec.rb +3 -1
  74. data/spec/models/favicon_spec.rb +2 -0
  75. data/spec/models/image_spec.rb +6 -4
  76. data/spec/models/title_spec.rb +2 -0
  77. data/spec/models/video_spec.rb +7 -5
  78. data/spec/models/website_spec.rb +5 -3
  79. data/spec/page_spec.rb +2 -0
  80. data/spec/processor_spec.rb +74 -23
  81. data/spec/response_spec.rb +84 -0
  82. data/spec/scraper_spec.rb +6 -4
  83. data/spec/scrapers/base_spec.rb +6 -4
  84. data/spec/scrapers/opengraph/base_spec.rb +8 -6
  85. data/spec/spec_helper.rb +2 -0
  86. data/spec/uri_spec.rb +44 -0
  87. data/spec/video_parser_spec.rb +15 -13
  88. metadata +37 -19
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'link_thumbnailer/model'
2
4
 
3
5
  module LinkThumbnailer
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'link_thumbnailer/model'
2
4
  require 'link_thumbnailer/video_parser'
3
5
 
@@ -1,52 +1,54 @@
1
- require 'link_thumbnailer/model'
2
-
3
- module LinkThumbnailer
4
- module Models
5
- class Website < ::LinkThumbnailer::Model
6
-
7
- attr_accessor :url, :title, :description, :images, :videos, :favicon
8
-
9
- def initialize
10
- @images = []
11
- @videos = []
12
- end
13
-
14
- def video=(video)
15
- self.videos = video
16
- end
17
-
18
- def videos=(videos)
19
- Array(videos).each do |video|
20
- @videos << video
21
- end
22
- end
23
-
24
- def image=(image)
25
- self.images = image
26
- end
27
-
28
- def images=(images)
29
- Array(images).each do |image|
30
- next unless image.valid?
31
- @images << image
32
- end
33
- end
34
-
35
- def images
36
- @images.sort!
37
- end
38
-
39
- def as_json(*)
40
- {
41
- url: url.to_s,
42
- favicon: favicon,
43
- title: title,
44
- description: description,
45
- images: images.map(&:as_json),
46
- videos: videos.map(&:as_json)
47
- }
48
- end
49
-
50
- end
51
- end
52
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'link_thumbnailer/model'
4
+
5
+ module LinkThumbnailer
6
+ module Models
7
+ class Website < ::LinkThumbnailer::Model
8
+
9
+ attr_accessor :url, :title, :description, :images, :videos, :favicon
10
+
11
+ def initialize
12
+ @images = []
13
+ @videos = []
14
+ end
15
+
16
+ def video=(video)
17
+ self.videos = video
18
+ end
19
+
20
+ def videos=(videos)
21
+ Array(videos).each do |video|
22
+ @videos << video
23
+ end
24
+ end
25
+
26
+ def image=(image)
27
+ self.images = image
28
+ end
29
+
30
+ def images=(images)
31
+ Array(images).each do |image|
32
+ next unless image.valid?
33
+ @images << image
34
+ end
35
+ end
36
+
37
+ def images
38
+ @images.sort!
39
+ end
40
+
41
+ def as_json(*)
42
+ {
43
+ url: url.to_s,
44
+ favicon: favicon,
45
+ title: title,
46
+ description: description,
47
+ images: images.map(&:as_json),
48
+ videos: videos.map(&:as_json)
49
+ }
50
+ end
51
+
52
+ end
53
+ end
54
+ end
@@ -1,3 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'link_thumbnailer/response'
1
4
  require 'link_thumbnailer/processor'
2
5
  require 'link_thumbnailer/scraper'
3
6
 
@@ -14,7 +17,7 @@ module LinkThumbnailer
14
17
  end
15
18
 
16
19
  def generate
17
- @source = processor.call(url)
20
+ @source = processor.start(url)
18
21
  scraper.call
19
22
  end
20
23
 
@@ -1,10 +1,12 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'nokogiri'
2
4
 
3
5
  module LinkThumbnailer
4
6
  class Parser
5
7
 
6
8
  def call(source)
7
- ::Nokogiri::HTML(source)
9
+ ::Nokogiri::HTML(source, nil, LinkThumbnailer.page.config.encoding)
8
10
  rescue ::Nokogiri::XML::SyntaxError => e
9
11
  raise ::LinkThumbnailer::SyntaxError.new(e.message)
10
12
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'delegate'
2
4
  require 'uri'
3
5
  require 'net/http/persistent'
@@ -15,6 +17,12 @@ module LinkThumbnailer
15
17
  super(config)
16
18
  end
17
19
 
20
+ def start(url)
21
+ result = call(url)
22
+ shutdown
23
+ result
24
+ end
25
+
18
26
  def call(url = '', redirect_count = 0, headers = {})
19
27
  self.url = url
20
28
  @redirect_count = redirect_count
@@ -26,12 +34,16 @@ module LinkThumbnailer
26
34
  set_http_options
27
35
  perform_request
28
36
  end
29
- rescue ::Net::HTTPExceptions, ::SocketError, ::Timeout::Error => e
37
+ rescue ::Net::HTTPExceptions, ::SocketError, ::Timeout::Error, ::Net::HTTP::Persistent::Error => e
30
38
  raise ::LinkThumbnailer::HTTPError.new(e.message)
31
39
  end
32
40
 
33
41
  private
34
42
 
43
+ def shutdown
44
+ http.shutdown
45
+ end
46
+
35
47
  def with_valid_url
36
48
  raise ::LinkThumbnailer::BadUriFormat unless valid_url_format?
37
49
  yield if block_given?
@@ -39,8 +51,8 @@ module LinkThumbnailer
39
51
 
40
52
  def set_http_headers(headers = {})
41
53
  headers.each { |k, v| http.headers[k] = v }
42
- http.headers['User-Agent'] = user_agent
43
- http.override_headers['Accept-Encoding'] = 'none'
54
+ http.override_headers['User-Agent'] = user_agent
55
+ config.http_override_headers.each { |k, v| http.override_headers[k] = v }
44
56
  end
45
57
 
46
58
  def set_http_options
@@ -51,7 +63,7 @@ module LinkThumbnailer
51
63
  end
52
64
 
53
65
  def perform_request
54
- response = http.request(url)
66
+ response = request_in_chunks
55
67
  headers = {}
56
68
  headers['Cookie'] = response['Set-Cookie'] if response['Set-Cookie'].present?
57
69
 
@@ -59,7 +71,7 @@ module LinkThumbnailer
59
71
 
60
72
  case response
61
73
  when ::Net::HTTPSuccess
62
- response.body
74
+ Response.new(response).body
63
75
  when ::Net::HTTPRedirection
64
76
  call(
65
77
  resolve_relative_url(response['location'].to_s),
@@ -71,6 +83,19 @@ module LinkThumbnailer
71
83
  end
72
84
  end
73
85
 
86
+ def request_in_chunks
87
+ body = String.new
88
+ response = http.request(url) do |resp|
89
+ raise ::LinkThumbnailer::DownloadSizeLimit if too_big_download_size?(resp.content_length)
90
+ resp.read_body do |chunk|
91
+ body.concat(chunk)
92
+ raise ::LinkThumbnailer::DownloadSizeLimit if too_big_download_size?(body.length)
93
+ end
94
+ end
95
+ response.body = body
96
+ response
97
+ end
98
+
74
99
  def resolve_relative_url(location)
75
100
  location.start_with?('http') ? location : build_absolute_url_for(location)
76
101
  end
@@ -99,6 +124,10 @@ module LinkThumbnailer
99
124
  config.verify_ssl
100
125
  end
101
126
 
127
+ def download_size_limit
128
+ config.download_size_limit
129
+ end
130
+
102
131
  def too_many_redirections?
103
132
  redirect_count > redirect_limit
104
133
  end
@@ -118,6 +147,10 @@ module LinkThumbnailer
118
147
  false
119
148
  end
120
149
 
150
+ def too_big_download_size?(size)
151
+ size.to_i > download_size_limit.to_i
152
+ end
153
+
121
154
  def url=(url)
122
155
  @url = ::URI.parse(url.to_s)
123
156
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module LinkThumbnailer
2
4
  class Railtie < ::Rails::Railtie
3
5
  end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LinkThumbnailer
4
+ class Response
5
+ def initialize(response)
6
+ @response = response
7
+ end
8
+
9
+ def charset
10
+ @charset ||= extract_charset
11
+ end
12
+
13
+ def body
14
+ @body ||= extract_body
15
+ end
16
+
17
+ private
18
+
19
+ def extract_charset
20
+ content_type = @response['Content-Type'] || ''
21
+ m = content_type.match(/charset=([\w-]+)/)
22
+ (m && m[1]) || @response.body.scrub =~ /<meta[^>]*charset\s*=\s*["']?(.+?)["' >]/i && $1 || ''
23
+ end
24
+
25
+ def extract_body
26
+ should_convert_body_to_utf8? ? convert_encoding_to_utf8(@response.body, charset) : @response.body
27
+ end
28
+
29
+ def should_convert_body_to_utf8?
30
+ charset != '' && charset != 'utf-8'
31
+ end
32
+
33
+ def convert_encoding_to_utf8(body, from)
34
+ Encoding::Converter.new(from, 'utf-8').convert(body)
35
+ rescue EncodingError
36
+ body
37
+ end
38
+ end
39
+ end
@@ -1,60 +1,62 @@
1
- require 'delegate'
2
- require 'active_support/core_ext/object/blank'
3
- require 'active_support/inflector'
4
-
5
- require 'link_thumbnailer/parser'
6
- require 'link_thumbnailer/models/website'
7
- require 'link_thumbnailer/scrapers/default/title'
8
- require 'link_thumbnailer/scrapers/opengraph/title'
9
- require 'link_thumbnailer/scrapers/default/description'
10
- require 'link_thumbnailer/scrapers/opengraph/description'
11
- require 'link_thumbnailer/scrapers/default/images'
12
- require 'link_thumbnailer/scrapers/opengraph/images'
13
- require 'link_thumbnailer/scrapers/default/videos'
14
- require 'link_thumbnailer/scrapers/opengraph/videos'
15
- require 'link_thumbnailer/scrapers/default/favicon'
16
- require 'link_thumbnailer/scrapers/opengraph/favicon'
17
-
18
- module LinkThumbnailer
19
- class Scraper < ::SimpleDelegator
20
-
21
- attr_reader :document, :source, :url, :config, :website
22
-
23
- def initialize(source, url)
24
- @source = source
25
- @url = url
26
- @config = ::LinkThumbnailer.page.config
27
- @document = parser.call(source)
28
- @website = ::LinkThumbnailer::Models::Website.new
29
- @website.url = url
30
-
31
- super(config)
32
- end
33
-
34
- def call
35
- config.attributes.each do |name|
36
- config.scrapers.each do |scraper_prefix|
37
- scraper_class(scraper_prefix, name).new(document, website).call(name.to_s)
38
- break unless website.send(name).blank?
39
- end
40
- end
41
-
42
- website
43
- end
44
-
45
- private
46
-
47
- def scraper_class(prefix, name)
48
- prefix = "::LinkThumbnailer::Scrapers::#{prefix.to_s.camelize}"
49
- name = name.to_s.camelize
50
- "#{prefix}::#{name}".constantize
51
- rescue NameError
52
- raise ::LinkThumbnailer::ScraperInvalid, "scraper named '#{prefix}::#{name}' does not exists."
53
- end
54
-
55
- def parser
56
- ::LinkThumbnailer::Parser.new
57
- end
58
-
59
- end
60
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'delegate'
4
+ require 'active_support/core_ext/object/blank'
5
+ require 'active_support/inflector'
6
+
7
+ require 'link_thumbnailer/parser'
8
+ require 'link_thumbnailer/models/website'
9
+ require 'link_thumbnailer/scrapers/default/title'
10
+ require 'link_thumbnailer/scrapers/opengraph/title'
11
+ require 'link_thumbnailer/scrapers/default/description'
12
+ require 'link_thumbnailer/scrapers/opengraph/description'
13
+ require 'link_thumbnailer/scrapers/default/images'
14
+ require 'link_thumbnailer/scrapers/opengraph/images'
15
+ require 'link_thumbnailer/scrapers/default/videos'
16
+ require 'link_thumbnailer/scrapers/opengraph/videos'
17
+ require 'link_thumbnailer/scrapers/default/favicon'
18
+ require 'link_thumbnailer/scrapers/opengraph/favicon'
19
+
20
+ module LinkThumbnailer
21
+ class Scraper < ::SimpleDelegator
22
+
23
+ attr_reader :document, :source, :url, :config, :website
24
+
25
+ def initialize(source, url)
26
+ @source = source
27
+ @url = url
28
+ @config = ::LinkThumbnailer.page.config
29
+ @document = parser.call(source)
30
+ @website = ::LinkThumbnailer::Models::Website.new
31
+ @website.url = url
32
+
33
+ super(config)
34
+ end
35
+
36
+ def call
37
+ config.attributes.each do |name|
38
+ config.scrapers.each do |scraper_prefix|
39
+ scraper_class(scraper_prefix, name).new(document, website).call(name.to_s)
40
+ break unless website.send(name).blank?
41
+ end
42
+ end
43
+
44
+ website
45
+ end
46
+
47
+ private
48
+
49
+ def scraper_class(prefix, name)
50
+ prefix = "::LinkThumbnailer::Scrapers::#{prefix.to_s.camelize}"
51
+ name = name.to_s.camelize
52
+ "#{prefix}::#{name}".constantize
53
+ rescue NameError
54
+ raise ::LinkThumbnailer::ScraperInvalid, "scraper named '#{prefix}::#{name}' does not exists."
55
+ end
56
+
57
+ def parser
58
+ ::LinkThumbnailer::Parser.new
59
+ end
60
+
61
+ end
62
+ end
@@ -1,67 +1,69 @@
1
- require 'delegate'
2
- require 'link_thumbnailer/models/title'
3
- require 'link_thumbnailer/models/description'
4
- require 'link_thumbnailer/models/image'
5
- require 'link_thumbnailer/models/video'
6
-
7
- module LinkThumbnailer
8
- module Scrapers
9
- class Base < ::SimpleDelegator
10
-
11
- attr_reader :config, :document, :website, :attribute_name
12
-
13
- def initialize(document, website = nil)
14
- @config = ::LinkThumbnailer.page.config
15
- @document = document
16
- @website = website
17
-
18
- super(config)
19
- end
20
-
21
- def call(attribute_name)
22
- return false unless website.present?
23
- return false unless applicable?
24
-
25
- @attribute_name = attribute_name
26
-
27
- website.send("#{attribute_name}=", value)
28
- website
29
- end
30
-
31
- def applicable?
32
- true
33
- end
34
-
35
- def value
36
- fail NotImplementedError
37
- end
38
-
39
- private
40
-
41
- def meta_xpath(options = {})
42
- meta_xpaths(options).first
43
- end
44
-
45
- def meta_xpaths(options = {})
46
- key = options.fetch(:key, :property)
47
- value = options.fetch(:value, :content)
48
- attribute = options.fetch(:attribute, attribute_name)
49
-
50
- document.xpath("//meta[translate(@#{key},'#{abc.upcase}','#{abc}') = '#{attribute}' and @#{value}]")
51
- end
52
-
53
- def abc
54
- 'abcdefghijklmnopqrstuvwxyz'
55
- end
56
-
57
- def model_class
58
- "::LinkThumbnailer::Models::#{attribute_name.to_s.camelize}".constantize
59
- end
60
-
61
- def modelize(node, text = nil)
62
- model_class.new(node, text)
63
- end
64
-
65
- end
66
- end
67
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'delegate'
4
+ require 'link_thumbnailer/models/title'
5
+ require 'link_thumbnailer/models/description'
6
+ require 'link_thumbnailer/models/image'
7
+ require 'link_thumbnailer/models/video'
8
+
9
+ module LinkThumbnailer
10
+ module Scrapers
11
+ class Base < ::SimpleDelegator
12
+
13
+ attr_reader :config, :document, :website, :attribute_name
14
+
15
+ def initialize(document, website = nil)
16
+ @config = ::LinkThumbnailer.page.config
17
+ @document = document
18
+ @website = website
19
+
20
+ super(config)
21
+ end
22
+
23
+ def call(attribute_name)
24
+ return false unless website.present?
25
+ return false unless applicable?
26
+
27
+ @attribute_name = attribute_name
28
+
29
+ website.send("#{attribute_name}=", value)
30
+ website
31
+ end
32
+
33
+ def applicable?
34
+ true
35
+ end
36
+
37
+ def value
38
+ fail NotImplementedError
39
+ end
40
+
41
+ private
42
+
43
+ def meta_xpath(options = {})
44
+ meta_xpaths(options).first
45
+ end
46
+
47
+ def meta_xpaths(options = {})
48
+ key = options.fetch(:key, :property)
49
+ value = options.fetch(:value, :content)
50
+ attribute = options.fetch(:attribute, attribute_name)
51
+
52
+ document.xpath("//meta[translate(@#{key},'#{abc.upcase}','#{abc}') = '#{attribute}' and string-length(@#{value}) > 0]")
53
+ end
54
+
55
+ def abc
56
+ 'abcdefghijklmnopqrstuvwxyz'
57
+ end
58
+
59
+ def model_class
60
+ "::LinkThumbnailer::Models::#{attribute_name.to_s.camelize}".constantize
61
+ end
62
+
63
+ def modelize(node, text = nil)
64
+ model_class.new(node, text)
65
+ end
66
+
67
+ end
68
+ end
69
+ end