link_thumbnailer 3.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +5 -0
  6. data/CHANGELOG.md +334 -0
  7. data/Gemfile +12 -0
  8. data/LICENSE.txt +22 -0
  9. data/README.md +210 -0
  10. data/Rakefile +9 -0
  11. data/lib/generators/link_thumbnailer/install_generator.rb +17 -0
  12. data/lib/generators/templates/initializer.rb +89 -0
  13. data/lib/link_thumbnailer.rb +38 -0
  14. data/lib/link_thumbnailer/configuration.rb +72 -0
  15. data/lib/link_thumbnailer/exceptions.rb +11 -0
  16. data/lib/link_thumbnailer/grader.rb +43 -0
  17. data/lib/link_thumbnailer/graders/base.rb +39 -0
  18. data/lib/link_thumbnailer/graders/html_attribute.rb +48 -0
  19. data/lib/link_thumbnailer/graders/length.rb +37 -0
  20. data/lib/link_thumbnailer/graders/link_density.rb +20 -0
  21. data/lib/link_thumbnailer/graders/position.rb +13 -0
  22. data/lib/link_thumbnailer/image_comparator.rb +26 -0
  23. data/lib/link_thumbnailer/image_comparators/base.rb +19 -0
  24. data/lib/link_thumbnailer/image_comparators/size.rb +13 -0
  25. data/lib/link_thumbnailer/image_parser.rb +62 -0
  26. data/lib/link_thumbnailer/image_validator.rb +32 -0
  27. data/lib/link_thumbnailer/model.rb +20 -0
  28. data/lib/link_thumbnailer/models/description.rb +37 -0
  29. data/lib/link_thumbnailer/models/favicon.rb +27 -0
  30. data/lib/link_thumbnailer/models/image.rb +56 -0
  31. data/lib/link_thumbnailer/models/title.rb +22 -0
  32. data/lib/link_thumbnailer/models/video.rb +44 -0
  33. data/lib/link_thumbnailer/models/website.rb +54 -0
  34. data/lib/link_thumbnailer/page.rb +43 -0
  35. data/lib/link_thumbnailer/parser.rb +15 -0
  36. data/lib/link_thumbnailer/processor.rb +128 -0
  37. data/lib/link_thumbnailer/railtie.rb +6 -0
  38. data/lib/link_thumbnailer/response.rb +39 -0
  39. data/lib/link_thumbnailer/scraper.rb +62 -0
  40. data/lib/link_thumbnailer/scrapers/base.rb +69 -0
  41. data/lib/link_thumbnailer/scrapers/default/base.rb +12 -0
  42. data/lib/link_thumbnailer/scrapers/default/description.rb +49 -0
  43. data/lib/link_thumbnailer/scrapers/default/favicon.rb +38 -0
  44. data/lib/link_thumbnailer/scrapers/default/images.rb +78 -0
  45. data/lib/link_thumbnailer/scrapers/default/title.rb +27 -0
  46. data/lib/link_thumbnailer/scrapers/default/videos.rb +18 -0
  47. data/lib/link_thumbnailer/scrapers/opengraph/base.rb +45 -0
  48. data/lib/link_thumbnailer/scrapers/opengraph/description.rb +12 -0
  49. data/lib/link_thumbnailer/scrapers/opengraph/favicon.rb +17 -0
  50. data/lib/link_thumbnailer/scrapers/opengraph/image.rb +107 -0
  51. data/lib/link_thumbnailer/scrapers/opengraph/images.rb +18 -0
  52. data/lib/link_thumbnailer/scrapers/opengraph/title.rb +12 -0
  53. data/lib/link_thumbnailer/scrapers/opengraph/video.rb +115 -0
  54. data/lib/link_thumbnailer/scrapers/opengraph/videos.rb +18 -0
  55. data/lib/link_thumbnailer/uri.rb +20 -0
  56. data/lib/link_thumbnailer/version.rb +5 -0
  57. data/lib/link_thumbnailer/video_parser.rb +47 -0
  58. data/link_thumbnailer.gemspec +29 -0
  59. data/spec/configuration_spec.rb +61 -0
  60. data/spec/fixture_spec.rb +114 -0
  61. data/spec/fixtures/bar.png +2907 -0
  62. data/spec/fixtures/default_from_body.html +13 -0
  63. data/spec/fixtures/default_from_meta.html +12 -0
  64. data/spec/fixtures/foo.png +0 -0
  65. data/spec/fixtures/google_shift_jis.html +6 -0
  66. data/spec/fixtures/google_utf8.html +6 -0
  67. data/spec/fixtures/og_not_valid_example.html +12 -0
  68. data/spec/fixtures/og_valid_example.html +18 -0
  69. data/spec/fixtures/og_valid_multi_image_example.html +13 -0
  70. data/spec/fixtures/og_valid_multi_video_example.html +13 -0
  71. data/spec/grader_spec.rb +27 -0
  72. data/spec/graders/base_spec.rb +14 -0
  73. data/spec/graders/html_attribute_spec.rb +50 -0
  74. data/spec/graders/length_spec.rb +93 -0
  75. data/spec/graders/link_density_spec.rb +52 -0
  76. data/spec/graders/position_spec.rb +49 -0
  77. data/spec/image_comparators/size_spec.rb +58 -0
  78. data/spec/image_validator_spec.rb +37 -0
  79. data/spec/model_spec.rb +27 -0
  80. data/spec/models/description_spec.rb +66 -0
  81. data/spec/models/favicon_spec.rb +12 -0
  82. data/spec/models/image_spec.rb +95 -0
  83. data/spec/models/title_spec.rb +26 -0
  84. data/spec/models/video_spec.rb +49 -0
  85. data/spec/models/website_spec.rb +51 -0
  86. data/spec/page_spec.rb +28 -0
  87. data/spec/processor_spec.rb +410 -0
  88. data/spec/response_spec.rb +62 -0
  89. data/spec/scraper_spec.rb +70 -0
  90. data/spec/scrapers/base_spec.rb +69 -0
  91. data/spec/scrapers/opengraph/base_spec.rb +96 -0
  92. data/spec/spec_helper.rb +11 -0
  93. data/spec/uri_spec.rb +44 -0
  94. data/spec/video_parser_spec.rb +148 -0
  95. metadata +271 -0
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+
5
module LinkThumbnailer
  # Builds a Nokogiri HTML document out of raw page source.
  class Parser
    # Parses +source+ with Nokogiri, using the encoding taken from the
    # current page configuration.
    #
    # @param source [String] markup to parse
    # @return [Object] whatever +Nokogiri::HTML+ returns for the source
    # @raise [LinkThumbnailer::SyntaxError] when Nokogiri raises
    #   +Nokogiri::XML::SyntaxError+; the original message is preserved
    def call(source)
      begin
        encoding = LinkThumbnailer.page.config.encoding
        ::Nokogiri::HTML(source, nil, encoding)
      rescue ::Nokogiri::XML::SyntaxError => error
        raise ::LinkThumbnailer::SyntaxError, error.message
      end
    end
  end
end
@@ -0,0 +1,128 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'delegate'
4
+ require 'uri'
5
+ require 'net/http/persistent'
6
+
7
module LinkThumbnailer
  # Fetches a URL over a persistent HTTP connection and returns the response
  # body, following redirects until the configured limit is exceeded.
  #
  # Messages the processor does not define itself are delegated to the
  # current page configuration (see +SimpleDelegator+).
  class Processor < ::SimpleDelegator
    # Matches a Location value that already carries an http(s) scheme.
    ABSOLUTE_HTTP_URL = %r{\Ahttps?://}i

    # Content types accepted when +config.raise_on_invalid_format+ is set.
    SUPPORTED_CONTENT_TYPES = %r{
      text/html | application/html | application/xhtml\+xml |
      application/xml | text/xml | text/plain
    }x

    attr_accessor :url
    attr_reader :config, :http, :redirect_count

    def initialize
      @config = ::LinkThumbnailer.page.config
      @http   = ::Net::HTTP::Persistent.new

      super(config)
    end

    # Performs the request for +url+.
    #
    # @param url [String, URI] address to fetch
    # @param redirect_count [Integer] number of redirects followed so far
    # @param headers [Hash] extra request headers (used to forward cookies
    #   when following a redirect)
    # @return [String] the response body on success
    # @raise [LinkThumbnailer::RedirectLimit] when +redirect_count+ exceeds
    #   the configured limit
    # @raise [LinkThumbnailer::BadUriFormat] when +url+ is not an HTTP(S) URI
    # @raise [LinkThumbnailer::FormatNotSupported] when the content type is
    #   rejected by #valid_response_format?
    # @raise [LinkThumbnailer::HTTPError] on HTTP, socket or timeout errors
    def call(url = '', redirect_count = 0, headers = {})
      self.url        = url
      @redirect_count = redirect_count

      raise ::LinkThumbnailer::RedirectLimit if too_many_redirections?

      with_valid_url do
        set_http_headers(headers)
        set_http_options
        perform_request
      end
    rescue ::Net::HTTPExceptions, ::SocketError, ::Timeout::Error => e
      raise ::LinkThumbnailer::HTTPError.new(e.message)
    end

    private

    # Yields only when the current URL parsed as an HTTP(S) URI.
    def with_valid_url
      raise ::LinkThumbnailer::BadUriFormat unless valid_url_format?
      yield if block_given?
    end

    # Copies per-request headers, then applies the overriding ones
    # (User-Agent first, then everything from the configuration).
    def set_http_headers(headers = {})
      headers.each { |k, v| http.headers[k] = v }
      http.override_headers['User-Agent'] = user_agent
      config.http_override_headers.each { |k, v| http.override_headers[k] = v }
    end

    def set_http_options
      # Certificate verification is switched off unless the configuration
      # asks for it.
      http.verify_mode  = ::OpenSSL::SSL::VERIFY_NONE unless ssl_required?
      http.open_timeout = http_open_timeout
      http.read_timeout = http_read_timeout
      http.proxy        = :ENV
    end

    # Issues the request and dispatches on the response class.
    # Note: the content-type check runs before the dispatch, so it applies
    # to redirect responses as well as successful ones.
    def perform_request
      response          = http.request(url)
      headers           = {}
      headers['Cookie'] = response['Set-Cookie'] if response['Set-Cookie'].present?

      raise ::LinkThumbnailer::FormatNotSupported.new(response['Content-Type']) unless valid_response_format?(response)

      case response
      when ::Net::HTTPSuccess
        Response.new(response).body
      when ::Net::HTTPRedirection
        call(
          resolve_relative_url(response['location'].to_s),
          redirect_count + 1,
          headers
        )
      else
        response.error!
      end
    end

    # Returns +location+ untouched when it is already an absolute http(s)
    # URL, otherwise resolves it against the URL being processed.
    def resolve_relative_url(location)
      return location if location =~ ABSOLUTE_HTTP_URL

      build_absolute_url_for(location)
    end

    # Resolves +relative_url+ against the current URL.
    #
    # URI.join keeps the port of the current URL and also handles
    # path-relative ("next") and protocol-relative ("//host/path")
    # references, which plain "scheme://host + path" concatenation does not.
    #
    # @return [URI]
    # @raise [URI::InvalidURIError] when the reference cannot be parsed
    def build_absolute_url_for(relative_url)
      ::URI.join(url.to_s, relative_url)
    end

    def redirect_limit
      config.redirect_limit
    end

    def user_agent
      config.user_agent
    end

    def http_open_timeout
      config.http_open_timeout
    end

    def http_read_timeout
      config.http_read_timeout
    end

    def ssl_required?
      config.verify_ssl
    end

    def too_many_redirections?
      redirect_count > redirect_limit
    end

    def valid_url_format?
      url.is_a?(::URI::HTTP)
    end

    # Always true when the configuration does not ask for format checking;
    # otherwise true only for the content types listed in
    # SUPPORTED_CONTENT_TYPES.
    def valid_response_format?(response)
      return true unless config.raise_on_invalid_format

      !(response['Content-Type'].to_s =~ SUPPORTED_CONTENT_TYPES).nil?
    end

    # Stores the URL as a parsed URI, whatever form it was given in.
    def url=(url)
      @url = ::URI.parse(url.to_s)
    end
  end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
module LinkThumbnailer
  # Railtie subclass for the gem; it declares no initializers or
  # configuration of its own.
  class Railtie < ::Rails::Railtie; end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module LinkThumbnailer
  # Wraps an HTTP response and exposes its body converted to UTF-8 when the
  # Content-Type header announces another charset.
  class Response
    # Captures the charset token of a Content-Type header. Hyphens, dots and
    # colons are allowed so names such as "ISO-8859-1" or "utf-8" are captured
    # whole; an optional quote around the value is skipped.
    CHARSET_PATTERN = /charset\s*=\s*["']?([\w.:-]+)/i

    UTF8 = 'utf-8'

    # @param response [#[], #body] object answering header lookups through
    #   +[]+ and exposing +body+
    def initialize(response)
      @response = response
    end

    # @return [String] charset named by the Content-Type header, or '' when
    #   the header is missing or names none
    def charset
      @charset ||= extract_charset
    end

    # @return [String, nil] the response body, converted to UTF-8 when a
    #   non-UTF-8 charset was announced and the conversion succeeds
    def body
      @body ||= extract_body
    end

    private

    def extract_charset
      content_type = @response['Content-Type'] || ''
      match = content_type.match(CHARSET_PATTERN)
      match ? match[1] : ''
    end

    def extract_body
      raw = @response.body
      # Nothing to convert for an absent body.
      return raw if raw.nil? || !should_convert_body_to_utf8?

      convert_encoding_to_utf8(raw, charset)
    end

    # Charset names are case-insensitive, so "UTF-8" must not trigger a
    # conversion either.
    def should_convert_body_to_utf8?
      !charset.empty? && charset.casecmp(UTF8) != 0
    end

    # Best effort: an unknown charset or undecodable bytes leave the body
    # untouched instead of failing the whole request.
    def convert_encoding_to_utf8(body, from)
      Encoding::Converter.new(from, UTF8).convert(body)
    rescue EncodingError
      body
    end
  end
end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'delegate'
4
+ require 'active_support/core_ext/object/blank'
5
+ require 'active_support/inflector'
6
+
7
+ require 'link_thumbnailer/parser'
8
+ require 'link_thumbnailer/models/website'
9
+ require 'link_thumbnailer/scrapers/default/title'
10
+ require 'link_thumbnailer/scrapers/opengraph/title'
11
+ require 'link_thumbnailer/scrapers/default/description'
12
+ require 'link_thumbnailer/scrapers/opengraph/description'
13
+ require 'link_thumbnailer/scrapers/default/images'
14
+ require 'link_thumbnailer/scrapers/opengraph/images'
15
+ require 'link_thumbnailer/scrapers/default/videos'
16
+ require 'link_thumbnailer/scrapers/opengraph/videos'
17
+ require 'link_thumbnailer/scrapers/default/favicon'
18
+ require 'link_thumbnailer/scrapers/opengraph/favicon'
19
+
20
module LinkThumbnailer
  # Runs the configured scrapers over a page source and collects the results
  # in a Website model. Unknown messages are delegated to the configuration.
  class Scraper < ::SimpleDelegator
    attr_reader :document, :source, :url, :config, :website

    # @param source [String] page markup, handed to the parser
    # @param url [Object] page address, stored on the website model
    def initialize(source, url)
      @source   = source
      @url      = url
      @config   = ::LinkThumbnailer.page.config
      @document = parser.call(source)
      @website  = ::LinkThumbnailer::Models::Website.new
      @website.url = url

      super(config)
    end

    # For every configured attribute, tries the configured scrapers in order
    # and stops at the first one that leaves a non-blank value on the website.
    #
    # @return [LinkThumbnailer::Models::Website]
    # @raise [LinkThumbnailer::ScraperInvalid] when a scraper class is missing
    def call
      config.attributes.each do |attribute|
        config.scrapers.each do |scraper_prefix|
          scraper = scraper_class(scraper_prefix, attribute).new(document, website)
          scraper.call(attribute.to_s)
          break if website.send(attribute).present?
        end
      end

      website
    end

    private

    # Looks up ::LinkThumbnailer::Scrapers::<Prefix>::<Name>.
    def scraper_class(prefix, name)
      prefix = ['::LinkThumbnailer::Scrapers', prefix.to_s.camelize].join('::')
      name   = name.to_s.camelize
      [prefix, name].join('::').constantize
    rescue NameError
      raise ::LinkThumbnailer::ScraperInvalid, "scraper named '#{prefix}::#{name}' does not exists."
    end

    def parser
      ::LinkThumbnailer::Parser.new
    end
  end
end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'delegate'
4
+ require 'link_thumbnailer/models/title'
5
+ require 'link_thumbnailer/models/description'
6
+ require 'link_thumbnailer/models/image'
7
+ require 'link_thumbnailer/models/video'
8
+
9
module LinkThumbnailer
  module Scrapers
    # Shared behavior of all scrapers: a scraper computes #value from the
    # document and assigns it to one attribute of the website model.
    # Unknown messages are delegated to the configuration.
    class Base < ::SimpleDelegator
      attr_reader :config, :document, :website, :attribute_name

      # @param document [Object] parsed document queried through +xpath+
      # @param website [Object, nil] model that receives the scraped value
      def initialize(document, website = nil)
        @config   = ::LinkThumbnailer.page.config
        @document = document
        @website  = website

        super(config)
      end

      # Assigns #value to +attribute_name+ on the website.
      #
      # @param attribute_name [String] attribute to fill
      # @return [Object, false] the website, or false when there is no
      #   website or the scraper is not applicable
      def call(attribute_name)
        return false if website.blank?
        return false unless applicable?

        @attribute_name = attribute_name
        website.send("#{attribute_name}=", value)

        website
      end

      # Subclasses may override to opt out for a given document.
      def applicable?
        true
      end

      # Subclasses must return the scraped value.
      def value
        raise NotImplementedError
      end

      private

      # First meta node matching +options+, or nil.
      def meta_xpath(options = {})
        meta_xpaths(options).first
      end

      # Meta nodes whose +key+ attribute equals the wanted attribute name
      # (upper-case ASCII letters in the node's attribute are folded to
      # lower case before comparing) and whose +value+ attribute is not empty.
      def meta_xpaths(options = {})
        key_attr     = options.fetch(:key, :property)
        content_attr = options.fetch(:value, :content)
        wanted       = options.fetch(:attribute, attribute_name)
        lower        = abc

        document.xpath(
          "//meta[translate(@#{key_attr},'#{lower.upcase}','#{lower}') = '#{wanted}' and string-length(@#{content_attr}) > 0]"
        )
      end

      def abc
        'abcdefghijklmnopqrstuvwxyz'
      end

      # Model class named after the attribute being scraped.
      def model_class
        "::LinkThumbnailer::Models::#{attribute_name.to_s.camelize}".constantize
      end

      def modelize(node, text = nil)
        model_class.new(node, text)
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'link_thumbnailer/scrapers/base'
4
+
5
module LinkThumbnailer
  module Scrapers
    module Default
      # Common ancestor of the scrapers in the Default namespace; it adds no
      # behavior of its own to Scrapers::Base.
      class Base < ::LinkThumbnailer::Scrapers::Base; end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'link_thumbnailer/scrapers/default/base'
4
+
5
module LinkThumbnailer
  module Scrapers
    module Default
      # Scrapes a description from the <meta name="..."> tag when present,
      # otherwise from the highest-ranked <p>/<td> node of the document.
      class Description < ::LinkThumbnailer::Scrapers::Default::Base
        # @return [String, nil] description text, or nil when neither the
        #   meta tag nor the body yields a candidate
        def value
          # Build the model once instead of once for the test and once for
          # the result.
          model = model_from_meta || model_from_body
          model.to_s if model
        end

        private

        def model_from_meta
          node = node_from_meta
          modelize(node, node.attributes['content'].value) if node
        end

        # Wraps every candidate node in a model and keeps the one that sorts
        # last.
        def model_from_body
          nodes_from_body.each_with_index.map { |node, i| modelize(node, node.text, i) }.sort.last
        end

        def node_from_meta
          @node_from_meta ||= meta_xpath(key: :name)
        end

        # Memoized: #modelize needs the candidate count for every node, and
        # re-running the CSS query each time made the body scan quadratic.
        def nodes_from_body
          @nodes_from_body ||= candidates.select { |node| valid_paragraph?(node) }
        end

        # Hook for filtering candidates; accepts every node.
        def valid_paragraph?(node)
          true
        end

        def candidates
          document.css('p,td')
        end

        # @param node [Object] source node
        # @param text [String] text of the node
        # @param i [Integer] position of the node among the body candidates
        def modelize(node, text, i = 0)
          model_class.new(node, text, i, nodes_from_body.count)
        end
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'link_thumbnailer/scrapers/default/base'
4
+ require 'link_thumbnailer/models/favicon'
5
+
6
module LinkThumbnailer
  module Scrapers
    module Default
      # Scrapes the favicon address from the first <link> whose rel attribute
      # contains "icon".
      class Favicon < ::LinkThumbnailer::Scrapers::Default::Base
        # @return [String] string form of the favicon model built from the
        #   parsed href (the model receives nil when no usable href exists)
        def value
          modelize(to_uri(href)).to_s
        end

        private

        # @return [URI, nil] parsed +href+, or nil when it is missing or
        #   not a valid URI
        def to_uri(href)
          return nil if href.nil?

          ::URI.parse(href)
        rescue ::URI::InvalidURIError
          nil
        end

        # Reads the attribute through Node#[] so an icon link without an
        # href yields nil instead of raising NoMethodError on a missing
        # attribute object.
        def href
          node && node['href']
        end

        # Memoized so the XPath query runs once per scraper (it is re-run
        # only when no node was found).
        def node
          @node ||= document.xpath("//link[contains(@rel, 'icon')]").first
        end

        def modelize(uri)
          model_class.new(uri)
        end
      end
    end
  end
end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'link_thumbnailer/scrapers/default/base'
4
+ require 'link_thumbnailer/models/image'
5
+
6
module LinkThumbnailer
  module Scrapers
    module Default
      # Scrapes <img> sources from the document, makes them absolute and
      # turns the ones returned by the image parser into Image models.
      class Images < ::LinkThumbnailer::Scrapers::Default::Base
        # @return [Array<LinkThumbnailer::Models::Image>]
        def value
          images.map { |image| modelize(image.uri, image.size, image.type) }
        end

        private

        def images
          ::LinkThumbnailer::ImageParser.new(allowed_urls).images
        end

        # At most +config.image_limit+ URLs, in document order.
        def allowed_urls
          abs_urls.first(config.image_limit)
        end

        def urls
          document.search('//img').map { |i| i['src'] }.compact
        end

        # Absolute URIs for every parsable image source. Unparsable sources
        # are dropped here so they neither use up the image limit nor reach
        # the image parser as nil entries.
        def abs_urls
          urls.each_with_object([]) do |url, result|
            uri = validate_url(url)
            next unless uri

            result << (needs_prefix?(uri) ? prefix_uri(uri) : uri)
          end
        end

        # @return [URI, nil] nil when +url+ is not a valid URI
        def validate_url(url)
          ::URI.parse(url.to_s)
        rescue ::URI::InvalidURIError
          nil
        end

        # A URI without a host is resolved against the page.
        def needs_prefix?(uri)
          !uri.host
        end

        def prefix_uri(uri)
          ::URI.join(prefix_url, uri)
        end

        # <base href> wins over the page URL when it names a host.
        def prefix_url
          base_href || website.url
        end

        def base_href
          base = document.at('//head/base')
          base['href'] if base && ::URI.parse(base['href']).host
        rescue ::URI::InvalidURIError
          nil
        end

        def model_class
          ::LinkThumbnailer::Models::Image
        end

        def modelize(uri, size = nil, type = nil)
          model_class.new(uri, size, type)
        end
      end
    end
  end
end