link_thumbnailer 3.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (95) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +5 -0
  6. data/CHANGELOG.md +334 -0
  7. data/Gemfile +12 -0
  8. data/LICENSE.txt +22 -0
  9. data/README.md +210 -0
  10. data/Rakefile +9 -0
  11. data/lib/generators/link_thumbnailer/install_generator.rb +17 -0
  12. data/lib/generators/templates/initializer.rb +89 -0
  13. data/lib/link_thumbnailer.rb +38 -0
  14. data/lib/link_thumbnailer/configuration.rb +72 -0
  15. data/lib/link_thumbnailer/exceptions.rb +11 -0
  16. data/lib/link_thumbnailer/grader.rb +43 -0
  17. data/lib/link_thumbnailer/graders/base.rb +39 -0
  18. data/lib/link_thumbnailer/graders/html_attribute.rb +48 -0
  19. data/lib/link_thumbnailer/graders/length.rb +37 -0
  20. data/lib/link_thumbnailer/graders/link_density.rb +20 -0
  21. data/lib/link_thumbnailer/graders/position.rb +13 -0
  22. data/lib/link_thumbnailer/image_comparator.rb +26 -0
  23. data/lib/link_thumbnailer/image_comparators/base.rb +19 -0
  24. data/lib/link_thumbnailer/image_comparators/size.rb +13 -0
  25. data/lib/link_thumbnailer/image_parser.rb +62 -0
  26. data/lib/link_thumbnailer/image_validator.rb +32 -0
  27. data/lib/link_thumbnailer/model.rb +20 -0
  28. data/lib/link_thumbnailer/models/description.rb +37 -0
  29. data/lib/link_thumbnailer/models/favicon.rb +27 -0
  30. data/lib/link_thumbnailer/models/image.rb +56 -0
  31. data/lib/link_thumbnailer/models/title.rb +22 -0
  32. data/lib/link_thumbnailer/models/video.rb +44 -0
  33. data/lib/link_thumbnailer/models/website.rb +54 -0
  34. data/lib/link_thumbnailer/page.rb +43 -0
  35. data/lib/link_thumbnailer/parser.rb +15 -0
  36. data/lib/link_thumbnailer/processor.rb +128 -0
  37. data/lib/link_thumbnailer/railtie.rb +6 -0
  38. data/lib/link_thumbnailer/response.rb +39 -0
  39. data/lib/link_thumbnailer/scraper.rb +62 -0
  40. data/lib/link_thumbnailer/scrapers/base.rb +69 -0
  41. data/lib/link_thumbnailer/scrapers/default/base.rb +12 -0
  42. data/lib/link_thumbnailer/scrapers/default/description.rb +49 -0
  43. data/lib/link_thumbnailer/scrapers/default/favicon.rb +38 -0
  44. data/lib/link_thumbnailer/scrapers/default/images.rb +78 -0
  45. data/lib/link_thumbnailer/scrapers/default/title.rb +27 -0
  46. data/lib/link_thumbnailer/scrapers/default/videos.rb +18 -0
  47. data/lib/link_thumbnailer/scrapers/opengraph/base.rb +45 -0
  48. data/lib/link_thumbnailer/scrapers/opengraph/description.rb +12 -0
  49. data/lib/link_thumbnailer/scrapers/opengraph/favicon.rb +17 -0
  50. data/lib/link_thumbnailer/scrapers/opengraph/image.rb +107 -0
  51. data/lib/link_thumbnailer/scrapers/opengraph/images.rb +18 -0
  52. data/lib/link_thumbnailer/scrapers/opengraph/title.rb +12 -0
  53. data/lib/link_thumbnailer/scrapers/opengraph/video.rb +115 -0
  54. data/lib/link_thumbnailer/scrapers/opengraph/videos.rb +18 -0
  55. data/lib/link_thumbnailer/uri.rb +20 -0
  56. data/lib/link_thumbnailer/version.rb +5 -0
  57. data/lib/link_thumbnailer/video_parser.rb +47 -0
  58. data/link_thumbnailer.gemspec +29 -0
  59. data/spec/configuration_spec.rb +61 -0
  60. data/spec/fixture_spec.rb +114 -0
  61. data/spec/fixtures/bar.png +2907 -0
  62. data/spec/fixtures/default_from_body.html +13 -0
  63. data/spec/fixtures/default_from_meta.html +12 -0
  64. data/spec/fixtures/foo.png +0 -0
  65. data/spec/fixtures/google_shift_jis.html +6 -0
  66. data/spec/fixtures/google_utf8.html +6 -0
  67. data/spec/fixtures/og_not_valid_example.html +12 -0
  68. data/spec/fixtures/og_valid_example.html +18 -0
  69. data/spec/fixtures/og_valid_multi_image_example.html +13 -0
  70. data/spec/fixtures/og_valid_multi_video_example.html +13 -0
  71. data/spec/grader_spec.rb +27 -0
  72. data/spec/graders/base_spec.rb +14 -0
  73. data/spec/graders/html_attribute_spec.rb +50 -0
  74. data/spec/graders/length_spec.rb +93 -0
  75. data/spec/graders/link_density_spec.rb +52 -0
  76. data/spec/graders/position_spec.rb +49 -0
  77. data/spec/image_comparators/size_spec.rb +58 -0
  78. data/spec/image_validator_spec.rb +37 -0
  79. data/spec/model_spec.rb +27 -0
  80. data/spec/models/description_spec.rb +66 -0
  81. data/spec/models/favicon_spec.rb +12 -0
  82. data/spec/models/image_spec.rb +95 -0
  83. data/spec/models/title_spec.rb +26 -0
  84. data/spec/models/video_spec.rb +49 -0
  85. data/spec/models/website_spec.rb +51 -0
  86. data/spec/page_spec.rb +28 -0
  87. data/spec/processor_spec.rb +410 -0
  88. data/spec/response_spec.rb +62 -0
  89. data/spec/scraper_spec.rb +70 -0
  90. data/spec/scrapers/base_spec.rb +69 -0
  91. data/spec/scrapers/opengraph/base_spec.rb +96 -0
  92. data/spec/spec_helper.rb +11 -0
  93. data/spec/uri_spec.rb +44 -0
  94. data/spec/video_parser_spec.rb +148 -0
  95. metadata +271 -0
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+
5
+ module LinkThumbnailer
6
# Builds a Nokogiri HTML document from raw page source.
class Parser
  # Parses +source+ with the encoding taken from the current page's config.
  #
  # @param source [String] raw HTML markup
  # @return the object returned by Nokogiri::HTML
  # @raise [LinkThumbnailer::SyntaxError] when Nokogiri raises
  #   Nokogiri::XML::SyntaxError; the original message is carried over
  def call(source)
    encoding = LinkThumbnailer.page.config.encoding
    ::Nokogiri::HTML(source, nil, encoding)
  rescue ::Nokogiri::XML::SyntaxError => error
    raise ::LinkThumbnailer::SyntaxError, error.message
  end
end
15
+ end
@@ -0,0 +1,128 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'delegate'
4
+ require 'uri'
5
+ require 'net/http/persistent'
6
+
7
+ module LinkThumbnailer
8
# Fetches the body of the page behind a URL over a persistent HTTP
# connection, following redirects up to the configured limit.
#
# Being a SimpleDelegator around the configuration object, any method not
# defined here is forwarded to +config+.
class Processor < ::SimpleDelegator

  # A Location header that already carries an explicit http(s) scheme.
  ABSOLUTE_URL = %r{\Ahttps?://}i

  # Content types accepted when +config.raise_on_invalid_format+ is set.
  # Each pattern may match anywhere in the Content-Type header value.
  SUPPORTED_CONTENT_TYPES = [
    %r{text/html},
    %r{application/html},
    %r{application/xhtml\+xml},
    %r{application/xml},
    %r{text/xml},
    %r{text/plain}
  ].freeze

  # The writer for +url+ is defined privately at the bottom of the class, so
  # only the reader is generated here (avoids a method-redefinition warning).
  attr_reader :url, :config, :http, :redirect_count

  def initialize
    @config = ::LinkThumbnailer.page.config
    @http   = ::Net::HTTP::Persistent.new

    super(config)
  end

  # Requests +url+ and returns the response body.
  #
  # @param url [String, URI] address to fetch; parsed with URI.parse
  # @param redirect_count [Integer] number of redirects followed so far
  # @param headers [Hash] extra request headers (used to forward cookies
  #   when following a redirect)
  # @return the body produced by Response for a successful request
  # @raise [LinkThumbnailer::RedirectLimit] when redirect_count exceeds the
  #   configured limit
  # @raise [LinkThumbnailer::BadUriFormat] when the URL is not http(s)
  # @raise [LinkThumbnailer::FormatNotSupported] when the content type is
  #   rejected by #valid_response_format?
  # @raise [LinkThumbnailer::HTTPError] wrapping HTTP, socket and timeout
  #   errors
  def call(url = '', redirect_count = 0, headers = {})
    self.url        = url
    @redirect_count = redirect_count

    raise ::LinkThumbnailer::RedirectLimit if too_many_redirections?

    with_valid_url do
      set_http_headers(headers)
      set_http_options
      perform_request
    end
  rescue ::Net::HTTPExceptions, ::SocketError, ::Timeout::Error => e
    raise ::LinkThumbnailer::HTTPError.new(e.message)
  end

  private

  # Yields only when the current URL is an http(s) URI.
  def with_valid_url
    raise ::LinkThumbnailer::BadUriFormat unless valid_url_format?
    yield if block_given?
  end

  # Applies per-call headers first, then the override headers (User-Agent
  # and everything listed in config.http_override_headers).
  def set_http_headers(headers = {})
    headers.each { |k, v| http.headers[k] = v }
    http.override_headers['User-Agent'] = user_agent
    config.http_override_headers.each { |k, v| http.override_headers[k] = v }
  end

  def set_http_options
    # Certificate verification is switched off only when config.verify_ssl is falsy.
    http.verify_mode  = ::OpenSSL::SSL::VERIFY_NONE unless ssl_required?
    http.open_timeout = http_open_timeout
    http.read_timeout = http_read_timeout
    http.proxy = :ENV
  end

  # Performs the request and dispatches on the response class: success
  # returns the body, a redirection re-enters #call, anything else raises
  # through Net::HTTPResponse#error!.
  def perform_request
    response          = http.request(url)
    headers           = {}
    # NOTE(review): `present?` is presumably ActiveSupport's, loaded elsewhere.
    headers['Cookie'] = response['Set-Cookie'] if response['Set-Cookie'].present?

    raise ::LinkThumbnailer::FormatNotSupported.new(response['Content-Type']) unless valid_response_format?(response)

    case response
    when ::Net::HTTPSuccess
      Response.new(response).body
    when ::Net::HTTPRedirection
      call(
        resolve_relative_url(response['location'].to_s),
        redirect_count + 1,
        headers
      )
    else
      response.error!
    end
  end

  # Returns +location+ untouched when it is already an absolute http(s)
  # URL; otherwise resolves it against the URL currently being fetched.
  def resolve_relative_url(location)
    location =~ ABSOLUTE_URL ? location : build_absolute_url_for(location)
  end

  # Resolves a relative reference against the current URL following
  # RFC 3986, so the port is kept and path-relative ("foo") as well as
  # scheme-relative ("//host/x") locations are handled.
  #
  # @return [URI]
  def build_absolute_url_for(relative_url)
    ::URI.join(url.to_s, relative_url)
  end

  def redirect_limit
    config.redirect_limit
  end

  def user_agent
    config.user_agent
  end

  def http_open_timeout
    config.http_open_timeout
  end

  def http_read_timeout
    config.http_read_timeout
  end

  def ssl_required?
    config.verify_ssl
  end

  def too_many_redirections?
    redirect_count > redirect_limit
  end

  def valid_url_format?
    url.is_a?(::URI::HTTP)
  end

  # Every response is accepted unless config.raise_on_invalid_format is
  # set; in that case the Content-Type must match one of
  # SUPPORTED_CONTENT_TYPES. A missing header never matches.
  def valid_response_format?(response)
    return true unless config.raise_on_invalid_format

    content_type = response['Content-Type']
    SUPPORTED_CONTENT_TYPES.any? { |format| content_type =~ format }
  end

  # Stores the URL as a parsed URI whatever the caller handed in.
  def url=(url)
    @url = ::URI.parse(url.to_s)
  end

end
128
+ end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LinkThumbnailer
4
# Empty Rails::Railtie subclass; it declares no initializers or
# configuration of its own.
class Railtie < ::Rails::Railtie; end
6
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LinkThumbnailer
4
# Wraps an HTTP response and exposes its body, converted to UTF-8 when the
# Content-Type header announces a different charset.
class Response
  # Captures the charset token of a Content-Type header. Hyphens, dots,
  # colons and plus signs are part of the token ("iso-8859-1",
  # "windows-1252"); an optional opening quote is skipped.
  CHARSET_PATTERN = /charset=["']?([\w.:+-]+)/i

  # @param response an object answering +[]+ with header values and +body+
  def initialize(response)
    @response = response
  end

  # @return [String] charset announced by the Content-Type header, or ''
  def charset
    @charset ||= extract_charset
  end

  # @return [String] response body, re-encoded to UTF-8 when needed and possible
  def body
    @body ||= extract_body
  end

  private

  def extract_charset
    content_type = @response['Content-Type'] || ''
    match = content_type.match(CHARSET_PATTERN)
    match ? match[1] : ''
  end

  def extract_body
    should_convert_body_to_utf8? ? convert_encoding_to_utf8(@response.body, charset) : @response.body
  end

  # Conversion is pointless without a charset or when it already is UTF-8
  # (compared case-insensitively, headers often say "UTF-8").
  def should_convert_body_to_utf8?
    !charset.empty? && charset.downcase != 'utf-8'
  end

  # Best effort: an unknown charset or undecodable bytes leave the body untouched.
  def convert_encoding_to_utf8(body, from)
    Encoding::Converter.new(from, 'utf-8').convert(body)
  rescue EncodingError
    body
  end
end
39
+ end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'delegate'
4
+ require 'active_support/core_ext/object/blank'
5
+ require 'active_support/inflector'
6
+
7
+ require 'link_thumbnailer/parser'
8
+ require 'link_thumbnailer/models/website'
9
+ require 'link_thumbnailer/scrapers/default/title'
10
+ require 'link_thumbnailer/scrapers/opengraph/title'
11
+ require 'link_thumbnailer/scrapers/default/description'
12
+ require 'link_thumbnailer/scrapers/opengraph/description'
13
+ require 'link_thumbnailer/scrapers/default/images'
14
+ require 'link_thumbnailer/scrapers/opengraph/images'
15
+ require 'link_thumbnailer/scrapers/default/videos'
16
+ require 'link_thumbnailer/scrapers/opengraph/videos'
17
+ require 'link_thumbnailer/scrapers/default/favicon'
18
+ require 'link_thumbnailer/scrapers/opengraph/favicon'
19
+
20
+ module LinkThumbnailer
21
# Parses a page and fills a Website model by running, for every configured
# attribute, the configured scrapers in order until one yields a value.
#
# Methods not defined here are delegated to the configuration object.
class Scraper < ::SimpleDelegator

  attr_reader :document, :source, :url, :config, :website

  # @param source [String] raw page markup handed to the parser
  # @param url the page address, stored on the website model
  def initialize(source, url)
    @source   = source
    @url      = url
    @config   = ::LinkThumbnailer.page.config
    @document = parser.call(source)
    @website  = ::LinkThumbnailer::Models::Website.new
    @website.url = url

    super(config)
  end

  # Runs the scrapers and returns the populated website model.
  #
  # For each attribute the scraper prefixes are tried in configuration
  # order; iteration stops at the first one that leaves a non-blank value
  # on the website.
  def call
    config.attributes.each do |attribute|
      config.scrapers.find do |scraper_prefix|
        scraper_class(scraper_prefix, attribute).new(document, website).call(attribute.to_s)
        !website.send(attribute).blank?
      end
    end

    website
  end

  private

  # Resolves e.g. (:opengraph, :title) to
  # ::LinkThumbnailer::Scrapers::Opengraph::Title.
  #
  # @raise [LinkThumbnailer::ScraperInvalid] when no such constant exists
  def scraper_class(prefix, name)
    prefix = "::LinkThumbnailer::Scrapers::#{prefix.to_s.camelize}"
    name   = name.to_s.camelize
    [prefix, name].join('::').constantize
  rescue NameError
    raise ::LinkThumbnailer::ScraperInvalid, "scraper named '#{prefix}::#{name}' does not exists."
  end

  def parser
    ::LinkThumbnailer::Parser.new
  end

end
62
+ end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'delegate'
4
+ require 'link_thumbnailer/models/title'
5
+ require 'link_thumbnailer/models/description'
6
+ require 'link_thumbnailer/models/image'
7
+ require 'link_thumbnailer/models/video'
8
+
9
+ module LinkThumbnailer
10
+ module Scrapers
11
# Shared behaviour of all scrapers: a scraper computes a +value+ from the
# parsed document and assigns it to one attribute of the website model.
#
# Methods not defined here are delegated to the configuration object.
class Base < ::SimpleDelegator

  attr_reader :config, :document, :website, :attribute_name

  # @param document parsed page the scraper reads from
  # @param website model receiving the scraped value (may be nil)
  def initialize(document, website = nil)
    @config   = ::LinkThumbnailer.page.config
    @document = document
    @website  = website

    super(config)
  end

  # Assigns the scraped value to +attribute_name+ on the website.
  #
  # @return the website, or false when there is no website or the scraper
  #   is not applicable
  def call(attribute_name)
    return false unless website.present? && applicable?

    @attribute_name = attribute_name
    website.send("#{attribute_name}=", value)

    website
  end

  # Hook for subclasses; the base implementation always applies.
  def applicable?
    true
  end

  # Subclasses must return the value to store on the website.
  def value
    raise NotImplementedError
  end

  private

  # First node returned by #meta_xpaths for the same options.
  def meta_xpath(options = {})
    meta_xpaths(options).first
  end

  # Finds <meta> nodes whose +key+ attribute (lower-cased through XPath
  # translate) equals +attribute+ and whose +value+ attribute is not empty.
  #
  # Options: :key (default :property), :value (default :content),
  # :attribute (default the current attribute_name).
  def meta_xpaths(options = {})
    key_attr   = options.fetch(:key, :property)
    value_attr = options.fetch(:value, :content)
    wanted     = options.fetch(:attribute, attribute_name)
    lowercased = "translate(@#{key_attr},'#{abc.upcase}','#{abc}')"

    document.xpath("//meta[#{lowercased} = '#{wanted}' and string-length(@#{value_attr}) > 0]")
  end

  # Lower-case alphabet used to build the translate() call above.
  def abc
    'abcdefghijklmnopqrstuvwxyz'
  end

  # Model class named after the attribute, e.g. 'title' -> Models::Title.
  def model_class
    "::LinkThumbnailer::Models::#{attribute_name.to_s.camelize}".constantize
  end

  def modelize(node, text = nil)
    model_class.new(node, text)
  end

end
68
+ end
69
+ end
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'link_thumbnailer/scrapers/base'
4
+
5
+ module LinkThumbnailer
6
+ module Scrapers
7
+ module Default
8
# Common ancestor of the scrapers in the Default namespace; it adds nothing
# to ::LinkThumbnailer::Scrapers::Base.
class Base < ::LinkThumbnailer::Scrapers::Base; end
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'link_thumbnailer/scrapers/default/base'
4
+
5
+ module LinkThumbnailer
6
+ module Scrapers
7
+ module Default
8
# Extracts a page description, preferring the description <meta> tag and
# falling back to the best-ranked <p>/<td> node of the body.
class Description < ::LinkThumbnailer::Scrapers::Default::Base

  # @return [String, nil] string form of the chosen description model, or
  #   nil when neither the meta tag nor the body yields a candidate
  def value
    # Build the model once instead of once for the test and once for the result.
    model = model_from_meta || model_from_body
    model.to_s if model
  end

  private

  def model_from_meta
    modelize(node_from_meta, node_from_meta.attributes['content'].value) if node_from_meta
  end

  # One model per candidate node; the last element after sorting wins.
  # NOTE(review): presumably the description model defines <=>; confirm.
  def model_from_body
    nodes_from_body.each_with_index.map { |node, i| modelize(node, node.text, i) }.sort.last
  end

  def node_from_meta
    @node_from_meta ||= meta_xpath(key: :name)
  end

  # Memoized: #modelize needs the candidate count for every single node,
  # and re-running the CSS query each time made the scan quadratic.
  def nodes_from_body
    @nodes_from_body ||= candidates.select { |node| valid_paragraph?(node) }
  end

  # Every candidate is currently accepted; kept as an extension point.
  def valid_paragraph?(node)
    true
  end

  def candidates
    document.css('p,td')
  end

  # The model receives the node, its text, the node's index among the
  # candidates and the total number of candidates.
  def modelize(node, text, i = 0)
    model_class.new(node, text, i, nodes_from_body.count)
  end

end
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'link_thumbnailer/scrapers/default/base'
4
+ require 'link_thumbnailer/models/favicon'
5
+
6
+ module LinkThumbnailer
7
+ module Scrapers
8
+ module Default
9
# Extracts the favicon URL from the first <link> element whose rel
# attribute contains "icon".
class Favicon < ::LinkThumbnailer::Scrapers::Default::Base

  # @return [String] string form of the favicon model built from the
  #   parsed href (the model receives nil when there is no usable href)
  def value
    modelize(to_uri(href)).to_s
  end

  private

  # @return [URI, nil] nil when +href+ is missing or not a valid URI
  def to_uri(href)
    return nil if href.nil?

    ::URI.parse(href)
  rescue ::URI::InvalidURIError
    nil
  end

  # Reads the attribute through Node#[] so a <link rel="icon"> without an
  # href yields nil instead of raising NoMethodError; the node is looked up
  # only once.
  def href
    icon = node
    icon['href'] if icon
  end

  def node
    document.xpath("//link[contains(@rel, 'icon')]").first
  end

  def modelize(uri)
    model_class.new(uri)
  end

end
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'link_thumbnailer/scrapers/default/base'
4
+ require 'link_thumbnailer/models/image'
5
+
6
+ module LinkThumbnailer
7
+ module Scrapers
8
+ module Default
9
# Collects the images referenced by <img> tags, turns their sources into
# absolute URIs and wraps the parsed results into image models.
class Images < ::LinkThumbnailer::Scrapers::Default::Base

  # @return [Array] one image model per image returned by ImageParser
  def value
    images.map { |image| modelize(image.uri, image.size, image.type) }
  end

  private

  def images
    ::LinkThumbnailer::ImageParser.new(allowed_urls).images
  end

  # At most config.image_limit URIs, in document order.
  def allowed_urls
    abs_urls.first(config.image_limit)
  end

  # src attribute of every <img>; tags without one are skipped.
  def urls
    document.search('//img').map { |i| i['src'] }.compact
  end

  # Absolute URIs for all parsable sources. Unparsable sources are dropped
  # so they neither use up image_limit slots nor reach ImageParser as nil.
  def abs_urls
    absolute = urls.map do |url|
      uri = validate_url(url)
      next unless uri

      needs_prefix?(uri) ? prefix_uri(uri) : uri
    end

    absolute.compact
  end

  # @return [URI, nil] nil when +url+ cannot be parsed
  def validate_url(url)
    ::URI.parse(url.to_s)
  rescue ::URI::InvalidURIError
    nil
  end

  # A URI without a host has to be resolved against the page (or <base>) URL.
  def needs_prefix?(uri)
    !uri.host
  end

  def prefix_uri(uri)
    ::URI.join(prefix_url, uri)
  end

  def prefix_url
    base_href || website.url
  end

  # href of <head><base>, but only when it parses and carries a host.
  def base_href
    base = document.at('//head/base')
    base['href'] if base && ::URI.parse(base['href']).host
  rescue ::URI::InvalidURIError
    nil
  end

  def model_class
    ::LinkThumbnailer::Models::Image
  end

  def modelize(uri, size = nil, type = nil)
    model_class.new(uri, size, type)
  end

end
76
+ end
77
+ end
78
+ end