link_thumbnailer 3.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (95) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +5 -0
  6. data/CHANGELOG.md +334 -0
  7. data/Gemfile +12 -0
  8. data/LICENSE.txt +22 -0
  9. data/README.md +210 -0
  10. data/Rakefile +9 -0
  11. data/lib/generators/link_thumbnailer/install_generator.rb +17 -0
  12. data/lib/generators/templates/initializer.rb +89 -0
  13. data/lib/link_thumbnailer.rb +38 -0
  14. data/lib/link_thumbnailer/configuration.rb +72 -0
  15. data/lib/link_thumbnailer/exceptions.rb +11 -0
  16. data/lib/link_thumbnailer/grader.rb +43 -0
  17. data/lib/link_thumbnailer/graders/base.rb +39 -0
  18. data/lib/link_thumbnailer/graders/html_attribute.rb +48 -0
  19. data/lib/link_thumbnailer/graders/length.rb +37 -0
  20. data/lib/link_thumbnailer/graders/link_density.rb +20 -0
  21. data/lib/link_thumbnailer/graders/position.rb +13 -0
  22. data/lib/link_thumbnailer/image_comparator.rb +26 -0
  23. data/lib/link_thumbnailer/image_comparators/base.rb +19 -0
  24. data/lib/link_thumbnailer/image_comparators/size.rb +13 -0
  25. data/lib/link_thumbnailer/image_parser.rb +62 -0
  26. data/lib/link_thumbnailer/image_validator.rb +32 -0
  27. data/lib/link_thumbnailer/model.rb +20 -0
  28. data/lib/link_thumbnailer/models/description.rb +37 -0
  29. data/lib/link_thumbnailer/models/favicon.rb +27 -0
  30. data/lib/link_thumbnailer/models/image.rb +56 -0
  31. data/lib/link_thumbnailer/models/title.rb +22 -0
  32. data/lib/link_thumbnailer/models/video.rb +44 -0
  33. data/lib/link_thumbnailer/models/website.rb +54 -0
  34. data/lib/link_thumbnailer/page.rb +43 -0
  35. data/lib/link_thumbnailer/parser.rb +15 -0
  36. data/lib/link_thumbnailer/processor.rb +128 -0
  37. data/lib/link_thumbnailer/railtie.rb +6 -0
  38. data/lib/link_thumbnailer/response.rb +39 -0
  39. data/lib/link_thumbnailer/scraper.rb +62 -0
  40. data/lib/link_thumbnailer/scrapers/base.rb +69 -0
  41. data/lib/link_thumbnailer/scrapers/default/base.rb +12 -0
  42. data/lib/link_thumbnailer/scrapers/default/description.rb +49 -0
  43. data/lib/link_thumbnailer/scrapers/default/favicon.rb +38 -0
  44. data/lib/link_thumbnailer/scrapers/default/images.rb +78 -0
  45. data/lib/link_thumbnailer/scrapers/default/title.rb +27 -0
  46. data/lib/link_thumbnailer/scrapers/default/videos.rb +18 -0
  47. data/lib/link_thumbnailer/scrapers/opengraph/base.rb +45 -0
  48. data/lib/link_thumbnailer/scrapers/opengraph/description.rb +12 -0
  49. data/lib/link_thumbnailer/scrapers/opengraph/favicon.rb +17 -0
  50. data/lib/link_thumbnailer/scrapers/opengraph/image.rb +107 -0
  51. data/lib/link_thumbnailer/scrapers/opengraph/images.rb +18 -0
  52. data/lib/link_thumbnailer/scrapers/opengraph/title.rb +12 -0
  53. data/lib/link_thumbnailer/scrapers/opengraph/video.rb +115 -0
  54. data/lib/link_thumbnailer/scrapers/opengraph/videos.rb +18 -0
  55. data/lib/link_thumbnailer/uri.rb +20 -0
  56. data/lib/link_thumbnailer/version.rb +5 -0
  57. data/lib/link_thumbnailer/video_parser.rb +47 -0
  58. data/link_thumbnailer.gemspec +29 -0
  59. data/spec/configuration_spec.rb +61 -0
  60. data/spec/fixture_spec.rb +114 -0
  61. data/spec/fixtures/bar.png +2907 -0
  62. data/spec/fixtures/default_from_body.html +13 -0
  63. data/spec/fixtures/default_from_meta.html +12 -0
  64. data/spec/fixtures/foo.png +0 -0
  65. data/spec/fixtures/google_shift_jis.html +6 -0
  66. data/spec/fixtures/google_utf8.html +6 -0
  67. data/spec/fixtures/og_not_valid_example.html +12 -0
  68. data/spec/fixtures/og_valid_example.html +18 -0
  69. data/spec/fixtures/og_valid_multi_image_example.html +13 -0
  70. data/spec/fixtures/og_valid_multi_video_example.html +13 -0
  71. data/spec/grader_spec.rb +27 -0
  72. data/spec/graders/base_spec.rb +14 -0
  73. data/spec/graders/html_attribute_spec.rb +50 -0
  74. data/spec/graders/length_spec.rb +93 -0
  75. data/spec/graders/link_density_spec.rb +52 -0
  76. data/spec/graders/position_spec.rb +49 -0
  77. data/spec/image_comparators/size_spec.rb +58 -0
  78. data/spec/image_validator_spec.rb +37 -0
  79. data/spec/model_spec.rb +27 -0
  80. data/spec/models/description_spec.rb +66 -0
  81. data/spec/models/favicon_spec.rb +12 -0
  82. data/spec/models/image_spec.rb +95 -0
  83. data/spec/models/title_spec.rb +26 -0
  84. data/spec/models/video_spec.rb +49 -0
  85. data/spec/models/website_spec.rb +51 -0
  86. data/spec/page_spec.rb +28 -0
  87. data/spec/processor_spec.rb +410 -0
  88. data/spec/response_spec.rb +62 -0
  89. data/spec/scraper_spec.rb +70 -0
  90. data/spec/scrapers/base_spec.rb +69 -0
  91. data/spec/scrapers/opengraph/base_spec.rb +96 -0
  92. data/spec/spec_helper.rb +11 -0
  93. data/spec/uri_spec.rb +44 -0
  94. data/spec/video_parser_spec.rb +148 -0
  95. metadata +271 -0
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new('spec')
7
+
8
+ task default: :spec
9
+ task test: :spec
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module LinkThumbnailer
  module Generators
    # Generator (a Rails::Generators::Base subclass) that adds the
    # LinkThumbnailer initializer to an application.
    class InstallGenerator < ::Rails::Generators::Base
      # Templates are looked up in the `templates` directory that sits next to
      # the directory containing this file.
      source_root File.expand_path('../templates', File.dirname(__FILE__))

      desc 'Creates a LinkThumbnailer initializer for your application.'

      # Hands the `initializer.rb` template to `template`, targeting
      # `config/initializers/link_thumbnailer.rb`.
      def copy_initializer
        template('initializer.rb', 'config/initializers/link_thumbnailer.rb')
      end
    end
  end
end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Use this hook to configure LinkThumbnailer behaviors.
4
+ LinkThumbnailer.configure do |config|
5
+ # Number of redirects allowed before raising an exception when trying to parse the given url.
6
+ #
7
+ # config.redirect_limit = 3
8
+
9
+ # Set user agent
10
+ #
11
+ # config.user_agent = 'link_thumbnailer'
12
+
13
+ # Enable or disable SSL verification
14
+ #
15
+ # config.verify_ssl = true
16
+
17
+ # The amount of time in seconds to wait for a connection to be opened.
18
+ # If the HTTP object cannot open a connection in this many seconds,
19
+ # it raises a Net::OpenTimeout exception.
20
+ #
21
+ # See http://www.ruby-doc.org/stdlib-2.1.1/libdoc/net/http/rdoc/Net/HTTP.html#open_timeout
22
+ #
23
+ # config.http_open_timeout = 5
24
+
25
+ # List of blacklisted urls you want to skip when searching for images.
26
+ #
27
+ # config.blacklist_urls = [
28
+ # %r{^http://ad\.doubleclick\.net/},
29
+ # %r{^http://b\.scorecardresearch\.com/},
30
+ # %r{^http://pixel\.quantserve\.com/},
31
+ # %r{^http://s7\.addthis\.com/}
32
+ # ]
33
+
34
+ # List of attributes you want LinkThumbnailer to fetch on a website.
35
+ #
36
+ # config.attributes = [:title, :images, :description, :videos, :favicon]
37
+
38
+ # List of procedures used to rate the website description. Add your custom class
39
+ # here. See wiki for more details on how to build your own graders.
40
+ #
41
+ # config.graders = [
42
+ # ->(description) { ::LinkThumbnailer::Graders::Length.new(description) },
43
+ # ->(description) { ::LinkThumbnailer::Graders::HtmlAttribute.new(description, :class) },
44
+ # ->(description) { ::LinkThumbnailer::Graders::HtmlAttribute.new(description, :id) },
45
+ # ->(description) { ::LinkThumbnailer::Graders::Position.new(description, weight: 3) },
46
+ # ->(description) { ::LinkThumbnailer::Graders::LinkDensity.new(description) }
47
+ # ]
48
+
49
+ # Minimum description length for a website.
50
+ #
51
+ # config.description_min_length = 50
52
+
53
+ # Regex of words considered positive to rate website description.
54
+ #
55
+ # config.positive_regex = /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i
56
+
57
+ # Regex of words considered negative to rate website description.
58
+ #
59
+ # config.negative_regex = /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget|modal/i
60
+
61
+ # Number of images to fetch. Fetching too many images will be slow.
62
+ # Note that LinkThumbnailer will only sort fetched images between each other.
63
+ # Meaning that there could be a "better" image on the page.
64
+ #
65
+ # config.image_limit = 5
66
+
67
+ # Whether you want LinkThumbnailer to return image size and type or not.
68
+ # Setting this value to false will increase performance since for each image, LinkThumbnailer
69
+ # does not have to fetch its size and type.
70
+ #
71
+ # config.image_stats = true
72
+
73
+ # Whether you want LinkThumbnailer to raise an exception if the Content-Type of the HTTP request
74
+ # is not an html or xml.
75
+ #
76
+ # config.raise_on_invalid_format = false
77
+
78
+ # Sets the number of concurrent http connections that can be opened to fetch image information such as size and type.
79
+ #
80
+ # config.max_concurrency = 20
81
+
82
+ # Defines the strategies to use to scrape the website. See the [Open Graph Protocol](http://ogp.me/) for more information.
83
+ #
84
+ # config.scrapers = [:opengraph, :default]
85
+
86
+ # Sets the default encoding.
87
+ #
88
+ # config.encoding = 'utf-8'
89
+ end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'link_thumbnailer/version'
5
+ require 'link_thumbnailer/configuration'
6
+ require 'link_thumbnailer/exceptions'
7
+ require 'link_thumbnailer/page'
8
+
9
module LinkThumbnailer
  class << self
    # The LinkThumbnailer::Page built by the most recent call to `generate`
    # (nil until `generate` has been called).
    attr_reader :page
  end

  # Builds a LinkThumbnailer::Page for the given url, remembers it so it can be
  # read back through `LinkThumbnailer.page`, and returns the result of calling
  # `generate` on it.
  #
  # @param url the url handed to LinkThumbnailer::Page.new
  # @param options [Hash] options handed to LinkThumbnailer::Page.new
  # @return whatever LinkThumbnailer::Page#generate returns
  def self.generate(url, options = {})
    @page = ::LinkThumbnailer::Page.new(url, options)
    @page.generate
  end
end
24
+
25
+ begin
26
+ require 'rails'
27
+ rescue LoadError
28
+ end
29
+
30
+ $stderr.puts <<-EOC if !defined?(Rails)
31
+ warning: no framework detected.
32
+
33
+ Your Gemfile might not be configured properly.
34
+ ---- e.g. ----
35
+ Rails:
36
+ gem 'link_thumbnailer'
37
+
38
+ EOC
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
module LinkThumbnailer
  class << self
    # Access point for the gem configuration. The instance is created on first
    # use and the same object is returned afterwards.
    #
    # @return [LinkThumbnailer::Configuration] a configuration instance.
    def config
      @config ||= Configuration.new
    end

    # Configure hook used in the gem initializer. Convenient way to set all the
    # gem configurations. Does nothing when called without a block.
    #
    # @example inside config/initializers/link_thumbnailer.rb
    #   LinkThumbnailer.configure do |config|
    #     config.user_agent = 'link_thumbnailer'
    #   end
    #
    # @yieldparam config [LinkThumbnailer::Configuration]
    # @return [void]
    def configure
      yield config if block_given?
    end
  end

  # Plain settings object; every option is a read/write attribute that is
  # given its default value in #initialize.
  class Configuration
    attr_accessor :redirect_limit, :blacklist_urls, :user_agent,
                  :verify_ssl, :http_open_timeout, :http_read_timeout, :attributes,
                  :graders, :description_min_length, :positive_regex, :negative_regex,
                  :image_limit, :image_stats, :raise_on_invalid_format, :max_concurrency,
                  :scrapers, :http_override_headers, :encoding

    # `http_timeout` is another name for the open timeout accessor pair.
    alias_method :http_timeout, :http_open_timeout
    alias_method :http_timeout=, :http_open_timeout=

    # Create a new instance populated with the default settings.
    #
    # @return [LinkThumbnailer::Configuration]
    def initialize
      # HTTP behaviour.
      @redirect_limit        = 3
      @user_agent            = 'link_thumbnailer'
      @verify_ssl            = true
      @http_open_timeout     = 5
      @http_read_timeout     = 5
      @http_override_headers = { 'Accept-Encoding' => 'none' }
      @encoding              = 'utf-8'

      # What to fetch, and with which scraping strategies.
      @attributes = %i[title images description videos favicon]
      @scrapers   = %i[opengraph default]
      @raise_on_invalid_format = false

      # Image handling.
      @blacklist_urls = [
        %r{^http://ad\.doubleclick\.net/},
        %r{^http://b\.scorecardresearch\.com/},
        %r{^http://pixel\.quantserve\.com/},
        %r{^http://s7\.addthis\.com/}
      ]
      @image_limit     = 5
      @image_stats     = true
      @max_concurrency = 20

      # Description rating. Each entry builds one grader for a description.
      # NOTE: the Position option key is spelled `weigth` (sic); that is the
      # key Graders::Base#weight looks up.
      @graders = [
        ->(description) { ::LinkThumbnailer::Graders::Length.new(description) },
        ->(description) { ::LinkThumbnailer::Graders::HtmlAttribute.new(description, :class) },
        ->(description) { ::LinkThumbnailer::Graders::HtmlAttribute.new(description, :id) },
        ->(description) { ::LinkThumbnailer::Graders::Position.new(description, weigth: 3) },
        ->(description) { ::LinkThumbnailer::Graders::LinkDensity.new(description) }
      ]
      @description_min_length = 50
      @positive_regex = /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i
      @negative_regex = /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget|modal/i
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
module LinkThumbnailer
  # Common ancestor of every error class declared by the gem; rescuing it
  # catches any of the classes below.
  class Exceptions < StandardError; end

  class RedirectLimit < Exceptions; end

  class BadUriFormat < Exceptions; end

  class FormatNotSupported < Exceptions; end

  class ScraperInvalid < Exceptions; end

  class HTTPError < Exceptions; end

  # Namespaced under LinkThumbnailer, so it is a different class from Ruby's
  # top-level ::SyntaxError.
  class SyntaxError < Exceptions; end
end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'delegate'
4
+ require 'link_thumbnailer/graders/base'
5
+ require 'link_thumbnailer/graders/length'
6
+ require 'link_thumbnailer/graders/html_attribute'
7
+ require 'link_thumbnailer/graders/link_density'
8
+ require 'link_thumbnailer/graders/position'
9
+
10
module LinkThumbnailer
  # Combines the scores of every configured grader into one probability for a
  # description. Delegates unknown messages to the current page's config.
  class Grader < ::SimpleDelegator
    attr_reader :config, :description

    # @param description the description object handed to each grader builder
    def initialize(description)
      @description = description
      @config      = ::LinkThumbnailer.page.config

      super(@config)
    end

    # For the given description, multiplies together the probabilities
    # returned by each grader, each raised to the power of that grader's weight.
    #
    # @return [Float] the probability for the given description to be
    #   considered good (1.0 when no graders are configured)
    def call
      graders.inject(1.0) do |probability, builder|
        grader = builder.call(description)
        probability * grader.call.to_f**grader.weight
      end
    end

    private

    # The grader builders (callables taking a description) from the config.
    def graders
      config.graders
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'delegate'
4
+
5
module LinkThumbnailer
  module Graders
    # Common behaviour for description graders. Subclasses implement #call.
    # Unknown messages are delegated to the current page's config.
    class Base < ::SimpleDelegator
      attr_reader :config, :description, :options

      # @param description the description being graded; #node and #text are
      #   read from it by the private helpers below
      # @param options [Hash] grader options; `:weight` is read by #weight
      def initialize(description, options = {})
        @config      = ::LinkThumbnailer.page.config
        @description = description
        @options     = options

        super(config)
      end

      # Computes the grader's score. Must be overridden by subclasses.
      #
      # @raise [NotImplementedError] always, in this base class
      def call
        raise NotImplementedError
      end

      # Exponent applied to this grader's score by LinkThumbnailer::Grader#call.
      #
      # Reads the `:weight` option. The historical misspelling `:weigth` is
      # still honoured as a fallback so existing configurations keep working;
      # when both are given, `:weight` wins.
      #
      # @return the configured weight, or 1 when none is given
      def weight
        options.fetch(:weight) { options.fetch(:weigth, 1) }
      end

      private

      def node
        description.node
      end

      def text
        description.text
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
module LinkThumbnailer
  module Graders
    # Grades a description from the value of one HTML attribute of its node,
    # matched against the configured positive and negative regexes.
    class HtmlAttribute < ::LinkThumbnailer::Graders::Base
      attr_reader :attribute_name

      # @param description the description being graded
      # @param attribute_name [#to_sym] name of the node attribute to inspect
      # @param options [Hash] grader options forwarded to the base class, so a
      #   weight can be set for this grader like for any other
      def initialize(description, attribute_name, options = {})
        super(description, options)
        @attribute_name = attribute_name.to_sym
      end

      # @return [Float] 0.0 when the attribute matches the negative regex
      #   without matching the positive one; 1.0 in every other case
      #   (including a missing or empty attribute)
      def call
        # A positive match is checked first and therefore wins over a negative one.
        return 1.0 if positive?

        negative? ? 0.0 : 1.0
      end

      private

      def attribute
        node[attribute_name]
      end

      def attribute?
        attribute && !attribute.empty?
      end

      def negative?
        attribute? && attribute =~ negative_regex
      end

      def positive?
        attribute? && attribute =~ positive_regex
      end

      def negative_regex
        config.negative_regex
      end

      def positive_regex
        config.positive_regex
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module LinkThumbnailer
  module Graders
    # Grades a description by how close its text length is to an ideal length.
    class Length < ::LinkThumbnailer::Graders::Base
      # @return [Float] 0.0 when the text is shorter than
      #   config.description_min_length; otherwise the curve value at the text
      #   length divided by the curve value at the ideal length (so a text of
      #   exactly the ideal length scores 1.0)
      def call
        if too_short?
          0.0
        else
          y / get_gaussian_value_for(ideal_description_length)
        end
      end

      private

      # Bell-shaped curve that peaks where x equals the ideal description length.
      def get_gaussian_value_for(x)
        distance = x - ideal_description_length
        scale    = Math.sqrt(2.0 * Math::PI ** 2)

        scale * Math.exp(-(distance ** 2) / 2.0 * 0.005 ** 2)
      end

      # Length of the description text.
      def x
        text.length
      end

      # Curve value at the description's text length.
      def y
        get_gaussian_value_for(x)
      end

      # Taken from the :ideal_description_length option; 120 when not given.
      def ideal_description_length
        options.fetch(:ideal_description_length, 120).to_f
      end

      def too_short?
        config.description_min_length > text.length
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
module LinkThumbnailer
  module Graders
    # Grades a description by the number of non-empty links in its node
    # relative to the length of its text.
    class LinkDensity < ::LinkThumbnailer::Graders::Base
      # @return [Float] 0.0 for an empty text; otherwise 1.0 minus the number
      #   of non-empty links divided by the text length
      def call
        text_length = text.length
        return 0.0 if text_length.zero?

        link_count = links.count
        1.0 - (link_count.to_f / text_length.to_f)
      end

      private

      # Texts of the `a` elements found under the node, without nil or empty ones.
      def links
        anchor_texts = node.css('a').map(&:text)
        anchor_texts.reject { |anchor_text| anchor_text.nil? || anchor_text.empty? }
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
module LinkThumbnailer
  module Graders
    # Grades a description by its position among the candidates: the smaller
    # the position relative to the number of candidates, the higher the score.
    class Position < ::LinkThumbnailer::Graders::Base
      # @return [Float] 1.0 minus description.position divided by
      #   description.candidates_number
      def call
        rank  = description.position.to_f
        total = description.candidates_number.to_f

        1.0 - (rank / total)
      end
    end
  end
end