link_thumbnailer 3.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +5 -0
  6. data/CHANGELOG.md +334 -0
  7. data/Gemfile +12 -0
  8. data/LICENSE.txt +22 -0
  9. data/README.md +210 -0
  10. data/Rakefile +9 -0
  11. data/lib/generators/link_thumbnailer/install_generator.rb +17 -0
  12. data/lib/generators/templates/initializer.rb +89 -0
  13. data/lib/link_thumbnailer.rb +38 -0
  14. data/lib/link_thumbnailer/configuration.rb +72 -0
  15. data/lib/link_thumbnailer/exceptions.rb +11 -0
  16. data/lib/link_thumbnailer/grader.rb +43 -0
  17. data/lib/link_thumbnailer/graders/base.rb +39 -0
  18. data/lib/link_thumbnailer/graders/html_attribute.rb +48 -0
  19. data/lib/link_thumbnailer/graders/length.rb +37 -0
  20. data/lib/link_thumbnailer/graders/link_density.rb +20 -0
  21. data/lib/link_thumbnailer/graders/position.rb +13 -0
  22. data/lib/link_thumbnailer/image_comparator.rb +26 -0
  23. data/lib/link_thumbnailer/image_comparators/base.rb +19 -0
  24. data/lib/link_thumbnailer/image_comparators/size.rb +13 -0
  25. data/lib/link_thumbnailer/image_parser.rb +62 -0
  26. data/lib/link_thumbnailer/image_validator.rb +32 -0
  27. data/lib/link_thumbnailer/model.rb +20 -0
  28. data/lib/link_thumbnailer/models/description.rb +37 -0
  29. data/lib/link_thumbnailer/models/favicon.rb +27 -0
  30. data/lib/link_thumbnailer/models/image.rb +56 -0
  31. data/lib/link_thumbnailer/models/title.rb +22 -0
  32. data/lib/link_thumbnailer/models/video.rb +44 -0
  33. data/lib/link_thumbnailer/models/website.rb +54 -0
  34. data/lib/link_thumbnailer/page.rb +43 -0
  35. data/lib/link_thumbnailer/parser.rb +15 -0
  36. data/lib/link_thumbnailer/processor.rb +128 -0
  37. data/lib/link_thumbnailer/railtie.rb +6 -0
  38. data/lib/link_thumbnailer/response.rb +39 -0
  39. data/lib/link_thumbnailer/scraper.rb +62 -0
  40. data/lib/link_thumbnailer/scrapers/base.rb +69 -0
  41. data/lib/link_thumbnailer/scrapers/default/base.rb +12 -0
  42. data/lib/link_thumbnailer/scrapers/default/description.rb +49 -0
  43. data/lib/link_thumbnailer/scrapers/default/favicon.rb +38 -0
  44. data/lib/link_thumbnailer/scrapers/default/images.rb +78 -0
  45. data/lib/link_thumbnailer/scrapers/default/title.rb +27 -0
  46. data/lib/link_thumbnailer/scrapers/default/videos.rb +18 -0
  47. data/lib/link_thumbnailer/scrapers/opengraph/base.rb +45 -0
  48. data/lib/link_thumbnailer/scrapers/opengraph/description.rb +12 -0
  49. data/lib/link_thumbnailer/scrapers/opengraph/favicon.rb +17 -0
  50. data/lib/link_thumbnailer/scrapers/opengraph/image.rb +107 -0
  51. data/lib/link_thumbnailer/scrapers/opengraph/images.rb +18 -0
  52. data/lib/link_thumbnailer/scrapers/opengraph/title.rb +12 -0
  53. data/lib/link_thumbnailer/scrapers/opengraph/video.rb +115 -0
  54. data/lib/link_thumbnailer/scrapers/opengraph/videos.rb +18 -0
  55. data/lib/link_thumbnailer/uri.rb +20 -0
  56. data/lib/link_thumbnailer/version.rb +5 -0
  57. data/lib/link_thumbnailer/video_parser.rb +47 -0
  58. data/link_thumbnailer.gemspec +29 -0
  59. data/spec/configuration_spec.rb +61 -0
  60. data/spec/fixture_spec.rb +114 -0
  61. data/spec/fixtures/bar.png +2907 -0
  62. data/spec/fixtures/default_from_body.html +13 -0
  63. data/spec/fixtures/default_from_meta.html +12 -0
  64. data/spec/fixtures/foo.png +0 -0
  65. data/spec/fixtures/google_shift_jis.html +6 -0
  66. data/spec/fixtures/google_utf8.html +6 -0
  67. data/spec/fixtures/og_not_valid_example.html +12 -0
  68. data/spec/fixtures/og_valid_example.html +18 -0
  69. data/spec/fixtures/og_valid_multi_image_example.html +13 -0
  70. data/spec/fixtures/og_valid_multi_video_example.html +13 -0
  71. data/spec/grader_spec.rb +27 -0
  72. data/spec/graders/base_spec.rb +14 -0
  73. data/spec/graders/html_attribute_spec.rb +50 -0
  74. data/spec/graders/length_spec.rb +93 -0
  75. data/spec/graders/link_density_spec.rb +52 -0
  76. data/spec/graders/position_spec.rb +49 -0
  77. data/spec/image_comparators/size_spec.rb +58 -0
  78. data/spec/image_validator_spec.rb +37 -0
  79. data/spec/model_spec.rb +27 -0
  80. data/spec/models/description_spec.rb +66 -0
  81. data/spec/models/favicon_spec.rb +12 -0
  82. data/spec/models/image_spec.rb +95 -0
  83. data/spec/models/title_spec.rb +26 -0
  84. data/spec/models/video_spec.rb +49 -0
  85. data/spec/models/website_spec.rb +51 -0
  86. data/spec/page_spec.rb +28 -0
  87. data/spec/processor_spec.rb +410 -0
  88. data/spec/response_spec.rb +62 -0
  89. data/spec/scraper_spec.rb +70 -0
  90. data/spec/scrapers/base_spec.rb +69 -0
  91. data/spec/scrapers/opengraph/base_spec.rb +96 -0
  92. data/spec/spec_helper.rb +11 -0
  93. data/spec/uri_spec.rb +44 -0
  94. data/spec/video_parser_spec.rb +148 -0
  95. metadata +271 -0
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
+
6
# Register the RSpec runner under the `spec` task name.
RSpec::Core::RakeTask.new('spec')

# `rake` (the default task) and `rake test` both depend on `spec`.
%i[default test].each do |alias_name|
  task alias_name => :spec
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module LinkThumbnailer
  module Generators
    # Generator that writes the LinkThumbnailer initializer into an application.
    class InstallGenerator < ::Rails::Generators::Base
      # Templates live in the `templates` directory that sits next to this
      # file's own directory.
      source_root File.expand_path('../templates', __dir__)

      desc 'Creates a LinkThumbnailer initializer for your application.'

      # Renders the `initializer.rb` template to
      # `config/initializers/link_thumbnailer.rb`.
      def copy_initializer
        template('initializer.rb', 'config/initializers/link_thumbnailer.rb')
      end
    end
  end
end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
# Use this hook to configure LinkThumbnailer behaviors.
# Every setting below is commented out and shows the gem's default value.
LinkThumbnailer.configure do |config|
  # Number of redirects to follow before raising an exception when trying to parse the given url.
  #
  # config.redirect_limit = 3

  # Set user agent
  #
  # config.user_agent = 'link_thumbnailer'

  # Enable or disable SSL verification
  #
  # config.verify_ssl = true

  # The amount of time in seconds to wait for a connection to be opened.
  # If the HTTP object cannot open a connection in this many seconds,
  # it raises a Net::OpenTimeout exception.
  #
  # See http://www.ruby-doc.org/stdlib-2.1.1/libdoc/net/http/rdoc/Net/HTTP.html#open_timeout
  #
  # config.http_open_timeout = 5

  # List of blacklisted urls you want to skip when searching for images.
  #
  # config.blacklist_urls = [
  #   %r{^http://ad\.doubleclick\.net/},
  #   %r{^http://b\.scorecardresearch\.com/},
  #   %r{^http://pixel\.quantserve\.com/},
  #   %r{^http://s7\.addthis\.com/}
  # ]

  # List of attributes you want LinkThumbnailer to fetch on a website.
  #
  # config.attributes = [:title, :images, :description, :videos, :favicon]

  # List of procedures used to rate the website description. Add your custom classes
  # here. See wiki for more details on how to build your own graders.
  #
  # NOTE: the weight option key is spelled `weigth`; that is the key the gem's
  # default configuration passes and that Graders::Base#weight reads.
  #
  # config.graders = [
  #   ->(description) { ::LinkThumbnailer::Graders::Length.new(description) },
  #   ->(description) { ::LinkThumbnailer::Graders::HtmlAttribute.new(description, :class) },
  #   ->(description) { ::LinkThumbnailer::Graders::HtmlAttribute.new(description, :id) },
  #   ->(description) { ::LinkThumbnailer::Graders::Position.new(description, weigth: 3) },
  #   ->(description) { ::LinkThumbnailer::Graders::LinkDensity.new(description) }
  # ]

  # Minimum description length for a website.
  #
  # config.description_min_length = 50

  # Regex of words considered positive to rate website description.
  #
  # config.positive_regex = /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i

  # Regex of words considered negative to rate website description.
  #
  # config.negative_regex = /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget|modal/i

  # Number of images to fetch. Fetching too many images will be slow.
  # Note that LinkThumbnailer will only sort the fetched images among themselves,
  # meaning that there could be a "better" image on the page.
  #
  # config.image_limit = 5

  # Whether you want LinkThumbnailer to return image size and type or not.
  # Setting this value to false will increase performance since, for each image, LinkThumbnailer
  # does not have to fetch its size and type.
  #
  # config.image_stats = true

  # Whether you want LinkThumbnailer to raise an exception if the Content-Type of the HTTP request
  # is not html or xml.
  #
  # config.raise_on_invalid_format = false

  # Sets the number of concurrent http connections that can be opened to fetch image information such as size and type.
  #
  # config.max_concurrency = 20

  # Defines the strategies used to scrape the website. See the [Open Graph Protocol](http://ogp.me/) for more information.
  #
  # config.scrapers = [:opengraph, :default]

  # Sets the default encoding.
  #
  # config.encoding = 'utf-8'
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'link_thumbnailer/version'
5
+ require 'link_thumbnailer/configuration'
6
+ require 'link_thumbnailer/exceptions'
7
+ require 'link_thumbnailer/page'
8
+
9
module LinkThumbnailer
  class << self
    # @return [LinkThumbnailer::Page, nil] the page built by the most recent
    #   call to {generate}; nil until {generate} has been called.
    def page
      @page
    end

    # Builds a page for +url+, remembers it as {page} and generates it.
    #
    # @param url the url handed to LinkThumbnailer::Page.new
    # @param options [Hash] handed untouched to LinkThumbnailer::Page.new
    # @return whatever Page#generate returns
    def generate(url, options = {})
      @page = ::LinkThumbnailer::Page.new(url, options)
      @page.generate
    end
  end
end
24
+
25
# Try to load Rails. A missing Rails is tolerated here, so the LoadError is
# swallowed on purpose and reported through the warning below instead.
begin
  require 'rails'
rescue LoadError
  # Intentionally empty: handled by the `defined?(Rails)` check that follows.
end

# Print a hint on stderr when the Rails constant is not defined.
unless defined?(Rails)
  $stderr.puts <<-EOC
warning: no framework detected.

Your Gemfile might not be configured properly.
---- e.g. ----
Rails:
gem 'link_thumbnailer'

EOC
end
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
module LinkThumbnailer
  # Settings object of the gem. A single shared instance is exposed through
  # LinkThumbnailer.config and populated with defaults on creation.
  class Configuration
    attr_accessor :redirect_limit,
                  :blacklist_urls,
                  :user_agent,
                  :verify_ssl,
                  :http_open_timeout,
                  :http_read_timeout,
                  :attributes,
                  :graders,
                  :description_min_length,
                  :positive_regex,
                  :negative_regex,
                  :image_limit,
                  :image_stats,
                  :raise_on_invalid_format,
                  :max_concurrency,
                  :scrapers,
                  :http_override_headers,
                  :encoding

    # `http_timeout` is another name for the open timeout (reader and writer).
    alias http_timeout http_open_timeout
    alias http_timeout= http_open_timeout=

    # Create a new instance holding the default value of every setting.
    #
    # @return [LinkThumbnailer::Configuration]
    def initialize
      # HTTP behaviour
      @redirect_limit        = 3
      @user_agent            = 'link_thumbnailer'
      @verify_ssl            = true
      @http_open_timeout     = 5
      @http_read_timeout     = 5
      @http_override_headers = { 'Accept-Encoding' => 'none' }
      @raise_on_invalid_format = false
      @max_concurrency       = 20
      @encoding              = 'utf-8'

      # What to extract and how
      @attributes     = %i[title images description videos favicon]
      @scrapers       = %i[opengraph default]
      @blacklist_urls = default_blacklist_urls
      @image_limit    = 5
      @image_stats    = true

      # Description rating
      @graders                = default_graders
      @description_min_length = 50
      @positive_regex = /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i
      @negative_regex = /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget|modal/i
    end

    private

    # Url patterns skipped by default.
    def default_blacklist_urls
      [
        %r{^http://ad\.doubleclick\.net/},
        %r{^http://b\.scorecardresearch\.com/},
        %r{^http://pixel\.quantserve\.com/},
        %r{^http://s7\.addthis\.com/}
      ]
    end

    # Default grader factories; each lambda receives a description and builds
    # one grader for it. The `weigth` spelling is intentional: it is the key
    # Graders::Base#weight looks up.
    def default_graders
      [
        ->(description) { ::LinkThumbnailer::Graders::Length.new(description) },
        ->(description) { ::LinkThumbnailer::Graders::HtmlAttribute.new(description, :class) },
        ->(description) { ::LinkThumbnailer::Graders::HtmlAttribute.new(description, :id) },
        ->(description) { ::LinkThumbnailer::Graders::Position.new(description, weigth: 3) },
        ->(description) { ::LinkThumbnailer::Graders::LinkDensity.new(description) }
      ]
    end
  end

  class << self
    # Access point for the gem configuration.
    #
    # @return [LinkThumbnailer::Configuration] the shared, lazily created instance.
    def config
      @config ||= Configuration.new
    end

    # Configure hook used in the gem initializer. Convenient way to set all
    # the gem settings in one place; does nothing when called without a block.
    #
    # @example inside config/initializers/link_thumbnailer.rb
    #   LinkThumbnailer.configure do |config|
    #     config.user_agent = 'link_thumbnailer'
    #   end
    #
    # @return [void]
    def configure
      return unless block_given?

      yield config
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
module LinkThumbnailer
  # Root of the gem's error hierarchy; rescue this to catch any error below.
  class Exceptions < StandardError; end

  # Specific errors, each a direct subclass of Exceptions.
  class RedirectLimit < Exceptions; end
  class BadUriFormat < Exceptions; end
  class FormatNotSupported < Exceptions; end
  class ScraperInvalid < Exceptions; end
  class HTTPError < Exceptions; end

  # Namespaced as LinkThumbnailer::SyntaxError; it does not inherit from
  # Ruby's top-level ::SyntaxError.
  class SyntaxError < Exceptions; end
end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'delegate'
4
+ require 'link_thumbnailer/graders/base'
5
+ require 'link_thumbnailer/graders/length'
6
+ require 'link_thumbnailer/graders/html_attribute'
7
+ require 'link_thumbnailer/graders/link_density'
8
+ require 'link_thumbnailer/graders/position'
9
+
10
module LinkThumbnailer
  # Combines the scores of all configured graders for one description.
  # Delegates unknown calls to the current page's configuration.
  class Grader < ::SimpleDelegator
    attr_reader :config, :description

    # @param description the description object handed to every grader factory
    def initialize(description)
      @description = description
      @config = ::LinkThumbnailer.page.config

      super(@config)
    end

    # For the given description, multiplies together the probabilities
    # returned by each grader, each raised to the power of that grader's weight.
    #
    # @return [Float] the probability for the given description to be
    #   considered good (1.0 when no grader is configured)
    def call
      graders.inject(1.0) do |probability, factory|
        grader = factory.call(description)
        probability * grader.call.to_f ** grader.weight
      end
    end

    private

    # Grader factories taken from the configuration.
    def graders
      config.graders
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'delegate'
4
+
5
module LinkThumbnailer
  module Graders
    # Common behaviour of description graders: keeps the description and the
    # options, delegates unknown calls to the current page's configuration
    # and exposes the grader's weight. Subclasses implement {#call}.
    class Base < ::SimpleDelegator

      attr_reader :config, :description, :options

      # @param description the description being graded; must respond to
      #   +node+ and +text+ for the private helpers below to work
      # @param options [Hash] grader options, e.g. +weight: 3+
      def initialize(description, options = {})
        @config      = ::LinkThumbnailer.page.config
        @description = description
        @options     = options

        super(config)
      end

      # Computes the grade of the description.
      #
      # @raise [NotImplementedError] always; subclasses must override it
      def call
        raise NotImplementedError
      end

      # Weight of this grader.
      #
      # Reads the +:weight+ option. The historical misspelling +:weigth+ is
      # still honoured as a fallback so existing configurations keep working;
      # when both keys are given, +:weight+ wins.
      #
      # @return the configured weight, or 1 when none is given
      def weight
        options.fetch(:weight) { options.fetch(:weigth, 1) }
      end

      private

      def node
        description.node
      end

      def text
        description.text
      end

    end
  end
end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
module LinkThumbnailer
  module Graders
    # Grades a description from one HTML attribute of its node (for example
    # +class+ or +id+), using the configured positive and negative regexes.
    class HtmlAttribute < ::LinkThumbnailer::Graders::Base

      attr_reader :attribute_name

      # @param description the description being graded
      # @param attribute_name [#to_sym] name of the node attribute to inspect
      # @param options [Hash] optional grader options forwarded to the base
      #   grader (which reads the grader weight from them)
      def initialize(description, attribute_name, options = {})
        super(description, options)
        @attribute_name = attribute_name.to_sym
      end

      # @return [Float] 0.0 when the attribute matches the negative regex
      #   without matching the positive one; 1.0 in every other case
      #   (positive match, no match at all, or attribute missing/empty)
      def call
        # A positive match takes precedence over a negative one.
        return 1.0 if positive?
        return 0.0 if negative?

        1.0
      end

      private

      # Value of the inspected attribute on the description's node.
      def attribute
        node[attribute_name]
      end

      def attribute?
        attribute && !attribute.empty?
      end

      def negative?
        attribute? && attribute =~ negative_regex
      end

      def positive?
        attribute? && attribute =~ positive_regex
      end

      def negative_regex
        config.negative_regex
      end

      def positive_regex
        config.positive_regex
      end

    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module LinkThumbnailer
  module Graders
    # Grades a description by how close its text length is to an ideal length
    # (option +:ideal_description_length+, 120 by default).
    class Length < ::LinkThumbnailer::Graders::Base

      # @return [Float] 0.0 when the text is shorter than the configured
      #   +description_min_length+; otherwise the bell-curve value at the text
      #   length divided by its value at the ideal length (so 1.0 at the ideal)
      def call
        return 0.0 if too_short?

        get_gaussian_value_for(text.length) / get_gaussian_value_for(ideal_description_length)
      end

      private

      # Bell curve centred on the ideal length. The exponent evaluates to
      # -(offset^2 / 2) * 0.005^2, i.e. a Gaussian whose sigma is 1 / 0.005 = 200.
      # The constant factor in front cancels out in the ratio taken by #call.
      def get_gaussian_value_for(length)
        factor = Math.sqrt(2.0 * Math::PI ** 2)
        offset = length - ideal_description_length

        factor * Math.exp(-(offset ** 2) / 2.0 * 0.005 ** 2)
      end

      def ideal_description_length
        options.fetch(:ideal_description_length, 120).to_f
      end

      def too_short?
        text.length < config.description_min_length
      end

    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
module LinkThumbnailer
  module Graders
    # Grades a description by the number of non-empty links in its node
    # relative to the length of its text.
    class LinkDensity < ::LinkThumbnailer::Graders::Base

      # @return [Float] 0.0 for an empty text, otherwise
      #   1.0 - (number of non-empty links / text length)
      def call
        text_length = text.length
        return 0.0 if text_length == 0

        1.0 - (links.count.to_f / text_length.to_f)
      end

      private

      # Texts of the node's `a` elements, without nil or empty entries.
      def links
        anchor_texts = node.css('a').map(&:text)
        anchor_texts.compact.reject(&:empty?)
      end

    end
  end
end
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
module LinkThumbnailer
  module Graders
    # Grades a description by its position among the candidates.
    class Position < ::LinkThumbnailer::Graders::Base

      # @return [Float] 1.0 - (position / candidates_number), so a position
      #   of 0 yields 1.0
      def call
        rank  = description.position.to_f
        total = description.candidates_number.to_f

        1.0 - (rank / total)
      end

    end
  end
end