link_thumbnailer 1.1.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. checksums.yaml +5 -13
  2. data/.travis.yml +1 -1
  3. data/CHANGELOG.md +117 -104
  4. data/Gemfile +1 -1
  5. data/{LICENSE → LICENSE.txt} +21 -21
  6. data/README.md +153 -184
  7. data/lib/generators/link_thumbnailer/install_generator.rb +0 -4
  8. data/lib/generators/templates/initializer.rb +63 -41
  9. data/lib/link_thumbnailer/configuration.rb +52 -10
  10. data/lib/link_thumbnailer/exceptions.rb +6 -0
  11. data/lib/link_thumbnailer/grader.rb +37 -0
  12. data/lib/link_thumbnailer/graders/base.rb +32 -0
  13. data/lib/link_thumbnailer/graders/html_attribute.rb +49 -0
  14. data/lib/link_thumbnailer/graders/length.rb +19 -0
  15. data/lib/link_thumbnailer/graders/link_density.rb +21 -0
  16. data/lib/link_thumbnailer/graders/position.rb +11 -0
  17. data/lib/link_thumbnailer/image_comparator.rb +24 -0
  18. data/lib/link_thumbnailer/image_comparators/base.rb +17 -0
  19. data/lib/link_thumbnailer/image_comparators/size.rb +11 -0
  20. data/lib/link_thumbnailer/image_parser.rb +18 -0
  21. data/lib/link_thumbnailer/image_parsers/size.rb +15 -0
  22. data/lib/link_thumbnailer/image_parsers/type.rb +15 -0
  23. data/lib/link_thumbnailer/image_validator.rb +30 -0
  24. data/lib/link_thumbnailer/model.rb +16 -0
  25. data/lib/link_thumbnailer/models/description.rb +34 -0
  26. data/lib/link_thumbnailer/models/image.rb +54 -0
  27. data/lib/link_thumbnailer/models/title.rb +20 -0
  28. data/lib/link_thumbnailer/models/website.rb +39 -0
  29. data/lib/link_thumbnailer/page.rb +40 -0
  30. data/lib/link_thumbnailer/parser.rb +13 -0
  31. data/lib/link_thumbnailer/processor.rb +94 -0
  32. data/lib/link_thumbnailer/railtie.rb +9 -9
  33. data/lib/link_thumbnailer/scraper.rb +64 -0
  34. data/lib/link_thumbnailer/scrapers/base.rb +63 -0
  35. data/lib/link_thumbnailer/scrapers/default/base.rb +10 -0
  36. data/lib/link_thumbnailer/scrapers/default/description.rb +47 -0
  37. data/lib/link_thumbnailer/scrapers/default/images.rb +64 -0
  38. data/lib/link_thumbnailer/scrapers/default/title.rb +25 -0
  39. data/lib/link_thumbnailer/scrapers/opengraph/base.rb +43 -0
  40. data/lib/link_thumbnailer/scrapers/opengraph/description.rb +10 -0
  41. data/lib/link_thumbnailer/scrapers/opengraph/image.rb +30 -0
  42. data/lib/link_thumbnailer/scrapers/opengraph/images.rb +16 -0
  43. data/lib/link_thumbnailer/scrapers/opengraph/title.rb +10 -0
  44. data/lib/link_thumbnailer/version.rb +3 -3
  45. data/lib/link_thumbnailer.rb +36 -119
  46. data/link_thumbnailer.gemspec +26 -28
  47. data/spec/configuration_spec.rb +51 -0
  48. data/spec/examples/empty_og_image_example.html +9 -0
  49. data/spec/fixture_spec.rb +88 -0
  50. data/spec/fixtures/bar.png +2907 -0
  51. data/spec/fixtures/default_from_body.html +12 -0
  52. data/spec/fixtures/default_from_meta.html +11 -0
  53. data/spec/{examples → fixtures}/example.html +53 -53
  54. data/spec/fixtures/foo.png +0 -0
  55. data/spec/fixtures/og_not_valid_example.html +12 -0
  56. data/spec/fixtures/og_valid_example.html +12 -0
  57. data/spec/fixtures/og_valid_multi_image_example.html +13 -0
  58. data/spec/grader_spec.rb +24 -0
  59. data/spec/graders/base_spec.rb +12 -0
  60. data/spec/graders/html_attribute_spec.rb +48 -0
  61. data/spec/graders/length_spec.rb +81 -0
  62. data/spec/graders/link_density_spec.rb +22 -0
  63. data/spec/image_comparators/size_spec.rb +39 -0
  64. data/spec/image_parsers/size_spec.rb +34 -0
  65. data/spec/image_parsers/type_spec.rb +34 -0
  66. data/spec/image_validator_spec.rb +35 -0
  67. data/spec/model_spec.rb +17 -0
  68. data/spec/models/description_spec.rb +64 -0
  69. data/spec/models/image_spec.rb +71 -0
  70. data/spec/models/title_spec.rb +24 -0
  71. data/spec/models/website_spec.rb +49 -0
  72. data/spec/page_spec.rb +26 -0
  73. data/spec/processor_spec.rb +349 -0
  74. data/spec/scraper_spec.rb +95 -0
  75. data/spec/scrapers/base_spec.rb +67 -0
  76. data/spec/scrapers/opengraph/base_spec.rb +94 -0
  77. data/spec/spec_helper.rb +15 -13
  78. metadata +126 -120
  79. data/app/controllers/link_thumbnailer/application_controller.rb +0 -4
  80. data/app/controllers/link_thumbnailer/previews_controller.rb +0 -11
  81. data/lib/link_thumbnailer/doc.rb +0 -65
  82. data/lib/link_thumbnailer/doc_parser.rb +0 -15
  83. data/lib/link_thumbnailer/engine.rb +0 -4
  84. data/lib/link_thumbnailer/fetcher.rb +0 -34
  85. data/lib/link_thumbnailer/img_comparator.rb +0 -17
  86. data/lib/link_thumbnailer/img_parser.rb +0 -41
  87. data/lib/link_thumbnailer/img_url_filter.rb +0 -13
  88. data/lib/link_thumbnailer/object.rb +0 -41
  89. data/lib/link_thumbnailer/opengraph.rb +0 -20
  90. data/lib/link_thumbnailer/rails/routes/mapper.rb +0 -30
  91. data/lib/link_thumbnailer/rails/routes/mapping.rb +0 -33
  92. data/lib/link_thumbnailer/rails/routes.rb +0 -47
  93. data/lib/link_thumbnailer/web_image.rb +0 -19
  94. data/spec/doc_parser_spec.rb +0 -25
  95. data/spec/doc_spec.rb +0 -23
  96. data/spec/examples/empty_example.html +0 -11
  97. data/spec/examples/og_example.html +0 -12
  98. data/spec/fetcher_spec.rb +0 -97
  99. data/spec/img_comparator_spec.rb +0 -16
  100. data/spec/img_url_filter_spec.rb +0 -31
  101. data/spec/link_thumbnailer_spec.rb +0 -205
  102. data/spec/object_spec.rb +0 -130
  103. data/spec/opengraph_spec.rb +0 -7
  104. data/spec/web_image_spec.rb +0 -57
@@ -0,0 +1,40 @@
1
+ require 'link_thumbnailer/processor'
2
+ require 'link_thumbnailer/scraper'
3
+
4
module LinkThumbnailer
  # Represents one page to preview: it owns a per-page copy of the global
  # configuration, downloads the page through a Processor and hands the
  # response body to a Scraper.
  class Page

    attr_reader :url, :options, :source

    # @param url the address of the page to preview (handed to the processor as-is)
    # @param options [Hash, nil] per-page configuration overrides; every key
    #   must name a public writer (+key=+) on the configuration object
    def initialize(url, options = {})
      @url     = url
      @options = options || {}

      set_options
    end

    # Downloads the page, stores the processor's result in +source+ and
    # returns whatever the scraper's +call+ returns.
    def generate
      @source = processor.call(url)
      scraper.call
    end

    # Page-local configuration: a +dup+ of the global configuration, created
    # on first use so overrides never touch the global object itself.
    def config
      @config ||= ::LinkThumbnailer.config.dup
    end

    private

    # Applies each override through the configuration's writers.
    # +public_send+ (instead of +send+) keeps an option key from invoking a
    # private method on the configuration object.
    def set_options
      options.each { |key, value| config.public_send("#{key}=", value) }
    end

    def processor
      @processor ||= ::LinkThumbnailer::Processor.new
    end

    # Built lazily because it needs +source+ and the processor's final URL,
    # both of which only exist once #generate has fetched the page.
    def scraper
      @scraper ||= ::LinkThumbnailer::Scraper.new(source, processor.url)
    end

  end
end
@@ -0,0 +1,13 @@
1
+ require 'nokogiri'
2
+
3
module LinkThumbnailer
  # Thin wrapper that turns raw HTML into a Nokogiri document.
  class Parser

    # Document produced by the most recent #call (nil before the first call)
    attr_reader :document

    # Parses +source+ as HTML and remembers the result in +document+.
    #
    # The document is rebuilt on every call: memoizing it with +||=+ would
    # make a reused parser silently ignore any later +source+.
    #
    # @param source raw HTML handed to Nokogiri::HTML
    # @return the parsed document
    def call(source)
      @document = ::Nokogiri::HTML(source)
    end

  end
end
@@ -0,0 +1,94 @@
1
+ require 'delegate'
2
+ require 'net/http/persistent'
3
+
4
module LinkThumbnailer
  # Fetches the body of a page through Net::HTTP::Persistent, following HTTP
  # redirections until the configured limit is exceeded. Messages the
  # processor does not define itself are delegated to the page configuration.
  class Processor < ::SimpleDelegator

    # +url+ is the URI of the most recent request (after redirections it is
    # the final location); the writer is private, see #url= below.
    attr_reader :url, :config, :http, :redirect_count

    def initialize
      @config = ::LinkThumbnailer.page.config
      @http   = ::Net::HTTP::Persistent.new

      super(config)
    end

    # Requests +url+ and returns the response body.
    #
    # @param url [String, URI] address to fetch
    # @param redirect_count [Integer] number of redirections already followed
    # @return the body of the first successful response
    # @raise [LinkThumbnailer::RedirectLimit] when +redirect_count+ exceeds
    #   the configured redirect limit
    # @raise [LinkThumbnailer::BadUriFormat] when +url+ is not an HTTP(S) URI
    def call(url = '', redirect_count = 0)
      self.url        = url
      @redirect_count = redirect_count

      raise ::LinkThumbnailer::RedirectLimit if too_many_redirections?

      with_valid_url do
        set_http_headers
        set_http_options
        perform_request
      end
    end

    private

    def with_valid_url
      raise ::LinkThumbnailer::BadUriFormat unless valid_url_format?
      yield if block_given?
    end

    def set_http_headers
      http.headers['User-Agent'] = user_agent
    end

    # Certificate verification is switched off unless the configuration asks
    # for it; only the connection (open) timeout is configured here.
    def set_http_options
      http.verify_mode  = ::OpenSSL::SSL::VERIFY_NONE unless ssl_required?
      http.open_timeout = http_timeout
    end

    # Success returns the body, a redirection is followed, and any other
    # response is turned into an exception by Net::HTTPResponse#error!.
    def perform_request
      response = http.request(url)

      case response
      when ::Net::HTTPSuccess     then response.body
      when ::Net::HTTPRedirection then follow_redirection(response)
      else response.error!
      end
    end

    # Re-enters #call with the redirect target and an incremented counter.
    # A redirection without a Location header cannot be followed, so it is
    # reported through +error!+ like any other unusable response.
    def follow_redirection(response)
      location = response['location']
      return response.error! if location.nil? || location.empty?

      call(resolve_relative_url(location), redirect_count + 1)
    end

    # Resolves a Location header against the current URL.
    #
    # URI.join returns absolute locations untouched and resolves relative
    # ones (absolute-path, path-relative, protocol-relative) while keeping
    # the port of the current URL, which manual "scheme://host + path"
    # concatenation did not.
    def resolve_relative_url(location)
      ::URI.join(url.to_s, location.to_s)
    end

    def redirect_limit
      config.redirect_limit
    end

    def user_agent
      config.user_agent
    end

    def http_timeout
      config.http_timeout
    end

    def ssl_required?
      config.verify_ssl
    end

    def too_many_redirections?
      redirect_count > redirect_limit
    end

    # URI::HTTPS inherits from URI::HTTP, so both schemes are accepted
    def valid_url_format?
      url.is_a?(::URI::HTTP)
    end

    # Normalizes whatever was given (String or URI) into a URI object
    def url=(url)
      @url = URI(url)
    end

  end
end
@@ -1,9 +1,9 @@
1
- module LinkThumbnailer
2
- class Railtie < ::Rails::Railtie
3
-
4
- initializer 'link_thumbnailer.routes' do
5
- LinkThumbnailer::Rails::Routes.install!
6
- end
7
-
8
- end
9
- end
1
module LinkThumbnailer
  # Rails integration: registers the 'link_thumbnailer.routes' initializer,
  # which calls LinkThumbnailer::Rails::Routes.install! when it runs.
  #
  # NOTE(review): the file list of this release shows
  # lib/link_thumbnailer/rails/routes.rb (and routes/*) being removed -
  # confirm LinkThumbnailer::Rails::Routes is still defined somewhere.
  class Railtie < ::Rails::Railtie

    initializer('link_thumbnailer.routes') { LinkThumbnailer::Rails::Routes.install! }

  end
end
@@ -0,0 +1,64 @@
1
+ require 'delegate'
2
+ require 'active_support/core_ext/object/blank'
3
+ require 'active_support/inflector'
4
+
5
+ require 'link_thumbnailer/parser'
6
+ require 'link_thumbnailer/models/website'
7
+ require 'link_thumbnailer/scrapers/default/title'
8
+ require 'link_thumbnailer/scrapers/opengraph/title'
9
+ require 'link_thumbnailer/scrapers/default/description'
10
+ require 'link_thumbnailer/scrapers/opengraph/description'
11
+ require 'link_thumbnailer/scrapers/default/images'
12
+ require 'link_thumbnailer/scrapers/opengraph/images'
13
+
14
module LinkThumbnailer
  # Fills a Website model by running, for every configured attribute, the
  # Opengraph scraper first and the Default scraper as a fallback. Messages
  # the scraper does not define itself are delegated to the page configuration.
  class Scraper < ::SimpleDelegator

    attr_reader :document, :source, :url, :config, :website

    # @param source raw HTML of the page
    # @param url address the page was fetched from (stored on the website)
    def initialize(source, url)
      @source      = source
      @url         = url
      @config      = ::LinkThumbnailer.page.config
      @document    = parser.call(source)
      @website     = ::LinkThumbnailer::Models::Website.new
      @website.url = url

      super(config)
    end

    # Scrapes every attribute listed in +config.attributes+.
    #
    # @return the populated website model
    def call
      config.attributes.each { |attribute| scrape_attribute(attribute) }
      website
    end

    private

    # Tries each scraper family in order and stops as soon as the website
    # holds a non-blank value for +attribute+.
    def scrape_attribute(attribute)
      scrapers.each do |namespace|
        candidate = scraper_class(namespace, attribute).new(document)
        candidate.call(website, attribute.to_s) if candidate.applicable?

        break unless website.send(attribute).blank?
      end
    end

    # Scraper namespaces, in priority order
    def scrapers
      %w[
        ::LinkThumbnailer::Scrapers::Opengraph
        ::LinkThumbnailer::Scrapers::Default
      ]
    end

    # Looks up "<prefix>::<CamelizedName>".
    #
    # @raise [LinkThumbnailer::ScraperInvalid] when no such constant exists
    def scraper_class(prefix, name)
      name = name.to_s.camelize
      [prefix, name].join('::').constantize
    rescue NameError
      raise ::LinkThumbnailer::ScraperInvalid, "scraper named '#{prefix}::#{name}' does not exists."
    end

    # A fresh parser per call
    def parser
      ::LinkThumbnailer::Parser.new
    end

  end
end
@@ -0,0 +1,63 @@
1
+ require 'delegate'
2
+ require 'link_thumbnailer/models/title'
3
+ require 'link_thumbnailer/models/description'
4
+ require 'link_thumbnailer/models/image'
5
+
6
module LinkThumbnailer
  module Scrapers
    # Shared behaviour of all attribute scrapers: subclasses provide a private
    # +value+, and #call writes it onto the website. Messages the scraper does
    # not define itself are delegated to the page configuration.
    class Base < ::SimpleDelegator

      attr_reader :config, :document, :website, :attribute_name

      # @param document parsed page; must answer +xpath+ for the meta helpers
      def initialize(document)
        @config   = ::LinkThumbnailer.page.config
        @document = document

        super(config)
      end

      # Assigns the scraped value to +website+ through its
      # "<attribute_name>=" writer.
      #
      # @return the same +website+ object
      def call(website, attribute_name)
        @website        = website
        @attribute_name = attribute_name

        website.tap { |site| site.send("#{attribute_name}=", value) }
      end

      # Whether this scraper should run at all; always true at this level
      def applicable?
        true
      end

      private

      # Must be provided by subclasses
      def value
        raise 'must implement'
      end

      # First node returned by #meta_xpaths (nil when there is none)
      def meta_xpath(options = {})
        meta_xpaths(options).first
      end

      # Finds <meta> nodes whose +:key+ attribute (default +property+),
      # lower-cased through XPath translate(), equals +:attribute+ (default:
      # the current attribute name) and which carry a +:value+ attribute
      # (default +content+).
      def meta_xpaths(options = {})
        key_attr      = options.fetch(:key, :property)
        content_attr  = options.fetch(:value, :content)
        wanted        = options.fetch(:attribute, attribute_name)
        lowercase_key = "translate(@#{key_attr},'#{abc.upcase}','#{abc}')"

        document.xpath("//meta[#{lowercase_key} = '#{wanted}' and @#{content_attr}]")
      end

      # Lower-case alphabet used for the translate() call above
      def abc
        'abcdefghijklmnopqrstuvwxyz'
      end

      # Model constant named after the attribute, e.g. 'title' -> Models::Title
      def model_class
        "::LinkThumbnailer::Models::#{attribute_name.to_s.camelize}".constantize
      end

      def modelize(node, text = nil)
        model_class.new(node, text)
      end

    end
  end
end
@@ -0,0 +1,10 @@
1
+ require 'link_thumbnailer/scrapers/base'
2
+
3
module LinkThumbnailer
  module Scrapers
    module Default
      # Common ancestor of the Default::* scrapers. It adds nothing to
      # Scrapers::Base; it only gives the family its own superclass.
      class Base < ::LinkThumbnailer::Scrapers::Base; end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ require 'link_thumbnailer/scrapers/default/base'
2
+
3
module LinkThumbnailer
  module Scrapers
    module Default
      # Scrapes a description from the named <meta> tag, falling back to the
      # highest-ranked <p>/<td> node of the document.
      class Description < ::LinkThumbnailer::Scrapers::Default::Base

        private

        # String form of the meta model, else of the best body model, else
        # nil. Each candidate is built only once; evaluating the helpers in
        # both the condition and the result would construct (and, for the
        # body, rank) every model twice.
        def value
          model = model_from_meta || model_from_body
          model.to_s if model
        end

        # Model built from the meta node and its +content+ attribute
        # (nil when the document has no such meta node)
        def model_from_meta
          node = node_from_meta
          modelize(node, node.attributes['content'].value) if node
        end

        # Greatest body model according to the model's own ordering
        # (nil when there are no candidates). The third model argument is the
        # node's 1-based position among the candidates.
        def model_from_body
          nodes_from_body.each_with_index.map { |node, i| modelize(node, node.text, i + 1) }.max
        end

        def node_from_meta
          @node_from_meta ||= meta_xpath(key: :name)
        end

        def nodes_from_body
          candidates.select { |node| valid_paragraph?(node) }
        end

        # Filtering hook; currently accepts every candidate
        def valid_paragraph?(node)
          true
        end

        def candidates
          document.css('p,td')
        end

        def modelize(node, text, i = 1)
          model_class.new(node, text, i)
        end

      end
    end
  end
end
@@ -0,0 +1,64 @@
1
+ require 'link_thumbnailer/scrapers/default/base'
2
+ require 'link_thumbnailer/models/image'
3
+
4
module LinkThumbnailer
  module Scrapers
    module Default
      # Collects the <img> sources of a document as absolute URIs and wraps
      # at most +config.image_limit+ of them in Image models.
      class Images < ::LinkThumbnailer::Scrapers::Default::Base

        private

        # @return [Array] image models for the first usable image URIs
        def value
          # A negative limit yields no images, like a limit of zero
          limit = [config.image_limit, 0].max
          abs_urls.first(limit).map { |uri| modelize(uri) }
        end

        # Non-blank +src+ attributes of every <img> node, in document order.
        # A blank +src+ would otherwise resolve to the page URL itself.
        def urls
          sources = document.search('//img').map { |img| img['src'] }.compact
          sources.reject { |src| src.strip.empty? }
        end

        # Absolute URI for every parsable source. Unparsable sources are
        # dropped here so they neither reach the Image model as nil nor use
        # up a slot of the image limit.
        def abs_urls
          urls.map { |url| absolute_uri_for(url) }.compact
        end

        # Parsed (and, if needed, prefixed) URI for +url+; nil when +url+
        # cannot be parsed
        def absolute_uri_for(url)
          uri = validate_url(url)
          return unless uri

          needs_prefix?(uri) ? prefix_uri(uri) : uri
        end

        def validate_url(url)
          URI(url)
        rescue URI::InvalidURIError
          nil
        end

        # Anything that is not already an HTTP(S) URI gets resolved against
        # the prefix URL
        def needs_prefix?(uri)
          !uri.is_a?(URI::HTTP)
        end

        def prefix_uri(uri)
          URI.join(prefix_url, uri)
        end

        # <base href> of the document when present, else the website URL
        def prefix_url
          base_href || website.url
        end

        def base_href
          base = document.at('//head/base')
          base['href'] if base
        end

        def model_class
          ::LinkThumbnailer::Models::Image
        end

        def modelize(uri)
          model_class.new(uri)
        end

      end
    end
  end
end
@@ -0,0 +1,25 @@
1
+ require 'link_thumbnailer/scrapers/default/base'
2
+
3
module LinkThumbnailer
  module Scrapers
    module Default
      # Scrapes the title by selecting the nodes that match the attribute
      # name used as a CSS selector.
      class Title < ::LinkThumbnailer::Scrapers::Default::Base

        private

        # String form of the model built from the matching nodes
        def value
          model.to_s
        end

        def model
          matched = node
          modelize(matched)
        end

        # Result of the CSS query for the attribute name
        def node
          selector = attribute_name
          document.css(selector)
        end

      end
    end
  end
end
@@ -0,0 +1,43 @@
1
+ require 'link_thumbnailer/scrapers/base'
2
+
3
module LinkThumbnailer
  module Scrapers
    module Opengraph
      # Shared behaviour of the Opengraph scrapers: reads the
      # "og:<attribute>" <meta> tag of the document.
      class Base < ::LinkThumbnailer::Scrapers::Base

        # True when at least one <meta> node has a +name+ or +property+
        # attribute starting with "og:"
        def applicable?
          meta.any? { |meta_node| opengraph_node?(meta_node) }
        end

        private

        # String form of the model (of nil when no node was found)
        def value
          model.to_s
        end

        # Model built from the node and its +content+ attribute; nil when the
        # document has no matching node
        def model
          found = node
          return unless found

          modelize(found, found.attributes['content'].to_s)
        end

        # Matching meta node, looked up by +property+ first and by +name+
        # second; memoized once found
        def node
          @node ||= begin
            by_property = meta_xpath(attribute: attribute)
            by_property || meta_xpath(attribute: attribute, key: :name)
          end
        end

        def attribute
          "og:#{attribute_name}"
        end

        def opengraph_node?(node)
          %w[name property].any? { |attr| node.attribute(attr).to_s.start_with?('og:') }
        end

        def meta
          document.css('meta')
        end

      end
    end
  end
end
@@ -0,0 +1,10 @@
1
+ require 'link_thumbnailer/scrapers/opengraph/base'
2
+
3
module LinkThumbnailer
  module Scrapers
    module Opengraph
      # Opengraph scraper for the description attribute; all behaviour is
      # inherited from Opengraph::Base.
      class Description < ::LinkThumbnailer::Scrapers::Opengraph::Base; end
    end
  end
end
@@ -0,0 +1,30 @@
1
+ require 'link_thumbnailer/scrapers/opengraph/base'
2
+
3
module LinkThumbnailer
  module Scrapers
    module Opengraph
      # Opengraph scraper that turns every matching "og:<attribute>" meta
      # node into a model built from the node's +content+ attribute.
      class Image < ::LinkThumbnailer::Scrapers::Opengraph::Base

        private

        # Unlike the base class, the value is the list of models itself
        def value
          model
        end

        # One model per matching node
        def model
          nodes.map do |meta_node|
            content = meta_node.attributes['content'].to_s
            modelize(meta_node, content)
          end
        end

        # Only the text is handed to the model; the node itself is unused
        def modelize(_node, text = nil)
          model_class.new(text)
        end

        # Nodes matched by +property+, or by +name+ when the former is empty
        def nodes
          by_property = meta_xpaths(attribute: attribute)
          return by_property unless by_property.empty?

          meta_xpaths(attribute: attribute, key: :name)
        end

      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require 'link_thumbnailer/scrapers/opengraph/base'
2
+ require 'link_thumbnailer/scrapers/opengraph/image'
3
+
4
module LinkThumbnailer
  module Scrapers
    module Opengraph
      # Plural entry point for images: hands the work to Opengraph::Image.
      class Images < ::LinkThumbnailer::Scrapers::Opengraph::Base

        # Runs an Opengraph::Image scraper on the same document. The given
        # +attribute_name+ is ignored; the attribute is always 'image'.
        #
        # @return whatever the Image scraper's +call+ returns
        def call(website, attribute_name)
          image_scraper = ::LinkThumbnailer::Scrapers::Opengraph::Image.new(document)
          image_scraper.call(website, 'image')
        end

      end
    end
  end
end
@@ -0,0 +1,10 @@
1
+ require 'link_thumbnailer/scrapers/opengraph/base'
2
+
3
module LinkThumbnailer
  module Scrapers
    module Opengraph
      # Opengraph scraper for the title attribute; all behaviour is
      # inherited from Opengraph::Base.
      class Title < ::LinkThumbnailer::Scrapers::Opengraph::Base; end
    end
  end
end
@@ -1,3 +1,3 @@
1
- module LinkThumbnailer
2
- VERSION = "1.1.2"
3
- end
1
module LinkThumbnailer
  # Release number of the gem
  VERSION = '2.0.0'
end