link_thumbnailer 3.2.0 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.ruby-version +1 -0
- data/.travis.yml +2 -4
- data/CHANGELOG.md +252 -75
- data/Gemfile +5 -3
- data/README.md +4 -0
- data/Rakefile +2 -0
- data/lib/generators/link_thumbnailer/install_generator.rb +2 -0
- data/lib/generators/templates/initializer.rb +15 -0
- data/lib/link_thumbnailer.rb +2 -0
- data/lib/link_thumbnailer/configuration.rb +74 -68
- data/lib/link_thumbnailer/exceptions.rb +3 -0
- data/lib/link_thumbnailer/grader.rb +2 -0
- data/lib/link_thumbnailer/graders/base.rb +2 -0
- data/lib/link_thumbnailer/graders/html_attribute.rb +2 -0
- data/lib/link_thumbnailer/graders/length.rb +2 -0
- data/lib/link_thumbnailer/graders/link_density.rb +2 -0
- data/lib/link_thumbnailer/graders/position.rb +2 -0
- data/lib/link_thumbnailer/image_comparator.rb +2 -0
- data/lib/link_thumbnailer/image_comparators/base.rb +2 -0
- data/lib/link_thumbnailer/image_comparators/size.rb +2 -0
- data/lib/link_thumbnailer/image_parser.rb +13 -1
- data/lib/link_thumbnailer/image_validator.rb +2 -0
- data/lib/link_thumbnailer/model.rb +20 -17
- data/lib/link_thumbnailer/models/description.rb +2 -0
- data/lib/link_thumbnailer/models/favicon.rb +2 -0
- data/lib/link_thumbnailer/models/image.rb +56 -54
- data/lib/link_thumbnailer/models/title.rb +2 -0
- data/lib/link_thumbnailer/models/video.rb +2 -0
- data/lib/link_thumbnailer/models/website.rb +54 -52
- data/lib/link_thumbnailer/page.rb +4 -1
- data/lib/link_thumbnailer/parser.rb +3 -1
- data/lib/link_thumbnailer/processor.rb +38 -5
- data/lib/link_thumbnailer/railtie.rb +2 -0
- data/lib/link_thumbnailer/response.rb +39 -0
- data/lib/link_thumbnailer/scraper.rb +62 -60
- data/lib/link_thumbnailer/scrapers/base.rb +69 -67
- data/lib/link_thumbnailer/scrapers/default/base.rb +2 -0
- data/lib/link_thumbnailer/scrapers/default/description.rb +2 -0
- data/lib/link_thumbnailer/scrapers/default/favicon.rb +16 -2
- data/lib/link_thumbnailer/scrapers/default/images.rb +5 -1
- data/lib/link_thumbnailer/scrapers/default/title.rb +2 -0
- data/lib/link_thumbnailer/scrapers/default/videos.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/base.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/description.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/favicon.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/image.rb +7 -1
- data/lib/link_thumbnailer/scrapers/opengraph/images.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/title.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/video.rb +2 -0
- data/lib/link_thumbnailer/scrapers/opengraph/videos.rb +2 -0
- data/lib/link_thumbnailer/uri.rb +20 -0
- data/lib/link_thumbnailer/version.rb +3 -1
- data/lib/link_thumbnailer/video_parser.rb +3 -1
- data/link_thumbnailer.gemspec +8 -6
- data/spec/configuration_spec.rb +4 -2
- data/spec/fixture_spec.rb +21 -0
- data/spec/fixtures/default_with_few_favicons.html +15 -0
- data/spec/fixtures/google_shift_jis.html +6 -0
- data/spec/fixtures/google_utf8.html +6 -0
- data/spec/fixtures/google_utf8_no_meta_charset.html +6 -0
- data/spec/fixtures/with_related_path_in_href.html +13 -0
- data/spec/fixtures/with_root_path_in_href.html +13 -0
- data/spec/grader_spec.rb +3 -1
- data/spec/graders/base_spec.rb +2 -0
- data/spec/graders/html_attribute_spec.rb +9 -7
- data/spec/graders/length_spec.rb +10 -6
- data/spec/graders/link_density_spec.rb +4 -2
- data/spec/graders/position_spec.rb +8 -6
- data/spec/image_comparators/size_spec.rb +2 -0
- data/spec/image_validator_spec.rb +3 -1
- data/spec/model_spec.rb +2 -0
- data/spec/models/description_spec.rb +3 -1
- data/spec/models/favicon_spec.rb +2 -0
- data/spec/models/image_spec.rb +6 -4
- data/spec/models/title_spec.rb +2 -0
- data/spec/models/video_spec.rb +7 -5
- data/spec/models/website_spec.rb +5 -3
- data/spec/page_spec.rb +2 -0
- data/spec/processor_spec.rb +74 -23
- data/spec/response_spec.rb +84 -0
- data/spec/scraper_spec.rb +6 -4
- data/spec/scrapers/base_spec.rb +6 -4
- data/spec/scrapers/opengraph/base_spec.rb +8 -6
- data/spec/spec_helper.rb +2 -0
- data/spec/uri_spec.rb +44 -0
- data/spec/video_parser_spec.rb +15 -13
- metadata +37 -19
|
@@ -1,52 +1,54 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'link_thumbnailer/model'
|
|
4
|
+
|
|
5
|
+
module LinkThumbnailer
  module Models
    # Holds everything collected about one page: its url, title, description,
    # favicon, and the accumulated lists of images and videos.
    class Website < ::LinkThumbnailer::Model

      attr_accessor :url, :title, :description, :images, :videos, :favicon

      # Starts with empty image and video collections.
      def initialize
        @images = []
        @videos = []
      end

      # Singular alias: forwards to #videos=.
      def video=(video)
        self.videos = video
      end

      # Appends one video or a list of videos to the collection.
      # Existing entries are kept; nothing is replaced.
      def videos=(videos)
        incoming = Array(videos)
        incoming.each { |entry| @videos << entry }
      end

      # Singular alias: forwards to #images=.
      def image=(image)
        self.images = image
      end

      # Appends one image or a list of images, keeping only those that
      # answer true to +valid?+.
      def images=(images)
        incoming = Array(images)
        incoming.each do |candidate|
          @images << candidate if candidate.valid?
        end
      end

      # Returns the image collection, sorted in place on every read.
      def images
        @images.sort!
      end

      # Hash representation; images and videos are serialized through their
      # own +as_json+.
      def as_json(*)
        serialized = {}
        serialized[:url]         = url.to_s
        serialized[:favicon]     = favicon
        serialized[:title]       = title
        serialized[:description] = description
        serialized[:images]      = images.map(&:as_json)
        serialized[:videos]      = videos.map(&:as_json)
        serialized
      end

    end
  end
end
|
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'link_thumbnailer/response'
|
|
1
4
|
require 'link_thumbnailer/processor'
|
|
2
5
|
require 'link_thumbnailer/scraper'
|
|
3
6
|
|
|
@@ -14,7 +17,7 @@ module LinkThumbnailer
|
|
|
14
17
|
end
|
|
15
18
|
|
|
16
19
|
def generate
|
|
17
|
-
@source = processor.
|
|
20
|
+
@source = processor.start(url)
|
|
18
21
|
scraper.call
|
|
19
22
|
end
|
|
20
23
|
|
|
@@ -1,10 +1,12 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require 'nokogiri'
|
|
2
4
|
|
|
3
5
|
module LinkThumbnailer
|
|
4
6
|
class Parser
|
|
5
7
|
|
|
6
8
|
def call(source)
|
|
7
|
-
::Nokogiri::HTML(source)
|
|
9
|
+
::Nokogiri::HTML(source, nil, LinkThumbnailer.page.config.encoding)
|
|
8
10
|
rescue ::Nokogiri::XML::SyntaxError => e
|
|
9
11
|
raise ::LinkThumbnailer::SyntaxError.new(e.message)
|
|
10
12
|
end
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require 'delegate'
|
|
2
4
|
require 'uri'
|
|
3
5
|
require 'net/http/persistent'
|
|
@@ -15,6 +17,12 @@ module LinkThumbnailer
|
|
|
15
17
|
super(config)
|
|
16
18
|
end
|
|
17
19
|
|
|
20
|
+
# Runs +call+ for +url+ and returns its result.
#
# +shutdown+ is invoked in an +ensure+ clause so the HTTP client is released
# even when +call+ raises; previously a failed request skipped the shutdown
# and left the connection open.
#
# @param url [String, #to_s] address handed to +call+
# @return whatever +call+ returns for +url+
# @raise any error raised by +call+ (re-raised after +shutdown+ has run)
def start(url)
  call(url)
ensure
  shutdown
end
|
|
25
|
+
|
|
18
26
|
def call(url = '', redirect_count = 0, headers = {})
|
|
19
27
|
self.url = url
|
|
20
28
|
@redirect_count = redirect_count
|
|
@@ -26,12 +34,16 @@ module LinkThumbnailer
|
|
|
26
34
|
set_http_options
|
|
27
35
|
perform_request
|
|
28
36
|
end
|
|
29
|
-
rescue ::Net::HTTPExceptions, ::SocketError, ::Timeout::Error => e
|
|
37
|
+
rescue ::Net::HTTPExceptions, ::SocketError, ::Timeout::Error, ::Net::HTTP::Persistent::Error => e
|
|
30
38
|
raise ::LinkThumbnailer::HTTPError.new(e.message)
|
|
31
39
|
end
|
|
32
40
|
|
|
33
41
|
private
|
|
34
42
|
|
|
43
|
+
# Asks the object returned by +http+ to shut down and returns its answer.
# NOTE(review): +http+ is presumably the persistent HTTP client - confirm.
def shutdown
  client = http
  client.shutdown
end
|
|
46
|
+
|
|
35
47
|
def with_valid_url
|
|
36
48
|
raise ::LinkThumbnailer::BadUriFormat unless valid_url_format?
|
|
37
49
|
yield if block_given?
|
|
@@ -39,8 +51,8 @@ module LinkThumbnailer
|
|
|
39
51
|
|
|
40
52
|
def set_http_headers(headers = {})
|
|
41
53
|
headers.each { |k, v| http.headers[k] = v }
|
|
42
|
-
http.
|
|
43
|
-
http.override_headers[
|
|
54
|
+
http.override_headers['User-Agent'] = user_agent
|
|
55
|
+
config.http_override_headers.each { |k, v| http.override_headers[k] = v }
|
|
44
56
|
end
|
|
45
57
|
|
|
46
58
|
def set_http_options
|
|
@@ -51,7 +63,7 @@ module LinkThumbnailer
|
|
|
51
63
|
end
|
|
52
64
|
|
|
53
65
|
def perform_request
|
|
54
|
-
response =
|
|
66
|
+
response = request_in_chunks
|
|
55
67
|
headers = {}
|
|
56
68
|
headers['Cookie'] = response['Set-Cookie'] if response['Set-Cookie'].present?
|
|
57
69
|
|
|
@@ -59,7 +71,7 @@ module LinkThumbnailer
|
|
|
59
71
|
|
|
60
72
|
case response
|
|
61
73
|
when ::Net::HTTPSuccess
|
|
62
|
-
response.body
|
|
74
|
+
Response.new(response).body
|
|
63
75
|
when ::Net::HTTPRedirection
|
|
64
76
|
call(
|
|
65
77
|
resolve_relative_url(response['location'].to_s),
|
|
@@ -71,6 +83,19 @@ module LinkThumbnailer
|
|
|
71
83
|
end
|
|
72
84
|
end
|
|
73
85
|
|
|
86
|
+
# Performs the request for +url+ and reads the body chunk by chunk so the
# download can be aborted as soon as it exceeds the size limit.
#
# The limit is checked twice: once against the announced content length
# before reading, and again after every chunk against the number of BYTES
# accumulated so far. The byte count (rather than the character count) is
# used because the buffer may adopt a multibyte encoding from a chunk, in
# which case +length+ would under-report the downloaded size.
#
# @return the response object, with its +body+ set to the accumulated data
# @raise [::LinkThumbnailer::DownloadSizeLimit] when the announced or the
#   actually received size is over the limit
def request_in_chunks
  body = String.new
  response = http.request(url) do |resp|
    raise ::LinkThumbnailer::DownloadSizeLimit if too_big_download_size?(resp.content_length)

    resp.read_body do |chunk|
      body << chunk
      raise ::LinkThumbnailer::DownloadSizeLimit if too_big_download_size?(body.bytesize)
    end
  end
  response.body = body
  response
end
|
|
98
|
+
|
|
74
99
|
# Returns +location+ untouched when it is already an absolute http(s) URL,
# otherwise hands it to +build_absolute_url_for+.
#
# The scheme is matched as "http://" or "https://" at the very start of the
# string, ignoring case. A plain "starts with http" test misclassified
# relative paths such as "httpdocs/page" as absolute and upper-case schemes
# such as "HTTP://host" as relative.
#
# @param location [String] value of a redirect target
# @return [String] +location+ itself or the result of +build_absolute_url_for+
def resolve_relative_url(location)
  absolute = location =~ %r{\Ahttps?://}i
  absolute ? location : build_absolute_url_for(location)
end
|
|
@@ -99,6 +124,10 @@ module LinkThumbnailer
|
|
|
99
124
|
config.verify_ssl
|
|
100
125
|
end
|
|
101
126
|
|
|
127
|
+
# Reads the +download_size_limit+ setting from +config+.
def download_size_limit
  settings = config
  settings.download_size_limit
end
|
|
130
|
+
|
|
102
131
|
# True once +redirect_count+ is strictly greater than +redirect_limit+.
def too_many_redirections?
  performed = redirect_count
  performed > redirect_limit
end
|
|
@@ -118,6 +147,10 @@ module LinkThumbnailer
|
|
|
118
147
|
false
|
|
119
148
|
end
|
|
120
149
|
|
|
150
|
+
# Compares +size+ with +download_size_limit+ after coercing both through
# +to_i+ (so +nil+ counts as 0); true only when +size+ is strictly larger.
def too_big_download_size?(size)
  measured = size.to_i
  measured > download_size_limit.to_i
end
|
|
153
|
+
|
|
121
154
|
# Stores +url+ as a parsed URI: the argument is stringified with +to_s+ and
# passed through +::URI.parse+ before being kept in +@url+.
def url=(url)
  text = url.to_s
  @url = ::URI.parse(text)
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LinkThumbnailer
  # Wraps a raw HTTP response and exposes its charset and its body, the body
  # being re-encoded to UTF-8 when the response declares another charset.
  #
  # The wrapped object only needs to answer +[]+ (header lookup) and +body+.
  class Response
    # Charset inside a Content-Type header value, e.g. "text/html; charset=Shift_JIS".
    HEADER_CHARSET = /charset=([\w-]+)/.freeze
    # Charset declared by a <meta> tag inside the document itself.
    META_CHARSET = /<meta[^>]*charset\s*=\s*["']?(.+?)["' >]/i.freeze

    # @param response [#[], #body] the raw response to wrap
    def initialize(response)
      @response = response
    end

    # Declared charset: taken from the Content-Type header first, then from a
    # <meta> tag in the body.
    #
    # @return [String] the charset name, or '' when none is declared
    def charset
      @charset ||= extract_charset
    end

    # Response body, converted to UTF-8 when a non UTF-8 charset is declared.
    #
    # @return [String, nil] the body; nil when the response has no body
    def body
      @body ||= extract_body
    end

    private

    def extract_charset
      charset_from_header || charset_from_meta || ''
    end

    # Charset announced by the Content-Type header, or nil.
    def charset_from_header
      content_type = @response['Content-Type'] || ''
      found = content_type.match(HEADER_CHARSET)
      found && found[1]
    end

    # Charset announced by a <meta> tag, or nil. A missing body yields nil
    # instead of failing on +nil.scrub+.
    def charset_from_meta
      markup = @response.body
      return nil if markup.nil?

      # scrub first: invalid byte sequences would make the regexp match raise
      found = markup.scrub.match(META_CHARSET)
      found && found[1]
    end

    def extract_body
      raw = @response.body
      # nothing to convert without a body (the converter rejects nil)
      return raw if raw.nil? || !should_convert_body_to_utf8?

      convert_encoding_to_utf8(raw, charset)
    end

    # Charset names are case-insensitive, so "UTF-8" must not trigger a
    # pointless UTF-8 to UTF-8 conversion attempt.
    def should_convert_body_to_utf8?
      charset != '' && charset.downcase != 'utf-8'
    end

    # Converts +body+ from the +from+ charset to UTF-8. Unknown charsets and
    # undecodable bytes fall back to the unconverted body.
    def convert_encoding_to_utf8(body, from)
      Encoding::Converter.new(from, 'utf-8').convert(body)
    rescue EncodingError
      body
    end
  end
end
|
|
@@ -1,60 +1,62 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
require '
|
|
4
|
-
|
|
5
|
-
require '
|
|
6
|
-
|
|
7
|
-
require 'link_thumbnailer/
|
|
8
|
-
require 'link_thumbnailer/
|
|
9
|
-
require 'link_thumbnailer/scrapers/default/
|
|
10
|
-
require 'link_thumbnailer/scrapers/opengraph/
|
|
11
|
-
require 'link_thumbnailer/scrapers/default/
|
|
12
|
-
require 'link_thumbnailer/scrapers/opengraph/
|
|
13
|
-
require 'link_thumbnailer/scrapers/default/
|
|
14
|
-
require 'link_thumbnailer/scrapers/opengraph/
|
|
15
|
-
require 'link_thumbnailer/scrapers/default/
|
|
16
|
-
require 'link_thumbnailer/scrapers/opengraph/
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
@
|
|
27
|
-
@
|
|
28
|
-
@
|
|
29
|
-
@
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
"
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'delegate'
|
|
4
|
+
require 'active_support/core_ext/object/blank'
|
|
5
|
+
require 'active_support/inflector'
|
|
6
|
+
|
|
7
|
+
require 'link_thumbnailer/parser'
|
|
8
|
+
require 'link_thumbnailer/models/website'
|
|
9
|
+
require 'link_thumbnailer/scrapers/default/title'
|
|
10
|
+
require 'link_thumbnailer/scrapers/opengraph/title'
|
|
11
|
+
require 'link_thumbnailer/scrapers/default/description'
|
|
12
|
+
require 'link_thumbnailer/scrapers/opengraph/description'
|
|
13
|
+
require 'link_thumbnailer/scrapers/default/images'
|
|
14
|
+
require 'link_thumbnailer/scrapers/opengraph/images'
|
|
15
|
+
require 'link_thumbnailer/scrapers/default/videos'
|
|
16
|
+
require 'link_thumbnailer/scrapers/opengraph/videos'
|
|
17
|
+
require 'link_thumbnailer/scrapers/default/favicon'
|
|
18
|
+
require 'link_thumbnailer/scrapers/opengraph/favicon'
|
|
19
|
+
|
|
20
|
+
module LinkThumbnailer
  # Parses a page source and fills a Models::Website by running, for every
  # configured attribute, the configured scrapers in order until one of them
  # produces a non-blank value. Unknown methods are delegated to the config.
  class Scraper < ::SimpleDelegator

    attr_reader :document, :source, :url, :config, :website

    # @param source the page source handed to the parser
    # @param url the page address, stored on the resulting website
    def initialize(source, url)
      @source   = source
      @url      = url
      @config   = ::LinkThumbnailer.page.config
      @document = parser.call(source)
      @website  = ::LinkThumbnailer::Models::Website.new
      @website.url = url

      super(config)
    end

    # Runs the scrapers and returns the populated website.
    def call
      config.attributes.each do |name|
        # find stops at the first scraper that left a non-blank value
        config.scrapers.find do |scraper_prefix|
          scraper = scraper_class(scraper_prefix, name).new(document, website)
          scraper.call(name.to_s)
          !website.send(name).blank?
        end
      end

      website
    end

    private

    # Resolves the scraper constant for a scraper family (+prefix+) and an
    # attribute (+name+); raises ScraperInvalid when no such constant exists.
    def scraper_class(prefix, name)
      begin
        prefix = "::LinkThumbnailer::Scrapers::#{prefix.to_s.camelize}"
        name = name.to_s.camelize
        "#{prefix}::#{name}".constantize
      rescue NameError
        raise ::LinkThumbnailer::ScraperInvalid, "scraper named '#{prefix}::#{name}' does not exists."
      end
    end

    # A fresh parser for every call.
    def parser
      ::LinkThumbnailer::Parser.new
    end

  end
end
|
|
@@ -1,67 +1,69 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
require '
|
|
4
|
-
require 'link_thumbnailer/models/
|
|
5
|
-
require 'link_thumbnailer/models/
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
@
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'delegate'
|
|
4
|
+
require 'link_thumbnailer/models/title'
|
|
5
|
+
require 'link_thumbnailer/models/description'
|
|
6
|
+
require 'link_thumbnailer/models/image'
|
|
7
|
+
require 'link_thumbnailer/models/video'
|
|
8
|
+
|
|
9
|
+
module LinkThumbnailer
  module Scrapers
    # Common behaviour of all scrapers: a scraper computes one +value+ from
    # the parsed document and assigns it to the matching attribute of the
    # website. Unknown methods are delegated to the config.
    class Base < ::SimpleDelegator

      attr_reader :config, :document, :website, :attribute_name

      # @param document the parsed page
      # @param website the model to fill; may be omitted
      def initialize(document, website = nil)
        @config   = ::LinkThumbnailer.page.config
        @document = document
        @website  = website

        super(config)
      end

      # Assigns +value+ to the website attribute called +attribute_name+.
      #
      # @return the website, or false when there is no website or the
      #   scraper is not applicable
      def call(attribute_name)
        return false unless website.present? && applicable?

        @attribute_name = attribute_name

        setter = "#{attribute_name}="
        website.send(setter, value)
        website
      end

      # Subclasses may return false to opt out; always true here.
      def applicable?
        true
      end

      # The scraped value; must be provided by subclasses.
      def value
        raise NotImplementedError
      end

      private

      # First node returned by #meta_xpaths.
      def meta_xpath(options = {})
        meta_xpaths(options).first
      end

      # Queries <meta> nodes whose +key+ attribute (default: property),
      # lower-cased, equals the wanted attribute (default: attribute_name)
      # and whose +value+ attribute (default: content) is not empty.
      def meta_xpaths(options = {})
        key_attr     = options.fetch(:key, :property)
        content_attr = options.fetch(:value, :content)
        wanted       = options.fetch(:attribute, attribute_name)
        lower        = abc
        upper        = lower.upcase

        document.xpath("//meta[translate(@#{key_attr},'#{upper}','#{lower}') = '#{wanted}' and string-length(@#{content_attr}) > 0]")
      end

      # Lower-case alphabet used to build the case-folding translate() call.
      def abc
        'abcdefghijklmnopqrstuvwxyz'
      end

      # Model constant named after the attribute being scraped.
      def model_class
        "::LinkThumbnailer::Models::#{attribute_name.to_s.camelize}".constantize
      end

      # Builds a model instance for +node+ (and optional +text+).
      def modelize(node, text = nil)
        model_class.new(node, text)
      end

    end
  end
end
|