brand_logo 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +32 -0
- data/LICENSE.txt +21 -0
- data/README.md +107 -0
- data/lib/brand_logo/config.rb +60 -0
- data/lib/brand_logo/errors.rb +10 -0
- data/lib/brand_logo/fetcher.rb +109 -0
- data/lib/brand_logo/html_parser.rb +63 -0
- data/lib/brand_logo/http_client.rb +59 -0
- data/lib/brand_logo/icon.rb +34 -0
- data/lib/brand_logo/image_analyzer.rb +36 -0
- data/lib/brand_logo/logging.rb +29 -0
- data/lib/brand_logo/strategies/base_strategy.rb +117 -0
- data/lib/brand_logo/strategies/duckduckgo_strategy.rb +45 -0
- data/lib/brand_logo/strategies/manifest_strategy.rb +137 -0
- data/lib/brand_logo/strategies/meta_tag_strategy.rb +95 -0
- data/lib/brand_logo/strategies/scraping/default_favicon_checker.rb +33 -0
- data/lib/brand_logo/strategies/scraping/dimensions_extractor.rb +42 -0
- data/lib/brand_logo/strategies/scraping/format_extractor.rb +56 -0
- data/lib/brand_logo/strategies/scraping/icon_finder.rb +79 -0
- data/lib/brand_logo/strategies/scraping/url_normalizer.rb +50 -0
- data/lib/brand_logo/strategies/scraping_strategy.rb +77 -0
- data/lib/brand_logo/version.rb +6 -0
- data/lib/brand_logo.rb +37 -0
- metadata +138 -0
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
# typed: strict
|
|
3
|
+
|
|
4
|
+
require 'sorbet-runtime'
|
|
5
|
+
|
|
6
|
+
module BrandLogo
|
|
7
|
+
module Strategies
|
|
8
|
+
# Fetches brand logos from DuckDuckGo's public icon service.
|
|
9
|
+
# Used as a last-resort fallback when scraping finds nothing.
|
|
10
|
+
class DuckduckgoStrategy < BaseStrategy
|
|
11
|
+
extend T::Sig
|
|
12
|
+
|
|
13
|
+
DUCKDUCKGO_URL = T.let('https://icons.duckduckgo.com/ip3/%s.ico', String)
|
|
14
|
+
|
|
15
|
+
sig do
|
|
16
|
+
params(
|
|
17
|
+
config: Config,
|
|
18
|
+
http_client: HttpClient,
|
|
19
|
+
image_analyzer: ImageAnalyzer
|
|
20
|
+
).void
|
|
21
|
+
end
|
|
22
|
+
# Passes `config` to the base strategy and keeps the injected HTTP client
# and image analyzer for use by #fetch_all.
def initialize(config:, http_client:, image_analyzer:)
  super(config: config)

  @image_analyzer = T.let(image_analyzer, ImageAnalyzer)
  @http_client = T.let(http_client, HttpClient)
end
|
|
27
|
+
|
|
28
|
+
sig { override.params(domain: String).returns(T::Array[Icon]) }
|
|
29
|
+
# Builds the DuckDuckGo icon URL for `domain` and returns it as a single
# 'ico' Icon when a HEAD request succeeds; otherwise returns an empty array.
# Any StandardError is logged and also results in an empty array.
def fetch_all(domain)
  icon_url = format(DUCKDUCKGO_URL, domain)

  if @http_client.head_success?(icon_url)
    [Icon.new(url: icon_url, dimensions: @image_analyzer.dimensions(icon_url), format: 'ico')]
  else
    []
  end
rescue StandardError => e
  BrandLogo::Logging.logger.error("DuckduckgoStrategy error for #{domain}: #{e.message}")
  []
end
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
# typed: strict
|
|
3
|
+
|
|
4
|
+
require 'sorbet-runtime'
|
|
5
|
+
require 'json'
|
|
6
|
+
|
|
7
|
+
module BrandLogo
|
|
8
|
+
module Strategies
|
|
9
|
+
# Fetches icons from the Web App Manifest (PWA manifest).
|
|
10
|
+
# Modern progressive web apps store high-resolution icons (192x192, 512x512)
|
|
11
|
+
# in a manifest.json or .webmanifest file linked from the HTML.
|
|
12
|
+
#
|
|
13
|
+
# Flow:
|
|
14
|
+
# 1. Fetch HTML → find <link rel="manifest" href="...">
|
|
15
|
+
# 2. Fetch manifest JSON
|
|
16
|
+
# 3. Parse icons[] array → build Icons
|
|
17
|
+
class ManifestStrategy < BaseStrategy
|
|
18
|
+
extend T::Sig
|
|
19
|
+
|
|
20
|
+
MIME_TO_FORMAT = T.let({
|
|
21
|
+
'image/png' => 'png',
|
|
22
|
+
'image/svg+xml' => 'svg',
|
|
23
|
+
'image/jpeg' => 'jpg',
|
|
24
|
+
'image/webp' => 'webp',
|
|
25
|
+
'image/gif' => 'gif'
|
|
26
|
+
}.freeze, T::Hash[String, String])
|
|
27
|
+
|
|
28
|
+
sig do
|
|
29
|
+
params(
|
|
30
|
+
config: Config,
|
|
31
|
+
http_client: HttpClient,
|
|
32
|
+
html_parser: HtmlParser,
|
|
33
|
+
image_analyzer: ImageAnalyzer
|
|
34
|
+
).void
|
|
35
|
+
end
|
|
36
|
+
# Passes `config` to the base strategy and stores the injected HTTP client,
# HTML parser and image analyzer.
def initialize(config:, http_client:, html_parser:, image_analyzer:)
  super(config: config)

  @image_analyzer = T.let(image_analyzer, ImageAnalyzer)
  @html_parser = T.let(html_parser, HtmlParser)
  @http_client = T.let(http_client, HttpClient)
end
|
|
42
|
+
|
|
43
|
+
sig { override.params(domain: String).returns(T::Array[Icon]) }
|
|
44
|
+
# Returns the icons listed in the site's web app manifest, or an empty array
# when no manifest URL is found. Any StandardError is logged and yields [].
def fetch_all(domain)
  manifest_url = find_manifest_url(domain)
  manifest_url ? parse_manifest_icons(manifest_url) : []
rescue StandardError => e
  BrandLogo::Logging.logger.error("ManifestStrategy error for #{domain}: #{e.message}")
  []
end
|
|
53
|
+
|
|
54
|
+
private
|
|
55
|
+
|
|
56
|
+
# Finds the manifest URL from the HTML <link rel="manifest"> tag.
|
|
57
|
+
sig { params(domain: String).returns(T.nilable(String)) }
|
|
58
|
+
# Locates the manifest URL advertised by the page's <link rel="manifest"> tag.
# The href is resolved against the URL that actually served the HTML, so a
# relative manifest path still targets the right host when only the `www.`
# variant responded. Returns nil when no HTML, no tag, or no href is found.
def find_manifest_url(domain)
  html, base_url = fetch_html_with_base_url(domain)
  return nil unless html && base_url

  node = @html_parser.parse(html).at('link[rel="manifest"]')
  return nil unless node

  href = node['href']
  return nil unless href

  Scraping::UrlNormalizer.new(base_url).normalize(href)
end

# Returns the HTML body of the first responding candidate URL, or nil.
sig { params(domain: String).returns(T.nilable(String)) }
def fetch_html(domain)
  fetch_html_with_base_url(domain)&.first
end

# Tries the bare domain and then the `www.` variant over HTTPS.
# Returns [html_body, base_url] for the first URL that yields a body, or nil.
sig { params(domain: String).returns(T.nilable([String, String])) }
def fetch_html_with_base_url(domain)
  ["https://#{domain}", "https://www.#{domain}"].each do |url|
    body = @http_client.get_body(url)
    return [body, url] if body
  end

  nil
end
|
|
77
|
+
|
|
78
|
+
# Downloads and parses the manifest JSON, returning icons.
|
|
79
|
+
sig { params(manifest_url: String).returns(T::Array[Icon]) }
|
|
80
|
+
# Downloads the manifest at `manifest_url` and converts its `icons` array
# into Icon objects, resolving each `src` relative to the manifest URL.
# Returns [] when the body is missing, the JSON root is not an object,
# `icons` is not an array, or the body is not valid JSON (logged as a warning).
def parse_manifest_icons(manifest_url)
  body = @http_client.get_body(manifest_url)
  return [] unless body

  data = JSON.parse(body)
  # Valid JSON is not necessarily a manifest: an array root would make
  # data['icons'] raise TypeError instead of simply yielding no icons.
  return [] unless data.is_a?(Hash)

  icons_data = data['icons']
  return [] unless icons_data.is_a?(Array)

  url_normalizer = Scraping::UrlNormalizer.new(manifest_url)

  icons_data.filter_map { |entry| build_icon_from_entry(entry, url_normalizer) }
rescue JSON::ParserError => e
  BrandLogo::Logging.logger.warn("ManifestStrategy: invalid JSON at #{manifest_url}: #{e.message}")
  []
end
|
|
95
|
+
|
|
96
|
+
sig { params(entry: T.untyped, url_normalizer: Scraping::UrlNormalizer).returns(T.nilable(Icon)) }
|
|
97
|
+
# Turns one manifest `icons[]` entry into an Icon.
# Returns nil unless the entry is a Hash with a non-empty String `src`.
def build_icon_from_entry(entry, url_normalizer)
  source = entry['src'] if entry.is_a?(Hash)
  return nil unless source.is_a?(String)
  return nil if source.empty?

  url = url_normalizer.normalize(source)
  dimensions = parse_sizes(entry['sizes'])
  icon_format = format_from_entry(entry, url)

  BrandLogo::Logging.logger.debug("ManifestStrategy found icon: #{url} #{dimensions}")
  Icon.new(url: url, dimensions: dimensions, format: icon_format)
end
|
|
110
|
+
|
|
111
|
+
# Parses a `sizes` entry like "192x192" or "any".
|
|
112
|
+
sig { params(sizes: T.untyped).returns(T::Hash[Symbol, T.nilable(Integer)]) }
|
|
113
|
+
# Converts a manifest `sizes` value into pixel dimensions.
# The value may hold one size ("192x192") or a space-separated list
# ("48x48 96x96"); for a list the entry with the largest area is returned.
# Matching is case-insensitive. Non-string input, "any", and zero-sized or
# unparsable values yield { width: nil, height: nil }.
def parse_sizes(sizes)
  unknown = { width: nil, height: nil }
  return unknown unless sizes.is_a?(String)

  candidates = sizes.scan(/(\d+)\s*x\s*(\d+)/i)
                    .map { |width, height| [width.to_i, height.to_i] }
                    .select { |width, height| width.positive? && height.positive? }
  return unknown if candidates.empty?

  width, height = candidates.max_by { |w, h| w * h }
  { width: width, height: height }
end
|
|
124
|
+
|
|
125
|
+
sig { params(entry: T.untyped, url: String).returns(String) }
|
|
126
|
+
# Determines the icon format for a manifest entry.
# A recognised MIME `type` wins (compared case-insensitively, surrounding
# whitespace ignored); otherwise the extension of the URL path is used.
# Falls back to 'png' when there is no extension or the URL cannot be parsed.
def format_from_entry(entry, url)
  mime = entry['type']
  if mime.is_a?(String)
    known = MIME_TO_FORMAT[mime.strip.downcase]
    return known if known
  end

  # URI#path is nil for opaque URIs (data:, mailto:); to_s keeps File.extname
  # from raising a TypeError that `rescue URI::Error` would not catch.
  ext = File.extname(URI.parse(url).path.to_s).delete('.').downcase
  ext.empty? ? 'png' : ext
rescue URI::Error
  'png'
end
|
|
135
|
+
end
|
|
136
|
+
end
|
|
137
|
+
end
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
# typed: strict
|
|
3
|
+
|
|
4
|
+
require 'sorbet-runtime'
|
|
5
|
+
|
|
6
|
+
module BrandLogo
|
|
7
|
+
module Strategies
|
|
8
|
+
# Fetches icons from Open Graph and Twitter Card meta tags.
|
|
9
|
+
# These tags often contain high-resolution images (e.g. 1200x630 og:image),
|
|
10
|
+
# making them a useful source when `max_dimensions` is not constrained.
|
|
11
|
+
#
|
|
12
|
+
# Tried after ScrapingStrategy (which handles dedicated favicon links),
|
|
13
|
+
# before the DuckDuckGo last-resort fallback.
|
|
14
|
+
class MetaTagStrategy < BaseStrategy
|
|
15
|
+
extend T::Sig
|
|
16
|
+
|
|
17
|
+
META_SELECTORS = T.let([
|
|
18
|
+
'meta[property="og:image"]',
|
|
19
|
+
'meta[name="twitter:image"]',
|
|
20
|
+
'meta[name="twitter:image:src"]'
|
|
21
|
+
].freeze, T::Array[String])
|
|
22
|
+
|
|
23
|
+
MIME_TO_FORMAT = T.let({
|
|
24
|
+
'image/png' => 'png',
|
|
25
|
+
'image/svg+xml' => 'svg',
|
|
26
|
+
'image/jpeg' => 'jpg',
|
|
27
|
+
'image/webp' => 'webp'
|
|
28
|
+
}.freeze, T::Hash[String, String])
|
|
29
|
+
|
|
30
|
+
sig do
|
|
31
|
+
params(
|
|
32
|
+
config: Config,
|
|
33
|
+
http_client: HttpClient,
|
|
34
|
+
html_parser: HtmlParser,
|
|
35
|
+
image_analyzer: ImageAnalyzer
|
|
36
|
+
).void
|
|
37
|
+
end
|
|
38
|
+
# Passes `config` to the base strategy and stores the injected HTTP client,
# HTML parser and image analyzer.
def initialize(config:, http_client:, html_parser:, image_analyzer:)
  super(config: config)

  @image_analyzer = T.let(image_analyzer, ImageAnalyzer)
  @html_parser = T.let(html_parser, HtmlParser)
  @http_client = T.let(http_client, HttpClient)
end
|
|
44
|
+
|
|
45
|
+
sig { override.params(domain: String).returns(T::Array[Icon]) }
|
|
46
|
+
# Fetches the site's HTML and returns one Icon per META_SELECTORS entry that
# resolves to a usable image URL. Returns [] when no HTML could be fetched;
# any StandardError is logged and also yields [].
def fetch_all(domain)
  page, origin = fetch_html_with_base_url(domain)
  return [] unless page

  document = @html_parser.parse(page)
  normalizer = Scraping::UrlNormalizer.new(origin)

  icons = []
  META_SELECTORS.each do |selector|
    icon = build_icon_from_selector(document, selector, normalizer)
    icons << icon if icon
  end
  icons
rescue StandardError => e
  BrandLogo::Logging.logger.error("MetaTagStrategy error for #{domain}: #{e.message}")
  []
end
|
|
58
|
+
|
|
59
|
+
private
|
|
60
|
+
|
|
61
|
+
sig { params(doc: ParsedDocument, selector: String, url_normalizer: Scraping::UrlNormalizer).returns(T.nilable(Icon)) }
|
|
62
|
+
# Builds an Icon from the first node matching `selector`.
# Returns nil when no node matches or its `content` attribute is missing
# or blank. Dimensions come from the injected image analyzer.
def build_icon_from_selector(doc, selector, url_normalizer)
  tag = doc.at(selector)
  return nil unless tag

  raw = tag['content']
  return nil if !raw || raw.strip.empty?

  url = url_normalizer.normalize(raw.strip)
  size = @image_analyzer.dimensions(url)
  kind = extract_format_from_url(url)

  BrandLogo::Logging.logger.debug("MetaTagStrategy found: #{url} (#{selector})")
  Icon.new(url: url, dimensions: size, format: kind)
end
|
|
76
|
+
|
|
77
|
+
sig { params(domain: String).returns(T.nilable([String, String])) }
|
|
78
|
+
# Tries the bare domain and then the `www.` variant over HTTPS, stopping at
# the first URL that yields a body. Returns [html_body, url], or nil if
# neither candidate produced a body.
def fetch_html_with_base_url(domain)
  candidates = ["https://#{domain}", "https://www.#{domain}"]

  # Lazy evaluation keeps the short-circuit: later candidates are only
  # requested when the earlier ones returned no body.
  candidates.lazy.filter_map do |candidate|
    html = @http_client.get_body(candidate)
    [html, candidate] if html
  end.first
end
|
|
85
|
+
|
|
86
|
+
sig { params(url: String).returns(String) }
|
|
87
|
+
# Derives the image format from the extension of the URL path (lower-cased,
# without the dot). Returns 'unknown' when the path has no extension, the URL
# has no path at all, or the URL cannot be parsed.
def extract_format_from_url(url)
  # URI#path is nil for opaque URIs (data:, mailto:); to_s keeps File.extname
  # from raising a TypeError that `rescue URI::Error` would not catch.
  ext = File.extname(URI.parse(url).path.to_s).delete('.').downcase
  ext.empty? ? 'unknown' : ext
rescue URI::Error
  'unknown'
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
95
|
+
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
# typed: strict
|
|
3
|
+
|
|
4
|
+
require 'sorbet-runtime'
|
|
5
|
+
|
|
6
|
+
module BrandLogo
|
|
7
|
+
module Strategies
|
|
8
|
+
module Scraping
|
|
9
|
+
# Checks whether the conventional /favicon.ico path exists for a domain.
|
|
10
|
+
# Extracted from UrlNormalizer to respect SRP: URL normalization ≠ HTTP verification.
|
|
11
|
+
class DefaultFaviconChecker
|
|
12
|
+
extend T::Sig
|
|
13
|
+
|
|
14
|
+
sig { params(http_client: HttpClient).void }
|
|
15
|
+
# Stores the HTTP client used to probe favicon URLs.
def initialize(http_client:)
  @http_client = T.let(http_client, HttpClient)
end
|
|
18
|
+
|
|
19
|
+
# Returns an Icon if the URL responds with 2xx, nil otherwise.
|
|
20
|
+
sig { params(url: String).returns(T.nilable(Icon)) }
|
|
21
|
+
# Issues a HEAD probe for `url` through the injected client. On success an
# 'ico' Icon with Config::DEFAULT_DIMENSIONS is returned; otherwise nil.
def check(url)
  reachable = @http_client.head_success?(url)
  return nil unless reachable

  Icon.new(url: url, dimensions: Config::DEFAULT_DIMENSIONS, format: 'ico')
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
# typed: strict
|
|
3
|
+
|
|
4
|
+
require 'sorbet-runtime'
|
|
5
|
+
|
|
6
|
+
module BrandLogo
|
|
7
|
+
module Strategies
|
|
8
|
+
module Scraping
|
|
9
|
+
# Extracts pixel dimensions from a favicon link element.
|
|
10
|
+
# Tries the HTML `sizes` attribute first (no network), then falls back
|
|
11
|
+
# to the injected ImageAnalyzer (may make a network request).
|
|
12
|
+
class DimensionsExtractor
|
|
13
|
+
extend T::Sig
|
|
14
|
+
|
|
15
|
+
sig { params(image_analyzer: ImageAnalyzer).void }
|
|
16
|
+
# Stores the analyzer used when the `sizes` attribute gives no dimensions.
def initialize(image_analyzer:)
  @image_analyzer = T.let(image_analyzer, ImageAnalyzer)
end
|
|
19
|
+
|
|
20
|
+
# Returns dimensions for the given link element and its resolved href.
|
|
21
|
+
sig { params(link: T.untyped, href: String).returns(T::Hash[Symbol, T.nilable(Integer)]) }
|
|
22
|
+
# Prefers dimensions declared in the link's `sizes` attribute; when none can
# be parsed, asks the injected image analyzer for the dimensions of `href`.
def extract(link, href)
  declared = extract_from_sizes_attribute(link)
  return declared if declared

  @image_analyzer.dimensions(href)
end
|
|
25
|
+
|
|
26
|
+
private
|
|
27
|
+
|
|
28
|
+
# Parses the `sizes` HTML attribute (e.g. "32x32") without a network call.
|
|
29
|
+
sig { params(link: T.untyped).returns(T.nilable(T::Hash[Symbol, T.nilable(Integer)])) }
|
|
30
|
+
# Reads dimensions from the link's `sizes` attribute.
# The attribute may hold one size ("32x32") or a space-separated list
# ("16x16 32x32 48x48"); for a list the entry with the largest area is used.
# Matching is case-insensitive. Returns nil when the attribute is absent,
# is "any", or contains no positive WxH pair, so the caller can fall back.
def extract_from_sizes_attribute(link)
  sizes = link['sizes']
  return nil unless sizes.is_a?(String)

  candidates = sizes.scan(/(\d+)\s*x\s*(\d+)/i)
                    .map { |width, height| [width.to_i, height.to_i] }
                    .select { |width, height| width.positive? && height.positive? }
  return nil if candidates.empty?

  width, height = candidates.max_by { |w, h| w * h }
  { width: width, height: height }
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
end
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
# typed: strict
|
|
3
|
+
|
|
4
|
+
require 'sorbet-runtime'
|
|
5
|
+
|
|
6
|
+
module BrandLogo
|
|
7
|
+
module Strategies
|
|
8
|
+
module Scraping
|
|
9
|
+
# Extracts format information from favicon link elements
|
|
10
|
+
class FormatExtractor
|
|
11
|
+
extend T::Sig
|
|
12
|
+
|
|
13
|
+
MIME_TO_FORMAT = T.let({
|
|
14
|
+
'image/x-icon' => 'ico',
|
|
15
|
+
'image/vnd.microsoft.icon' => 'ico',
|
|
16
|
+
'image/png' => 'png',
|
|
17
|
+
'image/svg+xml' => 'svg',
|
|
18
|
+
'image/jpeg' => 'jpg',
|
|
19
|
+
'image/webp' => 'webp'
|
|
20
|
+
}.freeze, T::Hash[String, String])
|
|
21
|
+
|
|
22
|
+
sig { params(link: Nokogiri::XML::Element).returns(String) }
|
|
23
|
+
# Convenience entry point: wraps `link` in an extractor and returns its format.
def self.extract(link)
  extractor = new(link)
  extractor.extract
end
|
|
26
|
+
|
|
27
|
+
sig { params(link: Nokogiri::XML::Element).void }
|
|
28
|
+
# Keeps the link element whose `type` and `href` attributes are read later.
def initialize(link)
  @link = link
end
|
|
31
|
+
|
|
32
|
+
sig { returns(String) }
|
|
33
|
+
# Returns the format mapped from the link's MIME type when there is one,
# otherwise the format derived from the href extension.
def extract
  from_mime = extract_from_mime_type
  return from_mime if from_mime

  extract_from_extension
end
|
|
36
|
+
|
|
37
|
+
private
|
|
38
|
+
|
|
39
|
+
sig { returns(T.nilable(String)) }
|
|
40
|
+
# Looks up the link's `type` attribute in MIME_TO_FORMAT.
# Returns nil when the attribute is absent or the MIME type is not listed.
def extract_from_mime_type
  mime = @link['type']
  mime ? MIME_TO_FORMAT[mime] : nil
end
|
|
45
|
+
|
|
46
|
+
sig { returns(String) }
|
|
47
|
+
# Derives the format from the extension of the link's href (lower-cased,
# without the dot). Any query string or fragment is removed first so that
# "/favicon.png?v=2" yields 'png' rather than 'png?v=2'.
# Returns 'ico' when the href is missing or has no extension.
def extract_from_extension
  # to_s guards against a nil href, for which File.extname would raise.
  path = @link['href'].to_s.split(/[?#]/, 2).first.to_s
  extension = File.extname(path).delete('.').downcase
  extension.empty? ? 'ico' : extension
end
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
end
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
# typed: strict
|
|
3
|
+
|
|
4
|
+
require 'sorbet-runtime'
|
|
5
|
+
|
|
6
|
+
module BrandLogo
|
|
7
|
+
module Strategies
|
|
8
|
+
module Scraping
|
|
9
|
+
# Finds all favicon candidates in a parsed HTML document.
|
|
10
|
+
# Returns unfiltered icons — validation/selection is the strategy's responsibility.
|
|
11
|
+
class IconFinder
|
|
12
|
+
extend T::Sig
|
|
13
|
+
|
|
14
|
+
FAVICON_SELECTORS = T.let([
|
|
15
|
+
'link[rel~="icon"]',
|
|
16
|
+
'link[rel~="shortcut"]',
|
|
17
|
+
'link[rel~="apple-touch-icon"]',
|
|
18
|
+
'link[rel~="mask-icon"]',
|
|
19
|
+
'link[type="image/x-icon"]',
|
|
20
|
+
'link[type="image/vnd.microsoft.icon"]',
|
|
21
|
+
'link[type="image/png"]',
|
|
22
|
+
'link[type="image/svg+xml"]'
|
|
23
|
+
].freeze, T::Array[String])
|
|
24
|
+
|
|
25
|
+
sig do
|
|
26
|
+
params(
|
|
27
|
+
doc: ParsedDocument,
|
|
28
|
+
base_url: String,
|
|
29
|
+
dimensions_extractor: DimensionsExtractor,
|
|
30
|
+
default_favicon_checker: DefaultFaviconChecker
|
|
31
|
+
).void
|
|
32
|
+
end
|
|
33
|
+
# Stores the parsed document and collaborators, and builds a UrlNormalizer
# anchored at `base_url` for resolving hrefs.
def initialize(doc:, base_url:, dimensions_extractor:, default_favicon_checker:)
  @default_favicon_checker = T.let(default_favicon_checker, DefaultFaviconChecker)
  @dimensions_extractor = T.let(dimensions_extractor, DimensionsExtractor)
  @url_normalizer = T.let(UrlNormalizer.new(base_url), UrlNormalizer)
  @doc = T.let(doc, ParsedDocument)
end
|
|
39
|
+
|
|
40
|
+
# Returns all icons found in the document. Falls back to /favicon.ico if none.
|
|
41
|
+
sig { returns(T::Array[Icon]) }
|
|
42
|
+
# Returns the icons declared in the document. When there are none, probes the
# default favicon URL and returns it as a one-element array, or [] if the
# probe yields nothing.
def find
  declared = find_icons_from_selectors

  if declared.empty?
    BrandLogo::Logging.logger.debug('No icons found in HTML, checking default /favicon.ico')
    fallback = @default_favicon_checker.check(@url_normalizer.default_favicon_url)
    fallback ? [fallback] : []
  else
    declared
  end
end
|
|
50
|
+
|
|
51
|
+
private
|
|
52
|
+
|
|
53
|
+
sig { returns(T::Array[Icon]) }
|
|
54
|
+
# Collects icons for every entry in FAVICON_SELECTORS, building each distinct
# link only once. The selectors overlap (rel="shortcut icon" matches both the
# "icon" and "shortcut" selectors, and typed links also match the type
# selectors), so without de-duplication the same link would produce several
# identical icons and repeat its dimension lookup.
def find_icons_from_selectors
  seen = {}
  FAVICON_SELECTORS.flat_map { |selector| process_selector(selector, seen) }
end

# Builds icons for the links matching `selector`, skipping any link whose
# href/sizes pair is already recorded in `seen`. `seen` is updated in place so
# one hash can be shared across selectors; it defaults to a fresh hash.
sig { params(selector: String, seen: T::Hash[T.untyped, T::Boolean]).returns(T::Array[Icon]) }
def process_selector(selector, seen = {})
  @doc.css(selector).filter_map do |link|
    key = [link['href'], link['sizes']]
    next if seen.key?(key)

    seen[key] = true
    build_icon_from_link(link)
  end
end
|
|
62
|
+
|
|
63
|
+
sig { params(link: T.untyped).returns(T.nilable(Icon)) }
|
|
64
|
+
# Converts one <link> element into an Icon: resolves its href to an absolute
# URL, then obtains dimensions and format from the respective extractors.
# Returns nil when the element has no href.
def build_icon_from_link(link)
  raw_href = link['href']
  return nil unless raw_href

  normalized_url = @url_normalizer.normalize(raw_href)
  dimensions = @dimensions_extractor.extract(link, normalized_url)
  format = FormatExtractor.extract(link)

  BrandLogo::Logging.logger.debug("Found icon: url=#{normalized_url} format=#{format} dimensions=#{dimensions}")

  Icon.new(url: normalized_url, dimensions: dimensions, format: format)
end
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
# typed: strict
|
|
3
|
+
|
|
4
|
+
require 'sorbet-runtime'
|
|
5
|
+
|
|
6
|
+
module BrandLogo
|
|
7
|
+
module Strategies
|
|
8
|
+
module Scraping
|
|
9
|
+
# Resolves favicon href values into absolute URLs.
|
|
10
|
+
# Pure string manipulation — no network calls (SRP).
|
|
11
|
+
# HTTP verification of the default favicon is handled by DefaultFaviconChecker.
|
|
12
|
+
class UrlNormalizer
|
|
13
|
+
extend T::Sig
|
|
14
|
+
|
|
15
|
+
sig { params(base_url: String).void }
|
|
16
|
+
# Stores the base URL that relative hrefs are resolved against.
def initialize(base_url)
  @base_url = T.let(base_url, String)
end
|
|
19
|
+
|
|
20
|
+
# Returns an absolute URL, resolving relative hrefs against the base URL.
|
|
21
|
+
sig { params(href: String).returns(String) }
|
|
22
|
+
# Hrefs that already start with http:// or https:// are returned untouched;
# anything else is joined with the base URL.
def normalize(href)
  absolute_url?(href) ? href : join_with_base_url(href)
end
|
|
27
|
+
|
|
28
|
+
# Returns the conventional favicon path for the domain.
|
|
29
|
+
sig { returns(String) }
|
|
30
|
+
# Appends Config::DEFAULT_FAVICON_PATH directly to the base URL.
def default_favicon_url
  [@base_url, Config::DEFAULT_FAVICON_PATH].join
end
|
|
33
|
+
|
|
34
|
+
private
|
|
35
|
+
|
|
36
|
+
sig { params(href: String).returns(T::Boolean) }
|
|
37
|
+
# True when `href` begins with a literal "http://" or "https://" prefix
# (the comparison is case-sensitive).
def absolute_url?(href)
  %w[http:// https://].any? { |prefix| href.start_with?(prefix) }
end
|
|
40
|
+
|
|
41
|
+
sig { params(href: String).returns(String) }
|
|
42
|
+
# Resolves `href` against the base URL with URI.join and returns the result
# as a string. When URI raises a URI::Error, the href is returned unchanged.
def join_with_base_url(href)
  begin
    resolved = URI.join(@base_url, href)
  rescue URI::Error
    return href
  end

  resolved.to_s
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
# typed: strict
|
|
3
|
+
|
|
4
|
+
require 'sorbet-runtime'
|
|
5
|
+
|
|
6
|
+
module BrandLogo
|
|
7
|
+
module Strategies
|
|
8
|
+
# Fetches brand logos by scraping the target website's HTML.
|
|
9
|
+
# Tries HTTPS (with and without www), then falls back to HTTP.
|
|
10
|
+
# Delegates HTML fetching, parsing, and image analysis to injected dependencies.
|
|
11
|
+
class ScrapingStrategy < BaseStrategy
|
|
12
|
+
extend T::Sig
|
|
13
|
+
|
|
14
|
+
sig do
|
|
15
|
+
params(
|
|
16
|
+
config: Config,
|
|
17
|
+
http_client: HttpClient,
|
|
18
|
+
html_parser: HtmlParser,
|
|
19
|
+
image_analyzer: ImageAnalyzer
|
|
20
|
+
).void
|
|
21
|
+
end
|
|
22
|
+
# Passes `config` to the base strategy and stores the injected HTTP client,
# HTML parser and image analyzer.
def initialize(config:, http_client:, html_parser:, image_analyzer:)
  super(config: config)

  @image_analyzer = T.let(image_analyzer, ImageAnalyzer)
  @html_parser = T.let(html_parser, HtmlParser)
  @http_client = T.let(http_client, HttpClient)
end
|
|
28
|
+
|
|
29
|
+
sig { override.params(domain: String).returns(T::Array[Icon]) }
|
|
30
|
+
# Fetches the site's HTML, parses it and hands it to an IconFinder wired with
# a DimensionsExtractor and DefaultFaviconChecker. Returns [] when no HTML
# could be fetched; any StandardError is logged and also yields [].
def fetch_all(domain)
  html, base_url = fetch_html_with_base_url(domain)
  return [] unless html

  Scraping::IconFinder.new(
    dimensions_extractor: Scraping::DimensionsExtractor.new(image_analyzer: @image_analyzer),
    default_favicon_checker: Scraping::DefaultFaviconChecker.new(http_client: @http_client),
    doc: @html_parser.parse(html),
    base_url: base_url
  ).find
rescue StandardError => e
  BrandLogo::Logging.logger.error("ScrapingStrategy error for #{domain}: #{e.message}")
  []
end
|
|
48
|
+
|
|
49
|
+
private
|
|
50
|
+
|
|
51
|
+
# Returns [html_body, base_url] for the first responding URL candidate, or nil.
|
|
52
|
+
sig { params(domain: String).returns(T.nilable([String, String])) }
|
|
53
|
+
# Walks the URL candidates in order and returns [html_body, url] for the
# first one that yields a body. Returns nil, after a debug log, when none do.
def fetch_html_with_base_url(domain)
  url_candidates(domain).each do |url|
    body = @http_client.get_body(url)

    if body
      BrandLogo::Logging.logger.debug("Fetched HTML from #{url}")
      return [body, url]
    end
  end

  BrandLogo::Logging.logger.debug("Could not fetch HTML for #{domain}")
  nil
end
|
|
65
|
+
|
|
66
|
+
# Ordered list of URLs to try: HTTPS without www, HTTPS with www, HTTP fallback.
|
|
67
|
+
sig { params(domain: String).returns(T::Array[String]) }
|
|
68
|
+
# Candidate URLs in the order they are tried: HTTPS on the bare domain,
# HTTPS on the `www.` host, then plain HTTP on the bare domain.
def url_candidates(domain)
  ['https://', 'https://www.', 'http://'].map { |prefix| "#{prefix}#{domain}" }
end
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
end
|
data/lib/brand_logo.rb
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
# typed: strict
|
|
3
|
+
|
|
4
|
+
require 'sorbet-runtime'
|
|
5
|
+
|
|
6
|
+
# Foundation
|
|
7
|
+
require_relative 'brand_logo/version'
|
|
8
|
+
require_relative 'brand_logo/errors'
|
|
9
|
+
require_relative 'brand_logo/config'
|
|
10
|
+
require_relative 'brand_logo/logging'
|
|
11
|
+
|
|
12
|
+
# Interfaces & implementations
|
|
13
|
+
require_relative 'brand_logo/http_client'
|
|
14
|
+
require_relative 'brand_logo/image_analyzer'
|
|
15
|
+
require_relative 'brand_logo/html_parser'
|
|
16
|
+
|
|
17
|
+
# Domain model
|
|
18
|
+
require_relative 'brand_logo/icon'
|
|
19
|
+
|
|
20
|
+
# Strategies — base must be loaded before subclasses
|
|
21
|
+
require_relative 'brand_logo/strategies/base_strategy'
|
|
22
|
+
|
|
23
|
+
# Scraping utilities (loaded before strategies that use them)
|
|
24
|
+
require_relative 'brand_logo/strategies/scraping/format_extractor'
|
|
25
|
+
require_relative 'brand_logo/strategies/scraping/url_normalizer'
|
|
26
|
+
require_relative 'brand_logo/strategies/scraping/dimensions_extractor'
|
|
27
|
+
require_relative 'brand_logo/strategies/scraping/default_favicon_checker'
|
|
28
|
+
require_relative 'brand_logo/strategies/scraping/icon_finder'
|
|
29
|
+
|
|
30
|
+
# Concrete strategies
|
|
31
|
+
require_relative 'brand_logo/strategies/scraping_strategy'
|
|
32
|
+
require_relative 'brand_logo/strategies/duckduckgo_strategy'
|
|
33
|
+
require_relative 'brand_logo/strategies/meta_tag_strategy'
|
|
34
|
+
require_relative 'brand_logo/strategies/manifest_strategy'
|
|
35
|
+
|
|
36
|
+
# Entry point
|
|
37
|
+
require_relative 'brand_logo/fetcher'
|