ficon 0.2 → 0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ficon/cache.rb +32 -1
- data/lib/ficon/version.rb +1 -1
- data/lib/ficon.rb +92 -28
- data/test/ficon_test.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4e8577eb04ba2dceefb974b436520ff73b8c37f6beb04c85ac62f02e774bbc94
|
4
|
+
data.tar.gz: 4196a3eb41905e40f0285eb132b3ae4d9e9c2c00f002584f6dc0b667893842db
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: afbea646a672f8654bd8ba76a628c373ca1c40dbf051af8937656f755f65872031a6fc88e6a223914469a5baf10e303fc47e770ddb3abbf70305c1e08ebe1714
|
7
|
+
data.tar.gz: b08f3e0d30b5f93f503d03b5222938ab37e7c5fb423b5cbcf2df3fe217ed791aec4dc8cb8a32e540e41cd9bf06d7ea62b3bc062b159aa64452caa992c069de29
|
data/lib/ficon/cache.rb
CHANGED
@@ -40,6 +40,33 @@ class Ficon
|
|
40
40
|
db.execute("UPDATE urls SET not_before=? WHERE url=?", [_value, @url])
|
41
41
|
end
|
42
42
|
|
43
|
+
# Returns the stored health status for this entry's URL.
#
# Reads the `status` column of the `urls` row whose `url` equals @url.
#
# @return [Object, nil] the stored value, or nil when no row matches
def status
  # Bind values are passed as an array, the same form the writers in this
  # class use.
  db.execute("select status from urls where url=? limit 1", [@url]).first&.first
end
|
46
|
+
|
47
|
+
# Stores the health status for this entry's URL.
#
# Runs two statements: an INSERT OR IGNORE that creates the row when it is
# missing, then an UPDATE that sets the column on the (now existing) row.
#
# @param _value [Object] value written to the `status` column
def status=(_value)
  # NOTE(review): OR IGNORE only skips the insert if `url` has a uniqueness
  # constraint; setup_cache creates a unique index on it.
  db.execute("INSERT OR IGNORE INTO urls (url, status) VALUES (?, ?)", [@url, _value])
  db.execute("UPDATE urls SET status=? WHERE url=?", [_value, @url])
end
|
51
|
+
|
52
|
+
# Returns the stored retry counter for this entry's URL.
#
# @return [Object] the `retry_count` column value, or 0 when no row matches
#   or the stored value is NULL
def retry_count
  # Bind values are passed as an array, the same form the writers in this
  # class use.
  db.execute("select retry_count from urls where url=? limit 1", [@url]).first&.first || 0
end
|
55
|
+
|
56
|
+
# Stores the retry counter for this entry's URL.
#
# Runs two statements: an INSERT OR IGNORE that creates the row when it is
# missing, then an UPDATE that sets the column on the (now existing) row.
#
# @param _value [Object] value written to the `retry_count` column
def retry_count=(_value)
  db.execute("INSERT OR IGNORE INTO urls (url, retry_count) VALUES (?, ?)", [@url, _value])
  db.execute("UPDATE urls SET retry_count=? WHERE url=?", [_value, @url])
end
|
60
|
+
|
61
|
+
# Returns the stored last-attempt value for this entry's URL.
#
# @return [Object, nil] the `last_attempt` column value, or nil when no row
#   matches
def last_attempt
  # Bind values are passed as an array, the same form the writers in this
  # class use.
  db.execute("select last_attempt from urls where url=? limit 1", [@url]).first&.first
end
|
64
|
+
|
65
|
+
# Stores the last-attempt value for this entry's URL.
#
# Runs two statements: an INSERT OR IGNORE that creates the row when it is
# missing, then an UPDATE that sets the column on the (now existing) row.
#
# @param _value [Object] value written to the `last_attempt` column
def last_attempt=(_value)
  db.execute("INSERT OR IGNORE INTO urls (url, last_attempt) VALUES (?, ?)", [@url, _value])
  db.execute("UPDATE urls SET last_attempt=? WHERE url=?", [_value, @url])
end
|
69
|
+
|
43
70
|
def self.db_file
|
44
71
|
if ENV["FICON_DB"].nil?
|
45
72
|
File.expand_path("~/.ficon.db")
|
@@ -49,8 +76,12 @@ class Ficon
|
|
49
76
|
end
|
50
77
|
|
51
78
|
# Creates the cache schema on the given database handle.
#
# Creates the `urls` table with the columns url, etag, not_before, data,
# status, retry_count and last_attempt, plus a unique index on `url`.
#
# NOTE(review): both statements are plain CREATEs, so this presumably must
# only run against a database that has no `urls` table yet; a database that
# already holds the earlier four-column table is not migrated here.
#
# @param db [#execute] database handle the statements are run on
def self.setup_cache(db)
  db.execute("CREATE TABLE urls(url, etag, not_before, data, status, retry_count, last_attempt)")
  db.execute("CREATE UNIQUE INDEX `url` ON `urls` (`url`)")
end
|
82
|
+
|
83
|
+
# Deletes the cache database file, if there is one.
#
# The file is deleted directly and a missing file is tolerated, instead of
# checking File.exist? first: that avoids a window between the check and the
# delete, and resolves db_file only once.
#
# @return [Integer, nil] File.delete's result when a file was removed, nil
#   when there was no file
def self.clear_cache
  File.delete(db_file)
rescue Errno::ENOENT
  nil
end
|
55
86
|
end
|
56
87
|
end
|
data/lib/ficon/version.rb
CHANGED
data/lib/ficon.rb
CHANGED
@@ -2,6 +2,7 @@ require "net/http"
|
|
2
2
|
require "nokogiri"
|
3
3
|
require "uri"
|
4
4
|
require "addressable/uri"
|
5
|
+
require "resolv"
|
5
6
|
require "debug"
|
6
7
|
|
7
8
|
require_relative "ficon/version"
|
@@ -9,25 +10,35 @@ require_relative "ficon/image"
|
|
9
10
|
require_relative "ficon/cache"
|
10
11
|
|
11
12
|
class Ficon
|
12
|
-
attr_reader :site
|
13
|
+
attr_reader :site, :final_uri, :url_status
|
13
14
|
attr_accessor :user_agent
|
15
|
+
|
16
|
+
# URL health status constants
|
17
|
+
ALIVE = 'alive'
|
18
|
+
DEAD = 'dead'
|
19
|
+
SICK = 'sick'
|
20
|
+
BLOCKED = 'blocked'
|
14
21
|
|
15
22
|
# Builds a Ficon for the given address and immediately processes it.
#
# @param uri [String] address to inspect; parsed with
#   Addressable::URI.heuristic_parse
# @param user_agent [String, nil] User-Agent header value; when nil a
#   default "FiconBot/<VERSION> ..." string is used
def initialize(uri, user_agent: nil)
  @uri = Addressable::URI.heuristic_parse(uri)
  # Starts out equal to the requested URI; fetch_url reassigns it.
  @final_uri = @uri
  @site = {}
  @url_status = nil
  @user_agent = user_agent || "FiconBot/#{VERSION} (Ruby icon finder; https://github.com/dkam/ficon)"
  # `process` is defined outside this excerpt.
  process
end
|
21
30
|
|
22
31
|
def doc
|
23
|
-
|
32
|
+
# First try to fetch to determine final URL
|
33
|
+
response = fetch_url(@uri) unless @data
|
34
|
+
return nil if response.nil? && @data.nil?
|
24
35
|
|
25
|
-
|
36
|
+
# Use final URL for caching
|
37
|
+
cache = Cache.new(@final_uri)
|
26
38
|
|
27
|
-
|
28
|
-
response = fetch_url(@uri)
|
29
|
-
return nil unless response
|
39
|
+
@data ||= cache.data
|
30
40
|
|
41
|
+
if @data.nil? && response
|
31
42
|
@data = response.body.force_encoding("UTF-8")
|
32
43
|
cache.data = @data
|
33
44
|
cache.etag = response["etag"] if response["etag"]
|
@@ -60,29 +71,31 @@ class Ficon
|
|
60
71
|
end
|
61
72
|
|
62
73
|
# Builds a plain-text summary of what was found for the page.
#
# Always includes the site icon, page icon, title and description lines.
# The final URL line is added only when it differs from the requested URI;
# the canonical URL and URL status lines only when those values are set.
#
# @return [String] newline-separated report ending with a trailing newline
def report
  report_lines = [
    # &.first: the image lists may be absent from @site, in which case the
    # line is printed with an empty value instead of raising.
    "Site icon: #{@site[:images]&.first}",
    "Page icon: #{@site[:page_images]&.first}",
    "Page title: #{@site[:title]}",
    "Page description: #{@site[:description]}"
  ]
  report_lines << "Final URL: #{@final_uri}" if @final_uri.to_s != @uri.to_s
  report_lines << "Canonical URL: #{@site[:canonical]}" if @site[:canonical]
  report_lines << "URL Status: #{@url_status}" if @url_status
  report_lines.join("\n") + "\n"
end
|
71
84
|
|
72
|
-
def site_icons
|
73
|
-
@site[:images]
|
74
|
-
end
|
85
|
+
# Readers over the @site hash.

# @return [Object, nil] value stored under @site[:images]
def site_icons = @site[:images]

# @return [Object, nil] first entry of site_icons, or nil when it is nil
def site_icon = site_icons&.first

# @return [Object, nil] value stored under @site[:page_images]
def page_images = @site[:page_images]

# @return [Object, nil] first entry of page_images, or nil when it is nil
def page_image = page_images&.first

# @return [Object, nil] value stored under @site[:title]
def title = @site[:title]

# @return [Object, nil] value stored under @site[:description]
def description = @site[:description]
|
96
|
+
|
97
|
+
# Clears the cache by delegating to Cache.clear_cache.
#
# @return [Object] whatever Cache.clear_cache returns
def self.clear_cache
  Cache.clear_cache
end
|
87
100
|
|
88
101
|
def other_page_data
|
@@ -126,18 +139,69 @@ class Ficon
|
|
126
139
|
|
127
140
|
private
|
128
141
|
|
129
|
-
def fetch_url(uri)
|
142
|
+
# Fetches +uri+ with an HTTP GET, following redirects.
#
# Side effects: sets @url_status from the response (or from the exception on
# failure) and sets @final_uri to the last URI reached.
#
# @param uri [URI, String, #to_str] address to fetch
# @param redirect_limit [Integer] how many more redirects may be followed
# @return [Net::HTTPResponse, nil] the final response, or nil when the fetch
#   failed or the redirect limit was exhausted (a "Failed to fetch" line is
#   printed in that case)
def fetch_url(uri, redirect_limit = 5)
  uri = URI(uri) unless uri.is_a?(URI)

  if redirect_limit <= 0
    # Handled without raising: sending this through the rescue below would
    # reclassify the failure and overwrite DEAD.
    @url_status = DEAD
    puts "Failed to fetch #{uri}: #{RuntimeError.new("Too many redirects").inspect}"
    return nil
  end

  # Timeouts are given to start itself: the connection is opened before the
  # block runs, so an open_timeout assigned inside the block comes too late.
  response = Net::HTTP.start(uri.host, uri.port,
                             use_ssl: uri.scheme == "https",
                             open_timeout: 5,
                             read_timeout: 10) do |http|
    request = Net::HTTP::Get.new(uri)
    request["User-Agent"] = @user_agent
    http.request(request)
  end

  # Set status based on response
  @url_status = classify_response_status(response)

  if response.is_a?(Net::HTTPRedirection)
    location = response["location"]
    if location
      # Location may be relative; resolve it against the current URI.
      new_uri = URI.join(uri.to_s, location)
      @final_uri = Addressable::URI.parse(new_uri.to_s)
      return fetch_url(new_uri, redirect_limit - 1)
    end
    # A redirect without a Location header is returned as-is and leaves
    # @final_uri untouched.
  else
    @final_uri = Addressable::URI.parse(uri.to_s)
  end

  response
rescue => e
  @url_status = classify_exception_status(e)
  puts "Failed to fetch #{uri}: #{e.inspect}"
  nil
end
|
179
|
+
|
180
|
+
# Maps an HTTP response to one of the URL health status constants.
#
#   2xx            -> ALIVE
#   404, 410       -> DEAD
#   401, 403, 429  -> BLOCKED
#   5xx            -> SICK
#   anything else  -> SICK
#
# @param response [#code] response whose status code is read via code.to_i
# @return [String] ALIVE, DEAD, BLOCKED or SICK
def classify_response_status(response)
  case response.code.to_i
  when 200..299
    ALIVE
  when 404, 410
    DEAD
  when 401, 403, 429
    BLOCKED
  when 500..599
    SICK
  else
    SICK
  end
end
|
194
|
+
|
195
|
+
# Maps an exception raised while fetching to a URL health status constant.
#
# @param exception [Exception] the error that was rescued
# @return [String] DEAD for name-resolution failures, SICK for everything
#   else
def classify_exception_status(exception)
  case exception
  # Resolv's failure class is Resolv::ResolvError. Naming a constant that
  # does not exist here raises NameError as soon as the first candidate in
  # this `when` list fails to match.
  when SocketError, Resolv::ResolvError
    DEAD # DNS resolution failures
  when Net::HTTPError, Timeout::Error, Errno::ECONNREFUSED
    SICK # Network issues worth retrying
  when OpenSSL::SSL::SSLError
    SICK # SSL certificate errors
  else
    SICK # Default to retryable for unknown errors
  end
end
|
143
207
|
end
|
data/test/ficon_test.rb
CHANGED
@@ -58,7 +58,7 @@ class FiconTest < Minitest::Test
|
|
58
58
|
def test_custom_user_agent
|
59
59
|
# Test default user agent
|
60
60
|
ficon_default = Ficon.new('https://example.com')
|
61
|
-
assert_match(/^
|
61
|
+
assert_match(/^FiconBot\/0\.\d+/, ficon_default.user_agent)
|
62
62
|
|
63
63
|
# Test custom user agent
|
64
64
|
custom_agent = 'MyApp/1.0 (Custom Bot)'
|