ficon 0.3 → 0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ficon/cache.rb +32 -1
- data/lib/ficon/version.rb +1 -1
- data/lib/ficon.rb +55 -7
- data/test/ficon_test.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4e8577eb04ba2dceefb974b436520ff73b8c37f6beb04c85ac62f02e774bbc94
|
4
|
+
data.tar.gz: 4196a3eb41905e40f0285eb132b3ae4d9e9c2c00f002584f6dc0b667893842db
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: afbea646a672f8654bd8ba76a628c373ca1c40dbf051af8937656f755f65872031a6fc88e6a223914469a5baf10e303fc47e770ddb3abbf70305c1e08ebe1714
|
7
|
+
data.tar.gz: b08f3e0d30b5f93f503d03b5222938ab37e7c5fb423b5cbcf2df3fe217ed791aec4dc8cb8a32e540e41cd9bf06d7ea62b3bc062b159aa64452caa992c069de29
|
data/lib/ficon/cache.rb
CHANGED
@@ -40,6 +40,33 @@ class Ficon
|
|
40
40
|
db.execute("UPDATE urls SET not_before=? WHERE url=?", [_value, @url])
|
41
41
|
end
|
42
42
|
|
43
|
+
def status
|
44
|
+
db.execute("select status from urls where url=? limit 1", @url).first&.first
|
45
|
+
end
|
46
|
+
|
47
|
+
def status=(_value)
|
48
|
+
db.execute("INSERT OR IGNORE INTO urls (url, status) VALUES (?, ?)", [@url, _value])
|
49
|
+
db.execute("UPDATE urls SET status=? WHERE url=?", [_value, @url])
|
50
|
+
end
|
51
|
+
|
52
|
+
def retry_count
|
53
|
+
db.execute("select retry_count from urls where url=? limit 1", @url).first&.first || 0
|
54
|
+
end
|
55
|
+
|
56
|
+
def retry_count=(_value)
|
57
|
+
db.execute("INSERT OR IGNORE INTO urls (url, retry_count) VALUES (?, ?)", [@url, _value])
|
58
|
+
db.execute("UPDATE urls SET retry_count=? WHERE url=?", [_value, @url])
|
59
|
+
end
|
60
|
+
|
61
|
+
def last_attempt
|
62
|
+
db.execute("select last_attempt from urls where url=? limit 1", @url).first&.first
|
63
|
+
end
|
64
|
+
|
65
|
+
def last_attempt=(_value)
|
66
|
+
db.execute("INSERT OR IGNORE INTO urls (url, last_attempt) VALUES (?, ?)", [@url, _value])
|
67
|
+
db.execute("UPDATE urls SET last_attempt=? WHERE url=?", [_value, @url])
|
68
|
+
end
|
69
|
+
|
43
70
|
def self.db_file
|
44
71
|
if ENV["FICON_DB"].nil?
|
45
72
|
File.expand_path("~/.ficon.db")
|
@@ -49,8 +76,12 @@ class Ficon
|
|
49
76
|
end
|
50
77
|
|
51
78
|
def self.setup_cache(db)
|
52
|
-
db.execute("CREATE TABLE urls(url, etag, not_before, data)")
|
79
|
+
db.execute("CREATE TABLE urls(url, etag, not_before, data, status, retry_count, last_attempt)")
|
53
80
|
db.execute("CREATE UNIQUE INDEX `url` ON `urls` (`url`)")
|
54
81
|
end
|
82
|
+
|
83
|
+
def self.clear_cache
|
84
|
+
File.delete(db_file) if File.exist?(db_file)
|
85
|
+
end
|
55
86
|
end
|
56
87
|
end
|
data/lib/ficon/version.rb
CHANGED
data/lib/ficon.rb
CHANGED
@@ -2,6 +2,7 @@ require "net/http"
|
|
2
2
|
require "nokogiri"
|
3
3
|
require "uri"
|
4
4
|
require "addressable/uri"
|
5
|
+
require "resolv"
|
5
6
|
require "debug"
|
6
7
|
|
7
8
|
require_relative "ficon/version"
|
@@ -9,14 +10,21 @@ require_relative "ficon/image"
|
|
9
10
|
require_relative "ficon/cache"
|
10
11
|
|
11
12
|
class Ficon
|
12
|
-
attr_reader :site, :final_uri
|
13
|
+
attr_reader :site, :final_uri, :url_status
|
13
14
|
attr_accessor :user_agent
|
15
|
+
|
16
|
+
# URL health status constants
|
17
|
+
ALIVE = 'alive'
|
18
|
+
DEAD = 'dead'
|
19
|
+
SICK = 'sick'
|
20
|
+
BLOCKED = 'blocked'
|
14
21
|
|
15
22
|
def initialize(uri, user_agent: nil)
|
16
23
|
@uri = Addressable::URI.heuristic_parse(uri)
|
17
24
|
@final_uri = @uri
|
18
25
|
@site = {}
|
19
|
-
@
|
26
|
+
@url_status = nil
|
27
|
+
@user_agent = user_agent || "FiconBot/#{VERSION} (Ruby icon finder; https://github.com/dkam/ficon)"
|
20
28
|
process
|
21
29
|
end
|
22
30
|
|
@@ -70,6 +78,7 @@ class Ficon
|
|
70
78
|
report_lines << "Page description: #{@site[:description]}"
|
71
79
|
report_lines << "Final URL: #{@final_uri}" if @final_uri.to_s != @uri.to_s
|
72
80
|
report_lines << "Canonical URL: #{@site[:canonical]}" if @site[:canonical]
|
81
|
+
report_lines << "URL Status: #{@url_status}" if @url_status
|
73
82
|
report_lines.join("\n") + "\n"
|
74
83
|
end
|
75
84
|
|
@@ -85,6 +94,10 @@ class Ficon
|
|
85
94
|
|
86
95
|
def description = @site[:description]
|
87
96
|
|
97
|
+
def self.clear_cache
|
98
|
+
Cache.clear_cache
|
99
|
+
end
|
100
|
+
|
88
101
|
def other_page_data
|
89
102
|
@site[:title] = doc.at_xpath("//meta[@property='og:title']/@content")&.value || @doc.at_xpath("//title")&.text&.strip
|
90
103
|
@site[:description] = doc.at_xpath("//meta[@property='og:description']/@content")&.value
|
@@ -128,8 +141,11 @@ class Ficon
|
|
128
141
|
|
129
142
|
def fetch_url(uri, redirect_limit = 5)
|
130
143
|
uri = URI(uri) unless uri.is_a?(URI)
|
131
|
-
|
132
|
-
|
144
|
+
|
145
|
+
if redirect_limit <= 0
|
146
|
+
@url_status = DEAD
|
147
|
+
raise "Too many redirects"
|
148
|
+
end
|
133
149
|
|
134
150
|
Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
|
135
151
|
http.read_timeout = 10
|
@@ -137,7 +153,10 @@ class Ficon
|
|
137
153
|
request = Net::HTTP::Get.new(uri)
|
138
154
|
request["User-Agent"] = @user_agent
|
139
155
|
response = http.request(request)
|
140
|
-
|
156
|
+
|
157
|
+
# Set status based on response
|
158
|
+
@url_status = classify_response_status(response)
|
159
|
+
|
141
160
|
case response
|
142
161
|
when Net::HTTPRedirection
|
143
162
|
location = response["location"]
|
@@ -149,11 +168,40 @@ class Ficon
|
|
149
168
|
else
|
150
169
|
@final_uri = Addressable::URI.parse(uri.to_s)
|
151
170
|
end
|
152
|
-
|
171
|
+
|
153
172
|
response
|
154
173
|
end
|
155
|
-
rescue
|
174
|
+
rescue => e
|
175
|
+
@url_status = classify_exception_status(e)
|
156
176
|
puts "Failed to fetch #{uri}: #{e.inspect}"
|
157
177
|
nil
|
158
178
|
end
|
179
|
+
|
180
|
+
def classify_response_status(response)
|
181
|
+
case response.code.to_i
|
182
|
+
when 200..299
|
183
|
+
ALIVE
|
184
|
+
when 404, 410
|
185
|
+
DEAD
|
186
|
+
when 401, 403, 429
|
187
|
+
BLOCKED
|
188
|
+
when 500..599
|
189
|
+
SICK
|
190
|
+
else
|
191
|
+
SICK
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
195
|
+
def classify_exception_status(exception)
|
196
|
+
case exception
|
197
|
+
when SocketError, Resolv::ResolutionError
|
198
|
+
DEAD # DNS resolution failures
|
199
|
+
when Net::HTTPError, Timeout::Error, Errno::ECONNREFUSED
|
200
|
+
SICK # Network issues worth retrying
|
201
|
+
when OpenSSL::SSL::SSLError
|
202
|
+
SICK # SSL certificate errors
|
203
|
+
else
|
204
|
+
SICK # Default to retryable for unknown errors
|
205
|
+
end
|
206
|
+
end
|
159
207
|
end
|
data/test/ficon_test.rb
CHANGED
@@ -58,7 +58,7 @@ class FiconTest < Minitest::Test
|
|
58
58
|
def test_custom_user_agent
|
59
59
|
# Test default user agent
|
60
60
|
ficon_default = Ficon.new('https://example.com')
|
61
|
-
assert_match(/^
|
61
|
+
assert_match(/^FiconBot\/0\.\d+/, ficon_default.user_agent)
|
62
62
|
|
63
63
|
# Test custom user agent
|
64
64
|
custom_agent = 'MyApp/1.0 (Custom Bot)'
|