ficon 0.3 → 0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ficon/cache.rb +32 -1
- data/lib/ficon/version.rb +1 -1
- data/lib/ficon.rb +71 -21
- data/test/ficon_test.rb +51 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a9f5f2c6b36b5360a88f4eac14c07c96f7fefb9674fb6bed0bdf4a7be63ad2d0
|
4
|
+
data.tar.gz: f85066461f2384ed8e2ed3d302e5e3ffe537178dfaf40c128e056dd16c1e3027
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7ac7f80ea66b97249dccbfcdd4c7adf82b741125c23dbd92339346fb63e772bcfcea893279cca1b05b100946d567ee4c82bdfc9ba85dcd4fd10c881bef987ae9
|
7
|
+
data.tar.gz: 8c4421c19189b779f19aca1699521e600c25f4a17807dc22a546913a28e0d230263642d8610ef68f81d548f66be4cff1654dc9a12a644319c281432555428b4e
|
data/lib/ficon/cache.rb
CHANGED
@@ -40,6 +40,33 @@ class Ficon
|
|
40
40
|
db.execute("UPDATE urls SET not_before=? WHERE url=?", [_value, @url])
|
41
41
|
end
|
42
42
|
|
43
|
+
def status
|
44
|
+
db.execute("select status from urls where url=? limit 1", @url).first&.first
|
45
|
+
end
|
46
|
+
|
47
|
+
def status=(_value)
|
48
|
+
db.execute("INSERT OR IGNORE INTO urls (url, status) VALUES (?, ?)", [@url, _value])
|
49
|
+
db.execute("UPDATE urls SET status=? WHERE url=?", [_value, @url])
|
50
|
+
end
|
51
|
+
|
52
|
+
def retry_count
|
53
|
+
db.execute("select retry_count from urls where url=? limit 1", @url).first&.first || 0
|
54
|
+
end
|
55
|
+
|
56
|
+
def retry_count=(_value)
|
57
|
+
db.execute("INSERT OR IGNORE INTO urls (url, retry_count) VALUES (?, ?)", [@url, _value])
|
58
|
+
db.execute("UPDATE urls SET retry_count=? WHERE url=?", [_value, @url])
|
59
|
+
end
|
60
|
+
|
61
|
+
def last_attempt
|
62
|
+
db.execute("select last_attempt from urls where url=? limit 1", @url).first&.first
|
63
|
+
end
|
64
|
+
|
65
|
+
def last_attempt=(_value)
|
66
|
+
db.execute("INSERT OR IGNORE INTO urls (url, last_attempt) VALUES (?, ?)", [@url, _value])
|
67
|
+
db.execute("UPDATE urls SET last_attempt=? WHERE url=?", [_value, @url])
|
68
|
+
end
|
69
|
+
|
43
70
|
def self.db_file
|
44
71
|
if ENV["FICON_DB"].nil?
|
45
72
|
File.expand_path("~/.ficon.db")
|
@@ -49,8 +76,12 @@ class Ficon
|
|
49
76
|
end
|
50
77
|
|
51
78
|
def self.setup_cache(db)
|
52
|
-
db.execute("CREATE TABLE urls(url, etag, not_before, data)")
|
79
|
+
db.execute("CREATE TABLE urls(url, etag, not_before, data, status, retry_count, last_attempt)")
|
53
80
|
db.execute("CREATE UNIQUE INDEX `url` ON `urls` (`url`)")
|
54
81
|
end
|
82
|
+
|
83
|
+
def self.clear_cache
|
84
|
+
File.delete(db_file) if File.exist?(db_file)
|
85
|
+
end
|
55
86
|
end
|
56
87
|
end
|
data/lib/ficon/version.rb
CHANGED
data/lib/ficon.rb
CHANGED
@@ -2,6 +2,7 @@ require "net/http"
|
|
2
2
|
require "nokogiri"
|
3
3
|
require "uri"
|
4
4
|
require "addressable/uri"
|
5
|
+
require "resolv"
|
5
6
|
require "debug"
|
6
7
|
|
7
8
|
require_relative "ficon/version"
|
@@ -9,14 +10,21 @@ require_relative "ficon/image"
|
|
9
10
|
require_relative "ficon/cache"
|
10
11
|
|
11
12
|
class Ficon
|
12
|
-
attr_reader :site, :final_uri
|
13
|
+
attr_reader :site, :final_uri, :url_status
|
13
14
|
attr_accessor :user_agent
|
15
|
+
|
16
|
+
# URL health status constants
|
17
|
+
ALIVE = 'alive'
|
18
|
+
DEAD = 'dead'
|
19
|
+
SICK = 'sick'
|
20
|
+
BLOCKED = 'blocked'
|
14
21
|
|
15
22
|
def initialize(uri, user_agent: nil)
|
16
23
|
@uri = Addressable::URI.heuristic_parse(uri)
|
17
24
|
@final_uri = @uri
|
18
25
|
@site = {}
|
19
|
-
@
|
26
|
+
@url_status = nil
|
27
|
+
@user_agent = user_agent || "FiconBot/#{VERSION} (Ruby icon finder; https://github.com/dkam/ficon)"
|
20
28
|
process
|
21
29
|
end
|
22
30
|
|
@@ -56,9 +64,15 @@ class Ficon
|
|
56
64
|
end
|
57
65
|
|
58
66
|
def process
|
59
|
-
|
60
|
-
|
61
|
-
|
67
|
+
document = doc
|
68
|
+
if document
|
69
|
+
@site[:images] = self.class.site_images(@uri, document) || []
|
70
|
+
@site[:page_images] = self.class.page_images(@uri, document) || []
|
71
|
+
other_page_data(document)
|
72
|
+
else
|
73
|
+
@site[:images] = []
|
74
|
+
@site[:page_images] = []
|
75
|
+
end
|
62
76
|
nil
|
63
77
|
end
|
64
78
|
|
@@ -70,26 +84,27 @@ class Ficon
|
|
70
84
|
report_lines << "Page description: #{@site[:description]}"
|
71
85
|
report_lines << "Final URL: #{@final_uri}" if @final_uri.to_s != @uri.to_s
|
72
86
|
report_lines << "Canonical URL: #{@site[:canonical]}" if @site[:canonical]
|
87
|
+
report_lines << "URL Status: #{@url_status}" if @url_status
|
73
88
|
report_lines.join("\n") + "\n"
|
74
89
|
end
|
75
90
|
|
76
|
-
def site_icons = @site[:images]
|
77
|
-
|
78
|
-
def site_icon = site_icons&.first
|
91
|
+
def site_icons = @site[:images] || []
|
79
92
|
|
80
|
-
def page_images = @site[:page_images]
|
81
|
-
|
82
|
-
def page_image = page_images&.first
|
93
|
+
def page_images = @site[:page_images] || []
|
83
94
|
|
84
95
|
def title = @site[:title]
|
85
96
|
|
86
97
|
def description = @site[:description]
|
87
98
|
|
88
|
-
def
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
99
|
+
def self.clear_cache
|
100
|
+
Cache.clear_cache
|
101
|
+
end
|
102
|
+
|
103
|
+
def other_page_data(document)
|
104
|
+
@site[:title] = document.at_xpath("//meta[@property='og:title']/@content")&.value || document.at_xpath("//title")&.text&.strip
|
105
|
+
@site[:description] = document.at_xpath("//meta[@property='og:description']/@content")&.value
|
106
|
+
canonical = document.at_xpath("//link[@rel='canonical']/@href")&.value
|
107
|
+
@site[:canonical] = canonical unless canonical == @uri.to_s
|
93
108
|
end
|
94
109
|
|
95
110
|
def self.site_images(uri, doc)
|
@@ -124,12 +139,43 @@ class Ficon
|
|
124
139
|
parsed_candidate.to_s
|
125
140
|
end
|
126
141
|
|
142
|
+
def classify_response_status(response)
|
143
|
+
case response.code.to_i
|
144
|
+
when 200..299
|
145
|
+
ALIVE
|
146
|
+
when 404, 410
|
147
|
+
DEAD
|
148
|
+
when 401, 403, 429
|
149
|
+
BLOCKED
|
150
|
+
when 500..599
|
151
|
+
SICK
|
152
|
+
else
|
153
|
+
SICK
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
def classify_exception_status(exception)
|
158
|
+
case exception
|
159
|
+
when SocketError, Resolv::ResolvError
|
160
|
+
DEAD # DNS resolution failures
|
161
|
+
when Net::HTTPError, Timeout::Error, Errno::ECONNREFUSED
|
162
|
+
SICK # Network issues worth retrying
|
163
|
+
when OpenSSL::SSL::SSLError
|
164
|
+
SICK # SSL certificate errors
|
165
|
+
else
|
166
|
+
SICK # Default to retryable for unknown errors
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
127
170
|
private
|
128
171
|
|
129
172
|
def fetch_url(uri, redirect_limit = 5)
|
130
173
|
uri = URI(uri) unless uri.is_a?(URI)
|
131
|
-
|
132
|
-
|
174
|
+
|
175
|
+
if redirect_limit <= 0
|
176
|
+
@url_status = DEAD
|
177
|
+
raise "Too many redirects"
|
178
|
+
end
|
133
179
|
|
134
180
|
Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
|
135
181
|
http.read_timeout = 10
|
@@ -137,7 +183,10 @@ class Ficon
|
|
137
183
|
request = Net::HTTP::Get.new(uri)
|
138
184
|
request["User-Agent"] = @user_agent
|
139
185
|
response = http.request(request)
|
140
|
-
|
186
|
+
|
187
|
+
# Set status based on response
|
188
|
+
@url_status = classify_response_status(response)
|
189
|
+
|
141
190
|
case response
|
142
191
|
when Net::HTTPRedirection
|
143
192
|
location = response["location"]
|
@@ -149,10 +198,11 @@ class Ficon
|
|
149
198
|
else
|
150
199
|
@final_uri = Addressable::URI.parse(uri.to_s)
|
151
200
|
end
|
152
|
-
|
201
|
+
|
153
202
|
response
|
154
203
|
end
|
155
|
-
rescue
|
204
|
+
rescue => e
|
205
|
+
@url_status = classify_exception_status(e)
|
156
206
|
puts "Failed to fetch #{uri}: #{e.inspect}"
|
157
207
|
nil
|
158
208
|
end
|
data/test/ficon_test.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
#require 'rubygems'
|
2
2
|
require 'debug'
|
3
|
+
require 'resolv'
|
3
4
|
|
4
5
|
require "minitest/autorun"
|
5
6
|
|
@@ -58,7 +59,7 @@ class FiconTest < Minitest::Test
|
|
58
59
|
def test_custom_user_agent
|
59
60
|
# Test default user agent
|
60
61
|
ficon_default = Ficon.new('https://example.com')
|
61
|
-
assert_match(/^
|
62
|
+
assert_match(/^FiconBot\/0\.\d+/, ficon_default.user_agent)
|
62
63
|
|
63
64
|
# Test custom user agent
|
64
65
|
custom_agent = 'MyApp/1.0 (Custom Bot)'
|
@@ -69,4 +70,53 @@ class FiconTest < Minitest::Test
|
|
69
70
|
ficon_custom.user_agent = 'Changed/2.0'
|
70
71
|
assert_equal 'Changed/2.0', ficon_custom.user_agent
|
71
72
|
end
|
73
|
+
|
74
|
+
def test_response_status_classification
|
75
|
+
ficon = Ficon.new('https://example.com')
|
76
|
+
|
77
|
+
# Test ALIVE status (2xx)
|
78
|
+
assert_equal Ficon::ALIVE, ficon.classify_response_status(mock_response(200))
|
79
|
+
assert_equal Ficon::ALIVE, ficon.classify_response_status(mock_response(201))
|
80
|
+
assert_equal Ficon::ALIVE, ficon.classify_response_status(mock_response(299))
|
81
|
+
|
82
|
+
# Test DEAD status (404, 410)
|
83
|
+
assert_equal Ficon::DEAD, ficon.classify_response_status(mock_response(404))
|
84
|
+
assert_equal Ficon::DEAD, ficon.classify_response_status(mock_response(410))
|
85
|
+
|
86
|
+
# Test BLOCKED status (401, 403, 429)
|
87
|
+
assert_equal Ficon::BLOCKED, ficon.classify_response_status(mock_response(401))
|
88
|
+
assert_equal Ficon::BLOCKED, ficon.classify_response_status(mock_response(403))
|
89
|
+
assert_equal Ficon::BLOCKED, ficon.classify_response_status(mock_response(429))
|
90
|
+
|
91
|
+
# Test SICK status (5xx and others)
|
92
|
+
assert_equal Ficon::SICK, ficon.classify_response_status(mock_response(500))
|
93
|
+
assert_equal Ficon::SICK, ficon.classify_response_status(mock_response(502))
|
94
|
+
assert_equal Ficon::SICK, ficon.classify_response_status(mock_response(503))
|
95
|
+
assert_equal Ficon::SICK, ficon.classify_response_status(mock_response(300)) # Other codes default to SICK
|
96
|
+
end
|
97
|
+
|
98
|
+
def test_exception_status_classification
|
99
|
+
ficon = Ficon.new('https://example.com')
|
100
|
+
|
101
|
+
# Test DEAD status (DNS and resolution errors)
|
102
|
+
assert_equal Ficon::DEAD, ficon.classify_exception_status(SocketError.new)
|
103
|
+
assert_equal Ficon::DEAD, ficon.classify_exception_status(Resolv::ResolvError.new)
|
104
|
+
|
105
|
+
# Test SICK status (network and timeout errors)
|
106
|
+
assert_equal Ficon::SICK, ficon.classify_exception_status(Timeout::Error.new)
|
107
|
+
assert_equal Ficon::SICK, ficon.classify_exception_status(Errno::ECONNREFUSED.new)
|
108
|
+
assert_equal Ficon::SICK, ficon.classify_exception_status(OpenSSL::SSL::SSLError.new)
|
109
|
+
assert_equal Ficon::SICK, ficon.classify_exception_status(Net::HTTPError.new('error', nil))
|
110
|
+
|
111
|
+
# Test default to SICK for unknown exceptions
|
112
|
+
assert_equal Ficon::SICK, ficon.classify_exception_status(StandardError.new)
|
113
|
+
end
|
114
|
+
|
115
|
+
private
|
116
|
+
|
117
|
+
def mock_response(code)
|
118
|
+
response = Object.new
|
119
|
+
response.define_singleton_method(:code) { code }
|
120
|
+
response
|
121
|
+
end
|
72
122
|
end
|