ficon 0.3 → 0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6b300144548c846bb2d309ca04ca78198f9e37682e50f08ac2f60e95f54e84f5
4
- data.tar.gz: 90e5b7804b8e6c44e8c361c585e03a2553724322cebf84846c66c73c1652a234
3
+ metadata.gz: a9f5f2c6b36b5360a88f4eac14c07c96f7fefb9674fb6bed0bdf4a7be63ad2d0
4
+ data.tar.gz: f85066461f2384ed8e2ed3d302e5e3ffe537178dfaf40c128e056dd16c1e3027
5
5
  SHA512:
6
- metadata.gz: 6781246417b9d5cc5e1791acae5232dcd13ea9a934f0f977929fd57a5fa348dff96143b2acf327bd3ea7626e0eea65c92b585f4bd544c3fcd7764de76eee121b
7
- data.tar.gz: 0a2b83a50bdbc340e8877028f77f99de6ca23e9bdfa15b237f0d47390394ba3f9165ebfdf2a625bffdbebca40789cac8ce2cc3b6d9c6cf11c79ae2f23f8aca19
6
+ metadata.gz: 7ac7f80ea66b97249dccbfcdd4c7adf82b741125c23dbd92339346fb63e772bcfcea893279cca1b05b100946d567ee4c82bdfc9ba85dcd4fd10c881bef987ae9
7
+ data.tar.gz: 8c4421c19189b779f19aca1699521e600c25f4a17807dc22a546913a28e0d230263642d8610ef68f81d548f66be4cff1654dc9a12a644319c281432555428b4e
data/lib/ficon/cache.rb CHANGED
@@ -40,6 +40,33 @@ class Ficon
40
40
  db.execute("UPDATE urls SET not_before=? WHERE url=?", [_value, @url])
41
41
  end
42
42
 
43
+ def status
44
+ db.execute("select status from urls where url=? limit 1", @url).first&.first
45
+ end
46
+
47
+ def status=(_value)
48
+ db.execute("INSERT OR IGNORE INTO urls (url, status) VALUES (?, ?)", [@url, _value])
49
+ db.execute("UPDATE urls SET status=? WHERE url=?", [_value, @url])
50
+ end
51
+
52
+ def retry_count
53
+ db.execute("select retry_count from urls where url=? limit 1", @url).first&.first || 0
54
+ end
55
+
56
+ def retry_count=(_value)
57
+ db.execute("INSERT OR IGNORE INTO urls (url, retry_count) VALUES (?, ?)", [@url, _value])
58
+ db.execute("UPDATE urls SET retry_count=? WHERE url=?", [_value, @url])
59
+ end
60
+
61
+ def last_attempt
62
+ db.execute("select last_attempt from urls where url=? limit 1", @url).first&.first
63
+ end
64
+
65
+ def last_attempt=(_value)
66
+ db.execute("INSERT OR IGNORE INTO urls (url, last_attempt) VALUES (?, ?)", [@url, _value])
67
+ db.execute("UPDATE urls SET last_attempt=? WHERE url=?", [_value, @url])
68
+ end
69
+
43
70
  def self.db_file
44
71
  if ENV["FICON_DB"].nil?
45
72
  File.expand_path("~/.ficon.db")
@@ -49,8 +76,12 @@ class Ficon
49
76
  end
50
77
 
51
78
  def self.setup_cache(db)
52
- db.execute("CREATE TABLE urls(url, etag, not_before, data)")
79
+ db.execute("CREATE TABLE urls(url, etag, not_before, data, status, retry_count, last_attempt)")
53
80
  db.execute("CREATE UNIQUE INDEX `url` ON `urls` (`url`)")
54
81
  end
82
+
83
+ def self.clear_cache
84
+ File.delete(db_file) if File.exist?(db_file)
85
+ end
55
86
  end
56
87
  end
data/lib/ficon/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  class Ficon
2
- VERSION = "0.3"
2
+ VERSION = "0.5"
3
3
  end
data/lib/ficon.rb CHANGED
@@ -2,6 +2,7 @@ require "net/http"
2
2
  require "nokogiri"
3
3
  require "uri"
4
4
  require "addressable/uri"
5
+ require "resolv"
5
6
  require "debug"
6
7
 
7
8
  require_relative "ficon/version"
@@ -9,14 +10,21 @@ require_relative "ficon/image"
9
10
  require_relative "ficon/cache"
10
11
 
11
12
  class Ficon
12
- attr_reader :site, :final_uri
13
+ attr_reader :site, :final_uri, :url_status
13
14
  attr_accessor :user_agent
15
+
16
+ # URL health status constants
17
+ ALIVE = 'alive'
18
+ DEAD = 'dead'
19
+ SICK = 'sick'
20
+ BLOCKED = 'blocked'
14
21
 
15
22
  def initialize(uri, user_agent: nil)
16
23
  @uri = Addressable::URI.heuristic_parse(uri)
17
24
  @final_uri = @uri
18
25
  @site = {}
19
- @user_agent = user_agent || "Ficon/#{VERSION} (Ruby icon finder; https://github.com/dkam/ficon)"
26
+ @url_status = nil
27
+ @user_agent = user_agent || "FiconBot/#{VERSION} (Ruby icon finder; https://github.com/dkam/ficon)"
20
28
  process
21
29
  end
22
30
 
@@ -56,9 +64,15 @@ class Ficon
56
64
  end
57
65
 
58
66
  def process
59
- @site[:images] = self.class.site_images(@uri, doc) || []
60
- @site[:page_images] = self.class.page_images(@uri, doc) || []
61
- other_page_data
67
+ document = doc
68
+ if document
69
+ @site[:images] = self.class.site_images(@uri, document) || []
70
+ @site[:page_images] = self.class.page_images(@uri, document) || []
71
+ other_page_data(document)
72
+ else
73
+ @site[:images] = []
74
+ @site[:page_images] = []
75
+ end
62
76
  nil
63
77
  end
64
78
 
@@ -70,26 +84,27 @@ class Ficon
70
84
  report_lines << "Page description: #{@site[:description]}"
71
85
  report_lines << "Final URL: #{@final_uri}" if @final_uri.to_s != @uri.to_s
72
86
  report_lines << "Canonical URL: #{@site[:canonical]}" if @site[:canonical]
87
+ report_lines << "URL Status: #{@url_status}" if @url_status
73
88
  report_lines.join("\n") + "\n"
74
89
  end
75
90
 
76
- def site_icons = @site[:images]
77
-
78
- def site_icon = site_icons&.first
91
+ def site_icons = @site[:images] || []
79
92
 
80
- def page_images = @site[:page_images]
81
-
82
- def page_image = page_images&.first
93
+ def page_images = @site[:page_images] || []
83
94
 
84
95
  def title = @site[:title]
85
96
 
86
97
  def description = @site[:description]
87
98
 
88
- def other_page_data
89
- @site[:title] = doc.at_xpath("//meta[@property='og:title']/@content")&.value || @doc.at_xpath("//title")&.text&.strip
90
- @site[:description] = doc.at_xpath("//meta[@property='og:description']/@content")&.value
91
- canonical = doc.at_xpath("//link[@rel='canonical']/@href")&.value
92
- @site[:canonical] = canonical unless canonical == @url
99
+ def self.clear_cache
100
+ Cache.clear_cache
101
+ end
102
+
103
+ def other_page_data(document)
104
+ @site[:title] = document.at_xpath("//meta[@property='og:title']/@content")&.value || document.at_xpath("//title")&.text&.strip
105
+ @site[:description] = document.at_xpath("//meta[@property='og:description']/@content")&.value
106
+ canonical = document.at_xpath("//link[@rel='canonical']/@href")&.value
107
+ @site[:canonical] = canonical unless canonical == @uri.to_s
93
108
  end
94
109
 
95
110
  def self.site_images(uri, doc)
@@ -124,12 +139,43 @@ class Ficon
124
139
  parsed_candidate.to_s
125
140
  end
126
141
 
142
+ def classify_response_status(response)
143
+ case response.code.to_i
144
+ when 200..299
145
+ ALIVE
146
+ when 404, 410
147
+ DEAD
148
+ when 401, 403, 429
149
+ BLOCKED
150
+ when 500..599
151
+ SICK
152
+ else
153
+ SICK
154
+ end
155
+ end
156
+
157
+ def classify_exception_status(exception)
158
+ case exception
159
+ when SocketError, Resolv::ResolvError
160
+ DEAD # DNS resolution failures
161
+ when Net::HTTPError, Timeout::Error, Errno::ECONNREFUSED
162
+ SICK # Network issues worth retrying
163
+ when OpenSSL::SSL::SSLError
164
+ SICK # SSL certificate errors
165
+ else
166
+ SICK # Default to retryable for unknown errors
167
+ end
168
+ end
169
+
127
170
  private
128
171
 
129
172
  def fetch_url(uri, redirect_limit = 5)
130
173
  uri = URI(uri) unless uri.is_a?(URI)
131
-
132
- raise "Too many redirects" if redirect_limit <= 0
174
+
175
+ if redirect_limit <= 0
176
+ @url_status = DEAD
177
+ raise "Too many redirects"
178
+ end
133
179
 
134
180
  Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
135
181
  http.read_timeout = 10
@@ -137,7 +183,10 @@ class Ficon
137
183
  request = Net::HTTP::Get.new(uri)
138
184
  request["User-Agent"] = @user_agent
139
185
  response = http.request(request)
140
-
186
+
187
+ # Set status based on response
188
+ @url_status = classify_response_status(response)
189
+
141
190
  case response
142
191
  when Net::HTTPRedirection
143
192
  location = response["location"]
@@ -149,10 +198,11 @@ class Ficon
149
198
  else
150
199
  @final_uri = Addressable::URI.parse(uri.to_s)
151
200
  end
152
-
201
+
153
202
  response
154
203
  end
155
- rescue Net::HTTPError, SocketError, Timeout::Error => e
204
+ rescue => e
205
+ @url_status = classify_exception_status(e)
156
206
  puts "Failed to fetch #{uri}: #{e.inspect}"
157
207
  nil
158
208
  end
data/test/ficon_test.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  #require 'rubygems'
2
2
  require 'debug'
3
+ require 'resolv'
3
4
 
4
5
  require "minitest/autorun"
5
6
 
@@ -58,7 +59,7 @@ class FiconTest < Minitest::Test
58
59
  def test_custom_user_agent
59
60
  # Test default user agent
60
61
  ficon_default = Ficon.new('https://example.com')
61
- assert_match(/^Ficon\/0\.2/, ficon_default.user_agent)
62
+ assert_match(/^FiconBot\/0\.\d+/, ficon_default.user_agent)
62
63
 
63
64
  # Test custom user agent
64
65
  custom_agent = 'MyApp/1.0 (Custom Bot)'
@@ -69,4 +70,53 @@ class FiconTest < Minitest::Test
69
70
  ficon_custom.user_agent = 'Changed/2.0'
70
71
  assert_equal 'Changed/2.0', ficon_custom.user_agent
71
72
  end
73
+
74
+ def test_response_status_classification
75
+ ficon = Ficon.new('https://example.com')
76
+
77
+ # Test ALIVE status (2xx)
78
+ assert_equal Ficon::ALIVE, ficon.classify_response_status(mock_response(200))
79
+ assert_equal Ficon::ALIVE, ficon.classify_response_status(mock_response(201))
80
+ assert_equal Ficon::ALIVE, ficon.classify_response_status(mock_response(299))
81
+
82
+ # Test DEAD status (404, 410)
83
+ assert_equal Ficon::DEAD, ficon.classify_response_status(mock_response(404))
84
+ assert_equal Ficon::DEAD, ficon.classify_response_status(mock_response(410))
85
+
86
+ # Test BLOCKED status (401, 403, 429)
87
+ assert_equal Ficon::BLOCKED, ficon.classify_response_status(mock_response(401))
88
+ assert_equal Ficon::BLOCKED, ficon.classify_response_status(mock_response(403))
89
+ assert_equal Ficon::BLOCKED, ficon.classify_response_status(mock_response(429))
90
+
91
+ # Test SICK status (5xx and others)
92
+ assert_equal Ficon::SICK, ficon.classify_response_status(mock_response(500))
93
+ assert_equal Ficon::SICK, ficon.classify_response_status(mock_response(502))
94
+ assert_equal Ficon::SICK, ficon.classify_response_status(mock_response(503))
95
+ assert_equal Ficon::SICK, ficon.classify_response_status(mock_response(300)) # Other codes default to SICK
96
+ end
97
+
98
+ def test_exception_status_classification
99
+ ficon = Ficon.new('https://example.com')
100
+
101
+ # Test DEAD status (DNS and resolution errors)
102
+ assert_equal Ficon::DEAD, ficon.classify_exception_status(SocketError.new)
103
+ assert_equal Ficon::DEAD, ficon.classify_exception_status(Resolv::ResolvError.new)
104
+
105
+ # Test SICK status (network and timeout errors)
106
+ assert_equal Ficon::SICK, ficon.classify_exception_status(Timeout::Error.new)
107
+ assert_equal Ficon::SICK, ficon.classify_exception_status(Errno::ECONNREFUSED.new)
108
+ assert_equal Ficon::SICK, ficon.classify_exception_status(OpenSSL::SSL::SSLError.new)
109
+ assert_equal Ficon::SICK, ficon.classify_exception_status(Net::HTTPError.new('error', nil))
110
+
111
+ # Test default to SICK for unknown exceptions
112
+ assert_equal Ficon::SICK, ficon.classify_exception_status(StandardError.new)
113
+ end
114
+
115
+ private
116
+
117
+ def mock_response(code)
118
+ response = Object.new
119
+ response.define_singleton_method(:code) { code }
120
+ response
121
+ end
72
122
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ficon
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.3'
4
+ version: '0.5'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dan Milne