UrlCategorise 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,5 @@
+require 'set'
+
 module UrlCategorise
   class Client < ApiPattern::Client
     include ::UrlCategorise::Constants
@@ -10,21 +12,28 @@ module UrlCategorise
       'v2 2025-08-23'
     end
 
-    attr_reader :host_urls, :hosts, :cache_dir, :force_download, :dns_servers, :metadata, :request_timeout
+    attr_reader :host_urls, :hosts, :cache_dir, :force_download, :dns_servers, :metadata, :request_timeout,
+                :dataset_processor, :dataset_categories
 
-    def initialize(host_urls: DEFAULT_HOST_URLS, cache_dir: nil, force_download: false, dns_servers: ['1.1.1.1', '1.0.0.1'], request_timeout: 10)
+    def initialize(host_urls: DEFAULT_HOST_URLS, cache_dir: nil, force_download: false,
+                   dns_servers: ['1.1.1.1', '1.0.0.1'], request_timeout: 10, dataset_config: {})
       @host_urls = host_urls
       @cache_dir = cache_dir
       @force_download = force_download
       @dns_servers = dns_servers
       @request_timeout = request_timeout
       @metadata = {}
+      @dataset_categories = Set.new # Track which categories come from datasets
+
+      # Initialize dataset processor if config provided
+      @dataset_processor = initialize_dataset_processor(dataset_config) unless dataset_config.empty?
+
       @hosts = fetch_and_build_host_lists
     end
 
     def categorise(url)
       host = (URI.parse(url).host || url).downcase
-      host = host.gsub("www.", "")
+      host = host.gsub('www.', '')
 
       @hosts.keys.select do |category|
         @hosts[category].any? do |blocked_host|
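
Note: the new dataset_config: keyword is additive and defaults to {}, so existing callers are unaffected and @dataset_processor stays nil unless a config is supplied. A minimal construction sketch, assuming the gem's require path follows its name; the recognised config keys appear in initialize_dataset_processor later in this diff, and the Kaggle credentials here are placeholders:

    require 'url_categorise'

    client = UrlCategorise::Client.new(
      cache_dir: './cache',
      dataset_config: {
        enable_kaggle: true,                               # defaults to true when omitted
        kaggle: { username: 'user', api_key: 'REDACTED' }  # placeholder credentials
      }
    )
    client.categorise('https://www.example.com') #=> array of matching category symbols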
@@ -41,18 +50,18 @@ module UrlCategorise
 
     def resolve_and_categorise(domain)
       categories = categorise(domain)
-
+
       begin
         resolver = Resolv::DNS.new(nameserver: @dns_servers)
         ip_addresses = resolver.getaddresses(domain).map(&:to_s)
-
+
         ip_addresses.each do |ip|
           categories.concat(categorise_ip(ip))
         end
-      rescue
+      rescue StandardError
         # DNS resolution failed, return domain categories only
       end
-
+
       categories.uniq
     end
 
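
Note: the rescue → rescue StandardError change here (and in the cache methods below) is stylistic, not behavioural: a bare rescue clause already catches only StandardError and its descendants, so exceptions outside that tree (SignalException, NoMemoryError, and the like) propagate either way. RuboCop's Style/RescueStandardError cop simply prefers the explicit form:

    begin
      Resolv::DNS.new(nameserver: ['1.1.1.1']).getaddresses('example.test')
    rescue StandardError
      # exactly the error set a bare `rescue` would have caught
    end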
@@ -71,58 +80,56 @@ module UrlCategorise
     end
 
     def check_all_lists
-      puts "Checking all lists in constants..."
-
+      puts 'Checking all lists in constants...'
+
       unreachable_lists = {}
       missing_categories = []
       successful_lists = {}
-
+
       @host_urls.each do |category, urls|
         puts "\nChecking category: #{category}"
-
+
         if urls.empty?
           missing_categories << category
-          puts " ❌ No URLs defined for category"
+          puts ' ❌ No URLs defined for category'
           next
         end
-
+
         unreachable_lists[category] = []
         successful_lists[category] = []
-
+
         urls.each do |url|
          # Skip symbol references (combined categories)
          if url.is_a?(Symbol)
            puts " ➡️ References other category: #{url}"
            next
          end
-
+
          unless url_valid?(url)
-            unreachable_lists[category] << { url: url, error: "Invalid URL format" }
+            unreachable_lists[category] << { url: url, error: 'Invalid URL format' }
            puts " ❌ Invalid URL format: #{url}"
            next
          end
-
+
          print " 🔍 Testing #{url}... "
-
+
          begin
            response = HTTParty.head(url, timeout: @request_timeout, follow_redirects: true)
-
+
            case response.code
            when 200
-              puts "✅ OK"
+              puts '✅ OK'
              successful_lists[category] << url
            when 301, 302, 307, 308
              puts "↗️ Redirect (#{response.code})"
-              if response.headers['location']
-                puts " Redirects to: #{response.headers['location']}"
-              end
+              puts " Redirects to: #{response.headers['location']}" if response.headers['location']
              successful_lists[category] << url
            when 404
-              puts "❌ Not Found (404)"
-              unreachable_lists[category] << { url: url, error: "404 Not Found" }
+              puts '❌ Not Found (404)'
+              unreachable_lists[category] << { url: url, error: '404 Not Found' }
            when 403
-              puts "❌ Forbidden (403)"
-              unreachable_lists[category] << { url: url, error: "403 Forbidden" }
+              puts '❌ Forbidden (403)'
+              unreachable_lists[category] << { url: url, error: '403 Forbidden' }
            when 500..599
              puts "❌ Server Error (#{response.code})"
              unreachable_lists[category] << { url: url, error: "Server Error #{response.code}" }
@@ -130,51 +137,50 @@ module UrlCategorise
              puts "⚠️ Unexpected response (#{response.code})"
              unreachable_lists[category] << { url: url, error: "HTTP #{response.code}" }
            end
-
          rescue Timeout::Error
-            puts "❌ Timeout"
-            unreachable_lists[category] << { url: url, error: "Request timeout" }
+            puts '❌ Timeout'
+            unreachable_lists[category] << { url: url, error: 'Request timeout' }
          rescue SocketError => e
-            puts "❌ DNS/Network Error"
+            puts '❌ DNS/Network Error'
            unreachable_lists[category] << { url: url, error: "DNS/Network: #{e.message}" }
          rescue HTTParty::Error, Net::HTTPError => e
-            puts "❌ HTTP Error"
+            puts '❌ HTTP Error'
            unreachable_lists[category] << { url: url, error: "HTTP Error: #{e.message}" }
          rescue StandardError => e
            puts "❌ Error: #{e.class}"
            unreachable_lists[category] << { url: url, error: "#{e.class}: #{e.message}" }
          end
-
+
          # Small delay to be respectful to servers
          sleep(0.1)
        end
-
+
        # Remove empty arrays
        unreachable_lists.delete(category) if unreachable_lists[category].empty?
        successful_lists.delete(category) if successful_lists[category].empty?
      end
-
+
      # Generate summary report
-      puts "\n" + "="*80
-      puts "LIST HEALTH REPORT"
-      puts "="*80
-
+      puts "\n" + '=' * 80
+      puts 'LIST HEALTH REPORT'
+      puts '=' * 80
+
      puts "\n📊 SUMMARY:"
      total_categories = @host_urls.keys.length
      categories_with_issues = unreachable_lists.keys.length + missing_categories.length
      categories_healthy = total_categories - categories_with_issues
-
+
      puts " Total categories: #{total_categories}"
      puts " Healthy categories: #{categories_healthy}"
      puts " Categories with issues: #{categories_with_issues}"
-
+
      if missing_categories.any?
        puts "\n❌ CATEGORIES WITH NO URLS (#{missing_categories.length}):"
        missing_categories.each do |category|
          puts " - #{category}"
        end
      end
-
+
      if unreachable_lists.any?
        puts "\n❌ UNREACHABLE LISTS:"
        unreachable_lists.each do |category, failed_urls|
@@ -185,15 +191,15 @@ module UrlCategorise
          end
        end
      end
-
+
      puts "\n✅ WORKING CATEGORIES (#{successful_lists.keys.length}):"
      successful_lists.keys.sort.each do |category|
        url_count = successful_lists[category].length
        puts " - #{category} (#{url_count} URL#{'s' if url_count != 1})"
      end
-
-      puts "\n" + "="*80
-
+
+      puts "\n" + '=' * 80
+
      # Return structured data for programmatic use
      {
        summary: {
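
Note: besides the console report, check_all_lists returns a hash for programmatic use; the hunk above shows only the opening of its summary: key (built from the total/healthy/issue counts computed earlier), with the remaining keys truncated by the diff context. A hedged usage sketch:

    report = client.check_all_lists
    report[:summary] # category counts per the code above; other keys are cut off in this hunk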
@@ -207,12 +213,121 @@ module UrlCategorise
       }
     end
 
+    def load_kaggle_dataset(dataset_owner, dataset_name, options = {})
+      raise Error, 'Dataset processor not configured' unless @dataset_processor
+
+      default_options = { use_cache: true, integrate_data: true }
+      merged_options = default_options.merge(options)
+
+      dataset = @dataset_processor.process_kaggle_dataset(dataset_owner, dataset_name, merged_options)
+
+      if merged_options[:integrate_data]
+        integrate_dataset(dataset, merged_options[:category_mappings] || {})
+      else
+        dataset
+      end
+    end
+
+    def load_csv_dataset(url, options = {})
+      raise Error, 'Dataset processor not configured' unless @dataset_processor
+
+      default_options = { use_cache: true, integrate_data: true }
+      merged_options = default_options.merge(options)
+
+      dataset = @dataset_processor.process_csv_dataset(url, merged_options)
+
+      if merged_options[:integrate_data]
+        integrate_dataset(dataset, merged_options[:category_mappings] || {})
+      else
+        dataset
+      end
+    end
+
+    def dataset_metadata
+      return {} unless @dataset_processor
+
+      @dataset_metadata || {}
+    end
+
+    def reload_with_datasets
+      # Store dataset categories before reload (only those that were added via integrate_dataset)
+      dataset_category_data = {}
+      if @hosts
+        @dataset_categories.each do |category|
+          dataset_category_data[category] = @hosts[category].dup if @hosts[category]
+        end
+      end
+
+      @hosts = fetch_and_build_host_lists
+
+      # Restore dataset categories
+      dataset_category_data.each do |category, domains|
+        @hosts[category] ||= []
+        @hosts[category].concat(domains).uniq!
+      end
+
+      self
+    end
+
     private
 
+    def initialize_dataset_processor(config)
+      processor_config = {
+        download_path: config[:download_path] || @cache_dir&.+(File::SEPARATOR + 'downloads'),
+        cache_path: config[:cache_path] || @cache_dir&.+(File::SEPARATOR + 'datasets'),
+        timeout: config[:timeout] || @request_timeout,
+        enable_kaggle: config.fetch(:enable_kaggle, true) # Default to true for backwards compatibility
+      }
+
+      # Add Kaggle credentials if provided and Kaggle is enabled
+      if config[:kaggle] && processor_config[:enable_kaggle]
+        kaggle_config = config[:kaggle]
+        processor_config.merge!({
+                                  username: kaggle_config[:username],
+                                  api_key: kaggle_config[:api_key],
+                                  credentials_file: kaggle_config[:credentials_file]
+                                })
+      end
+
+      DatasetProcessor.new(**processor_config)
+    rescue Error => e
+      # Dataset processor failed to initialize, but client can still work without it
+      puts "Warning: Dataset processor initialization failed: #{e.message}" if ENV['DEBUG']
+      nil
+    end
+
+    def integrate_dataset(dataset, category_mappings)
+      return dataset unless @dataset_processor
+
+      categorized_data = @dataset_processor.integrate_dataset_into_categorization(dataset, category_mappings)
+
+      # Store metadata
+      @dataset_metadata ||= {}
+      @dataset_metadata[categorized_data[:_metadata][:data_hash]] = categorized_data[:_metadata]
+
+      # Remove metadata from the working data
+      categorized_data.delete(:_metadata)
+
+      # Merge with existing host data
+      categorized_data.each do |category, domains|
+        next if category.to_s.start_with?('_') # Skip internal keys
+
+        # Convert category to symbol for consistency
+        category_sym = category.to_sym
+        @hosts[category_sym] ||= []
+        @hosts[category_sym].concat(domains).uniq!
+
+        # Track this as a dataset category
+        @dataset_categories.add(category_sym)
+      end
+
+      dataset
+    end
+
     def hash_size_in_mb(hash)
       size = 0
 
-      hash.each do |key, value|
+      hash.each do |_key, value|
        size += value.join.length
      end
 
@@ -243,74 +358,70 @@ module UrlCategorise
 
     def build_host_data(urls)
       all_hosts = []
-
+
       urls.each do |url|
         next unless url_valid?(url)
-
+
         hosts_data = nil
-
-        if @cache_dir && !@force_download
-          hosts_data = read_from_cache(url)
-        end
-
+
+        hosts_data = read_from_cache(url) if @cache_dir && !@force_download
+
         if hosts_data.nil?
           hosts_data = download_and_parse_list(url)
          save_to_cache(url, hosts_data) if @cache_dir
        end
-
+
        all_hosts.concat(hosts_data) if hosts_data
      end
-
+
      all_hosts.compact.sort.uniq
    end
 
    def download_and_parse_list(url)
-      begin
-        raw_data = HTTParty.get(url, timeout: @request_timeout)
-        return [] if raw_data.body.nil? || raw_data.body.empty?
-
-        # Store metadata
-        etag = raw_data.headers['etag']
-        last_modified = raw_data.headers['last-modified']
-        @metadata[url] = {
-          last_updated: Time.now,
-          etag: etag,
-          last_modified: last_modified,
-          content_hash: Digest::SHA256.hexdigest(raw_data.body),
-          status: 'success'
-        }
-
-        parse_list_content(raw_data.body, detect_list_format(raw_data.body))
-      rescue HTTParty::Error, Net::HTTPError, SocketError, Timeout::Error, URI::InvalidURIError, StandardError => e
-        # Log the error but continue with other lists
-        @metadata[url] = {
-          last_updated: Time.now,
-          error: e.message,
-          status: 'failed'
-        }
-        return []
-      end
+      raw_data = HTTParty.get(url, timeout: @request_timeout)
+      return [] if raw_data.body.nil? || raw_data.body.empty?
+
+      # Store metadata
+      etag = raw_data.headers['etag']
+      last_modified = raw_data.headers['last-modified']
+      @metadata[url] = {
+        last_updated: Time.now,
+        etag: etag,
+        last_modified: last_modified,
+        content_hash: Digest::SHA256.hexdigest(raw_data.body),
+        status: 'success'
+      }
+
+      parse_list_content(raw_data.body, detect_list_format(raw_data.body))
+    rescue HTTParty::Error, Net::HTTPError, SocketError, Timeout::Error, URI::InvalidURIError, StandardError => e
+      # Log the error but continue with other lists
+      @metadata[url] = {
+        last_updated: Time.now,
+        error: e.message,
+        status: 'failed'
+      }
+      []
    end
 
    def parse_list_content(content, format)
      lines = content.split("\n").reject { |line| line.empty? || line.strip.start_with?('#') }
-
+
      case format
      when :hosts
-        lines.map { |line|
+        lines.map do |line|
          parts = line.split(' ')
          # Extract domain from hosts format: "0.0.0.0 domain.com" -> "domain.com"
          parts.length >= 2 ? parts[1].strip : nil
-        }.compact.reject(&:empty?)
+        end.compact.reject(&:empty?)
      when :plain
        lines.map(&:strip)
      when :dnsmasq
-        lines.map { |line|
-          match = line.match(/address=\/(.+?)\//)
+        lines.map do |line|
+          match = line.match(%r{address=/(.+?)/})
          match ? match[1] : nil
-        }.compact
+        end.compact
      when :ublock
-        lines.map { |line| line.gsub(/^\|\|/, '').gsub(/[\$\^].*$/, '').strip }.reject(&:empty?)
+        lines.map { |line| line.gsub(/^\|\|/, '').gsub(/[$\^].*$/, '').strip }.reject(&:empty?)
      else
        lines.map(&:strip)
      end
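
Note: the brace-to-do block rewrites and the %r{} literal above are RuboCop-style changes that leave parsing behaviour intact; each format still reduces to bare hostnames. Expected results per format, calling the private method directly via send purely for illustration:

    client.send(:parse_list_content, '0.0.0.0 ads.example.com', :hosts)          #=> ['ads.example.com']
    client.send(:parse_list_content, 'address=/t.example.com/0.0.0.0', :dnsmasq) #=> ['t.example.com']
    client.send(:parse_list_content, '||ads.example.com^$third-party', :ublock)  #=> ['ads.example.com']
    client.send(:parse_list_content, 'ads.example.com', :plain)                  #=> ['ads.example.com']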
@@ -319,19 +430,19 @@ module UrlCategorise
     def detect_list_format(content)
       # Skip comments and empty lines, then look at first 20 non-comment lines
       sample_lines = content.split("\n")
-                           .reject { |line| line.empty? || line.strip.start_with?('#') }
-                           .first(20)
-
+                            .reject { |line| line.empty? || line.strip.start_with?('#') }
+                            .first(20)
+
       return :hosts if sample_lines.any? { |line| line.match(/^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s+/) }
       return :dnsmasq if sample_lines.any? { |line| line.include?('address=/') }
       return :ublock if sample_lines.any? { |line| line.match(/^\|\|/) }
-
+
       :plain
     end
 
     def cache_file_path(url)
       return nil unless @cache_dir
-
+
       FileUtils.mkdir_p(@cache_dir) unless Dir.exist?(@cache_dir)
       filename = Digest::MD5.hexdigest(url) + '.cache'
       File.join(@cache_dir, filename)
@@ -340,62 +451,60 @@ module UrlCategorise
     def read_from_cache(url)
       cache_file = cache_file_path(url)
       return nil unless cache_file && File.exist?(cache_file)
-
+
       cache_data = Marshal.load(File.read(cache_file))
-
+
       # Check if we should update based on hash or time
-      if should_update_cache?(url, cache_data)
-        return nil
-      end
-
+      return nil if should_update_cache?(url, cache_data)
+
       cache_data[:hosts]
-    rescue
+    rescue StandardError
      nil
    end
 
    def save_to_cache(url, hosts_data)
      cache_file = cache_file_path(url)
      return unless cache_file
-
+
      cache_data = {
        hosts: hosts_data,
        metadata: @metadata[url],
        cached_at: Time.now
      }
-
+
      File.write(cache_file, Marshal.dump(cache_data))
-    rescue
+    rescue StandardError
      # Cache save failed, continue without caching
    end
 
    def should_update_cache?(url, cache_data)
      return true if @force_download
      return true unless cache_data[:metadata]
-
+
      # Update if cache is older than 24 hours
      cache_age = Time.now - cache_data[:cached_at]
      return true if cache_age > 24 * 60 * 60
-
+
      # Check if remote content has changed
      begin
        head_response = HTTParty.head(url, timeout: @request_timeout)
        remote_etag = head_response.headers['etag']
        remote_last_modified = head_response.headers['last-modified']
-
+
        cached_metadata = cache_data[:metadata]
-
+
        return true if remote_etag && cached_metadata[:etag] && remote_etag != cached_metadata[:etag]
-        return true if remote_last_modified && cached_metadata[:last_modified] && remote_last_modified != cached_metadata[:last_modified]
+        if remote_last_modified && cached_metadata[:last_modified] && remote_last_modified != cached_metadata[:last_modified]
+          return true
+        end
      rescue HTTParty::Error, Net::HTTPError, SocketError, Timeout::Error, URI::InvalidURIError, StandardError
        # If HEAD request fails, assume we should update
        return true
      end
-
+
      false
    end
 
-    private
-
    def categories_with_keys
      keyed_categories = {}
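
Note: the cache changes are likewise behaviour-preserving: read_from_cache still returns nil (forcing a refetch) when should_update_cache? trips, and the staleness rules are unchanged. Summarised from the code above:

    entry = Marshal.load(File.read(cache_file)) # { hosts:, metadata:, cached_at: }
    # A refetch happens when any of the following holds:
    #   - @force_download is set, or the entry has no metadata
    #   - the entry is older than 24 hours (24 * 60 * 60 seconds)
    #   - a HEAD request reports a different ETag or Last-Modified
    #   - the HEAD request itself fails

The removal of the second private keyword at the end is a cleanup: these methods were already covered by the private declaration earlier in the class, so the duplicate had no effect.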