UrlCategorise 0.1.2 → 0.1.6

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
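The headline change in this range is the replacement of the fixed keyword-argument initializer with ActiveAttr-backed attributes set from a **kwargs constructor. As a rough sketch of what that means for callers, using only option names visible in the diff below (values and paths are hypothetical):

    # 0.1.2: fixed keyword arguments
    client = UrlCategorise::Client.new(cache_dir: '/tmp/url_categorise', request_timeout: 10)

    # 0.1.6: **kwargs feeding ActiveAttr attributes. Note that two option keys
    # (:iab_compliance, :smart_categorization) differ from the attribute names
    # they set (iab_compliance_enabled, smart_categorization_enabled).
    client = UrlCategorise::Client.new(
      cache_dir: '/tmp/url_categorise',  # hypothetical path
      iab_compliance: true,
      iab_version: :v2,
      smart_categorization: true,
      dataset_config: {}                 # empty hash leaves the dataset processor disabled
    )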
@@ -1,6 +1,9 @@
+ require 'set'
+
  module UrlCategorise
  class Client < ApiPattern::Client
  include ::UrlCategorise::Constants
+ include ActiveAttr::Model

  def self.compatible_api_version
  'v2'
@@ -10,49 +13,93 @@ module UrlCategorise
  'v2 2025-08-23'
  end

- attr_reader :host_urls, :hosts, :cache_dir, :force_download, :dns_servers, :metadata, :request_timeout
+ attribute :host_urls, default: -> { DEFAULT_HOST_URLS }
+ attribute :cache_dir
+ attribute :force_download, type: Boolean, default: false
+ attribute :dns_servers, default: ['1.1.1.1', '1.0.0.1']
+ attribute :request_timeout, type: Integer, default: 10
+ attribute :iab_compliance_enabled, type: Boolean, default: false
+ attribute :iab_version, default: :v3
+ attribute :auto_load_datasets, type: Boolean, default: false
+ attribute :smart_categorization_enabled, type: Boolean, default: false
+ attribute :smart_rules, default: -> { {} }
+
+ attr_reader :hosts, :metadata, :dataset_processor, :dataset_categories
+
+ def initialize(**kwargs)
+ # Extract dataset_config for later use
+ dataset_config = kwargs.fetch(:dataset_config, {})
+
+ # Set ActiveAttr attributes - preserve explicitly passed values including nil
+ self.host_urls = kwargs.key?(:host_urls) ? kwargs[:host_urls] : DEFAULT_HOST_URLS
+ self.cache_dir = kwargs[:cache_dir] # will be nil if not provided or explicitly nil
+ self.force_download = kwargs.key?(:force_download) ? kwargs[:force_download] : false
+ self.dns_servers = kwargs.key?(:dns_servers) ? kwargs[:dns_servers] : ['1.1.1.1', '1.0.0.1']
+ self.request_timeout = kwargs.key?(:request_timeout) ? kwargs[:request_timeout] : 10
+ self.iab_compliance_enabled = kwargs.key?(:iab_compliance) ? kwargs[:iab_compliance] : false
+ self.iab_version = kwargs.key?(:iab_version) ? kwargs[:iab_version] : :v3
+ self.auto_load_datasets = kwargs.key?(:auto_load_datasets) ? kwargs[:auto_load_datasets] : false
+ self.smart_categorization_enabled = kwargs.key?(:smart_categorization) ? kwargs[:smart_categorization] : false
+ self.smart_rules = initialize_smart_rules(kwargs.key?(:smart_rules) ? kwargs[:smart_rules] : {})

- def initialize(host_urls: DEFAULT_HOST_URLS, cache_dir: nil, force_download: false, dns_servers: ['1.1.1.1', '1.0.0.1'], request_timeout: 10)
- @host_urls = host_urls
- @cache_dir = cache_dir
- @force_download = force_download
- @dns_servers = dns_servers
- @request_timeout = request_timeout
  @metadata = {}
+ @dataset_categories = Set.new # Track which categories come from datasets
+
+ # Initialize dataset processor if config provided
+ @dataset_processor = initialize_dataset_processor(dataset_config) unless dataset_config.empty?
+
  @hosts = fetch_and_build_host_lists
+
+ # Auto-load datasets from constants if enabled
+ load_datasets_from_constants if auto_load_datasets && @dataset_processor
  end

  def categorise(url)
  host = (URI.parse(url).host || url).downcase
- host = host.gsub("www.", "")
+ host = host.gsub('www.', '')

- @hosts.keys.select do |category|
+ categories = @hosts.keys.select do |category|
  @hosts[category].any? do |blocked_host|
  host == blocked_host || host.end_with?(".#{blocked_host}")
  end
  end
+
+ # Apply smart categorization if enabled
+ categories = apply_smart_categorization(url, categories) if smart_categorization_enabled
+
+ if iab_compliance_enabled
+ IabCompliance.get_iab_categories(categories, iab_version)
+ else
+ categories
+ end
  end

  def categorise_ip(ip_address)
- @hosts.keys.select do |category|
+ categories = @hosts.keys.select do |category|
  @hosts[category].include?(ip_address)
  end
+
+ if iab_compliance_enabled
+ IabCompliance.get_iab_categories(categories, iab_version)
+ else
+ categories
+ end
  end

  def resolve_and_categorise(domain)
  categories = categorise(domain)
-
+
  begin
- resolver = Resolv::DNS.new(nameserver: @dns_servers)
+ resolver = Resolv::DNS.new(nameserver: dns_servers)
  ip_addresses = resolver.getaddresses(domain).map(&:to_s)
-
+
  ip_addresses.each do |ip|
  categories.concat(categorise_ip(ip))
  end
- rescue
+ rescue StandardError
  # DNS resolution failed, return domain categories only
  end
-
+
  categories.uniq
  end

@@ -70,59 +117,113 @@ module UrlCategorise
  hash_size_in_mb(@hosts)
  end

+ def size_of_dataset_data
+ dataset_hosts = {}
+ @dataset_categories.each do |category|
+ dataset_hosts[category] = @hosts[category] || []
+ end
+ hash_size_in_mb(dataset_hosts)
+ end
+
+ def size_of_blocklist_data
+ blocklist_hosts = {}
+ @hosts.each do |category, domains|
+ blocklist_hosts[category] = domains unless @dataset_categories.include?(category)
+ end
+ hash_size_in_mb(blocklist_hosts)
+ end
+
+ def size_of_data_bytes
+ hash_size_in_bytes(@hosts)
+ end
+
+ def size_of_dataset_data_bytes
+ dataset_hosts = {}
+ @dataset_categories.each do |category|
+ dataset_hosts[category] = @hosts[category] || []
+ end
+ hash_size_in_bytes(dataset_hosts)
+ end
+
+ def size_of_blocklist_data_bytes
+ blocklist_hosts = {}
+ @hosts.each do |category, domains|
+ blocklist_hosts[category] = domains unless @dataset_categories.include?(category)
+ end
+ hash_size_in_bytes(blocklist_hosts)
+ end
+
+ def count_of_dataset_hosts
+ @dataset_categories.map do |category|
+ @hosts[category]&.size || 0
+ end.sum
+ end
+
+ def count_of_dataset_categories
+ @dataset_categories.size
+ end
+
+ def iab_compliant?
+ iab_compliance_enabled
+ end
+
+ def get_iab_mapping(category)
+ return nil unless iab_compliance_enabled
+
+ IabCompliance.map_category_to_iab(category, iab_version)
+ end
+
  def check_all_lists
- puts "Checking all lists in constants..."
-
+ puts 'Checking all lists in constants...'
+
  unreachable_lists = {}
  missing_categories = []
  successful_lists = {}
-
- @host_urls.each do |category, urls|
+
+ (host_urls || {}).each do |category, urls|
  puts "\nChecking category: #{category}"
-
+
  if urls.empty?
  missing_categories << category
- puts " ❌ No URLs defined for category"
+ puts ' ❌ No URLs defined for category'
  next
  end
-
+
  unreachable_lists[category] = []
  successful_lists[category] = []
-
+
  urls.each do |url|
  # Skip symbol references (combined categories)
  if url.is_a?(Symbol)
  puts " ➡️ References other category: #{url}"
  next
  end
-
+
  unless url_valid?(url)
- unreachable_lists[category] << { url: url, error: "Invalid URL format" }
+ unreachable_lists[category] << { url: url, error: 'Invalid URL format' }
  puts " ❌ Invalid URL format: #{url}"
  next
  end
-
+
  print " 🔍 Testing #{url}... "
-
+
  begin
- response = HTTParty.head(url, timeout: @request_timeout, follow_redirects: true)
-
+ response = HTTParty.head(url, timeout: request_timeout, follow_redirects: true)
+
  case response.code
  when 200
- puts "✅ OK"
+ puts '✅ OK'
  successful_lists[category] << url
  when 301, 302, 307, 308
  puts "↗️ Redirect (#{response.code})"
- if response.headers['location']
- puts " Redirects to: #{response.headers['location']}"
- end
+ puts " Redirects to: #{response.headers['location']}" if response.headers['location']
  successful_lists[category] << url
  when 404
- puts "❌ Not Found (404)"
- unreachable_lists[category] << { url: url, error: "404 Not Found" }
+ puts '❌ Not Found (404)'
+ unreachable_lists[category] << { url: url, error: '404 Not Found' }
  when 403
- puts "❌ Forbidden (403)"
- unreachable_lists[category] << { url: url, error: "403 Forbidden" }
+ puts '❌ Forbidden (403)'
+ unreachable_lists[category] << { url: url, error: '403 Forbidden' }
  when 500..599
  puts "❌ Server Error (#{response.code})"
  unreachable_lists[category] << { url: url, error: "Server Error #{response.code}" }
@@ -130,51 +231,50 @@ module UrlCategorise
  puts "⚠️ Unexpected response (#{response.code})"
  unreachable_lists[category] << { url: url, error: "HTTP #{response.code}" }
  end
-
  rescue Timeout::Error
- puts "❌ Timeout"
- unreachable_lists[category] << { url: url, error: "Request timeout" }
+ puts '❌ Timeout'
+ unreachable_lists[category] << { url: url, error: 'Request timeout' }
  rescue SocketError => e
- puts "❌ DNS/Network Error"
+ puts '❌ DNS/Network Error'
  unreachable_lists[category] << { url: url, error: "DNS/Network: #{e.message}" }
  rescue HTTParty::Error, Net::HTTPError => e
- puts "❌ HTTP Error"
+ puts '❌ HTTP Error'
  unreachable_lists[category] << { url: url, error: "HTTP Error: #{e.message}" }
  rescue StandardError => e
  puts "❌ Error: #{e.class}"
  unreachable_lists[category] << { url: url, error: "#{e.class}: #{e.message}" }
  end
-
+
  # Small delay to be respectful to servers
  sleep(0.1)
  end
-
+
  # Remove empty arrays
  unreachable_lists.delete(category) if unreachable_lists[category].empty?
  successful_lists.delete(category) if successful_lists[category].empty?
  end
-
+
  # Generate summary report
- puts "\n" + "="*80
- puts "LIST HEALTH REPORT"
- puts "="*80
-
+ puts "\n" + '=' * 80
+ puts 'LIST HEALTH REPORT'
+ puts '=' * 80
+
  puts "\n📊 SUMMARY:"
- total_categories = @host_urls.keys.length
+ total_categories = (host_urls || {}).keys.length
  categories_with_issues = unreachable_lists.keys.length + missing_categories.length
  categories_healthy = total_categories - categories_with_issues
-
+
  puts " Total categories: #{total_categories}"
  puts " Healthy categories: #{categories_healthy}"
  puts " Categories with issues: #{categories_with_issues}"
-
+
  if missing_categories.any?
  puts "\n❌ CATEGORIES WITH NO URLS (#{missing_categories.length}):"
  missing_categories.each do |category|
  puts " - #{category}"
  end
  end
-
+
  if unreachable_lists.any?
  puts "\n❌ UNREACHABLE LISTS:"
  unreachable_lists.each do |category, failed_urls|
@@ -185,15 +285,15 @@ module UrlCategorise
  end
  end
  end
-
+
  puts "\n✅ WORKING CATEGORIES (#{successful_lists.keys.length}):"
  successful_lists.keys.sort.each do |category|
  url_count = successful_lists[category].length
  puts " - #{category} (#{url_count} URL#{'s' if url_count != 1})"
  end
-
- puts "\n" + "="*80
-
+
+ puts "\n" + '=' * 80
+
  # Return structured data for programmatic use
  {
  summary: {
@@ -207,23 +307,363 @@ module UrlCategorise
  }
  end

+ def load_kaggle_dataset(dataset_owner, dataset_name, options = {})
+ raise Error, 'Dataset processor not configured' unless @dataset_processor
+
+ default_options = { use_cache: true, integrate_data: true }
+ merged_options = default_options.merge(options)
+
+ dataset = @dataset_processor.process_kaggle_dataset(dataset_owner, dataset_name, merged_options)
+
+ if merged_options[:integrate_data]
+ integrate_dataset(dataset, merged_options[:category_mappings] || {})
+ else
+ dataset
+ end
+ end
+
+ def load_csv_dataset(url, options = {})
+ raise Error, 'Dataset processor not configured' unless @dataset_processor
+
+ default_options = { use_cache: true, integrate_data: true }
+ merged_options = default_options.merge(options)
+
+ dataset = @dataset_processor.process_csv_dataset(url, merged_options)
+
+ if merged_options[:integrate_data]
+ integrate_dataset(dataset, merged_options[:category_mappings] || {})
+ else
+ dataset
+ end
+ end
+
+ def dataset_metadata
+ return {} unless @dataset_processor
+
+ @dataset_metadata || {}
+ end
+
+ def reload_with_datasets
+ # Store dataset categories before reload (only those that were added via integrate_dataset)
+ dataset_category_data = {}
+ if @hosts
+ @dataset_categories.each do |category|
+ dataset_category_data[category] = @hosts[category].dup if @hosts[category]
+ end
+ end
+
+ @hosts = fetch_and_build_host_lists
+
+ # Restore dataset categories
+ dataset_category_data.each do |category, domains|
+ @hosts[category] ||= []
+ @hosts[category].concat(domains).uniq!
+ end
+
+ # Reload datasets from constants if auto-loading is enabled
+ load_datasets_from_constants if auto_load_datasets && @dataset_processor
+
+ self
+ end
+
+ def export_hosts_files(output_path = nil)
+ export_dir = output_path || (cache_dir ? File.join(cache_dir, 'exports', 'hosts') : File.join(Dir.pwd, 'exports', 'hosts'))
+
+ FileUtils.mkdir_p(export_dir) unless Dir.exist?(export_dir)
+
+ exported_files = {}
+
+ @hosts.each do |category, domains|
+ next if domains.empty?
+
+ filename = "#{category}.hosts"
+ file_path = File.join(export_dir, filename)
+
+ File.open(file_path, 'w') do |file|
+ file.puts "# #{category.to_s.gsub('_', ' ').split.map(&:capitalize).join(' ')} - Generated by UrlCategorise"
+ file.puts "# Generated at: #{Time.now}"
+ file.puts "# Total entries: #{domains.length}"
+ file.puts ""
+
+ domains.sort.each do |domain|
+ file.puts "0.0.0.0 #{domain}"
+ end
+ end
+
+ exported_files[category] = {
+ path: file_path,
+ filename: filename,
+ count: domains.length
+ }
+ end
+
+ # Create summary file
+ summary_path = File.join(export_dir, '_export_summary.txt')
+ File.open(summary_path, 'w') do |file|
+ file.puts "UrlCategorise Hosts Export Summary"
+ file.puts "=================================="
+ file.puts "Generated at: #{Time.now}"
+ file.puts "Export directory: #{export_dir}"
+ file.puts "Total categories: #{exported_files.keys.length}"
+ file.puts "Total domains: #{@hosts.values.map(&:length).sum}"
+ file.puts ""
+ file.puts "Files created:"
+
+ exported_files.each do |category, info|
+ file.puts " #{info[:filename]} - #{info[:count]} domains"
+ end
+ end
+
+ exported_files[:_summary] = {
+ path: summary_path,
+ total_categories: exported_files.keys.length,
+ total_domains: @hosts.values.map(&:length).sum,
+ export_directory: export_dir
+ }
+
+ exported_files
+ end
+
+ def export_csv_data(output_path = nil)
+ require 'csv'
+
+ export_dir = output_path || (cache_dir ? File.join(cache_dir, 'exports', 'csv') : File.join(Dir.pwd, 'exports', 'csv'))
+
+ FileUtils.mkdir_p(export_dir) unless Dir.exist?(export_dir)
+
+ filename = "url_categorise_data_export_#{Time.now.strftime('%Y%m%d_%H%M%S')}.csv"
+ file_path = File.join(export_dir, filename)
+
+ CSV.open(file_path, 'w', headers: true) do |csv|
+ # Add headers
+ csv << [
+ 'domain',
+ 'category',
+ 'source_type',
+ 'is_dataset_category',
+ 'iab_category_v2',
+ 'iab_category_v3',
+ 'export_timestamp',
+ 'smart_categorization_enabled'
+ ]
+
+ # Export all host/category data
+ @hosts.each do |category, domains|
+ domains.each do |domain|
+ source_type = @dataset_categories.include?(category) ? 'dataset' : 'blocklist'
+ is_dataset_category = @dataset_categories.include?(category)
+
+ # Get IAB mappings if compliance is enabled
+ iab_v2 = nil
+ iab_v3 = nil
+ if iab_compliance_enabled
+ iab_v2 = IabCompliance.map_category_to_iab(category, :v2)
+ iab_v3 = IabCompliance.map_category_to_iab(category, :v3)
+ end
+
+ csv << [
+ domain,
+ category,
+ source_type,
+ is_dataset_category,
+ iab_v2,
+ iab_v3,
+ Time.now.iso8601,
+ smart_categorization_enabled
+ ]
+ end
+ end
+ end
+
+ # Create metadata file
+ metadata_path = File.join(export_dir, "#{File.basename(filename, '.csv')}_metadata.json")
+ metadata = {
+ export_info: {
+ timestamp: Time.now.iso8601,
+ filename: filename,
+ file_path: file_path,
+ metadata_path: metadata_path
+ },
+ client_settings: {
+ iab_compliance_enabled: iab_compliance_enabled,
+ iab_version: iab_version,
+ smart_categorization_enabled: smart_categorization_enabled,
+ auto_load_datasets: auto_load_datasets
+ },
+ data_summary: {
+ total_domains: @hosts.values.map(&:length).sum,
+ total_categories: @hosts.keys.length,
+ dataset_categories_count: @dataset_categories.size,
+ blocklist_categories_count: @hosts.keys.length - @dataset_categories.size,
+ categories: @hosts.keys.sort.map(&:to_s)
+ },
+ dataset_metadata: dataset_metadata
+ }
+
+ File.write(metadata_path, JSON.pretty_generate(metadata))
+
+ {
+ csv_file: file_path,
+ metadata_file: metadata_path,
+ summary: metadata[:data_summary],
+ export_directory: export_dir
+ }
+ end
+
  private

+ def initialize_dataset_processor(config)
+ processor_config = {
+ download_path: config[:download_path] || cache_dir&.+(File::SEPARATOR + 'downloads'),
+ cache_path: config[:cache_path] || cache_dir&.+(File::SEPARATOR + 'datasets'),
+ timeout: config[:timeout] || request_timeout,
+ enable_kaggle: config.fetch(:enable_kaggle, true) # Default to true for backwards compatibility
+ }
+
+ # Add Kaggle credentials if provided and Kaggle is enabled
+ if config[:kaggle] && processor_config[:enable_kaggle]
+ kaggle_config = config[:kaggle]
+ processor_config.merge!({
+ username: kaggle_config[:username],
+ api_key: kaggle_config[:api_key],
+ credentials_file: kaggle_config[:credentials_file]
+ })
+ end
+
+ DatasetProcessor.new(**processor_config)
+ rescue Error => e
+ # Dataset processor failed to initialize, but client can still work without it
+ puts "Warning: Dataset processor initialization failed: #{e.message}" if ENV['DEBUG']
+ nil
+ end
+
+ def integrate_dataset(dataset, category_mappings)
+ return dataset unless @dataset_processor
+ return nil unless dataset # Handle nil datasets gracefully
+
+ categorized_data = @dataset_processor.integrate_dataset_into_categorization(dataset, category_mappings)
+
+ # Store metadata
+ @dataset_metadata ||= {}
+ @dataset_metadata[categorized_data[:_metadata][:data_hash]] = categorized_data[:_metadata]
+
+ # Remove metadata from the working data
+ categorized_data.delete(:_metadata)
+
+ # Merge with existing host data
+ categorized_data.each do |category, domains|
+ next if category.to_s.start_with?('_') # Skip internal keys
+
+ # Convert category to symbol for consistency
+ category_sym = category.to_sym
+ @hosts[category_sym] ||= []
+ @hosts[category_sym].concat(domains).uniq!
+
+ # Track this as a dataset category
+ @dataset_categories.add(category_sym)
+ end
+
+ dataset
+ end
+
+ def load_datasets_from_constants
+ return unless defined?(CATEGORIY_DATABASES) && CATEGORIY_DATABASES.is_a?(Array)
+ return unless @dataset_processor
+
+ puts "Loading #{CATEGORIY_DATABASES.length} datasets from constants..." if ENV['DEBUG']
+ loaded_count = 0
+
+ CATEGORIY_DATABASES.each do |dataset_config|
+ begin
+ case dataset_config[:type]
+ when :kaggle
+ # Parse the kaggle path to get owner and dataset name
+ path_parts = dataset_config[:path].split('/')
+ next unless path_parts.length == 2
+
+ dataset_owner, dataset_name = path_parts
+
+ # Check if dataset is already cached before attempting to load
+ cache_key = @dataset_processor.send(:generate_cache_key, "#{dataset_owner}/#{dataset_name}", :kaggle)
+ cache_file = File.join(@dataset_processor.cache_path, cache_key)
+
+ if File.exist?(cache_file)
+ puts "Loading cached Kaggle dataset: #{dataset_owner}/#{dataset_name}" if ENV['DEBUG']
+ load_kaggle_dataset(dataset_owner, dataset_name, {
+ use_cache: true,
+ integrate_data: true
+ })
+ loaded_count += 1
+ else
+ puts "Attempting to download missing Kaggle dataset: #{dataset_owner}/#{dataset_name}" if ENV['DEBUG']
+ begin
+ load_kaggle_dataset(dataset_owner, dataset_name, {
+ use_cache: true,
+ integrate_data: true
+ })
+ loaded_count += 1
+ rescue Error => e
+ puts "Warning: Failed to download Kaggle dataset #{dataset_owner}/#{dataset_name}: #{e.message}" if ENV['DEBUG']
+ end
+ end
+
+ when :csv
+ # Check if CSV dataset is cached
+ cache_key = @dataset_processor.send(:generate_cache_key, dataset_config[:path], :csv)
+ cache_file = File.join(@dataset_processor.cache_path, cache_key)
+
+ if File.exist?(cache_file)
+ puts "Loading cached CSV dataset: #{dataset_config[:path]}" if ENV['DEBUG']
+ load_csv_dataset(dataset_config[:path], {
+ use_cache: true,
+ integrate_data: true
+ })
+ loaded_count += 1
+ else
+ puts "Attempting to download missing CSV dataset: #{dataset_config[:path]}" if ENV['DEBUG']
+ begin
+ load_csv_dataset(dataset_config[:path], {
+ use_cache: true,
+ integrate_data: true
+ })
+ loaded_count += 1
+ rescue Error => e
+ puts "Warning: Failed to download CSV dataset #{dataset_config[:path]}: #{e.message}" if ENV['DEBUG']
+ end
+ end
+ end
+ rescue Error => e
+ puts "Warning: Failed to load dataset #{dataset_config[:path]}: #{e.message}" if ENV['DEBUG']
+ # Continue loading other datasets even if one fails
+ rescue StandardError => e
+ puts "Warning: Unexpected error loading dataset #{dataset_config[:path]}: #{e.message}" if ENV['DEBUG']
+ # Continue loading other datasets even if one fails
+ end
+ end
+
+ puts "Finished loading datasets from constants (#{loaded_count}/#{CATEGORIY_DATABASES.length} loaded)" if ENV['DEBUG']
+ end
+
  def hash_size_in_mb(hash)
+ size_bytes = hash_size_in_bytes(hash)
+ (size_bytes / ONE_MEGABYTE.to_f).round(2)
+ end
+
+ def hash_size_in_bytes(hash)
  size = 0
+ hash.each do |_key, value|
+ next unless value.is_a?(Array)

- hash.each do |key, value|
  size += value.join.length
  end
-
- (size / ONE_MEGABYTE).round(2)
+ size
  end

  def fetch_and_build_host_lists
  @hosts = {}

- host_urls.keys.each do |category|
- @hosts[category] = build_host_data(host_urls[category])
+ (host_urls || {}).keys.each do |category|
+ @hosts[category] = build_host_data((host_urls || {})[category])
  end

  sub_category_values = categories_with_keys
@@ -241,76 +681,145 @@ module UrlCategorise
  @hosts
  end

+ def initialize_smart_rules(custom_rules)
+ custom_rules = {} if custom_rules.nil?
+ default_rules = {
+ social_media_platforms: {
+ domains: %w[reddit.com facebook.com twitter.com x.com instagram.com linkedin.com
+ pinterest.com tiktok.com youtube.com snapchat.com discord.com],
+ remove_categories: %i[health_and_fitness forums news technology education
+ business finance entertainment travel sports politics
+ science music art food_and_drink shopping gaming]
+ },
+ search_engines: {
+ domains: %w[google.com bing.com yahoo.com duckduckgo.com baidu.com yandex.com],
+ remove_categories: %i[news shopping travel health_and_fitness finance technology]
+ },
+ video_platforms: {
+ domains: %w[youtube.com vimeo.com dailymotion.com twitch.tv],
+ remove_categories: %i[education news entertainment music sports gaming]
+ },
+ news_aggregators: {
+ domains: %w[reddit.com digg.com],
+ keep_primary_only: %i[social_media reddit digg]
+ }
+ }
+
+ # Merge custom rules with defaults
+ default_rules.merge(custom_rules)
+ end
+
+ def apply_smart_categorization(url, categories)
+ return categories unless smart_categorization_enabled
+
+ host = extract_host(url)
+
+ smart_rules.each do |_rule_name, rule_config|
+ if rule_config[:domains]&.any? { |domain| host == domain || host.end_with?(".#{domain}") }
+ categories = apply_rule(categories, rule_config, host, url)
+ end
+ end
+
+ categories
+ end
+
+ def apply_rule(categories, rule_config, _host, url)
+ # Rule: Remove overly broad categories for specific platforms
+ if rule_config[:remove_categories]
+ categories = categories.reject { |cat| rule_config[:remove_categories].include?(cat) }
+ end
+
+ # Rule: Keep only primary categories
+ if rule_config[:keep_primary_only]
+ primary_categories = categories & rule_config[:keep_primary_only]
+ categories = primary_categories if primary_categories.any?
+ end
+
+ # Rule: Add specific categories based on URL patterns
+ if rule_config[:add_categories_by_path]
+ rule_config[:add_categories_by_path].each do |path_pattern, additional_categories|
+ categories = (categories + additional_categories).uniq if url.match?(path_pattern)
+ end
+ end
+
+ # Rule: Remove all categories except allowed ones
+ categories &= rule_config[:allowed_categories_only] if rule_config[:allowed_categories_only]
+
+ categories
+ end
+
+ def extract_host(url)
+ (URI.parse(url).host || url).downcase.gsub('www.', '')
+ rescue URI::InvalidURIError
+ url.downcase.gsub('www.', '')
+ end
+
  def build_host_data(urls)
  all_hosts = []
-
+
  urls.each do |url|
  next unless url_valid?(url)
-
+
  hosts_data = nil
-
- if @cache_dir && !@force_download
- hosts_data = read_from_cache(url)
- end
-
+
+ hosts_data = read_from_cache(url) if cache_dir && !force_download
+
  if hosts_data.nil?
  hosts_data = download_and_parse_list(url)
- save_to_cache(url, hosts_data) if @cache_dir
+ save_to_cache(url, hosts_data) if cache_dir
  end
-
+
  all_hosts.concat(hosts_data) if hosts_data
  end
-
+
  all_hosts.compact.sort.uniq
  end

  def download_and_parse_list(url)
- begin
- raw_data = HTTParty.get(url, timeout: @request_timeout)
- return [] if raw_data.body.nil? || raw_data.body.empty?
-
- # Store metadata
- etag = raw_data.headers['etag']
- last_modified = raw_data.headers['last-modified']
- @metadata[url] = {
- last_updated: Time.now,
- etag: etag,
- last_modified: last_modified,
- content_hash: Digest::SHA256.hexdigest(raw_data.body),
- status: 'success'
- }
-
- parse_list_content(raw_data.body, detect_list_format(raw_data.body))
- rescue HTTParty::Error, Net::HTTPError, SocketError, Timeout::Error, URI::InvalidURIError, StandardError => e
- # Log the error but continue with other lists
- @metadata[url] = {
- last_updated: Time.now,
- error: e.message,
- status: 'failed'
- }
- return []
- end
+ raw_data = HTTParty.get(url, timeout: request_timeout)
+ return [] if raw_data.body.nil? || raw_data.body.empty?
+
+ # Store metadata
+ etag = raw_data.headers['etag']
+ last_modified = raw_data.headers['last-modified']
+ @metadata[url] = {
+ last_updated: Time.now,
+ etag: etag,
+ last_modified: last_modified,
+ content_hash: Digest::SHA256.hexdigest(raw_data.body),
+ status: 'success'
+ }
+
+ parse_list_content(raw_data.body, detect_list_format(raw_data.body))
+ rescue HTTParty::Error, Net::HTTPError, SocketError, Timeout::Error, URI::InvalidURIError, StandardError => e
+ # Log the error but continue with other lists
+ @metadata[url] = {
+ last_updated: Time.now,
+ error: e.message,
+ status: 'failed'
+ }
+ []
  end

  def parse_list_content(content, format)
  lines = content.split("\n").reject { |line| line.empty? || line.strip.start_with?('#') }
-
+
  case format
  when :hosts
- lines.map { |line|
+ lines.map do |line|
  parts = line.split(' ')
  # Extract domain from hosts format: "0.0.0.0 domain.com" -> "domain.com"
  parts.length >= 2 ? parts[1].strip : nil
- }.compact.reject(&:empty?)
+ end.compact.reject(&:empty?)
  when :plain
  lines.map(&:strip)
  when :dnsmasq
- lines.map { |line|
- match = line.match(/address=\/(.+?)\//)
+ lines.map do |line|
+ match = line.match(%r{address=/(.+?)/})
  match ? match[1] : nil
- }.compact
+ end.compact
  when :ublock
- lines.map { |line| line.gsub(/^\|\|/, '').gsub(/[\$\^].*$/, '').strip }.reject(&:empty?)
+ lines.map { |line| line.gsub(/^\|\|/, '').gsub(/[$\^].*$/, '').strip }.reject(&:empty?)
  else
  lines.map(&:strip)
  end
@@ -319,88 +828,86 @@ module UrlCategorise
  def detect_list_format(content)
  # Skip comments and empty lines, then look at first 20 non-comment lines
  sample_lines = content.split("\n")
- .reject { |line| line.empty? || line.strip.start_with?('#') }
- .first(20)
-
+ .reject { |line| line.empty? || line.strip.start_with?('#') }
+ .first(20)
+
  return :hosts if sample_lines.any? { |line| line.match(/^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s+/) }
  return :dnsmasq if sample_lines.any? { |line| line.include?('address=/') }
  return :ublock if sample_lines.any? { |line| line.match(/^\|\|/) }
-
+
  :plain
  end

  def cache_file_path(url)
- return nil unless @cache_dir
-
- FileUtils.mkdir_p(@cache_dir) unless Dir.exist?(@cache_dir)
+ return nil unless cache_dir
+
+ FileUtils.mkdir_p(cache_dir) unless Dir.exist?(cache_dir)
  filename = Digest::MD5.hexdigest(url) + '.cache'
- File.join(@cache_dir, filename)
+ File.join(cache_dir, filename)
  end

  def read_from_cache(url)
  cache_file = cache_file_path(url)
  return nil unless cache_file && File.exist?(cache_file)
-
+
  cache_data = Marshal.load(File.read(cache_file))
-
+
  # Check if we should update based on hash or time
- if should_update_cache?(url, cache_data)
- return nil
- end
-
+ return nil if should_update_cache?(url, cache_data)
+
  cache_data[:hosts]
- rescue
+ rescue StandardError
  nil
  end

  def save_to_cache(url, hosts_data)
  cache_file = cache_file_path(url)
  return unless cache_file
-
+
  cache_data = {
  hosts: hosts_data,
  metadata: @metadata[url],
  cached_at: Time.now
  }
-
+
  File.write(cache_file, Marshal.dump(cache_data))
- rescue
+ rescue StandardError
  # Cache save failed, continue without caching
  end

  def should_update_cache?(url, cache_data)
- return true if @force_download
+ return true if force_download
  return true unless cache_data[:metadata]
-
+
  # Update if cache is older than 24 hours
  cache_age = Time.now - cache_data[:cached_at]
  return true if cache_age > 24 * 60 * 60
-
+
  # Check if remote content has changed
  begin
- head_response = HTTParty.head(url, timeout: @request_timeout)
+ head_response = HTTParty.head(url, timeout: request_timeout)
  remote_etag = head_response.headers['etag']
  remote_last_modified = head_response.headers['last-modified']
-
+
  cached_metadata = cache_data[:metadata]
-
+
  return true if remote_etag && cached_metadata[:etag] && remote_etag != cached_metadata[:etag]
- return true if remote_last_modified && cached_metadata[:last_modified] && remote_last_modified != cached_metadata[:last_modified]
+ if remote_last_modified && cached_metadata[:last_modified] && remote_last_modified != cached_metadata[:last_modified]
+ return true
+ end
  rescue HTTParty::Error, Net::HTTPError, SocketError, Timeout::Error, URI::InvalidURIError, StandardError
  # If HEAD request fails, assume we should update
  return true
  end
-
+
  false
  end

- private
-
  def categories_with_keys
  keyed_categories = {}

- host_urls.keys.each do |category|
- category_values = host_urls[category].select do |url|
+ (host_urls || {}).keys.each do |category|
+ category_values = (host_urls || {})[category].select do |url|
  url.is_a?(Symbol)
  end
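Taken together, 0.1.6 layers dataset loading, IAB mapping, smart categorization, and export helpers onto the 0.1.2 blocklist core. A hypothetical end-to-end use of the new public surface, with method names taken from the diff above and everything else assumed:

    client = UrlCategorise::Client.new(smart_categorization: true)
    client.categorise('https://www.reddit.com/r/fitness')
    # => smart rules drop broad matches such as :health_and_fitness for reddit.com
    client.export_hosts_files  # one 0.0.0.0-style hosts file per category, plus a summary file
    client.export_csv_data     # timestamped CSV plus a JSON metadata sidecar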