UrlCategorise 0.1.3 → 0.1.6
This diff shows the changes between publicly released versions of the package, as published to their respective public registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +7 -2
- data/.gitignore +1 -0
- data/CLAUDE.md +77 -2
- data/Gemfile.lock +13 -1
- data/README.md +332 -7
- data/bin/export_csv +83 -0
- data/bin/export_hosts +68 -0
- data/bin/rake +2 -0
- data/correct_usage_example.rb +64 -0
- data/docs/v0.1.4-features.md +215 -0
- data/lib/url_categorise/active_record_client.rb +1 -1
- data/lib/url_categorise/client.rb +431 -33
- data/lib/url_categorise/dataset_processor.rb +9 -4
- data/lib/url_categorise/iab_compliance.rb +147 -0
- data/lib/url_categorise/version.rb +1 -1
- data/lib/url_categorise.rb +2 -0
- data/url_categorise.gemspec +4 -2
- metadata +52 -3
data/lib/url_categorise/client.rb

```diff
@@ -3,6 +3,7 @@ require 'set'
 module UrlCategorise
   class Client < ApiPattern::Client
     include ::UrlCategorise::Constants
+    include ActiveAttr::Model
 
     def self.compatible_api_version
       'v2'
```
```diff
@@ -12,16 +13,35 @@ module UrlCategorise
       'v2 2025-08-23'
     end
 
-
-
+    attribute :host_urls, default: -> { DEFAULT_HOST_URLS }
+    attribute :cache_dir
+    attribute :force_download, type: Boolean, default: false
+    attribute :dns_servers, default: ['1.1.1.1', '1.0.0.1']
+    attribute :request_timeout, type: Integer, default: 10
+    attribute :iab_compliance_enabled, type: Boolean, default: false
+    attribute :iab_version, default: :v3
+    attribute :auto_load_datasets, type: Boolean, default: false
+    attribute :smart_categorization_enabled, type: Boolean, default: false
+    attribute :smart_rules, default: -> { {} }
+
+    attr_reader :hosts, :metadata, :dataset_processor, :dataset_categories
+
+    def initialize(**kwargs)
+      # Extract dataset_config for later use
+      dataset_config = kwargs.fetch(:dataset_config, {})
+
+      # Set ActiveAttr attributes - preserve explicitly passed values including nil
+      self.host_urls = kwargs.key?(:host_urls) ? kwargs[:host_urls] : DEFAULT_HOST_URLS
+      self.cache_dir = kwargs[:cache_dir] # will be nil if not provided or explicitly nil
+      self.force_download = kwargs.key?(:force_download) ? kwargs[:force_download] : false
+      self.dns_servers = kwargs.key?(:dns_servers) ? kwargs[:dns_servers] : ['1.1.1.1', '1.0.0.1']
+      self.request_timeout = kwargs.key?(:request_timeout) ? kwargs[:request_timeout] : 10
+      self.iab_compliance_enabled = kwargs.key?(:iab_compliance) ? kwargs[:iab_compliance] : false
+      self.iab_version = kwargs.key?(:iab_version) ? kwargs[:iab_version] : :v3
+      self.auto_load_datasets = kwargs.key?(:auto_load_datasets) ? kwargs[:auto_load_datasets] : false
+      self.smart_categorization_enabled = kwargs.key?(:smart_categorization) ? kwargs[:smart_categorization] : false
+      self.smart_rules = initialize_smart_rules(kwargs.key?(:smart_rules) ? kwargs[:smart_rules] : {})
 
-    def initialize(host_urls: DEFAULT_HOST_URLS, cache_dir: nil, force_download: false,
-                   dns_servers: ['1.1.1.1', '1.0.0.1'], request_timeout: 10, dataset_config: {})
-      @host_urls = host_urls
-      @cache_dir = cache_dir
-      @force_download = force_download
-      @dns_servers = dns_servers
-      @request_timeout = request_timeout
       @metadata = {}
       @dataset_categories = Set.new # Track which categories come from datasets
 
```
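For context, a minimal usage sketch of the reworked constructor. All keywords are optional and mirror the attribute declarations above; note that the `iab_compliance:` and `smart_categorization:` kwargs feed the `*_enabled` attributes:

```ruby
require 'url_categorise'

client = UrlCategorise::Client.new(
  cache_dir: '/tmp/url_categorise',   # nil disables caching
  force_download: false,
  dns_servers: ['1.1.1.1', '1.0.0.1'],
  request_timeout: 10,
  iab_compliance: true,               # stored as iab_compliance_enabled
  iab_version: :v3,
  smart_categorization: true,         # stored as smart_categorization_enabled
  auto_load_datasets: false
)
```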
```diff
@@ -29,30 +49,48 @@ module UrlCategorise
       @dataset_processor = initialize_dataset_processor(dataset_config) unless dataset_config.empty?
 
       @hosts = fetch_and_build_host_lists
+
+      # Auto-load datasets from constants if enabled
+      load_datasets_from_constants if auto_load_datasets && @dataset_processor
     end
 
     def categorise(url)
       host = (URI.parse(url).host || url).downcase
       host = host.gsub('www.', '')
 
-      @hosts.keys.select do |category|
+      categories = @hosts.keys.select do |category|
         @hosts[category].any? do |blocked_host|
           host == blocked_host || host.end_with?(".#{blocked_host}")
         end
       end
+
+      # Apply smart categorization if enabled
+      categories = apply_smart_categorization(url, categories) if smart_categorization_enabled
+
+      if iab_compliance_enabled
+        IabCompliance.get_iab_categories(categories, iab_version)
+      else
+        categories
+      end
     end
 
     def categorise_ip(ip_address)
-      @hosts.keys.select do |category|
+      categories = @hosts.keys.select do |category|
        @hosts[category].include?(ip_address)
       end
+
+      if iab_compliance_enabled
+        IabCompliance.get_iab_categories(categories, iab_version)
+      else
+        categories
+      end
     end
 
     def resolve_and_categorise(domain)
       categories = categorise(domain)
 
       begin
-        resolver = Resolv::DNS.new(nameserver: @dns_servers)
+        resolver = Resolv::DNS.new(nameserver: dns_servers)
         ip_addresses = resolver.getaddresses(domain).map(&:to_s)
 
         ip_addresses.each do |ip|
```
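With IAB compliance on, both lookup methods translate the matched category symbols through `IabCompliance.get_iab_categories` before returning. A sketch, where the actual return values depend entirely on which host lists and mapping tables are loaded:

```ruby
client = UrlCategorise::Client.new(iab_compliance: true, iab_version: :v3)

client.categorise('https://www.example.com/page')
# => IAB codes (e.g. something like ["IAB25"]) instead of raw list
#    symbols such as [:advertising]; values depend on the loaded lists.

client.categorise_ip('203.0.113.7')
# => [] unless that IP appears in one of the loaded lists
```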
```diff
@@ -79,6 +117,62 @@ module UrlCategorise
       hash_size_in_mb(@hosts)
     end
 
+    def size_of_dataset_data
+      dataset_hosts = {}
+      @dataset_categories.each do |category|
+        dataset_hosts[category] = @hosts[category] || []
+      end
+      hash_size_in_mb(dataset_hosts)
+    end
+
+    def size_of_blocklist_data
+      blocklist_hosts = {}
+      @hosts.each do |category, domains|
+        blocklist_hosts[category] = domains unless @dataset_categories.include?(category)
+      end
+      hash_size_in_mb(blocklist_hosts)
+    end
+
+    def size_of_data_bytes
+      hash_size_in_bytes(@hosts)
+    end
+
+    def size_of_dataset_data_bytes
+      dataset_hosts = {}
+      @dataset_categories.each do |category|
+        dataset_hosts[category] = @hosts[category] || []
+      end
+      hash_size_in_bytes(dataset_hosts)
+    end
+
+    def size_of_blocklist_data_bytes
+      blocklist_hosts = {}
+      @hosts.each do |category, domains|
+        blocklist_hosts[category] = domains unless @dataset_categories.include?(category)
+      end
+      hash_size_in_bytes(blocklist_hosts)
+    end
+
+    def count_of_dataset_hosts
+      @dataset_categories.map do |category|
+        @hosts[category]&.size || 0
+      end.sum
+    end
+
+    def count_of_dataset_categories
+      @dataset_categories.size
+    end
+
+    def iab_compliant?
+      iab_compliance_enabled
+    end
+
+    def get_iab_mapping(category)
+      return nil unless iab_compliance_enabled
+
+      IabCompliance.map_category_to_iab(category, iab_version)
+    end
+
     def check_all_lists
       puts 'Checking all lists in constants...'
 
```
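All of the new helpers are cheap reads over the in-memory `@hosts` hash, split by whether a category was populated from a blocklist or a dataset. A sketch:

```ruby
client.size_of_dataset_data          # MB contributed by dataset-backed categories
client.size_of_blocklist_data        # MB contributed by plain blocklists
client.size_of_data_bytes            # byte-level variant of the total
client.count_of_dataset_hosts        # total hosts across dataset categories
client.count_of_dataset_categories   # how many categories came from datasets
client.iab_compliant?                # => true when built with iab_compliance: true
client.get_iab_mapping(:gambling)    # IAB code for one category, nil if disabled
```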
```diff
@@ -86,7 +180,7 @@ module UrlCategorise
       missing_categories = []
       successful_lists = {}
 
-      host_urls.each do |category, urls|
+      (host_urls || {}).each do |category, urls|
         puts "\nChecking category: #{category}"
 
         if urls.empty?
```
```diff
@@ -114,7 +208,7 @@ module UrlCategorise
           print " 🔍 Testing #{url}... "
 
           begin
-            response = HTTParty.head(url, timeout: @request_timeout, follow_redirects: true)
+            response = HTTParty.head(url, timeout: request_timeout, follow_redirects: true)
 
             case response.code
             when 200
```
```diff
@@ -166,7 +260,7 @@ module UrlCategorise
       puts '=' * 80
 
       puts "\n📊 SUMMARY:"
-      total_categories = host_urls.keys.length
+      total_categories = (host_urls || {}).keys.length
       categories_with_issues = unreachable_lists.keys.length + missing_categories.length
       categories_healthy = total_categories - categories_with_issues
 
```
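`check_all_lists` is a console diagnostic rather than a return-value API: it issues a HEAD request per configured list URL and prints the per-category results plus the summary above. With the `(host_urls || {})` guards it also tolerates a client constructed with `host_urls: nil`. Usage is just:

```ruby
client = UrlCategorise::Client.new
client.check_all_lists # prints reachability results and a summary to stdout
```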
```diff
@@ -266,16 +360,163 @@ module UrlCategorise
         @hosts[category].concat(domains).uniq!
       end
 
+      # Reload datasets from constants if auto-loading is enabled
+      load_datasets_from_constants if auto_load_datasets && @dataset_processor
+
       self
     end
 
+    def export_hosts_files(output_path = nil)
+      export_dir = output_path || (cache_dir ? File.join(cache_dir, 'exports', 'hosts') : File.join(Dir.pwd, 'exports', 'hosts'))
+
+      FileUtils.mkdir_p(export_dir) unless Dir.exist?(export_dir)
+
+      exported_files = {}
+
+      @hosts.each do |category, domains|
+        next if domains.empty?
+
+        filename = "#{category}.hosts"
+        file_path = File.join(export_dir, filename)
+
+        File.open(file_path, 'w') do |file|
+          file.puts "# #{category.to_s.gsub('_', ' ').split.map(&:capitalize).join(' ')} - Generated by UrlCategorise"
+          file.puts "# Generated at: #{Time.now}"
+          file.puts "# Total entries: #{domains.length}"
+          file.puts ""
+
+          domains.sort.each do |domain|
+            file.puts "0.0.0.0 #{domain}"
+          end
+        end
+
+        exported_files[category] = {
+          path: file_path,
+          filename: filename,
+          count: domains.length
+        }
+      end
+
+      # Create summary file
+      summary_path = File.join(export_dir, '_export_summary.txt')
+      File.open(summary_path, 'w') do |file|
+        file.puts "UrlCategorise Hosts Export Summary"
+        file.puts "=================================="
+        file.puts "Generated at: #{Time.now}"
+        file.puts "Export directory: #{export_dir}"
+        file.puts "Total categories: #{exported_files.keys.length}"
+        file.puts "Total domains: #{@hosts.values.map(&:length).sum}"
+        file.puts ""
+        file.puts "Files created:"
+
+        exported_files.each do |category, info|
+          file.puts "  #{info[:filename]} - #{info[:count]} domains"
+        end
+      end
+
+      exported_files[:_summary] = {
+        path: summary_path,
+        total_categories: exported_files.keys.length,
+        total_domains: @hosts.values.map(&:length).sum,
+        export_directory: export_dir
+      }
+
+      exported_files
+    end
+
+    def export_csv_data(output_path = nil)
+      require 'csv'
+
+      export_dir = output_path || (cache_dir ? File.join(cache_dir, 'exports', 'csv') : File.join(Dir.pwd, 'exports', 'csv'))
+
+      FileUtils.mkdir_p(export_dir) unless Dir.exist?(export_dir)
+
+      filename = "url_categorise_data_export_#{Time.now.strftime('%Y%m%d_%H%M%S')}.csv"
+      file_path = File.join(export_dir, filename)
+
+      CSV.open(file_path, 'w', headers: true) do |csv|
+        # Add headers
+        csv << [
+          'domain',
+          'category',
+          'source_type',
+          'is_dataset_category',
+          'iab_category_v2',
+          'iab_category_v3',
+          'export_timestamp',
+          'smart_categorization_enabled'
+        ]
+
+        # Export all host/category data
+        @hosts.each do |category, domains|
+          domains.each do |domain|
+            source_type = @dataset_categories.include?(category) ? 'dataset' : 'blocklist'
+            is_dataset_category = @dataset_categories.include?(category)
+
+            # Get IAB mappings if compliance is enabled
+            iab_v2 = nil
+            iab_v3 = nil
+            if iab_compliance_enabled
+              iab_v2 = IabCompliance.map_category_to_iab(category, :v2)
+              iab_v3 = IabCompliance.map_category_to_iab(category, :v3)
+            end
+
+            csv << [
+              domain,
+              category,
+              source_type,
+              is_dataset_category,
+              iab_v2,
+              iab_v3,
+              Time.now.iso8601,
+              smart_categorization_enabled
+            ]
+          end
+        end
+      end
+
+      # Create metadata file
+      metadata_path = File.join(export_dir, "#{File.basename(filename, '.csv')}_metadata.json")
+      metadata = {
+        export_info: {
+          timestamp: Time.now.iso8601,
+          filename: filename,
+          file_path: file_path,
+          metadata_path: metadata_path
+        },
+        client_settings: {
+          iab_compliance_enabled: iab_compliance_enabled,
+          iab_version: iab_version,
+          smart_categorization_enabled: smart_categorization_enabled,
+          auto_load_datasets: auto_load_datasets
+        },
+        data_summary: {
+          total_domains: @hosts.values.map(&:length).sum,
+          total_categories: @hosts.keys.length,
+          dataset_categories_count: @dataset_categories.size,
+          blocklist_categories_count: @hosts.keys.length - @dataset_categories.size,
+          categories: @hosts.keys.sort.map(&:to_s)
+        },
+        dataset_metadata: dataset_metadata
+      }
+
+      File.write(metadata_path, JSON.pretty_generate(metadata))
+
+      {
+        csv_file: file_path,
+        metadata_file: metadata_path,
+        summary: metadata[:data_summary],
+        export_directory: export_dir
+      }
+    end
+
     private
 
     def initialize_dataset_processor(config)
       processor_config = {
-        download_path: config[:download_path] || @cache_dir&.+(File::SEPARATOR + 'downloads'),
-        cache_path: config[:cache_path] || @cache_dir&.+(File::SEPARATOR + 'datasets'),
-        timeout: config[:timeout] || @request_timeout,
+        download_path: config[:download_path] || cache_dir&.+(File::SEPARATOR + 'downloads'),
+        cache_path: config[:cache_path] || cache_dir&.+(File::SEPARATOR + 'datasets'),
+        timeout: config[:timeout] || request_timeout,
         enable_kaggle: config.fetch(:enable_kaggle, true) # Default to true for backwards compatibility
       }
 
```
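Both exporters default to an `exports/` tree under `cache_dir` (or under the working directory when no cache is configured) and return a hash describing what was written. A usage sketch built from the return shapes above:

```ruby
# Hosts-file export: one 0.0.0.0-format file per category plus a text summary.
hosts_files = client.export_hosts_files
hosts_files.each do |category, info|
  next if category == :_summary
  puts "#{info[:filename]}: #{info[:count]} domains"
end

# CSV export: one row per (domain, category) pair, plus a JSON metadata file.
result = client.export_csv_data('/tmp/exports/csv')
puts result[:csv_file]                  # timestamped .csv path
puts result[:metadata_file]             # matching *_metadata.json path
puts result[:summary][:total_domains]
```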
```diff
@@ -298,6 +539,7 @@ module UrlCategorise
 
     def integrate_dataset(dataset, category_mappings)
       return dataset unless @dataset_processor
+      return nil unless dataset # Handle nil datasets gracefully
 
       categorized_data = @dataset_processor.integrate_dataset_into_categorization(dataset, category_mappings)
 
```
```diff
@@ -324,21 +566,104 @@ module UrlCategorise
       dataset
     end
 
+    def load_datasets_from_constants
+      return unless defined?(CATEGORIY_DATABASES) && CATEGORIY_DATABASES.is_a?(Array)
+      return unless @dataset_processor
+
+      puts "Loading #{CATEGORIY_DATABASES.length} datasets from constants..." if ENV['DEBUG']
+      loaded_count = 0
+
+      CATEGORIY_DATABASES.each do |dataset_config|
+        begin
+          case dataset_config[:type]
+          when :kaggle
+            # Parse the kaggle path to get owner and dataset name
+            path_parts = dataset_config[:path].split('/')
+            next unless path_parts.length == 2
+
+            dataset_owner, dataset_name = path_parts
+
+            # Check if dataset is already cached before attempting to load
+            cache_key = @dataset_processor.send(:generate_cache_key, "#{dataset_owner}/#{dataset_name}", :kaggle)
+            cache_file = File.join(@dataset_processor.cache_path, cache_key)
+
+            if File.exist?(cache_file)
+              puts "Loading cached Kaggle dataset: #{dataset_owner}/#{dataset_name}" if ENV['DEBUG']
+              load_kaggle_dataset(dataset_owner, dataset_name, {
+                use_cache: true,
+                integrate_data: true
+              })
+              loaded_count += 1
+            else
+              puts "Attempting to download missing Kaggle dataset: #{dataset_owner}/#{dataset_name}" if ENV['DEBUG']
+              begin
+                load_kaggle_dataset(dataset_owner, dataset_name, {
+                  use_cache: true,
+                  integrate_data: true
+                })
+                loaded_count += 1
+              rescue Error => e
+                puts "Warning: Failed to download Kaggle dataset #{dataset_owner}/#{dataset_name}: #{e.message}" if ENV['DEBUG']
+              end
+            end
+
+          when :csv
+            # Check if CSV dataset is cached
+            cache_key = @dataset_processor.send(:generate_cache_key, dataset_config[:path], :csv)
+            cache_file = File.join(@dataset_processor.cache_path, cache_key)
+
+            if File.exist?(cache_file)
+              puts "Loading cached CSV dataset: #{dataset_config[:path]}" if ENV['DEBUG']
+              load_csv_dataset(dataset_config[:path], {
+                use_cache: true,
+                integrate_data: true
+              })
+              loaded_count += 1
+            else
+              puts "Attempting to download missing CSV dataset: #{dataset_config[:path]}" if ENV['DEBUG']
+              begin
+                load_csv_dataset(dataset_config[:path], {
+                  use_cache: true,
+                  integrate_data: true
+                })
+                loaded_count += 1
+              rescue Error => e
+                puts "Warning: Failed to download CSV dataset #{dataset_config[:path]}: #{e.message}" if ENV['DEBUG']
+              end
+            end
+          end
+        rescue Error => e
+          puts "Warning: Failed to load dataset #{dataset_config[:path]}: #{e.message}" if ENV['DEBUG']
+          # Continue loading other datasets even if one fails
+        rescue StandardError => e
+          puts "Warning: Unexpected error loading dataset #{dataset_config[:path]}: #{e.message}" if ENV['DEBUG']
+          # Continue loading other datasets even if one fails
+        end
+      end
+
+      puts "Finished loading datasets from constants (#{loaded_count}/#{CATEGORIY_DATABASES.length} loaded)" if ENV['DEBUG']
+    end
+
     def hash_size_in_mb(hash)
-      size = 0
+      size_bytes = hash_size_in_bytes(hash)
+      (size_bytes / ONE_MEGABYTE.to_f).round(2)
+    end
 
+    def hash_size_in_bytes(hash)
+      size = 0
       hash.each do |_key, value|
+        next unless value.is_a?(Array)
+
         size += value.join.length
       end
-
-      (size / ONE_MEGABYTE).round(2)
+      size
     end
 
     def fetch_and_build_host_lists
       @hosts = {}
 
-      host_urls.keys.each do |category|
-        @hosts[category] = build_host_data(host_urls[category])
+      (host_urls || {}).keys.each do |category|
+        @hosts[category] = build_host_data((host_urls || {})[category])
       end
 
       sub_category_values = categories_with_keys
```
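Auto-loading walks the `CATEGORIY_DATABASES` constant (the misspelling is the actual constant name in the gem), preferring cached copies and only hitting Kaggle or the CSV URLs on cache misses. It requires a dataset processor, so `dataset_config` must be non-empty. A sketch using the processor keys from the hunk above:

```ruby
client = UrlCategorise::Client.new(
  cache_dir: '/tmp/url_categorise',
  auto_load_datasets: true,
  dataset_config: {
    download_path: '/tmp/url_categorise/downloads',
    cache_path: '/tmp/url_categorise/datasets',
    timeout: 30,
    enable_kaggle: true
  }
)
# Set DEBUG=1 in the environment to see per-dataset progress messages.
```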
```diff
@@ -356,6 +681,79 @@ module UrlCategorise
       @hosts
     end
 
+    def initialize_smart_rules(custom_rules)
+      custom_rules = {} if custom_rules.nil?
+      default_rules = {
+        social_media_platforms: {
+          domains: %w[reddit.com facebook.com twitter.com x.com instagram.com linkedin.com
+                      pinterest.com tiktok.com youtube.com snapchat.com discord.com],
+          remove_categories: %i[health_and_fitness forums news technology education
+                                business finance entertainment travel sports politics
+                                science music art food_and_drink shopping gaming]
+        },
+        search_engines: {
+          domains: %w[google.com bing.com yahoo.com duckduckgo.com baidu.com yandex.com],
+          remove_categories: %i[news shopping travel health_and_fitness finance technology]
+        },
+        video_platforms: {
+          domains: %w[youtube.com vimeo.com dailymotion.com twitch.tv],
+          remove_categories: %i[education news entertainment music sports gaming]
+        },
+        news_aggregators: {
+          domains: %w[reddit.com digg.com],
+          keep_primary_only: %i[social_media reddit digg]
+        }
+      }
+
+      # Merge custom rules with defaults
+      default_rules.merge(custom_rules)
+    end
+
+    def apply_smart_categorization(url, categories)
+      return categories unless smart_categorization_enabled
+
+      host = extract_host(url)
+
+      smart_rules.each do |_rule_name, rule_config|
+        if rule_config[:domains]&.any? { |domain| host == domain || host.end_with?(".#{domain}") }
+          categories = apply_rule(categories, rule_config, host, url)
+        end
+      end
+
+      categories
+    end
+
+    def apply_rule(categories, rule_config, _host, url)
+      # Rule: Remove overly broad categories for specific platforms
+      if rule_config[:remove_categories]
+        categories = categories.reject { |cat| rule_config[:remove_categories].include?(cat) }
+      end
+
+      # Rule: Keep only primary categories
+      if rule_config[:keep_primary_only]
+        primary_categories = categories & rule_config[:keep_primary_only]
+        categories = primary_categories if primary_categories.any?
+      end
+
+      # Rule: Add specific categories based on URL patterns
+      if rule_config[:add_categories_by_path]
+        rule_config[:add_categories_by_path].each do |path_pattern, additional_categories|
+          categories = (categories + additional_categories).uniq if url.match?(path_pattern)
+        end
+      end
+
+      # Rule: Remove all categories except allowed ones
+      categories &= rule_config[:allowed_categories_only] if rule_config[:allowed_categories_only]
+
+      categories
+    end
+
+    def extract_host(url)
+      (URI.parse(url).host || url).downcase.gsub('www.', '')
+    rescue URI::InvalidURIError
+      url.downcase.gsub('www.', '')
+    end
+
     def build_host_data(urls)
       all_hosts = []
 
```
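Because `initialize_smart_rules` uses a plain `Hash#merge`, a custom rule under an existing key replaces that built-in rule wholesale rather than deep-merging into it. A sketch covering the four options `apply_rule` understands (`:developer_docs` is a hypothetical rule name for illustration):

```ruby
client = UrlCategorise::Client.new(
  smart_categorization: true,
  smart_rules: {
    # Overrides the built-in :search_engines rule entirely.
    search_engines: {
      domains: %w[google.com bing.com],
      remove_categories: %i[news shopping]
    },
    # Hypothetical extra rule showing the remaining options.
    developer_docs: {
      domains: %w[docs.example.com],
      keep_primary_only: %i[technology],
      add_categories_by_path: { %r{/api/} => %i[technology] },
      allowed_categories_only: %i[technology education]
    }
  }
)
```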
```diff
@@ -364,11 +762,11 @@ module UrlCategorise
 
         hosts_data = nil
 
-        hosts_data = read_from_cache(url) if @cache_dir && !@force_download
+        hosts_data = read_from_cache(url) if cache_dir && !force_download
 
         if hosts_data.nil?
           hosts_data = download_and_parse_list(url)
-          save_to_cache(url, hosts_data) if @cache_dir
+          save_to_cache(url, hosts_data) if cache_dir
         end
 
         all_hosts.concat(hosts_data) if hosts_data
```
```diff
@@ -378,7 +776,7 @@ module UrlCategorise
     end
 
     def download_and_parse_list(url)
-      raw_data = HTTParty.get(url, timeout: @request_timeout)
+      raw_data = HTTParty.get(url, timeout: request_timeout)
       return [] if raw_data.body.nil? || raw_data.body.empty?
 
       # Store metadata
```
```diff
@@ -441,11 +839,11 @@ module UrlCategorise
     end
 
     def cache_file_path(url)
-      return nil unless @cache_dir
+      return nil unless cache_dir
 
-      FileUtils.mkdir_p(@cache_dir) unless Dir.exist?(@cache_dir)
+      FileUtils.mkdir_p(cache_dir) unless Dir.exist?(cache_dir)
       filename = Digest::MD5.hexdigest(url) + '.cache'
-      File.join(@cache_dir, filename)
+      File.join(cache_dir, filename)
     end
 
     def read_from_cache(url)
```
```diff
@@ -478,7 +876,7 @@ module UrlCategorise
     end
 
     def should_update_cache?(url, cache_data)
-      return true if @force_download
+      return true if force_download
       return true unless cache_data[:metadata]
 
       # Update if cache is older than 24 hours
```
```diff
@@ -487,7 +885,7 @@ module UrlCategorise
 
       # Check if remote content has changed
       begin
-        head_response = HTTParty.head(url, timeout: @request_timeout)
+        head_response = HTTParty.head(url, timeout: request_timeout)
         remote_etag = head_response.headers['etag']
         remote_last_modified = head_response.headers['last-modified']
 
```
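The cache layer is keyed on the MD5 of each list URL and is active only when `cache_dir` is set; staleness is re-checked via age (24 hours) and the remote ETag/Last-Modified headers shown above, while `force_download` bypasses cache reads entirely. A sketch:

```ruby
# First build downloads all lists and writes *.cache files under cache_dir.
client = UrlCategorise::Client.new(cache_dir: '/var/cache/url_categorise')

# A later build reuses the cache, refreshing only stale or changed lists.
# force_download: true ignores cached copies and re-fetches everything.
fresh = UrlCategorise::Client.new(
  cache_dir: '/var/cache/url_categorise',
  force_download: true
)
```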
```diff
@@ -508,8 +906,8 @@ module UrlCategorise
     def categories_with_keys
       keyed_categories = {}
 
-      host_urls.keys.each do |category|
-        category_values = host_urls[category].select do |url|
+      (host_urls || {}).keys.each do |category|
+        category_values = (host_urls || {})[category].select do |url|
           url.is_a?(Symbol)
         end
 
```
data/lib/url_categorise/dataset_processor.rb

```diff
@@ -58,11 +58,16 @@ module UrlCategorise
         return handle_existing_dataset(extracted_dir, options)
       end
 
-      #
+      # If credentials not available, return nil gracefully for cache mode
       unless kaggle_credentials_available?
-
-
-
+        if options[:use_cache]
+          puts "Warning: Kaggle dataset '#{dataset_path}' not cached and no credentials available" if ENV['DEBUG']
+          return nil
+        else
+          raise Error, 'Kaggle credentials required for downloading new datasets. ' \
+                       'Set KAGGLE_USERNAME/KAGGLE_KEY environment variables, provide credentials explicitly, ' \
+                       'or place kaggle.json file in ~/.kaggle/ directory.'
+        end
      end
 
      # Download from Kaggle API
```
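With `use_cache`, a missing dataset plus missing credentials now degrades to a `nil` dataset (which `integrate_dataset` also tolerates) instead of raising. Credentials can come from the environment, explicit configuration, or `~/.kaggle/kaggle.json`; an environment-based sketch with placeholder values:

```ruby
# Assumes a Kaggle API token; without one, cached datasets still load and
# uncached ones are skipped with a warning when use_cache is in effect.
ENV['KAGGLE_USERNAME'] = 'example_user'   # placeholder credentials
ENV['KAGGLE_KEY']      = 'example_key'

client = UrlCategorise::Client.new(
  cache_dir: '/tmp/url_categorise',
  auto_load_datasets: true,
  dataset_config: { enable_kaggle: true }
)
```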