UrlCategorise 0.1.3 → 0.1.6

This diff shows the changes between publicly available package versions as released to their public registry, and is provided for informational purposes only.
@@ -3,6 +3,7 @@ require 'set'
 module UrlCategorise
   class Client < ApiPattern::Client
     include ::UrlCategorise::Constants
+    include ActiveAttr::Model

     def self.compatible_api_version
       'v2'
@@ -12,16 +13,35 @@ module UrlCategorise
       'v2 2025-08-23'
     end

-    attr_reader :host_urls, :hosts, :cache_dir, :force_download, :dns_servers, :metadata, :request_timeout,
-                :dataset_processor, :dataset_categories
+    attribute :host_urls, default: -> { DEFAULT_HOST_URLS }
+    attribute :cache_dir
+    attribute :force_download, type: Boolean, default: false
+    attribute :dns_servers, default: ['1.1.1.1', '1.0.0.1']
+    attribute :request_timeout, type: Integer, default: 10
+    attribute :iab_compliance_enabled, type: Boolean, default: false
+    attribute :iab_version, default: :v3
+    attribute :auto_load_datasets, type: Boolean, default: false
+    attribute :smart_categorization_enabled, type: Boolean, default: false
+    attribute :smart_rules, default: -> { {} }
+
+    attr_reader :hosts, :metadata, :dataset_processor, :dataset_categories
+
+    def initialize(**kwargs)
+      # Extract dataset_config for later use
+      dataset_config = kwargs.fetch(:dataset_config, {})
+
+      # Set ActiveAttr attributes - preserve explicitly passed values including nil
+      self.host_urls = kwargs.key?(:host_urls) ? kwargs[:host_urls] : DEFAULT_HOST_URLS
+      self.cache_dir = kwargs[:cache_dir] # will be nil if not provided or explicitly nil
+      self.force_download = kwargs.key?(:force_download) ? kwargs[:force_download] : false
+      self.dns_servers = kwargs.key?(:dns_servers) ? kwargs[:dns_servers] : ['1.1.1.1', '1.0.0.1']
+      self.request_timeout = kwargs.key?(:request_timeout) ? kwargs[:request_timeout] : 10
+      self.iab_compliance_enabled = kwargs.key?(:iab_compliance) ? kwargs[:iab_compliance] : false
+      self.iab_version = kwargs.key?(:iab_version) ? kwargs[:iab_version] : :v3
+      self.auto_load_datasets = kwargs.key?(:auto_load_datasets) ? kwargs[:auto_load_datasets] : false
+      self.smart_categorization_enabled = kwargs.key?(:smart_categorization) ? kwargs[:smart_categorization] : false
+      self.smart_rules = initialize_smart_rules(kwargs.key?(:smart_rules) ? kwargs[:smart_rules] : {})

-    def initialize(host_urls: DEFAULT_HOST_URLS, cache_dir: nil, force_download: false,
-                   dns_servers: ['1.1.1.1', '1.0.0.1'], request_timeout: 10, dataset_config: {})
-      @host_urls = host_urls
-      @cache_dir = cache_dir
-      @force_download = force_download
-      @dns_servers = dns_servers
-      @request_timeout = request_timeout
       @metadata = {}
       @dataset_categories = Set.new # Track which categories come from datasets

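For orientation, a minimal usage sketch of the reworked constructor (hypothetical values; note that the iab_compliance and smart_categorization keyword arguments map onto the iab_compliance_enabled and smart_categorization_enabled attributes, per the hunk above):

    require 'url_categorise'

    client = UrlCategorise::Client.new(
      cache_dir: '/tmp/url_categorise',  # nil by default; enables on-disk list caching
      force_download: false,
      request_timeout: 10,
      iab_compliance: true,              # stored as iab_compliance_enabled
      iab_version: :v3,
      smart_categorization: true         # stored as smart_categorization_enabled
    )
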
@@ -29,30 +49,48 @@ module UrlCategorise
       @dataset_processor = initialize_dataset_processor(dataset_config) unless dataset_config.empty?

       @hosts = fetch_and_build_host_lists
+
+      # Auto-load datasets from constants if enabled
+      load_datasets_from_constants if auto_load_datasets && @dataset_processor
     end

     def categorise(url)
       host = (URI.parse(url).host || url).downcase
       host = host.gsub('www.', '')

-      @hosts.keys.select do |category|
+      categories = @hosts.keys.select do |category|
         @hosts[category].any? do |blocked_host|
           host == blocked_host || host.end_with?(".#{blocked_host}")
         end
       end
+
+      # Apply smart categorization if enabled
+      categories = apply_smart_categorization(url, categories) if smart_categorization_enabled
+
+      if iab_compliance_enabled
+        IabCompliance.get_iab_categories(categories, iab_version)
+      else
+        categories
+      end
     end

     def categorise_ip(ip_address)
-      @hosts.keys.select do |category|
+      categories = @hosts.keys.select do |category|
         @hosts[category].include?(ip_address)
       end
+
+      if iab_compliance_enabled
+        IabCompliance.get_iab_categories(categories, iab_version)
+      else
+        categories
+      end
     end

     def resolve_and_categorise(domain)
       categories = categorise(domain)

       begin
-        resolver = Resolv::DNS.new(nameserver: @dns_servers)
+        resolver = Resolv::DNS.new(nameserver: dns_servers)
         ip_addresses = resolver.getaddresses(domain).map(&:to_s)

         ip_addresses.each do |ip|
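
A sketch of the new return-value behaviour of categorise (hypothetical domain and category names; real results depend on the configured host lists): matched categories are now passed through smart categorization and, when IAB compliance is enabled, translated via IabCompliance.get_iab_categories.

    plain = UrlCategorise::Client.new
    plain.categorise('https://ads.example.com/banner')
    # => e.g. [:advertising]  (hypothetical category symbols)

    iab = UrlCategorise::Client.new(iab_compliance: true, iab_version: :v3)
    iab.categorise('https://ads.example.com/banner')
    # => the same matches mapped to IAB v3 taxonomy identifiers
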
@@ -79,6 +117,62 @@ module UrlCategorise
       hash_size_in_mb(@hosts)
     end

+    def size_of_dataset_data
+      dataset_hosts = {}
+      @dataset_categories.each do |category|
+        dataset_hosts[category] = @hosts[category] || []
+      end
+      hash_size_in_mb(dataset_hosts)
+    end
+
+    def size_of_blocklist_data
+      blocklist_hosts = {}
+      @hosts.each do |category, domains|
+        blocklist_hosts[category] = domains unless @dataset_categories.include?(category)
+      end
+      hash_size_in_mb(blocklist_hosts)
+    end
+
+    def size_of_data_bytes
+      hash_size_in_bytes(@hosts)
+    end
+
+    def size_of_dataset_data_bytes
+      dataset_hosts = {}
+      @dataset_categories.each do |category|
+        dataset_hosts[category] = @hosts[category] || []
+      end
+      hash_size_in_bytes(dataset_hosts)
+    end
+
+    def size_of_blocklist_data_bytes
+      blocklist_hosts = {}
+      @hosts.each do |category, domains|
+        blocklist_hosts[category] = domains unless @dataset_categories.include?(category)
+      end
+      hash_size_in_bytes(blocklist_hosts)
+    end
+
+    def count_of_dataset_hosts
+      @dataset_categories.map do |category|
+        @hosts[category]&.size || 0
+      end.sum
+    end
+
+    def count_of_dataset_categories
+      @dataset_categories.size
+    end
+
+    def iab_compliant?
+      iab_compliance_enabled
+    end
+
+    def get_iab_mapping(category)
+      return nil unless iab_compliance_enabled
+
+      IabCompliance.map_category_to_iab(category, iab_version)
+    end
+
     def check_all_lists
       puts 'Checking all lists in constants...'

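The new introspection helpers split the size accounting between blocklist-derived and dataset-derived categories; a brief sketch of the call surface added above (values depend on loaded data):

    client.size_of_data              # total host data, in MB
    client.size_of_blocklist_data    # MB from blocklist categories only
    client.size_of_dataset_data      # MB from dataset categories only
    client.size_of_data_bytes        # same accounting in raw bytes
    client.count_of_dataset_hosts    # hosts contributed by datasets
    client.count_of_dataset_categories
    client.iab_compliant?            # true when iab_compliance: true was passed
    client.get_iab_mapping(:advertising)  # per-category IAB code, or nil
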
@@ -86,7 +180,7 @@ module UrlCategorise
       missing_categories = []
       successful_lists = {}

-      @host_urls.each do |category, urls|
+      (host_urls || {}).each do |category, urls|
         puts "\nChecking category: #{category}"

         if urls.empty?
@@ -114,7 +208,7 @@ module UrlCategorise
           print "  🔍 Testing #{url}... "

           begin
-            response = HTTParty.head(url, timeout: @request_timeout, follow_redirects: true)
+            response = HTTParty.head(url, timeout: request_timeout, follow_redirects: true)

             case response.code
             when 200
@@ -166,7 +260,7 @@ module UrlCategorise
       puts '=' * 80

       puts "\n📊 SUMMARY:"
-      total_categories = @host_urls.keys.length
+      total_categories = (host_urls || {}).keys.length
       categories_with_issues = unreachable_lists.keys.length + missing_categories.length
       categories_healthy = total_categories - categories_with_issues

@@ -266,16 +360,163 @@ module UrlCategorise
         @hosts[category].concat(domains).uniq!
       end

+      # Reload datasets from constants if auto-loading is enabled
+      load_datasets_from_constants if auto_load_datasets && @dataset_processor
+
       self
     end

+    def export_hosts_files(output_path = nil)
+      export_dir = output_path || (cache_dir ? File.join(cache_dir, 'exports', 'hosts') : File.join(Dir.pwd, 'exports', 'hosts'))
+
+      FileUtils.mkdir_p(export_dir) unless Dir.exist?(export_dir)
+
+      exported_files = {}
+
+      @hosts.each do |category, domains|
+        next if domains.empty?
+
+        filename = "#{category}.hosts"
+        file_path = File.join(export_dir, filename)
+
+        File.open(file_path, 'w') do |file|
+          file.puts "# #{category.to_s.gsub('_', ' ').split.map(&:capitalize).join(' ')} - Generated by UrlCategorise"
+          file.puts "# Generated at: #{Time.now}"
+          file.puts "# Total entries: #{domains.length}"
+          file.puts ""
+
+          domains.sort.each do |domain|
+            file.puts "0.0.0.0 #{domain}"
+          end
+        end
+
+        exported_files[category] = {
+          path: file_path,
+          filename: filename,
+          count: domains.length
+        }
+      end
+
+      # Create summary file
+      summary_path = File.join(export_dir, '_export_summary.txt')
+      File.open(summary_path, 'w') do |file|
+        file.puts "UrlCategorise Hosts Export Summary"
+        file.puts "=================================="
+        file.puts "Generated at: #{Time.now}"
+        file.puts "Export directory: #{export_dir}"
+        file.puts "Total categories: #{exported_files.keys.length}"
+        file.puts "Total domains: #{@hosts.values.map(&:length).sum}"
+        file.puts ""
+        file.puts "Files created:"
+
+        exported_files.each do |category, info|
+          file.puts "  #{info[:filename]} - #{info[:count]} domains"
+        end
+      end
+
+      exported_files[:_summary] = {
+        path: summary_path,
+        total_categories: exported_files.keys.length,
+        total_domains: @hosts.values.map(&:length).sum,
+        export_directory: export_dir
+      }
+
+      exported_files
+    end
+
+    def export_csv_data(output_path = nil)
+      require 'csv'
+
+      export_dir = output_path || (cache_dir ? File.join(cache_dir, 'exports', 'csv') : File.join(Dir.pwd, 'exports', 'csv'))
+
+      FileUtils.mkdir_p(export_dir) unless Dir.exist?(export_dir)
+
+      filename = "url_categorise_data_export_#{Time.now.strftime('%Y%m%d_%H%M%S')}.csv"
+      file_path = File.join(export_dir, filename)
+
+      CSV.open(file_path, 'w', headers: true) do |csv|
+        # Add headers
+        csv << [
+          'domain',
+          'category',
+          'source_type',
+          'is_dataset_category',
+          'iab_category_v2',
+          'iab_category_v3',
+          'export_timestamp',
+          'smart_categorization_enabled'
+        ]
+
+        # Export all host/category data
+        @hosts.each do |category, domains|
+          domains.each do |domain|
+            source_type = @dataset_categories.include?(category) ? 'dataset' : 'blocklist'
+            is_dataset_category = @dataset_categories.include?(category)
+
+            # Get IAB mappings if compliance is enabled
+            iab_v2 = nil
+            iab_v3 = nil
+            if iab_compliance_enabled
+              iab_v2 = IabCompliance.map_category_to_iab(category, :v2)
+              iab_v3 = IabCompliance.map_category_to_iab(category, :v3)
+            end
+
+            csv << [
+              domain,
+              category,
+              source_type,
+              is_dataset_category,
+              iab_v2,
+              iab_v3,
+              Time.now.iso8601,
+              smart_categorization_enabled
+            ]
+          end
+        end
+      end
+
+      # Create metadata file
+      metadata_path = File.join(export_dir, "#{File.basename(filename, '.csv')}_metadata.json")
+      metadata = {
+        export_info: {
+          timestamp: Time.now.iso8601,
+          filename: filename,
+          file_path: file_path,
+          metadata_path: metadata_path
+        },
+        client_settings: {
+          iab_compliance_enabled: iab_compliance_enabled,
+          iab_version: iab_version,
+          smart_categorization_enabled: smart_categorization_enabled,
+          auto_load_datasets: auto_load_datasets
+        },
+        data_summary: {
+          total_domains: @hosts.values.map(&:length).sum,
+          total_categories: @hosts.keys.length,
+          dataset_categories_count: @dataset_categories.size,
+          blocklist_categories_count: @hosts.keys.length - @dataset_categories.size,
+          categories: @hosts.keys.sort.map(&:to_s)
+        },
+        dataset_metadata: dataset_metadata
+      }
+
+      File.write(metadata_path, JSON.pretty_generate(metadata))
+
+      {
+        csv_file: file_path,
+        metadata_file: metadata_path,
+        summary: metadata[:data_summary],
+        export_directory: export_dir
+      }
+    end
+
     private

     def initialize_dataset_processor(config)
       processor_config = {
-        download_path: config[:download_path] || @cache_dir&.+(File::SEPARATOR + 'downloads'),
-        cache_path: config[:cache_path] || @cache_dir&.+(File::SEPARATOR + 'datasets'),
-        timeout: config[:timeout] || @request_timeout,
+        download_path: config[:download_path] || cache_dir&.+(File::SEPARATOR + 'downloads'),
+        cache_path: config[:cache_path] || cache_dir&.+(File::SEPARATOR + 'datasets'),
+        timeout: config[:timeout] || request_timeout,
         enable_kaggle: config.fetch(:enable_kaggle, true) # Default to true for backwards compatibility
       }

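A usage sketch for the two new export entry points (paths are hypothetical; with no argument the defaults shown above are used, rooted at cache_dir or the working directory):

    hosts_result = client.export_hosts_files('/tmp/exports/hosts')
    # one 0.0.0.0-format .hosts file per category, plus _export_summary.txt
    hosts_result[:_summary][:total_domains]

    csv_result = client.export_csv_data('/tmp/exports/csv')
    # timestamped CSV plus a *_metadata.json sidecar
    csv_result[:csv_file]
    csv_result[:metadata_file]
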
@@ -298,6 +539,7 @@ module UrlCategorise

     def integrate_dataset(dataset, category_mappings)
       return dataset unless @dataset_processor
+      return nil unless dataset # Handle nil datasets gracefully

       categorized_data = @dataset_processor.integrate_dataset_into_categorization(dataset, category_mappings)

@@ -324,21 +566,104 @@ module UrlCategorise
       dataset
     end

+    def load_datasets_from_constants
+      return unless defined?(CATEGORIY_DATABASES) && CATEGORIY_DATABASES.is_a?(Array)
+      return unless @dataset_processor
+
+      puts "Loading #{CATEGORIY_DATABASES.length} datasets from constants..." if ENV['DEBUG']
+      loaded_count = 0
+
+      CATEGORIY_DATABASES.each do |dataset_config|
+        begin
+          case dataset_config[:type]
+          when :kaggle
+            # Parse the kaggle path to get owner and dataset name
+            path_parts = dataset_config[:path].split('/')
+            next unless path_parts.length == 2
+
+            dataset_owner, dataset_name = path_parts
+
+            # Check if dataset is already cached before attempting to load
+            cache_key = @dataset_processor.send(:generate_cache_key, "#{dataset_owner}/#{dataset_name}", :kaggle)
+            cache_file = File.join(@dataset_processor.cache_path, cache_key)
+
+            if File.exist?(cache_file)
+              puts "Loading cached Kaggle dataset: #{dataset_owner}/#{dataset_name}" if ENV['DEBUG']
+              load_kaggle_dataset(dataset_owner, dataset_name, {
+                use_cache: true,
+                integrate_data: true
+              })
+              loaded_count += 1
+            else
+              puts "Attempting to download missing Kaggle dataset: #{dataset_owner}/#{dataset_name}" if ENV['DEBUG']
+              begin
+                load_kaggle_dataset(dataset_owner, dataset_name, {
+                  use_cache: true,
+                  integrate_data: true
+                })
+                loaded_count += 1
+              rescue Error => e
+                puts "Warning: Failed to download Kaggle dataset #{dataset_owner}/#{dataset_name}: #{e.message}" if ENV['DEBUG']
+              end
+            end
+
+          when :csv
+            # Check if CSV dataset is cached
+            cache_key = @dataset_processor.send(:generate_cache_key, dataset_config[:path], :csv)
+            cache_file = File.join(@dataset_processor.cache_path, cache_key)
+
+            if File.exist?(cache_file)
+              puts "Loading cached CSV dataset: #{dataset_config[:path]}" if ENV['DEBUG']
+              load_csv_dataset(dataset_config[:path], {
+                use_cache: true,
+                integrate_data: true
+              })
+              loaded_count += 1
+            else
+              puts "Attempting to download missing CSV dataset: #{dataset_config[:path]}" if ENV['DEBUG']
+              begin
+                load_csv_dataset(dataset_config[:path], {
+                  use_cache: true,
+                  integrate_data: true
+                })
+                loaded_count += 1
+              rescue Error => e
+                puts "Warning: Failed to download CSV dataset #{dataset_config[:path]}: #{e.message}" if ENV['DEBUG']
+              end
+            end
+          end
+        rescue Error => e
+          puts "Warning: Failed to load dataset #{dataset_config[:path]}: #{e.message}" if ENV['DEBUG']
+          # Continue loading other datasets even if one fails
+        rescue StandardError => e
+          puts "Warning: Unexpected error loading dataset #{dataset_config[:path]}: #{e.message}" if ENV['DEBUG']
+          # Continue loading other datasets even if one fails
+        end
+      end
+
+      puts "Finished loading datasets from constants (#{loaded_count}/#{CATEGORIY_DATABASES.length} loaded)" if ENV['DEBUG']
+    end
+
     def hash_size_in_mb(hash)
-      size = 0
+      size_bytes = hash_size_in_bytes(hash)
+      (size_bytes / ONE_MEGABYTE.to_f).round(2)
+    end

+    def hash_size_in_bytes(hash)
+      size = 0
       hash.each do |_key, value|
+        next unless value.is_a?(Array)
+
         size += value.join.length
       end
-
-      (size / ONE_MEGABYTE).round(2)
+      size
     end

     def fetch_and_build_host_lists
       @hosts = {}

-      host_urls.keys.each do |category|
-        @hosts[category] = build_host_data(host_urls[category])
+      (host_urls || {}).keys.each do |category|
+        @hosts[category] = build_host_data((host_urls || {})[category])
       end

       sub_category_values = categories_with_keys
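
Note the size heuristic in hash_size_in_bytes above: it sums only the concatenated character length of each category's host array, skipping non-array values. A worked example (assuming ONE_MEGABYTE is 1024 * 1024):

    hosts = { advertising: %w[a.com bb.com] }
    # hash_size_in_bytes(hosts)  # => 11  ('a.com' + 'bb.com' joined)
    # hash_size_in_mb(hosts)     # => (11 / 1_048_576.0).round(2) => 0.0
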
@@ -356,6 +681,79 @@ module UrlCategorise
       @hosts
     end

+    def initialize_smart_rules(custom_rules)
+      custom_rules = {} if custom_rules.nil?
+      default_rules = {
+        social_media_platforms: {
+          domains: %w[reddit.com facebook.com twitter.com x.com instagram.com linkedin.com
+                      pinterest.com tiktok.com youtube.com snapchat.com discord.com],
+          remove_categories: %i[health_and_fitness forums news technology education
+                                business finance entertainment travel sports politics
+                                science music art food_and_drink shopping gaming]
+        },
+        search_engines: {
+          domains: %w[google.com bing.com yahoo.com duckduckgo.com baidu.com yandex.com],
+          remove_categories: %i[news shopping travel health_and_fitness finance technology]
+        },
+        video_platforms: {
+          domains: %w[youtube.com vimeo.com dailymotion.com twitch.tv],
+          remove_categories: %i[education news entertainment music sports gaming]
+        },
+        news_aggregators: {
+          domains: %w[reddit.com digg.com],
+          keep_primary_only: %i[social_media reddit digg]
+        }
+      }
+
+      # Merge custom rules with defaults
+      default_rules.merge(custom_rules)
+    end
+
+    def apply_smart_categorization(url, categories)
+      return categories unless smart_categorization_enabled
+
+      host = extract_host(url)
+
+      smart_rules.each do |_rule_name, rule_config|
+        if rule_config[:domains]&.any? { |domain| host == domain || host.end_with?(".#{domain}") }
+          categories = apply_rule(categories, rule_config, host, url)
+        end
+      end
+
+      categories
+    end
+
+    def apply_rule(categories, rule_config, _host, url)
+      # Rule: Remove overly broad categories for specific platforms
+      if rule_config[:remove_categories]
+        categories = categories.reject { |cat| rule_config[:remove_categories].include?(cat) }
+      end
+
+      # Rule: Keep only primary categories
+      if rule_config[:keep_primary_only]
+        primary_categories = categories & rule_config[:keep_primary_only]
+        categories = primary_categories if primary_categories.any?
+      end
+
+      # Rule: Add specific categories based on URL patterns
+      if rule_config[:add_categories_by_path]
+        rule_config[:add_categories_by_path].each do |path_pattern, additional_categories|
+          categories = (categories + additional_categories).uniq if url.match?(path_pattern)
+        end
+      end
+
+      # Rule: Remove all categories except allowed ones
+      categories &= rule_config[:allowed_categories_only] if rule_config[:allowed_categories_only]
+
+      categories
+    end
+
+    def extract_host(url)
+      (URI.parse(url).host || url).downcase.gsub('www.', '')
+    rescue URI::InvalidURIError
+      url.downcase.gsub('www.', '')
+    end
+
     def build_host_data(urls)
       all_hosts = []

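A sketch of supplying custom smart rules (hypothetical rule name and contents). Because the defaults are combined with Hash#merge, a custom rule that reuses a default rule's key replaces that rule wholesale rather than deep-merging into it:

    client = UrlCategorise::Client.new(
      smart_categorization: true,
      smart_rules: {
        developer_hubs: {                      # hypothetical rule
          domains: %w[github.com gitlab.com],
          remove_categories: %i[social_media forums]
        }
      }
    )
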
@@ -364,11 +762,11 @@ module UrlCategorise

       hosts_data = nil

-      hosts_data = read_from_cache(url) if @cache_dir && !@force_download
+      hosts_data = read_from_cache(url) if cache_dir && !force_download

       if hosts_data.nil?
         hosts_data = download_and_parse_list(url)
-        save_to_cache(url, hosts_data) if @cache_dir
+        save_to_cache(url, hosts_data) if cache_dir
       end

       all_hosts.concat(hosts_data) if hosts_data
@@ -378,7 +776,7 @@ module UrlCategorise
     end

     def download_and_parse_list(url)
-      raw_data = HTTParty.get(url, timeout: @request_timeout)
+      raw_data = HTTParty.get(url, timeout: request_timeout)
       return [] if raw_data.body.nil? || raw_data.body.empty?

       # Store metadata
@@ -441,11 +839,11 @@ module UrlCategorise
     end

     def cache_file_path(url)
-      return nil unless @cache_dir
+      return nil unless cache_dir

-      FileUtils.mkdir_p(@cache_dir) unless Dir.exist?(@cache_dir)
+      FileUtils.mkdir_p(cache_dir) unless Dir.exist?(cache_dir)
       filename = Digest::MD5.hexdigest(url) + '.cache'
-      File.join(@cache_dir, filename)
+      File.join(cache_dir, filename)
     end

     def read_from_cache(url)
@@ -478,7 +876,7 @@ module UrlCategorise
     end

     def should_update_cache?(url, cache_data)
-      return true if @force_download
+      return true if force_download
       return true unless cache_data[:metadata]

       # Update if cache is older than 24 hours
@@ -487,7 +885,7 @@ module UrlCategorise

       # Check if remote content has changed
       begin
-        head_response = HTTParty.head(url, timeout: @request_timeout)
+        head_response = HTTParty.head(url, timeout: request_timeout)
         remote_etag = head_response.headers['etag']
         remote_last_modified = head_response.headers['last-modified']

@@ -508,8 +906,8 @@ module UrlCategorise
     def categories_with_keys
       keyed_categories = {}

-      host_urls.keys.each do |category|
-        category_values = host_urls[category].select do |url|
+      (host_urls || {}).keys.each do |category|
+        category_values = (host_urls || {})[category].select do |url|
           url.is_a?(Symbol)
         end

@@ -58,11 +58,16 @@ module UrlCategorise
         return handle_existing_dataset(extracted_dir, options)
       end

-      # Only require credentials if we need to download fresh data
+      # If credentials not available, return nil gracefully for cache mode
       unless kaggle_credentials_available?
-        raise Error, 'Kaggle credentials required for downloading new datasets. ' \
-                     'Set KAGGLE_USERNAME/KAGGLE_KEY environment variables, provide credentials explicitly, ' \
-                     'or place kaggle.json file in ~/.kaggle/ directory.'
+        if options[:use_cache]
+          puts "Warning: Kaggle dataset '#{dataset_path}' not cached and no credentials available" if ENV['DEBUG']
+          return nil
+        else
+          raise Error, 'Kaggle credentials required for downloading new datasets. ' \
+                       'Set KAGGLE_USERNAME/KAGGLE_KEY environment variables, provide credentials explicitly, ' \
+                       'or place kaggle.json file in ~/.kaggle/ directory.'
+        end
       end

       # Download from Kaggle API
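
This final hunk restarts at line 58 and covers the Kaggle credential handling on the dataset-processing side of the gem. A sketch of the behavioural change, with a hypothetical call shape: under use_cache, a missing cache combined with missing credentials now returns nil instead of raising, which lets the client's auto-loading skip unavailable Kaggle datasets.

    # With no KAGGLE_USERNAME/KAGGLE_KEY set and no cached copy:
    load_kaggle_dataset('owner', 'dataset', use_cache: true)   # => nil (warns when ENV['DEBUG'])
    load_kaggle_dataset('owner', 'dataset', use_cache: false)  # raises UrlCategorise::Error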