UrlCategorise 0.1.2 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/export_csv ADDED
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require 'optparse'
5
+ require_relative '../lib/url_categorise'
6
+
7
+ options = {
8
+ output_path: nil,
9
+ cache_dir: nil,
10
+ verbose: false,
11
+ iab_compliance: false,
12
+ smart_categorization: false
13
+ }
14
+
15
+ OptionParser.new do |opts|
16
+ opts.banner = "Usage: #{$0} [options]"
17
+ opts.separator ""
18
+ opts.separator "Export all categorized domains and metadata as a single CSV file for AI training"
19
+ opts.separator ""
20
+
21
+ opts.on("-o", "--output PATH", "Output directory path (default: cache_dir/exports/csv or ./exports/csv)") do |path|
22
+ options[:output_path] = path
23
+ end
24
+
25
+ opts.on("-c", "--cache-dir PATH", "Cache directory path for client initialization") do |path|
26
+ options[:cache_dir] = path
27
+ end
28
+
29
+ opts.on("--iab-compliance", "Enable IAB compliance for category mapping") do
30
+ options[:iab_compliance] = true
31
+ end
32
+
33
+ opts.on("--smart-categorization", "Enable smart categorization") do
34
+ options[:smart_categorization] = true
35
+ end
36
+
37
+ opts.on("-v", "--verbose", "Verbose output") do
38
+ options[:verbose] = true
39
+ end
40
+
41
+ opts.on("-h", "--help", "Show this help message") do
42
+ puts opts
43
+ exit
44
+ end
45
+ end.parse!
46
+
47
+ puts "=== UrlCategorise CSV Data Export ===" if options[:verbose]
48
+ puts "Initializing client..." if options[:verbose]
49
+
50
+ begin
51
+ client = UrlCategorise::Client.new(
52
+ cache_dir: options[:cache_dir],
53
+ iab_compliance: options[:iab_compliance],
54
+ smart_categorization: options[:smart_categorization]
55
+ )
56
+
57
+ puts "Exporting CSV data..." if options[:verbose]
58
+
59
+ result = client.export_csv_data(options[:output_path])
60
+
61
+ puts "\n✅ Export completed successfully!"
62
+ puts "📁 Export directory: #{result[:export_directory]}"
63
+ puts "📄 CSV file: #{result[:csv_file]}"
64
+ puts "📄 Metadata file: #{result[:metadata_file]}"
65
+
66
+ puts "\n📊 Data Summary:"
67
+ puts " Total domains: #{result[:summary][:total_domains]}"
68
+ puts " Total categories: #{result[:summary][:total_categories]}"
69
+ puts " Dataset categories: #{result[:summary][:dataset_categories_count]}"
70
+ puts " Blocklist categories: #{result[:summary][:blocklist_categories_count]}"
71
+
72
+ if options[:verbose]
73
+ puts "\n🏷️ Categories included:"
74
+ result[:summary][:categories].each do |category|
75
+ puts " - #{category}"
76
+ end
77
+ end
78
+
79
+ rescue StandardError => e
80
+ puts "❌ Error: #{e.message}"
81
+ puts e.backtrace if options[:verbose]
82
+ exit 1
83
+ end
data/bin/export_hosts ADDED
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require 'optparse'
5
+ require_relative '../lib/url_categorise'
6
+
7
+ options = {
8
+ output_path: nil,
9
+ cache_dir: nil,
10
+ verbose: false
11
+ }
12
+
13
+ OptionParser.new do |opts|
14
+ opts.banner = "Usage: #{$0} [options]"
15
+ opts.separator ""
16
+ opts.separator "Export all categorized domains as separate hosts files per category"
17
+ opts.separator ""
18
+
19
+ opts.on("-o", "--output PATH", "Output directory path (default: cache_dir/exports/hosts or ./exports/hosts)") do |path|
20
+ options[:output_path] = path
21
+ end
22
+
23
+ opts.on("-c", "--cache-dir PATH", "Cache directory path for client initialization") do |path|
24
+ options[:cache_dir] = path
25
+ end
26
+
27
+ opts.on("-v", "--verbose", "Verbose output") do
28
+ options[:verbose] = true
29
+ end
30
+
31
+ opts.on("-h", "--help", "Show this help message") do
32
+ puts opts
33
+ exit
34
+ end
35
+ end.parse!
36
+
37
+ puts "=== UrlCategorise Hosts Export ===" if options[:verbose]
38
+ puts "Initializing client..." if options[:verbose]
39
+
40
+ begin
41
+ client = UrlCategorise::Client.new(
42
+ cache_dir: options[:cache_dir]
43
+ )
44
+
45
+ puts "Exporting hosts files..." if options[:verbose]
46
+
47
+ result = client.export_hosts_files(options[:output_path])
48
+
49
+ summary = result.delete(:_summary)
50
+
51
+ puts "\n✅ Export completed successfully!"
52
+ puts "📁 Export directory: #{summary[:export_directory]}"
53
+ puts "📊 Total categories exported: #{summary[:total_categories]}"
54
+ puts "🌐 Total domains exported: #{summary[:total_domains]}"
55
+ puts "📄 Summary file: #{summary[:path]}"
56
+
57
+ if options[:verbose]
58
+ puts "\n📋 Files created:"
59
+ result.each do |category, info|
60
+ puts " #{info[:filename]} - #{info[:count]} domains"
61
+ end
62
+ end
63
+
64
+ rescue StandardError => e
65
+ puts "❌ Error: #{e.message}"
66
+ puts e.backtrace if options[:verbose]
67
+ exit 1
68
+ end
data/bin/rake ADDED
@@ -0,0 +1,2 @@
1
+ #!/bin/bash
2
+ rake
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require './lib/url_categorise'
5
+
6
+ puts "=== Large Dataset Loading Example ==="
7
+
8
+ # Configuration for handling large datasets (300+ MB)
9
+ # First test with cache-only mode
10
+ puts "Creating client with cached datasets only..."
11
+ client = UrlCategorise::Client.new(
12
+ cache_dir: './url_cache',
13
+ auto_load_datasets: true,
14
+ smart_categorization: true,
15
+ dataset_config: {
16
+ cache_path: './url_cache/datasets',
17
+ download_path: './url_cache/downloads',
18
+ kaggle: { credentials_file: '~/kaggle.json' }
19
+ }
20
+ )
21
+
22
+ puts "Client created successfully!"
23
+ puts ""
24
+ puts "Dataset Statistics:"
25
+ puts " Total categories: #{client.count_of_categories}"
26
+ puts " Dataset categories: #{client.count_of_dataset_categories}"
27
+ puts " Blocklist categories: #{client.count_of_categories - client.count_of_dataset_categories}"
28
+ puts ""
29
+ puts " Total hosts: #{client.count_of_hosts.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse}"
30
+ puts " Dataset hosts: #{client.count_of_dataset_hosts.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse}"
31
+ puts ""
32
+ puts " Total data size: #{client.size_of_data.round(1)} MB"
33
+ puts " Dataset data size: #{client.size_of_dataset_data.round(1)} MB"
34
+ puts " Blocklist data size: #{client.size_of_blocklist_data.round(1)} MB"
35
+
36
+ puts ""
37
+ puts "Dataset-specific Statistics:"
38
+ # Get dataset metadata if available
39
+ metadata = client.dataset_metadata
40
+ if metadata && !metadata.empty?
41
+ puts " Datasets loaded: #{metadata.size}"
42
+
43
+ # Calculate size for each dataset by finding its categories and domains
44
+ dataset_categories = client.instance_variable_get(:@dataset_categories)
45
+ total_dataset_size = 0
46
+
47
+ metadata.each_with_index do |(hash, data), index|
48
+ # Estimate size contribution of this dataset
49
+ dataset_portion = data[:total_entries].to_f / metadata.values.sum { |d| d[:total_entries] }
50
+ dataset_size_mb = (client.size_of_dataset_data * dataset_portion).round(2)
51
+ total_dataset_size += dataset_size_mb
52
+
53
+ puts " Dataset #{index + 1}:"
54
+ puts " Processed at: #{data[:processed_at]}"
55
+ puts " Total entries: #{data[:total_entries].to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse}"
56
+ puts " Estimated size: #{dataset_size_mb} MB"
57
+ puts " Data hash: #{hash[0..12]}..."
58
+ end
59
+
60
+ puts ""
61
+ puts " Total dataset size: #{total_dataset_size.round(2)} MB (#{client.size_of_dataset_data.round(1)} MB actual)"
62
+ else
63
+ puts " No dataset metadata available"
64
+ end
@@ -0,0 +1,215 @@
1
+ # UrlCategorise v0.1.4 Release Notes
2
+
3
+ ## New Features Added
4
+
5
+ ### 🆕 Dataset-Specific Helper Methods
6
+
7
+ Added dedicated helper methods for tracking dataset-only statistics separately from DNS blocklists:
8
+
9
+ ```ruby
10
+ client = UrlCategorise::Client.new(dataset_config: { kaggle: {} })
11
+ client.load_kaggle_dataset('owner', 'dataset-name')
12
+
13
+ # New dataset-specific methods
14
+ client.count_of_dataset_hosts # Returns hosts from datasets only
15
+ client.count_of_dataset_categories # Returns categories from datasets only
16
+
17
+ # Existing methods still work and count both
18
+ client.count_of_hosts # All hosts (DNS lists + datasets)
19
+ client.count_of_categories # All categories (DNS lists + datasets)
20
+ ```
21
+
22
+ **Implementation Details:**
23
+ - `count_of_dataset_hosts` - Sums hosts from categories tracked in `@dataset_categories` Set
24
+ - `count_of_dataset_categories` - Returns size of `@dataset_categories` Set
25
+ - Gracefully handles nil categories with safe navigation (`&.size || 0`)
26
+ - Both methods return 0 when no datasets are loaded
27
+
28
+ ### 🆕 IAB Content Taxonomy Compliance
29
+
30
+ Full support for IAB (Interactive Advertising Bureau) Content Taxonomy standards:
31
+
32
+ ```ruby
33
+ # Enable IAB v3.0 compliance (recommended)
34
+ client = UrlCategorise::Client.new(
35
+ iab_compliance: true,
36
+ iab_version: :v3
37
+ )
38
+
39
+ # Enable IAB v2.0 compliance
40
+ client = UrlCategorise::Client.new(
41
+ iab_compliance: true,
42
+ iab_version: :v2
43
+ )
44
+
45
+ # Categorization returns IAB codes instead of custom categories
46
+ categories = client.categorise("badsite.com")
47
+ puts categories # => ["626"] (IAB v3 code for illegal content)
48
+
49
+ # Helper methods
50
+ client.iab_compliant? # => true/false
51
+ client.get_iab_mapping(:malware) # => "626" (v3) or "IAB25" (v2)
52
+ ```
53
+
54
+ **IAB Category Mappings:**
55
+
56
+ **IAB Content Taxonomy v3.0:**
57
+ - Security threats (`malware`, `phishing`, `illegal`) → `626` (Illegal Content)
58
+ - Advertising (`advertising`, `mobile_ads`) → `3` (Advertising)
59
+ - Gambling → `7-39` (Gambling subcategory)
60
+ - Adult content (`pornography`) → `626` (Adult Content)
61
+ - Social platforms → `14` (Society)
62
+ - Technology → `19` (Technology & Computing)
63
+
64
+ **IAB Content Taxonomy v2.0:**
65
+ - Security threats → `IAB25` (Non-Standard Content)
66
+ - Advertising → `IAB3` (Advertising)
67
+ - Gambling → `IAB7-39` (Gambling subcategory)
68
+ - Adult content → `IAB25-3` (Pornography)
69
+
70
+ **Implementation Details:**
71
+ - New `IabCompliance` module with comprehensive mappings
72
+ - Support for both v2.0 and v3.0 standards
73
+ - IAB compliance affects all categorization methods (`categorise`, `categorise_ip`, `resolve_and_categorise`)
74
+ - Automatic deduplication of IAB codes
75
+ - Graceful handling of unknown categories (returns 'Unknown')
76
+
77
+ ### 🧪 Comprehensive Test Coverage
78
+
79
+ Added extensive test suites for new features:
80
+
81
+ **Dataset Methods Tests:**
82
+ - `client_dataset_methods_test.rb` (9 tests)
83
+ - Tests for empty, populated, and mixed dataset scenarios
84
+ - Integration tests with existing methods
85
+ - Edge case handling (nil categories, empty arrays)
86
+
87
+ **IAB Compliance Tests:**
88
+ - `iab_compliance_test.rb` (14 tests) - Module functionality
89
+ - `client_iab_compliance_test.rb` (19 tests) - Client integration
90
+ - Comprehensive mapping validation for v2 and v3
91
+ - Integration with all categorization methods
92
+ - DNS resolution with IAB compliance
93
+ - Error handling and edge cases
94
+
95
+ **Total Test Stats:**
96
+ - **316 tests, 2455 assertions, 0 failures, 0 errors**
97
+ - **94.69% line coverage** (660/697 lines)
98
+ - All new features fully tested with edge cases
99
+
100
+ ### 🔧 Technical Implementation
101
+
102
+ **New Files:**
103
+ - `lib/url_categorise/iab_compliance.rb` - IAB mapping module
104
+ - `test/url_categorise/client_dataset_methods_test.rb` - Dataset helper tests
105
+ - `test/url_categorise/iab_compliance_test.rb` - IAB module tests
106
+ - `test/url_categorise/client_iab_compliance_test.rb` - IAB client integration tests
107
+
108
+ **Updated Files:**
109
+ - `lib/url_categorise/client.rb` - Added dataset helpers and IAB support
110
+ - `lib/url_categorise.rb` - Required new IAB module
111
+ - `lib/url_categorise/version.rb` - Bumped to 0.1.4
112
+
113
+ **New Client Attributes:**
114
+ - `iab_compliance_enabled` - Boolean flag for IAB compliance
115
+ - `iab_version` - IAB taxonomy version (:v2 or :v3)
116
+
117
+ **New Client Methods:**
118
+ - `count_of_dataset_hosts` - Dataset-specific host count
119
+ - `count_of_dataset_categories` - Dataset-specific category count
120
+ - `iab_compliant?` - Check IAB compliance status
121
+ - `get_iab_mapping(category)` - Get IAB code for category
122
+
123
+ ### 🚀 Usage Examples
124
+
125
+ **Dataset Statistics:**
126
+ ```ruby
127
+ client = UrlCategorise::Client.new(dataset_config: { kaggle: {} })
128
+ client.load_kaggle_dataset('owner', 'dataset')
129
+
130
+ puts "Total hosts: #{client.count_of_hosts}" # All sources
131
+ puts "Dataset hosts: #{client.count_of_dataset_hosts}" # Datasets only
132
+ puts "DNS list hosts: #{client.count_of_hosts - client.count_of_dataset_hosts}"
133
+ ```
134
+
135
+ **IAB Compliance:**
136
+ ```ruby
137
+ # Production environment with IAB compliance
138
+ client = UrlCategorise::Client.new(
139
+ iab_compliance: true,
140
+ iab_version: :v3,
141
+ dataset_config: { kaggle: { username: 'user', api_key: 'key' } }
142
+ )
143
+
144
+ # All methods return IAB codes
145
+ domain_cats = client.categorise("example.com") # => ["3", "626"]
146
+ ip_cats = client.categorise_ip("192.168.1.100") # => ["626"]
147
+ resolved_cats = client.resolve_and_categorise("site.com") # => ["3"]
148
+
149
+ # Check compliance
150
+ puts "IAB compliant: #{client.iab_compliant?}" # => true
151
+ puts "Using version: #{client.iab_version}" # => :v3
152
+ ```
153
+
154
+ **Rails Service Integration:**
155
+ ```ruby
156
+ class UrlCategorizerService
157
+ def initialize
158
+ @client = UrlCategorise::ActiveRecordClient.new(
159
+ iab_compliance: Rails.env.production?,
160
+ iab_version: :v3,
161
+ dataset_config: {
162
+ kaggle: {
163
+ username: ENV['KAGGLE_USERNAME'],
164
+ api_key: ENV['KAGGLE_API_KEY']
165
+ }
166
+ }
167
+ )
168
+ end
169
+
170
+ def stats
171
+ {
172
+ total_hosts: @client.count_of_hosts,
173
+ dataset_hosts: @client.count_of_dataset_hosts,
174
+ dns_list_hosts: @client.count_of_hosts - @client.count_of_dataset_hosts,
175
+ iab_compliant: @client.iab_compliant?,
176
+ iab_version: @client.iab_version
177
+ }
178
+ end
179
+ end
180
+ ```
181
+
182
+ ### 🔄 Migration Guide
183
+
184
+ **From v0.1.3 to v0.1.4:**
185
+
186
+ 1. **No Breaking Changes** - All existing code continues to work
187
+ 2. **Optional New Features** - IAB compliance and dataset helpers are opt-in
188
+ 3. **Enhanced Statistics** - Use new helper methods for better insights
189
+
190
+ **Recommended Updates:**
191
+ ```ruby
192
+ # Before (still works)
193
+ client = UrlCategorise::Client.new
194
+ puts "Total: #{client.count_of_hosts}"
195
+
196
+ # After (enhanced)
197
+ client = UrlCategorise::Client.new(
198
+ iab_compliance: true, # Optional IAB compliance
199
+ iab_version: :v3 # Optional version selection
200
+ )
201
+ puts "Total hosts: #{client.count_of_hosts}"
202
+ puts "Dataset hosts: #{client.count_of_dataset_hosts}"
203
+ puts "IAB compliant: #{client.iab_compliant?}"
204
+ ```
205
+
206
+ ### 📊 Quality Assurance
207
+
208
+ - ✅ All 316 tests pass with 0 failures/errors
209
+ - ✅ 94.69% line coverage maintained
210
+ - ✅ Code style enforced with rubocop
211
+ - ✅ Comprehensive edge case testing
212
+ - ✅ Memory-efficient implementation
213
+ - ✅ Backward compatibility preserved
214
+
215
+ This release adds powerful new features while maintaining the reliability and performance standards established in previous versions.
@@ -3,65 +3,125 @@ require_relative 'models'
3
3
  module UrlCategorise
4
4
  class ActiveRecordClient < Client
5
5
  def initialize(**kwargs)
6
- raise "ActiveRecord not available" unless UrlCategorise::Models.available?
7
-
6
+ raise 'ActiveRecord not available' unless UrlCategorise::Models.available?
7
+
8
8
  @use_database = kwargs.delete(:use_database) { true }
9
9
  super(**kwargs)
10
-
10
+
11
11
  populate_database if @use_database
12
12
  end
13
13
 
14
14
  def categorise(url)
15
15
  return super(url) unless @use_database && UrlCategorise::Models.available?
16
-
17
- host = (URI.parse(url).host || url).downcase.gsub("www.", "")
18
-
16
+
17
+ host = (URI.parse(url).host || url).downcase.gsub('www.', '')
18
+
19
19
  # Try database first
20
20
  categories = UrlCategorise::Models::Domain.categorise(host)
21
21
  return categories unless categories.empty?
22
-
22
+
23
23
  # Fallback to memory-based categorization
24
24
  super(url)
25
25
  end
26
26
 
27
27
  def categorise_ip(ip_address)
28
28
  return super(ip_address) unless @use_database && UrlCategorise::Models.available?
29
-
29
+
30
30
  # Try database first
31
31
  categories = UrlCategorise::Models::IpAddress.categorise(ip_address)
32
32
  return categories unless categories.empty?
33
-
33
+
34
34
  # Fallback to memory-based categorization
35
35
  super(ip_address)
36
36
  end
37
37
 
38
38
  def update_database
39
39
  return unless @use_database && UrlCategorise::Models.available?
40
-
40
+
41
41
  populate_database
42
42
  end
43
43
 
44
44
  def database_stats
45
45
  return {} unless @use_database && UrlCategorise::Models.available?
46
-
46
+
47
47
  {
48
48
  domains: UrlCategorise::Models::Domain.count,
49
49
  ip_addresses: UrlCategorise::Models::IpAddress.count,
50
50
  list_metadata: UrlCategorise::Models::ListMetadata.count,
51
+ dataset_metadata: UrlCategorise::Models::DatasetMetadata.count,
51
52
  categories: UrlCategorise::Models::Domain.distinct.pluck(:categories).flatten.uniq.size
52
53
  }
53
54
  end
54
55
 
56
+ def load_kaggle_dataset(dataset_owner, dataset_name, options = {})
57
+ result = super(dataset_owner, dataset_name, options)
58
+
59
+ # Store dataset metadata in database if enabled
60
+ if @use_database && UrlCategorise::Models.available? && @dataset_metadata
61
+ store_dataset_metadata_in_db(
62
+ source_type: 'kaggle',
63
+ identifier: "#{dataset_owner}/#{dataset_name}",
64
+ metadata: @dataset_metadata.values.last,
65
+ category_mappings: options[:category_mappings],
66
+ processing_options: options
67
+ )
68
+ end
69
+
70
+ # Repopulate database with integrated dataset domains
71
+ populate_database if @use_database
72
+
73
+ result
74
+ end
75
+
76
+ def load_csv_dataset(url, options = {})
77
+ result = super(url, options)
78
+
79
+ # Store dataset metadata in database if enabled
80
+ if @use_database && UrlCategorise::Models.available? && @dataset_metadata
81
+ store_dataset_metadata_in_db(
82
+ source_type: 'csv',
83
+ identifier: url,
84
+ metadata: @dataset_metadata.values.last,
85
+ category_mappings: options[:category_mappings],
86
+ processing_options: options
87
+ )
88
+ end
89
+
90
+ # Repopulate database with integrated dataset domains
91
+ populate_database if @use_database
92
+
93
+ result
94
+ end
95
+
96
+ def dataset_history(source_type: nil, limit: 10)
97
+ return [] unless @use_database && UrlCategorise::Models.available?
98
+
99
+ query = UrlCategorise::Models::DatasetMetadata.order(processed_at: :desc).limit(limit)
100
+ query = query.by_source(source_type) if source_type
101
+
102
+ query.map do |record|
103
+ {
104
+ source_type: record.source_type,
105
+ identifier: record.identifier,
106
+ data_hash: record.data_hash,
107
+ total_entries: record.total_entries,
108
+ processed_at: record.processed_at,
109
+ category_mappings: record.category_mappings,
110
+ processing_options: record.processing_options
111
+ }
112
+ end
113
+ end
114
+
55
115
  private
56
116
 
57
117
  def populate_database
58
118
  return unless UrlCategorise::Models.available?
59
-
119
+
60
120
  # Store list metadata
61
- @host_urls.each do |category, urls|
121
+ (host_urls || {}).each do |category, urls|
62
122
  urls.each do |url|
63
123
  next unless url.is_a?(String)
64
-
124
+
65
125
  metadata = @metadata[url] || {}
66
126
  UrlCategorise::Models::ListMetadata.find_or_create_by(url: url) do |record|
67
127
  record.name = category.to_s
@@ -76,7 +136,7 @@ module UrlCategorise
76
136
  @hosts.each do |category, domains|
77
137
  domains.each do |domain|
78
138
  next if domain.nil? || domain.empty?
79
-
139
+
80
140
  existing = UrlCategorise::Models::Domain.find_by(domain: domain)
81
141
  if existing
82
142
  # Add category if not already present
@@ -92,15 +152,15 @@ module UrlCategorise
92
152
  end
93
153
 
94
154
  # Store IP data (for IP-based lists)
95
- ip_categories = [:sanctions_ips, :compromised_ips, :tor_exit_nodes, :open_proxy_ips,
96
- :banking_trojans, :malicious_ssl_certificates, :top_attack_sources]
97
-
155
+ ip_categories = %i[sanctions_ips compromised_ips tor_exit_nodes open_proxy_ips
156
+ banking_trojans malicious_ssl_certificates top_attack_sources]
157
+
98
158
  ip_categories.each do |category|
99
159
  next unless @hosts[category]
100
-
160
+
101
161
  @hosts[category].each do |ip|
102
162
  next if ip.nil? || ip.empty? || !ip.match(/^\d+\.\d+\.\d+\.\d+$/)
103
-
163
+
104
164
  existing = UrlCategorise::Models::IpAddress.find_by(ip_address: ip)
105
165
  if existing
106
166
  categories = existing.categories | [category.to_s]
@@ -114,5 +174,22 @@ module UrlCategorise
114
174
  end
115
175
  end
116
176
  end
177
+
178
+ def store_dataset_metadata_in_db(source_type:, identifier:, metadata:, category_mappings: nil,
179
+ processing_options: nil)
180
+ return unless UrlCategorise::Models.available?
181
+
182
+ UrlCategorise::Models::DatasetMetadata.find_or_create_by(data_hash: metadata[:data_hash]) do |record|
183
+ record.source_type = source_type
184
+ record.identifier = identifier
185
+ record.total_entries = metadata[:total_entries]
186
+ record.category_mappings = category_mappings || {}
187
+ record.processing_options = processing_options || {}
188
+ record.processed_at = metadata[:processed_at] || Time.now
189
+ end
190
+ rescue ActiveRecord::RecordInvalid => e
191
+ # Dataset metadata already exists or validation failed
192
+ puts "Warning: Failed to store dataset metadata: #{e.message}" if ENV['DEBUG']
193
+ end
117
194
  end
118
- end
195
+ end