UrlCategorise 0.1.2 → 0.1.3

This diff shows the changes between two publicly released versions of a package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 74ab2a721a954a91958dc1c184fd76a4a2c3acd250730bb7db58c30286579fcd
4
- data.tar.gz: 3b01bf42b654266dddcd85d684c2f3e25d455d0fe9b82094f07de2b95c026bd3
3
+ metadata.gz: dcb05d79b6bc09b5b338183d412cd309d9634a342c95b14ea25df5926d8609fb
4
+ data.tar.gz: effa4c7a010ee574fe6a41653af553a68710ceca46ebdb9dd5352096af5fa7e1
5
5
  SHA512:
6
- metadata.gz: 2d175a1f72e2fda10771c770a694f07235cd7f5a3f93d1b487d82c98b4f7b490f7aaf665834dcf226bc1a6ae2cc833038731e8d9581af0a570302a7c17dad1c1
7
- data.tar.gz: 2bd46865492bea3411b8506f294cf96b507c5c5beafad2584b2438977043fbfd3850e79c0a0342b11079829ef5bf3fd91c18c3f45ad5e0537fae8c78c9645245
6
+ metadata.gz: a527c801cbf6318305d640925dd75922c2ac1bcc76a5de75c75e3ad24698305c3d7d885d7da8dd280e61ceb1fe91a57eac185c5c209e11685f2ddb6833b120b9
7
+ data.tar.gz: de81765d20a0b36c54b71b935928777140f86f6e0a130c71ec7804ed28c1b3d8f12ca30fff5cf8c93952aaed7a9c279fe35a0d32bc83a851aad2556b55fd7942
@@ -6,7 +6,11 @@
6
6
  "Bash(ruby:*)",
7
7
  "Bash(bundle exec ruby:*)",
8
8
  "Bash(find:*)",
9
- "Bash(grep:*)"
9
+ "Bash(grep:*)",
10
+ "Read(//Users/trex22/development/rubygems/kaggle/**)",
11
+ "Bash(for file in test/url_categorise/*dataset*test.rb)",
12
+ "Bash(do echo \"Checking $file...\")",
13
+ "Bash(done)"
10
14
  ],
11
15
  "deny": []
12
16
  }
data/CLAUDE.md CHANGED
@@ -78,12 +78,20 @@ The gem includes automatic monitoring and cleanup of broken URLs:
78
78
  - ActiveRecord/Rails integration (optional)
79
79
  - URL health monitoring and reporting
80
80
  - Automatic cleanup of broken blocklist sources
81
+ - **Dataset Processing**: Kaggle and CSV dataset integration with three auth methods
82
+ - **Optional Kaggle**: Can disable Kaggle functionality entirely while keeping CSV processing
83
+ - **Smart Caching**: Cached datasets work without credentials, avoiding unnecessary authentication
84
+ - **Data Hashing**: SHA256 content hashing for dataset change detection
85
+ - **Category Mapping**: Flexible column detection and category mapping for datasets
86
+ - **Credential Warnings**: Warns when Kaggle credentials are missing, while all other functionality continues to work
81
87
 
82
88
  ### Architecture
83
89
  - `Client` class: Main interface for categorization
90
+ - `DatasetProcessor` class: Handles Kaggle and CSV dataset processing
84
91
  - `Constants` module: Contains default list URLs and categories
85
- - Modular design allows extending with new list sources
86
- - Support for custom list directories and caching
92
+ - `ActiveRecordClient` class: Database-backed client with dataset history
93
+ - Modular design allows extending with new list sources and datasets
94
+ - Support for custom list directories, caching, and dataset integration
87
95
 
88
96
  ### List Sources
89
97
  Primary sources include:
@@ -91,6 +99,8 @@ Primary sources include:
91
99
  - hagezi/dns-blocklists
92
100
  - StevenBlack/hosts
93
101
  - Various specialized security lists
102
+ - **Kaggle datasets**: Public URL classification datasets
103
+ - **Custom CSV files**: Direct CSV dataset URLs with flexible column mapping
94
104
 
95
105
  ### Testing Guidelines
96
106
  - Mock all HTTP requests using WebMock
data/Gemfile CHANGED
@@ -1,6 +1,6 @@
1
- source "https://rubygems.org"
1
+ source 'https://rubygems.org'
2
2
 
3
- git_source(:github) {|repo_name| "https://github.com/TRex22/url_categorise" }
3
+ git_source(:github) { |_repo_name| 'https://github.com/TRex22/url_categorise' }
4
4
 
5
5
  # Specify your gem's dependencies in url_categorise.gemspec
6
6
  gemspec
data/Gemfile.lock CHANGED
@@ -1,14 +1,16 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- UrlCategorise (0.1.2)
4
+ UrlCategorise (0.1.3)
5
5
  api_pattern (>= 0.0.6, < 1.0)
6
6
  csv (>= 3.3.0, < 4.0)
7
7
  digest (>= 3.1.0, < 4.0)
8
8
  fileutils (>= 1.7.0, < 2.0)
9
9
  httparty (>= 0.22.0, < 1.0)
10
+ json (>= 2.7.0, < 3.0)
10
11
  nokogiri (>= 1.18.9, < 2.0)
11
12
  resolv (>= 0.4.0, < 1.0)
13
+ rubyzip (>= 2.3.0, < 3.0)
12
14
 
13
15
  GEM
14
16
  remote: https://rubygems.org/
@@ -78,19 +80,19 @@ GEM
78
80
  erubi (1.13.1)
79
81
  fileutils (1.7.3)
80
82
  hashdiff (1.2.0)
81
- httparty (0.22.0)
83
+ httparty (0.23.1)
82
84
  csv
83
85
  mini_mime (>= 1.0.0)
84
86
  multi_xml (>= 0.5.2)
85
87
  i18n (1.14.7)
86
88
  concurrent-ruby (~> 1.0)
89
+ json (2.13.2)
87
90
  logger (1.7.0)
88
91
  loofah (2.24.1)
89
92
  crass (~> 1.0.2)
90
93
  nokogiri (>= 1.12.0)
91
94
  method_source (1.1.0)
92
95
  mini_mime (1.1.5)
93
- mini_portile2 (2.8.9)
94
96
  minitest (5.25.5)
95
97
  minitest-focus (1.4.0)
96
98
  minitest (>= 4, < 6)
@@ -103,9 +105,6 @@ GEM
103
105
  ruby2_keywords (>= 0.0.5)
104
106
  multi_xml (0.7.2)
105
107
  bigdecimal (~> 3.1)
106
- nokogiri (1.18.9)
107
- mini_portile2 (~> 2.8.2)
108
- racc (~> 1.4)
109
108
  nokogiri (1.18.9-arm64-darwin)
110
109
  racc (~> 1.4)
111
110
  pry (0.15.2)
@@ -130,6 +129,7 @@ GEM
130
129
  rexml (3.4.1)
131
130
  ruby-progressbar (1.13.0)
132
131
  ruby2_keywords (0.0.5)
132
+ rubyzip (2.4.1)
133
133
  securerandom (0.4.1)
134
134
  simplecov (0.22.0)
135
135
  docile (~> 1.1)
@@ -137,8 +137,6 @@ GEM
137
137
  simplecov_json_formatter (~> 0.1)
138
138
  simplecov-html (0.13.2)
139
139
  simplecov_json_formatter (0.1.4)
140
- sqlite3 (2.7.3)
141
- mini_portile2 (~> 2.8.0)
142
140
  sqlite3 (2.7.3-arm64-darwin)
143
141
  timecop (0.9.10)
144
142
  timeout (0.4.3)
@@ -153,7 +151,6 @@ GEM
153
151
 
154
152
  PLATFORMS
155
153
  arm64-darwin-24
156
- ruby
157
154
 
158
155
  DEPENDENCIES
159
156
  UrlCategorise!
data/README.md CHANGED
@@ -192,6 +192,159 @@ ruby bin/check_lists
192
192
 
193
193
  [View all 60+ categories in constants.rb](lib/url_categorise/constants.rb)
194
194
 
195
+ ## Dataset Processing
196
+
197
+ UrlCategorise now supports processing external datasets from Kaggle and CSV files to expand categorization data:
198
+
199
+ ### Kaggle Dataset Integration
200
+
201
+ Load datasets directly from Kaggle using three authentication methods:
202
+
203
+ ```ruby
204
+ # Method 1: Environment variables (KAGGLE_USERNAME, KAGGLE_KEY)
205
+ client = UrlCategorise::Client.new(
206
+ dataset_config: {
207
+ kaggle: {} # Will use environment variables
208
+ }
209
+ )
210
+
211
+ # Method 2: Explicit credentials
212
+ client = UrlCategorise::Client.new(
213
+ dataset_config: {
214
+ kaggle: {
215
+ username: 'your_username',
216
+ api_key: 'your_api_key'
217
+ }
218
+ }
219
+ )
220
+
221
+ # Method 3: Credentials file (~/.kaggle/kaggle.json or custom path)
222
+ client = UrlCategorise::Client.new(
223
+ dataset_config: {
224
+ kaggle: {
225
+ credentials_file: '/path/to/kaggle.json'
226
+ }
227
+ }
228
+ )
229
+
230
+ # Load and integrate a Kaggle dataset
231
+ client.load_kaggle_dataset('owner', 'dataset-name', {
232
+ use_cache: true, # Cache processed data
233
+ category_mappings: {
234
+ url_column: 'website', # Column containing URLs/domains
235
+ category_column: 'type', # Column containing categories
236
+ category_map: {
237
+ 'malicious' => 'malware', # Map dataset categories to your categories
238
+ 'spam' => 'phishing'
239
+ }
240
+ }
241
+ })
242
+
243
+ # Check categorization with dataset data
244
+ categories = client.categorise('https://example.com')
245
+ ```
246
+
247
+ ### CSV Dataset Processing
248
+
249
+ Load datasets from direct CSV URLs:
250
+
251
+ ```ruby
252
+ client = UrlCategorise::Client.new(
253
+ dataset_config: {
254
+ download_path: './datasets',
255
+ cache_path: './dataset_cache'
256
+ }
257
+ )
258
+
259
+ # Load CSV dataset
260
+ client.load_csv_dataset('https://example.com/url-classification.csv', {
261
+ use_cache: true,
262
+ category_mappings: {
263
+ url_column: 'url',
264
+ category_column: 'category'
265
+ }
266
+ })
267
+ ```
268
+
269
+ ### Dataset Configuration Options
270
+
271
+ ```ruby
272
+ dataset_config = {
273
+ # Kaggle functionality control
274
+ enable_kaggle: true, # Set to false to disable Kaggle entirely (default: true)
275
+
276
+ # Kaggle authentication (optional - will try env vars and default file)
277
+ kaggle: {
278
+ username: 'kaggle_username', # Or use KAGGLE_USERNAME env var
279
+ api_key: 'kaggle_api_key', # Or use KAGGLE_KEY env var
280
+ credentials_file: '~/.kaggle/kaggle.json' # Optional custom path
281
+ },
282
+
283
+ # File paths
284
+ download_path: './downloads', # Where to store downloads
285
+ cache_path: './cache', # Where to cache processed data
286
+ timeout: 30 # HTTP timeout for downloads
287
+ }
288
+
289
+ client = UrlCategorise::Client.new(dataset_config: dataset_config)
290
+ ```
291
+
292
+ ### Disabling Kaggle Functionality
293
+
294
+ You can completely disable Kaggle functionality if you only need CSV processing:
295
+
296
+ ```ruby
297
+ # Disable Kaggle - only CSV datasets will work
298
+ client = UrlCategorise::Client.new(
299
+ dataset_config: {
300
+ enable_kaggle: false,
301
+ download_path: './datasets',
302
+ cache_path: './dataset_cache'
303
+ }
304
+ )
305
+
306
+ # This will raise an error
307
+ # client.load_kaggle_dataset('owner', 'dataset') # Error!
308
+
309
+ # But CSV datasets still work
310
+ client.load_csv_dataset('https://example.com/data.csv')
311
+ ```
312
+
313
+ ### Working with Cached Datasets
314
+
315
+ If you have cached datasets, you can access them even without Kaggle credentials:
316
+
317
+ ```ruby
318
+ # No credentials provided, but cached data will work
319
+ client = UrlCategorise::Client.new(
320
+ dataset_config: {
321
+ kaggle: {}, # Empty config - will show warning but continue
322
+ download_path: './datasets',
323
+ cache_path: './cache'
324
+ }
325
+ )
326
+
327
+ # Works if the data is cached; otherwise shows a helpful error message
328
+ client.load_kaggle_dataset('owner', 'dataset', use_cache: true)
329
+ ```
330
+
331
+ ### Dataset Metadata and Hashing
332
+
333
+ The system automatically tracks dataset metadata and generates content hashes:
334
+
335
+ ```ruby
336
+ # Get dataset metadata
337
+ metadata = client.dataset_metadata
338
+ metadata.each do |data_hash, meta|
339
+ puts "Dataset hash: #{data_hash}"
340
+ puts "Processed at: #{meta[:processed_at]}"
341
+ puts "Total entries: #{meta[:total_entries]}"
342
+ end
343
+
344
+ # Reload client with fresh dataset integration
345
+ client.reload_with_datasets
346
+ ```
347
+
195
348
  ## ActiveRecord Integration
196
349
 
197
350
  For high-performance applications, enable database storage:
@@ -215,11 +368,31 @@ categories = client.categorise("example.com")
215
368
 
216
369
  # Get database statistics
217
370
  stats = client.database_stats
218
- # => { domains: 50000, ip_addresses: 15000, categories: 45, list_metadata: 90 }
371
+ # => { domains: 50000, ip_addresses: 15000, categories: 45, list_metadata: 90, dataset_metadata: 5 }
219
372
 
220
373
  # Direct model access
221
374
  domain_record = UrlCategorise::Models::Domain.find_by(domain: "example.com")
222
375
  ip_record = UrlCategorise::Models::IpAddress.find_by(ip_address: "1.2.3.4")
376
+
377
+ # Dataset integration with ActiveRecord
378
+ client = UrlCategorise::ActiveRecordClient.new(
379
+ use_database: true,
380
+ dataset_config: {
381
+ kaggle: { username: 'user', api_key: 'key' }
382
+ }
383
+ )
384
+
385
+ # Load datasets - automatically stored in database
386
+ client.load_kaggle_dataset('owner', 'dataset')
387
+ client.load_csv_dataset('https://example.com/data.csv')
388
+
389
+ # View dataset history
390
+ history = client.dataset_history(limit: 5)
391
+ # => [{ source_type: 'kaggle', identifier: 'owner/dataset', total_entries: 1000, processed_at: ... }]
392
+
393
+ # Filter by source type
394
+ kaggle_history = client.dataset_history(source_type: 'kaggle')
395
+ csv_history = client.dataset_history(source_type: 'csv')
223
396
  ```
224
397
 
225
398
  ## Rails Integration
@@ -274,6 +447,21 @@ class CreateUrlCategoriseTables < ActiveRecord::Migration[7.0]
274
447
 
275
448
  add_index :url_categorise_ip_addresses, :ip_address
276
449
  add_index :url_categorise_ip_addresses, :categories
450
+
451
+ create_table :url_categorise_dataset_metadata do |t|
452
+ t.string :source_type, null: false
453
+ t.string :identifier, null: false
454
+ t.string :data_hash, null: false, index: { unique: true }
455
+ t.integer :total_entries, null: false
456
+ t.text :category_mappings
457
+ t.text :processing_options
458
+ t.datetime :processed_at
459
+ t.timestamps
460
+ end
461
+
462
+ add_index :url_categorise_dataset_metadata, :source_type
463
+ add_index :url_categorise_dataset_metadata, :identifier
464
+ add_index :url_categorise_dataset_metadata, :processed_at
277
465
  end
278
466
  end
279
467
  ```
data/Rakefile CHANGED
@@ -1,12 +1,12 @@
1
- require "bundler/gem_tasks"
2
- require "bundler/setup"
3
- require "rake/testtask"
1
+ require 'bundler/gem_tasks'
2
+ require 'bundler/setup'
3
+ require 'rake/testtask'
4
4
 
5
5
  Rake::TestTask.new(:test) do |t|
6
- t.libs << "test"
7
- t.libs << "lib"
8
- t.test_files = FileList["test/**/*_test.rb"]
9
- t.ruby_opts = ["-rbundler/setup"]
6
+ t.libs << 'test'
7
+ t.libs << 'lib'
8
+ t.test_files = FileList['test/**/*_test.rb']
9
+ t.ruby_opts = ['-rbundler/setup']
10
10
  end
11
11
 
12
- task :default => :test
12
+ task default: :test
data/bin/check_lists CHANGED
@@ -3,46 +3,45 @@
3
3
  require 'bundler/setup'
4
4
  require_relative '../lib/url_categorise'
5
5
 
6
- puts "=== CHECKING ALL URLs IN CONSTANTS ==="
6
+ puts '=== CHECKING ALL URLs IN CONSTANTS ==='
7
7
 
8
8
  UrlCategorise::Constants::DEFAULT_HOST_URLS.each do |category, urls|
9
9
  puts "\n#{category.upcase}:"
10
-
10
+
11
11
  # Skip categories that only reference other categories (symbols)
12
12
  actual_urls = urls.reject { |url| url.is_a?(Symbol) }
13
-
13
+
14
14
  if actual_urls.empty?
15
15
  if urls.empty?
16
- puts " Empty category (no URLs defined)"
16
+ puts ' Empty category (no URLs defined)'
17
17
  else
18
18
  puts " Only references other categories: #{urls}"
19
19
  end
20
20
  next
21
21
  end
22
-
22
+
23
23
  actual_urls.each do |url|
24
24
  print " Testing #{url}... "
25
25
  begin
26
26
  response = HTTParty.head(url, timeout: 10)
27
27
  case response.code
28
28
  when 200
29
- puts "✅ OK"
29
+ puts '✅ OK'
30
30
  when 404
31
- puts "❌ 404 Not Found"
31
+ puts '❌ 404 Not Found'
32
32
  when 403
33
- puts "❌ 403 Forbidden"
33
+ puts '❌ 403 Forbidden'
34
34
  when 500..599
35
35
  puts "❌ Server Error (#{response.code})"
36
36
  else
37
37
  puts "⚠️ HTTP #{response.code}"
38
38
  end
39
39
  rescue Net::TimeoutError, HTTParty::TimeoutError
40
- puts "❌ Timeout"
41
- rescue SocketError, Errno::ECONNREFUSED => e
42
- puts "❌ DNS/Network Error"
43
- rescue => e
40
+ puts '❌ Timeout'
41
+ rescue SocketError, Errno::ECONNREFUSED
42
+ puts '❌ DNS/Network Error'
43
+ rescue StandardError => e
44
44
  puts "❌ Error: #{e.class}"
45
45
  end
46
46
  end
47
47
  end
48
-
data/bin/console CHANGED
@@ -1,11 +1,11 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require "bundler/setup"
4
- require "url_categorise"
3
+ require 'bundler/setup'
4
+ require 'url_categorise'
5
5
 
6
6
  # You can add fixtures and/or initialization code here to make experimenting
7
7
  # with your gem easier. You can also use a different console, if you like.
8
8
 
9
9
  # (If you use this, don't forget to add pry to your Gemfile!)
10
- require "pry"
10
+ require 'pry'
11
11
  Pry.start
@@ -3,65 +3,125 @@ require_relative 'models'
3
3
  module UrlCategorise
4
4
  class ActiveRecordClient < Client
5
5
  def initialize(**kwargs)
6
- raise "ActiveRecord not available" unless UrlCategorise::Models.available?
7
-
6
+ raise 'ActiveRecord not available' unless UrlCategorise::Models.available?
7
+
8
8
  @use_database = kwargs.delete(:use_database) { true }
9
9
  super(**kwargs)
10
-
10
+
11
11
  populate_database if @use_database
12
12
  end
13
13
 
14
14
  def categorise(url)
15
15
  return super(url) unless @use_database && UrlCategorise::Models.available?
16
-
17
- host = (URI.parse(url).host || url).downcase.gsub("www.", "")
18
-
16
+
17
+ host = (URI.parse(url).host || url).downcase.gsub('www.', '')
18
+
19
19
  # Try database first
20
20
  categories = UrlCategorise::Models::Domain.categorise(host)
21
21
  return categories unless categories.empty?
22
-
22
+
23
23
  # Fallback to memory-based categorization
24
24
  super(url)
25
25
  end
26
26
 
27
27
  def categorise_ip(ip_address)
28
28
  return super(ip_address) unless @use_database && UrlCategorise::Models.available?
29
-
29
+
30
30
  # Try database first
31
31
  categories = UrlCategorise::Models::IpAddress.categorise(ip_address)
32
32
  return categories unless categories.empty?
33
-
33
+
34
34
  # Fallback to memory-based categorization
35
35
  super(ip_address)
36
36
  end
37
37
 
38
38
  def update_database
39
39
  return unless @use_database && UrlCategorise::Models.available?
40
-
40
+
41
41
  populate_database
42
42
  end
43
43
 
44
44
  def database_stats
45
45
  return {} unless @use_database && UrlCategorise::Models.available?
46
-
46
+
47
47
  {
48
48
  domains: UrlCategorise::Models::Domain.count,
49
49
  ip_addresses: UrlCategorise::Models::IpAddress.count,
50
50
  list_metadata: UrlCategorise::Models::ListMetadata.count,
51
+ dataset_metadata: UrlCategorise::Models::DatasetMetadata.count,
51
52
  categories: UrlCategorise::Models::Domain.distinct.pluck(:categories).flatten.uniq.size
52
53
  }
53
54
  end
54
55
 
56
+ def load_kaggle_dataset(dataset_owner, dataset_name, options = {})
57
+ result = super(dataset_owner, dataset_name, options)
58
+
59
+ # Store dataset metadata in database if enabled
60
+ if @use_database && UrlCategorise::Models.available? && @dataset_metadata
61
+ store_dataset_metadata_in_db(
62
+ source_type: 'kaggle',
63
+ identifier: "#{dataset_owner}/#{dataset_name}",
64
+ metadata: @dataset_metadata.values.last,
65
+ category_mappings: options[:category_mappings],
66
+ processing_options: options
67
+ )
68
+ end
69
+
70
+ # Repopulate database with integrated dataset domains
71
+ populate_database if @use_database
72
+
73
+ result
74
+ end
75
+
76
+ def load_csv_dataset(url, options = {})
77
+ result = super(url, options)
78
+
79
+ # Store dataset metadata in database if enabled
80
+ if @use_database && UrlCategorise::Models.available? && @dataset_metadata
81
+ store_dataset_metadata_in_db(
82
+ source_type: 'csv',
83
+ identifier: url,
84
+ metadata: @dataset_metadata.values.last,
85
+ category_mappings: options[:category_mappings],
86
+ processing_options: options
87
+ )
88
+ end
89
+
90
+ # Repopulate database with integrated dataset domains
91
+ populate_database if @use_database
92
+
93
+ result
94
+ end
95
+
96
+ def dataset_history(source_type: nil, limit: 10)
97
+ return [] unless @use_database && UrlCategorise::Models.available?
98
+
99
+ query = UrlCategorise::Models::DatasetMetadata.order(processed_at: :desc).limit(limit)
100
+ query = query.by_source(source_type) if source_type
101
+
102
+ query.map do |record|
103
+ {
104
+ source_type: record.source_type,
105
+ identifier: record.identifier,
106
+ data_hash: record.data_hash,
107
+ total_entries: record.total_entries,
108
+ processed_at: record.processed_at,
109
+ category_mappings: record.category_mappings,
110
+ processing_options: record.processing_options
111
+ }
112
+ end
113
+ end
114
+
55
115
  private
56
116
 
57
117
  def populate_database
58
118
  return unless UrlCategorise::Models.available?
59
-
119
+
60
120
  # Store list metadata
61
121
  @host_urls.each do |category, urls|
62
122
  urls.each do |url|
63
123
  next unless url.is_a?(String)
64
-
124
+
65
125
  metadata = @metadata[url] || {}
66
126
  UrlCategorise::Models::ListMetadata.find_or_create_by(url: url) do |record|
67
127
  record.name = category.to_s
@@ -76,7 +136,7 @@ module UrlCategorise
76
136
  @hosts.each do |category, domains|
77
137
  domains.each do |domain|
78
138
  next if domain.nil? || domain.empty?
79
-
139
+
80
140
  existing = UrlCategorise::Models::Domain.find_by(domain: domain)
81
141
  if existing
82
142
  # Add category if not already present
@@ -92,15 +152,15 @@ module UrlCategorise
92
152
  end
93
153
 
94
154
  # Store IP data (for IP-based lists)
95
- ip_categories = [:sanctions_ips, :compromised_ips, :tor_exit_nodes, :open_proxy_ips,
96
- :banking_trojans, :malicious_ssl_certificates, :top_attack_sources]
97
-
155
+ ip_categories = %i[sanctions_ips compromised_ips tor_exit_nodes open_proxy_ips
156
+ banking_trojans malicious_ssl_certificates top_attack_sources]
157
+
98
158
  ip_categories.each do |category|
99
159
  next unless @hosts[category]
100
-
160
+
101
161
  @hosts[category].each do |ip|
102
162
  next if ip.nil? || ip.empty? || !ip.match(/^\d+\.\d+\.\d+\.\d+$/)
103
-
163
+
104
164
  existing = UrlCategorise::Models::IpAddress.find_by(ip_address: ip)
105
165
  if existing
106
166
  categories = existing.categories | [category.to_s]
@@ -114,5 +174,22 @@ module UrlCategorise
114
174
  end
115
175
  end
116
176
  end
177
+
178
+ def store_dataset_metadata_in_db(source_type:, identifier:, metadata:, category_mappings: nil,
179
+ processing_options: nil)
180
+ return unless UrlCategorise::Models.available?
181
+
182
+ UrlCategorise::Models::DatasetMetadata.find_or_create_by(data_hash: metadata[:data_hash]) do |record|
183
+ record.source_type = source_type
184
+ record.identifier = identifier
185
+ record.total_entries = metadata[:total_entries]
186
+ record.category_mappings = category_mappings || {}
187
+ record.processing_options = processing_options || {}
188
+ record.processed_at = metadata[:processed_at] || Time.now
189
+ end
190
+ rescue ActiveRecord::RecordInvalid => e
191
+ # Dataset metadata already exists or validation failed
192
+ puts "Warning: Failed to store dataset metadata: #{e.message}" if ENV['DEBUG']
193
+ end
117
194
  end
118
- end
195
+ end