UrlCategorise 0.1.2 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +10 -1
- data/.gitignore +1 -0
- data/CLAUDE.md +88 -3
- data/Gemfile +2 -2
- data/Gemfile.lock +18 -9
- data/README.md +517 -4
- data/Rakefile +8 -8
- data/bin/check_lists +12 -13
- data/bin/console +3 -3
- data/bin/export_csv +83 -0
- data/bin/export_hosts +68 -0
- data/bin/rake +2 -0
- data/correct_usage_example.rb +64 -0
- data/docs/v0.1.4-features.md +215 -0
- data/lib/url_categorise/active_record_client.rb +98 -21
- data/lib/url_categorise/client.rb +641 -134
- data/lib/url_categorise/constants.rb +86 -71
- data/lib/url_categorise/dataset_processor.rb +476 -0
- data/lib/url_categorise/iab_compliance.rb +147 -0
- data/lib/url_categorise/models.rb +53 -14
- data/lib/url_categorise/version.rb +1 -1
- data/lib/url_categorise.rb +3 -0
- data/url_categorise.gemspec +37 -33
- metadata +142 -52
data/bin/export_csv
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'optparse'
|
5
|
+
require_relative '../lib/url_categorise'
|
6
|
+
|
7
|
+
options = {
|
8
|
+
output_path: nil,
|
9
|
+
cache_dir: nil,
|
10
|
+
verbose: false,
|
11
|
+
iab_compliance: false,
|
12
|
+
smart_categorization: false
|
13
|
+
}
|
14
|
+
|
15
|
+
OptionParser.new do |opts|
|
16
|
+
opts.banner = "Usage: #{$0} [options]"
|
17
|
+
opts.separator ""
|
18
|
+
opts.separator "Export all categorized domains and metadata as a single CSV file for AI training"
|
19
|
+
opts.separator ""
|
20
|
+
|
21
|
+
opts.on("-o", "--output PATH", "Output directory path (default: cache_dir/exports/csv or ./exports/csv)") do |path|
|
22
|
+
options[:output_path] = path
|
23
|
+
end
|
24
|
+
|
25
|
+
opts.on("-c", "--cache-dir PATH", "Cache directory path for client initialization") do |path|
|
26
|
+
options[:cache_dir] = path
|
27
|
+
end
|
28
|
+
|
29
|
+
opts.on("--iab-compliance", "Enable IAB compliance for category mapping") do
|
30
|
+
options[:iab_compliance] = true
|
31
|
+
end
|
32
|
+
|
33
|
+
opts.on("--smart-categorization", "Enable smart categorization") do
|
34
|
+
options[:smart_categorization] = true
|
35
|
+
end
|
36
|
+
|
37
|
+
opts.on("-v", "--verbose", "Verbose output") do
|
38
|
+
options[:verbose] = true
|
39
|
+
end
|
40
|
+
|
41
|
+
opts.on("-h", "--help", "Show this help message") do
|
42
|
+
puts opts
|
43
|
+
exit
|
44
|
+
end
|
45
|
+
end.parse!
|
46
|
+
|
47
|
+
puts "=== UrlCategorise CSV Data Export ===" if options[:verbose]
|
48
|
+
puts "Initializing client..." if options[:verbose]
|
49
|
+
|
50
|
+
begin
|
51
|
+
client = UrlCategorise::Client.new(
|
52
|
+
cache_dir: options[:cache_dir],
|
53
|
+
iab_compliance: options[:iab_compliance],
|
54
|
+
smart_categorization: options[:smart_categorization]
|
55
|
+
)
|
56
|
+
|
57
|
+
puts "Exporting CSV data..." if options[:verbose]
|
58
|
+
|
59
|
+
result = client.export_csv_data(options[:output_path])
|
60
|
+
|
61
|
+
puts "\n✅ Export completed successfully!"
|
62
|
+
puts "š Export directory: #{result[:export_directory]}"
|
63
|
+
puts "š CSV file: #{result[:csv_file]}"
|
64
|
+
puts "š Metadata file: #{result[:metadata_file]}"
|
65
|
+
|
66
|
+
puts "\nš Data Summary:"
|
67
|
+
puts " Total domains: #{result[:summary][:total_domains]}"
|
68
|
+
puts " Total categories: #{result[:summary][:total_categories]}"
|
69
|
+
puts " Dataset categories: #{result[:summary][:dataset_categories_count]}"
|
70
|
+
puts " Blocklist categories: #{result[:summary][:blocklist_categories_count]}"
|
71
|
+
|
72
|
+
if options[:verbose]
|
73
|
+
puts "\n🏷️ Categories included:"
|
74
|
+
result[:summary][:categories].each do |category|
|
75
|
+
puts " - #{category}"
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
rescue StandardError => e
|
80
|
+
puts "❌ Error: #{e.message}"
|
81
|
+
puts e.backtrace if options[:verbose]
|
82
|
+
exit 1
|
83
|
+
end
|
data/bin/export_hosts
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'optparse'
|
5
|
+
require_relative '../lib/url_categorise'
|
6
|
+
|
7
|
+
options = {
|
8
|
+
output_path: nil,
|
9
|
+
cache_dir: nil,
|
10
|
+
verbose: false
|
11
|
+
}
|
12
|
+
|
13
|
+
OptionParser.new do |opts|
|
14
|
+
opts.banner = "Usage: #{$0} [options]"
|
15
|
+
opts.separator ""
|
16
|
+
opts.separator "Export all categorized domains as separate hosts files per category"
|
17
|
+
opts.separator ""
|
18
|
+
|
19
|
+
opts.on("-o", "--output PATH", "Output directory path (default: cache_dir/exports/hosts or ./exports/hosts)") do |path|
|
20
|
+
options[:output_path] = path
|
21
|
+
end
|
22
|
+
|
23
|
+
opts.on("-c", "--cache-dir PATH", "Cache directory path for client initialization") do |path|
|
24
|
+
options[:cache_dir] = path
|
25
|
+
end
|
26
|
+
|
27
|
+
opts.on("-v", "--verbose", "Verbose output") do
|
28
|
+
options[:verbose] = true
|
29
|
+
end
|
30
|
+
|
31
|
+
opts.on("-h", "--help", "Show this help message") do
|
32
|
+
puts opts
|
33
|
+
exit
|
34
|
+
end
|
35
|
+
end.parse!
|
36
|
+
|
37
|
+
puts "=== UrlCategorise Hosts Export ===" if options[:verbose]
|
38
|
+
puts "Initializing client..." if options[:verbose]
|
39
|
+
|
40
|
+
begin
|
41
|
+
client = UrlCategorise::Client.new(
|
42
|
+
cache_dir: options[:cache_dir]
|
43
|
+
)
|
44
|
+
|
45
|
+
puts "Exporting hosts files..." if options[:verbose]
|
46
|
+
|
47
|
+
result = client.export_hosts_files(options[:output_path])
|
48
|
+
|
49
|
+
summary = result.delete(:_summary)
|
50
|
+
|
51
|
+
puts "\n✅ Export completed successfully!"
|
52
|
+
puts "š Export directory: #{summary[:export_directory]}"
|
53
|
+
puts "š Total categories exported: #{summary[:total_categories]}"
|
54
|
+
puts "š Total domains exported: #{summary[:total_domains]}"
|
55
|
+
puts "š Summary file: #{summary[:path]}"
|
56
|
+
|
57
|
+
if options[:verbose]
|
58
|
+
puts "\nš Files created:"
|
59
|
+
result.each do |category, info|
|
60
|
+
puts " #{info[:filename]} - #{info[:count]} domains"
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
rescue StandardError => e
|
65
|
+
puts "❌ Error: #{e.message}"
|
66
|
+
puts e.backtrace if options[:verbose]
|
67
|
+
exit 1
|
68
|
+
end
|
data/bin/rake (+2 −0; its hunk is missing from this extract)
data/correct_usage_example.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bundler/setup'
|
4
|
+
require './lib/url_categorise'
|
5
|
+
|
6
|
+
puts "=== Large Dataset Loading Example ==="
|
7
|
+
|
8
|
+
# Configuration for handling large datasets (300+ MB)
|
9
|
+
# First test with cache-only mode
|
10
|
+
puts "Creating client with cached datasets only..."
|
11
|
+
client = UrlCategorise::Client.new(
|
12
|
+
cache_dir: './url_cache',
|
13
|
+
auto_load_datasets: true,
|
14
|
+
smart_categorization: true,
|
15
|
+
dataset_config: {
|
16
|
+
cache_path: './url_cache/datasets',
|
17
|
+
download_path: './url_cache/downloads',
|
18
|
+
kaggle: { credentials_file: '~/kaggle.json' }
|
19
|
+
}
|
20
|
+
)
|
21
|
+
|
22
|
+
puts "Client created successfully!"
|
23
|
+
puts ""
|
24
|
+
puts "Dataset Statistics:"
|
25
|
+
puts " Total categories: #{client.count_of_categories}"
|
26
|
+
puts " Dataset categories: #{client.count_of_dataset_categories}"
|
27
|
+
puts " Blocklist categories: #{client.count_of_categories - client.count_of_dataset_categories}"
|
28
|
+
puts ""
|
29
|
+
puts " Total hosts: #{client.count_of_hosts.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse}"
|
30
|
+
puts " Dataset hosts: #{client.count_of_dataset_hosts.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse}"
|
31
|
+
puts ""
|
32
|
+
puts " Total data size: #{client.size_of_data.round(1)} MB"
|
33
|
+
puts " Dataset data size: #{client.size_of_dataset_data.round(1)} MB"
|
34
|
+
puts " Blocklist data size: #{client.size_of_blocklist_data.round(1)} MB"
|
35
|
+
|
36
|
+
puts ""
|
37
|
+
puts "Dataset-specific Statistics:"
|
38
|
+
# Get dataset metadata if available
|
39
|
+
metadata = client.dataset_metadata
|
40
|
+
if metadata && !metadata.empty?
|
41
|
+
puts " Datasets loaded: #{metadata.size}"
|
42
|
+
|
43
|
+
# Calculate size for each dataset by finding its categories and domains
|
44
|
+
dataset_categories = client.instance_variable_get(:@dataset_categories)
|
45
|
+
total_dataset_size = 0
|
46
|
+
|
47
|
+
metadata.each_with_index do |(hash, data), index|
|
48
|
+
# Estimate size contribution of this dataset
|
49
|
+
dataset_portion = data[:total_entries].to_f / metadata.values.sum { |d| d[:total_entries] }
|
50
|
+
dataset_size_mb = (client.size_of_dataset_data * dataset_portion).round(2)
|
51
|
+
total_dataset_size += dataset_size_mb
|
52
|
+
|
53
|
+
puts " Dataset #{index + 1}:"
|
54
|
+
puts " Processed at: #{data[:processed_at]}"
|
55
|
+
puts " Total entries: #{data[:total_entries].to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse}"
|
56
|
+
puts " Estimated size: #{dataset_size_mb} MB"
|
57
|
+
puts " Data hash: #{hash[0..12]}..."
|
58
|
+
end
|
59
|
+
|
60
|
+
puts ""
|
61
|
+
puts " Total dataset size: #{total_dataset_size.round(2)} MB (#{client.size_of_dataset_data.round(1)} MB actual)"
|
62
|
+
else
|
63
|
+
puts " No dataset metadata available"
|
64
|
+
end
|
data/docs/v0.1.4-features.md
ADDED
@@ -0,0 +1,215 @@
|
|
1
|
+
# UrlCategorise v0.1.4 Release Notes
|
2
|
+
|
3
|
+
## New Features Added
|
4
|
+
|
5
|
+
### š Dataset-Specific Helper Methods
|
6
|
+
|
7
|
+
Added dedicated helper methods for tracking dataset-only statistics separately from DNS blocklists:
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
client = UrlCategorise::Client.new(dataset_config: { kaggle: {} })
|
11
|
+
client.load_kaggle_dataset('owner', 'dataset-name')
|
12
|
+
|
13
|
+
# New dataset-specific methods
|
14
|
+
client.count_of_dataset_hosts # Returns hosts from datasets only
|
15
|
+
client.count_of_dataset_categories # Returns categories from datasets only
|
16
|
+
|
17
|
+
# Existing methods still work and count both
|
18
|
+
client.count_of_hosts # All hosts (DNS lists + datasets)
|
19
|
+
client.count_of_categories # All categories (DNS lists + datasets)
|
20
|
+
```
|
21
|
+
|
22
|
+
**Implementation Details:**
|
23
|
+
- `count_of_dataset_hosts` - Sums hosts from categories tracked in `@dataset_categories` Set
|
24
|
+
- `count_of_dataset_categories` - Returns size of `@dataset_categories` Set
|
25
|
+
- Gracefully handles nil categories with safe navigation (`&.size || 0`)
|
26
|
+
- Both methods return 0 when no datasets are loaded
|
27
|
+
|
28
|
+
### š IAB Content Taxonomy Compliance
|
29
|
+
|
30
|
+
Full support for IAB (Interactive Advertising Bureau) Content Taxonomy standards:
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
# Enable IAB v3.0 compliance (recommended)
|
34
|
+
client = UrlCategorise::Client.new(
|
35
|
+
iab_compliance: true,
|
36
|
+
iab_version: :v3
|
37
|
+
)
|
38
|
+
|
39
|
+
# Enable IAB v2.0 compliance
|
40
|
+
client = UrlCategorise::Client.new(
|
41
|
+
iab_compliance: true,
|
42
|
+
iab_version: :v2
|
43
|
+
)
|
44
|
+
|
45
|
+
# Categorization returns IAB codes instead of custom categories
|
46
|
+
categories = client.categorise("badsite.com")
|
47
|
+
puts categories # => ["626"] (IAB v3 code for illegal content)
|
48
|
+
|
49
|
+
# Helper methods
|
50
|
+
client.iab_compliant? # => true/false
|
51
|
+
client.get_iab_mapping(:malware) # => "626" (v3) or "IAB25" (v2)
|
52
|
+
```
|
53
|
+
|
54
|
+
**IAB Category Mappings:**
|
55
|
+
|
56
|
+
**IAB Content Taxonomy v3.0:**
|
57
|
+
- Security threats (`malware`, `phishing`, `illegal`) → `626` (Illegal Content)
|
58
|
+
- Advertising (`advertising`, `mobile_ads`) → `3` (Advertising)
|
59
|
+
- Gambling → `7-39` (Gambling subcategory)
|
60
|
+
- Adult content (`pornography`) → `626` (Adult Content)
|
61
|
+
- Social platforms → `14` (Society)
|
62
|
+
- Technology → `19` (Technology & Computing)
|
63
|
+
|
64
|
+
**IAB Content Taxonomy v2.0:**
|
65
|
+
- Security threats → `IAB25` (Non-Standard Content)
|
66
|
+
- Advertising → `IAB3` (Advertising)
|
67
|
+
- Gambling → `IAB7-39` (Gambling subcategory)
|
68
|
+
- Adult content → `IAB25-3` (Pornography)
|
69
|
+
|
70
|
+
**Implementation Details:**
|
71
|
+
- New `IabCompliance` module with comprehensive mappings
|
72
|
+
- Support for both v2.0 and v3.0 standards
|
73
|
+
- IAB compliance affects all categorization methods (`categorise`, `categorise_ip`, `resolve_and_categorise`)
|
74
|
+
- Automatic deduplication of IAB codes
|
75
|
+
- Graceful handling of unknown categories (returns 'Unknown')
|
76
|
+
|
77
|
+
### 🧪 Comprehensive Test Coverage
|
78
|
+
|
79
|
+
Added extensive test suites for new features:
|
80
|
+
|
81
|
+
**Dataset Methods Tests:**
|
82
|
+
- `client_dataset_methods_test.rb` (9 tests)
|
83
|
+
- Tests for empty, populated, and mixed dataset scenarios
|
84
|
+
- Integration tests with existing methods
|
85
|
+
- Edge case handling (nil categories, empty arrays)
|
86
|
+
|
87
|
+
**IAB Compliance Tests:**
|
88
|
+
- `iab_compliance_test.rb` (14 tests) - Module functionality
|
89
|
+
- `client_iab_compliance_test.rb` (19 tests) - Client integration
|
90
|
+
- Comprehensive mapping validation for v2 and v3
|
91
|
+
- Integration with all categorization methods
|
92
|
+
- DNS resolution with IAB compliance
|
93
|
+
- Error handling and edge cases
|
94
|
+
|
95
|
+
**Total Test Stats:**
|
96
|
+
- **316 tests, 2455 assertions, 0 failures, 0 errors**
|
97
|
+
- **94.69% line coverage** (660/697 lines)
|
98
|
+
- All new features fully tested with edge cases
|
99
|
+
|
100
|
+
### 🔧 Technical Implementation
|
101
|
+
|
102
|
+
**New Files:**
|
103
|
+
- `lib/url_categorise/iab_compliance.rb` - IAB mapping module
|
104
|
+
- `test/url_categorise/client_dataset_methods_test.rb` - Dataset helper tests
|
105
|
+
- `test/url_categorise/iab_compliance_test.rb` - IAB module tests
|
106
|
+
- `test/url_categorise/client_iab_compliance_test.rb` - IAB client integration tests
|
107
|
+
|
108
|
+
**Updated Files:**
|
109
|
+
- `lib/url_categorise/client.rb` - Added dataset helpers and IAB support
|
110
|
+
- `lib/url_categorise.rb` - Required new IAB module
|
111
|
+
- `lib/url_categorise/version.rb` - Bumped to 0.1.4
|
112
|
+
|
113
|
+
**New Client Attributes:**
|
114
|
+
- `iab_compliance_enabled` - Boolean flag for IAB compliance
|
115
|
+
- `iab_version` - IAB taxonomy version (:v2 or :v3)
|
116
|
+
|
117
|
+
**New Client Methods:**
|
118
|
+
- `count_of_dataset_hosts` - Dataset-specific host count
|
119
|
+
- `count_of_dataset_categories` - Dataset-specific category count
|
120
|
+
- `iab_compliant?` - Check IAB compliance status
|
121
|
+
- `get_iab_mapping(category)` - Get IAB code for category
|
122
|
+
|
123
|
+
### š Usage Examples
|
124
|
+
|
125
|
+
**Dataset Statistics:**
|
126
|
+
```ruby
|
127
|
+
client = UrlCategorise::Client.new(dataset_config: { kaggle: {} })
|
128
|
+
client.load_kaggle_dataset('owner', 'dataset')
|
129
|
+
|
130
|
+
puts "Total hosts: #{client.count_of_hosts}" # All sources
|
131
|
+
puts "Dataset hosts: #{client.count_of_dataset_hosts}" # Datasets only
|
132
|
+
puts "DNS list hosts: #{client.count_of_hosts - client.count_of_dataset_hosts}"
|
133
|
+
```
|
134
|
+
|
135
|
+
**IAB Compliance:**
|
136
|
+
```ruby
|
137
|
+
# Production environment with IAB compliance
|
138
|
+
client = UrlCategorise::Client.new(
|
139
|
+
iab_compliance: true,
|
140
|
+
iab_version: :v3,
|
141
|
+
dataset_config: { kaggle: { username: 'user', api_key: 'key' } }
|
142
|
+
)
|
143
|
+
|
144
|
+
# All methods return IAB codes
|
145
|
+
domain_cats = client.categorise("example.com") # => ["3", "626"]
|
146
|
+
ip_cats = client.categorise_ip("192.168.1.100") # => ["626"]
|
147
|
+
resolved_cats = client.resolve_and_categorise("site.com") # => ["3"]
|
148
|
+
|
149
|
+
# Check compliance
|
150
|
+
puts "IAB compliant: #{client.iab_compliant?}" # => true
|
151
|
+
puts "Using version: #{client.iab_version}" # => :v3
|
152
|
+
```
|
153
|
+
|
154
|
+
**Rails Service Integration:**
|
155
|
+
```ruby
|
156
|
+
class UrlCategorizerService
|
157
|
+
def initialize
|
158
|
+
@client = UrlCategorise::ActiveRecordClient.new(
|
159
|
+
iab_compliance: Rails.env.production?,
|
160
|
+
iab_version: :v3,
|
161
|
+
dataset_config: {
|
162
|
+
kaggle: {
|
163
|
+
username: ENV['KAGGLE_USERNAME'],
|
164
|
+
api_key: ENV['KAGGLE_API_KEY']
|
165
|
+
}
|
166
|
+
}
|
167
|
+
)
|
168
|
+
end
|
169
|
+
|
170
|
+
def stats
|
171
|
+
{
|
172
|
+
total_hosts: @client.count_of_hosts,
|
173
|
+
dataset_hosts: @client.count_of_dataset_hosts,
|
174
|
+
dns_list_hosts: @client.count_of_hosts - @client.count_of_dataset_hosts,
|
175
|
+
iab_compliant: @client.iab_compliant?,
|
176
|
+
iab_version: @client.iab_version
|
177
|
+
}
|
178
|
+
end
|
179
|
+
end
|
180
|
+
```
|
181
|
+
|
182
|
+
### š Migration Guide
|
183
|
+
|
184
|
+
**From v0.1.3 to v0.1.4:**
|
185
|
+
|
186
|
+
1. **No Breaking Changes** - All existing code continues to work
|
187
|
+
2. **Optional New Features** - IAB compliance and dataset helpers are opt-in
|
188
|
+
3. **Enhanced Statistics** - Use new helper methods for better insights
|
189
|
+
|
190
|
+
**Recommended Updates:**
|
191
|
+
```ruby
|
192
|
+
# Before (still works)
|
193
|
+
client = UrlCategorise::Client.new
|
194
|
+
puts "Total: #{client.count_of_hosts}"
|
195
|
+
|
196
|
+
# After (enhanced)
|
197
|
+
client = UrlCategorise::Client.new(
|
198
|
+
iab_compliance: true, # Optional IAB compliance
|
199
|
+
iab_version: :v3 # Optional version selection
|
200
|
+
)
|
201
|
+
puts "Total hosts: #{client.count_of_hosts}"
|
202
|
+
puts "Dataset hosts: #{client.count_of_dataset_hosts}"
|
203
|
+
puts "IAB compliant: #{client.iab_compliant?}"
|
204
|
+
```
|
205
|
+
|
206
|
+
### š Quality Assurance
|
207
|
+
|
208
|
+
- ✅ All 316 tests pass with 0 failures/errors
|
209
|
+
- ✅ 94.69% line coverage maintained
|
210
|
+
- ✅ Code style enforced with rubocop
|
211
|
+
- ✅ Comprehensive edge case testing
|
212
|
+
- ✅ Memory-efficient implementation
|
213
|
+
- ✅ Backward compatibility preserved
|
214
|
+
|
215
|
+
This release adds powerful new features while maintaining the reliability and performance standards established in previous versions.
|
data/lib/url_categorise/active_record_client.rb
@@ -3,65 +3,125 @@ require_relative 'models'
|
|
3
3
|
module UrlCategorise
|
4
4
|
class ActiveRecordClient < Client
|
5
5
|
def initialize(**kwargs)
|
6
|
-
raise
|
7
|
-
|
6
|
+
raise 'ActiveRecord not available' unless UrlCategorise::Models.available?
|
7
|
+
|
8
8
|
@use_database = kwargs.delete(:use_database) { true }
|
9
9
|
super(**kwargs)
|
10
|
-
|
10
|
+
|
11
11
|
populate_database if @use_database
|
12
12
|
end
|
13
13
|
|
14
14
|
def categorise(url)
|
15
15
|
return super(url) unless @use_database && UrlCategorise::Models.available?
|
16
|
-
|
17
|
-
host = (URI.parse(url).host || url).downcase.gsub(
|
18
|
-
|
16
|
+
|
17
|
+
host = (URI.parse(url).host || url).downcase.gsub('www.', '')
|
18
|
+
|
19
19
|
# Try database first
|
20
20
|
categories = UrlCategorise::Models::Domain.categorise(host)
|
21
21
|
return categories unless categories.empty?
|
22
|
-
|
22
|
+
|
23
23
|
# Fallback to memory-based categorization
|
24
24
|
super(url)
|
25
25
|
end
|
26
26
|
|
27
27
|
def categorise_ip(ip_address)
|
28
28
|
return super(ip_address) unless @use_database && UrlCategorise::Models.available?
|
29
|
-
|
29
|
+
|
30
30
|
# Try database first
|
31
31
|
categories = UrlCategorise::Models::IpAddress.categorise(ip_address)
|
32
32
|
return categories unless categories.empty?
|
33
|
-
|
33
|
+
|
34
34
|
# Fallback to memory-based categorization
|
35
35
|
super(ip_address)
|
36
36
|
end
|
37
37
|
|
38
38
|
def update_database
|
39
39
|
return unless @use_database && UrlCategorise::Models.available?
|
40
|
-
|
40
|
+
|
41
41
|
populate_database
|
42
42
|
end
|
43
43
|
|
44
44
|
def database_stats
|
45
45
|
return {} unless @use_database && UrlCategorise::Models.available?
|
46
|
-
|
46
|
+
|
47
47
|
{
|
48
48
|
domains: UrlCategorise::Models::Domain.count,
|
49
49
|
ip_addresses: UrlCategorise::Models::IpAddress.count,
|
50
50
|
list_metadata: UrlCategorise::Models::ListMetadata.count,
|
51
|
+
dataset_metadata: UrlCategorise::Models::DatasetMetadata.count,
|
51
52
|
categories: UrlCategorise::Models::Domain.distinct.pluck(:categories).flatten.uniq.size
|
52
53
|
}
|
53
54
|
end
|
54
55
|
|
56
|
+
def load_kaggle_dataset(dataset_owner, dataset_name, options = {})
|
57
|
+
result = super(dataset_owner, dataset_name, options)
|
58
|
+
|
59
|
+
# Store dataset metadata in database if enabled
|
60
|
+
if @use_database && UrlCategorise::Models.available? && @dataset_metadata
|
61
|
+
store_dataset_metadata_in_db(
|
62
|
+
source_type: 'kaggle',
|
63
|
+
identifier: "#{dataset_owner}/#{dataset_name}",
|
64
|
+
metadata: @dataset_metadata.values.last,
|
65
|
+
category_mappings: options[:category_mappings],
|
66
|
+
processing_options: options
|
67
|
+
)
|
68
|
+
end
|
69
|
+
|
70
|
+
# Repopulate database with integrated dataset domains
|
71
|
+
populate_database if @use_database
|
72
|
+
|
73
|
+
result
|
74
|
+
end
|
75
|
+
|
76
|
+
def load_csv_dataset(url, options = {})
|
77
|
+
result = super(url, options)
|
78
|
+
|
79
|
+
# Store dataset metadata in database if enabled
|
80
|
+
if @use_database && UrlCategorise::Models.available? && @dataset_metadata
|
81
|
+
store_dataset_metadata_in_db(
|
82
|
+
source_type: 'csv',
|
83
|
+
identifier: url,
|
84
|
+
metadata: @dataset_metadata.values.last,
|
85
|
+
category_mappings: options[:category_mappings],
|
86
|
+
processing_options: options
|
87
|
+
)
|
88
|
+
end
|
89
|
+
|
90
|
+
# Repopulate database with integrated dataset domains
|
91
|
+
populate_database if @use_database
|
92
|
+
|
93
|
+
result
|
94
|
+
end
|
95
|
+
|
96
|
+
def dataset_history(source_type: nil, limit: 10)
|
97
|
+
return [] unless @use_database && UrlCategorise::Models.available?
|
98
|
+
|
99
|
+
query = UrlCategorise::Models::DatasetMetadata.order(processed_at: :desc).limit(limit)
|
100
|
+
query = query.by_source(source_type) if source_type
|
101
|
+
|
102
|
+
query.map do |record|
|
103
|
+
{
|
104
|
+
source_type: record.source_type,
|
105
|
+
identifier: record.identifier,
|
106
|
+
data_hash: record.data_hash,
|
107
|
+
total_entries: record.total_entries,
|
108
|
+
processed_at: record.processed_at,
|
109
|
+
category_mappings: record.category_mappings,
|
110
|
+
processing_options: record.processing_options
|
111
|
+
}
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
55
115
|
private
|
56
116
|
|
57
117
|
def populate_database
|
58
118
|
return unless UrlCategorise::Models.available?
|
59
|
-
|
119
|
+
|
60
120
|
# Store list metadata
|
61
|
-
|
121
|
+
(host_urls || {}).each do |category, urls|
|
62
122
|
urls.each do |url|
|
63
123
|
next unless url.is_a?(String)
|
64
|
-
|
124
|
+
|
65
125
|
metadata = @metadata[url] || {}
|
66
126
|
UrlCategorise::Models::ListMetadata.find_or_create_by(url: url) do |record|
|
67
127
|
record.name = category.to_s
|
@@ -76,7 +136,7 @@ module UrlCategorise
|
|
76
136
|
@hosts.each do |category, domains|
|
77
137
|
domains.each do |domain|
|
78
138
|
next if domain.nil? || domain.empty?
|
79
|
-
|
139
|
+
|
80
140
|
existing = UrlCategorise::Models::Domain.find_by(domain: domain)
|
81
141
|
if existing
|
82
142
|
# Add category if not already present
|
@@ -92,15 +152,15 @@ module UrlCategorise
|
|
92
152
|
end
|
93
153
|
|
94
154
|
# Store IP data (for IP-based lists)
|
95
|
-
ip_categories = [
|
96
|
-
|
97
|
-
|
155
|
+
ip_categories = %i[sanctions_ips compromised_ips tor_exit_nodes open_proxy_ips
|
156
|
+
banking_trojans malicious_ssl_certificates top_attack_sources]
|
157
|
+
|
98
158
|
ip_categories.each do |category|
|
99
159
|
next unless @hosts[category]
|
100
|
-
|
160
|
+
|
101
161
|
@hosts[category].each do |ip|
|
102
162
|
next if ip.nil? || ip.empty? || !ip.match(/^\d+\.\d+\.\d+\.\d+$/)
|
103
|
-
|
163
|
+
|
104
164
|
existing = UrlCategorise::Models::IpAddress.find_by(ip_address: ip)
|
105
165
|
if existing
|
106
166
|
categories = existing.categories | [category.to_s]
|
@@ -114,5 +174,22 @@ module UrlCategorise
|
|
114
174
|
end
|
115
175
|
end
|
116
176
|
end
|
177
|
+
|
178
|
+
def store_dataset_metadata_in_db(source_type:, identifier:, metadata:, category_mappings: nil,
|
179
|
+
processing_options: nil)
|
180
|
+
return unless UrlCategorise::Models.available?
|
181
|
+
|
182
|
+
UrlCategorise::Models::DatasetMetadata.find_or_create_by(data_hash: metadata[:data_hash]) do |record|
|
183
|
+
record.source_type = source_type
|
184
|
+
record.identifier = identifier
|
185
|
+
record.total_entries = metadata[:total_entries]
|
186
|
+
record.category_mappings = category_mappings || {}
|
187
|
+
record.processing_options = processing_options || {}
|
188
|
+
record.processed_at = metadata[:processed_at] || Time.now
|
189
|
+
end
|
190
|
+
rescue ActiveRecord::RecordInvalid => e
|
191
|
+
# Dataset metadata already exists or validation failed
|
192
|
+
puts "Warning: Failed to store dataset metadata: #{e.message}" if ENV['DEBUG']
|
193
|
+
end
|
117
194
|
end
|
118
|
-
end
|
195
|
+
end
|