UrlCategorise 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +5 -1
- data/CLAUDE.md +12 -2
- data/Gemfile +2 -2
- data/Gemfile.lock +6 -9
- data/README.md +189 -1
- data/Rakefile +8 -8
- data/bin/check_lists +12 -13
- data/bin/console +3 -3
- data/lib/url_categorise/active_record_client.rb +97 -20
- data/lib/url_categorise/client.rb +220 -111
- data/lib/url_categorise/constants.rb +86 -71
- data/lib/url_categorise/dataset_processor.rb +471 -0
- data/lib/url_categorise/models.rb +53 -14
- data/lib/url_categorise/version.rb +1 -1
- data/lib/url_categorise.rb +1 -0
- data/url_categorise.gemspec +34 -32
- metadata +91 -50
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dcb05d79b6bc09b5b338183d412cd309d9634a342c95b14ea25df5926d8609fb
|
4
|
+
data.tar.gz: effa4c7a010ee574fe6a41653af553a68710ceca46ebdb9dd5352096af5fa7e1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a527c801cbf6318305d640925dd75922c2ac1bcc76a5de75c75e3ad24698305c3d7d885d7da8dd280e61ceb1fe91a57eac185c5c209e11685f2ddb6833b120b9
|
7
|
+
data.tar.gz: de81765d20a0b36c54b71b935928777140f86f6e0a130c71ec7804ed28c1b3d8f12ca30fff5cf8c93952aaed7a9c279fe35a0d32bc83a851aad2556b55fd7942
|
data/.claude/settings.local.json
CHANGED
@@ -6,7 +6,11 @@
|
|
6
6
|
"Bash(ruby:*)",
|
7
7
|
"Bash(bundle exec ruby:*)",
|
8
8
|
"Bash(find:*)",
|
9
|
-
"Bash(grep:*)"
|
9
|
+
"Bash(grep:*)",
|
10
|
+
"Read(//Users/trex22/development/rubygems/kaggle/**)",
|
11
|
+
"Bash(for file in test/url_categorise/*dataset*test.rb)",
|
12
|
+
"Bash(do echo \"Checking $file...\")",
|
13
|
+
"Bash(done)"
|
10
14
|
],
|
11
15
|
"deny": []
|
12
16
|
}
|
data/CLAUDE.md
CHANGED
@@ -78,12 +78,20 @@ The gem includes automatic monitoring and cleanup of broken URLs:
|
|
78
78
|
- ActiveRecord/Rails integration (optional)
|
79
79
|
- URL health monitoring and reporting
|
80
80
|
- Automatic cleanup of broken blocklist sources
|
81
|
+
- **Dataset Processing**: Kaggle and CSV dataset integration with three auth methods
|
82
|
+
- **Optional Kaggle**: Can disable Kaggle functionality entirely while keeping CSV processing
|
83
|
+
- **Smart Caching**: Cached datasets work without credentials, avoiding unnecessary authentication
|
84
|
+
- **Data Hashing**: SHA256 content hashing for dataset change detection
|
85
|
+
- **Category Mapping**: Flexible column detection and category mapping for datasets
|
86
|
+
- **Credential Warnings**: Helpful warnings when Kaggle credentials are missing but functionality continues
|
81
87
|
|
82
88
|
### Architecture
|
83
89
|
- `Client` class: Main interface for categorization
|
90
|
+
- `DatasetProcessor` class: Handles Kaggle and CSV dataset processing
|
84
91
|
- `Constants` module: Contains default list URLs and categories
|
85
|
-
-
|
86
|
-
-
|
92
|
+
- `ActiveRecordClient` class: Database-backed client with dataset history
|
93
|
+
- Modular design allows extending with new list sources and datasets
|
94
|
+
- Support for custom list directories, caching, and dataset integration
|
87
95
|
|
88
96
|
### List Sources
|
89
97
|
Primary sources include:
|
@@ -91,6 +99,8 @@ Primary sources include:
|
|
91
99
|
- hagezi/dns-blocklists
|
92
100
|
- StevenBlack/hosts
|
93
101
|
- Various specialized security lists
|
102
|
+
- **Kaggle datasets**: Public URL classification datasets
|
103
|
+
- **Custom CSV files**: Direct CSV dataset URLs with flexible column mapping
|
94
104
|
|
95
105
|
### Testing Guidelines
|
96
106
|
- Mock all HTTP requests using WebMock
|
data/Gemfile
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
source
|
1
|
+
source 'https://rubygems.org'
|
2
2
|
|
3
|
-
git_source(:github) {|
|
3
|
+
git_source(:github) { |_repo_name| 'https://github.com/TRex22/url_categorise' }
|
4
4
|
|
5
5
|
# Specify your gem's dependencies in url_categorise.gemspec
|
6
6
|
gemspec
|
data/Gemfile.lock
CHANGED
@@ -1,14 +1,16 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
UrlCategorise (0.1.
|
4
|
+
UrlCategorise (0.1.3)
|
5
5
|
api_pattern (>= 0.0.6, < 1.0)
|
6
6
|
csv (>= 3.3.0, < 4.0)
|
7
7
|
digest (>= 3.1.0, < 4.0)
|
8
8
|
fileutils (>= 1.7.0, < 2.0)
|
9
9
|
httparty (>= 0.22.0, < 1.0)
|
10
|
+
json (>= 2.7.0, < 3.0)
|
10
11
|
nokogiri (>= 1.18.9, < 2.0)
|
11
12
|
resolv (>= 0.4.0, < 1.0)
|
13
|
+
rubyzip (>= 2.3.0, < 3.0)
|
12
14
|
|
13
15
|
GEM
|
14
16
|
remote: https://rubygems.org/
|
@@ -78,19 +80,19 @@ GEM
|
|
78
80
|
erubi (1.13.1)
|
79
81
|
fileutils (1.7.3)
|
80
82
|
hashdiff (1.2.0)
|
81
|
-
httparty (0.
|
83
|
+
httparty (0.23.1)
|
82
84
|
csv
|
83
85
|
mini_mime (>= 1.0.0)
|
84
86
|
multi_xml (>= 0.5.2)
|
85
87
|
i18n (1.14.7)
|
86
88
|
concurrent-ruby (~> 1.0)
|
89
|
+
json (2.13.2)
|
87
90
|
logger (1.7.0)
|
88
91
|
loofah (2.24.1)
|
89
92
|
crass (~> 1.0.2)
|
90
93
|
nokogiri (>= 1.12.0)
|
91
94
|
method_source (1.1.0)
|
92
95
|
mini_mime (1.1.5)
|
93
|
-
mini_portile2 (2.8.9)
|
94
96
|
minitest (5.25.5)
|
95
97
|
minitest-focus (1.4.0)
|
96
98
|
minitest (>= 4, < 6)
|
@@ -103,9 +105,6 @@ GEM
|
|
103
105
|
ruby2_keywords (>= 0.0.5)
|
104
106
|
multi_xml (0.7.2)
|
105
107
|
bigdecimal (~> 3.1)
|
106
|
-
nokogiri (1.18.9)
|
107
|
-
mini_portile2 (~> 2.8.2)
|
108
|
-
racc (~> 1.4)
|
109
108
|
nokogiri (1.18.9-arm64-darwin)
|
110
109
|
racc (~> 1.4)
|
111
110
|
pry (0.15.2)
|
@@ -130,6 +129,7 @@ GEM
|
|
130
129
|
rexml (3.4.1)
|
131
130
|
ruby-progressbar (1.13.0)
|
132
131
|
ruby2_keywords (0.0.5)
|
132
|
+
rubyzip (2.4.1)
|
133
133
|
securerandom (0.4.1)
|
134
134
|
simplecov (0.22.0)
|
135
135
|
docile (~> 1.1)
|
@@ -137,8 +137,6 @@ GEM
|
|
137
137
|
simplecov_json_formatter (~> 0.1)
|
138
138
|
simplecov-html (0.13.2)
|
139
139
|
simplecov_json_formatter (0.1.4)
|
140
|
-
sqlite3 (2.7.3)
|
141
|
-
mini_portile2 (~> 2.8.0)
|
142
140
|
sqlite3 (2.7.3-arm64-darwin)
|
143
141
|
timecop (0.9.10)
|
144
142
|
timeout (0.4.3)
|
@@ -153,7 +151,6 @@ GEM
|
|
153
151
|
|
154
152
|
PLATFORMS
|
155
153
|
arm64-darwin-24
|
156
|
-
ruby
|
157
154
|
|
158
155
|
DEPENDENCIES
|
159
156
|
UrlCategorise!
|
data/README.md
CHANGED
@@ -192,6 +192,159 @@ ruby bin/check_lists
|
|
192
192
|
|
193
193
|
[View all 60+ categories in constants.rb](lib/url_categorise/constants.rb)
|
194
194
|
|
195
|
+
## Dataset Processing
|
196
|
+
|
197
|
+
UrlCategorise now supports processing external datasets from Kaggle and CSV files to expand categorization data:
|
198
|
+
|
199
|
+
### Kaggle Dataset Integration
|
200
|
+
|
201
|
+
Load datasets directly from Kaggle using three authentication methods:
|
202
|
+
|
203
|
+
```ruby
|
204
|
+
# Method 1: Environment variables (KAGGLE_USERNAME, KAGGLE_KEY)
|
205
|
+
client = UrlCategorise::Client.new(
|
206
|
+
dataset_config: {
|
207
|
+
kaggle: {} # Will use environment variables
|
208
|
+
}
|
209
|
+
)
|
210
|
+
|
211
|
+
# Method 2: Explicit credentials
|
212
|
+
client = UrlCategorise::Client.new(
|
213
|
+
dataset_config: {
|
214
|
+
kaggle: {
|
215
|
+
username: 'your_username',
|
216
|
+
api_key: 'your_api_key'
|
217
|
+
}
|
218
|
+
}
|
219
|
+
)
|
220
|
+
|
221
|
+
# Method 3: Credentials file (~/.kaggle/kaggle.json or custom path)
|
222
|
+
client = UrlCategorise::Client.new(
|
223
|
+
dataset_config: {
|
224
|
+
kaggle: {
|
225
|
+
credentials_file: '/path/to/kaggle.json'
|
226
|
+
}
|
227
|
+
}
|
228
|
+
)
|
229
|
+
|
230
|
+
# Load and integrate a Kaggle dataset
|
231
|
+
client.load_kaggle_dataset('owner', 'dataset-name', {
|
232
|
+
use_cache: true, # Cache processed data
|
233
|
+
category_mappings: {
|
234
|
+
url_column: 'website', # Column containing URLs/domains
|
235
|
+
category_column: 'type', # Column containing categories
|
236
|
+
category_map: {
|
237
|
+
'malicious' => 'malware', # Map dataset categories to your categories
|
238
|
+
'spam' => 'phishing'
|
239
|
+
}
|
240
|
+
}
|
241
|
+
})
|
242
|
+
|
243
|
+
# Check categorization with dataset data
|
244
|
+
categories = client.categorise('https://example.com')
|
245
|
+
```
|
246
|
+
|
247
|
+
### CSV Dataset Processing
|
248
|
+
|
249
|
+
Load datasets from direct CSV URLs:
|
250
|
+
|
251
|
+
```ruby
|
252
|
+
client = UrlCategorise::Client.new(
|
253
|
+
dataset_config: {
|
254
|
+
download_path: './datasets',
|
255
|
+
cache_path: './dataset_cache'
|
256
|
+
}
|
257
|
+
)
|
258
|
+
|
259
|
+
# Load CSV dataset
|
260
|
+
client.load_csv_dataset('https://example.com/url-classification.csv', {
|
261
|
+
use_cache: true,
|
262
|
+
category_mappings: {
|
263
|
+
url_column: 'url',
|
264
|
+
category_column: 'category'
|
265
|
+
}
|
266
|
+
})
|
267
|
+
```
|
268
|
+
|
269
|
+
### Dataset Configuration Options
|
270
|
+
|
271
|
+
```ruby
|
272
|
+
dataset_config = {
|
273
|
+
# Kaggle functionality control
|
274
|
+
enable_kaggle: true, # Set to false to disable Kaggle entirely (default: true)
|
275
|
+
|
276
|
+
# Kaggle authentication (optional - will try env vars and default file)
|
277
|
+
kaggle: {
|
278
|
+
username: 'kaggle_username', # Or use KAGGLE_USERNAME env var
|
279
|
+
api_key: 'kaggle_api_key', # Or use KAGGLE_KEY env var
|
280
|
+
credentials_file: '~/.kaggle/kaggle.json' # Optional custom path
|
281
|
+
},
|
282
|
+
|
283
|
+
# File paths
|
284
|
+
download_path: './downloads', # Where to store downloads
|
285
|
+
cache_path: './cache', # Where to cache processed data
|
286
|
+
timeout: 30 # HTTP timeout for downloads
|
287
|
+
}
|
288
|
+
|
289
|
+
client = UrlCategorise::Client.new(dataset_config: dataset_config)
|
290
|
+
```
|
291
|
+
|
292
|
+
### Disabling Kaggle Functionality
|
293
|
+
|
294
|
+
You can completely disable Kaggle functionality if you only need CSV processing:
|
295
|
+
|
296
|
+
```ruby
|
297
|
+
# Disable Kaggle - only CSV datasets will work
|
298
|
+
client = UrlCategorise::Client.new(
|
299
|
+
dataset_config: {
|
300
|
+
enable_kaggle: false,
|
301
|
+
download_path: './datasets',
|
302
|
+
cache_path: './dataset_cache'
|
303
|
+
}
|
304
|
+
)
|
305
|
+
|
306
|
+
# This will raise an error
|
307
|
+
# client.load_kaggle_dataset('owner', 'dataset') # Error!
|
308
|
+
|
309
|
+
# But CSV datasets still work
|
310
|
+
client.load_csv_dataset('https://example.com/data.csv')
|
311
|
+
```
|
312
|
+
|
313
|
+
### Working with Cached Datasets
|
314
|
+
|
315
|
+
If you have cached datasets, you can access them even without Kaggle credentials:
|
316
|
+
|
317
|
+
```ruby
|
318
|
+
# No credentials provided, but cached data will work
|
319
|
+
client = UrlCategorise::Client.new(
|
320
|
+
dataset_config: {
|
321
|
+
kaggle: {}, # Empty config - will show warning but continue
|
322
|
+
download_path: './datasets',
|
323
|
+
cache_path: './cache'
|
324
|
+
}
|
325
|
+
)
|
326
|
+
|
327
|
+
# Will work if data is cached, otherwise will show helpful error message
|
328
|
+
client.load_kaggle_dataset('owner', 'dataset', use_cache: true)
|
329
|
+
```
|
330
|
+
|
331
|
+
### Dataset Metadata and Hashing
|
332
|
+
|
333
|
+
The system automatically tracks dataset metadata and generates content hashes:
|
334
|
+
|
335
|
+
```ruby
|
336
|
+
# Get dataset metadata
|
337
|
+
metadata = client.dataset_metadata
|
338
|
+
metadata.each do |data_hash, meta|
|
339
|
+
puts "Dataset hash: #{data_hash}"
|
340
|
+
puts "Processed at: #{meta[:processed_at]}"
|
341
|
+
puts "Total entries: #{meta[:total_entries]}"
|
342
|
+
end
|
343
|
+
|
344
|
+
# Reload client with fresh dataset integration
|
345
|
+
client.reload_with_datasets
|
346
|
+
```
|
347
|
+
|
195
348
|
## ActiveRecord Integration
|
196
349
|
|
197
350
|
For high-performance applications, enable database storage:
|
@@ -215,11 +368,31 @@ categories = client.categorise("example.com")
|
|
215
368
|
|
216
369
|
# Get database statistics
|
217
370
|
stats = client.database_stats
|
218
|
-
# => { domains: 50000, ip_addresses: 15000, categories: 45, list_metadata: 90 }
|
371
|
+
# => { domains: 50000, ip_addresses: 15000, categories: 45, list_metadata: 90, dataset_metadata: 5 }
|
219
372
|
|
220
373
|
# Direct model access
|
221
374
|
domain_record = UrlCategorise::Models::Domain.find_by(domain: "example.com")
|
222
375
|
ip_record = UrlCategorise::Models::IpAddress.find_by(ip_address: "1.2.3.4")
|
376
|
+
|
377
|
+
# Dataset integration with ActiveRecord
|
378
|
+
client = UrlCategorise::ActiveRecordClient.new(
|
379
|
+
use_database: true,
|
380
|
+
dataset_config: {
|
381
|
+
kaggle: { username: 'user', api_key: 'key' }
|
382
|
+
}
|
383
|
+
)
|
384
|
+
|
385
|
+
# Load datasets - automatically stored in database
|
386
|
+
client.load_kaggle_dataset('owner', 'dataset')
|
387
|
+
client.load_csv_dataset('https://example.com/data.csv')
|
388
|
+
|
389
|
+
# View dataset history
|
390
|
+
history = client.dataset_history(limit: 5)
|
391
|
+
# => [{ source_type: 'kaggle', identifier: 'owner/dataset', total_entries: 1000, processed_at: ... }]
|
392
|
+
|
393
|
+
# Filter by source type
|
394
|
+
kaggle_history = client.dataset_history(source_type: 'kaggle')
|
395
|
+
csv_history = client.dataset_history(source_type: 'csv')
|
223
396
|
```
|
224
397
|
|
225
398
|
## Rails Integration
|
@@ -274,6 +447,21 @@ class CreateUrlCategoriseTables < ActiveRecord::Migration[7.0]
|
|
274
447
|
|
275
448
|
add_index :url_categorise_ip_addresses, :ip_address
|
276
449
|
add_index :url_categorise_ip_addresses, :categories
|
450
|
+
|
451
|
+
create_table :url_categorise_dataset_metadata do |t|
|
452
|
+
t.string :source_type, null: false, index: true
|
453
|
+
t.string :identifier, null: false
|
454
|
+
t.string :data_hash, null: false, index: { unique: true }
|
455
|
+
t.integer :total_entries, null: false
|
456
|
+
t.text :category_mappings
|
457
|
+
t.text :processing_options
|
458
|
+
t.datetime :processed_at
|
459
|
+
t.timestamps
|
460
|
+
end
|
461
|
+
|
462
|
+
add_index :url_categorise_dataset_metadata, :source_type
|
463
|
+
add_index :url_categorise_dataset_metadata, :identifier
|
464
|
+
add_index :url_categorise_dataset_metadata, :processed_at
|
277
465
|
end
|
278
466
|
end
|
279
467
|
```
|
data/Rakefile
CHANGED
@@ -1,12 +1,12 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
1
|
+
require 'bundler/gem_tasks'
|
2
|
+
require 'bundler/setup'
|
3
|
+
require 'rake/testtask'
|
4
4
|
|
5
5
|
Rake::TestTask.new(:test) do |t|
|
6
|
-
t.libs <<
|
7
|
-
t.libs <<
|
8
|
-
t.test_files = FileList[
|
9
|
-
t.ruby_opts = [
|
6
|
+
t.libs << 'test'
|
7
|
+
t.libs << 'lib'
|
8
|
+
t.test_files = FileList['test/**/*_test.rb']
|
9
|
+
t.ruby_opts = ['-rbundler/setup']
|
10
10
|
end
|
11
11
|
|
12
|
-
task :
|
12
|
+
task default: :test
|
data/bin/check_lists
CHANGED
@@ -3,46 +3,45 @@
|
|
3
3
|
require 'bundler/setup'
|
4
4
|
require_relative '../lib/url_categorise'
|
5
5
|
|
6
|
-
puts
|
6
|
+
puts '=== CHECKING ALL URLs IN CONSTANTS ==='
|
7
7
|
|
8
8
|
UrlCategorise::Constants::DEFAULT_HOST_URLS.each do |category, urls|
|
9
9
|
puts "\n#{category.upcase}:"
|
10
|
-
|
10
|
+
|
11
11
|
# Skip categories that only reference other categories (symbols)
|
12
12
|
actual_urls = urls.reject { |url| url.is_a?(Symbol) }
|
13
|
-
|
13
|
+
|
14
14
|
if actual_urls.empty?
|
15
15
|
if urls.empty?
|
16
|
-
puts
|
16
|
+
puts ' Empty category (no URLs defined)'
|
17
17
|
else
|
18
18
|
puts " Only references other categories: #{urls}"
|
19
19
|
end
|
20
20
|
next
|
21
21
|
end
|
22
|
-
|
22
|
+
|
23
23
|
actual_urls.each do |url|
|
24
24
|
print " Testing #{url}... "
|
25
25
|
begin
|
26
26
|
response = HTTParty.head(url, timeout: 10)
|
27
27
|
case response.code
|
28
28
|
when 200
|
29
|
-
puts
|
29
|
+
puts '✅ OK'
|
30
30
|
when 404
|
31
|
-
puts
|
31
|
+
puts '❌ 404 Not Found'
|
32
32
|
when 403
|
33
|
-
puts
|
33
|
+
puts '❌ 403 Forbidden'
|
34
34
|
when 500..599
|
35
35
|
puts "❌ Server Error (#{response.code})"
|
36
36
|
else
|
37
37
|
puts "⚠️ HTTP #{response.code}"
|
38
38
|
end
|
39
39
|
rescue Net::TimeoutError, HTTParty::TimeoutError
|
40
|
-
puts
|
41
|
-
rescue SocketError, Errno::ECONNREFUSED
|
42
|
-
puts
|
43
|
-
rescue => e
|
40
|
+
puts '❌ Timeout'
|
41
|
+
rescue SocketError, Errno::ECONNREFUSED
|
42
|
+
puts '❌ DNS/Network Error'
|
43
|
+
rescue StandardError => e
|
44
44
|
puts "❌ Error: #{e.class}"
|
45
45
|
end
|
46
46
|
end
|
47
47
|
end
|
48
|
-
|
data/bin/console
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require
|
4
|
-
require
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'url_categorise'
|
5
5
|
|
6
6
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
7
|
# with your gem easier. You can also use a different console, if you like.
|
8
8
|
|
9
9
|
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
-
require
|
10
|
+
require 'pry'
|
11
11
|
Pry.start
|
@@ -3,65 +3,125 @@ require_relative 'models'
|
|
3
3
|
module UrlCategorise
|
4
4
|
class ActiveRecordClient < Client
|
5
5
|
def initialize(**kwargs)
|
6
|
-
raise
|
7
|
-
|
6
|
+
raise 'ActiveRecord not available' unless UrlCategorise::Models.available?
|
7
|
+
|
8
8
|
@use_database = kwargs.delete(:use_database) { true }
|
9
9
|
super(**kwargs)
|
10
|
-
|
10
|
+
|
11
11
|
populate_database if @use_database
|
12
12
|
end
|
13
13
|
|
14
14
|
def categorise(url)
|
15
15
|
return super(url) unless @use_database && UrlCategorise::Models.available?
|
16
|
-
|
17
|
-
host = (URI.parse(url).host || url).downcase.gsub(
|
18
|
-
|
16
|
+
|
17
|
+
host = (URI.parse(url).host || url).downcase.gsub('www.', '')
|
18
|
+
|
19
19
|
# Try database first
|
20
20
|
categories = UrlCategorise::Models::Domain.categorise(host)
|
21
21
|
return categories unless categories.empty?
|
22
|
-
|
22
|
+
|
23
23
|
# Fallback to memory-based categorization
|
24
24
|
super(url)
|
25
25
|
end
|
26
26
|
|
27
27
|
def categorise_ip(ip_address)
|
28
28
|
return super(ip_address) unless @use_database && UrlCategorise::Models.available?
|
29
|
-
|
29
|
+
|
30
30
|
# Try database first
|
31
31
|
categories = UrlCategorise::Models::IpAddress.categorise(ip_address)
|
32
32
|
return categories unless categories.empty?
|
33
|
-
|
33
|
+
|
34
34
|
# Fallback to memory-based categorization
|
35
35
|
super(ip_address)
|
36
36
|
end
|
37
37
|
|
38
38
|
def update_database
|
39
39
|
return unless @use_database && UrlCategorise::Models.available?
|
40
|
-
|
40
|
+
|
41
41
|
populate_database
|
42
42
|
end
|
43
43
|
|
44
44
|
def database_stats
|
45
45
|
return {} unless @use_database && UrlCategorise::Models.available?
|
46
|
-
|
46
|
+
|
47
47
|
{
|
48
48
|
domains: UrlCategorise::Models::Domain.count,
|
49
49
|
ip_addresses: UrlCategorise::Models::IpAddress.count,
|
50
50
|
list_metadata: UrlCategorise::Models::ListMetadata.count,
|
51
|
+
dataset_metadata: UrlCategorise::Models::DatasetMetadata.count,
|
51
52
|
categories: UrlCategorise::Models::Domain.distinct.pluck(:categories).flatten.uniq.size
|
52
53
|
}
|
53
54
|
end
|
54
55
|
|
56
|
+
def load_kaggle_dataset(dataset_owner, dataset_name, options = {})
|
57
|
+
result = super(dataset_owner, dataset_name, options)
|
58
|
+
|
59
|
+
# Store dataset metadata in database if enabled
|
60
|
+
if @use_database && UrlCategorise::Models.available? && @dataset_metadata
|
61
|
+
store_dataset_metadata_in_db(
|
62
|
+
source_type: 'kaggle',
|
63
|
+
identifier: "#{dataset_owner}/#{dataset_name}",
|
64
|
+
metadata: @dataset_metadata.values.last,
|
65
|
+
category_mappings: options[:category_mappings],
|
66
|
+
processing_options: options
|
67
|
+
)
|
68
|
+
end
|
69
|
+
|
70
|
+
# Repopulate database with integrated dataset domains
|
71
|
+
populate_database if @use_database
|
72
|
+
|
73
|
+
result
|
74
|
+
end
|
75
|
+
|
76
|
+
def load_csv_dataset(url, options = {})
|
77
|
+
result = super(url, options)
|
78
|
+
|
79
|
+
# Store dataset metadata in database if enabled
|
80
|
+
if @use_database && UrlCategorise::Models.available? && @dataset_metadata
|
81
|
+
store_dataset_metadata_in_db(
|
82
|
+
source_type: 'csv',
|
83
|
+
identifier: url,
|
84
|
+
metadata: @dataset_metadata.values.last,
|
85
|
+
category_mappings: options[:category_mappings],
|
86
|
+
processing_options: options
|
87
|
+
)
|
88
|
+
end
|
89
|
+
|
90
|
+
# Repopulate database with integrated dataset domains
|
91
|
+
populate_database if @use_database
|
92
|
+
|
93
|
+
result
|
94
|
+
end
|
95
|
+
|
96
|
+
def dataset_history(source_type: nil, limit: 10)
|
97
|
+
return [] unless @use_database && UrlCategorise::Models.available?
|
98
|
+
|
99
|
+
query = UrlCategorise::Models::DatasetMetadata.order(processed_at: :desc).limit(limit)
|
100
|
+
query = query.by_source(source_type) if source_type
|
101
|
+
|
102
|
+
query.map do |record|
|
103
|
+
{
|
104
|
+
source_type: record.source_type,
|
105
|
+
identifier: record.identifier,
|
106
|
+
data_hash: record.data_hash,
|
107
|
+
total_entries: record.total_entries,
|
108
|
+
processed_at: record.processed_at,
|
109
|
+
category_mappings: record.category_mappings,
|
110
|
+
processing_options: record.processing_options
|
111
|
+
}
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
55
115
|
private
|
56
116
|
|
57
117
|
def populate_database
|
58
118
|
return unless UrlCategorise::Models.available?
|
59
|
-
|
119
|
+
|
60
120
|
# Store list metadata
|
61
121
|
@host_urls.each do |category, urls|
|
62
122
|
urls.each do |url|
|
63
123
|
next unless url.is_a?(String)
|
64
|
-
|
124
|
+
|
65
125
|
metadata = @metadata[url] || {}
|
66
126
|
UrlCategorise::Models::ListMetadata.find_or_create_by(url: url) do |record|
|
67
127
|
record.name = category.to_s
|
@@ -76,7 +136,7 @@ module UrlCategorise
|
|
76
136
|
@hosts.each do |category, domains|
|
77
137
|
domains.each do |domain|
|
78
138
|
next if domain.nil? || domain.empty?
|
79
|
-
|
139
|
+
|
80
140
|
existing = UrlCategorise::Models::Domain.find_by(domain: domain)
|
81
141
|
if existing
|
82
142
|
# Add category if not already present
|
@@ -92,15 +152,15 @@ module UrlCategorise
|
|
92
152
|
end
|
93
153
|
|
94
154
|
# Store IP data (for IP-based lists)
|
95
|
-
ip_categories = [
|
96
|
-
|
97
|
-
|
155
|
+
ip_categories = %i[sanctions_ips compromised_ips tor_exit_nodes open_proxy_ips
|
156
|
+
banking_trojans malicious_ssl_certificates top_attack_sources]
|
157
|
+
|
98
158
|
ip_categories.each do |category|
|
99
159
|
next unless @hosts[category]
|
100
|
-
|
160
|
+
|
101
161
|
@hosts[category].each do |ip|
|
102
162
|
next if ip.nil? || ip.empty? || !ip.match(/^\d+\.\d+\.\d+\.\d+$/)
|
103
|
-
|
163
|
+
|
104
164
|
existing = UrlCategorise::Models::IpAddress.find_by(ip_address: ip)
|
105
165
|
if existing
|
106
166
|
categories = existing.categories | [category.to_s]
|
@@ -114,5 +174,22 @@ module UrlCategorise
|
|
114
174
|
end
|
115
175
|
end
|
116
176
|
end
|
177
|
+
|
178
|
+
def store_dataset_metadata_in_db(source_type:, identifier:, metadata:, category_mappings: nil,
|
179
|
+
processing_options: nil)
|
180
|
+
return unless UrlCategorise::Models.available?
|
181
|
+
|
182
|
+
UrlCategorise::Models::DatasetMetadata.find_or_create_by(data_hash: metadata[:data_hash]) do |record|
|
183
|
+
record.source_type = source_type
|
184
|
+
record.identifier = identifier
|
185
|
+
record.total_entries = metadata[:total_entries]
|
186
|
+
record.category_mappings = category_mappings || {}
|
187
|
+
record.processing_options = processing_options || {}
|
188
|
+
record.processed_at = metadata[:processed_at] || Time.now
|
189
|
+
end
|
190
|
+
rescue ActiveRecord::RecordInvalid => e
|
191
|
+
# Dataset metadata already exists or validation failed
|
192
|
+
puts "Warning: Failed to store dataset metadata: #{e.message}" if ENV['DEBUG']
|
193
|
+
end
|
117
194
|
end
|
118
|
-
end
|
195
|
+
end
|