UrlCategorise 0.1.2 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +10 -1
- data/.gitignore +1 -0
- data/CLAUDE.md +88 -3
- data/Gemfile +2 -2
- data/Gemfile.lock +18 -9
- data/README.md +517 -4
- data/Rakefile +8 -8
- data/bin/check_lists +12 -13
- data/bin/console +3 -3
- data/bin/export_csv +83 -0
- data/bin/export_hosts +68 -0
- data/bin/rake +2 -0
- data/correct_usage_example.rb +64 -0
- data/docs/v0.1.4-features.md +215 -0
- data/lib/url_categorise/active_record_client.rb +98 -21
- data/lib/url_categorise/client.rb +641 -134
- data/lib/url_categorise/constants.rb +86 -71
- data/lib/url_categorise/dataset_processor.rb +476 -0
- data/lib/url_categorise/iab_compliance.rb +147 -0
- data/lib/url_categorise/models.rb +53 -14
- data/lib/url_categorise/version.rb +1 -1
- data/lib/url_categorise.rb +3 -0
- data/url_categorise.gemspec +37 -33
- metadata +142 -52
data/README.md
CHANGED
@@ -5,6 +5,8 @@ A comprehensive Ruby gem for categorizing URLs and domains based on various secu
|
|
5
5
|
## Features
|
6
6
|
|
7
7
|
- **Comprehensive Coverage**: 60+ high-quality categories including security, content, and specialized lists
|
8
|
+
- **Kaggle Dataset Integration**: Automatic loading and processing of machine learning datasets from Kaggle
|
9
|
+
- **Multiple Data Sources**: Supports blocklists, CSV datasets, and Kaggle ML datasets
|
8
10
|
- **Multiple List Formats**: Supports hosts files, pfSense, AdSense, uBlock Origin, dnsmasq, and plain text formats
|
9
11
|
- **Intelligent Caching**: Hash-based file update detection with configurable local cache
|
10
12
|
- **DNS Resolution**: Resolve domains to IPs and check against IP-based blocklists
|
@@ -14,6 +16,10 @@ A comprehensive Ruby gem for categorizing URLs and domains based on various secu
|
|
14
16
|
- **Metadata Tracking**: Track last update times, ETags, and content hashes
|
15
17
|
- **Health Monitoring**: Automatic detection and removal of broken blocklist sources
|
16
18
|
- **List Validation**: Built-in tools to verify all configured URLs are accessible
|
19
|
+
- **Auto-Loading Datasets**: Automatic processing of predefined datasets during client initialization
|
20
|
+
- **ActiveAttr Settings**: In-memory modification of client settings using attribute setters
|
21
|
+
- **Data Export**: Export categorized data as hosts files per category or comprehensive CSV exports
|
22
|
+
- **CLI Commands**: Command-line utilities for data export and list checking
|
17
23
|
|
18
24
|
## Installation
|
19
25
|
|
@@ -44,6 +50,15 @@ puts "Total hosts: #{client.count_of_hosts}"
|
|
44
50
|
puts "Categories: #{client.count_of_categories}"
|
45
51
|
puts "Data size: #{client.size_of_data} MB"
|
46
52
|
|
53
|
+
# Get detailed size breakdown
|
54
|
+
puts "Total data size: #{client.size_of_data} MB (#{client.size_of_data_bytes} bytes)"
|
55
|
+
puts "Blocklist data size: #{client.size_of_blocklist_data} MB (#{client.size_of_blocklist_data_bytes} bytes)"
|
56
|
+
puts "Dataset data size: #{client.size_of_dataset_data} MB (#{client.size_of_dataset_data_bytes} bytes)"
|
57
|
+
|
58
|
+
# Get dataset-specific statistics (if datasets are loaded)
|
59
|
+
puts "Dataset hosts: #{client.count_of_dataset_hosts}"
|
60
|
+
puts "Dataset categories: #{client.count_of_dataset_categories}"
|
61
|
+
|
47
62
|
# Categorize a URL or domain
|
48
63
|
categories = client.categorise("badsite.com")
|
49
64
|
puts "Categories: #{categories}" # => [:malware, :phishing]
|
@@ -57,6 +72,83 @@ ip_categories = client.categorise_ip("192.168.1.100")
|
|
57
72
|
puts "IP categories: #{ip_categories}"
|
58
73
|
```
|
59
74
|
|
75
|
+
## New Features
|
76
|
+
|
77
|
+
### Dynamic Settings with ActiveAttr
|
78
|
+
|
79
|
+
The Client class now supports in-memory modification of settings using ActiveAttr:
|
80
|
+
|
81
|
+
```ruby
|
82
|
+
client = UrlCategorise::Client.new
|
83
|
+
|
84
|
+
# Modify settings dynamically
|
85
|
+
client.smart_categorization_enabled = true
|
86
|
+
client.iab_compliance_enabled = true
|
87
|
+
client.iab_version = :v2
|
88
|
+
client.request_timeout = 30
|
89
|
+
client.dns_servers = ['8.8.8.8', '8.8.4.4']
|
90
|
+
|
91
|
+
# Settings take effect immediately - no need to recreate the client
|
92
|
+
categories = client.categorise('reddit.com') # Uses new smart categorization rules
|
93
|
+
```
|
94
|
+
|
95
|
+
### Data Export Features
|
96
|
+
|
97
|
+
#### Hosts File Export
|
98
|
+
|
99
|
+
Export all categorized domains as separate hosts files per category:
|
100
|
+
|
101
|
+
```ruby
|
102
|
+
# Export to default location
|
103
|
+
result = client.export_hosts_files
|
104
|
+
|
105
|
+
# Export to custom location
|
106
|
+
result = client.export_hosts_files('/custom/export/path')
|
107
|
+
|
108
|
+
# Result includes file information and summary
|
109
|
+
puts "Exported #{result[:_summary][:total_categories]} categories"
|
110
|
+
puts "Total domains: #{result[:_summary][:total_domains]}"
|
111
|
+
puts "Files saved to: #{result[:_summary][:export_directory]}"
|
112
|
+
```
|
113
|
+
|
114
|
+
Each category gets its own hosts file (e.g., `malware.hosts`, `advertising.hosts`) with proper headers and sorted domains.
|
115
|
+
|
116
|
+
#### CSV Data Export
|
117
|
+
|
118
|
+
Export all data as a single CSV file for AI training and analysis:
|
119
|
+
|
120
|
+
```ruby
|
121
|
+
# Export to default location
|
122
|
+
result = client.export_csv_data
|
123
|
+
|
124
|
+
# Export to custom location with IAB compliance
|
125
|
+
client.iab_compliance_enabled = true
|
126
|
+
result = client.export_csv_data('/custom/export/path')
|
127
|
+
|
128
|
+
# CSV includes comprehensive data:
|
129
|
+
# - domain, category, source_type, is_dataset_category
|
130
|
+
# - iab_category_v2, iab_category_v3, export_timestamp
|
131
|
+
# - smart_categorization_enabled
|
132
|
+
|
133
|
+
# Metadata file includes:
|
134
|
+
# - Export info, client settings, data summary, dataset metadata
|
135
|
+
```
|
136
|
+
|
137
|
+
#### CLI Commands
|
138
|
+
|
139
|
+
New command-line utilities for data export:
|
140
|
+
|
141
|
+
```bash
|
142
|
+
# Export hosts files
|
143
|
+
$ bundle exec export_hosts --output /tmp/hosts --verbose
|
144
|
+
|
145
|
+
# Export CSV data with IAB compliance
|
146
|
+
$ bundle exec export_csv --output /tmp/csv --iab-compliance --verbose
|
147
|
+
|
148
|
+
# Check URL health (existing command)
|
149
|
+
$ bundle exec check_lists
|
150
|
+
```
|
151
|
+
|
60
152
|
## Advanced Configuration
|
61
153
|
|
62
154
|
### File Caching
|
@@ -113,7 +205,11 @@ client = UrlCategorise::Client.new(
|
|
113
205
|
cache_dir: "./url_cache", # Enable local caching
|
114
206
|
force_download: false, # Use cache when available
|
115
207
|
dns_servers: ['1.1.1.1', '1.0.0.1'], # Cloudflare DNS servers
|
116
|
-
request_timeout: 15
|
208
|
+
request_timeout: 15, # 15 second HTTP timeout
|
209
|
+
iab_compliance: true, # Enable IAB compliance
|
210
|
+
iab_version: :v3, # Use IAB Content Taxonomy v3.0
|
211
|
+
auto_load_datasets: false, # Disable automatic dataset loading (default)
|
212
|
+
smart_categorization: false # Disable smart post-processing (default)
|
117
213
|
)
|
118
214
|
```
|
119
215
|
|
@@ -132,6 +228,165 @@ host_urls = {
|
|
132
228
|
client = UrlCategorise::Client.new(host_urls: host_urls)
|
133
229
|
```
|
134
230
|
|
231
|
+
### Smart Categorization (Post-Processing)
|
232
|
+
|
233
|
+
Smart categorization solves the problem of overly broad domain-level categorization. For example, `reddit.com` might appear in health & fitness blocklists, but not all Reddit content is health-related.
|
234
|
+
|
235
|
+
#### The Problem
|
236
|
+
|
237
|
+
```ruby
|
238
|
+
# Without smart categorization
|
239
|
+
client.categorise("reddit.com")
|
240
|
+
# => [:reddit, :social_media, :health_and_fitness, :forums] # Too broad!
|
241
|
+
|
242
|
+
client.categorise("reddit.com/r/technology")
|
243
|
+
# => [:reddit, :social_media, :health_and_fitness, :forums] # Still wrong!
|
244
|
+
```
|
245
|
+
|
246
|
+
#### The Solution
|
247
|
+
|
248
|
+
```ruby
|
249
|
+
# Enable smart categorization
|
250
|
+
client = UrlCategorise::Client.new(
|
251
|
+
smart_categorization: true # Remove overly broad categories
|
252
|
+
)
|
253
|
+
|
254
|
+
client.categorise("reddit.com")
|
255
|
+
# => [:reddit, :social_media] # Much more accurate!
|
256
|
+
```
|
257
|
+
|
258
|
+
#### How It Works
|
259
|
+
|
260
|
+
Smart categorization automatically removes overly broad categories for known platforms:
|
261
|
+
|
262
|
+
- **Social Media Platforms** (Reddit, Facebook, Twitter, etc.): Removes categories like `:health_and_fitness`, `:forums`, `:news`, `:technology`, `:education`
|
263
|
+
- **Search Engines** (Google, Bing, etc.): Removes categories like `:news`, `:shopping`, `:travel`
|
264
|
+
- **Video Platforms** (YouTube, Vimeo, etc.): Removes categories like `:education`, `:entertainment`, `:music`
|
265
|
+
|
266
|
+
#### Custom Smart Rules
|
267
|
+
|
268
|
+
You can define custom rules for specific domains or URL patterns:
|
269
|
+
|
270
|
+
```ruby
|
271
|
+
custom_rules = {
|
272
|
+
reddit_subreddits: {
|
273
|
+
domains: ['reddit.com'],
|
274
|
+
remove_categories: [:health_and_fitness, :forums],
|
275
|
+
add_categories_by_path: {
|
276
|
+
/\/r\/fitness/ => [:health_and_fitness], # Add back for /r/fitness
|
277
|
+
/\/r\/technology/ => [:technology], # Add technology for /r/technology
|
278
|
+
/\/r\/programming/ => [:technology, :programming]
|
279
|
+
}
|
280
|
+
},
|
281
|
+
my_company_domains: {
|
282
|
+
domains: ['mycompany.com'],
|
283
|
+
allowed_categories_only: [:business, :technology] # Only allow specific categories
|
284
|
+
}
|
285
|
+
}
|
286
|
+
|
287
|
+
client = UrlCategorise::Client.new(
|
288
|
+
smart_categorization: true,
|
289
|
+
smart_rules: custom_rules
|
290
|
+
)
|
291
|
+
|
292
|
+
# Now path-based categorization works
|
293
|
+
client.categorise('reddit.com') # => [:reddit, :social_media]
|
294
|
+
client.categorise('reddit.com/r/fitness') # => [:reddit, :social_media, :health_and_fitness]
|
295
|
+
client.categorise('reddit.com/r/technology') # => [:reddit, :social_media, :technology]
|
296
|
+
```
|
297
|
+
|
298
|
+
#### Available Rule Types
|
299
|
+
|
300
|
+
- **`remove_categories`**: Remove specific categories for domains
|
301
|
+
- **`keep_primary_only`**: Keep only specified categories, remove others
|
302
|
+
- **`allowed_categories_only`**: Only allow specific categories, block all others
|
303
|
+
- **`add_categories_by_path`**: Add categories based on URL path patterns
|
304
|
+
|
305
|
+
#### Smart Rules with IAB Compliance
|
306
|
+
|
307
|
+
Smart categorization works seamlessly with IAB compliance:
|
308
|
+
|
309
|
+
```ruby
|
310
|
+
client = UrlCategorise::Client.new(
|
311
|
+
smart_categorization: true,
|
312
|
+
iab_compliance: true,
|
313
|
+
iab_version: :v3
|
314
|
+
)
|
315
|
+
|
316
|
+
# Returns clean IAB codes after smart processing
|
317
|
+
categories = client.categorise("reddit.com") # => ["14"] (Society - Social Media)
|
318
|
+
```
|
319
|
+
|
320
|
+
## IAB Content Taxonomy Compliance
|
321
|
+
|
322
|
+
UrlCategorise supports IAB (Interactive Advertising Bureau) Content Taxonomy compliance for standardized content categorization.
|
323
|
+
|
324
|
+
### Basic IAB Compliance
|
325
|
+
|
326
|
+
```ruby
|
327
|
+
# Enable IAB v3.0 compliance (default)
|
328
|
+
client = UrlCategorise::Client.new(
|
329
|
+
iab_compliance: true,
|
330
|
+
iab_version: :v3
|
331
|
+
)
|
332
|
+
|
333
|
+
# Enable IAB v2.0 compliance
|
334
|
+
client = UrlCategorise::Client.new(
|
335
|
+
iab_compliance: true,
|
336
|
+
iab_version: :v2
|
337
|
+
)
|
338
|
+
|
339
|
+
# Categorization returns IAB codes instead of custom categories
|
340
|
+
categories = client.categorise("badsite.com")
|
341
|
+
puts categories # => ["626"] (IAB v3 code for illegal content)
|
342
|
+
|
343
|
+
# Check IAB compliance status
|
344
|
+
puts client.iab_compliant? # => true
|
345
|
+
|
346
|
+
# Get IAB mapping for a specific category
|
347
|
+
puts client.get_iab_mapping(:malware) # => "626" (v3) or "IAB25" (v2)
|
348
|
+
```
|
349
|
+
|
350
|
+
### IAB Category Mappings
|
351
|
+
|
352
|
+
The gem maps security and content categories to appropriate IAB codes:
|
353
|
+
|
354
|
+
**IAB Content Taxonomy v3.0 (recommended):**
|
355
|
+
- `malware`, `phishing`, `illegal` → `626` (Illegal Content)
|
356
|
+
- `advertising`, `mobile_ads` → `3` (Advertising)
|
357
|
+
- `gambling` → `7-39` (Gambling)
|
358
|
+
- `pornography` → `626` (Adult Content)
|
359
|
+
- `social_media` → `14` (Society)
|
360
|
+
- `technology` → `19` (Technology & Computing)
|
361
|
+
|
362
|
+
**IAB Content Taxonomy v2.0:**
|
363
|
+
- `malware`, `phishing` → `IAB25` (Non-Standard Content)
|
364
|
+
- `advertising` → `IAB3` (Advertising)
|
365
|
+
- `gambling` → `IAB7-39` (Gambling)
|
366
|
+
- `pornography` → `IAB25-3` (Pornography)
|
367
|
+
|
368
|
+
### Integration with Datasets
|
369
|
+
|
370
|
+
IAB compliance works seamlessly with dataset processing:
|
371
|
+
|
372
|
+
```ruby
|
373
|
+
client = UrlCategorise::Client.new(
|
374
|
+
iab_compliance: true,
|
375
|
+
iab_version: :v3,
|
376
|
+
dataset_config: {
|
377
|
+
kaggle: { username: 'user', api_key: 'key' }
|
378
|
+
},
|
379
|
+
auto_load_datasets: true # Automatically load predefined datasets with IAB mapping
|
380
|
+
)
|
381
|
+
|
382
|
+
# Load additional datasets - categories will be mapped to IAB codes
|
383
|
+
client.load_kaggle_dataset('owner', 'dataset-name')
|
384
|
+
client.load_csv_dataset('https://example.com/data.csv')
|
385
|
+
|
386
|
+
# All categorization methods return IAB codes
|
387
|
+
categories = client.categorise("example.com") # => ["3", "626"]
|
388
|
+
```
|
389
|
+
|
135
390
|
## Available Categories
|
136
391
|
|
137
392
|
### Security & Threat Intelligence
|
@@ -192,6 +447,196 @@ ruby bin/check_lists
|
|
192
447
|
|
193
448
|
[View all 60+ categories in constants.rb](lib/url_categorise/constants.rb)
|
194
449
|
|
450
|
+
## Dataset Processing
|
451
|
+
|
452
|
+
UrlCategorise supports processing external datasets from Kaggle and CSV files to expand categorization data beyond traditional blocklists. This allows integration of machine learning datasets and custom URL classification data.
|
453
|
+
|
454
|
+
### Automatic Dataset Loading
|
455
|
+
|
456
|
+
Enable automatic loading of predefined datasets during client initialization:
|
457
|
+
|
458
|
+
```ruby
|
459
|
+
# Enable automatic dataset loading from constants
|
460
|
+
client = UrlCategorise::Client.new(
|
461
|
+
dataset_config: {
|
462
|
+
kaggle: {
|
463
|
+
username: ENV['KAGGLE_USERNAME'],
|
464
|
+
api_key: ENV['KAGGLE_API_KEY']
|
465
|
+
},
|
466
|
+
cache_path: './dataset_cache',
|
467
|
+
download_path: './downloads'
|
468
|
+
},
|
469
|
+
auto_load_datasets: true # Automatically loads all predefined datasets
|
470
|
+
)
|
471
|
+
|
472
|
+
# Datasets are now automatically integrated and ready for use
|
473
|
+
categories = client.categorise('https://example.com')
|
474
|
+
puts "Dataset categories loaded: #{client.count_of_dataset_categories}"
|
475
|
+
puts "Dataset hosts: #{client.count_of_dataset_hosts}"
|
476
|
+
```
|
477
|
+
|
478
|
+
The gem includes predefined high-quality datasets in constants:
|
479
|
+
- **`shaurov/website-classification-using-url`** - Comprehensive URL classification dataset
|
480
|
+
- **`hetulmehta/website-classification`** - Website categorization with cleaned text data
|
481
|
+
- **`shawon10/url-classification-dataset-dmoz`** - DMOZ-based URL classification
|
482
|
+
- **Data.world CSV dataset** - Additional URL categorization data
|
483
|
+
|
484
|
+
### Manual Dataset Loading
|
485
|
+
|
486
|
+
You can also load datasets manually for more control over the process:
|
487
|
+
|
488
|
+
#### Kaggle Dataset Integration
|
489
|
+
|
490
|
+
Load datasets directly from Kaggle using three authentication methods:
|
491
|
+
|
492
|
+
```ruby
|
493
|
+
# Method 1: Environment variables (KAGGLE_USERNAME, KAGGLE_KEY)
|
494
|
+
client = UrlCategorise::Client.new(
|
495
|
+
dataset_config: {
|
496
|
+
kaggle: {} # Will use environment variables
|
497
|
+
}
|
498
|
+
)
|
499
|
+
|
500
|
+
# Method 2: Explicit credentials
|
501
|
+
client = UrlCategorise::Client.new(
|
502
|
+
dataset_config: {
|
503
|
+
kaggle: {
|
504
|
+
username: 'your_username',
|
505
|
+
api_key: 'your_api_key'
|
506
|
+
}
|
507
|
+
}
|
508
|
+
)
|
509
|
+
|
510
|
+
# Method 3: Credentials file (~/.kaggle/kaggle.json or custom path)
|
511
|
+
client = UrlCategorise::Client.new(
|
512
|
+
dataset_config: {
|
513
|
+
kaggle: {
|
514
|
+
credentials_file: '/path/to/kaggle.json'
|
515
|
+
}
|
516
|
+
}
|
517
|
+
)
|
518
|
+
|
519
|
+
# Load and integrate a Kaggle dataset
|
520
|
+
client.load_kaggle_dataset('owner', 'dataset-name', {
|
521
|
+
use_cache: true, # Cache processed data
|
522
|
+
category_mappings: {
|
523
|
+
url_column: 'website', # Column containing URLs/domains
|
524
|
+
category_column: 'type', # Column containing categories
|
525
|
+
category_map: {
|
526
|
+
'malicious' => 'malware', # Map dataset categories to your categories
|
527
|
+
'spam' => 'phishing'
|
528
|
+
}
|
529
|
+
}
|
530
|
+
})
|
531
|
+
|
532
|
+
# Check categorization with dataset data
|
533
|
+
categories = client.categorise('https://example.com')
|
534
|
+
```
|
535
|
+
|
536
|
+
#### CSV Dataset Processing
|
537
|
+
|
538
|
+
Load datasets from direct CSV URLs:
|
539
|
+
|
540
|
+
```ruby
|
541
|
+
client = UrlCategorise::Client.new(
|
542
|
+
dataset_config: {
|
543
|
+
download_path: './datasets',
|
544
|
+
cache_path: './dataset_cache'
|
545
|
+
}
|
546
|
+
)
|
547
|
+
|
548
|
+
# Load CSV dataset
|
549
|
+
client.load_csv_dataset('https://example.com/url-classification.csv', {
|
550
|
+
use_cache: true,
|
551
|
+
category_mappings: {
|
552
|
+
url_column: 'url',
|
553
|
+
category_column: 'category'
|
554
|
+
}
|
555
|
+
})
|
556
|
+
```
|
557
|
+
|
558
|
+
### Dataset Configuration Options
|
559
|
+
|
560
|
+
```ruby
|
561
|
+
dataset_config = {
|
562
|
+
# Kaggle functionality control
|
563
|
+
enable_kaggle: true, # Set to false to disable Kaggle entirely (default: true)
|
564
|
+
|
565
|
+
# Kaggle authentication (optional - will try env vars and default file)
|
566
|
+
kaggle: {
|
567
|
+
username: 'kaggle_username', # Or use KAGGLE_USERNAME env var
|
568
|
+
api_key: 'kaggle_api_key', # Or use KAGGLE_KEY env var
|
569
|
+
credentials_file: '~/.kaggle/kaggle.json' # Optional custom path
|
570
|
+
},
|
571
|
+
|
572
|
+
# File paths
|
573
|
+
download_path: './downloads', # Where to store downloads
|
574
|
+
cache_path: './cache', # Where to cache processed data
|
575
|
+
timeout: 30 # HTTP timeout for downloads
|
576
|
+
}
|
577
|
+
|
578
|
+
client = UrlCategorise::Client.new(
|
579
|
+
dataset_config: dataset_config,
|
580
|
+
auto_load_datasets: true # Enable automatic loading of predefined datasets
|
581
|
+
)
|
582
|
+
```
|
583
|
+
|
584
|
+
### Disabling Kaggle Functionality
|
585
|
+
|
586
|
+
You can completely disable Kaggle functionality if you only need CSV processing:
|
587
|
+
|
588
|
+
```ruby
|
589
|
+
# Disable Kaggle - only CSV datasets will work
|
590
|
+
client = UrlCategorise::Client.new(
|
591
|
+
dataset_config: {
|
592
|
+
enable_kaggle: false,
|
593
|
+
download_path: './datasets',
|
594
|
+
cache_path: './dataset_cache'
|
595
|
+
}
|
596
|
+
)
|
597
|
+
|
598
|
+
# This will raise an error
|
599
|
+
# client.load_kaggle_dataset('owner', 'dataset') # Error!
|
600
|
+
|
601
|
+
# But CSV datasets still work
|
602
|
+
client.load_csv_dataset('https://example.com/data.csv')
|
603
|
+
```
|
604
|
+
|
605
|
+
### Working with Cached Datasets
|
606
|
+
|
607
|
+
If you have cached datasets, you can access them even without Kaggle credentials:
|
608
|
+
|
609
|
+
```ruby
|
610
|
+
# No credentials provided, but cached data will work
|
611
|
+
client = UrlCategorise::Client.new(
|
612
|
+
dataset_config: {
|
613
|
+
kaggle: {}, # Empty config - will show warning but continue
|
614
|
+
download_path: './datasets',
|
615
|
+
cache_path: './cache'
|
616
|
+
}
|
617
|
+
)
|
618
|
+
|
619
|
+
# Will work if data is cached, otherwise will show helpful error message
|
620
|
+
client.load_kaggle_dataset('owner', 'dataset', use_cache: true)
|
621
|
+
```
|
622
|
+
|
623
|
+
### Dataset Metadata and Hashing
|
624
|
+
|
625
|
+
The system automatically tracks dataset metadata and generates content hashes:
|
626
|
+
|
627
|
+
```ruby
|
628
|
+
# Get dataset metadata
|
629
|
+
metadata = client.dataset_metadata
|
630
|
+
metadata.each do |data_hash, meta|
|
631
|
+
puts "Dataset hash: #{data_hash}"
|
632
|
+
puts "Processed at: #{meta[:processed_at]}"
|
633
|
+
puts "Total entries: #{meta[:total_entries]}"
|
634
|
+
end
|
635
|
+
|
636
|
+
# Reload client with fresh dataset integration
|
637
|
+
client.reload_with_datasets
|
638
|
+
```
|
639
|
+
|
195
640
|
## ActiveRecord Integration
|
196
641
|
|
197
642
|
For high-performance applications, enable database storage:
|
@@ -215,11 +660,31 @@ categories = client.categorise("example.com")
|
|
215
660
|
|
216
661
|
# Get database statistics
|
217
662
|
stats = client.database_stats
|
218
|
-
# => { domains: 50000, ip_addresses: 15000, categories: 45, list_metadata: 90 }
|
663
|
+
# => { domains: 50000, ip_addresses: 15000, categories: 45, list_metadata: 90, dataset_metadata: 5 }
|
219
664
|
|
220
665
|
# Direct model access
|
221
666
|
domain_record = UrlCategorise::Models::Domain.find_by(domain: "example.com")
|
222
667
|
ip_record = UrlCategorise::Models::IpAddress.find_by(ip_address: "1.2.3.4")
|
668
|
+
|
669
|
+
# Dataset integration with ActiveRecord
|
670
|
+
client = UrlCategorise::ActiveRecordClient.new(
|
671
|
+
use_database: true,
|
672
|
+
dataset_config: {
|
673
|
+
kaggle: { username: 'user', api_key: 'key' }
|
674
|
+
}
|
675
|
+
)
|
676
|
+
|
677
|
+
# Load datasets - automatically stored in database
|
678
|
+
client.load_kaggle_dataset('owner', 'dataset')
|
679
|
+
client.load_csv_dataset('https://example.com/data.csv')
|
680
|
+
|
681
|
+
# View dataset history
|
682
|
+
history = client.dataset_history(limit: 5)
|
683
|
+
# => [{ source_type: 'kaggle', identifier: 'owner/dataset', total_entries: 1000, processed_at: ... }]
|
684
|
+
|
685
|
+
# Filter by source type
|
686
|
+
kaggle_history = client.dataset_history(source_type: 'kaggle')
|
687
|
+
csv_history = client.dataset_history(source_type: 'csv')
|
223
688
|
```
|
224
689
|
|
225
690
|
## Rails Integration
|
@@ -274,6 +739,21 @@ class CreateUrlCategoriseTables < ActiveRecord::Migration[7.0]
|
|
274
739
|
|
275
740
|
add_index :url_categorise_ip_addresses, :ip_address
|
276
741
|
add_index :url_categorise_ip_addresses, :categories
|
742
|
+
|
743
|
+
create_table :url_categorise_dataset_metadata do |t|
|
744
|
+
t.string :source_type, null: false
|
745
|
+
t.string :identifier, null: false
|
746
|
+
t.string :data_hash, null: false, index: { unique: true }
|
747
|
+
t.integer :total_entries, null: false
|
748
|
+
t.text :category_mappings
|
749
|
+
t.text :processing_options
|
750
|
+
t.datetime :processed_at
|
751
|
+
t.timestamps
|
752
|
+
end
|
753
|
+
|
754
|
+
add_index :url_categorise_dataset_metadata, :source_type
|
755
|
+
add_index :url_categorise_dataset_metadata, :identifier
|
756
|
+
add_index :url_categorise_dataset_metadata, :processed_at
|
277
757
|
end
|
278
758
|
end
|
279
759
|
```
|
@@ -297,7 +777,18 @@ class UrlCategorizerService
|
|
297
777
|
cache_dir: Rails.root.join('tmp', 'url_cache'),
|
298
778
|
use_database: true,
|
299
779
|
force_download: Rails.env.development?,
|
300
|
-
request_timeout: Rails.env.production? ? 30 : 10 # Longer timeout in production
|
780
|
+
request_timeout: Rails.env.production? ? 30 : 10, # Longer timeout in production
|
781
|
+
iab_compliance: Rails.env.production?, # Enable IAB compliance in production
|
782
|
+
iab_version: :v3, # Use IAB Content Taxonomy v3.0
|
783
|
+
auto_load_datasets: Rails.env.production?, # Auto-load datasets in production
|
784
|
+
dataset_config: {
|
785
|
+
kaggle: {
|
786
|
+
username: ENV['KAGGLE_USERNAME'],
|
787
|
+
api_key: ENV['KAGGLE_API_KEY']
|
788
|
+
},
|
789
|
+
cache_path: Rails.root.join('tmp', 'dataset_cache'),
|
790
|
+
download_path: Rails.root.join('tmp', 'dataset_downloads')
|
791
|
+
}
|
301
792
|
)
|
302
793
|
end
|
303
794
|
|
@@ -320,12 +811,34 @@ class UrlCategorizerService
|
|
320
811
|
end
|
321
812
|
|
322
813
|
def stats
|
323
|
-
@client.database_stats
|
814
|
+
base_stats = @client.database_stats
|
815
|
+
base_stats.merge({
|
816
|
+
dataset_hosts: @client.count_of_dataset_hosts,
|
817
|
+
dataset_categories: @client.count_of_dataset_categories,
|
818
|
+
iab_compliant: @client.iab_compliant?,
|
819
|
+
iab_version: @client.iab_version
|
820
|
+
})
|
324
821
|
end
|
325
822
|
|
326
823
|
def refresh_lists!
|
327
824
|
@client.update_database
|
328
825
|
end
|
826
|
+
|
827
|
+
def load_dataset(type, identifier, options = {})
|
828
|
+
case type.to_s
|
829
|
+
when 'kaggle'
|
830
|
+
owner, dataset = identifier.split('/')
|
831
|
+
@client.load_kaggle_dataset(owner, dataset, options)
|
832
|
+
when 'csv'
|
833
|
+
@client.load_csv_dataset(identifier, options)
|
834
|
+
else
|
835
|
+
raise ArgumentError, "Unsupported dataset type: #{type}"
|
836
|
+
end
|
837
|
+
end
|
838
|
+
|
839
|
+
def get_iab_mapping(category)
|
840
|
+
@client.get_iab_mapping(category)
|
841
|
+
end
|
329
842
|
end
|
330
843
|
```
|
331
844
|
|
data/Rakefile
CHANGED
@@ -1,12 +1,12 @@
|
|
1
|
-
require "bundler/gem_tasks"
|
2
|
-
require "bundler/setup"
|
3
|
-
require "rake/testtask"
|
1
|
+
require 'bundler/gem_tasks'
|
2
|
+
require 'bundler/setup'
|
3
|
+
require 'rake/testtask'
|
4
4
|
|
5
5
|
Rake::TestTask.new(:test) do |t|
|
6
|
-
t.libs << "test"
|
7
|
-
t.libs << "lib"
|
8
|
-
t.test_files = FileList["test/**/*_test.rb"]
|
9
|
-
t.ruby_opts = ["-rbundler/setup"]
|
6
|
+
t.libs << 'test'
|
7
|
+
t.libs << 'lib'
|
8
|
+
t.test_files = FileList['test/**/*_test.rb']
|
9
|
+
t.ruby_opts = ['-rbundler/setup']
|
10
10
|
end
|
11
11
|
|
12
|
-
task :default => :test
|
12
|
+
task default: :test
|
data/bin/check_lists
CHANGED
@@ -3,46 +3,45 @@
|
|
3
3
|
require 'bundler/setup'
|
4
4
|
require_relative '../lib/url_categorise'
|
5
5
|
|
6
|
-
puts "=== CHECKING ALL URLs IN CONSTANTS ==="
|
6
|
+
puts '=== CHECKING ALL URLs IN CONSTANTS ==='
|
7
7
|
|
8
8
|
UrlCategorise::Constants::DEFAULT_HOST_URLS.each do |category, urls|
|
9
9
|
puts "\n#{category.upcase}:"
|
10
|
-
|
10
|
+
|
11
11
|
# Skip categories that only reference other categories (symbols)
|
12
12
|
actual_urls = urls.reject { |url| url.is_a?(Symbol) }
|
13
|
-
|
13
|
+
|
14
14
|
if actual_urls.empty?
|
15
15
|
if urls.empty?
|
16
|
-
puts " Empty category (no URLs defined)"
|
16
|
+
puts ' Empty category (no URLs defined)'
|
17
17
|
else
|
18
18
|
puts " Only references other categories: #{urls}"
|
19
19
|
end
|
20
20
|
next
|
21
21
|
end
|
22
|
-
|
22
|
+
|
23
23
|
actual_urls.each do |url|
|
24
24
|
print " Testing #{url}... "
|
25
25
|
begin
|
26
26
|
response = HTTParty.head(url, timeout: 10)
|
27
27
|
case response.code
|
28
28
|
when 200
|
29
|
-
puts "✅ OK"
|
29
|
+
puts '✅ OK'
|
30
30
|
when 404
|
31
|
-
puts "❌ 404 Not Found"
|
31
|
+
puts '❌ 404 Not Found'
|
32
32
|
when 403
|
33
|
-
puts "❌ 403 Forbidden"
|
33
|
+
puts '❌ 403 Forbidden'
|
34
34
|
when 500..599
|
35
35
|
puts "❌ Server Error (#{response.code})"
|
36
36
|
else
|
37
37
|
puts "⚠️ HTTP #{response.code}"
|
38
38
|
end
|
39
39
|
rescue Net::TimeoutError, HTTParty::TimeoutError
|
40
|
-
puts "❌ Timeout"
|
41
|
-
rescue SocketError, Errno::ECONNREFUSED
|
42
|
-
puts "❌ DNS/Network Error"
|
43
|
-
rescue => e
|
40
|
+
puts '❌ Timeout'
|
41
|
+
rescue SocketError, Errno::ECONNREFUSED
|
42
|
+
puts '❌ DNS/Network Error'
|
43
|
+
rescue StandardError => e
|
44
44
|
puts "❌ Error: #{e.class}"
|
45
45
|
end
|
46
46
|
end
|
47
47
|
end
|
48
|
-
|
data/bin/console
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require "bundler/setup"
|
4
|
-
require "url_categorise"
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'url_categorise'
|
5
5
|
|
6
6
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
7
|
# with your gem easier. You can also use a different console, if you like.
|
8
8
|
|
9
9
|
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
-
require "pry"
|
10
|
+
require 'pry'
|
11
11
|
Pry.start
|