UrlCategorise 0.1.3 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +7 -2
- data/.gitignore +1 -0
- data/CLAUDE.md +77 -2
- data/Gemfile.lock +13 -1
- data/README.md +332 -7
- data/bin/export_csv +83 -0
- data/bin/export_hosts +68 -0
- data/bin/rake +2 -0
- data/correct_usage_example.rb +64 -0
- data/docs/v0.1.4-features.md +215 -0
- data/lib/url_categorise/active_record_client.rb +1 -1
- data/lib/url_categorise/client.rb +431 -33
- data/lib/url_categorise/dataset_processor.rb +9 -4
- data/lib/url_categorise/iab_compliance.rb +147 -0
- data/lib/url_categorise/version.rb +1 -1
- data/lib/url_categorise.rb +2 -0
- data/url_categorise.gemspec +4 -2
- metadata +52 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c1271c16022cb3fb5e9efe9af22deca404204ee54e1b23243c4ebd862565c76b
|
4
|
+
data.tar.gz: '082814ad46484b87027e0a1d0042f3f37ab20a3d491f4a8e39a585e14fb40ac8'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8f74b667ba1d993237e224d3db7fcb0323816716961280e5fd27fbf162ecb162f83fd126af89ce812e98fbf8fe48a15c47697b656e345d23465a23ab9913581e
|
7
|
+
data.tar.gz: 57ed94a3e9a9be6db60c3d48dd9d4ac4da52836951b06a9e6937c44968e5509cda43193491c9ec20039649e01d5bb4a6214f769aab2d6c27c8d047d7b9bd5273
|
data/.claude/settings.local.json
CHANGED
@@ -7,10 +7,15 @@
|
|
7
7
|
"Bash(bundle exec ruby:*)",
|
8
8
|
"Bash(find:*)",
|
9
9
|
"Bash(grep:*)",
|
10
|
-
"Read(//Users/trex22/development/rubygems/kaggle/**)",
|
11
10
|
"Bash(for file in test/url_categorise/*dataset*test.rb)",
|
12
11
|
"Bash(do echo \"Checking $file...\")",
|
13
|
-
"Bash(done)"
|
12
|
+
"Bash(done)",
|
13
|
+
"Bash(bundle exec rubocop:*)",
|
14
|
+
"Bash(rubocop:*)",
|
15
|
+
"Bash(timeout 30 ruby dataset_loading_example.rb)",
|
16
|
+
"Bash(timeout:*)",
|
17
|
+
"Bash(DEBUG=1 timeout 300 ruby correct_usage_example.rb)",
|
18
|
+
"Bash(chmod:*)"
|
14
19
|
],
|
15
20
|
"deny": []
|
16
21
|
}
|
data/.gitignore
CHANGED
data/CLAUDE.md
CHANGED
@@ -84,14 +84,89 @@ The gem includes automatic monitoring and cleanup of broken URLs:
|
|
84
84
|
- **Data Hashing**: SHA256 content hashing for dataset change detection
|
85
85
|
- **Category Mapping**: Flexible column detection and category mapping for datasets
|
86
86
|
- **Credential Warnings**: Helpful warnings when Kaggle credentials are missing but functionality continues
|
87
|
+
- **IAB Compliance**: Full support for IAB Content Taxonomy v2.0 and v3.0 standards
|
88
|
+
- **Dataset-Specific Metrics**: Separate counting methods for dataset vs DNS list categorization
|
89
|
+
- **Enhanced Statistics**: Extended helper methods for comprehensive data insights
|
90
|
+
- **ActiveAttr Settings**: In-memory modification of client settings using attribute setters
|
91
|
+
- **Data Export**: Multiple export formats including hosts files per category and CSV data exports
|
92
|
+
- **CLI Commands**: Command-line utilities for data export and list checking
|
87
93
|
|
88
94
|
### Architecture
|
89
|
-
- `Client` class: Main interface for categorization
|
95
|
+
- `Client` class: Main interface for categorization with IAB compliance support and ActiveAttr attributes
|
90
96
|
- `DatasetProcessor` class: Handles Kaggle and CSV dataset processing
|
97
|
+
- `IabCompliance` module: Maps categories to IAB Content Taxonomy v2.0/v3.0 standards
|
91
98
|
- `Constants` module: Contains default list URLs and categories
|
92
99
|
- `ActiveRecordClient` class: Database-backed client with dataset history
|
93
100
|
- Modular design allows extending with new list sources and datasets
|
94
|
-
- Support for custom list directories, caching, and
|
101
|
+
- Support for custom list directories, caching, dataset integration, IAB compliance, and data export
|
102
|
+
- ActiveAttr integration for dynamic setting modification and attribute validation
|
103
|
+
|
104
|
+
### New Features (Latest Version)
|
105
|
+
|
106
|
+
#### Dynamic Settings with ActiveAttr
|
107
|
+
The Client class now uses ActiveAttr to provide dynamic attribute modification:
|
108
|
+
|
109
|
+
```ruby
|
110
|
+
client = UrlCategorise::Client.new
|
111
|
+
|
112
|
+
# Modify settings in-memory
|
113
|
+
client.smart_categorization_enabled = true
|
114
|
+
client.iab_compliance_enabled = true
|
115
|
+
client.iab_version = :v2
|
116
|
+
client.request_timeout = 30
|
117
|
+
client.dns_servers = ['8.8.8.8', '8.8.4.4']
|
118
|
+
|
119
|
+
# Settings take effect immediately
|
120
|
+
categories = client.categorise('reddit.com') # Uses new smart categorization rules
|
121
|
+
```
|
122
|
+
|
123
|
+
#### Data Export Features
|
124
|
+
|
125
|
+
##### Hosts File Export
|
126
|
+
Export all categorized domains as separate hosts files per category:
|
127
|
+
|
128
|
+
```ruby
|
129
|
+
# Export to default location (cache_dir/exports/hosts or ./exports/hosts)
|
130
|
+
result = client.export_hosts_files
|
131
|
+
|
132
|
+
# Export to custom location
|
133
|
+
result = client.export_hosts_files('/custom/export/path')
|
134
|
+
|
135
|
+
# Returns hash with file information:
|
136
|
+
# {
|
137
|
+
# malware: { path: '/path/malware.hosts', filename: 'malware.hosts', count: 1500 },
|
138
|
+
# advertising: { path: '/path/advertising.hosts', filename: 'advertising.hosts', count: 25000 },
|
139
|
+
# _summary: { total_categories: 15, total_domains: 50000, export_directory: '/path' }
|
140
|
+
# }
|
141
|
+
```
|
142
|
+
|
143
|
+
##### CSV Data Export
|
144
|
+
Export all data as a single CSV file for AI training and analysis:
|
145
|
+
|
146
|
+
```ruby
|
147
|
+
# Export to default location (cache_dir/exports/csv or ./exports/csv)
|
148
|
+
result = client.export_csv_data
|
149
|
+
|
150
|
+
# Export to custom location
|
151
|
+
result = client.export_csv_data('/custom/export/path')
|
152
|
+
|
153
|
+
# CSV includes: domain, category, source_type, is_dataset_category, iab_category_v2, iab_category_v3, export_timestamp
|
154
|
+
# Metadata file includes: export info, client settings, data summary, dataset metadata
|
155
|
+
```
|
156
|
+
|
157
|
+
#### CLI Commands
|
158
|
+
New command-line utilities for data export:
|
159
|
+
|
160
|
+
```bash
|
161
|
+
# Export hosts files
|
162
|
+
$ bundle exec export_hosts --output /tmp/hosts --verbose
|
163
|
+
|
164
|
+
# Export CSV data with IAB compliance
|
165
|
+
$ bundle exec export_csv --output /tmp/csv --iab-compliance --verbose
|
166
|
+
|
167
|
+
# Check URL health (existing)
|
168
|
+
$ bundle exec check_lists
|
169
|
+
```
|
95
170
|
|
96
171
|
### List Sources
|
97
172
|
Primary sources include:
|
data/Gemfile.lock
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
UrlCategorise (0.1.
|
4
|
+
UrlCategorise (0.1.6)
|
5
|
+
active_attr (>= 0.17.1, < 1.0)
|
5
6
|
api_pattern (>= 0.0.6, < 1.0)
|
6
7
|
csv (>= 3.3.0, < 4.0)
|
7
8
|
digest (>= 3.1.0, < 4.0)
|
8
9
|
fileutils (>= 1.7.0, < 2.0)
|
9
10
|
httparty (>= 0.22.0, < 1.0)
|
10
11
|
json (>= 2.7.0, < 3.0)
|
12
|
+
kaggle (>= 0.0.3, < 1.0)
|
11
13
|
nokogiri (>= 1.18.9, < 2.0)
|
12
14
|
resolv (>= 0.4.0, < 1.0)
|
13
15
|
rubyzip (>= 2.3.0, < 3.0)
|
@@ -87,6 +89,12 @@ GEM
|
|
87
89
|
i18n (1.14.7)
|
88
90
|
concurrent-ruby (~> 1.0)
|
89
91
|
json (2.13.2)
|
92
|
+
kaggle (0.0.3)
|
93
|
+
csv (>= 3.3)
|
94
|
+
fileutils (>= 1.7)
|
95
|
+
httparty (>= 0.23)
|
96
|
+
oj (= 3.16.11)
|
97
|
+
rubyzip (>= 2.0)
|
90
98
|
logger (1.7.0)
|
91
99
|
loofah (2.24.1)
|
92
100
|
crass (~> 1.0.2)
|
@@ -107,6 +115,10 @@ GEM
|
|
107
115
|
bigdecimal (~> 3.1)
|
108
116
|
nokogiri (1.18.9-arm64-darwin)
|
109
117
|
racc (~> 1.4)
|
118
|
+
oj (3.16.11)
|
119
|
+
bigdecimal (>= 3.0)
|
120
|
+
ostruct (>= 0.2)
|
121
|
+
ostruct (0.6.3)
|
110
122
|
pry (0.15.2)
|
111
123
|
coderay (~> 1.1)
|
112
124
|
method_source (~> 1.0)
|
data/README.md
CHANGED
@@ -5,6 +5,8 @@ A comprehensive Ruby gem for categorizing URLs and domains based on various secu
|
|
5
5
|
## Features
|
6
6
|
|
7
7
|
- **Comprehensive Coverage**: 60+ high-quality categories including security, content, and specialized lists
|
8
|
+
- **Kaggle Dataset Integration**: Automatic loading and processing of machine learning datasets from Kaggle
|
9
|
+
- **Multiple Data Sources**: Supports blocklists, CSV datasets, and Kaggle ML datasets
|
8
10
|
- **Multiple List Formats**: Supports hosts files, pfSense, AdSense, uBlock Origin, dnsmasq, and plain text formats
|
9
11
|
- **Intelligent Caching**: Hash-based file update detection with configurable local cache
|
10
12
|
- **DNS Resolution**: Resolve domains to IPs and check against IP-based blocklists
|
@@ -14,6 +16,10 @@ A comprehensive Ruby gem for categorizing URLs and domains based on various secu
|
|
14
16
|
- **Metadata Tracking**: Track last update times, ETags, and content hashes
|
15
17
|
- **Health Monitoring**: Automatic detection and removal of broken blocklist sources
|
16
18
|
- **List Validation**: Built-in tools to verify all configured URLs are accessible
|
19
|
+
- **Auto-Loading Datasets**: Automatic processing of predefined datasets during client initialization
|
20
|
+
- **ActiveAttr Settings**: In-memory modification of client settings using attribute setters
|
21
|
+
- **Data Export**: Export categorized data as hosts files per category or comprehensive CSV exports
|
22
|
+
- **CLI Commands**: Command-line utilities for data export and list checking
|
17
23
|
|
18
24
|
## Installation
|
19
25
|
|
@@ -44,6 +50,15 @@ puts "Total hosts: #{client.count_of_hosts}"
|
|
44
50
|
puts "Categories: #{client.count_of_categories}"
|
45
51
|
puts "Data size: #{client.size_of_data} MB"
|
46
52
|
|
53
|
+
# Get detailed size breakdown
|
54
|
+
puts "Total data size: #{client.size_of_data} MB (#{client.size_of_data_bytes} bytes)"
|
55
|
+
puts "Blocklist data size: #{client.size_of_blocklist_data} MB (#{client.size_of_blocklist_data_bytes} bytes)"
|
56
|
+
puts "Dataset data size: #{client.size_of_dataset_data} MB (#{client.size_of_dataset_data_bytes} bytes)"
|
57
|
+
|
58
|
+
# Get dataset-specific statistics (if datasets are loaded)
|
59
|
+
puts "Dataset hosts: #{client.count_of_dataset_hosts}"
|
60
|
+
puts "Dataset categories: #{client.count_of_dataset_categories}"
|
61
|
+
|
47
62
|
# Categorize a URL or domain
|
48
63
|
categories = client.categorise("badsite.com")
|
49
64
|
puts "Categories: #{categories}" # => [:malware, :phishing]
|
@@ -57,6 +72,83 @@ ip_categories = client.categorise_ip("192.168.1.100")
|
|
57
72
|
puts "IP categories: #{ip_categories}"
|
58
73
|
```
|
59
74
|
|
75
|
+
## New Features
|
76
|
+
|
77
|
+
### Dynamic Settings with ActiveAttr
|
78
|
+
|
79
|
+
The Client class now supports in-memory modification of settings using ActiveAttr:
|
80
|
+
|
81
|
+
```ruby
|
82
|
+
client = UrlCategorise::Client.new
|
83
|
+
|
84
|
+
# Modify settings dynamically
|
85
|
+
client.smart_categorization_enabled = true
|
86
|
+
client.iab_compliance_enabled = true
|
87
|
+
client.iab_version = :v2
|
88
|
+
client.request_timeout = 30
|
89
|
+
client.dns_servers = ['8.8.8.8', '8.8.4.4']
|
90
|
+
|
91
|
+
# Settings take effect immediately - no need to recreate the client
|
92
|
+
categories = client.categorise('reddit.com') # Uses new smart categorization rules
|
93
|
+
```
|
94
|
+
|
95
|
+
### Data Export Features
|
96
|
+
|
97
|
+
#### Hosts File Export
|
98
|
+
|
99
|
+
Export all categorized domains as separate hosts files per category:
|
100
|
+
|
101
|
+
```ruby
|
102
|
+
# Export to default location
|
103
|
+
result = client.export_hosts_files
|
104
|
+
|
105
|
+
# Export to custom location
|
106
|
+
result = client.export_hosts_files('/custom/export/path')
|
107
|
+
|
108
|
+
# Result includes file information and summary
|
109
|
+
puts "Exported #{result[:_summary][:total_categories]} categories"
|
110
|
+
puts "Total domains: #{result[:_summary][:total_domains]}"
|
111
|
+
puts "Files saved to: #{result[:_summary][:export_directory]}"
|
112
|
+
```
|
113
|
+
|
114
|
+
Each category gets its own hosts file (e.g., `malware.hosts`, `advertising.hosts`) with proper headers and sorted domains.
|
115
|
+
|
116
|
+
#### CSV Data Export
|
117
|
+
|
118
|
+
Export all data as a single CSV file for AI training and analysis:
|
119
|
+
|
120
|
+
```ruby
|
121
|
+
# Export to default location
|
122
|
+
result = client.export_csv_data
|
123
|
+
|
124
|
+
# Export to custom location with IAB compliance
|
125
|
+
client.iab_compliance_enabled = true
|
126
|
+
result = client.export_csv_data('/custom/export/path')
|
127
|
+
|
128
|
+
# CSV includes comprehensive data:
|
129
|
+
# - domain, category, source_type, is_dataset_category
|
130
|
+
# - iab_category_v2, iab_category_v3, export_timestamp
|
131
|
+
# - smart_categorization_enabled
|
132
|
+
|
133
|
+
# Metadata file includes:
|
134
|
+
# - Export info, client settings, data summary, dataset metadata
|
135
|
+
```
|
136
|
+
|
137
|
+
#### CLI Commands
|
138
|
+
|
139
|
+
New command-line utilities for data export:
|
140
|
+
|
141
|
+
```bash
|
142
|
+
# Export hosts files
|
143
|
+
$ bundle exec export_hosts --output /tmp/hosts --verbose
|
144
|
+
|
145
|
+
# Export CSV data with IAB compliance
|
146
|
+
$ bundle exec export_csv --output /tmp/csv --iab-compliance --verbose
|
147
|
+
|
148
|
+
# Check URL health (existing command)
|
149
|
+
$ bundle exec check_lists
|
150
|
+
```
|
151
|
+
|
60
152
|
## Advanced Configuration
|
61
153
|
|
62
154
|
### File Caching
|
@@ -113,7 +205,11 @@ client = UrlCategorise::Client.new(
|
|
113
205
|
cache_dir: "./url_cache", # Enable local caching
|
114
206
|
force_download: false, # Use cache when available
|
115
207
|
dns_servers: ['1.1.1.1', '1.0.0.1'], # Cloudflare DNS servers
|
116
|
-
request_timeout: 15
|
208
|
+
request_timeout: 15, # 15 second HTTP timeout
|
209
|
+
iab_compliance: true, # Enable IAB compliance
|
210
|
+
iab_version: :v3, # Use IAB Content Taxonomy v3.0
|
211
|
+
auto_load_datasets: false, # Disable automatic dataset loading (default)
|
212
|
+
smart_categorization: false # Disable smart post-processing (default)
|
117
213
|
)
|
118
214
|
```
|
119
215
|
|
@@ -132,6 +228,165 @@ host_urls = {
|
|
132
228
|
client = UrlCategorise::Client.new(host_urls: host_urls)
|
133
229
|
```
|
134
230
|
|
231
|
+
### Smart Categorization (Post-Processing)
|
232
|
+
|
233
|
+
Smart categorization solves the problem of overly broad domain-level categorization. For example, `reddit.com` might appear in health & fitness blocklists, but not all Reddit content is health-related.
|
234
|
+
|
235
|
+
#### The Problem
|
236
|
+
|
237
|
+
```ruby
|
238
|
+
# Without smart categorization
|
239
|
+
client.categorise("reddit.com")
|
240
|
+
# => [:reddit, :social_media, :health_and_fitness, :forums] # Too broad!
|
241
|
+
|
242
|
+
client.categorise("reddit.com/r/technology")
|
243
|
+
# => [:reddit, :social_media, :health_and_fitness, :forums] # Still wrong!
|
244
|
+
```
|
245
|
+
|
246
|
+
#### The Solution
|
247
|
+
|
248
|
+
```ruby
|
249
|
+
# Enable smart categorization
|
250
|
+
client = UrlCategorise::Client.new(
|
251
|
+
smart_categorization: true # Remove overly broad categories
|
252
|
+
)
|
253
|
+
|
254
|
+
client.categorise("reddit.com")
|
255
|
+
# => [:reddit, :social_media] # Much more accurate!
|
256
|
+
```
|
257
|
+
|
258
|
+
#### How It Works
|
259
|
+
|
260
|
+
Smart categorization automatically removes overly broad categories for known platforms:
|
261
|
+
|
262
|
+
- **Social Media Platforms** (Reddit, Facebook, Twitter, etc.): Removes categories like `:health_and_fitness`, `:forums`, `:news`, `:technology`, `:education`
|
263
|
+
- **Search Engines** (Google, Bing, etc.): Removes categories like `:news`, `:shopping`, `:travel`
|
264
|
+
- **Video Platforms** (YouTube, Vimeo, etc.): Removes categories like `:education`, `:entertainment`, `:music`
|
265
|
+
|
266
|
+
#### Custom Smart Rules
|
267
|
+
|
268
|
+
You can define custom rules for specific domains or URL patterns:
|
269
|
+
|
270
|
+
```ruby
|
271
|
+
custom_rules = {
|
272
|
+
reddit_subreddits: {
|
273
|
+
domains: ['reddit.com'],
|
274
|
+
remove_categories: [:health_and_fitness, :forums],
|
275
|
+
add_categories_by_path: {
|
276
|
+
/\/r\/fitness/ => [:health_and_fitness], # Add back for /r/fitness
|
277
|
+
/\/r\/technology/ => [:technology], # Add technology for /r/technology
|
278
|
+
/\/r\/programming/ => [:technology, :programming]
|
279
|
+
}
|
280
|
+
},
|
281
|
+
my_company_domains: {
|
282
|
+
domains: ['mycompany.com'],
|
283
|
+
allowed_categories_only: [:business, :technology] # Only allow specific categories
|
284
|
+
}
|
285
|
+
}
|
286
|
+
|
287
|
+
client = UrlCategorise::Client.new(
|
288
|
+
smart_categorization: true,
|
289
|
+
smart_rules: custom_rules
|
290
|
+
)
|
291
|
+
|
292
|
+
# Now path-based categorization works
|
293
|
+
client.categorise('reddit.com') # => [:reddit, :social_media]
|
294
|
+
client.categorise('reddit.com/r/fitness') # => [:reddit, :social_media, :health_and_fitness]
|
295
|
+
client.categorise('reddit.com/r/technology') # => [:reddit, :social_media, :technology]
|
296
|
+
```
|
297
|
+
|
298
|
+
#### Available Rule Types
|
299
|
+
|
300
|
+
- **`remove_categories`**: Remove specific categories for domains
|
301
|
+
- **`keep_primary_only`**: Keep only specified categories, remove others
|
302
|
+
- **`allowed_categories_only`**: Only allow specific categories, block all others
|
303
|
+
- **`add_categories_by_path`**: Add categories based on URL path patterns
|
304
|
+
|
305
|
+
#### Smart Rules with IAB Compliance
|
306
|
+
|
307
|
+
Smart categorization works seamlessly with IAB compliance:
|
308
|
+
|
309
|
+
```ruby
|
310
|
+
client = UrlCategorise::Client.new(
|
311
|
+
smart_categorization: true,
|
312
|
+
iab_compliance: true,
|
313
|
+
iab_version: :v3
|
314
|
+
)
|
315
|
+
|
316
|
+
# Returns clean IAB codes after smart processing
|
317
|
+
categories = client.categorise("reddit.com") # => ["14"] (Society - Social Media)
|
318
|
+
```
|
319
|
+
|
320
|
+
## IAB Content Taxonomy Compliance
|
321
|
+
|
322
|
+
UrlCategorise supports IAB (Interactive Advertising Bureau) Content Taxonomy compliance for standardized content categorization:
|
323
|
+
|
324
|
+
### Basic IAB Compliance
|
325
|
+
|
326
|
+
```ruby
|
327
|
+
# Enable IAB v3.0 compliance (default)
|
328
|
+
client = UrlCategorise::Client.new(
|
329
|
+
iab_compliance: true,
|
330
|
+
iab_version: :v3
|
331
|
+
)
|
332
|
+
|
333
|
+
# Enable IAB v2.0 compliance
|
334
|
+
client = UrlCategorise::Client.new(
|
335
|
+
iab_compliance: true,
|
336
|
+
iab_version: :v2
|
337
|
+
)
|
338
|
+
|
339
|
+
# Categorization returns IAB codes instead of custom categories
|
340
|
+
categories = client.categorise("badsite.com")
|
341
|
+
puts categories # => ["626"] (IAB v3 code for illegal content)
|
342
|
+
|
343
|
+
# Check IAB compliance status
|
344
|
+
puts client.iab_compliant? # => true
|
345
|
+
|
346
|
+
# Get IAB mapping for a specific category
|
347
|
+
puts client.get_iab_mapping(:malware) # => "626" (v3) or "IAB25" (v2)
|
348
|
+
```
|
349
|
+
|
350
|
+
### IAB Category Mappings
|
351
|
+
|
352
|
+
The gem maps security and content categories to appropriate IAB codes:
|
353
|
+
|
354
|
+
**IAB Content Taxonomy v3.0 (recommended):**
|
355
|
+
- `malware`, `phishing`, `illegal` → `626` (Illegal Content)
|
356
|
+
- `advertising`, `mobile_ads` → `3` (Advertising)
|
357
|
+
- `gambling` → `7-39` (Gambling)
|
358
|
+
- `pornography` → `626` (Adult Content)
|
359
|
+
- `social_media` → `14` (Society)
|
360
|
+
- `technology` → `19` (Technology & Computing)
|
361
|
+
|
362
|
+
**IAB Content Taxonomy v2.0:**
|
363
|
+
- `malware`, `phishing` → `IAB25` (Non-Standard Content)
|
364
|
+
- `advertising` → `IAB3` (Advertising)
|
365
|
+
- `gambling` → `IAB7-39` (Gambling)
|
366
|
+
- `pornography` → `IAB25-3` (Pornography)
|
367
|
+
|
368
|
+
### Integration with Datasets
|
369
|
+
|
370
|
+
IAB compliance works seamlessly with dataset processing:
|
371
|
+
|
372
|
+
```ruby
|
373
|
+
client = UrlCategorise::Client.new(
|
374
|
+
iab_compliance: true,
|
375
|
+
iab_version: :v3,
|
376
|
+
dataset_config: {
|
377
|
+
kaggle: { username: 'user', api_key: 'key' }
|
378
|
+
},
|
379
|
+
auto_load_datasets: true # Automatically load predefined datasets with IAB mapping
|
380
|
+
)
|
381
|
+
|
382
|
+
# Load additional datasets - categories will be mapped to IAB codes
|
383
|
+
client.load_kaggle_dataset('owner', 'dataset-name')
|
384
|
+
client.load_csv_dataset('https://example.com/data.csv')
|
385
|
+
|
386
|
+
# All categorization methods return IAB codes
|
387
|
+
categories = client.categorise("example.com") # => ["3", "626"]
|
388
|
+
```
|
389
|
+
|
135
390
|
## Available Categories
|
136
391
|
|
137
392
|
### Security & Threat Intelligence
|
@@ -194,9 +449,43 @@ ruby bin/check_lists
|
|
194
449
|
|
195
450
|
## Dataset Processing
|
196
451
|
|
197
|
-
UrlCategorise
|
452
|
+
UrlCategorise supports processing external datasets from Kaggle and CSV files to expand categorization data beyond traditional blocklists. This allows integration of machine learning datasets and custom URL classification data:
|
453
|
+
|
454
|
+
### Automatic Dataset Loading
|
455
|
+
|
456
|
+
Enable automatic loading of predefined datasets during client initialization:
|
457
|
+
|
458
|
+
```ruby
|
459
|
+
# Enable automatic dataset loading from constants
|
460
|
+
client = UrlCategorise::Client.new(
|
461
|
+
dataset_config: {
|
462
|
+
kaggle: {
|
463
|
+
username: ENV['KAGGLE_USERNAME'],
|
464
|
+
api_key: ENV['KAGGLE_API_KEY']
|
465
|
+
},
|
466
|
+
cache_path: './dataset_cache',
|
467
|
+
download_path: './downloads'
|
468
|
+
},
|
469
|
+
auto_load_datasets: true # Automatically loads all predefined datasets
|
470
|
+
)
|
471
|
+
|
472
|
+
# Datasets are now automatically integrated and ready for use
|
473
|
+
categories = client.categorise('https://example.com')
|
474
|
+
puts "Dataset categories loaded: #{client.count_of_dataset_categories}"
|
475
|
+
puts "Dataset hosts: #{client.count_of_dataset_hosts}"
|
476
|
+
```
|
477
|
+
|
478
|
+
The gem includes predefined high-quality datasets in constants:
|
479
|
+
- **`shaurov/website-classification-using-url`** - Comprehensive URL classification dataset
|
480
|
+
- **`hetulmehta/website-classification`** - Website categorization with cleaned text data
|
481
|
+
- **`shawon10/url-classification-dataset-dmoz`** - DMOZ-based URL classification
|
482
|
+
- **Data.world CSV dataset** - Additional URL categorization data
|
483
|
+
|
484
|
+
### Manual Dataset Loading
|
485
|
+
|
486
|
+
You can also load datasets manually for more control over the process:
|
198
487
|
|
199
|
-
|
488
|
+
#### Kaggle Dataset Integration
|
200
489
|
|
201
490
|
Load datasets directly from Kaggle using three authentication methods:
|
202
491
|
|
@@ -244,7 +533,7 @@ client.load_kaggle_dataset('owner', 'dataset-name', {
|
|
244
533
|
categories = client.categorise('https://example.com')
|
245
534
|
```
|
246
535
|
|
247
|
-
|
536
|
+
#### CSV Dataset Processing
|
248
537
|
|
249
538
|
Load datasets from direct CSV URLs:
|
250
539
|
|
@@ -286,7 +575,10 @@ dataset_config = {
|
|
286
575
|
timeout: 30 # HTTP timeout for downloads
|
287
576
|
}
|
288
577
|
|
289
|
-
client = UrlCategorise::Client.new(
|
578
|
+
client = UrlCategorise::Client.new(
|
579
|
+
dataset_config: dataset_config,
|
580
|
+
auto_load_datasets: true # Enable automatic loading of predefined datasets
|
581
|
+
)
|
290
582
|
```
|
291
583
|
|
292
584
|
### Disabling Kaggle Functionality
|
@@ -485,7 +777,18 @@ class UrlCategorizerService
|
|
485
777
|
cache_dir: Rails.root.join('tmp', 'url_cache'),
|
486
778
|
use_database: true,
|
487
779
|
force_download: Rails.env.development?,
|
488
|
-
request_timeout: Rails.env.production? ? 30 : 10 # Longer timeout in production
|
780
|
+
request_timeout: Rails.env.production? ? 30 : 10, # Longer timeout in production
|
781
|
+
iab_compliance: Rails.env.production?, # Enable IAB compliance in production
|
782
|
+
iab_version: :v3, # Use IAB Content Taxonomy v3.0
|
783
|
+
auto_load_datasets: Rails.env.production?, # Auto-load datasets in production
|
784
|
+
dataset_config: {
|
785
|
+
kaggle: {
|
786
|
+
username: ENV['KAGGLE_USERNAME'],
|
787
|
+
api_key: ENV['KAGGLE_API_KEY']
|
788
|
+
},
|
789
|
+
cache_path: Rails.root.join('tmp', 'dataset_cache'),
|
790
|
+
download_path: Rails.root.join('tmp', 'dataset_downloads')
|
791
|
+
}
|
489
792
|
)
|
490
793
|
end
|
491
794
|
|
@@ -508,12 +811,34 @@ class UrlCategorizerService
|
|
508
811
|
end
|
509
812
|
|
510
813
|
def stats
|
511
|
-
@client.database_stats
|
814
|
+
base_stats = @client.database_stats
|
815
|
+
base_stats.merge({
|
816
|
+
dataset_hosts: @client.count_of_dataset_hosts,
|
817
|
+
dataset_categories: @client.count_of_dataset_categories,
|
818
|
+
iab_compliant: @client.iab_compliant?,
|
819
|
+
iab_version: @client.iab_version
|
820
|
+
})
|
512
821
|
end
|
513
822
|
|
514
823
|
def refresh_lists!
|
515
824
|
@client.update_database
|
516
825
|
end
|
826
|
+
|
827
|
+
def load_dataset(type, identifier, options = {})
|
828
|
+
case type.to_s
|
829
|
+
when 'kaggle'
|
830
|
+
owner, dataset = identifier.split('/')
|
831
|
+
@client.load_kaggle_dataset(owner, dataset, options)
|
832
|
+
when 'csv'
|
833
|
+
@client.load_csv_dataset(identifier, options)
|
834
|
+
else
|
835
|
+
raise ArgumentError, "Unsupported dataset type: #{type}"
|
836
|
+
end
|
837
|
+
end
|
838
|
+
|
839
|
+
def get_iab_mapping(category)
|
840
|
+
@client.get_iab_mapping(category)
|
841
|
+
end
|
517
842
|
end
|
518
843
|
```
|
519
844
|
|
data/bin/export_csv
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'optparse'
|
5
|
+
require_relative '../lib/url_categorise'
|
6
|
+
|
7
|
+
options = {
|
8
|
+
output_path: nil,
|
9
|
+
cache_dir: nil,
|
10
|
+
verbose: false,
|
11
|
+
iab_compliance: false,
|
12
|
+
smart_categorization: false
|
13
|
+
}
|
14
|
+
|
15
|
+
OptionParser.new do |opts|
|
16
|
+
opts.banner = "Usage: #{$0} [options]"
|
17
|
+
opts.separator ""
|
18
|
+
opts.separator "Export all categorized domains and metadata as a single CSV file for AI training"
|
19
|
+
opts.separator ""
|
20
|
+
|
21
|
+
opts.on("-o", "--output PATH", "Output directory path (default: cache_dir/exports/csv or ./exports/csv)") do |path|
|
22
|
+
options[:output_path] = path
|
23
|
+
end
|
24
|
+
|
25
|
+
opts.on("-c", "--cache-dir PATH", "Cache directory path for client initialization") do |path|
|
26
|
+
options[:cache_dir] = path
|
27
|
+
end
|
28
|
+
|
29
|
+
opts.on("--iab-compliance", "Enable IAB compliance for category mapping") do
|
30
|
+
options[:iab_compliance] = true
|
31
|
+
end
|
32
|
+
|
33
|
+
opts.on("--smart-categorization", "Enable smart categorization") do
|
34
|
+
options[:smart_categorization] = true
|
35
|
+
end
|
36
|
+
|
37
|
+
opts.on("-v", "--verbose", "Verbose output") do
|
38
|
+
options[:verbose] = true
|
39
|
+
end
|
40
|
+
|
41
|
+
opts.on("-h", "--help", "Show this help message") do
|
42
|
+
puts opts
|
43
|
+
exit
|
44
|
+
end
|
45
|
+
end.parse!
|
46
|
+
|
47
|
+
puts "=== UrlCategorise CSV Data Export ===" if options[:verbose]
|
48
|
+
puts "Initializing client..." if options[:verbose]
|
49
|
+
|
50
|
+
begin
|
51
|
+
client = UrlCategorise::Client.new(
|
52
|
+
cache_dir: options[:cache_dir],
|
53
|
+
iab_compliance: options[:iab_compliance],
|
54
|
+
smart_categorization: options[:smart_categorization]
|
55
|
+
)
|
56
|
+
|
57
|
+
puts "Exporting CSV data..." if options[:verbose]
|
58
|
+
|
59
|
+
result = client.export_csv_data(options[:output_path])
|
60
|
+
|
61
|
+
puts "\n✅ Export completed successfully!"
|
62
|
+
puts "📁 Export directory: #{result[:export_directory]}"
|
63
|
+
puts "📄 CSV file: #{result[:csv_file]}"
|
64
|
+
puts "📋 Metadata file: #{result[:metadata_file]}"
|
65
|
+
|
66
|
+
puts "\n📊 Data Summary:"
|
67
|
+
puts " Total domains: #{result[:summary][:total_domains]}"
|
68
|
+
puts " Total categories: #{result[:summary][:total_categories]}"
|
69
|
+
puts " Dataset categories: #{result[:summary][:dataset_categories_count]}"
|
70
|
+
puts " Blocklist categories: #{result[:summary][:blocklist_categories_count]}"
|
71
|
+
|
72
|
+
if options[:verbose]
|
73
|
+
puts "\n🏷️ Categories included:"
|
74
|
+
result[:summary][:categories].each do |category|
|
75
|
+
puts " - #{category}"
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
rescue StandardError => e
|
80
|
+
puts "❌ Error: #{e.message}"
|
81
|
+
puts e.backtrace if options[:verbose]
|
82
|
+
exit 1
|
83
|
+
end
|