UrlCategorise 0.1.3 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/export_hosts ADDED
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require 'optparse'
5
+ require_relative '../lib/url_categorise'
6
+
7
+ options = {
8
+ output_path: nil,
9
+ cache_dir: nil,
10
+ verbose: false
11
+ }
12
+
13
+ OptionParser.new do |opts|
14
+ opts.banner = "Usage: #{$0} [options]"
15
+ opts.separator ""
16
+ opts.separator "Export all categorized domains as separate hosts files per category"
17
+ opts.separator ""
18
+
19
+ opts.on("-o", "--output PATH", "Output directory path (default: cache_dir/exports/hosts or ./exports/hosts)") do |path|
20
+ options[:output_path] = path
21
+ end
22
+
23
+ opts.on("-c", "--cache-dir PATH", "Cache directory path for client initialization") do |path|
24
+ options[:cache_dir] = path
25
+ end
26
+
27
+ opts.on("-v", "--verbose", "Verbose output") do
28
+ options[:verbose] = true
29
+ end
30
+
31
+ opts.on("-h", "--help", "Show this help message") do
32
+ puts opts
33
+ exit
34
+ end
35
+ end.parse!
36
+
37
+ puts "=== UrlCategorise Hosts Export ===" if options[:verbose]
38
+ puts "Initializing client..." if options[:verbose]
39
+
40
+ begin
41
+ client = UrlCategorise::Client.new(
42
+ cache_dir: options[:cache_dir]
43
+ )
44
+
45
+ puts "Exporting hosts files..." if options[:verbose]
46
+
47
+ result = client.export_hosts_files(options[:output_path])
48
+
49
+ summary = result.delete(:_summary)
50
+
51
+ puts "\n✅ Export completed successfully!"
52
+ puts "📁 Export directory: #{summary[:export_directory]}"
53
+ puts "📊 Total categories exported: #{summary[:total_categories]}"
54
+ puts "🌐 Total domains exported: #{summary[:total_domains]}"
55
+ puts "📄 Summary file: #{summary[:path]}"
56
+
57
+ if options[:verbose]
58
+ puts "\n📋 Files created:"
59
+ result.each do |category, info|
60
+ puts " #{info[:filename]} - #{info[:count]} domains"
61
+ end
62
+ end
63
+
64
+ rescue StandardError => e
65
+ puts "❌ Error: #{e.message}"
66
+ puts e.backtrace if options[:verbose]
67
+ exit 1
68
+ end
data/bin/rake ADDED
@@ -0,0 +1,2 @@
1
+ #!/bin/bash
2
+ rake
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require './lib/url_categorise'
5
+
6
+ puts "=== Large Dataset Loading Example ==="
7
+
8
+ # Configuration for handling large datasets (300+ MB)
9
+ # First test with cache-only mode
10
+ puts "Creating client with cached datasets only..."
11
+ client = UrlCategorise::Client.new(
12
+ cache_dir: './url_cache',
13
+ auto_load_datasets: true,
14
+ smart_categorization: true,
15
+ dataset_config: {
16
+ cache_path: './url_cache/datasets',
17
+ download_path: './url_cache/downloads',
18
+ kaggle: { credentials_file: '~/kaggle.json' }
19
+ }
20
+ )
21
+
22
+ puts "Client created successfully!"
23
+ puts ""
24
+ puts "Dataset Statistics:"
25
+ puts " Total categories: #{client.count_of_categories}"
26
+ puts " Dataset categories: #{client.count_of_dataset_categories}"
27
+ puts " Blocklist categories: #{client.count_of_categories - client.count_of_dataset_categories}"
28
+ puts ""
29
+ puts " Total hosts: #{client.count_of_hosts.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse}"
30
+ puts " Dataset hosts: #{client.count_of_dataset_hosts.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse}"
31
+ puts ""
32
+ puts " Total data size: #{client.size_of_data.round(1)} MB"
33
+ puts " Dataset data size: #{client.size_of_dataset_data.round(1)} MB"
34
+ puts " Blocklist data size: #{client.size_of_blocklist_data.round(1)} MB"
35
+
36
+ puts ""
37
+ puts "Dataset-specific Statistics:"
38
+ # Get dataset metadata if available
39
+ metadata = client.dataset_metadata
40
+ if metadata && !metadata.empty?
41
+ puts " Datasets loaded: #{metadata.size}"
42
+
43
+ # Calculate size for each dataset by finding its categories and domains
44
+ dataset_categories = client.instance_variable_get(:@dataset_categories)
45
+ total_dataset_size = 0
46
+
47
+ metadata.each_with_index do |(hash, data), index|
48
+ # Estimate size contribution of this dataset
49
+ dataset_portion = data[:total_entries].to_f / metadata.values.sum { |d| d[:total_entries] }
50
+ dataset_size_mb = (client.size_of_dataset_data * dataset_portion).round(2)
51
+ total_dataset_size += dataset_size_mb
52
+
53
+ puts " Dataset #{index + 1}:"
54
+ puts " Processed at: #{data[:processed_at]}"
55
+ puts " Total entries: #{data[:total_entries].to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse}"
56
+ puts " Estimated size: #{dataset_size_mb} MB"
57
+ puts " Data hash: #{hash[0..12]}..."
58
+ end
59
+
60
+ puts ""
61
+ puts " Total dataset size: #{total_dataset_size.round(2)} MB (#{client.size_of_dataset_data.round(1)} MB actual)"
62
+ else
63
+ puts " No dataset metadata available"
64
+ end
@@ -0,0 +1,215 @@
1
+ # UrlCategorise v0.1.4 Release Notes
2
+
3
+ ## New Features Added
4
+
5
+ ### 🆕 Dataset-Specific Helper Methods
6
+
7
+ Added dedicated helper methods for tracking dataset-only statistics separately from DNS blocklists:
8
+
9
+ ```ruby
10
+ client = UrlCategorise::Client.new(dataset_config: { kaggle: {} })
11
+ client.load_kaggle_dataset('owner', 'dataset-name')
12
+
13
+ # New dataset-specific methods
14
+ client.count_of_dataset_hosts # Returns hosts from datasets only
15
+ client.count_of_dataset_categories # Returns categories from datasets only
16
+
17
+ # Existing methods still work and count both
18
+ client.count_of_hosts # All hosts (DNS lists + datasets)
19
+ client.count_of_categories # All categories (DNS lists + datasets)
20
+ ```
21
+
22
+ **Implementation Details:**
23
+ - `count_of_dataset_hosts` - Sums hosts from categories tracked in `@dataset_categories` Set
24
+ - `count_of_dataset_categories` - Returns size of `@dataset_categories` Set
25
+ - Gracefully handles nil categories with safe navigation (`&.size || 0`)
26
+ - Both methods return 0 when no datasets are loaded
27
+
28
+ ### 🆕 IAB Content Taxonomy Compliance
29
+
30
+ Full support for IAB (Interactive Advertising Bureau) Content Taxonomy standards:
31
+
32
+ ```ruby
33
+ # Enable IAB v3.0 compliance (recommended)
34
+ client = UrlCategorise::Client.new(
35
+ iab_compliance: true,
36
+ iab_version: :v3
37
+ )
38
+
39
+ # Enable IAB v2.0 compliance
40
+ client = UrlCategorise::Client.new(
41
+ iab_compliance: true,
42
+ iab_version: :v2
43
+ )
44
+
45
+ # Categorization returns IAB codes instead of custom categories
46
+ categories = client.categorise("badsite.com")
47
+ puts categories # => ["626"] (IAB v3 code for illegal content)
48
+
49
+ # Helper methods
50
+ client.iab_compliant? # => true/false
51
+ client.get_iab_mapping(:malware) # => "626" (v3) or "IAB25" (v2)
52
+ ```
53
+
54
+ **IAB Category Mappings:**
55
+
56
+ **IAB Content Taxonomy v3.0:**
57
+ - Security threats (`malware`, `phishing`, `illegal`) → `626` (Illegal Content)
58
+ - Advertising (`advertising`, `mobile_ads`) → `3` (Advertising)
59
+ - Gambling → `7-39` (Gambling subcategory)
60
+ - Adult content (`pornography`) → `626` (Adult Content)
61
+ - Social platforms → `14` (Society)
62
+ - Technology → `19` (Technology & Computing)
63
+
64
+ **IAB Content Taxonomy v2.0:**
65
+ - Security threats → `IAB25` (Non-Standard Content)
66
+ - Advertising → `IAB3` (Advertising)
67
+ - Gambling → `IAB7-39` (Gambling subcategory)
68
+ - Adult content → `IAB25-3` (Pornography)
69
+
70
+ **Implementation Details:**
71
+ - New `IabCompliance` module with comprehensive mappings
72
+ - Support for both v2.0 and v3.0 standards
73
+ - IAB compliance affects all categorization methods (`categorise`, `categorise_ip`, `resolve_and_categorise`)
74
+ - Automatic deduplication of IAB codes
75
+ - Graceful handling of unknown categories (returns 'Unknown')
76
+
77
+ ### 🧪 Comprehensive Test Coverage
78
+
79
+ Added extensive test suites for new features:
80
+
81
+ **Dataset Methods Tests:**
82
+ - `client_dataset_methods_test.rb` (9 tests)
83
+ - Tests for empty, populated, and mixed dataset scenarios
84
+ - Integration tests with existing methods
85
+ - Edge case handling (nil categories, empty arrays)
86
+
87
+ **IAB Compliance Tests:**
88
+ - `iab_compliance_test.rb` (14 tests) - Module functionality
89
+ - `client_iab_compliance_test.rb` (19 tests) - Client integration
90
+ - Comprehensive mapping validation for v2 and v3
91
+ - Integration with all categorization methods
92
+ - DNS resolution with IAB compliance
93
+ - Error handling and edge cases
94
+
95
+ **Total Test Stats:**
96
+ - **316 tests, 2455 assertions, 0 failures, 0 errors**
97
+ - **94.69% line coverage** (660/697 lines)
98
+ - All new features fully tested with edge cases
99
+
100
+ ### 🔧 Technical Implementation
101
+
102
+ **New Files:**
103
+ - `lib/url_categorise/iab_compliance.rb` - IAB mapping module
104
+ - `test/url_categorise/client_dataset_methods_test.rb` - Dataset helper tests
105
+ - `test/url_categorise/iab_compliance_test.rb` - IAB module tests
106
+ - `test/url_categorise/client_iab_compliance_test.rb` - IAB client integration tests
107
+
108
+ **Updated Files:**
109
+ - `lib/url_categorise/client.rb` - Added dataset helpers and IAB support
110
+ - `lib/url_categorise.rb` - Required new IAB module
111
+ - `lib/url_categorise/version.rb` - Bumped to 0.1.4
112
+
113
+ **New Client Attributes:**
114
+ - `iab_compliance_enabled` - Boolean flag for IAB compliance
115
+ - `iab_version` - IAB taxonomy version (:v2 or :v3)
116
+
117
+ **New Client Methods:**
118
+ - `count_of_dataset_hosts` - Dataset-specific host count
119
+ - `count_of_dataset_categories` - Dataset-specific category count
120
+ - `iab_compliant?` - Check IAB compliance status
121
+ - `get_iab_mapping(category)` - Get IAB code for category
122
+
123
+ ### 🚀 Usage Examples
124
+
125
+ **Dataset Statistics:**
126
+ ```ruby
127
+ client = UrlCategorise::Client.new(dataset_config: { kaggle: {} })
128
+ client.load_kaggle_dataset('owner', 'dataset')
129
+
130
+ puts "Total hosts: #{client.count_of_hosts}" # All sources
131
+ puts "Dataset hosts: #{client.count_of_dataset_hosts}" # Datasets only
132
+ puts "DNS list hosts: #{client.count_of_hosts - client.count_of_dataset_hosts}"
133
+ ```
134
+
135
+ **IAB Compliance:**
136
+ ```ruby
137
+ # Production environment with IAB compliance
138
+ client = UrlCategorise::Client.new(
139
+ iab_compliance: true,
140
+ iab_version: :v3,
141
+ dataset_config: { kaggle: { username: 'user', api_key: 'key' } }
142
+ )
143
+
144
+ # All methods return IAB codes
145
+ domain_cats = client.categorise("example.com") # => ["3", "626"]
146
+ ip_cats = client.categorise_ip("192.168.1.100") # => ["626"]
147
+ resolved_cats = client.resolve_and_categorise("site.com") # => ["3"]
148
+
149
+ # Check compliance
150
+ puts "IAB compliant: #{client.iab_compliant?}" # => true
151
+ puts "Using version: #{client.iab_version}" # => :v3
152
+ ```
153
+
154
+ **Rails Service Integration:**
155
+ ```ruby
156
+ class UrlCategorizerService
157
+ def initialize
158
+ @client = UrlCategorise::ActiveRecordClient.new(
159
+ iab_compliance: Rails.env.production?,
160
+ iab_version: :v3,
161
+ dataset_config: {
162
+ kaggle: {
163
+ username: ENV['KAGGLE_USERNAME'],
164
+ api_key: ENV['KAGGLE_API_KEY']
165
+ }
166
+ }
167
+ )
168
+ end
169
+
170
+ def stats
171
+ {
172
+ total_hosts: @client.count_of_hosts,
173
+ dataset_hosts: @client.count_of_dataset_hosts,
174
+ dns_list_hosts: @client.count_of_hosts - @client.count_of_dataset_hosts,
175
+ iab_compliant: @client.iab_compliant?,
176
+ iab_version: @client.iab_version
177
+ }
178
+ end
179
+ end
180
+ ```
181
+
182
+ ### 🔄 Migration Guide
183
+
184
+ **From v0.1.3 to v0.1.4:**
185
+
186
+ 1. **No Breaking Changes** - All existing code continues to work
187
+ 2. **Optional New Features** - IAB compliance and dataset helpers are opt-in
188
+ 3. **Enhanced Statistics** - Use new helper methods for better insights
189
+
190
+ **Recommended Updates:**
191
+ ```ruby
192
+ # Before (still works)
193
+ client = UrlCategorise::Client.new
194
+ puts "Total: #{client.count_of_hosts}"
195
+
196
+ # After (enhanced)
197
+ client = UrlCategorise::Client.new(
198
+ iab_compliance: true, # Optional IAB compliance
199
+ iab_version: :v3 # Optional version selection
200
+ )
201
+ puts "Total hosts: #{client.count_of_hosts}"
202
+ puts "Dataset hosts: #{client.count_of_dataset_hosts}"
203
+ puts "IAB compliant: #{client.iab_compliant?}"
204
+ ```
205
+
206
+ ### 📊 Quality Assurance
207
+
208
+ - ✅ All 316 tests pass with 0 failures/errors
208
+ - ✅ 94.69% line coverage maintained
209
+ - ✅ Code style enforced with rubocop
210
+ - ✅ Comprehensive edge case testing
211
+ - ✅ Memory-efficient implementation
212
+ - ✅ Backward compatibility preserved
214
+
215
+ This release adds powerful new features while maintaining the reliability and performance standards established in previous versions.
@@ -118,7 +118,7 @@ module UrlCategorise
118
118
  return unless UrlCategorise::Models.available?
119
119
 
120
120
  # Store list metadata
121
- @host_urls.each do |category, urls|
121
+ (host_urls || {}).each do |category, urls|
122
122
  urls.each do |url|
123
123
  next unless url.is_a?(String)
124
124