UrlCategorise 0.1.2 → 0.1.6
This diff shows the changes between publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +10 -1
- data/.gitignore +1 -0
- data/CLAUDE.md +88 -3
- data/Gemfile +2 -2
- data/Gemfile.lock +18 -9
- data/README.md +517 -4
- data/Rakefile +8 -8
- data/bin/check_lists +12 -13
- data/bin/console +3 -3
- data/bin/export_csv +83 -0
- data/bin/export_hosts +68 -0
- data/bin/rake +2 -0
- data/correct_usage_example.rb +64 -0
- data/docs/v0.1.4-features.md +215 -0
- data/lib/url_categorise/active_record_client.rb +98 -21
- data/lib/url_categorise/client.rb +641 -134
- data/lib/url_categorise/constants.rb +86 -71
- data/lib/url_categorise/dataset_processor.rb +476 -0
- data/lib/url_categorise/iab_compliance.rb +147 -0
- data/lib/url_categorise/models.rb +53 -14
- data/lib/url_categorise/version.rb +1 -1
- data/lib/url_categorise.rb +3 -0
- data/url_categorise.gemspec +37 -33
- metadata +142 -52

The hunks below are from data/lib/url_categorise/client.rb (+641 −134), the largest change in this release.
@@ -1,6 +1,9 @@
+require 'set'
+
 module UrlCategorise
   class Client < ApiPattern::Client
     include ::UrlCategorise::Constants
+    include ActiveAttr::Model

     def self.compatible_api_version
       'v2'
@@ -10,49 +13,93 @@ module UrlCategorise
       'v2 2025-08-23'
     end

-
+    attribute :host_urls, default: -> { DEFAULT_HOST_URLS }
+    attribute :cache_dir
+    attribute :force_download, type: Boolean, default: false
+    attribute :dns_servers, default: ['1.1.1.1', '1.0.0.1']
+    attribute :request_timeout, type: Integer, default: 10
+    attribute :iab_compliance_enabled, type: Boolean, default: false
+    attribute :iab_version, default: :v3
+    attribute :auto_load_datasets, type: Boolean, default: false
+    attribute :smart_categorization_enabled, type: Boolean, default: false
+    attribute :smart_rules, default: -> { {} }
+
+    attr_reader :hosts, :metadata, :dataset_processor, :dataset_categories
+
+    def initialize(**kwargs)
+      # Extract dataset_config for later use
+      dataset_config = kwargs.fetch(:dataset_config, {})
+
+      # Set ActiveAttr attributes - preserve explicitly passed values including nil
+      self.host_urls = kwargs.key?(:host_urls) ? kwargs[:host_urls] : DEFAULT_HOST_URLS
+      self.cache_dir = kwargs[:cache_dir] # will be nil if not provided or explicitly nil
+      self.force_download = kwargs.key?(:force_download) ? kwargs[:force_download] : false
+      self.dns_servers = kwargs.key?(:dns_servers) ? kwargs[:dns_servers] : ['1.1.1.1', '1.0.0.1']
+      self.request_timeout = kwargs.key?(:request_timeout) ? kwargs[:request_timeout] : 10
+      self.iab_compliance_enabled = kwargs.key?(:iab_compliance) ? kwargs[:iab_compliance] : false
+      self.iab_version = kwargs.key?(:iab_version) ? kwargs[:iab_version] : :v3
+      self.auto_load_datasets = kwargs.key?(:auto_load_datasets) ? kwargs[:auto_load_datasets] : false
+      self.smart_categorization_enabled = kwargs.key?(:smart_categorization) ? kwargs[:smart_categorization] : false
+      self.smart_rules = initialize_smart_rules(kwargs.key?(:smart_rules) ? kwargs[:smart_rules] : {})

-    def initialize(host_urls: DEFAULT_HOST_URLS, cache_dir: nil, force_download: false, dns_servers: ['1.1.1.1', '1.0.0.1'], request_timeout: 10)
-      @host_urls = host_urls
-      @cache_dir = cache_dir
-      @force_download = force_download
-      @dns_servers = dns_servers
-      @request_timeout = request_timeout
       @metadata = {}
+      @dataset_categories = Set.new # Track which categories come from datasets
+
+      # Initialize dataset processor if config provided
+      @dataset_processor = initialize_dataset_processor(dataset_config) unless dataset_config.empty?
+
       @hosts = fetch_and_build_host_lists
+
+      # Auto-load datasets from constants if enabled
+      load_datasets_from_constants if auto_load_datasets && @dataset_processor
     end

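The rewrite above swaps the fixed keyword-argument initializer for ActiveAttr attributes fed from `**kwargs`. A minimal construction sketch (values are illustrative; note that the kwargs `iab_compliance` and `smart_categorization` are stored under the attribute names `iab_compliance_enabled` and `smart_categorization_enabled`):

```ruby
require 'url_categorise'

# Every key below mirrors a kwarg read in the new initializer.
client = UrlCategorise::Client.new(
  cache_dir: '/tmp/url_categorise',    # nil (the default) disables list caching
  force_download: false,               # reuse cached lists while they are fresh
  dns_servers: ['1.1.1.1', '1.0.0.1'],
  request_timeout: 10,
  iab_compliance: true,                # stored as iab_compliance_enabled
  iab_version: :v3,
  smart_categorization: true           # stored as smart_categorization_enabled
)
```
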
     def categorise(url)
       host = (URI.parse(url).host || url).downcase
-      host = host.gsub(
+      host = host.gsub('www.', '')

-      @hosts.keys.select do |category|
+      categories = @hosts.keys.select do |category|
         @hosts[category].any? do |blocked_host|
           host == blocked_host || host.end_with?(".#{blocked_host}")
         end
       end
+
+      # Apply smart categorization if enabled
+      categories = apply_smart_categorization(url, categories) if smart_categorization_enabled
+
+      if iab_compliance_enabled
+        IabCompliance.get_iab_categories(categories, iab_version)
+      else
+        categories
+      end
     end

     def categorise_ip(ip_address)
-      @hosts.keys.select do |category|
+      categories = @hosts.keys.select do |category|
         @hosts[category].include?(ip_address)
       end
+
+      if iab_compliance_enabled
+        IabCompliance.get_iab_categories(categories, iab_version)
+      else
+        categories
+      end
     end

     def resolve_and_categorise(domain)
       categories = categorise(domain)
-
+
       begin
-        resolver = Resolv::DNS.new(nameserver:
+        resolver = Resolv::DNS.new(nameserver: dns_servers)
         ip_addresses = resolver.getaddresses(domain).map(&:to_s)
-
+
         ip_addresses.each do |ip|
           categories.concat(categorise_ip(ip))
         end
-      rescue
+      rescue StandardError
         # DNS resolution failed, return domain categories only
       end
-
+
       categories.uniq
     end

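With the changes above, `categorise` funnels its raw category list through the optional smart-categorization and IAB-compliance steps, and `categorise_ip` through the IAB step. A usage sketch (the URL, IP, and domain are placeholders):

```ruby
client = UrlCategorise::Client.new(iab_compliance: true, iab_version: :v3)

client.categorise('https://www.example.com/page')
# => IAB category codes when IAB compliance is enabled,
#    otherwise the raw category symbols matched in the host lists

client.categorise_ip('203.0.113.7')           # matches lists that contain bare IPs
client.resolve_and_categorise('example.com')  # domain categories plus the
                                              # categories of its resolved IPs
```
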
@@ -70,59 +117,113 @@ module UrlCategorise
       hash_size_in_mb(@hosts)
     end

+    def size_of_dataset_data
+      dataset_hosts = {}
+      @dataset_categories.each do |category|
+        dataset_hosts[category] = @hosts[category] || []
+      end
+      hash_size_in_mb(dataset_hosts)
+    end
+
+    def size_of_blocklist_data
+      blocklist_hosts = {}
+      @hosts.each do |category, domains|
+        blocklist_hosts[category] = domains unless @dataset_categories.include?(category)
+      end
+      hash_size_in_mb(blocklist_hosts)
+    end
+
+    def size_of_data_bytes
+      hash_size_in_bytes(@hosts)
+    end
+
+    def size_of_dataset_data_bytes
+      dataset_hosts = {}
+      @dataset_categories.each do |category|
+        dataset_hosts[category] = @hosts[category] || []
+      end
+      hash_size_in_bytes(dataset_hosts)
+    end
+
+    def size_of_blocklist_data_bytes
+      blocklist_hosts = {}
+      @hosts.each do |category, domains|
+        blocklist_hosts[category] = domains unless @dataset_categories.include?(category)
+      end
+      hash_size_in_bytes(blocklist_hosts)
+    end
+
+    def count_of_dataset_hosts
+      @dataset_categories.map do |category|
+        @hosts[category]&.size || 0
+      end.sum
+    end
+
+    def count_of_dataset_categories
+      @dataset_categories.size
+    end
+
+    def iab_compliant?
+      iab_compliance_enabled
+    end
+
+    def get_iab_mapping(category)
+      return nil unless iab_compliance_enabled
+
+      IabCompliance.map_category_to_iab(category, iab_version)
+    end
+
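The new sizing and counting helpers split the in-memory data by origin, using the `@dataset_categories` set that `integrate_dataset` (in a later hunk) populates. A quick sketch:

```ruby
client.size_of_data                  # MB across all categories
client.size_of_dataset_data          # MB for dataset-derived categories only
client.size_of_blocklist_data        # MB for blocklist-derived categories only
client.size_of_data_bytes            # the same split is also available in bytes
client.count_of_dataset_hosts        # hosts contributed by datasets
client.count_of_dataset_categories
client.iab_compliant?                # true when iab_compliance was enabled
client.get_iab_mapping(:gambling)    # nil unless IAB compliance is enabled;
                                     # :gambling is an illustrative category
```
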
     def check_all_lists
-      puts
-
+      puts 'Checking all lists in constants...'
+
       unreachable_lists = {}
       missing_categories = []
       successful_lists = {}
-
-
+
+      (host_urls || {}).each do |category, urls|
         puts "\nChecking category: #{category}"
-
+
         if urls.empty?
           missing_categories << category
-          puts
+          puts '  ❌ No URLs defined for category'
           next
         end
-
+
         unreachable_lists[category] = []
         successful_lists[category] = []
-
+
         urls.each do |url|
           # Skip symbol references (combined categories)
           if url.is_a?(Symbol)
             puts "  ➡️ References other category: #{url}"
             next
           end
-
+
           unless url_valid?(url)
-            unreachable_lists[category] << { url: url, error:
+            unreachable_lists[category] << { url: url, error: 'Invalid URL format' }
             puts "  ❌ Invalid URL format: #{url}"
             next
           end
-
+
           print "  🔍 Testing #{url}... "
-
+
           begin
-            response = HTTParty.head(url, timeout:
-
+            response = HTTParty.head(url, timeout: request_timeout, follow_redirects: true)
+
             case response.code
             when 200
-              puts
+              puts '✅ OK'
               successful_lists[category] << url
             when 301, 302, 307, 308
               puts "↗️ Redirect (#{response.code})"
-              if response.headers['location']
-                puts "    Redirects to: #{response.headers['location']}"
-              end
+              puts "    Redirects to: #{response.headers['location']}" if response.headers['location']
               successful_lists[category] << url
             when 404
-              puts
-              unreachable_lists[category] << { url: url, error:
+              puts '❌ Not Found (404)'
+              unreachable_lists[category] << { url: url, error: '404 Not Found' }
            when 403
-              puts
-              unreachable_lists[category] << { url: url, error:
+              puts '❌ Forbidden (403)'
+              unreachable_lists[category] << { url: url, error: '403 Forbidden' }
             when 500..599
               puts "❌ Server Error (#{response.code})"
               unreachable_lists[category] << { url: url, error: "Server Error #{response.code}" }
@@ -130,51 +231,50 @@ module UrlCategorise
               puts "⚠️ Unexpected response (#{response.code})"
               unreachable_lists[category] << { url: url, error: "HTTP #{response.code}" }
             end
-
           rescue Timeout::Error
-            puts
-            unreachable_lists[category] << { url: url, error:
+            puts '❌ Timeout'
+            unreachable_lists[category] << { url: url, error: 'Request timeout' }
           rescue SocketError => e
-            puts
+            puts '❌ DNS/Network Error'
            unreachable_lists[category] << { url: url, error: "DNS/Network: #{e.message}" }
           rescue HTTParty::Error, Net::HTTPError => e
-            puts
+            puts '❌ HTTP Error'
            unreachable_lists[category] << { url: url, error: "HTTP Error: #{e.message}" }
           rescue StandardError => e
            puts "❌ Error: #{e.class}"
            unreachable_lists[category] << { url: url, error: "#{e.class}: #{e.message}" }
           end
-
+
           # Small delay to be respectful to servers
           sleep(0.1)
         end
-
+
         # Remove empty arrays
         unreachable_lists.delete(category) if unreachable_lists[category].empty?
         successful_lists.delete(category) if successful_lists[category].empty?
       end
-
+
       # Generate summary report
-      puts "\n" +
-      puts
-      puts
-
+      puts "\n" + '=' * 80
+      puts 'LIST HEALTH REPORT'
+      puts '=' * 80
+
       puts "\n📊 SUMMARY:"
-      total_categories =
+      total_categories = (host_urls || {}).keys.length
       categories_with_issues = unreachable_lists.keys.length + missing_categories.length
       categories_healthy = total_categories - categories_with_issues
-
+
       puts "  Total categories: #{total_categories}"
       puts "  Healthy categories: #{categories_healthy}"
       puts "  Categories with issues: #{categories_with_issues}"
-
+
       if missing_categories.any?
         puts "\n❌ CATEGORIES WITH NO URLS (#{missing_categories.length}):"
         missing_categories.each do |category|
           puts "  - #{category}"
         end
       end
-
+
       if unreachable_lists.any?
         puts "\n❌ UNREACHABLE LISTS:"
         unreachable_lists.each do |category, failed_urls|
@@ -185,15 +285,15 @@ module UrlCategorise
           end
         end
       end
-
+
       puts "\n✅ WORKING CATEGORIES (#{successful_lists.keys.length}):"
       successful_lists.keys.sort.each do |category|
         url_count = successful_lists[category].length
         puts "  - #{category} (#{url_count} URL#{'s' if url_count != 1})"
       end
-
-      puts "\n" +
-
+
+      puts "\n" + '=' * 80
+
       # Return structured data for programmatic use
       {
         summary: {
@@ -207,23 +307,363 @@ module UrlCategorise
       }
     end

+    def load_kaggle_dataset(dataset_owner, dataset_name, options = {})
+      raise Error, 'Dataset processor not configured' unless @dataset_processor
+
+      default_options = { use_cache: true, integrate_data: true }
+      merged_options = default_options.merge(options)
+
+      dataset = @dataset_processor.process_kaggle_dataset(dataset_owner, dataset_name, merged_options)
+
+      if merged_options[:integrate_data]
+        integrate_dataset(dataset, merged_options[:category_mappings] || {})
+      else
+        dataset
+      end
+    end
+
+    def load_csv_dataset(url, options = {})
+      raise Error, 'Dataset processor not configured' unless @dataset_processor
+
+      default_options = { use_cache: true, integrate_data: true }
+      merged_options = default_options.merge(options)
+
+      dataset = @dataset_processor.process_csv_dataset(url, merged_options)
+
+      if merged_options[:integrate_data]
+        integrate_dataset(dataset, merged_options[:category_mappings] || {})
+      else
+        dataset
+      end
+    end
+
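Both loaders require a configured dataset processor and default to `use_cache: true, integrate_data: true`, merging results into the client's host data via `integrate_dataset` (shown later). A hedged sketch; the Kaggle slug and CSV URL are placeholders, and the `category_mappings` shape is defined by `DatasetProcessor`, which this diff section does not show:

```ruby
client = UrlCategorise::Client.new(
  cache_dir: '/tmp/url_categorise',
  dataset_config: { download_path: '/tmp/downloads', cache_path: '/tmp/datasets' }
)

# Kaggle datasets are addressed as "<owner>/<name>", split into two arguments.
client.load_kaggle_dataset('some-owner', 'some-dataset')

# With integrate_data: false the processed dataset is returned rather than
# merged into the client's categories.
dataset = client.load_csv_dataset('https://example.com/domains.csv',
                                  integrate_data: false)
```
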
+    def dataset_metadata
+      return {} unless @dataset_processor
+
+      @dataset_metadata || {}
+    end
+
+    def reload_with_datasets
+      # Store dataset categories before reload (only those that were added via integrate_dataset)
+      dataset_category_data = {}
+      if @hosts
+        @dataset_categories.each do |category|
+          dataset_category_data[category] = @hosts[category].dup if @hosts[category]
+        end
+      end
+
+      @hosts = fetch_and_build_host_lists
+
+      # Restore dataset categories
+      dataset_category_data.each do |category, domains|
+        @hosts[category] ||= []
+        @hosts[category].concat(domains).uniq!
+      end
+
+      # Reload datasets from constants if auto-loading is enabled
+      load_datasets_from_constants if auto_load_datasets && @dataset_processor
+
+      self
+    end
+
+    def export_hosts_files(output_path = nil)
+      export_dir = output_path || (cache_dir ? File.join(cache_dir, 'exports', 'hosts') : File.join(Dir.pwd, 'exports', 'hosts'))
+
+      FileUtils.mkdir_p(export_dir) unless Dir.exist?(export_dir)
+
+      exported_files = {}
+
+      @hosts.each do |category, domains|
+        next if domains.empty?
+
+        filename = "#{category}.hosts"
+        file_path = File.join(export_dir, filename)
+
+        File.open(file_path, 'w') do |file|
+          file.puts "# #{category.to_s.gsub('_', ' ').split.map(&:capitalize).join(' ')} - Generated by UrlCategorise"
+          file.puts "# Generated at: #{Time.now}"
+          file.puts "# Total entries: #{domains.length}"
+          file.puts ""
+
+          domains.sort.each do |domain|
+            file.puts "0.0.0.0 #{domain}"
+          end
+        end
+
+        exported_files[category] = {
+          path: file_path,
+          filename: filename,
+          count: domains.length
+        }
+      end
+
+      # Create summary file
+      summary_path = File.join(export_dir, '_export_summary.txt')
+      File.open(summary_path, 'w') do |file|
+        file.puts "UrlCategorise Hosts Export Summary"
+        file.puts "=================================="
+        file.puts "Generated at: #{Time.now}"
+        file.puts "Export directory: #{export_dir}"
+        file.puts "Total categories: #{exported_files.keys.length}"
+        file.puts "Total domains: #{@hosts.values.map(&:length).sum}"
+        file.puts ""
+        file.puts "Files created:"
+
+        exported_files.each do |category, info|
+          file.puts "  #{info[:filename]} - #{info[:count]} domains"
+        end
+      end
+
+      exported_files[:_summary] = {
+        path: summary_path,
+        total_categories: exported_files.keys.length,
+        total_domains: @hosts.values.map(&:length).sum,
+        export_directory: export_dir
+      }
+
+      exported_files
+    end
+
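`export_hosts_files` writes one `<category>.hosts` file per non-empty category in `0.0.0.0` sink-hole format (one `0.0.0.0 domain` line per domain), plus an `_export_summary.txt` roll-up. A sketch of the return shape (the `:advertising` key is illustrative):

```ruby
client = UrlCategorise::Client.new(cache_dir: '/tmp/url_categorise')

files = client.export_hosts_files('/tmp/exports')  # argument optional

files[:advertising][:path]        # => "/tmp/exports/advertising.hosts"
files[:advertising][:count]       # domains written to that file
files[:_summary][:total_domains]  # roll-up entry for the summary file
```
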
+    def export_csv_data(output_path = nil)
+      require 'csv'
+
+      export_dir = output_path || (cache_dir ? File.join(cache_dir, 'exports', 'csv') : File.join(Dir.pwd, 'exports', 'csv'))
+
+      FileUtils.mkdir_p(export_dir) unless Dir.exist?(export_dir)
+
+      filename = "url_categorise_data_export_#{Time.now.strftime('%Y%m%d_%H%M%S')}.csv"
+      file_path = File.join(export_dir, filename)
+
+      CSV.open(file_path, 'w', headers: true) do |csv|
+        # Add headers
+        csv << [
+          'domain',
+          'category',
+          'source_type',
+          'is_dataset_category',
+          'iab_category_v2',
+          'iab_category_v3',
+          'export_timestamp',
+          'smart_categorization_enabled'
+        ]
+
+        # Export all host/category data
+        @hosts.each do |category, domains|
+          domains.each do |domain|
+            source_type = @dataset_categories.include?(category) ? 'dataset' : 'blocklist'
+            is_dataset_category = @dataset_categories.include?(category)
+
+            # Get IAB mappings if compliance is enabled
+            iab_v2 = nil
+            iab_v3 = nil
+            if iab_compliance_enabled
+              iab_v2 = IabCompliance.map_category_to_iab(category, :v2)
+              iab_v3 = IabCompliance.map_category_to_iab(category, :v3)
+            end
+
+            csv << [
+              domain,
+              category,
+              source_type,
+              is_dataset_category,
+              iab_v2,
+              iab_v3,
+              Time.now.iso8601,
+              smart_categorization_enabled
+            ]
+          end
+        end
+      end
+
+      # Create metadata file
+      metadata_path = File.join(export_dir, "#{File.basename(filename, '.csv')}_metadata.json")
+      metadata = {
+        export_info: {
+          timestamp: Time.now.iso8601,
+          filename: filename,
+          file_path: file_path,
+          metadata_path: metadata_path
+        },
+        client_settings: {
+          iab_compliance_enabled: iab_compliance_enabled,
+          iab_version: iab_version,
+          smart_categorization_enabled: smart_categorization_enabled,
+          auto_load_datasets: auto_load_datasets
+        },
+        data_summary: {
+          total_domains: @hosts.values.map(&:length).sum,
+          total_categories: @hosts.keys.length,
+          dataset_categories_count: @dataset_categories.size,
+          blocklist_categories_count: @hosts.keys.length - @dataset_categories.size,
+          categories: @hosts.keys.sort.map(&:to_s)
+        },
+        dataset_metadata: dataset_metadata
+      }
+
+      File.write(metadata_path, JSON.pretty_generate(metadata))
+
+      {
+        csv_file: file_path,
+        metadata_file: metadata_path,
+        summary: metadata[:data_summary],
+        export_directory: export_dir
+      }
+    end
+
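`export_csv_data` produces a timestamped CSV with one row per (domain, category) pair plus a JSON metadata sidecar. A usage sketch:

```ruby
result = client.export_csv_data   # defaults under cache_dir, else Dir.pwd

result[:csv_file]                 # url_categorise_data_export_<timestamp>.csv
result[:metadata_file]            # JSON sidecar: client settings + data summary
result[:summary][:total_domains]
result[:summary][:dataset_categories_count]
```
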
     private

+    def initialize_dataset_processor(config)
+      processor_config = {
+        download_path: config[:download_path] || cache_dir&.+(File::SEPARATOR + 'downloads'),
+        cache_path: config[:cache_path] || cache_dir&.+(File::SEPARATOR + 'datasets'),
+        timeout: config[:timeout] || request_timeout,
+        enable_kaggle: config.fetch(:enable_kaggle, true) # Default to true for backwards compatibility
+      }
+
+      # Add Kaggle credentials if provided and Kaggle is enabled
+      if config[:kaggle] && processor_config[:enable_kaggle]
+        kaggle_config = config[:kaggle]
+        processor_config.merge!({
+          username: kaggle_config[:username],
+          api_key: kaggle_config[:api_key],
+          credentials_file: kaggle_config[:credentials_file]
+        })
+      end
+
+      DatasetProcessor.new(**processor_config)
+    rescue Error => e
+      # Dataset processor failed to initialize, but client can still work without it
+      puts "Warning: Dataset processor initialization failed: #{e.message}" if ENV['DEBUG']
+      nil
+    end
+
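`initialize_dataset_processor` documents the full `dataset_config` shape the constructor accepts. A sketch with every recognised key (the environment variable names are an assumption, not part of this diff). Because the processor is built inside a `rescue Error` block, a misconfigured processor degrades to `nil` and the client still works with plain blocklists:

```ruby
client = UrlCategorise::Client.new(
  cache_dir: '/tmp/url_categorise',
  dataset_config: {
    download_path: '/tmp/url_categorise/downloads',  # default: <cache_dir>/downloads
    cache_path: '/tmp/url_categorise/datasets',      # default: <cache_dir>/datasets
    timeout: 30,                                     # default: request_timeout
    enable_kaggle: true,                             # default: true
    kaggle: {
      username: ENV['KAGGLE_USERNAME'],              # assumed env var names; a
      api_key: ENV['KAGGLE_KEY']                     # credentials_file: also works
    }
  }
)
```
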
+    def integrate_dataset(dataset, category_mappings)
+      return dataset unless @dataset_processor
+      return nil unless dataset # Handle nil datasets gracefully
+
+      categorized_data = @dataset_processor.integrate_dataset_into_categorization(dataset, category_mappings)
+
+      # Store metadata
+      @dataset_metadata ||= {}
+      @dataset_metadata[categorized_data[:_metadata][:data_hash]] = categorized_data[:_metadata]
+
+      # Remove metadata from the working data
+      categorized_data.delete(:_metadata)
+
+      # Merge with existing host data
+      categorized_data.each do |category, domains|
+        next if category.to_s.start_with?('_') # Skip internal keys
+
+        # Convert category to symbol for consistency
+        category_sym = category.to_sym
+        @hosts[category_sym] ||= []
+        @hosts[category_sym].concat(domains).uniq!
+
+        # Track this as a dataset category
+        @dataset_categories.add(category_sym)
+      end
+
+      dataset
+    end
+
+    def load_datasets_from_constants
+      return unless defined?(CATEGORIY_DATABASES) && CATEGORIY_DATABASES.is_a?(Array)
+      return unless @dataset_processor
+
+      puts "Loading #{CATEGORIY_DATABASES.length} datasets from constants..." if ENV['DEBUG']
+      loaded_count = 0
+
+      CATEGORIY_DATABASES.each do |dataset_config|
+        begin
+          case dataset_config[:type]
+          when :kaggle
+            # Parse the kaggle path to get owner and dataset name
+            path_parts = dataset_config[:path].split('/')
+            next unless path_parts.length == 2
+
+            dataset_owner, dataset_name = path_parts
+
+            # Check if dataset is already cached before attempting to load
+            cache_key = @dataset_processor.send(:generate_cache_key, "#{dataset_owner}/#{dataset_name}", :kaggle)
+            cache_file = File.join(@dataset_processor.cache_path, cache_key)
+
+            if File.exist?(cache_file)
+              puts "Loading cached Kaggle dataset: #{dataset_owner}/#{dataset_name}" if ENV['DEBUG']
+              load_kaggle_dataset(dataset_owner, dataset_name, {
+                use_cache: true,
+                integrate_data: true
+              })
+              loaded_count += 1
+            else
+              puts "Attempting to download missing Kaggle dataset: #{dataset_owner}/#{dataset_name}" if ENV['DEBUG']
+              begin
+                load_kaggle_dataset(dataset_owner, dataset_name, {
+                  use_cache: true,
+                  integrate_data: true
+                })
+                loaded_count += 1
+              rescue Error => e
+                puts "Warning: Failed to download Kaggle dataset #{dataset_owner}/#{dataset_name}: #{e.message}" if ENV['DEBUG']
+              end
+            end
+          when :csv
+            # Check if CSV dataset is cached
+            cache_key = @dataset_processor.send(:generate_cache_key, dataset_config[:path], :csv)
+            cache_file = File.join(@dataset_processor.cache_path, cache_key)
+
+            if File.exist?(cache_file)
+              puts "Loading cached CSV dataset: #{dataset_config[:path]}" if ENV['DEBUG']
+              load_csv_dataset(dataset_config[:path], {
+                use_cache: true,
+                integrate_data: true
+              })
+              loaded_count += 1
+            else
+              puts "Attempting to download missing CSV dataset: #{dataset_config[:path]}" if ENV['DEBUG']
+              begin
+                load_csv_dataset(dataset_config[:path], {
+                  use_cache: true,
+                  integrate_data: true
+                })
+                loaded_count += 1
+              rescue Error => e
+                puts "Warning: Failed to download CSV dataset #{dataset_config[:path]}: #{e.message}" if ENV['DEBUG']
+              end
+            end
+          end
+        rescue Error => e
+          puts "Warning: Failed to load dataset #{dataset_config[:path]}: #{e.message}" if ENV['DEBUG']
+          # Continue loading other datasets even if one fails
+        rescue StandardError => e
+          puts "Warning: Unexpected error loading dataset #{dataset_config[:path]}: #{e.message}" if ENV['DEBUG']
+          # Continue loading other datasets even if one fails
+        end
+      end
+
+      puts "Finished loading datasets from constants (#{loaded_count}/#{CATEGORIY_DATABASES.length} loaded)" if ENV['DEBUG']
+    end
+
     def hash_size_in_mb(hash)
+      size_bytes = hash_size_in_bytes(hash)
+      (size_bytes / ONE_MEGABYTE.to_f).round(2)
+    end
+
+    def hash_size_in_bytes(hash)
       size = 0
+      hash.each do |_key, value|
+        next unless value.is_a?(Array)

-      hash.each do |key, value|
         size += value.join.length
       end
-
-      (size / ONE_MEGABYTE).round(2)
+      size
     end

     def fetch_and_build_host_lists
       @hosts = {}

-      host_urls.keys.each do |category|
-        @hosts[category] = build_host_data(host_urls[category])
+      (host_urls || {}).keys.each do |category|
+        @hosts[category] = build_host_data((host_urls || {})[category])
       end

       sub_category_values = categories_with_keys
@@ -241,76 +681,145 @@ module UrlCategorise
       @hosts
     end

+    def initialize_smart_rules(custom_rules)
+      custom_rules = {} if custom_rules.nil?
+      default_rules = {
+        social_media_platforms: {
+          domains: %w[reddit.com facebook.com twitter.com x.com instagram.com linkedin.com
+                      pinterest.com tiktok.com youtube.com snapchat.com discord.com],
+          remove_categories: %i[health_and_fitness forums news technology education
+                                business finance entertainment travel sports politics
+                                science music art food_and_drink shopping gaming]
+        },
+        search_engines: {
+          domains: %w[google.com bing.com yahoo.com duckduckgo.com baidu.com yandex.com],
+          remove_categories: %i[news shopping travel health_and_fitness finance technology]
+        },
+        video_platforms: {
+          domains: %w[youtube.com vimeo.com dailymotion.com twitch.tv],
+          remove_categories: %i[education news entertainment music sports gaming]
+        },
+        news_aggregators: {
+          domains: %w[reddit.com digg.com],
+          keep_primary_only: %i[social_media reddit digg]
+        }
+      }
+
+      # Merge custom rules with defaults
+      default_rules.merge(custom_rules)
+    end
+
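Custom `smart_rules` are merged over these defaults with `Hash#merge`, which is shallow: a custom rule that reuses a default key replaces that rule wholesale rather than deep-merging into it. A sketch of a custom rule (the rule name and domains are illustrative; the recognised keys are the ones `apply_rule` below reads — `remove_categories`, `keep_primary_only`, `add_categories_by_path`, and `allowed_categories_only`):

```ruby
client = UrlCategorise::Client.new(
  smart_categorization: true,
  smart_rules: {
    developer_hubs: {                    # hypothetical rule name
      domains: %w[github.com gitlab.com],
      remove_categories: %i[forums news],
      add_categories_by_path: { %r{/issues} => %i[forums] }
    }
  }
)
```
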
+    def apply_smart_categorization(url, categories)
+      return categories unless smart_categorization_enabled
+
+      host = extract_host(url)
+
+      smart_rules.each do |_rule_name, rule_config|
+        if rule_config[:domains]&.any? { |domain| host == domain || host.end_with?(".#{domain}") }
+          categories = apply_rule(categories, rule_config, host, url)
+        end
+      end
+
+      categories
+    end
+
+    def apply_rule(categories, rule_config, _host, url)
+      # Rule: Remove overly broad categories for specific platforms
+      if rule_config[:remove_categories]
+        categories = categories.reject { |cat| rule_config[:remove_categories].include?(cat) }
+      end
+
+      # Rule: Keep only primary categories
+      if rule_config[:keep_primary_only]
+        primary_categories = categories & rule_config[:keep_primary_only]
+        categories = primary_categories if primary_categories.any?
+      end
+
+      # Rule: Add specific categories based on URL patterns
+      if rule_config[:add_categories_by_path]
+        rule_config[:add_categories_by_path].each do |path_pattern, additional_categories|
+          categories = (categories + additional_categories).uniq if url.match?(path_pattern)
+        end
+      end
+
+      # Rule: Remove all categories except allowed ones
+      categories &= rule_config[:allowed_categories_only] if rule_config[:allowed_categories_only]
+
+      categories
+    end
+
+    def extract_host(url)
+      (URI.parse(url).host || url).downcase.gsub('www.', '')
+    rescue URI::InvalidURIError
+      url.downcase.gsub('www.', '')
+    end
+
     def build_host_data(urls)
       all_hosts = []
-
+
       urls.each do |url|
         next unless url_valid?(url)
-
         hosts_data = nil
-
-        if
-
-        end
-
+
+        hosts_data = read_from_cache(url) if cache_dir && !force_download
+
         if hosts_data.nil?
           hosts_data = download_and_parse_list(url)
-          save_to_cache(url, hosts_data) if
+          save_to_cache(url, hosts_data) if cache_dir
         end
-
+
         all_hosts.concat(hosts_data) if hosts_data
       end
-
+
       all_hosts.compact.sort.uniq
     end

     def download_and_parse_list(url)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return []
-      end
+      raw_data = HTTParty.get(url, timeout: request_timeout)
+      return [] if raw_data.body.nil? || raw_data.body.empty?
+
+      # Store metadata
+      etag = raw_data.headers['etag']
+      last_modified = raw_data.headers['last-modified']
+      @metadata[url] = {
+        last_updated: Time.now,
+        etag: etag,
+        last_modified: last_modified,
+        content_hash: Digest::SHA256.hexdigest(raw_data.body),
+        status: 'success'
+      }
+
+      parse_list_content(raw_data.body, detect_list_format(raw_data.body))
+    rescue HTTParty::Error, Net::HTTPError, SocketError, Timeout::Error, URI::InvalidURIError, StandardError => e
+      # Log the error but continue with other lists
+      @metadata[url] = {
+        last_updated: Time.now,
+        error: e.message,
+        status: 'failed'
+      }
+      []
     end

     def parse_list_content(content, format)
       lines = content.split("\n").reject { |line| line.empty? || line.strip.start_with?('#') }
-
+
       case format
       when :hosts
-        lines.map
+        lines.map do |line|
           parts = line.split(' ')
           # Extract domain from hosts format: "0.0.0.0 domain.com" -> "domain.com"
           parts.length >= 2 ? parts[1].strip : nil
-
+        end.compact.reject(&:empty?)
       when :plain
         lines.map(&:strip)
       when :dnsmasq
-        lines.map
-          match = line.match(
+        lines.map do |line|
+          match = line.match(%r{address=/(.+?)/})
           match ? match[1] : nil
-
+        end.compact
       when :ublock
-        lines.map { |line| line.gsub(/^\|\|/, '').gsub(/[
+        lines.map { |line| line.gsub(/^\|\|/, '').gsub(/[$\^].*$/, '').strip }.reject(&:empty?)
       else
         lines.map(&:strip)
       end
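For reference, one line in each of the four list formats that `detect_list_format` (next hunk) distinguishes and `parse_list_content` parses; the sample domain is illustrative:

```ruby
SAMPLE_LINES = {
  hosts:   '0.0.0.0 ads.example.com',          # -> "ads.example.com"
  plain:   'ads.example.com',                  # -> used as-is
  dnsmasq: 'address=/ads.example.com/0.0.0.0', # -> captured by %r{address=/(.+?)/}
  ublock:  '||ads.example.com^'                # -> "||" prefix and "^..." tail stripped
}.freeze
```
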
@@ -319,88 +828,86 @@ module UrlCategorise
     def detect_list_format(content)
       # Skip comments and empty lines, then look at first 20 non-comment lines
       sample_lines = content.split("\n")
-
-
-
+                            .reject { |line| line.empty? || line.strip.start_with?('#') }
+                            .first(20)
+
       return :hosts if sample_lines.any? { |line| line.match(/^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s+/) }
       return :dnsmasq if sample_lines.any? { |line| line.include?('address=/') }
       return :ublock if sample_lines.any? { |line| line.match(/^\|\|/) }
-
+
       :plain
     end

     def cache_file_path(url)
-      return nil unless
-
-      FileUtils.mkdir_p(
+      return nil unless cache_dir
+
+      FileUtils.mkdir_p(cache_dir) unless Dir.exist?(cache_dir)
       filename = Digest::MD5.hexdigest(url) + '.cache'
-      File.join(
+      File.join(cache_dir, filename)
     end

     def read_from_cache(url)
       cache_file = cache_file_path(url)
       return nil unless cache_file && File.exist?(cache_file)
-
+
       cache_data = Marshal.load(File.read(cache_file))
-
+
       # Check if we should update based on hash or time
-      if should_update_cache?(url, cache_data)
-
-      end
-
+      return nil if should_update_cache?(url, cache_data)
+
       cache_data[:hosts]
-    rescue
+    rescue StandardError
       nil
     end

     def save_to_cache(url, hosts_data)
       cache_file = cache_file_path(url)
       return unless cache_file
-
+
       cache_data = {
         hosts: hosts_data,
         metadata: @metadata[url],
         cached_at: Time.now
       }
-
+
       File.write(cache_file, Marshal.dump(cache_data))
-    rescue
+    rescue StandardError
       # Cache save failed, continue without caching
     end

     def should_update_cache?(url, cache_data)
-      return true if
+      return true if force_download
       return true unless cache_data[:metadata]
-
+
       # Update if cache is older than 24 hours
       cache_age = Time.now - cache_data[:cached_at]
       return true if cache_age > 24 * 60 * 60
-
+
       # Check if remote content has changed
       begin
-        head_response = HTTParty.head(url, timeout:
+        head_response = HTTParty.head(url, timeout: request_timeout)
         remote_etag = head_response.headers['etag']
         remote_last_modified = head_response.headers['last-modified']
-
+
         cached_metadata = cache_data[:metadata]
-
+
         return true if remote_etag && cached_metadata[:etag] && remote_etag != cached_metadata[:etag]
-
+        if remote_last_modified && cached_metadata[:last_modified] && remote_last_modified != cached_metadata[:last_modified]
+          return true
+        end
       rescue HTTParty::Error, Net::HTTPError, SocketError, Timeout::Error, URI::InvalidURIError, StandardError
         # If HEAD request fails, assume we should update
         return true
       end
-
+
       false
     end

-    private
-
     def categories_with_keys
       keyed_categories = {}

-      host_urls.keys.each do |category|
-        category_values = host_urls[category].select do |url|
+      (host_urls || {}).keys.each do |category|
+        category_values = (host_urls || {})[category].select do |url|
           url.is_a?(Symbol)
         end
