UrlCategorise 0.0.3 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +13 -0
- data/.github/workflows/ci.yml +57 -0
- data/.ruby-version +1 -0
- data/CLAUDE.md +134 -0
- data/Gemfile.lock +127 -67
- data/README.md +553 -27
- data/Rakefile +2 -0
- data/bin/check_lists +48 -0
- data/docs/.keep +2 -0
- data/docs/v0.1-context.md +115 -0
- data/lib/url_categorise/active_record_client.rb +118 -0
- data/lib/url_categorise/client.rb +336 -24
- data/lib/url_categorise/constants.rb +52 -4
- data/lib/url_categorise/models.rb +105 -0
- data/lib/url_categorise/version.rb +1 -1
- data/lib/url_categorise.rb +11 -0
- data/url_categorise.gemspec +22 -9
- metadata +215 -27
data/README.md
CHANGED
@@ -1,5 +1,19 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# UrlCategorise
|
2
|
+
|
3
|
+
A comprehensive Ruby gem for categorizing URLs and domains based on various security and content blocklists. It downloads and processes multiple types of lists to provide domain categorization across many categories including malware, phishing, advertising, tracking, gambling, and more.
|
4
|
+
|
5
|
+
## Features
|
6
|
+
|
7
|
+
- **Comprehensive Coverage**: 60+ high-quality categories including security, content, and specialized lists
|
8
|
+
- **Multiple List Formats**: Supports hosts files, pfSense, AdSense, uBlock Origin, dnsmasq, and plain text formats
|
9
|
+
- **Intelligent Caching**: Hash-based file update detection with configurable local cache
|
10
|
+
- **DNS Resolution**: Resolve domains to IPs and check against IP-based blocklists
|
11
|
+
- **High-Quality Sources**: Integrates lists from HaGeZi, StevenBlack, The Block List Project, and specialized security feeds
|
12
|
+
- **ActiveRecord Integration**: Optional database storage for high-performance lookups
|
13
|
+
- **IP Categorization**: Support for IP address and subnet-based categorization
|
14
|
+
- **Metadata Tracking**: Track last update times, ETags, and content hashes
|
15
|
+
- **Health Monitoring**: Automatic detection and removal of broken blocklist sources
|
16
|
+
- **List Validation**: Built-in tools to verify all configured URLs are accessible
|
3
17
|
|
4
18
|
## Installation
|
5
19
|
|
@@ -15,40 +29,545 @@ And then execute:
|
|
15
29
|
|
16
30
|
Or install it yourself as:
|
17
31
|
|
18
|
-
$ gem install
|
32
|
+
$ gem install url_categorise
|
33
|
+
|
34
|
+
## Basic Usage
|
35
|
+
|
36
|
+
```ruby
|
37
|
+
require 'url_categorise'
|
38
|
+
|
39
|
+
# Initialize with default lists (60+ categories)
|
40
|
+
client = UrlCategorise::Client.new
|
41
|
+
|
42
|
+
# Get basic statistics
|
43
|
+
puts "Total hosts: #{client.count_of_hosts}"
|
44
|
+
puts "Categories: #{client.count_of_categories}"
|
45
|
+
puts "Data size: #{client.size_of_data} MB"
|
46
|
+
|
47
|
+
# Categorize a URL or domain
|
48
|
+
categories = client.categorise("badsite.com")
|
49
|
+
puts "Categories: #{categories}" # => [:malware, :phishing]
|
50
|
+
|
51
|
+
# Check if domain resolves to suspicious IPs
|
52
|
+
categories = client.resolve_and_categorise("suspicious-domain.com")
|
53
|
+
puts "Domain + IP categories: #{categories}"
|
54
|
+
|
55
|
+
# Categorize an IP address directly
|
56
|
+
ip_categories = client.categorise_ip("192.168.1.100")
|
57
|
+
puts "IP categories: #{ip_categories}"
|
58
|
+
```
|
59
|
+
|
60
|
+
## Advanced Configuration
|
61
|
+
|
62
|
+
### File Caching
|
63
|
+
|
64
|
+
Enable local file caching to improve performance and reduce bandwidth:
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
# Cache files locally and check for updates
|
68
|
+
client = UrlCategorise::Client.new(
|
69
|
+
cache_dir: "./url_cache",
|
70
|
+
force_download: false # Use cache when available
|
71
|
+
)
|
72
|
+
|
73
|
+
# Force fresh download ignoring cache
|
74
|
+
client = UrlCategorise::Client.new(
|
75
|
+
cache_dir: "./url_cache",
|
76
|
+
force_download: true
|
77
|
+
)
|
78
|
+
```
|
79
|
+
|
80
|
+
### Custom DNS Servers
|
81
|
+
|
82
|
+
Configure custom DNS servers for domain resolution:
|
83
|
+
|
84
|
+
```ruby
|
85
|
+
client = UrlCategorise::Client.new(
|
86
|
+
dns_servers: ['8.8.8.8', '8.8.4.4'] # Default: ['1.1.1.1', '1.0.0.1']
|
87
|
+
)
|
88
|
+
```
|
89
|
+
|
90
|
+
### Request Timeout Configuration
|
91
|
+
|
92
|
+
Configure HTTP request timeout for downloading blocklists:
|
93
|
+
|
94
|
+
```ruby
|
95
|
+
# Default timeout is 10 seconds
|
96
|
+
client = UrlCategorise::Client.new(
|
97
|
+
request_timeout: 30 # 30 second timeout for slow networks
|
98
|
+
)
|
99
|
+
|
100
|
+
# For faster networks or when you want quick failures
|
101
|
+
client = UrlCategorise::Client.new(
|
102
|
+
request_timeout: 5 # 5 second timeout
|
103
|
+
)
|
104
|
+
```
|
105
|
+
|
106
|
+
### Complete Configuration Example
|
107
|
+
|
108
|
+
Here's a comprehensive example with all available options:
|
109
|
+
|
110
|
+
```ruby
|
111
|
+
client = UrlCategorise::Client.new(
|
112
|
+
host_urls: UrlCategorise::Constants::DEFAULT_HOST_URLS, # Use default or custom lists
|
113
|
+
cache_dir: "./url_cache", # Enable local caching
|
114
|
+
force_download: false, # Use cache when available
|
115
|
+
dns_servers: ['1.1.1.1', '1.0.0.1'], # Cloudflare DNS servers
|
116
|
+
request_timeout: 15 # 15 second HTTP timeout
|
117
|
+
)
|
118
|
+
```
|
119
|
+
|
120
|
+
### Custom Lists
|
121
|
+
|
122
|
+
Use your own curated lists or subset of categories:
|
123
|
+
|
124
|
+
```ruby
|
125
|
+
# Custom host list configuration
|
126
|
+
host_urls = {
|
127
|
+
malware: ["https://example.com/malware-domains.txt"],
|
128
|
+
phishing: ["https://example.com/phishing-domains.txt"],
|
129
|
+
combined_bad: [:malware, :phishing] # Combine categories
|
130
|
+
}
|
131
|
+
|
132
|
+
client = UrlCategorise::Client.new(host_urls: host_urls)
|
133
|
+
```
|
134
|
+
|
135
|
+
## Available Categories
|
136
|
+
|
137
|
+
### Security & Threat Intelligence
|
138
|
+
- **malware**, **phishing**, **threat_indicators** - Core security threats
|
139
|
+
- **cryptojacking**, **phishing_extended** - Advanced security categories
|
140
|
+
- **threat_intelligence** - HaGeZi threat intelligence feeds
|
141
|
+
- **sanctions_ips**, **compromised_ips**, **tor_exit_nodes**, **open_proxy_ips** - IP-based security lists
|
142
|
+
|
143
|
+
### Content Filtering
|
144
|
+
- **advertising**, **tracking**, **gambling**, **pornography** - Content categories
|
145
|
+
- **social_media**, **gaming**, **dating_services** - Platform-specific lists
|
146
|
+
- **hate_and_junk**, **fraud**, **scam**, **redirect** - Unwanted content
|
147
|
+
|
148
|
+
### Network Security
|
149
|
+
- **top_attack_sources**, **suspicious_domains** - Network threat feeds
|
150
|
+
- **dns_over_https_bypass** - DNS-over-HTTPS and VPN bypass detection
|
151
|
+
- **dyndns**, **badware_hoster** - Infrastructure-based threats
|
152
|
+
|
153
|
+
### Corporate & Platform Lists
|
154
|
+
- **google**, **facebook**, **microsoft**, **apple** - Major tech platforms
|
155
|
+
- **youtube**, **tiktok**, **twitter**, **instagram** - Social media platforms
|
156
|
+
- **amazon**, **adobe**, **cloudflare** - Service providers
|
157
|
+
|
158
|
+
### Specialized & Regional
|
159
|
+
- **newly_registered_domains** - Recently registered domains (high risk)
|
160
|
+
- **most_abused_tlds** - Most abused top-level domains
|
161
|
+
- **chinese_ad_hosts**, **korean_ad_hosts** - Regional advertising
|
162
|
+
- **mobile_ads**, **smart_tv_ads** - Device-specific advertising
|
163
|
+
- **news**, **fakenews** - News and misinformation
|
164
|
+
|
165
|
+
### Content Categories
|
166
|
+
- **piracy**, **torrent**, **drugs**, **vaping** - Restricted content
|
167
|
+
- **crypto**, **nsa** - Specialized blocking lists
|
168
|
+
|
169
|
+
## Health Monitoring
|
170
|
+
|
171
|
+
The gem includes built-in health monitoring to ensure all blocklist sources remain accessible:
|
172
|
+
|
173
|
+
```ruby
|
174
|
+
# Check health of all configured lists
|
175
|
+
client = UrlCategorise::Client.new
|
176
|
+
health_report = client.check_all_lists
|
177
|
+
|
178
|
+
puts "Healthy categories: #{health_report[:summary][:healthy_categories]}"
|
179
|
+
puts "Categories with issues: #{health_report[:summary][:categories_with_issues]}"
|
180
|
+
|
181
|
+
# View detailed issues
|
182
|
+
health_report[:unreachable_lists].each do |category, failures|
|
183
|
+
puts "#{category}: #{failures.map { |f| f[:error] }.join(', ')}"
|
184
|
+
end
|
185
|
+
```
|
186
|
+
|
187
|
+
Use the included script to check all URLs:
|
188
|
+
```bash
|
189
|
+
# Check all URLs in constants
|
190
|
+
ruby bin/check_lists
|
191
|
+
```
|
192
|
+
|
193
|
+
[View all 60+ categories in constants.rb](lib/url_categorise/constants.rb)
|
194
|
+
|
195
|
+
## ActiveRecord Integration
|
196
|
+
|
197
|
+
For high-performance applications, enable database storage:
|
198
|
+
|
199
|
+
```ruby
|
200
|
+
# Add to Gemfile
|
201
|
+
gem 'activerecord'
|
202
|
+
gem 'sqlite3' # or your preferred database
|
203
|
+
|
204
|
+
# Generate migration
|
205
|
+
puts UrlCategorise::Models.generate_migration
|
206
|
+
|
207
|
+
# Use ActiveRecord client (automatically populates database)
|
208
|
+
client = UrlCategorise::ActiveRecordClient.new(
|
209
|
+
cache_dir: "./cache",
|
210
|
+
use_database: true
|
211
|
+
)
|
212
|
+
|
213
|
+
# Database-backed lookups (much faster for repeated queries)
|
214
|
+
categories = client.categorise("example.com")
|
215
|
+
|
216
|
+
# Get database statistics
|
217
|
+
stats = client.database_stats
|
218
|
+
# => { domains: 50000, ip_addresses: 15000, categories: 45, list_metadata: 90 }
|
219
|
+
|
220
|
+
# Direct model access
|
221
|
+
domain_record = UrlCategorise::Models::Domain.find_by(domain: "example.com")
|
222
|
+
ip_record = UrlCategorise::Models::IpAddress.find_by(ip_address: "1.2.3.4")
|
223
|
+
```
|
224
|
+
|
225
|
+
## Rails Integration
|
226
|
+
|
227
|
+
### Installation
|
228
|
+
|
229
|
+
Add to your Gemfile:
|
230
|
+
|
231
|
+
```ruby
|
232
|
+
gem 'url_categorise'
|
233
|
+
# Optional for database integration
|
234
|
+
gem 'activerecord' # Usually already included in Rails
|
235
|
+
```
|
236
|
+
|
237
|
+
### Generate Migration
|
238
|
+
|
239
|
+
```bash
|
240
|
+
# Generate the migration file
|
241
|
+
rails generate migration CreateUrlCategoriseTables
|
242
|
+
|
243
|
+
# Replace the generated migration content with:
|
244
|
+
```
|
245
|
+
|
246
|
+
```ruby
|
247
|
+
class CreateUrlCategoriseTables < ActiveRecord::Migration[7.0]
|
248
|
+
def change
|
249
|
+
create_table :url_categorise_list_metadata do |t|
|
250
|
+
t.string :name, null: false, index: { unique: true }
|
251
|
+
t.string :url, null: false
|
252
|
+
t.text :categories, null: false
|
253
|
+
t.string :file_path
|
254
|
+
t.datetime :fetched_at
|
255
|
+
t.string :file_hash
|
256
|
+
t.datetime :file_updated_at
|
257
|
+
t.timestamps
|
258
|
+
end
|
259
|
+
|
260
|
+
create_table :url_categorise_domains do |t|
|
261
|
+
t.string :domain, null: false, index: { unique: true }
|
262
|
+
t.text :categories, null: false
|
263
|
+
t.timestamps
|
264
|
+
end
|
265
|
+
|
266
|
+
add_index :url_categorise_domains, :domain
|
267
|
+
add_index :url_categorise_domains, :categories
|
268
|
+
|
269
|
+
create_table :url_categorise_ip_addresses do |t|
|
270
|
+
t.string :ip_address, null: false, index: { unique: true }
|
271
|
+
t.text :categories, null: false
|
272
|
+
t.timestamps
|
273
|
+
end
|
274
|
+
|
275
|
+
add_index :url_categorise_ip_addresses, :ip_address
|
276
|
+
add_index :url_categorise_ip_addresses, :categories
|
277
|
+
end
|
278
|
+
end
|
279
|
+
```
|
280
|
+
|
281
|
+
```bash
|
282
|
+
# Run the migration
|
283
|
+
rails db:migrate
|
284
|
+
```
|
285
|
+
|
286
|
+
### Service Class Example
|
287
|
+
|
288
|
+
Create a service class for URL categorization:
|
289
|
+
|
290
|
+
```ruby
|
291
|
+
# app/services/url_categorizer_service.rb
|
292
|
+
class UrlCategorizerService
|
293
|
+
include Singleton
|
294
|
+
|
295
|
+
def initialize
|
296
|
+
@client = UrlCategorise::ActiveRecordClient.new(
|
297
|
+
cache_dir: Rails.root.join('tmp', 'url_cache'),
|
298
|
+
use_database: true,
|
299
|
+
force_download: Rails.env.development?,
|
300
|
+
request_timeout: Rails.env.production? ? 30 : 10 # Longer timeout in production
|
301
|
+
)
|
302
|
+
end
|
303
|
+
|
304
|
+
def categorise(url)
|
305
|
+
Rails.cache.fetch("url_category_#{url}", expires_in: 1.hour) do
|
306
|
+
@client.categorise(url)
|
307
|
+
end
|
308
|
+
end
|
309
|
+
|
310
|
+
def categorise_with_ip_resolution(url)
|
311
|
+
Rails.cache.fetch("url_ip_category_#{url}", expires_in: 1.hour) do
|
312
|
+
@client.resolve_and_categorise(url)
|
313
|
+
end
|
314
|
+
end
|
315
|
+
|
316
|
+
def categorise_ip(ip_address)
|
317
|
+
Rails.cache.fetch("ip_category_#{ip_address}", expires_in: 6.hours) do
|
318
|
+
@client.categorise_ip(ip_address)
|
319
|
+
end
|
320
|
+
end
|
321
|
+
|
322
|
+
def stats
|
323
|
+
@client.database_stats
|
324
|
+
end
|
325
|
+
|
326
|
+
def refresh_lists!
|
327
|
+
@client.update_database
|
328
|
+
end
|
329
|
+
end
|
330
|
+
```
|
331
|
+
|
332
|
+
### Controller Example
|
333
|
+
|
334
|
+
```ruby
|
335
|
+
# app/controllers/api/v1/url_categorization_controller.rb
|
336
|
+
class Api::V1::UrlCategorizationController < ApplicationController
|
337
|
+
before_action :authenticate_api_key # Your authentication method
|
338
|
+
|
339
|
+
def categorise
|
340
|
+
url = params[:url]
|
341
|
+
|
342
|
+
if url.blank?
|
343
|
+
render json: { error: 'URL parameter is required' }, status: :bad_request
|
344
|
+
return
|
345
|
+
end
|
346
|
+
|
347
|
+
begin
|
348
|
+
categories = UrlCategorizerService.instance.categorise(url)
|
349
|
+
|
350
|
+
render json: {
|
351
|
+
url: url,
|
352
|
+
categories: categories,
|
353
|
+
risk_level: calculate_risk_level(categories),
|
354
|
+
timestamp: Time.current
|
355
|
+
}
|
356
|
+
rescue => e
|
357
|
+
Rails.logger.error "URL categorization failed for #{url}: #{e.message}"
|
358
|
+
render json: { error: 'Categorization failed' }, status: :internal_server_error
|
359
|
+
end
|
360
|
+
end
|
361
|
+
|
362
|
+
def categorise_with_ip
|
363
|
+
url = params[:url]
|
364
|
+
|
365
|
+
begin
|
366
|
+
categories = UrlCategorizerService.instance.categorise_with_ip_resolution(url)
|
367
|
+
|
368
|
+
render json: {
|
369
|
+
url: url,
|
370
|
+
categories: categories,
|
371
|
+
includes_ip_check: true,
|
372
|
+
risk_level: calculate_risk_level(categories),
|
373
|
+
timestamp: Time.current
|
374
|
+
}
|
375
|
+
rescue => e
|
376
|
+
Rails.logger.error "URL+IP categorization failed for #{url}: #{e.message}"
|
377
|
+
render json: { error: 'Categorization failed' }, status: :internal_server_error
|
378
|
+
end
|
379
|
+
end
|
380
|
+
|
381
|
+
def stats
|
382
|
+
render json: UrlCategorizerService.instance.stats
|
383
|
+
end
|
384
|
+
|
385
|
+
private
|
386
|
+
|
387
|
+
def calculate_risk_level(categories)
|
388
|
+
high_risk = [:malware, :phishing, :threat_indicators, :cryptojacking, :phishing_extended]
|
389
|
+
medium_risk = [:gambling, :pornography, :tor_exit_nodes, :compromised_ips, :suspicious_domains]
|
390
|
+
|
391
|
+
return 'high' if (categories & high_risk).any?
|
392
|
+
return 'medium' if (categories & medium_risk).any?
|
393
|
+
return 'low' if categories.any?
|
394
|
+
'unknown'
|
395
|
+
end
|
396
|
+
end
|
397
|
+
```
|
398
|
+
|
399
|
+
### Model Integration Example
|
400
|
+
|
401
|
+
Add URL categorization to your existing models:
|
402
|
+
|
403
|
+
```ruby
|
404
|
+
# app/models/website.rb
|
405
|
+
class Website < ApplicationRecord
|
406
|
+
validates :url, presence: true, uniqueness: true
|
407
|
+
|
408
|
+
after_create :categorize_url
|
409
|
+
|
410
|
+
def categories
|
411
|
+
super || categorize_url
|
412
|
+
end
|
413
|
+
|
414
|
+
def risk_level
|
415
|
+
high_risk_categories = [:malware, :phishing, :threat_indicators, :cryptojacking]
|
416
|
+
return 'high' if (categories & high_risk_categories).any?
|
417
|
+
return 'medium' if categories.include?(:gambling) || categories.include?(:pornography)
|
418
|
+
return 'low' if categories.any?
|
419
|
+
'unknown'
|
420
|
+
end
|
421
|
+
|
422
|
+
def is_safe?
|
423
|
+
risk_level == 'low' || risk_level == 'unknown'
|
424
|
+
end
|
425
|
+
|
426
|
+
private
|
427
|
+
|
428
|
+
def categorize_url
|
429
|
+
cats = UrlCategorizerService.instance.categorise(url)
|
430
|
+
update_column(:categories, cats) if persisted?
|
431
|
+
cats
|
432
|
+
end
|
433
|
+
end
|
434
|
+
```
|
435
|
+
|
436
|
+
### Background Job Example
|
19
437
|
|
20
|
-
|
21
|
-
The default host lists I picked for their separated categories.
|
22
|
-
I didn't select them for the quality of data
|
23
|
-
Use at your own risk!
|
438
|
+
For processing large batches of URLs:
|
24
439
|
|
25
440
|
```ruby
|
26
|
-
|
27
|
-
|
441
|
+
# app/jobs/url_categorization_job.rb
|
442
|
+
class UrlCategorizationJob < ApplicationJob
|
443
|
+
queue_as :default
|
444
|
+
|
445
|
+
def perform(batch_id, urls)
|
446
|
+
service = UrlCategorizerService.instance
|
447
|
+
|
448
|
+
results = urls.map do |url|
|
449
|
+
begin
|
450
|
+
categories = service.categorise_with_ip_resolution(url)
|
451
|
+
{ url: url, categories: categories, status: 'success' }
|
452
|
+
rescue => e
|
453
|
+
Rails.logger.error "Failed to categorize #{url}: #{e.message}"
|
454
|
+
{ url: url, error: e.message, status: 'failed' }
|
455
|
+
end
|
456
|
+
end
|
457
|
+
|
458
|
+
# Store results in your preferred way (database, Redis, etc.)
|
459
|
+
BatchResult.create!(
|
460
|
+
batch_id: batch_id,
|
461
|
+
results: results,
|
462
|
+
completed_at: Time.current
|
463
|
+
)
|
464
|
+
end
|
465
|
+
end
|
466
|
+
|
467
|
+
# Usage:
|
468
|
+
urls = ['http://example.com', 'http://suspicious-site.com']
|
469
|
+
UrlCategorizationJob.perform_later('batch_123', urls)
|
470
|
+
```
|
471
|
+
|
472
|
+
### Configuration
|
28
473
|
|
29
|
-
|
30
|
-
|
31
|
-
|
474
|
+
```ruby
|
475
|
+
# config/initializers/url_categorise.rb
|
476
|
+
Rails.application.configure do
|
477
|
+
config.after_initialize do
|
478
|
+
# Warm up the categorizer on app start
|
479
|
+
UrlCategorizerService.instance if Rails.env.production?
|
480
|
+
end
|
481
|
+
end
|
482
|
+
```
|
32
483
|
|
33
|
-
|
34
|
-
client.categorise(url)
|
484
|
+
### Rake Tasks
|
35
485
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
486
|
+
```ruby
|
487
|
+
# lib/tasks/url_categorise.rake
|
488
|
+
namespace :url_categorise do
|
489
|
+
desc "Update all categorization lists"
|
490
|
+
task refresh_lists: :environment do
|
491
|
+
puts "Refreshing URL categorization lists..."
|
492
|
+
UrlCategorizerService.instance.refresh_lists!
|
493
|
+
puts "Lists refreshed successfully!"
|
494
|
+
puts "Stats: #{UrlCategorizerService.instance.stats}"
|
495
|
+
end
|
496
|
+
|
497
|
+
desc "Show categorization statistics"
|
498
|
+
task stats: :environment do
|
499
|
+
stats = UrlCategorizerService.instance.stats
|
500
|
+
puts "URL Categorization Statistics:"
|
501
|
+
puts " Domains: #{stats[:domains]}"
|
502
|
+
puts " IP Addresses: #{stats[:ip_addresses]}"
|
503
|
+
puts " Categories: #{stats[:categories]}"
|
504
|
+
puts " List Metadata: #{stats[:list_metadata]}"
|
505
|
+
end
|
506
|
+
end
|
507
|
+
```
|
40
508
|
|
41
|
-
|
42
|
-
client = UrlCategorise::Client.new(host_urls: host_urls)
|
509
|
+
### Cron Job Setup
|
43
510
|
|
44
|
-
|
45
|
-
host_urls = {
|
46
|
-
abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"],
|
47
|
-
bad_links: [:abuse]
|
48
|
-
}
|
511
|
+
Add to your crontab, or use the `whenever` gem:
|
49
512
|
|
50
|
-
|
51
|
-
|
513
|
+
```ruby
|
514
|
+
# config/schedule.rb (if using whenever gem)
|
515
|
+
every 1.day, at: '2:00 am' do
|
516
|
+
rake 'url_categorise:refresh_lists'
|
517
|
+
end
|
518
|
+
```
|
519
|
+
|
520
|
+
This Rails integration provides enterprise-level URL categorization with caching, background processing, and comprehensive error handling.
|
521
|
+
|
522
|
+
## List Format Support
|
523
|
+
|
524
|
+
The gem automatically detects and parses multiple blocklist formats:
|
525
|
+
|
526
|
+
### Hosts File Format
|
527
|
+
```
|
528
|
+
0.0.0.0 badsite.com
|
529
|
+
127.0.0.1 malware.com
|
530
|
+
```
|
531
|
+
|
532
|
+
### Plain Text Format
|
533
|
+
```
|
534
|
+
badsite.com
|
535
|
+
malware.com
|
536
|
+
```
|
537
|
+
|
538
|
+
### dnsmasq Format
|
539
|
+
```
|
540
|
+
address=/badsite.com/0.0.0.0
|
541
|
+
address=/malware.com/0.0.0.0
|
542
|
+
```
|
543
|
+
|
544
|
+
### uBlock Origin Format
|
545
|
+
```
|
546
|
+
||badsite.com^
|
547
|
+
||malware.com^$important
|
548
|
+
```
|
549
|
+
|
550
|
+
## Performance Tips
|
551
|
+
|
552
|
+
1. **Use Caching**: Enable `cache_dir` for faster subsequent runs
|
553
|
+
2. **Database Storage**: Use `ActiveRecordClient` for applications with frequent lookups
|
554
|
+
3. **Selective Categories**: Only load categories you need for better performance
|
555
|
+
4. **Batch Processing**: Process multiple URLs in batches when possible
|
556
|
+
|
557
|
+
## Metadata and Updates
|
558
|
+
|
559
|
+
Access detailed metadata about downloaded lists:
|
560
|
+
|
561
|
+
```ruby
|
562
|
+
client = UrlCategorise::Client.new(cache_dir: "./cache")
|
563
|
+
|
564
|
+
# Access metadata for each list
|
565
|
+
client.metadata.each do |url, meta|
|
566
|
+
puts "URL: #{url}"
|
567
|
+
puts "Last updated: #{meta[:last_updated]}"
|
568
|
+
puts "ETag: #{meta[:etag]}"
|
569
|
+
puts "Content hash: #{meta[:content_hash]}"
|
570
|
+
end
|
52
571
|
```
|
53
572
|
|
54
573
|
## Development
|
@@ -62,6 +581,13 @@ To run tests execute:
|
|
62
581
|
|
63
582
|
$ rake test
|
64
583
|
|
584
|
+
### Test Coverage
|
585
|
+
The gem includes comprehensive test coverage using SimpleCov. To generate coverage reports:
|
586
|
+
|
587
|
+
$ rake test
|
588
|
+
|
589
|
+
Coverage reports are generated in the `coverage/` directory. The gem maintains a minimum coverage threshold of 80% to ensure code quality and reliability.
|
590
|
+
|
65
591
|
## Contributing
|
66
592
|
|
67
593
|
Bug reports and pull requests are welcome on GitHub at https://github.com/trex22/url_categorise. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
data/Rakefile
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
require "bundler/gem_tasks"
|
2
|
+
require "bundler/setup"
|
2
3
|
require "rake/testtask"
|
3
4
|
|
4
5
|
Rake::TestTask.new(:test) do |t|
|
5
6
|
t.libs << "test"
|
6
7
|
t.libs << "lib"
|
7
8
|
t.test_files = FileList["test/**/*_test.rb"]
|
9
|
+
t.ruby_opts = ["-rbundler/setup"]
|
8
10
|
end
|
9
11
|
|
10
12
|
task :default => :test
|
data/bin/check_lists
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bundler/setup'
|
4
|
+
require_relative '../lib/url_categorise'
|
5
|
+
|
6
|
+
puts "=== CHECKING ALL URLs IN CONSTANTS ==="
|
7
|
+
|
8
|
+
UrlCategorise::Constants::DEFAULT_HOST_URLS.each do |category, urls|
|
9
|
+
puts "\n#{category.upcase}:"
|
10
|
+
|
11
|
+
# Skip categories that only reference other categories (symbols)
|
12
|
+
actual_urls = urls.reject { |url| url.is_a?(Symbol) }
|
13
|
+
|
14
|
+
if actual_urls.empty?
|
15
|
+
if urls.empty?
|
16
|
+
puts " Empty category (no URLs defined)"
|
17
|
+
else
|
18
|
+
puts " Only references other categories: #{urls}"
|
19
|
+
end
|
20
|
+
next
|
21
|
+
end
|
22
|
+
|
23
|
+
actual_urls.each do |url|
|
24
|
+
print " Testing #{url}... "
|
25
|
+
begin
|
26
|
+
response = HTTParty.head(url, timeout: 10)
|
27
|
+
case response.code
|
28
|
+
when 200
|
29
|
+
puts "✅ OK"
|
30
|
+
when 404
|
31
|
+
puts "❌ 404 Not Found"
|
32
|
+
when 403
|
33
|
+
puts "❌ 403 Forbidden"
|
34
|
+
when 500..599
|
35
|
+
puts "❌ Server Error (#{response.code})"
|
36
|
+
else
|
37
|
+
puts "⚠️ HTTP #{response.code}"
|
38
|
+
end
|
39
|
+
rescue Net::OpenTimeout, Net::ReadTimeout, Timeout::Error
|
40
|
+
puts "❌ Timeout"
|
41
|
+
rescue SocketError, Errno::ECONNREFUSED => e
|
42
|
+
puts "❌ DNS/Network Error"
|
43
|
+
rescue => e
|
44
|
+
puts "❌ Error: #{e.class}"
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
data/docs/.keep
ADDED