UrlCategorise 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e4725733a26240e3bc23bfe9292d3ed0222db4e0308c5228da12a9ede1347e4b
4
- data.tar.gz: a315a03357b48260c543459fab91ac8ec0601a0d7001b120bdc6f72e543e3671
3
+ metadata.gz: c22ada722efc33979930ece5d8737625943f1d815d2c7acdbfbc0f5b0fede15f
4
+ data.tar.gz: 15747c8a4b23c4c805cd74118627dfa8d102d4d813ae9f08c8c49011ff2ebad8
5
5
  SHA512:
6
- metadata.gz: 74c8be653a2ae97ad74300105a95c326685d9669defd5c980cf16636e96965080a587453c21d5af6014874145f8f93d66c444574142f1568588f82b9577a7440
7
- data.tar.gz: 450ae86237aa6a743a289e9b1165b9bae49f78068af8d727114763629b863567dc37de1b2ba38522e93a0ae08dd1683b79a062df2e2d18e76884fd4f8415c1db
6
+ metadata.gz: b1c21b59b8a7631c4c39f206fb37bb9c51c1925a544ddd5f8a627b595dafa92d366d880a33c6303eafc477ae9be4130055af060953e27617e5ce198618e5f9bd
7
+ data.tar.gz: 1621c52578daf8957546c778125df120f9480dbe2f2d365feb6024890ddfe5affac3aae77100047ea46b15c96939db1fc228b876806197c63dda954199536bc9
@@ -16,8 +16,15 @@
16
16
  "Bash(timeout:*)",
17
17
  "Bash(DEBUG=1 timeout 300 ruby correct_usage_example.rb)",
18
18
  "Bash(chmod:*)",
19
- "Bash(bundle exec bin/export_csv:*)"
19
+ "Bash(bundle exec bin/export_csv:*)",
20
+ "Bash(bundle exec rg:*)",
21
+ "WebFetch(domain:github.com)",
22
+ "WebFetch(domain:api.github.com)",
23
+ "WebFetch(domain:raw.githubusercontent.com)",
24
+ "WebSearch",
25
+ "WebFetch(domain:firebog.net)",
26
+ "Bash(curl:*)"
20
27
  ],
21
28
  "deny": []
22
29
  }
23
- }
30
+ }
data/.gitignore CHANGED
@@ -56,3 +56,4 @@ build-iPhoneSimulator/
56
56
  # .rubocop-https?--*
57
57
  url_cache/*
58
58
  exports/*
59
+ .DS_Store
data/.rubocop.yml ADDED
@@ -0,0 +1,56 @@
1
+ inherit_gem:
2
+ rubocop-rails-omakase: rubocop.yml
3
+
4
+ AllCops:
5
+ TargetRubyVersion: 3.0
6
+ NewCops: enable
7
+ Exclude:
8
+ - 'bin/*'
9
+ - 'vendor/**/*'
10
+ - 'tmp/**/*'
11
+ - 'db/migrate/*'
12
+
13
+ # Allow longer lines for URL constants
14
+ Layout/LineLength:
15
+ Max: 200
16
+ Exclude:
17
+ - 'lib/url_categorise/constants.rb'
18
+ - 'test/**/*'
19
+
20
+ # Allow complex methods in client due to categorization logic
21
+ Metrics/MethodLength:
22
+ Max: 30
23
+ Exclude:
24
+ - 'lib/url_categorise/client.rb'
25
+ - 'test/**/*'
26
+
27
+ # Allow complex classes for main client
28
+ Metrics/ClassLength:
29
+ Max: 500
30
+ Exclude:
31
+ - 'lib/url_categorise/client.rb'
32
+ - 'test/**/*'
33
+
34
+ # Allow higher complexity for categorization methods
35
+ Metrics/CyclomaticComplexity:
36
+ Max: 15
37
+ Exclude:
38
+ - 'lib/url_categorise/client.rb'
39
+
40
+ # Allow higher ABC size for complex categorization logic
41
+ Metrics/AbcSize:
42
+ Max: 25
43
+ Exclude:
44
+ - 'lib/url_categorise/client.rb'
45
+ - 'test/**/*'
46
+
47
+ # Allow higher parameter count for client initialization
48
+ Metrics/ParameterLists:
49
+ Max: 8
50
+
51
+ # Allow long blocks in tests, constants, and the gemspec
52
+ Metrics/BlockLength:
53
+ Exclude:
54
+ - 'test/**/*'
55
+ - 'lib/url_categorise/constants.rb'
56
+ - 'url_categorise.gemspec'
data/.sublime-project ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "folders": [
3
+ {
4
+ "path": ".",
5
+ "folder_exclude_patterns": [
6
+ "node_modules", ".git", "tmp", "log",
7
+ "vendor/bundle", "coverage", ".bundle"
8
+ ],
9
+ "file_exclude_patterns": [
10
+ "*.log", "*.min.js", "*.min.css"
11
+ ]
12
+ }
13
+ ]
14
+ }
data/Gemfile CHANGED
@@ -1,6 +1,6 @@
1
- source 'https://rubygems.org'
1
+ source "https://rubygems.org"
2
2
 
3
- git_source(:github) { |_repo_name| 'https://github.com/TRex22/url_categorise' }
3
+ git_source(:github) { |_repo_name| "https://github.com/TRex22/url_categorise" }
4
4
 
5
5
  # Specify your gem's dependencies in url_categorise.gemspec
6
6
  gemspec
data/Gemfile.lock CHANGED
@@ -1,17 +1,17 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- UrlCategorise (0.1.7)
4
+ UrlCategorise (0.1.9)
5
5
  active_attr (>= 0.17.1, < 1.0)
6
6
  api_pattern (>= 0.0.6, < 1.0)
7
7
  csv (>= 3.3.0, < 4.0)
8
8
  digest (>= 3.1.0, < 4.0)
9
9
  fileutils (>= 1.7.0, < 2.0)
10
- httparty (>= 0.22.0, < 1.0)
10
+ httparty (>= 0.24.0, < 1.0)
11
11
  json (>= 2.7.0, < 3.0)
12
12
  kaggle (>= 0.0.3, < 1.0)
13
- nokogiri (>= 1.18.9, < 2.0)
14
- reline (>= 0.6.2)
13
+ nokogiri (>= 1.19.1, < 2.0)
14
+ reline (>= 0.6.2, < 2.0)
15
15
  resolv (>= 0.4.0, < 1.0)
16
16
  rubyzip (>= 2.3.0, < 3.0)
17
17
 
@@ -65,10 +65,14 @@ GEM
65
65
  csv (>= 3.3.0)
66
66
  httparty (>= 0.22.0)
67
67
  nokogiri (>= 1.16.0)
68
+ ast (2.4.3)
68
69
  base64 (0.3.0)
69
70
  benchmark (0.4.1)
70
- bigdecimal (3.2.2)
71
+ bigdecimal (4.1.0)
71
72
  builder (3.3.0)
73
+ bundler-audit (0.9.3)
74
+ bundler (>= 1.2.0)
75
+ thor (~> 1.0)
72
76
  coderay (1.1.3)
73
77
  concurrent-ruby (1.3.5)
74
78
  connection_pool (2.5.3)
@@ -83,7 +87,7 @@ GEM
83
87
  erubi (1.13.1)
84
88
  fileutils (1.7.3)
85
89
  hashdiff (1.2.0)
86
- httparty (0.23.1)
90
+ httparty (0.24.2)
87
91
  csv
88
92
  mini_mime (>= 1.0.0)
89
93
  multi_xml (>= 0.5.2)
@@ -97,6 +101,8 @@ GEM
97
101
  httparty (>= 0.23)
98
102
  oj (= 3.16.11)
99
103
  rubyzip (>= 2.0)
104
+ language_server-protocol (3.17.0.5)
105
+ lint_roller (1.1.0)
100
106
  logger (1.7.0)
101
107
  loofah (2.24.1)
102
108
  crass (~> 1.0.2)
@@ -113,22 +119,28 @@ GEM
113
119
  ruby-progressbar
114
120
  mocha (2.4.5)
115
121
  ruby2_keywords (>= 0.0.5)
116
- multi_xml (0.7.2)
117
- bigdecimal (~> 3.1)
118
- nokogiri (1.18.9-arm64-darwin)
122
+ multi_xml (0.8.1)
123
+ bigdecimal (>= 3.1, < 5)
124
+ nokogiri (1.19.2-arm64-darwin)
119
125
  racc (~> 1.4)
120
126
  oj (3.16.11)
121
127
  bigdecimal (>= 3.0)
122
128
  ostruct (>= 0.2)
123
129
  ostruct (0.6.3)
130
+ parallel (1.27.0)
131
+ parser (3.3.9.0)
132
+ ast (~> 2.4.1)
133
+ racc
134
+ prism (1.4.0)
124
135
  pry (0.15.2)
125
136
  coderay (~> 1.1)
126
137
  method_source (~> 1.0)
127
138
  public_suffix (6.0.2)
128
139
  racc (1.8.1)
129
- rack (2.2.17)
130
- rack-session (1.0.2)
131
- rack (< 3)
140
+ rack (3.2.5)
141
+ rack-session (2.1.1)
142
+ base64 (>= 0.1.0)
143
+ rack (>= 3.0.0)
132
144
  rack-test (2.2.0)
133
145
  rack (>= 1.3)
134
146
  rails-dom-testing (2.3.0)
@@ -138,11 +150,41 @@ GEM
138
150
  rails-html-sanitizer (1.6.2)
139
151
  loofah (~> 2.21)
140
152
  nokogiri (>= 1.15.7, != 1.16.7, != 1.16.6, != 1.16.5, != 1.16.4, != 1.16.3, != 1.16.2, != 1.16.1, != 1.16.0.rc1, != 1.16.0)
153
+ rainbow (3.1.1)
141
154
  rake (13.3.0)
155
+ regexp_parser (2.11.2)
142
156
  reline (0.6.2)
143
157
  io-console (~> 0.5)
144
158
  resolv (0.6.2)
145
- rexml (3.4.1)
159
+ rexml (3.4.4)
160
+ rubocop (1.80.1)
161
+ json (~> 2.3)
162
+ language_server-protocol (~> 3.17.0.2)
163
+ lint_roller (~> 1.1.0)
164
+ parallel (~> 1.10)
165
+ parser (>= 3.3.0.2)
166
+ rainbow (>= 2.2.2, < 4.0)
167
+ regexp_parser (>= 2.9.3, < 3.0)
168
+ rubocop-ast (>= 1.46.0, < 2.0)
169
+ ruby-progressbar (~> 1.7)
170
+ unicode-display_width (>= 2.4.0, < 4.0)
171
+ rubocop-ast (1.46.0)
172
+ parser (>= 3.3.7.2)
173
+ prism (~> 1.4)
174
+ rubocop-performance (1.25.0)
175
+ lint_roller (~> 1.1)
176
+ rubocop (>= 1.75.0, < 2.0)
177
+ rubocop-ast (>= 1.38.0, < 2.0)
178
+ rubocop-rails (2.33.3)
179
+ activesupport (>= 4.2.0)
180
+ lint_roller (~> 1.1)
181
+ rack (>= 1.1)
182
+ rubocop (>= 1.75.0, < 2.0)
183
+ rubocop-ast (>= 1.44.0, < 2.0)
184
+ rubocop-rails-omakase (1.1.0)
185
+ rubocop (>= 1.72)
186
+ rubocop-performance (>= 1.24)
187
+ rubocop-rails (>= 2.30)
146
188
  ruby-progressbar (1.13.0)
147
189
  ruby2_keywords (0.0.5)
148
190
  rubyzip (2.4.1)
@@ -154,11 +196,15 @@ GEM
154
196
  simplecov-html (0.13.2)
155
197
  simplecov_json_formatter (0.1.4)
156
198
  sqlite3 (2.7.3-arm64-darwin)
199
+ thor (1.5.0)
157
200
  timecop (0.9.10)
158
201
  timeout (0.4.3)
159
202
  tzinfo (2.0.6)
160
203
  concurrent-ruby (~> 1.0)
161
- uri (1.0.3)
204
+ unicode-display_width (3.1.5)
205
+ unicode-emoji (~> 4.0, >= 4.0.4)
206
+ unicode-emoji (4.0.4)
207
+ uri (1.1.1)
162
208
  useragent (0.16.11)
163
209
  webmock (3.24.0)
164
210
  addressable (>= 2.8.0)
@@ -167,10 +213,12 @@ GEM
167
213
 
168
214
  PLATFORMS
169
215
  arm64-darwin-24
216
+ arm64-darwin-25
170
217
 
171
218
  DEPENDENCIES
172
219
  UrlCategorise!
173
220
  activerecord (>= 8.0)
221
+ bundler-audit (~> 0.9)
174
222
  logger
175
223
  minitest (~> 5.25.5)
176
224
  minitest-focus (~> 1.4.0)
@@ -178,6 +226,7 @@ DEPENDENCIES
178
226
  mocha (~> 2.4.5)
179
227
  pry (~> 0.15.2)
180
228
  rake (~> 13.3.0)
229
+ rubocop-rails-omakase (~> 1.0)
181
230
  simplecov (~> 0.22.0)
182
231
  sqlite3 (>= 2.7)
183
232
  timecop (~> 0.9.10)
data/README.md CHANGED
@@ -6,6 +6,8 @@ A comprehensive Ruby gem for categorizing URLs and domains based on various secu
6
6
 
7
7
  - **Comprehensive Coverage**: 60+ high-quality categories including security, content, and specialized lists
8
8
  - **Video Content Detection**: Advanced regex-based categorization with `video_url?` method to distinguish video content from other website resources
9
+ - **Blog Content Detection**: Simple string-based `blog_url?` method to identify blog-related URLs and content
10
+ - **Debug Mode**: Comprehensive debug logging with timing information for initialization and operations
9
11
  - **Custom Video Lists**: Generate and maintain comprehensive video hosting domain lists using yt-dlp extractors
10
12
  - **Kaggle Dataset Integration**: Automatic loading and processing of machine learning datasets from Kaggle
11
13
  - **Multiple Data Sources**: Supports blocklists, CSV datasets, and Kaggle ML datasets
@@ -162,11 +164,11 @@ $ bundle exec export_hosts --output /tmp/hosts --verbose
162
164
  # Export CSV data with all features enabled
163
165
  $ bundle exec export_csv --output /tmp/csv --iab-compliance --smart-categorization --auto-load-datasets --verbose
164
166
 
165
- # Generate updated video hosting lists
166
- $ ruby bin/generate_video_lists
167
+ # Generate updated video hosting lists (with custom worker configuration)
168
+ $ ruby bin/generate_video_lists --workers 8 --batch-size 50
167
169
 
168
- # Check health of all blocklist URLs
169
- $ bundle exec check_lists
170
+ # Check health of all blocklist URLs (with parallel processing)
171
+ $ bundle exec check_lists --parallel --threads 16
170
172
 
171
173
  # Export with custom Kaggle credentials
172
174
  $ bundle exec export_csv --auto-load-datasets --kaggle-credentials ~/my-kaggle.json --verbose
@@ -183,6 +185,8 @@ $ bundle exec check_lists
183
185
  - `--kaggle-credentials FILE`: Specify custom Kaggle credentials file
184
186
  - `--iab-compliance`: Enable IAB Content Taxonomy mapping
185
187
  - `--smart-categorization`: Enable intelligent category filtering
188
+ - `--parallel` / `--threads NUM`: Enable parallel URL checking with a configurable thread count
189
+ - `--workers NUM`: Configure the number of Ractor workers for video list generation
186
190
 
187
191
  ## Advanced Configuration
188
192
 
@@ -244,10 +248,32 @@ client = UrlCategorise::Client.new(
244
248
  iab_compliance: true, # Enable IAB compliance
245
249
  iab_version: :v3, # Use IAB Content Taxonomy v3.0
246
250
  auto_load_datasets: false, # Disable automatic dataset loading (default)
247
- smart_categorization: false # Disable smart post-processing (default)
251
+ smart_categorization: false, # Disable smart post-processing (default)
252
+ max_threads: 8, # Max threads for parallel processing (default: 8)
253
+ max_ractor_workers: 4, # Max Ractor workers for parallel processing (default: 4 or CPU count)
254
+ parallel_loading: true # Enable parallel loading when available (default: true)
248
255
  )
249
256
  ```
250
257
 
258
+ ### Parallel Processing Configuration
259
+
260
+ The gem uses Ruby Ractors (when available) or Threads for parallel processing to significantly improve performance:
261
+
262
+ ```ruby
263
+ # Configure parallel processing parameters
264
+ client = UrlCategorise::Client.new(
265
+ parallel_loading: true, # Enable/disable parallel loading (default: true if Ractors available)
266
+ max_threads: 16, # Maximum threads for thread-based processing (default: 8)
267
+ max_ractor_workers: 8 # Maximum Ractor workers for Ractor-based processing (default: 4 or CPU count)
268
+ )
269
+ ```
270
+
271
+ **Performance Notes:**
272
+ - **Ractors**: Used by default on Ruby 3.0+ when available, providing better isolation and performance
273
+ - **Threads**: Fallback method when Ractors are unavailable, still provides significant speedup
274
+ - **Test Environment**: Automatically switches to sequential processing during tests to avoid issues
275
+ - **Worker Pools**: Both Ractors and Threads use worker pool patterns to prevent resource exhaustion
276
+
251
277
  ### Custom Lists
252
278
 
253
279
  Use your own curated lists or subset of categories:
@@ -273,7 +299,7 @@ The gem maintains a comprehensive list of video hosting domains extracted from y
273
299
 
274
300
  ```ruby
275
301
  # Generate/update video hosting lists
276
- system("ruby bin/generate_video_lists")
302
+ system("ruby bin/generate_video_lists --workers 4 --batch-size 25")
277
303
 
278
304
  # Use video hosting categorization
279
305
  client = UrlCategorise::Client.new
@@ -331,13 +357,56 @@ client.video_url?("https://google.com/search?q=cats") # => false
331
357
  3. Returns `true` only if both conditions are met
332
358
  4. Handles invalid URLs gracefully (returns `false`)
333
359
 
360
+ #### Additional Video URL Helper Methods
361
+
362
+ The gem provides specialized helper methods for different types of video content:
363
+
364
+ ```ruby
365
+ client = UrlCategorise::Client.new(regex_categorization: true)
366
+
367
+ # Detect short-form video content
368
+ client.shorts_url?("https://youtube.com/shorts/abc123defgh") # => true
369
+ client.shorts_url?("https://tiktok.com/@user/video/123456789") # => true
370
+ client.shorts_url?("https://youtube.com/watch?v=test123") # => false
371
+
372
+ # Detect playlist URLs
373
+ client.playlist_url?("https://youtube.com/playlist?list=PLtest123") # => true
374
+ client.playlist_url?("https://youtube.com/watch?v=abc123&list=PLtest123") # => true
375
+ client.playlist_url?("https://vimeo.com/album/123456") # => true
376
+ client.playlist_url?("https://youtube.com/watch?v=test123") # => false
377
+
378
+ # Detect music content (works with video platforms hosting music)
379
+ client.music_url?("https://music.youtube.com/watch?v=abc123") # => true
380
+ client.music_url?("https://youtube.com/watch?v=abc123defgh&list=PLmusic") # => true
381
+ client.music_url?("https://youtube.com/c/musicchannel") # => true
382
+ client.music_url?("https://youtube.com/watch?v=regularvideo") # => false
383
+
384
+ # Detect channel/profile URLs
385
+ client.channel_url?("https://youtube.com/@channelname") # => true
386
+ client.channel_url?("https://tiktok.com/@username") # => true
387
+ client.channel_url?("https://twitch.tv/streamername") # => true
388
+ client.channel_url?("https://youtube.com/watch?v=test123") # => false
389
+
390
+ # Detect live stream URLs
391
+ client.live_stream_url?("https://youtube.com/live/streamid") # => true
392
+ client.live_stream_url?("https://twitch.tv/streamername") # => true
393
+ client.live_stream_url?("https://youtube.com/watch?v=test123") # => false
394
+ ```
395
+
396
+ **All helper methods:**
397
+ - Require `regex_categorization: true` to be enabled
398
+ - First verify the URL is from a video hosting domain
399
+ - Use specific regex patterns for accurate detection
400
+ - Handle invalid URLs gracefully (return `false`)
401
+ - Work across multiple video platforms (YouTube, TikTok, Vimeo, Twitch, etc.)
402
+
334
403
  #### Maintaining Video Lists
335
404
 
336
405
  The gem includes a script to generate and maintain comprehensive video hosting lists:
337
406
 
338
407
  ```bash
339
- # Generate updated video hosting lists
340
- ruby bin/generate_video_lists
408
+ # Generate updated video hosting lists (configurable parallel processing)
409
+ ruby bin/generate_video_lists --workers 8 --batch-size 50 --threshold 20
341
410
 
342
411
  # This creates:
343
412
  # - lists/video_hosting_domains.hosts (PiHole compatible)
@@ -346,6 +415,92 @@ ruby bin/generate_video_lists
346
415
 
347
416
  The script fetches data from yt-dlp extractors and combines it with manually curated major platforms to ensure comprehensive coverage.
348
417
 
418
+ ### Blog Content Detection
419
+
420
+ The gem provides a `blog_url?` method to identify blog-related URLs using simple string matching patterns:
421
+
422
+ ```ruby
423
+ client = UrlCategorise::Client.new
424
+
425
+ # Basic blog path detection
426
+ client.blog_url?("https://example.com/blog/") # => true
427
+ client.blog_url?("https://example.com/blogs/tech") # => true
428
+ client.blog_url?("https://example.com/blog?page=1") # => true
429
+
430
+ # Blog subdomains
431
+ client.blog_url?("https://blog.example.com/") # => true
432
+ client.blog_url?("https://blog.company.org/article") # => true
433
+
434
+ # Blog platforms
435
+ client.blog_url?("https://example.wordpress.com/") # => true
436
+ client.blog_url?("https://example.blogspot.com/post") # => true
437
+ client.blog_url?("https://medium.com/@user/article") # => true
438
+ client.blog_url?("https://user.substack.com/p/post") # => true
439
+
440
+ # Blog-like content paths
441
+ client.blog_url?("https://example.com/post/123") # => true
442
+ client.blog_url?("https://example.com/articles/tech") # => true
443
+ client.blog_url?("https://example.com/diary/entry") # => true
444
+
445
+ # Blog keywords in URLs
446
+ client.blog_url?("https://example.com/corporate-blog") # => true
447
+ client.blog_url?("https://example-blog.com/") # => true
448
+
449
+ # Non-blog URLs return false
450
+ client.blog_url?("https://example.com/") # => false
451
+ client.blog_url?("https://example.com/products") # => false
452
+ ```
453
+
454
+ **Detection patterns include:**
455
+ - `/blog/` or `/blogs/` in URL paths
456
+ - `blog.` subdomains
457
+ - `blog-` or `-blog` in domain names
458
+ - Common blog platforms (WordPress, Blogspot, Medium, Substack)
459
+ - Blog-like content paths (`/post/`, `/articles/`, `/diary/`, `/journal/`)
460
+ - The word "blog" anywhere in the URL
461
+ - Case-insensitive matching
462
+ - Graceful handling of invalid URLs
463
+
464
+ ### Debug Mode
465
+
466
+ The gem includes comprehensive debug functionality to help you understand what's happening during initialization and operation:
467
+
468
+ ```ruby
469
+ # Enable debug mode during initialization
470
+ client = UrlCategorise::Client.new(debug: true)
471
+
472
+ # Or enable it dynamically using ActiveAttr
473
+ client = UrlCategorise::Client.new
474
+ client.debug_enabled = true
475
+
476
+ # Debug output shows:
477
+ # [UrlCategorise DEBUG] Initializing UrlCategorise Client with debug enabled
478
+ # [UrlCategorise DEBUG] Loading host lists from 15 categories
479
+ # [UrlCategorise DEBUG] Processing host list: https://example.com/malware.txt
480
+ # [UrlCategorise DEBUG] Cache miss for https://example.com/malware.txt
481
+ # [UrlCategorise DEBUG] Downloading and parsing https://example.com/malware.txt completed in 234.56ms
482
+ # [UrlCategorise DEBUG] Downloaded 1500 hosts from https://example.com/malware.txt
483
+ # [UrlCategorise DEBUG] Total unique hosts collected: 45000
484
+ # [UrlCategorise DEBUG] Host lists loading completed in 2543.21ms
485
+ # [UrlCategorise DEBUG] Client initialization completed
486
+ ```
487
+
488
+ **Debug features include:**
489
+ - Initialization timing and progress
490
+ - Host list loading with individual URL timing
491
+ - Cache hit/miss information
492
+ - Download progress and host counts
493
+ - Dataset loading progress (when datasets are enabled)
494
+ - Regex pattern loading information
495
+ - Total timing for major operations
496
+ - Off by default, easily enabled via constructor or ActiveAttr
497
+
498
+ **Timing accuracy:**
499
+ - Millisecond precision timing
500
+ - Individual operation timing
501
+ - Cumulative timing for complex operations
502
+ - Helpful for performance optimization and debugging
503
+
349
504
  ### Smart Categorization (Post-Processing)
350
505
 
351
506
  Smart categorization solves the problem of overly broad domain-level categorization. For example, `reddit.com` might appear in health & fitness blocklists, but not all Reddit content is health-related.
@@ -559,8 +714,11 @@ end
559
714
 
560
715
  Use the included script to check all URLs:
561
716
  ```bash
562
- # Check all URLs in constants
717
+ # Check all URLs in constants (sequential)
563
718
  ruby bin/check_lists
719
+
720
+ # Check URLs in parallel with custom thread count
721
+ ruby bin/check_lists --parallel --threads 12 --verbose
564
722
  ```
565
723
 
566
724
  [View all 60+ categories in constants.rb](lib/url_categorise/constants.rb)
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ Rake::TestTask.new(:test) do |t|
6
6
  t.libs << 'test'
7
7
  t.libs << 'lib'
8
8
  t.test_files = FileList['test/**/*_test.rb']
9
- t.ruby_opts = ['-rbundler/setup']
9
+ t.ruby_opts = [ '-rbundler/setup' ]
10
10
  end
11
11
 
12
12
  task default: :test