UrlCategorise 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +9 -2
- data/.gitignore +1 -0
- data/.rubocop.yml +56 -0
- data/.sublime-project +14 -0
- data/Gemfile +2 -2
- data/Gemfile.lock +63 -14
- data/README.md +167 -9
- data/Rakefile +1 -1
- data/bin/check_lists +124 -22
- data/bin/export_csv +45 -36
- data/bin/export_hosts +34 -24
- data/bin/generate_categorised_lists +533 -0
- data/bin/generate_social_media_lists +398 -0
- data/bin/generate_video_lists +303 -124
- data/correct_usage_example.rb +18 -18
- data/lib/url_categorise/active_record_client.rb +11 -11
- data/lib/url_categorise/client.rb +843 -225
- data/lib/url_categorise/constants.rb +148 -70
- data/lib/url_categorise/dataset_processor.rb +61 -63
- data/lib/url_categorise/iab_compliance.rb +111 -99
- data/lib/url_categorise/models.rb +15 -15
- data/lib/url_categorise/version.rb +1 -1
- data/lib/url_categorise.rb +14 -14
- data/lists/blogging_domains.hosts +62 -0
- data/lists/crypto_domains.hosts +46 -0
- data/lists/developer_domains.hosts +108 -0
- data/lists/forum_domains.hosts +65 -0
- data/lists/gaming_domains.hosts +99 -0
- data/lists/messaging_domains.hosts +96 -0
- data/lists/music_domains.hosts +62 -0
- data/lists/security_domains.hosts +48 -0
- data/lists/social_media_domains.hosts +443 -0
- data/lists/streaming_domains.hosts +41 -0
- data/lists/video_hosting_domains.hosts +189 -167
- data/lists/video_url_patterns.txt +15 -15
- data/url_categorise.gemspec +9 -7
- metadata +66 -18
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c22ada722efc33979930ece5d8737625943f1d815d2c7acdbfbc0f5b0fede15f
|
|
4
|
+
data.tar.gz: 15747c8a4b23c4c805cd74118627dfa8d102d4d813ae9f08c8c49011ff2ebad8
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b1c21b59b8a7631c4c39f206fb37bb9c51c1925a544ddd5f8a627b595dafa92d366d880a33c6303eafc477ae9be4130055af060953e27617e5ce198618e5f9bd
|
|
7
|
+
data.tar.gz: 1621c52578daf8957546c778125df120f9480dbe2f2d365feb6024890ddfe5affac3aae77100047ea46b15c96939db1fc228b876806197c63dda954199536bc9
|
data/.claude/settings.local.json
CHANGED
|
@@ -16,8 +16,15 @@
|
|
|
16
16
|
"Bash(timeout:*)",
|
|
17
17
|
"Bash(DEBUG=1 timeout 300 ruby correct_usage_example.rb)",
|
|
18
18
|
"Bash(chmod:*)",
|
|
19
|
-
"Bash(bundle exec bin/export_csv:*)"
|
|
19
|
+
"Bash(bundle exec bin/export_csv:*)",
|
|
20
|
+
"Bash(bundle exec rg:*)",
|
|
21
|
+
"WebFetch(domain:github.com)",
|
|
22
|
+
"WebFetch(domain:api.github.com)",
|
|
23
|
+
"WebFetch(domain:raw.githubusercontent.com)",
|
|
24
|
+
"WebSearch",
|
|
25
|
+
"WebFetch(domain:firebog.net)",
|
|
26
|
+
"Bash(curl:*)"
|
|
20
27
|
],
|
|
21
28
|
"deny": []
|
|
22
29
|
}
|
|
23
|
-
}
|
|
30
|
+
}
|
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
inherit_gem:
|
|
2
|
+
rubocop-rails-omakase: rubocop.yml
|
|
3
|
+
|
|
4
|
+
AllCops:
|
|
5
|
+
TargetRubyVersion: 3.0
|
|
6
|
+
NewCops: enable
|
|
7
|
+
Exclude:
|
|
8
|
+
- 'bin/*'
|
|
9
|
+
- 'vendor/**/*'
|
|
10
|
+
- 'tmp/**/*'
|
|
11
|
+
- 'db/migrate/*'
|
|
12
|
+
|
|
13
|
+
# Allow longer lines for URL constants
|
|
14
|
+
Layout/LineLength:
|
|
15
|
+
Max: 200
|
|
16
|
+
Exclude:
|
|
17
|
+
- 'lib/url_categorise/constants.rb'
|
|
18
|
+
- 'test/**/*'
|
|
19
|
+
|
|
20
|
+
# Allow complex methods in client due to categorization logic
|
|
21
|
+
Metrics/MethodLength:
|
|
22
|
+
Max: 30
|
|
23
|
+
Exclude:
|
|
24
|
+
- 'lib/url_categorise/client.rb'
|
|
25
|
+
- 'test/**/*'
|
|
26
|
+
|
|
27
|
+
# Allow complex classes for main client
|
|
28
|
+
Metrics/ClassLength:
|
|
29
|
+
Max: 500
|
|
30
|
+
Exclude:
|
|
31
|
+
- 'lib/url_categorise/client.rb'
|
|
32
|
+
- 'test/**/*'
|
|
33
|
+
|
|
34
|
+
# Allow higher complexity for categorization methods
|
|
35
|
+
Metrics/CyclomaticComplexity:
|
|
36
|
+
Max: 15
|
|
37
|
+
Exclude:
|
|
38
|
+
- 'lib/url_categorise/client.rb'
|
|
39
|
+
|
|
40
|
+
# Allow higher ABC size for complex categorization logic
|
|
41
|
+
Metrics/AbcSize:
|
|
42
|
+
Max: 25
|
|
43
|
+
Exclude:
|
|
44
|
+
- 'lib/url_categorise/client.rb'
|
|
45
|
+
- 'test/**/*'
|
|
46
|
+
|
|
47
|
+
# Allow higher parameter count for client initialization
|
|
48
|
+
Metrics/ParameterLists:
|
|
49
|
+
Max: 8
|
|
50
|
+
|
|
51
|
+
# Exempt tests, constants, and the gemspec from the block-length limit
|
|
52
|
+
Metrics/BlockLength:
|
|
53
|
+
Exclude:
|
|
54
|
+
- 'test/**/*'
|
|
55
|
+
- 'lib/url_categorise/constants.rb'
|
|
56
|
+
- 'url_categorise.gemspec'
|
data/.sublime-project
ADDED
data/Gemfile
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
source
|
|
1
|
+
source "https://rubygems.org"
|
|
2
2
|
|
|
3
|
-
git_source(:github) { |_repo_name|
|
|
3
|
+
git_source(:github) { |_repo_name| "https://github.com/TRex22/url_categorise" }
|
|
4
4
|
|
|
5
5
|
# Specify your gem's dependencies in url_categorise.gemspec
|
|
6
6
|
gemspec
|
data/Gemfile.lock
CHANGED
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
UrlCategorise (0.1.
|
|
4
|
+
UrlCategorise (0.1.9)
|
|
5
5
|
active_attr (>= 0.17.1, < 1.0)
|
|
6
6
|
api_pattern (>= 0.0.6, < 1.0)
|
|
7
7
|
csv (>= 3.3.0, < 4.0)
|
|
8
8
|
digest (>= 3.1.0, < 4.0)
|
|
9
9
|
fileutils (>= 1.7.0, < 2.0)
|
|
10
|
-
httparty (>= 0.
|
|
10
|
+
httparty (>= 0.24.0, < 1.0)
|
|
11
11
|
json (>= 2.7.0, < 3.0)
|
|
12
12
|
kaggle (>= 0.0.3, < 1.0)
|
|
13
|
-
nokogiri (>= 1.
|
|
14
|
-
reline (>= 0.6.2)
|
|
13
|
+
nokogiri (>= 1.19.1, < 2.0)
|
|
14
|
+
reline (>= 0.6.2, < 2.0)
|
|
15
15
|
resolv (>= 0.4.0, < 1.0)
|
|
16
16
|
rubyzip (>= 2.3.0, < 3.0)
|
|
17
17
|
|
|
@@ -65,10 +65,14 @@ GEM
|
|
|
65
65
|
csv (>= 3.3.0)
|
|
66
66
|
httparty (>= 0.22.0)
|
|
67
67
|
nokogiri (>= 1.16.0)
|
|
68
|
+
ast (2.4.3)
|
|
68
69
|
base64 (0.3.0)
|
|
69
70
|
benchmark (0.4.1)
|
|
70
|
-
bigdecimal (
|
|
71
|
+
bigdecimal (4.1.0)
|
|
71
72
|
builder (3.3.0)
|
|
73
|
+
bundler-audit (0.9.3)
|
|
74
|
+
bundler (>= 1.2.0)
|
|
75
|
+
thor (~> 1.0)
|
|
72
76
|
coderay (1.1.3)
|
|
73
77
|
concurrent-ruby (1.3.5)
|
|
74
78
|
connection_pool (2.5.3)
|
|
@@ -83,7 +87,7 @@ GEM
|
|
|
83
87
|
erubi (1.13.1)
|
|
84
88
|
fileutils (1.7.3)
|
|
85
89
|
hashdiff (1.2.0)
|
|
86
|
-
httparty (0.
|
|
90
|
+
httparty (0.24.2)
|
|
87
91
|
csv
|
|
88
92
|
mini_mime (>= 1.0.0)
|
|
89
93
|
multi_xml (>= 0.5.2)
|
|
@@ -97,6 +101,8 @@ GEM
|
|
|
97
101
|
httparty (>= 0.23)
|
|
98
102
|
oj (= 3.16.11)
|
|
99
103
|
rubyzip (>= 2.0)
|
|
104
|
+
language_server-protocol (3.17.0.5)
|
|
105
|
+
lint_roller (1.1.0)
|
|
100
106
|
logger (1.7.0)
|
|
101
107
|
loofah (2.24.1)
|
|
102
108
|
crass (~> 1.0.2)
|
|
@@ -113,22 +119,28 @@ GEM
|
|
|
113
119
|
ruby-progressbar
|
|
114
120
|
mocha (2.4.5)
|
|
115
121
|
ruby2_keywords (>= 0.0.5)
|
|
116
|
-
multi_xml (0.
|
|
117
|
-
bigdecimal (
|
|
118
|
-
nokogiri (1.
|
|
122
|
+
multi_xml (0.8.1)
|
|
123
|
+
bigdecimal (>= 3.1, < 5)
|
|
124
|
+
nokogiri (1.19.2-arm64-darwin)
|
|
119
125
|
racc (~> 1.4)
|
|
120
126
|
oj (3.16.11)
|
|
121
127
|
bigdecimal (>= 3.0)
|
|
122
128
|
ostruct (>= 0.2)
|
|
123
129
|
ostruct (0.6.3)
|
|
130
|
+
parallel (1.27.0)
|
|
131
|
+
parser (3.3.9.0)
|
|
132
|
+
ast (~> 2.4.1)
|
|
133
|
+
racc
|
|
134
|
+
prism (1.4.0)
|
|
124
135
|
pry (0.15.2)
|
|
125
136
|
coderay (~> 1.1)
|
|
126
137
|
method_source (~> 1.0)
|
|
127
138
|
public_suffix (6.0.2)
|
|
128
139
|
racc (1.8.1)
|
|
129
|
-
rack (
|
|
130
|
-
rack-session (1.
|
|
131
|
-
|
|
140
|
+
rack (3.2.5)
|
|
141
|
+
rack-session (2.1.1)
|
|
142
|
+
base64 (>= 0.1.0)
|
|
143
|
+
rack (>= 3.0.0)
|
|
132
144
|
rack-test (2.2.0)
|
|
133
145
|
rack (>= 1.3)
|
|
134
146
|
rails-dom-testing (2.3.0)
|
|
@@ -138,11 +150,41 @@ GEM
|
|
|
138
150
|
rails-html-sanitizer (1.6.2)
|
|
139
151
|
loofah (~> 2.21)
|
|
140
152
|
nokogiri (>= 1.15.7, != 1.16.7, != 1.16.6, != 1.16.5, != 1.16.4, != 1.16.3, != 1.16.2, != 1.16.1, != 1.16.0.rc1, != 1.16.0)
|
|
153
|
+
rainbow (3.1.1)
|
|
141
154
|
rake (13.3.0)
|
|
155
|
+
regexp_parser (2.11.2)
|
|
142
156
|
reline (0.6.2)
|
|
143
157
|
io-console (~> 0.5)
|
|
144
158
|
resolv (0.6.2)
|
|
145
|
-
rexml (3.4.
|
|
159
|
+
rexml (3.4.4)
|
|
160
|
+
rubocop (1.80.1)
|
|
161
|
+
json (~> 2.3)
|
|
162
|
+
language_server-protocol (~> 3.17.0.2)
|
|
163
|
+
lint_roller (~> 1.1.0)
|
|
164
|
+
parallel (~> 1.10)
|
|
165
|
+
parser (>= 3.3.0.2)
|
|
166
|
+
rainbow (>= 2.2.2, < 4.0)
|
|
167
|
+
regexp_parser (>= 2.9.3, < 3.0)
|
|
168
|
+
rubocop-ast (>= 1.46.0, < 2.0)
|
|
169
|
+
ruby-progressbar (~> 1.7)
|
|
170
|
+
unicode-display_width (>= 2.4.0, < 4.0)
|
|
171
|
+
rubocop-ast (1.46.0)
|
|
172
|
+
parser (>= 3.3.7.2)
|
|
173
|
+
prism (~> 1.4)
|
|
174
|
+
rubocop-performance (1.25.0)
|
|
175
|
+
lint_roller (~> 1.1)
|
|
176
|
+
rubocop (>= 1.75.0, < 2.0)
|
|
177
|
+
rubocop-ast (>= 1.38.0, < 2.0)
|
|
178
|
+
rubocop-rails (2.33.3)
|
|
179
|
+
activesupport (>= 4.2.0)
|
|
180
|
+
lint_roller (~> 1.1)
|
|
181
|
+
rack (>= 1.1)
|
|
182
|
+
rubocop (>= 1.75.0, < 2.0)
|
|
183
|
+
rubocop-ast (>= 1.44.0, < 2.0)
|
|
184
|
+
rubocop-rails-omakase (1.1.0)
|
|
185
|
+
rubocop (>= 1.72)
|
|
186
|
+
rubocop-performance (>= 1.24)
|
|
187
|
+
rubocop-rails (>= 2.30)
|
|
146
188
|
ruby-progressbar (1.13.0)
|
|
147
189
|
ruby2_keywords (0.0.5)
|
|
148
190
|
rubyzip (2.4.1)
|
|
@@ -154,11 +196,15 @@ GEM
|
|
|
154
196
|
simplecov-html (0.13.2)
|
|
155
197
|
simplecov_json_formatter (0.1.4)
|
|
156
198
|
sqlite3 (2.7.3-arm64-darwin)
|
|
199
|
+
thor (1.5.0)
|
|
157
200
|
timecop (0.9.10)
|
|
158
201
|
timeout (0.4.3)
|
|
159
202
|
tzinfo (2.0.6)
|
|
160
203
|
concurrent-ruby (~> 1.0)
|
|
161
|
-
|
|
204
|
+
unicode-display_width (3.1.5)
|
|
205
|
+
unicode-emoji (~> 4.0, >= 4.0.4)
|
|
206
|
+
unicode-emoji (4.0.4)
|
|
207
|
+
uri (1.1.1)
|
|
162
208
|
useragent (0.16.11)
|
|
163
209
|
webmock (3.24.0)
|
|
164
210
|
addressable (>= 2.8.0)
|
|
@@ -167,10 +213,12 @@ GEM
|
|
|
167
213
|
|
|
168
214
|
PLATFORMS
|
|
169
215
|
arm64-darwin-24
|
|
216
|
+
arm64-darwin-25
|
|
170
217
|
|
|
171
218
|
DEPENDENCIES
|
|
172
219
|
UrlCategorise!
|
|
173
220
|
activerecord (>= 8.0)
|
|
221
|
+
bundler-audit (~> 0.9)
|
|
174
222
|
logger
|
|
175
223
|
minitest (~> 5.25.5)
|
|
176
224
|
minitest-focus (~> 1.4.0)
|
|
@@ -178,6 +226,7 @@ DEPENDENCIES
|
|
|
178
226
|
mocha (~> 2.4.5)
|
|
179
227
|
pry (~> 0.15.2)
|
|
180
228
|
rake (~> 13.3.0)
|
|
229
|
+
rubocop-rails-omakase (~> 1.0)
|
|
181
230
|
simplecov (~> 0.22.0)
|
|
182
231
|
sqlite3 (>= 2.7)
|
|
183
232
|
timecop (~> 0.9.10)
|
data/README.md
CHANGED
|
@@ -6,6 +6,8 @@ A comprehensive Ruby gem for categorizing URLs and domains based on various secu
|
|
|
6
6
|
|
|
7
7
|
- **Comprehensive Coverage**: 60+ high-quality categories including security, content, and specialized lists
|
|
8
8
|
- **Video Content Detection**: Advanced regex-based categorization with `video_url?` method to distinguish video content from other website resources
|
|
9
|
+
- **Blog Content Detection**: Simple string-based `blog_url?` method to identify blog-related URLs and content
|
|
10
|
+
- **Debug Mode**: Comprehensive debug logging with timing information for initialization and operations
|
|
9
11
|
- **Custom Video Lists**: Generate and maintain comprehensive video hosting domain lists using yt-dlp extractors
|
|
10
12
|
- **Kaggle Dataset Integration**: Automatic loading and processing of machine learning datasets from Kaggle
|
|
11
13
|
- **Multiple Data Sources**: Supports blocklists, CSV datasets, and Kaggle ML datasets
|
|
@@ -162,11 +164,11 @@ $ bundle exec export_hosts --output /tmp/hosts --verbose
|
|
|
162
164
|
# Export CSV data with all features enabled
|
|
163
165
|
$ bundle exec export_csv --output /tmp/csv --iab-compliance --smart-categorization --auto-load-datasets --verbose
|
|
164
166
|
|
|
165
|
-
# Generate updated video hosting lists
|
|
166
|
-
$ ruby bin/generate_video_lists
|
|
167
|
+
# Generate updated video hosting lists (with custom worker configuration)
|
|
168
|
+
$ ruby bin/generate_video_lists --workers 8 --batch-size 50
|
|
167
169
|
|
|
168
|
-
# Check health of all blocklist URLs
|
|
169
|
-
$ bundle exec check_lists
|
|
170
|
+
# Check health of all blocklist URLs (with parallel processing)
|
|
171
|
+
$ bundle exec check_lists --parallel --threads 16
|
|
170
172
|
|
|
171
173
|
# Export with custom Kaggle credentials
|
|
172
174
|
$ bundle exec export_csv --auto-load-datasets --kaggle-credentials ~/my-kaggle.json --verbose
|
|
@@ -183,6 +185,8 @@ $ bundle exec check_lists
|
|
|
183
185
|
- `--kaggle-credentials FILE`: Specify custom Kaggle credentials file
|
|
184
186
|
- `--iab-compliance`: Enable IAB Content Taxonomy mapping
|
|
185
187
|
- `--smart-categorization`: Enable intelligent category filtering
|
|
188
|
+
- `--parallel` / `--threads NUM`: Enable parallel URL checking with configurable thread count
|
|
189
|
+
- `--workers NUM`: Configure number of Ractor workers for video list generation
|
|
186
190
|
|
|
187
191
|
## Advanced Configuration
|
|
188
192
|
|
|
@@ -244,10 +248,32 @@ client = UrlCategorise::Client.new(
|
|
|
244
248
|
iab_compliance: true, # Enable IAB compliance
|
|
245
249
|
iab_version: :v3, # Use IAB Content Taxonomy v3.0
|
|
246
250
|
auto_load_datasets: false, # Disable automatic dataset loading (default)
|
|
247
|
-
smart_categorization: false
|
|
251
|
+
smart_categorization: false, # Disable smart post-processing (default)
|
|
252
|
+
max_threads: 8, # Max threads for parallel processing (default: 8)
|
|
253
|
+
max_ractor_workers: 4, # Max Ractor workers for parallel processing (default: 4 or CPU count)
|
|
254
|
+
parallel_loading: true # Enable parallel loading when available (default: true)
|
|
248
255
|
)
|
|
249
256
|
```
|
|
250
257
|
|
|
258
|
+
### Parallel Processing Configuration
|
|
259
|
+
|
|
260
|
+
The gem uses Ruby Ractors (when available) or Threads for parallel processing to significantly improve performance:
|
|
261
|
+
|
|
262
|
+
```ruby
|
|
263
|
+
# Configure parallel processing parameters
|
|
264
|
+
client = UrlCategorise::Client.new(
|
|
265
|
+
parallel_loading: true, # Enable/disable parallel loading (default: true if Ractors available)
|
|
266
|
+
max_threads: 16, # Maximum threads for thread-based processing (default: 8)
|
|
267
|
+
max_ractor_workers: 8 # Maximum Ractor workers for Ractor-based processing (default: 4 or CPU count)
|
|
268
|
+
)
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
**Performance Notes:**
|
|
272
|
+
- **Ractors**: Used by default on Ruby 3.0+ when available, providing better isolation and performance
|
|
273
|
+
- **Threads**: Fallback method when Ractors are unavailable, still provides significant speedup
|
|
274
|
+
- **Test Environment**: Automatically switches to sequential processing during tests to avoid issues
|
|
275
|
+
- **Worker Pools**: Both Ractors and Threads use worker pool patterns to prevent resource exhaustion
|
|
276
|
+
|
|
251
277
|
### Custom Lists
|
|
252
278
|
|
|
253
279
|
Use your own curated lists or subset of categories:
|
|
@@ -273,7 +299,7 @@ The gem maintains a comprehensive list of video hosting domains extracted from y
|
|
|
273
299
|
|
|
274
300
|
```ruby
|
|
275
301
|
# Generate/update video hosting lists
|
|
276
|
-
system("ruby bin/generate_video_lists")
|
|
302
|
+
system("ruby bin/generate_video_lists --workers 4 --batch-size 25")
|
|
277
303
|
|
|
278
304
|
# Use video hosting categorization
|
|
279
305
|
client = UrlCategorise::Client.new
|
|
@@ -331,13 +357,56 @@ client.video_url?("https://google.com/search?q=cats") # => false
|
|
|
331
357
|
3. Returns `true` only if both conditions are met
|
|
332
358
|
4. Handles invalid URLs gracefully (returns `false`)
|
|
333
359
|
|
|
360
|
+
#### Additional Video URL Helper Methods
|
|
361
|
+
|
|
362
|
+
The gem provides specialized helper methods for different types of video content:
|
|
363
|
+
|
|
364
|
+
```ruby
|
|
365
|
+
client = UrlCategorise::Client.new(regex_categorization: true)
|
|
366
|
+
|
|
367
|
+
# Detect short-form video content
|
|
368
|
+
client.shorts_url?("https://youtube.com/shorts/abc123defgh") # => true
|
|
369
|
+
client.shorts_url?("https://tiktok.com/@user/video/123456789") # => true
|
|
370
|
+
client.shorts_url?("https://youtube.com/watch?v=test123") # => false
|
|
371
|
+
|
|
372
|
+
# Detect playlist URLs
|
|
373
|
+
client.playlist_url?("https://youtube.com/playlist?list=PLtest123") # => true
|
|
374
|
+
client.playlist_url?("https://youtube.com/watch?v=abc123&list=PLtest123") # => true
|
|
375
|
+
client.playlist_url?("https://vimeo.com/album/123456") # => true
|
|
376
|
+
client.playlist_url?("https://youtube.com/watch?v=test123") # => false
|
|
377
|
+
|
|
378
|
+
# Detect music content (works with video platforms hosting music)
|
|
379
|
+
client.music_url?("https://music.youtube.com/watch?v=abc123") # => true
|
|
380
|
+
client.music_url?("https://youtube.com/watch?v=abc123defgh&list=PLmusic") # => true
|
|
381
|
+
client.music_url?("https://youtube.com/c/musicchannel") # => true
|
|
382
|
+
client.music_url?("https://youtube.com/watch?v=regularvideo") # => false
|
|
383
|
+
|
|
384
|
+
# Detect channel/profile URLs
|
|
385
|
+
client.channel_url?("https://youtube.com/@channelname") # => true
|
|
386
|
+
client.channel_url?("https://tiktok.com/@username") # => true
|
|
387
|
+
client.channel_url?("https://twitch.tv/streamername") # => true
|
|
388
|
+
client.channel_url?("https://youtube.com/watch?v=test123") # => false
|
|
389
|
+
|
|
390
|
+
# Detect live stream URLs
|
|
391
|
+
client.live_stream_url?("https://youtube.com/live/streamid") # => true
|
|
392
|
+
client.live_stream_url?("https://twitch.tv/streamername") # => true
|
|
393
|
+
client.live_stream_url?("https://youtube.com/watch?v=test123") # => false
|
|
394
|
+
```
|
|
395
|
+
|
|
396
|
+
**All helper methods:**
|
|
397
|
+
- Require `regex_categorization: true` to be enabled
|
|
398
|
+
- First verify the URL is from a video hosting domain
|
|
399
|
+
- Use specific regex patterns for accurate detection
|
|
400
|
+
- Handle invalid URLs gracefully (return `false`)
|
|
401
|
+
- Work across multiple video platforms (YouTube, TikTok, Vimeo, Twitch, etc.)
|
|
402
|
+
|
|
334
403
|
#### Maintaining Video Lists
|
|
335
404
|
|
|
336
405
|
The gem includes a script to generate and maintain comprehensive video hosting lists:
|
|
337
406
|
|
|
338
407
|
```bash
|
|
339
|
-
# Generate updated video hosting lists
|
|
340
|
-
ruby bin/generate_video_lists
|
|
408
|
+
# Generate updated video hosting lists (configurable parallel processing)
|
|
409
|
+
ruby bin/generate_video_lists --workers 8 --batch-size 50 --threshold 20
|
|
341
410
|
|
|
342
411
|
# This creates:
|
|
343
412
|
# - lists/video_hosting_domains.hosts (PiHole compatible)
|
|
@@ -346,6 +415,92 @@ ruby bin/generate_video_lists
|
|
|
346
415
|
|
|
347
416
|
The script fetches data from yt-dlp extractors and combines it with manually curated major platforms to ensure comprehensive coverage.
|
|
348
417
|
|
|
418
|
+
### Blog Content Detection
|
|
419
|
+
|
|
420
|
+
The gem provides a `blog_url?` method to identify blog-related URLs using simple string matching patterns:
|
|
421
|
+
|
|
422
|
+
```ruby
|
|
423
|
+
client = UrlCategorise::Client.new
|
|
424
|
+
|
|
425
|
+
# Basic blog path detection
|
|
426
|
+
client.blog_url?("https://example.com/blog/") # => true
|
|
427
|
+
client.blog_url?("https://example.com/blogs/tech") # => true
|
|
428
|
+
client.blog_url?("https://example.com/blog?page=1") # => true
|
|
429
|
+
|
|
430
|
+
# Blog subdomains
|
|
431
|
+
client.blog_url?("https://blog.example.com/") # => true
|
|
432
|
+
client.blog_url?("https://blog.company.org/article") # => true
|
|
433
|
+
|
|
434
|
+
# Blog platforms
|
|
435
|
+
client.blog_url?("https://example.wordpress.com/") # => true
|
|
436
|
+
client.blog_url?("https://example.blogspot.com/post") # => true
|
|
437
|
+
client.blog_url?("https://medium.com/@user/article") # => true
|
|
438
|
+
client.blog_url?("https://user.substack.com/p/post") # => true
|
|
439
|
+
|
|
440
|
+
# Blog-like content paths
|
|
441
|
+
client.blog_url?("https://example.com/post/123") # => true
|
|
442
|
+
client.blog_url?("https://example.com/articles/tech") # => true
|
|
443
|
+
client.blog_url?("https://example.com/diary/entry") # => true
|
|
444
|
+
|
|
445
|
+
# Blog keywords in URLs
|
|
446
|
+
client.blog_url?("https://example.com/corporate-blog") # => true
|
|
447
|
+
client.blog_url?("https://example-blog.com/") # => true
|
|
448
|
+
|
|
449
|
+
# Non-blog URLs return false
|
|
450
|
+
client.blog_url?("https://example.com/") # => false
|
|
451
|
+
client.blog_url?("https://example.com/products") # => false
|
|
452
|
+
```
|
|
453
|
+
|
|
454
|
+
**Detection patterns and behavior:**
|
|
455
|
+
- `/blog/` or `/blogs/` in URL paths
|
|
456
|
+
- `blog.` subdomains
|
|
457
|
+
- `blog-` or `-blog` in domain names
|
|
458
|
+
- Common blog platforms (WordPress, Blogspot, Medium, Substack)
|
|
459
|
+
- Blog-like content paths (`/post/`, `/articles/`, `/diary/`, `/journal/`)
|
|
460
|
+
- The word "blog" anywhere in the URL
|
|
461
|
+
- Case-insensitive matching
|
|
462
|
+
- Graceful handling of invalid URLs
|
|
463
|
+
|
|
464
|
+
### Debug Mode
|
|
465
|
+
|
|
466
|
+
The gem includes comprehensive debug functionality to help you understand what's happening during initialization and operation:
|
|
467
|
+
|
|
468
|
+
```ruby
|
|
469
|
+
# Enable debug mode during initialization
|
|
470
|
+
client = UrlCategorise::Client.new(debug: true)
|
|
471
|
+
|
|
472
|
+
# Or enable it after construction via the `debug_enabled` attribute (provided by ActiveAttr)
|
|
473
|
+
client = UrlCategorise::Client.new
|
|
474
|
+
client.debug_enabled = true
|
|
475
|
+
|
|
476
|
+
# Debug output shows:
|
|
477
|
+
# [UrlCategorise DEBUG] Initializing UrlCategorise Client with debug enabled
|
|
478
|
+
# [UrlCategorise DEBUG] Loading host lists from 15 categories
|
|
479
|
+
# [UrlCategorise DEBUG] Processing host list: https://example.com/malware.txt
|
|
480
|
+
# [UrlCategorise DEBUG] Cache miss for https://example.com/malware.txt
|
|
481
|
+
# [UrlCategorise DEBUG] Downloading and parsing https://example.com/malware.txt completed in 234.56ms
|
|
482
|
+
# [UrlCategorise DEBUG] Downloaded 1500 hosts from https://example.com/malware.txt
|
|
483
|
+
# [UrlCategorise DEBUG] Total unique hosts collected: 45000
|
|
484
|
+
# [UrlCategorise DEBUG] Host lists loading completed in 2543.21ms
|
|
485
|
+
# [UrlCategorise DEBUG] Client initialization completed
|
|
486
|
+
```
|
|
487
|
+
|
|
488
|
+
**Debug features include:**
|
|
489
|
+
- Initialization timing and progress
|
|
490
|
+
- Host list loading with individual URL timing
|
|
491
|
+
- Cache hit/miss information
|
|
492
|
+
- Download progress and host counts
|
|
493
|
+
- Dataset loading progress (when datasets are enabled)
|
|
494
|
+
- Regex pattern loading information
|
|
495
|
+
- Total timing for major operations
|
|
496
|
+
- Off by default; enable with the `debug: true` constructor option or the `debug_enabled` attribute setter
|
|
497
|
+
|
|
498
|
+
**Timing accuracy:**
|
|
499
|
+
- Millisecond precision timing
|
|
500
|
+
- Individual operation timing
|
|
501
|
+
- Cumulative timing for complex operations
|
|
502
|
+
- Helpful for performance optimization and debugging
|
|
503
|
+
|
|
349
504
|
### Smart Categorization (Post-Processing)
|
|
350
505
|
|
|
351
506
|
Smart categorization solves the problem of overly broad domain-level categorization. For example, `reddit.com` might appear in health & fitness blocklists, but not all Reddit content is health-related.
|
|
@@ -559,8 +714,11 @@ end
|
|
|
559
714
|
|
|
560
715
|
Use the included script to check all URLs:
|
|
561
716
|
```bash
|
|
562
|
-
# Check all URLs in constants
|
|
717
|
+
# Check all URLs in constants (sequential)
|
|
563
718
|
ruby bin/check_lists
|
|
719
|
+
|
|
720
|
+
# Check URLs in parallel with custom thread count
|
|
721
|
+
ruby bin/check_lists --parallel --threads 12 --verbose
|
|
564
722
|
```
|
|
565
723
|
|
|
566
724
|
[View all 60+ categories in constants.rb](lib/url_categorise/constants.rb)
|