UrlCategorise 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +2 -1
- data/.gitignore +1 -0
- data/CLAUDE.md +71 -8
- data/Gemfile.lock +5 -1
- data/README.md +129 -11
- data/bin/export_csv +44 -7
- data/bin/generate_video_lists +373 -0
- data/docs/video-url-detection.md +353 -0
- data/lib/url_categorise/client.rb +320 -58
- data/lib/url_categorise/constants.rb +9 -6
- data/lib/url_categorise/dataset_processor.rb +18 -6
- data/lib/url_categorise/iab_compliance.rb +2 -0
- data/lib/url_categorise/version.rb +1 -1
- data/lists/video_hosting_domains.hosts +7057 -0
- data/lists/video_url_patterns.txt +297 -0
- data/url_categorise.gemspec +1 -0
- metadata +19 -1
@@ -0,0 +1,373 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'net/http'
|
5
|
+
require 'uri'
|
6
|
+
require 'json'
|
7
|
+
require 'fileutils'
|
8
|
+
require 'time'
|
9
|
+
require 'set'
|
10
|
+
|
11
|
+
module VideoListGenerator
  # Builds PiHole-compatible block lists and URL-detection regex lists for
  # video hosting sites. Domains come from a manually curated core list plus
  # domains mined from the yt-dlp extractor sources (fetched via the GitHub
  # contents API); regexes are the extractors' _VALID_URL patterns, cleaned
  # just enough to compile as Ruby regexes.
  class ExtractorParser
    # GitHub contents API listing of yt-dlp's extractor modules.
    YT_DLP_API_URL = 'https://api.github.com/repos/yt-dlp/yt-dlp/contents/yt_dlp/extractor'

    # Matches `_VALID_URL = r'...'` (or double-quoted) assignments in the
    # Python extractor sources; capture group 1 is the regex body.
    VALID_URL_ASSIGNMENT = /_VALID_URL\s*=\s*r?['"]([^'"]+)['"]/m

    def initialize
      @video_domains = Set.new
      @video_regexes = []
    end

    # Entry point: collects domains and URL patterns, then writes
    # lists/video_hosting_domains.hosts and lists/video_url_patterns.txt.
    # Network failures degrade gracefully to the manual domain list only.
    def generate_lists
      puts "🎬 Generating video hosting lists from yt-dlp extractors..."

      puts "📋 Adding manually curated video hosting domains..."
      add_manual_domains

      puts "📡 Fetching yt-dlp extractor data..."
      fetch_extractor_data

      create_lists_directory
      generate_hosts_file
      generate_regex_file

      puts "✅ Video hosting lists generated successfully!"
      puts "📁 Files created in lists/ directory:"
      puts " - video_hosting_domains.hosts (#{@video_domains.size} domains)"
      puts " - video_url_patterns.txt (#{@video_regexes.size} patterns)"
    end

    private

    # Seeds the domain set with well-known platforms so the output is useful
    # even when the GitHub API is unreachable.
    def add_manual_domains
      core_domains = %w[
        youtube.com youtu.be youtube-nocookie.com
        vimeo.com player.vimeo.com vimeopro.com
        dailymotion.com dai.ly
        twitch.tv clips.twitch.tv
        tiktok.com vm.tiktok.com
        rumble.com
        odysee.com lbry.tv
        bitchute.com
        peertube.tv
        archive.org
        vevo.com
        streamable.com
        wistia.com fast.wistia.com fast.wistia.net
      ]

      @video_domains.merge(core_domains)
    end

    # Downloads the extractor file listing and mines each Python module.
    # Returns early (with a warning) on any non-200 response.
    def fetch_extractor_data
      response = Net::HTTP.get_response(URI(YT_DLP_API_URL))

      unless response.code == '200'
        puts "⚠️ Could not fetch from yt-dlp API, using manual domains only"
        return
      end

      files = JSON.parse(response.body)

      # Only real extractor modules; skip the package initializer.
      extractor_files = files.select { |file| file['name'].end_with?('.py') && file['name'] != '__init__.py' }

      puts "🔍 Processing #{extractor_files.length} extractor files..."

      extractor_files.each_with_index do |file, index|
        print "\r\e[K" # Clear the entire progress line
        print "Processing #{index + 1}/#{extractor_files.length}: #{file['name']}"
        process_extractor_file(file)
      end

      puts "\n🎯 Found #{@video_domains.size} unique video hosting domains"
      puts "📝 Extracted #{@video_regexes.size} URL pattern regexes"
    end

    # Fetches one extractor source file and mines it for domains and regexes.
    # Errors are swallowed deliberately: one bad file should not abort the
    # whole run or spam the single-line progress output.
    def process_extractor_file(file_info)
      response = Net::HTTP.get_response(URI(file_info['download_url']))
      return unless response.code == '200'

      content = response.body

      extract_domains_from_content(content, file_info['name'])
      extract_regexes_from_content(content, file_info['name'])
    rescue StandardError
      # Silently skip errors to avoid spam
    end

    # Adds any domains found in an extractor's _VALID_URL patterns, IE_NAME
    # declarations, hardcoded URLs, or inferred from the filename itself.
    def extract_domains_from_content(content, filename)
      content.scan(VALID_URL_ASSIGNMENT).each do |(pattern)|
        extract_domains_from_regex(pattern).each { |domain| @video_domains.add(domain) }
      end

      # IE_NAME occasionally holds a bare domain (e.g. "foo.tv").
      content.scan(/IE_NAME\s*=\s*['"]([^'"]+)['"]/m).each do |(name)|
        next unless name && name.include?('.') && valid_domain?(name)

        @video_domains.add(name.downcase)
      end

      # Hardcoded absolute URLs anywhere in the source.
      content.scan(/['"]https?:\/\/(?:www\.)?([a-zA-Z0-9-]+\.[a-zA-Z]{2,})/i).each do |(domain)|
        next unless domain && valid_domain?(domain.downcase)

        @video_domains.add(domain.downcase)
      end

      infer_domain_from_filename(filename)
    end

    # Guesses a domain from the extractor filename (youtube.py -> youtube.com),
    # skipping generic infrastructure modules like common.py.
    def infer_domain_from_filename(filename)
      return unless filename.end_with?('.py')

      base_name = filename.delete_suffix('.py').downcase
      potential_domain =
        if base_name.include?('.')
          base_name
        elsif base_name.length > 3 && !%w[common generic base].include?(base_name)
          # Common pattern: youtube -> youtube.com, vimeo -> vimeo.com
          "#{base_name}.com"
        end

      @video_domains.add(potential_domain) if potential_domain && valid_domain?(potential_domain)
    end

    # Collects cleaned, compilable _VALID_URL regexes for the patterns file.
    def extract_regexes_from_content(content, filename)
      content.scan(VALID_URL_ASSIGNMENT).each do |(pattern)|
        cleaned_pattern = clean_regex_pattern(pattern)
        next unless cleaned_pattern && !cleaned_pattern.empty? && useful_pattern?(cleaned_pattern)

        @video_regexes << {
          source: filename.delete_suffix('.py'),
          pattern: cleaned_pattern,
          original: pattern
        }
      end
    end

    # Pulls candidate domains out of a (Python) URL regex by trying a series
    # of increasingly permissive sub-patterns. Returns an Array of lowercase
    # domain strings.
    def extract_domains_from_regex(pattern)
      domains = Set.new

      patterns_to_try = [
        # Standard URLs
        /https?:\/\/(?:www\.)?([a-zA-Z0-9-]+\.[a-zA-Z]{2,})/i,
        # Escaped URLs with backslashes
        /https?:\\\/\\\/(?:www\\.)?([a-zA-Z0-9-]+\\.[a-zA-Z]{2,})/i,
        # Domain patterns with optional www (escaped)
        /\(\?\:www\\\.\)\?([a-zA-Z0-9-]+\.[a-zA-Z]{2,})/i,
        # Simple domain references (very permissive fallback)
        /([a-zA-Z0-9-]+\.[a-zA-Z]{2,})/i
      ]

      patterns_to_try.each do |regex|
        pattern.scan(regex).each do |match|
          candidate = clean_domain(match.is_a?(Array) ? match[0] : match)
          next unless candidate

          candidate = candidate.downcase
          domains.add(candidate) if valid_domain?(candidate)
        end
      end

      domains.to_a
    end

    # Strips regex escapes/metacharacters from a captured domain candidate.
    # Returns nil for nil input.
    def clean_domain(domain)
      return nil unless domain

      # Unescape dots, then drop leftover regex punctuation.
      cleaned = domain.gsub(/\\\./, '.').gsub(/[\(\)\?\:\|\\]/, '').strip

      # Drop anything after surviving grouping syntax (only brackets can
      # survive the gsub above, but keep the paren check for safety).
      cleaned = cleaned.split(/[\(\)\[\]]/)[0] if cleaned.include?('(') || cleaned.include?('[')

      cleaned
    end

    # Simplifies a Python _VALID_URL regex so it compiles as a Ruby Regexp.
    # Returns nil when the cleaned pattern still fails to compile.
    def clean_regex_pattern(pattern)
      return nil unless pattern

      cleaned = pattern.dup

      # Remove lookaheads/lookbehinds that rarely matter for URL matching.
      cleaned = cleaned.gsub(/\(\?\![^)]*\)/, '')
      cleaned = cleaned.gsub(/\(\?\<\![^)]*\)/, '')

      # Normalise the common optional-www prefix and escaped dots.
      cleaned = cleaned.gsub(/\(\?\:www\\\.\)\?/, '(?:www\.)?')
      cleaned = cleaned.gsub(/\\\./, '\.')

      begin
        original_warning = $-w
        $-w = nil # Suppress warnings from odd extractor regexes
        Regexp.new(cleaned)
        cleaned
      rescue RegexpError
        nil
      ensure
        $-w = original_warning # Restore warnings
      end
    end

    # Rejects patterns too generic or malformed to help with URL detection.
    def useful_pattern?(pattern)
      return false if pattern == '.*'
      return false if pattern.length < 10
      return false if pattern.include?('blob:')
      return false if pattern.match?(/\$\s*$/) # ends with a bare anchor

      true
    end

    # Lightweight sanity filter for domain candidates. Anchored with \A/\z
    # (not ^/$) so a candidate containing a newline cannot sneak extra text
    # past the check.
    def valid_domain?(domain)
      return false unless domain
      return false if domain.length < 4
      return false unless domain.match?(/\A[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\z/)
      return false if domain.include?('example.') || domain.include?('test.')
      return false if domain.include?('localhost')
      return false if domain.end_with?('.html', '.htm')

      true
    end

    def create_lists_directory
      FileUtils.mkdir_p('lists')
    end

    # Writes the 0.0.0.0-style hosts file, adding a www. variant per domain.
    def generate_hosts_file
      hosts_file_path = 'lists/video_hosting_domains.hosts'

      File.open(hosts_file_path, 'w') do |file|
        write_hosts_header(file)

        @video_domains.sort.each do |domain|
          file.puts "0.0.0.0 #{domain}"
          file.puts "0.0.0.0 www.#{domain}" unless domain.start_with?('www.')
        end
      end
    end

    # Writes the URL-pattern file: manual high-priority patterns first, then
    # everything mined from yt-dlp, each annotated with its source.
    def generate_regex_file
      regex_file_path = 'lists/video_url_patterns.txt'

      File.open(regex_file_path, 'w') do |file|
        write_regex_header(file)

        add_manual_video_patterns(file)

        @video_regexes.each do |regex_info|
          file.puts "# Source: #{regex_info[:source]}"
          file.puts "# Pattern: #{regex_info[:pattern]}"
          file.puts "# Original: #{regex_info[:original]}"
          file.puts regex_info[:pattern]
          file.puts ""
        end
      end
    end

    def write_hosts_header(file)
      file.puts "# Video Hosting Domains - PiHole Compatible Hosts File"
      # Use UTC explicitly: the label says UTC, so the timestamp must be UTC.
      file.puts "# Generated on: #{Time.now.utc.strftime('%Y-%m-%d %H:%M:%S UTC')}"
      file.puts "# Source: yt-dlp extractors (https://github.com/yt-dlp/yt-dlp)"
      file.puts "# Purpose: Block access to video hosting websites"
      file.puts "# Format: 0.0.0.0 domain.com"
      file.puts "#"
      file.puts "# This file contains domains extracted from yt-dlp video extractors"
      file.puts "# to help identify and categorize video hosting websites."
      file.puts "#"
      file.puts ""
    end

    # Emits hand-written patterns for the most popular platforms ahead of the
    # automatically extracted ones.
    def add_manual_video_patterns(file)
      manual_patterns = [
        {
          source: 'manual_youtube',
          pattern: 'https?://(?:www\\.)?(?:youtube\\.com/watch\\?v=|youtu\\.be/)[a-zA-Z0-9_-]{11}',
          description: 'YouTube video watch URLs'
        },
        {
          source: 'manual_youtube_shorts',
          pattern: 'https?://(?:www\\.)?youtube\\.com/shorts/[a-zA-Z0-9_-]{11}',
          description: 'YouTube Shorts URLs'
        },
        {
          source: 'manual_vimeo',
          pattern: 'https?://(?:www\\.)?vimeo\\.com/\\d+',
          description: 'Vimeo video URLs'
        },
        {
          source: 'manual_dailymotion',
          pattern: 'https?://(?:www\\.)?dailymotion\\.com/video/[a-zA-Z0-9]+',
          description: 'Dailymotion video URLs'
        },
        {
          source: 'manual_twitch_videos',
          pattern: 'https?://(?:www\\.)?twitch\\.tv/videos/\\d+',
          description: 'Twitch video URLs'
        },
        {
          source: 'manual_tiktok',
          pattern: 'https?://(?:www\\.)?tiktok\\.com/@[^/]+/video/\\d+',
          description: 'TikTok video URLs'
        }
      ]

      file.puts "# ===== MANUAL HIGH-PRIORITY PATTERNS ====="
      file.puts ""

      manual_patterns.each do |pattern_info|
        file.puts "# Source: #{pattern_info[:source]}"
        file.puts "# Description: #{pattern_info[:description]}"
        file.puts "# Pattern: #{pattern_info[:pattern]}"
        file.puts pattern_info[:pattern]
        file.puts ""
      end

      file.puts "# ===== EXTRACTED PATTERNS FROM YT-DLP ====="
      file.puts ""
    end

    def write_regex_header(file)
      file.puts "# Video URL Detection Patterns"
      # Use UTC explicitly to match the "UTC" label in the output.
      file.puts "# Generated on: #{Time.now.utc.strftime('%Y-%m-%d %H:%M:%S UTC')}"
      file.puts "# Source: yt-dlp extractors (https://github.com/yt-dlp/yt-dlp) + manual patterns"
      file.puts "# Purpose: Regex patterns to detect video URLs vs other content"
      file.puts "#"
      file.puts "# These patterns help distinguish between:"
      file.puts "# - Direct video content URLs"
      file.puts "# - Homepage, playlist, user profile, community content URLs"
      file.puts "#"
      file.puts "# Usage: Use these patterns to categorize URLs from video hosting domains"
      file.puts "# to determine if they contain actual video content or other resources"
      file.puts "#"
      file.puts ""
    end
  end
end
|
368
|
+
|
369
|
+
# Kick off list generation when invoked as a script (no-op when required).
VideoListGenerator::ExtractorParser.new.generate_lists if __FILE__ == $PROGRAM_NAME
|