UrlCategorise 0.1.3 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/export_hosts ADDED
@@ -0,0 +1,68 @@
+ #!/usr/bin/env ruby
+
+ require 'bundler/setup'
+ require 'optparse'
+ require_relative '../lib/url_categorise'
+
+ options = {
+   output_path: nil,
+   cache_dir: nil,
+   verbose: false
+ }
+
+ OptionParser.new do |opts|
+   opts.banner = "Usage: #{$0} [options]"
+   opts.separator ""
+   opts.separator "Export all categorized domains as separate hosts files per category"
+   opts.separator ""
+
+   opts.on("-o", "--output PATH", "Output directory path (default: cache_dir/exports/hosts or ./exports/hosts)") do |path|
+     options[:output_path] = path
+   end
+
+   opts.on("-c", "--cache-dir PATH", "Cache directory path for client initialization") do |path|
+     options[:cache_dir] = path
+   end
+
+   opts.on("-v", "--verbose", "Verbose output") do
+     options[:verbose] = true
+   end
+
+   opts.on("-h", "--help", "Show this help message") do
+     puts opts
+     exit
+   end
+ end.parse!
+
+ puts "=== UrlCategorise Hosts Export ===" if options[:verbose]
+ puts "Initializing client..." if options[:verbose]
+
+ begin
+   client = UrlCategorise::Client.new(
+     cache_dir: options[:cache_dir]
+   )
+
+   puts "Exporting hosts files..." if options[:verbose]
+
+   result = client.export_hosts_files(options[:output_path])
+
+   # Pull the summary entry out so only per-category results remain
+   summary = result.delete(:_summary)
+
+   puts "\n✅ Export completed successfully!"
+   puts "📁 Export directory: #{summary[:export_directory]}"
+   puts "📊 Total categories exported: #{summary[:total_categories]}"
+   puts "🌐 Total domains exported: #{summary[:total_domains]}"
+   puts "📄 Summary file: #{summary[:path]}"
+
+   if options[:verbose]
+     puts "\n📋 Files created:"
+     result.each do |_category, info|
+       puts "  #{info[:filename]} - #{info[:count]} domains"
+     end
+   end
+ rescue StandardError => e
+   puts "❌ Error: #{e.message}"
+   puts e.backtrace if options[:verbose]
+   exit 1
+ end
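
The export_hosts_files API that this script wraps can also be called directly. A minimal sketch, based only on how the script above uses the client (the paths are illustrative):

    require 'url_categorise'

    client = UrlCategorise::Client.new(cache_dir: './url_cache')
    result = client.export_hosts_files('./exports/hosts')

    # The returned hash carries a :_summary entry alongside per-category results
    summary = result.delete(:_summary)
    puts "#{summary[:total_domains]} domains across #{summary[:total_categories]} categories"
    result.each { |category, info| puts "#{category}: #{info[:filename]} (#{info[:count]} domains)" }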
@@ -0,0 +1,373 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ require 'net/http'
+ require 'uri'
+ require 'json'
+ require 'fileutils'
+ require 'time'
+ require 'set'
+
+ module VideoListGenerator
+   class ExtractorParser
+     YT_DLP_API_URL = 'https://api.github.com/repos/yt-dlp/yt-dlp/contents/yt_dlp/extractor'
+
+     def initialize
+       @video_domains = Set.new
+       @video_regexes = []
+     end
+
+     def generate_lists
+       puts "🎬 Generating video hosting lists from yt-dlp extractors..."
+
+       puts "📝 Adding manually curated video hosting domains..."
+       add_manual_domains
+
+       puts "📡 Fetching yt-dlp extractor data..."
+       fetch_extractor_data
+
+       create_lists_directory
+       generate_hosts_file
+       generate_regex_file
+
+       puts "✅ Video hosting lists generated successfully!"
+       puts "📁 Files created in lists/ directory:"
+       puts "   - video_hosting_domains.hosts (#{@video_domains.size} domains)"
+       puts "   - video_url_patterns.txt (#{@video_regexes.size} patterns)"
+     end
+
+     private
+
+     def add_manual_domains
+       # Core video hosting platforms - most important ones
+       core_domains = [
+         'youtube.com', 'youtu.be', 'youtube-nocookie.com',
+         'vimeo.com', 'player.vimeo.com', 'vimeopro.com',
+         'dailymotion.com', 'dai.ly',
+         'twitch.tv', 'clips.twitch.tv',
+         'tiktok.com', 'vm.tiktok.com',
+         'rumble.com',
+         'odysee.com', 'lbry.tv',
+         'bitchute.com',
+         'peertube.tv',
+         'archive.org',
+         'vevo.com',
+         'streamable.com',
+         'wistia.com', 'fast.wistia.com', 'fast.wistia.net'
+       ]
+
+       core_domains.each { |domain| @video_domains.add(domain) }
+     end
+
+     def fetch_extractor_data
+       uri = URI(YT_DLP_API_URL)
+       response = Net::HTTP.get_response(uri)
+
+       unless response.code == '200'
+         puts "⚠️  Could not fetch from yt-dlp API, using manual domains only"
+         return
+       end
+
+       files = JSON.parse(response.body)
+
+       # Process extractor files to find domains and patterns
+       extractor_files = files.select { |file| file['name'].end_with?('.py') && file['name'] != '__init__.py' }
+
+       puts "📝 Processing #{extractor_files.length} extractor files..."
+
+       extractor_files.each_with_index do |file, index|
+         print "\r\e[K" # Clear the entire line
+         print "Processing #{index + 1}/#{extractor_files.length}: #{file['name']}"
+         process_extractor_file(file)
+       end
+
+       puts "\n🎯 Found #{@video_domains.size} unique video hosting domains"
+       puts "🔍 Extracted #{@video_regexes.size} URL pattern regexes"
+     end
+
+     def process_extractor_file(file_info)
+       response = Net::HTTP.get_response(URI(file_info['download_url']))
+       return unless response.code == '200'
+
+       content = response.body
+
+       extract_domains_from_content(content, file_info['name'])
+       extract_regexes_from_content(content, file_info['name'])
+     rescue StandardError
+       # Silently skip unreadable extractors to avoid spamming the progress output
+     end
+
+     def extract_domains_from_content(content, filename)
+       # Look for _VALID_URL patterns and extract domains
+       valid_url_patterns = content.scan(/_VALID_URL\s*=\s*r?['"]([^'"]+)['"]/m)
+
+       valid_url_patterns.each do |pattern|
+         pattern = pattern[0] if pattern.is_a?(Array)
+         domains = extract_domains_from_regex(pattern)
+         domains.each { |domain| @video_domains.add(domain) }
+       end
+
+       # Look for IE_NAME patterns
+       ie_name_patterns = content.scan(/IE_NAME\s*=\s*['"]([^'"]+)['"]/m)
+       ie_name_patterns.each do |name|
+         name = name[0] if name.is_a?(Array)
+         next unless name && name.include?('.') && valid_domain?(name)
+
+         @video_domains.add(name.downcase)
+       end
+
+       # Look for hardcoded URLs in the content
+       url_patterns = content.scan(/['"]https?:\/\/(?:www\.)?([a-zA-Z0-9-]+\.[a-zA-Z]{2,})/i)
+       url_patterns.each do |match|
+         domain = match[0] if match.is_a?(Array)
+         next unless domain && valid_domain?(domain.downcase)
+
+         @video_domains.add(domain.downcase)
+       end
+
+       # Look for class-based domain hints from filename
+       if filename.end_with?('.py')
+         base_name = filename.gsub('.py', '').downcase
+         # Try to infer domain from extractor name
+         if base_name.include?('.')
+           potential_domain = base_name
+         elsif base_name.length > 3 && !%w[common generic base].include?(base_name)
+           # Common patterns: youtube -> youtube.com, vimeo -> vimeo.com
+           potential_domain = "#{base_name}.com"
+         end
+
+         if potential_domain && valid_domain?(potential_domain)
+           @video_domains.add(potential_domain)
+         end
+       end
+     end
+
+     def extract_regexes_from_content(content, filename)
+       # Extract _VALID_URL patterns for video detection
+       valid_url_patterns = content.scan(/_VALID_URL\s*=\s*r?['"]([^'"]+)['"]/m)
+
+       valid_url_patterns.each do |pattern|
+         pattern = pattern[0] if pattern.is_a?(Array)
+
+         cleaned_pattern = clean_regex_pattern(pattern)
+         if cleaned_pattern && !cleaned_pattern.empty? && is_useful_pattern?(cleaned_pattern)
+           @video_regexes << {
+             source: filename.gsub('.py', ''),
+             pattern: cleaned_pattern,
+             original: pattern
+           }
+         end
+       end
+     end
+
+     def extract_domains_from_regex(pattern)
+       domains = Set.new
+
+       # Extract domains from common regex patterns - more comprehensive
+       patterns_to_try = [
+         # Standard URLs
+         /https?:\/\/(?:www\.)?([a-zA-Z0-9-]+\.[a-zA-Z]{2,})/i,
+         # Escaped URLs with backslashes
+         /https?:\\\/\\\/(?:www\\.)?([a-zA-Z0-9-]+\\.[a-zA-Z]{2,})/i,
+         # Domain patterns with optional www (escaped)
+         /\(\?\:www\\\.\)\?([a-zA-Z0-9-]+\.[a-zA-Z]{2,})/i,
+         # Simple domain references
+         /([a-zA-Z0-9-]+\.[a-zA-Z]{2,})/i
+       ]
+
+       patterns_to_try.each do |regex|
+         matches = pattern.scan(regex)
+         matches.each do |match|
+           domain = match.is_a?(Array) ? match[0] : match
+           domain = clean_domain(domain) if domain
+           if domain && valid_domain?(domain.downcase)
+             domains.add(domain.downcase)
+           end
+         end
+       end
+
+       domains.to_a
+     end
+
+     def clean_domain(domain)
+       return nil unless domain
+
+       # Remove regex escapes and clean up
+       cleaned = domain.gsub(/\\\./, '.').gsub(/[\(\)\?\:\|\\]/, '')
+       cleaned = cleaned.strip
+
+       # Remove trailing regex syntax
+       cleaned = cleaned.split(/[\(\)\[\]]/)[0] if cleaned.include?('(') || cleaned.include?('[')
+
+       cleaned
+     end
+
+     def clean_regex_pattern(pattern)
+       return nil unless pattern
+
+       # Simplify common patterns for practical use
+       cleaned = pattern.dup
+
+       # Remove complex lookaheads/lookbehinds
+       cleaned = cleaned.gsub(/\(\?\![^)]*\)/, '')
+       cleaned = cleaned.gsub(/\(\?\<\![^)]*\)/, '')
+
+       # Simplify common patterns
+       cleaned = cleaned.gsub(/\(\?\:www\\\.\)\?/, '(?:www\.)?')
+       cleaned = cleaned.gsub(/\\\./, '\.')
+
+       # Validate the regex, suppressing Ruby's warnings while compiling it
+       begin
+         original_warning = $-w
+         $-w = nil # Disable warnings
+         Regexp.new(cleaned)
+         cleaned
+       rescue RegexpError
+         nil
+       ensure
+         $-w = original_warning # Restore warnings
+       end
+     end
+
+     def is_useful_pattern?(pattern)
+       # Filter out overly generic or useless patterns
+       return false if pattern == '.*'
+       return false if pattern.length < 10
+       return false if pattern.include?('blob:')
+       return false if pattern.match?(/\$\s*$/) # ends with just $
+
+       true
+     end
+
+     def valid_domain?(domain)
+       return false unless domain
+       return false if domain.length < 4
+       return false unless domain.match?(/^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/)
+       return false if domain.include?('example.') || domain.include?('test.')
+       return false if domain.include?('localhost')
+       return false if domain.end_with?('.html') || domain.end_with?('.htm')
+
+       true
+     end
+
+     def create_lists_directory
+       FileUtils.mkdir_p('lists')
+     end
+
+     def generate_hosts_file
+       hosts_file_path = 'lists/video_hosting_domains.hosts'
+
+       File.open(hosts_file_path, 'w') do |file|
+         write_hosts_header(file)
+
+         @video_domains.sort.each do |domain|
+           file.puts "0.0.0.0 #{domain}"
+           file.puts "0.0.0.0 www.#{domain}" unless domain.start_with?('www.')
+         end
+       end
+     end
+
+     def generate_regex_file
+       regex_file_path = 'lists/video_url_patterns.txt'
+
+       File.open(regex_file_path, 'w') do |file|
+         write_regex_header(file)
+
+         # Add manual high-priority patterns first
+         add_manual_video_patterns(file)
+
+         @video_regexes.each do |regex_info|
+           file.puts "# Source: #{regex_info[:source]}"
+           file.puts "# Pattern: #{regex_info[:pattern]}"
+           file.puts "# Original: #{regex_info[:original]}"
+           file.puts regex_info[:pattern]
+           file.puts ""
+         end
+       end
+     end
+
+     def write_hosts_header(file)
+       file.puts "# Video Hosting Domains - PiHole Compatible Hosts File"
+       file.puts "# Generated on: #{Time.now.utc.strftime('%Y-%m-%d %H:%M:%S UTC')}"
+       file.puts "# Source: yt-dlp extractors (https://github.com/yt-dlp/yt-dlp)"
+       file.puts "# Purpose: Block access to video hosting websites"
+       file.puts "# Format: 0.0.0.0 domain.com"
+       file.puts "#"
+       file.puts "# This file contains domains extracted from yt-dlp video extractors"
+       file.puts "# to help identify and categorize video hosting websites."
+       file.puts "#"
+       file.puts ""
+     end
+
+     def add_manual_video_patterns(file)
+       # High-priority manual patterns for popular platforms
+       manual_patterns = [
+         {
+           source: 'manual_youtube',
+           pattern: 'https?://(?:www\\.)?(?:youtube\\.com/watch\\?v=|youtu\\.be/)[a-zA-Z0-9_-]{11}',
+           description: 'YouTube video watch URLs'
+         },
+         {
+           source: 'manual_youtube_shorts',
+           pattern: 'https?://(?:www\\.)?youtube\\.com/shorts/[a-zA-Z0-9_-]{11}',
+           description: 'YouTube Shorts URLs'
+         },
+         {
+           source: 'manual_vimeo',
+           pattern: 'https?://(?:www\\.)?vimeo\\.com/\\d+',
+           description: 'Vimeo video URLs'
+         },
+         {
+           source: 'manual_dailymotion',
+           pattern: 'https?://(?:www\\.)?dailymotion\\.com/video/[a-zA-Z0-9]+',
+           description: 'Dailymotion video URLs'
+         },
+         {
+           source: 'manual_twitch_videos',
+           pattern: 'https?://(?:www\\.)?twitch\\.tv/videos/\\d+',
+           description: 'Twitch video URLs'
+         },
+         {
+           source: 'manual_tiktok',
+           pattern: 'https?://(?:www\\.)?tiktok\\.com/@[^/]+/video/\\d+',
+           description: 'TikTok video URLs'
+         }
+       ]
+
+       file.puts "# ===== MANUAL HIGH-PRIORITY PATTERNS ====="
+       file.puts ""
+
+       manual_patterns.each do |pattern_info|
+         file.puts "# Source: #{pattern_info[:source]}"
+         file.puts "# Description: #{pattern_info[:description]}"
+         file.puts "# Pattern: #{pattern_info[:pattern]}"
+         file.puts pattern_info[:pattern]
+         file.puts ""
+       end
+
+       file.puts "# ===== EXTRACTED PATTERNS FROM YT-DLP ====="
+       file.puts ""
+     end
+
+     def write_regex_header(file)
+       file.puts "# Video URL Detection Patterns"
+       file.puts "# Generated on: #{Time.now.utc.strftime('%Y-%m-%d %H:%M:%S UTC')}"
+       file.puts "# Source: yt-dlp extractors (https://github.com/yt-dlp/yt-dlp) + manual patterns"
+       file.puts "# Purpose: Regex patterns to detect video URLs vs other content"
+       file.puts "#"
+       file.puts "# These patterns help distinguish between:"
+       file.puts "# - Direct video content URLs"
+       file.puts "# - Homepage, playlist, user profile, community content URLs"
+       file.puts "#"
+       file.puts "# Usage: Use these patterns to categorize URLs from video hosting domains"
+       file.puts "# to determine if they contain actual video content or other resources"
+       file.puts "#"
+       file.puts ""
+     end
+   end
+ end
+
+ # Run the generator if this file is executed directly
+ if __FILE__ == $0
+   generator = VideoListGenerator::ExtractorParser.new
+   generator.generate_lists
+ end
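
The generated lists/video_url_patterns.txt interleaves comment lines (prefixed with #) with one regex per non-blank line, so consuming it is a matter of filtering and compiling. A minimal sketch under that assumption (the sample URL is illustrative; filter_map requires Ruby 2.7+):

    patterns = File.readlines('lists/video_url_patterns.txt', chomp: true)
                   .reject { |line| line.empty? || line.start_with?('#') }
                   .filter_map { |line| Regexp.new(line) rescue nil }

    url = 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'
    puts patterns.any? { |re| re.match?(url) }  # true for direct video URLs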
data/bin/rake ADDED
@@ -0,0 +1,2 @@
+ #!/bin/bash
+ rake
@@ -0,0 +1,64 @@
+ #!/usr/bin/env ruby
+
+ require 'bundler/setup'
+ require './lib/url_categorise'
+
+ puts "=== Large Dataset Loading Example ==="
+
+ # Configuration for handling large datasets (300+ MB)
+ # First test with cache-only mode
+ puts "Creating client with cached datasets only..."
+ client = UrlCategorise::Client.new(
+   cache_dir: './url_cache',
+   auto_load_datasets: true,
+   smart_categorization: true,
+   dataset_config: {
+     cache_path: './url_cache/datasets',
+     download_path: './url_cache/downloads',
+     kaggle: { credentials_file: '~/kaggle.json' }
+   }
+ )
+
+ puts "Client created successfully!"
+ puts ""
+ puts "Dataset Statistics:"
+ puts "  Total categories: #{client.count_of_categories}"
+ puts "  Dataset categories: #{client.count_of_dataset_categories}"
+ puts "  Blocklist categories: #{client.count_of_categories - client.count_of_dataset_categories}"
+ puts ""
+ puts "  Total hosts: #{client.count_of_hosts.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse}"
+ puts "  Dataset hosts: #{client.count_of_dataset_hosts.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse}"
+ puts ""
+ puts "  Total data size: #{client.size_of_data.round(1)} MB"
+ puts "  Dataset data size: #{client.size_of_dataset_data.round(1)} MB"
+ puts "  Blocklist data size: #{client.size_of_blocklist_data.round(1)} MB"
+
+ puts ""
+ puts "Dataset-specific Statistics:"
+ # Get dataset metadata if available
+ metadata = client.dataset_metadata
+ if metadata && !metadata.empty?
+   puts "  Datasets loaded: #{metadata.size}"
+
+   # Estimate each dataset's share of the loaded data from its entry count
+   total_entries = metadata.values.sum { |d| d[:total_entries] }
+   total_dataset_size = 0
+
+   metadata.each_with_index do |(data_hash, data), index|
+     # Estimate size contribution of this dataset
+     dataset_portion = data[:total_entries].to_f / total_entries
+     dataset_size_mb = (client.size_of_dataset_data * dataset_portion).round(2)
+     total_dataset_size += dataset_size_mb
+
+     puts "  Dataset #{index + 1}:"
+     puts "    Processed at: #{data[:processed_at]}"
+     puts "    Total entries: #{data[:total_entries].to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse}"
+     puts "    Estimated size: #{dataset_size_mb} MB"
+     puts "    Data hash: #{data_hash[0..12]}..."
+   end
+
+   puts ""
+   puts "  Total dataset size: #{total_dataset_size.round(2)} MB (#{client.size_of_dataset_data.round(1)} MB actual)"
+ else
+   puts "  No dataset metadata available"
+ end
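
The thousands-separator idiom repeated above (reverse, group into threes, reverse) is worth a named helper in real use. A hypothetical extraction, not part of the gem:

    def with_commas(number)
      number.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\1,').reverse
    end

    with_commas(1_234_567)  # => "1,234,567"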