UrlCategorise 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +2 -1
- data/.gitignore +1 -0
- data/CLAUDE.md +71 -8
- data/Gemfile.lock +5 -1
- data/README.md +129 -11
- data/bin/export_csv +44 -7
- data/bin/generate_video_lists +373 -0
- data/docs/video-url-detection.md +353 -0
- data/lib/url_categorise/client.rb +320 -58
- data/lib/url_categorise/constants.rb +9 -6
- data/lib/url_categorise/dataset_processor.rb +18 -6
- data/lib/url_categorise/iab_compliance.rb +2 -0
- data/lib/url_categorise/version.rb +1 -1
- data/lists/video_hosting_domains.hosts +7057 -0
- data/lists/video_url_patterns.txt +297 -0
- data/url_categorise.gemspec +1 -0
- metadata +19 -1
data/lib/url_categorise/client.rb:

```diff
@@ -1,4 +1,5 @@
 require 'set'
+require 'digest'
 
 module UrlCategorise
   class Client < ApiPattern::Client
```
```diff
@@ -23,8 +24,10 @@ module UrlCategorise
     attribute :auto_load_datasets, type: Boolean, default: false
     attribute :smart_categorization_enabled, type: Boolean, default: false
     attribute :smart_rules, default: -> { {} }
+    attribute :regex_categorization_enabled, type: Boolean, default: false
+    attribute :regex_patterns_file, default: -> { VIDEO_URL_PATTERNS_FILE }
 
-    attr_reader :hosts, :metadata, :dataset_processor, :dataset_categories
+    attr_reader :hosts, :metadata, :dataset_processor, :dataset_categories, :regex_patterns
 
     def initialize(**kwargs)
       # Extract dataset_config for later use
```
```diff
@@ -41,15 +44,21 @@ module UrlCategorise
       self.auto_load_datasets = kwargs.key?(:auto_load_datasets) ? kwargs[:auto_load_datasets] : false
       self.smart_categorization_enabled = kwargs.key?(:smart_categorization) ? kwargs[:smart_categorization] : false
       self.smart_rules = initialize_smart_rules(kwargs.key?(:smart_rules) ? kwargs[:smart_rules] : {})
+      self.regex_categorization_enabled = kwargs.key?(:regex_categorization) ? kwargs[:regex_categorization] : false
+      self.regex_patterns_file = kwargs.key?(:regex_patterns_file) ? kwargs[:regex_patterns_file] : VIDEO_URL_PATTERNS_FILE
 
       @metadata = {}
       @dataset_categories = Set.new # Track which categories come from datasets
+      @regex_patterns = {}
 
       # Initialize dataset processor if config provided
       @dataset_processor = initialize_dataset_processor(dataset_config) unless dataset_config.empty?
 
       @hosts = fetch_and_build_host_lists
 
+      # Load regex patterns if enabled
+      load_regex_patterns if regex_categorization_enabled
+
       # Auto-load datasets from constants if enabled
       load_datasets_from_constants if auto_load_datasets && @dataset_processor
     end
```
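The two new options are opt-in and mirror the existing `smart_categorization` kwargs. A minimal sketch of enabling them, based on the kwarg names handled in the initializer above (`regex_categorization` and `regex_patterns_file`); the pattern file defaults to the hosted `VIDEO_URL_PATTERNS_FILE` list:

```ruby
require 'url_categorise'

# Sketch based on the kwargs handled in the initializer above.
client = UrlCategorise::Client.new(
  regex_categorization: true,                          # also triggers load_regex_patterns
  regex_patterns_file: 'lists/video_url_patterns.txt'  # optional; defaults to VIDEO_URL_PATTERNS_FILE
)

client.regex_patterns.keys # => pattern categories parsed from the file
```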
```diff
@@ -67,6 +76,9 @@ module UrlCategorise
       # Apply smart categorization if enabled
       categories = apply_smart_categorization(url, categories) if smart_categorization_enabled
 
+      # Apply regex categorization if enabled
+      categories = apply_regex_categorization(url, categories) if regex_categorization_enabled
+
       if iab_compliance_enabled
         IabCompliance.get_iab_categories(categories, iab_version)
       else
```
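When regex categorization is enabled, `categorise` can append a more specific `<category>_content` symbol for URLs that both resolve to a video-hosting domain and match a content pattern (see `apply_regex_categorization` further down). An illustrative call; the actual output depends on which lists and patterns are loaded:

```ruby
client.categorise('https://www.youtube.com/watch?v=dQw4w9WgXcQ')
# => e.g. [:youtube, :video_hosting, :youtube_content]
```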
```diff
@@ -103,6 +115,29 @@ module UrlCategorise
       categories.uniq
     end
 
+    def video_url?(url)
+      return false unless url && !url.empty?
+      return false unless regex_categorization_enabled && @regex_patterns.any?
+
+      # First check if it's from a video hosting domain
+      categories = categorise(url)
+      video_hosting_categories = categories & [:video, :video_hosting, :youtube, :vimeo, :tiktok, :dailymotion, :twitch]
+
+      return false unless video_hosting_categories.any?
+
+      # Then check if it matches video content patterns
+      @regex_patterns.each do |_category, patterns|
+        patterns.each do |pattern_info|
+          return true if url.match?(pattern_info[:pattern])
+        end
+      end
+
+      false
+    rescue StandardError
+      # Handle any regex or URL parsing errors gracefully
+      false
+    end
+
     def count_of_hosts
       @hosts.keys.map do |category|
         @hosts[category].size
```
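`video_url?` therefore requires both signals: the domain must categorise as a video host, and the full URL must match a content pattern. A sketch of the expected behaviour, assuming the default lists and patterns are loaded:

```ruby
client.video_url?('https://www.youtube.com/watch?v=dQw4w9WgXcQ') # => true  (video host + watch pattern)
client.video_url?('https://www.youtube.com/about')               # => false (video host, no content pattern)
client.video_url?('https://example.com/watch?v=123')             # => false (not a video-hosting domain)
```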
```diff
@@ -431,58 +466,259 @@ module UrlCategorise
 
       FileUtils.mkdir_p(export_dir) unless Dir.exist?(export_dir)
 
-
+      # Create single comprehensive CSV with ALL data
+      timestamp = Time.now.strftime('%Y%m%d_%H%M%S')
+      filename = "url_categorise_comprehensive_export_#{timestamp}.csv"
       file_path = File.join(export_dir, filename)
 
+      # Collect all available data
+      all_data = collect_all_export_data
+
+      # Create CSV with dynamic headers
+      headers = determine_comprehensive_headers(all_data)
+
       CSV.open(file_path, 'w', headers: true) do |csv|
-
-        csv << [
-          'domain',
-          'category',
-          'source_type',
-          'is_dataset_category',
-          'iab_category_v2',
-          'iab_category_v3',
-          'export_timestamp',
-          'smart_categorization_enabled'
-        ]
+        csv << headers
 
-
-
-
-
-
-
-
-
-
-
-
-
+        all_data.each do |entry|
+          row = headers.map { |header| entry[header] || entry[header.to_sym] || '' }
+          csv << row
+        end
+      end
+
+      # Create summary file
+      summary_filename = "export_summary_#{timestamp}.json"
+      summary_file_path = File.join(export_dir, summary_filename)
+
+      summary = create_comprehensive_export_summary(file_path, all_data, export_dir)
+      File.write(summary_file_path, JSON.pretty_generate(summary))
+
+      {
+        csv_file: file_path,
+        summary_file: summary_file_path,
+        summary: summary[:data_summary],
+        export_directory: export_dir,
+        total_entries: all_data.length
+      }
+    end
+
+    private
+
+    def load_regex_patterns
+      return unless regex_patterns_file
+
+      @regex_patterns = {}
+      current_category = nil
+
+      content = fetch_regex_patterns_content
+      return unless content
+
+      content.split("\n").each do |line|
+        line = line.strip
+        next if line.empty?
+
+        # Check if this line is a source comment
+        if line.match(/^# Source: (.+)$/)
+          current_category = $1.downcase
+          @regex_patterns[current_category] = [] unless @regex_patterns[current_category]
+        elsif current_category && !line.start_with?('#') && !line.empty?
+          # This is a regex pattern
+          begin
+            regex = Regexp.new(line)
+            @regex_patterns[current_category] << {
+              pattern: regex,
+              raw: line
+            }
+          rescue RegexpError => e
+            puts "Warning: Invalid regex pattern '#{line}': #{e.message}"
+          end
+        end
+      end
+
+      puts "Loaded #{@regex_patterns.values.flatten.size} regex patterns from #{@regex_patterns.keys.size} categories" if @regex_patterns.any?
+    end
+
+    def fetch_regex_patterns_content
+      if regex_patterns_file.start_with?('http://', 'https://')
+        # Remote URL
+        begin
+          response = HTTParty.get(regex_patterns_file, timeout: request_timeout)
+          return response.body if response.code == 200
+        rescue HTTParty::Error, Net::HTTPError, SocketError, Timeout::Error, URI::InvalidURIError, StandardError => e
+          puts "Warning: Failed to fetch regex patterns from #{regex_patterns_file}: #{e.message}"
+          return nil
+        end
+      elsif regex_patterns_file.start_with?('file://')
+        # Local file URL
+        file_path = regex_patterns_file.sub('file://', '')
+        return File.read(file_path) if File.exist?(file_path)
+      elsif File.exist?(regex_patterns_file)
+        # Direct file path
+        return File.read(regex_patterns_file)
+      end
+
+      puts "Warning: Regex patterns file not found: #{regex_patterns_file}"
+      nil
+    end
+
+    def apply_regex_categorization(url, existing_categories)
+      return existing_categories unless @regex_patterns.any?
+
+      # If we have existing categories that match domains, check if the URL matches video patterns
+      video_categories = existing_categories & [:video, :video_hosting, :youtube, :vimeo, :tiktok]
+
+      if video_categories.any?
+        # Check if this URL matches any video patterns
+        @regex_patterns.each do |category, patterns|
+          patterns.each do |pattern_info|
+            if url.match?(pattern_info[:pattern])
+              # This is a video content URL, add a more specific categorization
+              existing_categories << "#{video_categories.first}_content".to_sym unless existing_categories.include?("#{video_categories.first}_content".to_sym)
+              break
             end
-
-          csv << [
-            domain,
-            category,
-            source_type,
-            is_dataset_category,
-            iab_v2,
-            iab_v3,
-            Time.now.iso8601,
-            smart_categorization_enabled
-          ]
           end
         end
       end
+
+      existing_categories.uniq
+    end
+
+    def collect_all_export_data
+      all_data = []
+
+      # 1. Add all processed domain/category mappings
+      @hosts.each do |category, domains|
+        domains.each do |domain|
+          source_type = @dataset_categories.include?(category) ? 'dataset' : 'blocklist'
+          is_dataset_category = @dataset_categories.include?(category)
+
+          # Get IAB mappings if compliance is enabled
+          iab_v2 = nil
+          iab_v3 = nil
+          if iab_compliance_enabled
+            iab_v2 = IabCompliance.map_category_to_iab(category, :v2)
+            iab_v3 = IabCompliance.map_category_to_iab(category, :v3)
+          end
+
+          entry = {
+            'data_type' => 'domain_categorization',
+            'domain' => domain,
+            'url' => domain, # For compatibility
+            'category' => category.to_s,
+            'source_type' => source_type,
+            'is_dataset_category' => is_dataset_category,
+            'iab_category_v2' => iab_v2,
+            'iab_category_v3' => iab_v3,
+            'export_timestamp' => Time.now.iso8601,
+            'smart_categorization_enabled' => smart_categorization_enabled
+          }
+
+          all_data << entry
+        end
+      end
+
+      # 2. Add raw dataset content from cache
+      collect_cached_dataset_content.each do |entry|
+        entry['data_type'] = 'raw_dataset_content'
+        all_data << entry
+      end
+
+      # 3. Try to collect currently loaded dataset data if available
+      collect_current_dataset_content.each do |entry|
+        entry['data_type'] = 'current_dataset_content'
+        all_data << entry
+      end
+
+      all_data
+    end
+
+    def collect_cached_dataset_content
+      cached_data = []
+      return cached_data unless @dataset_processor
+
+      # Collect from cached datasets if available
+      (@dataset_metadata || {}).each do |data_hash, metadata|
+        cache_key = @dataset_processor.send(:generate_cache_key, metadata[:source_identifier] || data_hash, metadata[:source_type]&.to_sym || :unknown)
+        cached_result = @dataset_processor.send(:load_from_cache, cache_key)
+
+        if cached_result && cached_result.is_a?(Hash) && cached_result['raw_content']
+          cached_result['raw_content'].each do |entry|
+            enhanced_entry = entry.dup
+            enhanced_entry['dataset_source'] = metadata[:source_identifier] || 'unknown'
+            enhanced_entry['dataset_type'] = metadata[:source_type] || 'unknown'
+            enhanced_entry['processed_at'] = metadata[:processed_at]
+            cached_data << enhanced_entry
+          end
+        elsif cached_result.is_a?(Array)
+          # Legacy format - array of entries
+          cached_result.each do |entry|
+            next unless entry.is_a?(Hash)
+            enhanced_entry = entry.dup
+            enhanced_entry['dataset_source'] = metadata[:source_identifier] || 'unknown'
+            enhanced_entry['dataset_type'] = metadata[:source_type] || 'unknown'
+            enhanced_entry['processed_at'] = metadata[:processed_at]
+            cached_data << enhanced_entry
+          end
+        end
+      end
+
+      cached_data
+    end
+
+    def collect_current_dataset_content
+      # This is a placeholder - in practice, the original dataset content
+      # is processed and only domain mappings are kept in @hosts.
+      # The raw content should come from cache, but if we want to be more
+      # aggressive, we could re-process datasets here or store them differently.
+      []
+    end
+
+    def determine_comprehensive_headers(all_data)
+      # Collect all unique keys from all entries
+      all_keys = Set.new
+      all_data.each do |entry|
+        all_keys.merge(entry.keys.map(&:to_s))
+      end
+      all_keys_array = all_keys.to_a
+
+      # Core headers that should appear first
+      core_headers = %w[data_type domain url category]
+
+      # Standard categorization headers
+      categorization_headers = %w[source_type is_dataset_category iab_category_v2 iab_category_v3]
+
+      # Dataset content headers
+      content_headers = %w[title description text content summary body]
 
-      #
-
-
+      # Metadata headers
+      metadata_headers = %w[dataset_source dataset_type processed_at export_timestamp smart_categorization_enabled]
+
+      # Build final header order
+      ordered_headers = []
+      ordered_headers += (core_headers & all_keys_array)
+      ordered_headers += (categorization_headers & all_keys_array)
+      ordered_headers += (content_headers & all_keys_array)
+
+      # Add any remaining headers (alphabetically sorted)
+      remaining_headers = (all_keys_array - ordered_headers - metadata_headers).sort
+      ordered_headers += remaining_headers
+
+      # Add metadata headers at the end
+      ordered_headers += (metadata_headers & all_keys_array)
+
+      ordered_headers
+    end
+
+    def create_comprehensive_export_summary(file_path, all_data, export_dir)
+      domain_entries = all_data.select { |entry| entry['data_type'] == 'domain_categorization' }
+      dataset_entries = all_data.select { |entry| entry['data_type']&.include?('dataset') }
+
+      {
         export_info: {
           timestamp: Time.now.iso8601,
-
-
-
+          export_directory: export_dir,
+          csv_file: file_path,
+          total_entries: all_data.length
         },
         client_settings: {
           iab_compliance_enabled: iab_compliance_enabled,
```
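`load_regex_patterns` groups patterns under `# Source:` comment headers (downcased into category keys) and warns rather than raising on lines that fail to compile, so a partially bad file still loads. A hypothetical excerpt of a compatible patterns file; the shipped list is `lists/video_url_patterns.txt`:

```text
# Source: YouTube
https?://(www\.)?youtube\.com/watch\?v=[\w-]+
https?://youtu\.be/[\w-]+

# Source: Vimeo
https?://(www\.)?vimeo\.com/\d+
```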
```diff
@@ -491,25 +727,21 @@ module UrlCategorise
           auto_load_datasets: auto_load_datasets
         },
         data_summary: {
+          total_entries: all_data.length,
+          domain_categorization_entries: domain_entries.length,
+          dataset_content_entries: dataset_entries.length,
           total_domains: @hosts.values.map(&:length).sum,
           total_categories: @hosts.keys.length,
           dataset_categories_count: @dataset_categories.size,
           blocklist_categories_count: @hosts.keys.length - @dataset_categories.size,
-          categories: @hosts.keys.sort.map(&:to_s)
+          categories: @hosts.keys.sort.map(&:to_s),
+          has_dataset_content: dataset_entries.any?
         },
         dataset_metadata: dataset_metadata
       }
-
-      File.write(metadata_path, JSON.pretty_generate(metadata))
-
-      {
-        csv_file: file_path,
-        metadata_file: metadata_path,
-        summary: metadata[:data_summary],
-        export_directory: export_dir
-      }
     end
 
+
     private
 
     def initialize_dataset_processor(config)
```
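The export no longer writes a fixed eight-column CSV plus a `metadata.json`; it writes one comprehensive CSV with dynamic headers and a timestamped JSON summary, and returns a hash describing both. The wrapping method's name is not visible in these hunks, so the sketch below only shows the return shape built above:

```ruby
result = comprehensive_export_result # stand-in for the value returned by the export method above

result[:csv_file]       # => "<export_dir>/url_categorise_comprehensive_export_<timestamp>.csv"
result[:summary_file]   # => "<export_dir>/export_summary_<timestamp>.json"
result[:summary]        # => the :data_summary section of the JSON summary
result[:total_entries]  # => rows written, across all data types
```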
```diff
@@ -541,14 +773,23 @@ module UrlCategorise
       return dataset unless @dataset_processor
       return nil unless dataset # Handle nil datasets gracefully
 
-
+      processed_result = @dataset_processor.integrate_dataset_into_categorization(dataset, category_mappings)
 
-      #
-
-
+      # Handle new data structure with categories and raw_content
+      if processed_result.is_a?(Hash) && processed_result['categories']
+        categorized_data = processed_result['categories']
+        metadata = processed_result['_metadata']
+      else
+        # Legacy format - assume the whole result is categorized data
+        categorized_data = processed_result
+        metadata = categorized_data[:_metadata] if categorized_data.respond_to?(:delete)
+      end
 
-      #
-
+      # Store metadata
+      if metadata
+        @dataset_metadata ||= {}
+        @dataset_metadata[metadata[:data_hash]] = metadata
+      end
 
       # Merge with existing host data
       categorized_data.each do |category, domains|
```
```diff
@@ -776,6 +1017,24 @@ module UrlCategorise
     end
 
     def download_and_parse_list(url)
+      if url.start_with?('file://')
+        # Handle local file URLs
+        file_path = url.sub('file://', '')
+        return [] unless File.exist?(file_path)
+
+        content = File.read(file_path)
+        return [] if content.nil? || content.empty?
+
+        # Store metadata
+        @metadata[url] = {
+          last_updated: Time.now,
+          content_hash: Digest::SHA256.hexdigest(content),
+          status: 'success'
+        }
+
+        return parse_list_content(content, detect_list_format(content))
+      end
+
       raw_data = HTTParty.get(url, timeout: request_timeout)
       return [] if raw_data.body.nil? || raw_data.body.empty?
 
```
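With this branch, list sources can be local `file://` URLs as well as HTTP(S), and `url_valid?` below is relaxed to match. A sketch using the gem's `host_urls:` option (the path is hypothetical):

```ruby
client = UrlCategorise::Client.new(
  host_urls: { custom_blocklist: ['file:///etc/blocklists/custom.hosts'] }
)

client.categorise('https://blocked.example.com') # => [:custom_blocklist] if the domain is listed
```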
```diff
@@ -922,6 +1181,9 @@ module UrlCategorise
     end
 
     def url_valid?(url)
+      return false if url.nil? || url.empty?
+      return true if url.start_with?('file://')
+
       uri = URI.parse(url)
       uri.is_a?(URI::HTTP) && !uri.host.nil?
     rescue URI::InvalidURIError
```
data/lib/url_categorise/constants.rb:

```diff
@@ -2,6 +2,9 @@ module UrlCategorise
   module Constants
     ONE_MEGABYTE = 1_048_576
 
+    # Video URL patterns for detecting video content
+    VIDEO_URL_PATTERNS_FILE = 'https://raw.githubusercontent.com/TRex22/url_categorise/refs/heads/main/lists/video_url_patterns.txt'.freeze
+
     # crawler data
     # https://commoncrawl.org/
 
```
```diff
@@ -16,12 +19,13 @@ module UrlCategorise
     DEFAULT_HOST_URLS = {
       abuse: ['https://github.com/blocklistproject/Lists/raw/master/abuse.txt'],
       adobe: ['https://github.com/blocklistproject/Lists/raw/master/adobe.txt'],
+      adult: %i[pornography dating_services drugs gambling],
       advertising: ['https://blocklistproject.github.io/Lists/ads.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-advert_01.txt'],
       amazon: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/amazon/all'],
       amp_hosts: ['https://www.github.developerdan.com/hosts/lists/amp-hosts-extended.txt'],
       apple: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/apple/all'],
       cloudflare: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/cloudflare/all'],
-      crypto: ['https://github.com/blocklistproject/Lists/raw/master/crypto.txt'],
+      crypto: ['https://github.com/blocklistproject/Lists/raw/master/crypto.txt', 'https://v.firebog.net/hosts/Prigent-Crypto.txt'],
       dating_services: ['https://www.github.developerdan.com/hosts/lists/dating-services-extended.txt'],
       drugs: ['https://github.com/blocklistproject/Lists/raw/master/drugs.txt'],
       facebook: ['https://github.com/blocklistproject/Lists/raw/master/facebook.txt',
```
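Note that the new `adult:` entry holds symbols rather than URLs: it is a combined category that aggregates the domains of the categories it references. Illustrative behaviour, assuming the client resolves symbol entries by merging the referenced categories' hosts and that those lists are loaded:

```ruby
# A domain on the pornography blocklist should also surface under :adult.
client.categorise('https://some-adult-site.example')
# => e.g. [:pornography, :adult]
```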
```diff
@@ -39,10 +43,10 @@ module UrlCategorise
       microsoft: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/microsoft/all'],
       mozilla: ['https://github.com/jmdugan/blocklists/raw/master/corporations/mozilla/all'],
       nsa: ['https://raw.githubusercontent.com/tigthor/NSA-CIA-Blocklist/main/HOSTS/HOSTS'],
-      phishing: ['https://blocklistproject.github.io/Lists/phishing.txt'],
+      phishing: ['https://blocklistproject.github.io/Lists/phishing.txt', 'https://openphish.com/feed.txt'],
       pinterest: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/pinterest/all'],
       piracy: ['https://github.com/blocklistproject/Lists/raw/master/piracy.txt', 'https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/anti.piracy.txt'],
-      pornography: ['https://blocklistproject.github.io/Lists/porn.txt'],
+      pornography: ['https://blocklistproject.github.io/Lists/porn.txt', 'https://v.firebog.net/hosts/Prigent-Adult.txt'],
       reddit: ['https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-reddit.txt'],
       redirect: ['https://github.com/blocklistproject/Lists/raw/master/redirect.txt'],
       scam: ['https://blocklistproject.github.io/Lists/scam.txt'],
```
```diff
@@ -53,6 +57,8 @@ module UrlCategorise
       tracking: ['https://blocklistproject.github.io/Lists/tracking.txt'],
       twitter: ['https://github.com/blocklistproject/Lists/raw/master/twitter.txt', 'https://github.com/jmdugan/blocklists/raw/master/corporations/twitter/all'],
       vaping: ['https://github.com/blocklistproject/Lists/raw/master/vaping.txt'],
+      video: ['https://raw.githubusercontent.com/wilwade/pihole-block-video/master/hosts.txt'],
+      video_hosting: ['https://raw.githubusercontent.com/TRex22/url_categorise/refs/heads/main/lists/video_hosting_domains.hosts'],
       whatsapp: ['https://github.com/blocklistproject/Lists/raw/master/whatsapp.txt', 'https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/whatsapp'],
       youtube: ['https://github.com/blocklistproject/Lists/raw/master/youtube.txt', 'https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/youtube'],
 
```
```diff
@@ -82,9 +88,6 @@ module UrlCategorise
 
       # Extended categories for better organization
       cryptojacking: ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt'],
-      # ransomware: ["https://ransomwaretracker.abuse.ch/downloads/RW_DOMBL.txt"],
-      # botnet_command_control: ["https://osint.bambenekconsulting.com/feeds/c2-dommasterlist.txt"], # URL returns 403 Forbidden
-      phishing_extended: ['https://openphish.com/feed.txt'],
 
       # Regional and specialized lists
       chinese_ad_hosts: ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts'],
```
data/lib/url_categorise/dataset_processor.rb:

```diff
@@ -124,28 +124,40 @@ module UrlCategorise
 
     def integrate_dataset_into_categorization(dataset, category_mappings = {})
       categorized_data = {}
+      raw_content = []
 
       case dataset
       when Hash
         # Single dataset with multiple files
         dataset.each do |file_name, data|
           process_dataset_file(data, file_name, category_mappings, categorized_data)
+          # Collect raw content
+          if data.is_a?(Array)
+            raw_content.concat(data.map { |row| row.is_a?(Hash) ? row : {} })
+          end
         end
       when Array
         # Single file dataset
         process_dataset_file(dataset, 'default', category_mappings, categorized_data)
+        # Collect raw content
+        raw_content.concat(dataset.map { |row| row.is_a?(Hash) ? row : {} })
       else
         raise Error, "Unsupported dataset format: #{dataset.class}"
       end
 
-      #
-
-
-
-
+      # Store both processed categorization data and raw content
+      result = {
+        'categories' => categorized_data,
+        'raw_content' => raw_content,
+        '_metadata' => {
+          processed_at: Time.now,
+          data_hash: generate_dataset_hash(dataset),
+          total_entries: count_total_entries(dataset),
+          raw_content_entries: raw_content.length
+        }
       }
 
-
+      result
     end
 
     private
```
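`integrate_dataset_into_categorization` now returns a wrapper hash instead of the bare category map, which is what the client's new legacy/new-format handling above accommodates. A sketch of the shape callers receive, with `processor` and `rows` as stand-ins:

```ruby
result = processor.integrate_dataset_into_categorization(rows, category_mappings)

result['categories']  # => { category => [domains, ...] } (what the old method returned)
result['raw_content'] # => the original Hash rows, preserved for the CSV export
result['_metadata']   # => processed_at, data_hash, total_entries, raw_content_entries
```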
data/lib/url_categorise/iab_compliance.rb:

```diff
@@ -45,6 +45,7 @@ module UrlCategorise
       # Social & Media
       social_media: 'IAB14', # Society
       streaming: 'IAB1-2', # Music
+      video_hosting: 'IAB1-2', # Music (video hosting platforms)
       blogs: 'IAB14', # Society
       forums: 'IAB19', # Technology & Computing
 
```
```diff
@@ -107,6 +108,7 @@ module UrlCategorise
       # Social & Media
       social_media: '14', # Society
       streaming: '1-2', # Music & Audio
+      video_hosting: '1-2', # Music & Audio (video hosting platforms)
       blogs: '14', # Society
       forums: '19', # Technology & Computing
 
```
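Both taxonomy tables map the new `video_hosting` category to the same bucket as `streaming`. A quick check via the mapping helper the client already calls during exports:

```ruby
UrlCategorise::IabCompliance.map_category_to_iab(:video_hosting, :v2) # => "IAB1-2"
UrlCategorise::IabCompliance.map_category_to_iab(:video_hosting, :v3) # => "1-2"
```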