UrlCategorise 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
  require 'set'
+ require 'digest'

  module UrlCategorise
  class Client < ApiPattern::Client
@@ -23,8 +24,10 @@ module UrlCategorise
  attribute :auto_load_datasets, type: Boolean, default: false
  attribute :smart_categorization_enabled, type: Boolean, default: false
  attribute :smart_rules, default: -> { {} }
+ attribute :regex_categorization_enabled, type: Boolean, default: false
+ attribute :regex_patterns_file, default: -> { VIDEO_URL_PATTERNS_FILE }

- attr_reader :hosts, :metadata, :dataset_processor, :dataset_categories
+ attr_reader :hosts, :metadata, :dataset_processor, :dataset_categories, :regex_patterns

  def initialize(**kwargs)
  # Extract dataset_config for later use
@@ -41,15 +44,21 @@ module UrlCategorise
  self.auto_load_datasets = kwargs.key?(:auto_load_datasets) ? kwargs[:auto_load_datasets] : false
  self.smart_categorization_enabled = kwargs.key?(:smart_categorization) ? kwargs[:smart_categorization] : false
  self.smart_rules = initialize_smart_rules(kwargs.key?(:smart_rules) ? kwargs[:smart_rules] : {})
+ self.regex_categorization_enabled = kwargs.key?(:regex_categorization) ? kwargs[:regex_categorization] : false
+ self.regex_patterns_file = kwargs.key?(:regex_patterns_file) ? kwargs[:regex_patterns_file] : VIDEO_URL_PATTERNS_FILE

  @metadata = {}
  @dataset_categories = Set.new # Track which categories come from datasets
+ @regex_patterns = {}

  # Initialize dataset processor if config provided
  @dataset_processor = initialize_dataset_processor(dataset_config) unless dataset_config.empty?

  @hosts = fetch_and_build_host_lists

+ # Load regex patterns if enabled
+ load_regex_patterns if regex_categorization_enabled
+
  # Auto-load datasets from constants if enabled
  load_datasets_from_constants if auto_load_datasets && @dataset_processor
  end
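
The two new options are plain constructor keywords. A minimal usage sketch (the class and keyword names come from the hunks above; the local file path is hypothetical):

    require 'url_categorise'

    client = UrlCategorise::Client.new(
      regex_categorization: true,                            # enables pattern loading and matching
      regex_patterns_file: 'file:///tmp/video_patterns.txt'  # optional override; defaults to VIDEO_URL_PATTERNS_FILE
    )
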
@@ -67,6 +76,9 @@ module UrlCategorise
  # Apply smart categorization if enabled
  categories = apply_smart_categorization(url, categories) if smart_categorization_enabled

+ # Apply regex categorization if enabled
+ categories = apply_regex_categorization(url, categories) if regex_categorization_enabled
+
  if iab_compliance_enabled
  IabCompliance.get_iab_categories(categories, iab_version)
  else
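
When the regex step fires on a URL already categorised under a video-related domain, apply_regex_categorization (added later in this diff) appends a more specific `<category>_content` symbol. A hedged sketch of the observable behaviour:

    # Assuming the YouTube blocklist categorised the domain as :youtube and a
    # watch-page pattern matches, the result may gain the derived symbol:
    client.categorise('https://youtube.com/watch?v=dQw4w9WgXcQ')
    # => e.g. [:youtube, :video_hosting, :youtube_content]
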
@@ -103,6 +115,29 @@ module UrlCategorise
  categories.uniq
  end

+ def video_url?(url)
+ return false unless url && !url.empty?
+ return false unless regex_categorization_enabled && @regex_patterns.any?
+
+ # First check if it's from a video hosting domain
+ categories = categorise(url)
+ video_hosting_categories = categories & [:video, :video_hosting, :youtube, :vimeo, :tiktok, :dailymotion, :twitch]
+
+ return false unless video_hosting_categories.any?
+
+ # Then check if it matches video content patterns
+ @regex_patterns.each do |_category, patterns|
+ patterns.each do |pattern_info|
+ return true if url.match?(pattern_info[:pattern])
+ end
+ end
+
+ false
+ rescue StandardError
+ # Handle any regex or URL parsing errors gracefully
+ false
+ end
+
  def count_of_hosts
  @hosts.keys.map do |category|
  @hosts[category].size
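
video_url? is a two-stage check: the URL must first categorise into one of the hard-coded video-hosting categories, then match at least one loaded regex pattern; any exception is swallowed and reported as false. A sketch under those rules:

    client.video_url?('https://vimeo.com/76979871')  # true only if both stages pass
    client.video_url?('https://example.com/page')    # false: no video-hosting category
    client.video_url?(nil)                           # false: caught by the nil/empty guard
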
@@ -431,58 +466,259 @@ module UrlCategorise

  FileUtils.mkdir_p(export_dir) unless Dir.exist?(export_dir)

- filename = "url_categorise_data_export_#{Time.now.strftime('%Y%m%d_%H%M%S')}.csv"
+ # Create single comprehensive CSV with ALL data
+ timestamp = Time.now.strftime('%Y%m%d_%H%M%S')
+ filename = "url_categorise_comprehensive_export_#{timestamp}.csv"
  file_path = File.join(export_dir, filename)

+ # Collect all available data
+ all_data = collect_all_export_data
+
+ # Create CSV with dynamic headers
+ headers = determine_comprehensive_headers(all_data)
+
  CSV.open(file_path, 'w', headers: true) do |csv|
- # Add headers
- csv << [
- 'domain',
- 'category',
- 'source_type',
- 'is_dataset_category',
- 'iab_category_v2',
- 'iab_category_v3',
- 'export_timestamp',
- 'smart_categorization_enabled'
- ]
+ csv << headers

- # Export all host/category data
- @hosts.each do |category, domains|
- domains.each do |domain|
- source_type = @dataset_categories.include?(category) ? 'dataset' : 'blocklist'
- is_dataset_category = @dataset_categories.include?(category)
-
- # Get IAB mappings if compliance is enabled
- iab_v2 = nil
- iab_v3 = nil
- if iab_compliance_enabled
- iab_v2 = IabCompliance.map_category_to_iab(category, :v2)
- iab_v3 = IabCompliance.map_category_to_iab(category, :v3)
+ all_data.each do |entry|
+ row = headers.map { |header| entry[header] || entry[header.to_sym] || '' }
+ csv << row
+ end
+ end
+
+ # Create summary file
+ summary_filename = "export_summary_#{timestamp}.json"
+ summary_file_path = File.join(export_dir, summary_filename)
+
+ summary = create_comprehensive_export_summary(file_path, all_data, export_dir)
+ File.write(summary_file_path, JSON.pretty_generate(summary))
+
+ {
+ csv_file: file_path,
+ summary_file: summary_file_path,
+ summary: summary[:data_summary],
+ export_directory: export_dir,
+ total_entries: all_data.length
+ }
+ end
+
+ private
+
+ def load_regex_patterns
+ return unless regex_patterns_file
+
+ @regex_patterns = {}
+ current_category = nil
+
+ content = fetch_regex_patterns_content
+ return unless content
+
+ content.split("\n").each do |line|
+ line = line.strip
+ next if line.empty?
+
+ # Check if this line is a source comment
+ if line.match(/^# Source: (.+)$/)
+ current_category = $1.downcase
+ @regex_patterns[current_category] = [] unless @regex_patterns[current_category]
+ elsif current_category && !line.start_with?('#') && !line.empty?
+ # This is a regex pattern
+ begin
+ regex = Regexp.new(line)
+ @regex_patterns[current_category] << {
+ pattern: regex,
+ raw: line
+ }
+ rescue RegexpError => e
+ puts "Warning: Invalid regex pattern '#{line}': #{e.message}"
+ end
+ end
+ end
+
+ puts "Loaded #{@regex_patterns.values.flatten.size} regex patterns from #{@regex_patterns.keys.size} categories" if @regex_patterns.any?
+ end
+
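
Per the parser above, the patterns file lists one regex per line grouped under `# Source:` comment headers; the header text (downcased) becomes the category key, blank lines are skipped, and invalid regexes are warned about and dropped. An illustrative file (these example patterns are not taken from the shipped list):

    # Source: YouTube
    ^https?://(www\.)?youtube\.com/watch\?v=[\w-]+

    # Source: Vimeo
    ^https?://(www\.)?vimeo\.com/\d+
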
+ def fetch_regex_patterns_content
+ if regex_patterns_file.start_with?('http://', 'https://')
+ # Remote URL
+ begin
+ response = HTTParty.get(regex_patterns_file, timeout: request_timeout)
+ return response.body if response.code == 200
+ rescue HTTParty::Error, Net::HTTPError, SocketError, Timeout::Error, URI::InvalidURIError, StandardError => e
+ puts "Warning: Failed to fetch regex patterns from #{regex_patterns_file}: #{e.message}"
+ return nil
+ end
+ elsif regex_patterns_file.start_with?('file://')
+ # Local file URL
+ file_path = regex_patterns_file.sub('file://', '')
+ return File.read(file_path) if File.exist?(file_path)
+ elsif File.exist?(regex_patterns_file)
+ # Direct file path
+ return File.read(regex_patterns_file)
+ end
+
+ puts "Warning: Regex patterns file not found: #{regex_patterns_file}"
+ nil
+ end
+
+ def apply_regex_categorization(url, existing_categories)
+ return existing_categories unless @regex_patterns.any?
+
+ # If we have existing categories that match domains, check if the URL matches video patterns
+ video_categories = existing_categories & [:video, :video_hosting, :youtube, :vimeo, :tiktok]
+
+ if video_categories.any?
+ # Check if this URL matches any video patterns
+ @regex_patterns.each do |category, patterns|
+ patterns.each do |pattern_info|
+ if url.match?(pattern_info[:pattern])
+ # This is a video content URL, add a more specific categorization
+ existing_categories << "#{video_categories.first}_content".to_sym unless existing_categories.include?("#{video_categories.first}_content".to_sym)
+ break
  end
-
- csv << [
- domain,
- category,
- source_type,
- is_dataset_category,
- iab_v2,
- iab_v3,
- Time.now.iso8601,
- smart_categorization_enabled
- ]
  end
  end
  end
+
+ existing_categories.uniq
+ end
+
+ def collect_all_export_data
+ all_data = []
+
+ # 1. Add all processed domain/category mappings
+ @hosts.each do |category, domains|
+ domains.each do |domain|
+ source_type = @dataset_categories.include?(category) ? 'dataset' : 'blocklist'
+ is_dataset_category = @dataset_categories.include?(category)
+
+ # Get IAB mappings if compliance is enabled
+ iab_v2 = nil
+ iab_v3 = nil
+ if iab_compliance_enabled
+ iab_v2 = IabCompliance.map_category_to_iab(category, :v2)
+ iab_v3 = IabCompliance.map_category_to_iab(category, :v3)
+ end
+
+ entry = {
+ 'data_type' => 'domain_categorization',
+ 'domain' => domain,
+ 'url' => domain, # For compatibility
+ 'category' => category.to_s,
+ 'source_type' => source_type,
+ 'is_dataset_category' => is_dataset_category,
+ 'iab_category_v2' => iab_v2,
+ 'iab_category_v3' => iab_v3,
+ 'export_timestamp' => Time.now.iso8601,
+ 'smart_categorization_enabled' => smart_categorization_enabled
+ }
+
+ all_data << entry
+ end
+ end
+
+ # 2. Add raw dataset content from cache
+ collect_cached_dataset_content.each do |entry|
+ entry['data_type'] = 'raw_dataset_content'
+ all_data << entry
+ end
+
+ # 3. Try to collect currently loaded dataset data if available
+ collect_current_dataset_content.each do |entry|
+ entry['data_type'] = 'current_dataset_content'
+ all_data << entry
+ end
+
+ all_data
+ end
+
+ def collect_cached_dataset_content
+ cached_data = []
+ return cached_data unless @dataset_processor
+
+ # Collect from cached datasets if available
+ (@dataset_metadata || {}).each do |data_hash, metadata|
+ cache_key = @dataset_processor.send(:generate_cache_key, metadata[:source_identifier] || data_hash, metadata[:source_type]&.to_sym || :unknown)
+ cached_result = @dataset_processor.send(:load_from_cache, cache_key)
+
+ if cached_result && cached_result.is_a?(Hash) && cached_result['raw_content']
+ cached_result['raw_content'].each do |entry|
+ enhanced_entry = entry.dup
+ enhanced_entry['dataset_source'] = metadata[:source_identifier] || 'unknown'
+ enhanced_entry['dataset_type'] = metadata[:source_type] || 'unknown'
+ enhanced_entry['processed_at'] = metadata[:processed_at]
+ cached_data << enhanced_entry
+ end
+ elsif cached_result.is_a?(Array)
+ # Legacy format - array of entries
+ cached_result.each do |entry|
+ next unless entry.is_a?(Hash)
+ enhanced_entry = entry.dup
+ enhanced_entry['dataset_source'] = metadata[:source_identifier] || 'unknown'
+ enhanced_entry['dataset_type'] = metadata[:source_type] || 'unknown'
+ enhanced_entry['processed_at'] = metadata[:processed_at]
+ cached_data << enhanced_entry
+ end
+ end
+ end
+
+ cached_data
+ end
+
+ def collect_current_dataset_content
+ # This is a placeholder - in practice, the original dataset content
+ # is processed and only domain mappings are kept in @hosts.
+ # The raw content should come from cache, but if we want to be more
+ # aggressive, we could re-process datasets here or store them differently.
+ []
+ end
+
+ def determine_comprehensive_headers(all_data)
+ # Collect all unique keys from all entries
+ all_keys = Set.new
+ all_data.each do |entry|
+ all_keys.merge(entry.keys.map(&:to_s))
+ end
+ all_keys_array = all_keys.to_a
+
+ # Core headers that should appear first
+ core_headers = %w[data_type domain url category]
+
+ # Standard categorization headers
+ categorization_headers = %w[source_type is_dataset_category iab_category_v2 iab_category_v3]
+
+ # Dataset content headers
+ content_headers = %w[title description text content summary body]

- # Create metadata file
- metadata_path = File.join(export_dir, "#{File.basename(filename, '.csv')}_metadata.json")
- metadata = {
+ # Metadata headers
+ metadata_headers = %w[dataset_source dataset_type processed_at export_timestamp smart_categorization_enabled]
+
+ # Build final header order
+ ordered_headers = []
+ ordered_headers += (core_headers & all_keys_array)
+ ordered_headers += (categorization_headers & all_keys_array)
+ ordered_headers += (content_headers & all_keys_array)
+
+ # Add any remaining headers (alphabetically sorted)
+ remaining_headers = (all_keys_array - ordered_headers - metadata_headers).sort
+ ordered_headers += remaining_headers
+
+ # Add metadata headers at the end
+ ordered_headers += (metadata_headers & all_keys_array)
+
+ ordered_headers
+ end
+
+ def create_comprehensive_export_summary(file_path, all_data, export_dir)
+ domain_entries = all_data.select { |entry| entry['data_type'] == 'domain_categorization' }
+ dataset_entries = all_data.select { |entry| entry['data_type']&.include?('dataset') }
+
+ {
  export_info: {
  timestamp: Time.now.iso8601,
- filename: filename,
- file_path: file_path,
- metadata_path: metadata_path
+ export_directory: export_dir,
+ csv_file: file_path,
+ total_entries: all_data.length
  },
  client_settings: {
  iab_compliance_enabled: iab_compliance_enabled,
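
The reworked export now writes one comprehensive CSV plus a JSON summary and returns both paths. A usage sketch; the exporting method's name is not visible in this hunk, so `export_csv_data` is a guess, while the returned keys come verbatim from the diff:

    result = client.export_csv_data
    result[:csv_file]       # url_categorise_comprehensive_export_<timestamp>.csv
    result[:summary_file]   # export_summary_<timestamp>.json
    result[:summary]        # the :data_summary section of the summary hash
    result[:total_entries]  # row count from collect_all_export_data
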
@@ -491,25 +727,21 @@ module UrlCategorise
  auto_load_datasets: auto_load_datasets
  },
  data_summary: {
+ total_entries: all_data.length,
+ domain_categorization_entries: domain_entries.length,
+ dataset_content_entries: dataset_entries.length,
  total_domains: @hosts.values.map(&:length).sum,
  total_categories: @hosts.keys.length,
  dataset_categories_count: @dataset_categories.size,
  blocklist_categories_count: @hosts.keys.length - @dataset_categories.size,
- categories: @hosts.keys.sort.map(&:to_s)
+ categories: @hosts.keys.sort.map(&:to_s),
+ has_dataset_content: dataset_entries.any?
  },
  dataset_metadata: dataset_metadata
  }
-
- File.write(metadata_path, JSON.pretty_generate(metadata))
-
- {
- csv_file: file_path,
- metadata_file: metadata_path,
- summary: metadata[:data_summary],
- export_directory: export_dir
- }
  end

+
  private

  def initialize_dataset_processor(config)
@@ -541,14 +773,23 @@ module UrlCategorise
  return dataset unless @dataset_processor
  return nil unless dataset # Handle nil datasets gracefully

- categorized_data = @dataset_processor.integrate_dataset_into_categorization(dataset, category_mappings)
+ processed_result = @dataset_processor.integrate_dataset_into_categorization(dataset, category_mappings)

- # Store metadata
- @dataset_metadata ||= {}
- @dataset_metadata[categorized_data[:_metadata][:data_hash]] = categorized_data[:_metadata]
+ # Handle new data structure with categories and raw_content
+ if processed_result.is_a?(Hash) && processed_result['categories']
+ categorized_data = processed_result['categories']
+ metadata = processed_result['_metadata']
+ else
+ # Legacy format - assume the whole result is categorized data
+ categorized_data = processed_result
+ metadata = categorized_data[:_metadata] if categorized_data.respond_to?(:delete)
+ end

- # Remove metadata from the working data
- categorized_data.delete(:_metadata)
+ # Store metadata
+ if metadata
+ @dataset_metadata ||= {}
+ @dataset_metadata[metadata[:data_hash]] = metadata
+ end

  # Merge with existing host data
  categorized_data.each do |category, domains|
@@ -776,6 +1017,24 @@ module UrlCategorise
  end

  def download_and_parse_list(url)
+ if url.start_with?('file://')
+ # Handle local file URLs
+ file_path = url.sub('file://', '')
+ return [] unless File.exist?(file_path)
+
+ content = File.read(file_path)
+ return [] if content.nil? || content.empty?
+
+ # Store metadata
+ @metadata[url] = {
+ last_updated: Time.now,
+ content_hash: Digest::SHA256.hexdigest(content),
+ status: 'success'
+ }
+
+ return parse_list_content(content, detect_list_format(content))
+ end
+
  raw_data = HTTParty.get(url, timeout: request_timeout)
  return [] if raw_data.body.nil? || raw_data.body.empty?
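
download_and_parse_list now short-circuits for file:// URLs, reading the list from disk and recording a SHA256 content hash in @metadata (hence the new require 'digest') before handing off to the usual parsing. Together with the url_valid? change in the next hunk, this enables fully offline list sources; a sketch assuming the gem's host_urls keyword and a hypothetical local path:

    client = UrlCategorise::Client.new(
      host_urls: { custom_category: ['file:///etc/blocklists/custom.hosts'] }
    )
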

@@ -922,6 +1181,9 @@ module UrlCategorise
  end

  def url_valid?(url)
+ return false if url.nil? || url.empty?
+ return true if url.start_with?('file://')
+
  uri = URI.parse(url)
  uri.is_a?(URI::HTTP) && !uri.host.nil?
  rescue URI::InvalidURIError
@@ -2,6 +2,9 @@ module UrlCategorise
  module Constants
  ONE_MEGABYTE = 1_048_576

+ # Video URL patterns for detecting video content
+ VIDEO_URL_PATTERNS_FILE = 'https://raw.githubusercontent.com/TRex22/url_categorise/refs/heads/main/lists/video_url_patterns.txt'.freeze
+
  # crawler data
  # https://commoncrawl.org/
@@ -16,12 +19,13 @@ module UrlCategorise
  DEFAULT_HOST_URLS = {
  abuse: ['https://github.com/blocklistproject/Lists/raw/master/abuse.txt'],
  adobe: ['https://github.com/blocklistproject/Lists/raw/master/adobe.txt'],
+ adult: %i[pornography dating_services drugs gambling],
  advertising: ['https://blocklistproject.github.io/Lists/ads.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-advert_01.txt'],
  amazon: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/amazon/all'],
  amp_hosts: ['https://www.github.developerdan.com/hosts/lists/amp-hosts-extended.txt'],
  apple: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/apple/all'],
  cloudflare: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/cloudflare/all'],
- crypto: ['https://github.com/blocklistproject/Lists/raw/master/crypto.txt'],
+ crypto: ['https://github.com/blocklistproject/Lists/raw/master/crypto.txt', 'https://v.firebog.net/hosts/Prigent-Crypto.txt'],
  dating_services: ['https://www.github.developerdan.com/hosts/lists/dating-services-extended.txt'],
  drugs: ['https://github.com/blocklistproject/Lists/raw/master/drugs.txt'],
  facebook: ['https://github.com/blocklistproject/Lists/raw/master/facebook.txt',
@@ -39,10 +43,10 @@ module UrlCategorise
  microsoft: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/microsoft/all'],
  mozilla: ['https://github.com/jmdugan/blocklists/raw/master/corporations/mozilla/all'],
  nsa: ['https://raw.githubusercontent.com/tigthor/NSA-CIA-Blocklist/main/HOSTS/HOSTS'],
- phishing: ['https://blocklistproject.github.io/Lists/phishing.txt'],
+ phishing: ['https://blocklistproject.github.io/Lists/phishing.txt', 'https://openphish.com/feed.txt'],
  pinterest: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/pinterest/all'],
  piracy: ['https://github.com/blocklistproject/Lists/raw/master/piracy.txt', 'https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/anti.piracy.txt'],
- pornography: ['https://blocklistproject.github.io/Lists/porn.txt'],
+ pornography: ['https://blocklistproject.github.io/Lists/porn.txt', 'https://v.firebog.net/hosts/Prigent-Adult.txt'],
  reddit: ['https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-reddit.txt'],
  redirect: ['https://github.com/blocklistproject/Lists/raw/master/redirect.txt'],
  scam: ['https://blocklistproject.github.io/Lists/scam.txt'],
@@ -53,6 +57,8 @@ module UrlCategorise
  tracking: ['https://blocklistproject.github.io/Lists/tracking.txt'],
  twitter: ['https://github.com/blocklistproject/Lists/raw/master/twitter.txt', 'https://github.com/jmdugan/blocklists/raw/master/corporations/twitter/all'],
  vaping: ['https://github.com/blocklistproject/Lists/raw/master/vaping.txt'],
+ video: ['https://raw.githubusercontent.com/wilwade/pihole-block-video/master/hosts.txt'],
+ video_hosting: ['https://raw.githubusercontent.com/TRex22/url_categorise/refs/heads/main/lists/video_hosting_domains.hosts'],
  whatsapp: ['https://github.com/blocklistproject/Lists/raw/master/whatsapp.txt', 'https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/whatsapp'],
  youtube: ['https://github.com/blocklistproject/Lists/raw/master/youtube.txt', 'https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/youtube'],
@@ -82,9 +88,6 @@ module UrlCategorise

  # Extended categories for better organization
  cryptojacking: ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt'],
- # ransomware: ["https://ransomwaretracker.abuse.ch/downloads/RW_DOMBL.txt"],
- # botnet_command_control: ["https://osint.bambenekconsulting.com/feeds/c2-dommasterlist.txt"], # URL returns 403 Forbidden
- phishing_extended: ['https://openphish.com/feed.txt'],

  # Regional and specialized lists
  chinese_ad_hosts: ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts'],
@@ -124,28 +124,40 @@ module UrlCategorise

  def integrate_dataset_into_categorization(dataset, category_mappings = {})
  categorized_data = {}
+ raw_content = []

  case dataset
  when Hash
  # Single dataset with multiple files
  dataset.each do |file_name, data|
  process_dataset_file(data, file_name, category_mappings, categorized_data)
+ # Collect raw content
+ if data.is_a?(Array)
+ raw_content.concat(data.map { |row| row.is_a?(Hash) ? row : {} })
+ end
  end
  when Array
  # Single file dataset
  process_dataset_file(dataset, 'default', category_mappings, categorized_data)
+ # Collect raw content
+ raw_content.concat(dataset.map { |row| row.is_a?(Hash) ? row : {} })
  else
  raise Error, "Unsupported dataset format: #{dataset.class}"
  end

- # Add metadata
- categorized_data[:_metadata] = {
- processed_at: Time.now,
- data_hash: generate_dataset_hash(dataset),
- total_entries: count_total_entries(dataset)
+ # Store both processed categorization data and raw content
+ result = {
+ 'categories' => categorized_data,
+ 'raw_content' => raw_content,
+ '_metadata' => {
+ processed_at: Time.now,
+ data_hash: generate_dataset_hash(dataset),
+ total_entries: count_total_entries(dataset),
+ raw_content_entries: raw_content.length
+ }
  }

- categorized_data
+ result
  end

  private
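
integrate_dataset_into_categorization now returns a wrapper hash rather than the bare categorised data, which is exactly what the client's dual-format handling earlier in this diff unwraps. The shape, per the hunk above:

    result = processor.integrate_dataset_into_categorization(dataset, mappings)
    result['categories']   # category => domains mapping (the old return value, minus metadata)
    result['raw_content']  # array of the original row hashes, kept for the CSV export
    result['_metadata']    # { processed_at:, data_hash:, total_entries:, raw_content_entries: }
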
@@ -45,6 +45,7 @@ module UrlCategorise
  # Social & Media
  social_media: 'IAB14', # Society
  streaming: 'IAB1-2', # Music
+ video_hosting: 'IAB1-2', # Music (video hosting platforms)
  blogs: 'IAB14', # Society
  forums: 'IAB19', # Technology & Computing
@@ -107,6 +108,7 @@ module UrlCategorise
  # Social & Media
  social_media: '14', # Society
  streaming: '1-2', # Music & Audio
+ video_hosting: '1-2', # Music & Audio (video hosting platforms)
  blogs: '14', # Society
  forums: '19', # Technology & Computing
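
Both IAB mapping tables gain a :video_hosting entry so the new category survives compliance mode. A hedged sketch; map_category_to_iab is called with this signature elsewhere in the diff, and presumably returns the table values:

    UrlCategorise::IabCompliance.map_category_to_iab(:video_hosting, :v2)  # => 'IAB1-2'
    UrlCategorise::IabCompliance.map_category_to_iab(:video_hosting, :v3)  # => '1-2'
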

@@ -1,3 +1,3 @@
  module UrlCategorise
- VERSION = '0.1.6'
+ VERSION = '0.1.7'
  end