RubyGems - UrlCategorise - Version diff - 0.1.2 → 0.1.3 - Mend

UrlCategorise 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

checksums.yaml +4 -4
data/.claude/settings.local.json +5 -1
data/CLAUDE.md +12 -2
data/Gemfile +2 -2
data/Gemfile.lock +6 -9
data/README.md +189 -1
data/Rakefile +8 -8
data/bin/check_lists +12 -13
data/bin/console +3 -3
data/lib/url_categorise/active_record_client.rb +97 -20
data/lib/url_categorise/client.rb +220 -111
data/lib/url_categorise/constants.rb +86 -71
data/lib/url_categorise/dataset_processor.rb +471 -0
data/lib/url_categorise/models.rb +53 -14
data/lib/url_categorise/version.rb +1 -1
data/lib/url_categorise.rb +1 -0
data/url_categorise.gemspec +34 -32
metadata +91 -50

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 74ab2a721a954a91958dc1c184fd76a4a2c3acd250730bb7db58c30286579fcd
-  data.tar.gz: 3b01bf42b654266dddcd85d684c2f3e25d455d0fe9b82094f07de2b95c026bd3
+  metadata.gz: dcb05d79b6bc09b5b338183d412cd309d9634a342c95b14ea25df5926d8609fb
+  data.tar.gz: effa4c7a010ee574fe6a41653af553a68710ceca46ebdb9dd5352096af5fa7e1
 SHA512:
-  metadata.gz: 2d175a1f72e2fda10771c770a694f07235cd7f5a3f93d1b487d82c98b4f7b490f7aaf665834dcf226bc1a6ae2cc833038731e8d9581af0a570302a7c17dad1c1
-  data.tar.gz: 2bd46865492bea3411b8506f294cf96b507c5c5beafad2584b2438977043fbfd3850e79c0a0342b11079829ef5bf3fd91c18c3f45ad5e0537fae8c78c9645245
+  metadata.gz: a527c801cbf6318305d640925dd75922c2ac1bcc76a5de75c75e3ad24698305c3d7d885d7da8dd280e61ceb1fe91a57eac185c5c209e11685f2ddb6833b120b9
+  data.tar.gz: de81765d20a0b36c54b71b935928777140f86f6e0a130c71ec7804ed28c1b3d8f12ca30fff5cf8c93952aaed7a9c279fe35a0d32bc83a851aad2556b55fd7942

data/.claude/settings.local.json CHANGED Viewed

@@ -6,7 +6,11 @@
       "Bash(ruby:*)",
       "Bash(bundle exec ruby:*)",
       "Bash(find:*)",
-      "Bash(grep:*)"
+      "Bash(grep:*)",
+      "Read(//Users/trex22/development/rubygems/kaggle/**)",
+      "Bash(for file in test/url_categorise/*dataset*test.rb)",
+      "Bash(do echo \"Checking $file...\")",
+      "Bash(done)"
     ],
     "deny": []
   }

data/CLAUDE.md CHANGED Viewed

@@ -78,12 +78,20 @@ The gem includes automatic monitoring and cleanup of broken URLs:
 - ActiveRecord/Rails integration (optional)
 - URL health monitoring and reporting
 - Automatic cleanup of broken blocklist sources
+- **Dataset Processing**: Kaggle and CSV dataset integration with three auth methods
+- **Optional Kaggle**: Can disable Kaggle functionality entirely while keeping CSV processing
+- **Smart Caching**: Cached datasets work without credentials, avoiding unnecessary authentication
+- **Data Hashing**: SHA256 content hashing for dataset change detection
+- **Category Mapping**: Flexible column detection and category mapping for datasets
+- **Credential Warnings**: Helpful warnings when Kaggle credentials are missing but functionality continues
 ### Architecture
 - `Client` class: Main interface for categorization
+- `DatasetProcessor` class: Handles Kaggle and CSV dataset processing
 - `Constants` module: Contains default list URLs and categories
-- Modular design allows extending with new list sources
-- Support for custom list directories and caching
+- `ActiveRecordClient` class: Database-backed client with dataset history
+- Modular design allows extending with new list sources and datasets
+- Support for custom list directories, caching, and dataset integration
 ### List Sources
 Primary sources include:
@@ -91,6 +99,8 @@ Primary sources include:
 - hagezi/dns-blocklists
 - StevenBlack/hosts
 - Various specialized security lists
+- **Kaggle datasets**: Public URL classification datasets
+- **Custom CSV files**: Direct CSV dataset URLs with flexible column mapping
 ### Testing Guidelines
 - Mock all HTTP requests using WebMock

data/Gemfile CHANGED Viewed

@@ -1,6 +1,6 @@
-source "https://rubygems.org"
+source 'https://rubygems.org'
-git_source(:github) {|repo_name| "https://github.com/TRex22/url_categorise" }
+git_source(:github) { |_repo_name| 'https://github.com/TRex22/url_categorise' }
 # Specify your gem's dependencies in url_categorise.gemspec
 gemspec

data/Gemfile.lock CHANGED Viewed

@@ -1,14 +1,16 @@
 PATH
   remote: .
   specs:
-    UrlCategorise (0.1.2)
+    UrlCategorise (0.1.3)
       api_pattern (>= 0.0.6, < 1.0)
       csv (>= 3.3.0, < 4.0)
       digest (>= 3.1.0, < 4.0)
       fileutils (>= 1.7.0, < 2.0)
       httparty (>= 0.22.0, < 1.0)
+      json (>= 2.7.0, < 3.0)
       nokogiri (>= 1.18.9, < 2.0)
       resolv (>= 0.4.0, < 1.0)
+      rubyzip (>= 2.3.0, < 3.0)
 GEM
   remote: https://rubygems.org/
@@ -78,19 +80,19 @@ GEM
     erubi (1.13.1)
     fileutils (1.7.3)
     hashdiff (1.2.0)
-    httparty (0.22.0)
+    httparty (0.23.1)
       csv
       mini_mime (>= 1.0.0)
       multi_xml (>= 0.5.2)
     i18n (1.14.7)
       concurrent-ruby (~> 1.0)
+    json (2.13.2)
     logger (1.7.0)
     loofah (2.24.1)
       crass (~> 1.0.2)
       nokogiri (>= 1.12.0)
     method_source (1.1.0)
     mini_mime (1.1.5)
-    mini_portile2 (2.8.9)
     minitest (5.25.5)
     minitest-focus (1.4.0)
       minitest (>= 4, < 6)
@@ -103,9 +105,6 @@ GEM
       ruby2_keywords (>= 0.0.5)
     multi_xml (0.7.2)
       bigdecimal (~> 3.1)
-    nokogiri (1.18.9)
-      mini_portile2 (~> 2.8.2)
-      racc (~> 1.4)
     nokogiri (1.18.9-arm64-darwin)
       racc (~> 1.4)
     pry (0.15.2)
@@ -130,6 +129,7 @@ GEM
     rexml (3.4.1)
     ruby-progressbar (1.13.0)
     ruby2_keywords (0.0.5)
+    rubyzip (2.4.1)
     securerandom (0.4.1)
     simplecov (0.22.0)
       docile (~> 1.1)
@@ -137,8 +137,6 @@ GEM
       simplecov_json_formatter (~> 0.1)
     simplecov-html (0.13.2)
     simplecov_json_formatter (0.1.4)
-    sqlite3 (2.7.3)
-      mini_portile2 (~> 2.8.0)
     sqlite3 (2.7.3-arm64-darwin)
     timecop (0.9.10)
     timeout (0.4.3)
@@ -153,7 +151,6 @@ GEM
 PLATFORMS
   arm64-darwin-24
-  ruby
 DEPENDENCIES
   UrlCategorise!

data/README.md CHANGED Viewed

@@ -192,6 +192,159 @@ ruby bin/check_lists
 [View all 60+ categories in constants.rb](lib/url_categorise/constants.rb)
+## Dataset Processing
+UrlCategorise now supports processing external datasets from Kaggle and CSV files to expand categorization data:
+### Kaggle Dataset Integration
+Load datasets directly from Kaggle using three authentication methods:
+```ruby
+# Method 1: Environment variables (KAGGLE_USERNAME, KAGGLE_KEY)
+client = UrlCategorise::Client.new(
+  dataset_config: {
+    kaggle: {}  # Will use environment variables
+  }
+)
+# Method 2: Explicit credentials
+client = UrlCategorise::Client.new(
+  dataset_config: {
+    kaggle: {
+      username: 'your_username',
+      api_key: 'your_api_key'
+    }
+  }
+)
+# Method 3: Credentials file (~/.kaggle/kaggle.json or custom path)
+client = UrlCategorise::Client.new(
+  dataset_config: {
+    kaggle: {
+      credentials_file: '/path/to/kaggle.json'
+    }
+  }
+)
+# Load and integrate a Kaggle dataset
+client.load_kaggle_dataset('owner', 'dataset-name', {
+  use_cache: true,  # Cache processed data
+  category_mappings: {
+    url_column: 'website',      # Column containing URLs/domains
+    category_column: 'type',    # Column containing categories
+    category_map: {
+      'malicious' => 'malware', # Map dataset categories to your categories
+      'spam' => 'phishing'
+    }
+  }
+})
+# Check categorization with dataset data
+categories = client.categorise('https://example.com')
+```
+### CSV Dataset Processing
+Load datasets from direct CSV URLs:
+```ruby
+client = UrlCategorise::Client.new(
+  dataset_config: {
+    download_path: './datasets',
+    cache_path: './dataset_cache'
+  }
+)
+# Load CSV dataset
+client.load_csv_dataset('https://example.com/url-classification.csv', {
+  use_cache: true,
+  category_mappings: {
+    url_column: 'url',
+    category_column: 'category'
+  }
+})
+```
+### Dataset Configuration Options
+```ruby
+dataset_config = {
+  # Kaggle functionality control
+  enable_kaggle: true,              # Set to false to disable Kaggle entirely (default: true)
+  # Kaggle authentication (optional - will try env vars and default file)
+  kaggle: {
+    username: 'kaggle_username',     # Or use KAGGLE_USERNAME env var
+    api_key: 'kaggle_api_key',       # Or use KAGGLE_KEY env var
+    credentials_file: '~/.kaggle/kaggle.json'  # Optional custom path
+  },
+  # File paths
+  download_path: './downloads',      # Where to store downloads
+  cache_path: './cache',            # Where to cache processed data
+  timeout: 30                       # HTTP timeout for downloads
+}
+client = UrlCategorise::Client.new(dataset_config: dataset_config)
+```
+### Disabling Kaggle Functionality
+You can completely disable Kaggle functionality if you only need CSV processing:
+```ruby
+# Disable Kaggle - only CSV datasets will work
+client = UrlCategorise::Client.new(
+  dataset_config: {
+    enable_kaggle: false,
+    download_path: './datasets',
+    cache_path: './dataset_cache'
+  }
+)
+# This will raise an error
+# client.load_kaggle_dataset('owner', 'dataset')  # Error!
+# But CSV datasets still work
+client.load_csv_dataset('https://example.com/data.csv')
+```
+### Working with Cached Datasets
+If you have cached datasets, you can access them even without Kaggle credentials:
+```ruby
+# No credentials provided, but cached data will work
+client = UrlCategorise::Client.new(
+  dataset_config: {
+    kaggle: {},  # Empty config - will show warning but continue
+    download_path: './datasets',
+    cache_path: './cache'
+  }
+)
+# Will work if data is cached, otherwise will show helpful error message
+client.load_kaggle_dataset('owner', 'dataset', use_cache: true)
+```
+### Dataset Metadata and Hashing
+The system automatically tracks dataset metadata and generates content hashes:
+```ruby
+# Get dataset metadata
+metadata = client.dataset_metadata
+metadata.each do |data_hash, meta|
+  puts "Dataset hash: #{data_hash}"
+  puts "Processed at: #{meta[:processed_at]}"
+  puts "Total entries: #{meta[:total_entries]}"
+end
+# Reload client with fresh dataset integration
+client.reload_with_datasets
+```
 ## ActiveRecord Integration
 For high-performance applications, enable database storage:
@@ -215,11 +368,31 @@ categories = client.categorise("example.com")
 # Get database statistics
 stats = client.database_stats
-# => { domains: 50000, ip_addresses: 15000, categories: 45, list_metadata: 90 }
+# => { domains: 50000, ip_addresses: 15000, categories: 45, list_metadata: 90, dataset_metadata: 5 }
 # Direct model access
 domain_record = UrlCategorise::Models::Domain.find_by(domain: "example.com")
 ip_record = UrlCategorise::Models::IpAddress.find_by(ip_address: "1.2.3.4")
+# Dataset integration with ActiveRecord
+client = UrlCategorise::ActiveRecordClient.new(
+  use_database: true,
+  dataset_config: {
+    kaggle: { username: 'user', api_key: 'key' }
+  }
+)
+# Load datasets - automatically stored in database
+client.load_kaggle_dataset('owner', 'dataset')
+client.load_csv_dataset('https://example.com/data.csv')
+# View dataset history
+history = client.dataset_history(limit: 5)
+# => [{ source_type: 'kaggle', identifier: 'owner/dataset', total_entries: 1000, processed_at: ... }]
+# Filter by source type
+kaggle_history = client.dataset_history(source_type: 'kaggle')
+csv_history = client.dataset_history(source_type: 'csv')
 ```
 ## Rails Integration
@@ -274,6 +447,21 @@ class CreateUrlCategoriseTables < ActiveRecord::Migration[7.0]
     add_index :url_categorise_ip_addresses, :ip_address
     add_index :url_categorise_ip_addresses, :categories
+    create_table :url_categorise_dataset_metadata do |t|
+      t.string :source_type, null: false, index: true
+      t.string :identifier, null: false
+      t.string :data_hash, null: false, index: { unique: true }
+      t.integer :total_entries, null: false
+      t.text :category_mappings
+      t.text :processing_options
+      t.datetime :processed_at
+      t.timestamps
+    end
+    add_index :url_categorise_dataset_metadata, :source_type
+    add_index :url_categorise_dataset_metadata, :identifier
+    add_index :url_categorise_dataset_metadata, :processed_at
   end
 end
 ```

data/Rakefile CHANGED Viewed

@@ -1,12 +1,12 @@
-require "bundler/gem_tasks"
-require "bundler/setup"
-require "rake/testtask"
+require 'bundler/gem_tasks'
+require 'bundler/setup'
+require 'rake/testtask'
 Rake::TestTask.new(:test) do |t|
-  t.libs << "test"
-  t.libs << "lib"
-  t.test_files = FileList["test/**/*_test.rb"]
-  t.ruby_opts = ["-rbundler/setup"]
+  t.libs << 'test'
+  t.libs << 'lib'
+  t.test_files = FileList['test/**/*_test.rb']
+  t.ruby_opts = ['-rbundler/setup']
 end
-task :default => :test
+task default: :test

data/bin/check_lists CHANGED Viewed

@@ -3,46 +3,45 @@
 require 'bundler/setup'
 require_relative '../lib/url_categorise'
-puts "=== CHECKING ALL URLs IN CONSTANTS ==="
+puts '=== CHECKING ALL URLs IN CONSTANTS ==='
 UrlCategorise::Constants::DEFAULT_HOST_URLS.each do |category, urls|
   puts "\n#{category.upcase}:"
   # Skip categories that only reference other categories (symbols)
   actual_urls = urls.reject { |url| url.is_a?(Symbol) }
   if actual_urls.empty?
     if urls.empty?
-      puts "  Empty category (no URLs defined)"
+      puts '  Empty category (no URLs defined)'
     else
       puts "  Only references other categories: #{urls}"
     end
     next
   end
   actual_urls.each do |url|
     print "  Testing #{url}... "
     begin
       response = HTTParty.head(url, timeout: 10)
       case response.code
       when 200
-        puts "✅ OK"
+        puts '✅ OK'
       when 404
-        puts "❌ 404 Not Found"
+        puts '❌ 404 Not Found'
       when 403
-        puts "❌ 403 Forbidden"
+        puts '❌ 403 Forbidden'
       when 500..599
         puts "❌ Server Error (#{response.code})"
       else
         puts "⚠️ HTTP #{response.code}"
       end
     rescue Net::TimeoutError, HTTParty::TimeoutError
-      puts "❌ Timeout"
-    rescue SocketError, Errno::ECONNREFUSED => e
-      puts "❌ DNS/Network Error"
-    rescue => e
+      puts '❌ Timeout'
+    rescue SocketError, Errno::ECONNREFUSED
+      puts '❌ DNS/Network Error'
+    rescue StandardError => e
       puts "❌ Error: #{e.class}"
     end
   end
 end

data/bin/console CHANGED Viewed

@@ -1,11 +1,11 @@
 #!/usr/bin/env ruby
-require "bundler/setup"
-require "url_categorise"
+require 'bundler/setup'
+require 'url_categorise'
 # You can add fixtures and/or initialization code here to make experimenting
 # with your gem easier. You can also use a different console, if you like.
 # (If you use this, don't forget to add pry to your Gemfile!)
-require "pry"
+require 'pry'
 Pry.start

data/lib/url_categorise/active_record_client.rb CHANGED Viewed

@@ -3,65 +3,125 @@ require_relative 'models'
 module UrlCategorise
   class ActiveRecordClient < Client
     def initialize(**kwargs)
-      raise "ActiveRecord not available" unless UrlCategorise::Models.available?
+      raise 'ActiveRecord not available' unless UrlCategorise::Models.available?
       @use_database = kwargs.delete(:use_database) { true }
       super(**kwargs)
       populate_database if @use_database
     end
     def categorise(url)
       return super(url) unless @use_database && UrlCategorise::Models.available?
-      host = (URI.parse(url).host || url).downcase.gsub("www.", "")
+      host = (URI.parse(url).host || url).downcase.gsub('www.', '')
       # Try database first
       categories = UrlCategorise::Models::Domain.categorise(host)
       return categories unless categories.empty?
       # Fallback to memory-based categorization
       super(url)
     end
     def categorise_ip(ip_address)
       return super(ip_address) unless @use_database && UrlCategorise::Models.available?
       # Try database first
       categories = UrlCategorise::Models::IpAddress.categorise(ip_address)
       return categories unless categories.empty?
       # Fallback to memory-based categorization
       super(ip_address)
     end
     def update_database
       return unless @use_database && UrlCategorise::Models.available?
       populate_database
     end
     def database_stats
       return {} unless @use_database && UrlCategorise::Models.available?
       {
         domains: UrlCategorise::Models::Domain.count,
         ip_addresses: UrlCategorise::Models::IpAddress.count,
         list_metadata: UrlCategorise::Models::ListMetadata.count,
+        dataset_metadata: UrlCategorise::Models::DatasetMetadata.count,
         categories: UrlCategorise::Models::Domain.distinct.pluck(:categories).flatten.uniq.size
       }
     end
+    def load_kaggle_dataset(dataset_owner, dataset_name, options = {})
+      result = super(dataset_owner, dataset_name, options)
+      # Store dataset metadata in database if enabled
+      if @use_database && UrlCategorise::Models.available? && @dataset_metadata
+        store_dataset_metadata_in_db(
+          source_type: 'kaggle',
+          identifier: "#{dataset_owner}/#{dataset_name}",
+          metadata: @dataset_metadata.values.last,
+          category_mappings: options[:category_mappings],
+          processing_options: options
+        )
+      end
+      # Repopulate database with integrated dataset domains
+      populate_database if @use_database
+      result
+    end
+    def load_csv_dataset(url, options = {})
+      result = super(url, options)
+      # Store dataset metadata in database if enabled
+      if @use_database && UrlCategorise::Models.available? && @dataset_metadata
+        store_dataset_metadata_in_db(
+          source_type: 'csv',
+          identifier: url,
+          metadata: @dataset_metadata.values.last,
+          category_mappings: options[:category_mappings],
+          processing_options: options
+        )
+      end
+      # Repopulate database with integrated dataset domains
+      populate_database if @use_database
+      result
+    end
+    def dataset_history(source_type: nil, limit: 10)
+      return [] unless @use_database && UrlCategorise::Models.available?
+      query = UrlCategorise::Models::DatasetMetadata.order(processed_at: :desc).limit(limit)
+      query = query.by_source(source_type) if source_type
+      query.map do |record|
+        {
+          source_type: record.source_type,
+          identifier: record.identifier,
+          data_hash: record.data_hash,
+          total_entries: record.total_entries,
+          processed_at: record.processed_at,
+          category_mappings: record.category_mappings,
+          processing_options: record.processing_options
+        }
+      end
+    end
     private
     def populate_database
       return unless UrlCategorise::Models.available?
       # Store list metadata
       @host_urls.each do |category, urls|
         urls.each do |url|
           next unless url.is_a?(String)
           metadata = @metadata[url] || {}
           UrlCategorise::Models::ListMetadata.find_or_create_by(url: url) do |record|
             record.name = category.to_s
@@ -76,7 +136,7 @@ module UrlCategorise
       @hosts.each do |category, domains|
         domains.each do |domain|
           next if domain.nil? || domain.empty?
           existing = UrlCategorise::Models::Domain.find_by(domain: domain)
           if existing
             # Add category if not already present
@@ -92,15 +152,15 @@ module UrlCategorise
       end
       # Store IP data (for IP-based lists)
-      ip_categories = [:sanctions_ips, :compromised_ips, :tor_exit_nodes, :open_proxy_ips,
-                       :banking_trojans, :malicious_ssl_certificates, :top_attack_sources]
+      ip_categories = %i[sanctions_ips compromised_ips tor_exit_nodes open_proxy_ips
+                         banking_trojans malicious_ssl_certificates top_attack_sources]
       ip_categories.each do |category|
         next unless @hosts[category]
         @hosts[category].each do |ip|
           next if ip.nil? || ip.empty? || !ip.match(/^\d+\.\d+\.\d+\.\d+$/)
           existing = UrlCategorise::Models::IpAddress.find_by(ip_address: ip)
           if existing
             categories = existing.categories | [category.to_s]
@@ -114,5 +174,22 @@ module UrlCategorise
         end
       end
     end
+    def store_dataset_metadata_in_db(source_type:, identifier:, metadata:, category_mappings: nil,
+                                     processing_options: nil)
+      return unless UrlCategorise::Models.available?
+      UrlCategorise::Models::DatasetMetadata.find_or_create_by(data_hash: metadata[:data_hash]) do |record|
+        record.source_type = source_type
+        record.identifier = identifier
+        record.total_entries = metadata[:total_entries]
+        record.category_mappings = category_mappings || {}
+        record.processing_options = processing_options || {}
+        record.processed_at = metadata[:processed_at] || Time.now
+      end
+    rescue ActiveRecord::RecordInvalid => e
+      # Dataset metadata already exists or validation failed
+      puts "Warning: Failed to store dataset metadata: #{e.message}" if ENV['DEBUG']
+    end
   end
-end
+end