UrlCategorise 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,471 @@
+ require 'httparty'
+ require 'csv'
+ require 'digest'
+ require 'fileutils'
+ require 'net/http'
+ require 'timeout'
+ require 'zip'
+ require 'json'
+
+ module UrlCategorise
+   class DatasetProcessor
+     include HTTParty
+
+     KAGGLE_BASE_URL = 'https://www.kaggle.com/api/v1'
+     DEFAULT_DOWNLOAD_PATH = './downloads'
+     DEFAULT_CACHE_PATH = './cache'
+     DEFAULT_TIMEOUT = 30
+     DEFAULT_CREDENTIALS_FILE = File.expand_path('~/.kaggle/kaggle.json')
+
+     attr_reader :username, :api_key, :download_path, :cache_path, :timeout, :kaggle_enabled
+
+     def initialize(username: nil, api_key: nil, credentials_file: nil, download_path: nil, cache_path: nil,
+                    timeout: nil, enable_kaggle: true)
+       @kaggle_enabled = enable_kaggle
+
+       if @kaggle_enabled
+         load_credentials(username, api_key, credentials_file)
+         warn_if_kaggle_credentials_missing
+       else
+         @username = nil
+         @api_key = nil
+       end
+
+       @download_path = download_path || DEFAULT_DOWNLOAD_PATH
+       @cache_path = cache_path || DEFAULT_CACHE_PATH
+       @timeout = timeout || DEFAULT_TIMEOUT
+
+       ensure_directories_exist
+       setup_httparty_options if kaggle_credentials_available?
+     end
+
+     def process_kaggle_dataset(dataset_owner, dataset_name, options = {})
+       unless @kaggle_enabled
+         raise Error, 'Kaggle functionality is disabled. Set enable_kaggle: true to use Kaggle datasets.'
+       end
+
+       dataset_path = "#{dataset_owner}/#{dataset_name}"
+
+       # Check cache first if requested - no credentials needed for cached data
+       if options[:use_cache]
+         cached_data = load_from_cache(generate_cache_key(dataset_path, :kaggle))
+         return cached_data if cached_data
+       end
+
+       # Check if we already have extracted files - no credentials needed
+       extracted_dir = get_extracted_dir(dataset_path)
+       if options[:use_cache] && Dir.exist?(extracted_dir) && !Dir.empty?(extracted_dir)
+         return handle_existing_dataset(extracted_dir, options)
+       end
+
+       # Only require credentials if we need to download fresh data
+       unless kaggle_credentials_available?
+         raise Error, 'Kaggle credentials required for downloading new datasets. ' \
+                      'Set KAGGLE_USERNAME/KAGGLE_KEY environment variables, provide credentials explicitly, ' \
+                      'or place kaggle.json file in ~/.kaggle/ directory.'
+       end
+
+       # Download from Kaggle API
+       response = authenticated_request(:get, "/datasets/download/#{dataset_path}")
+
+       raise Error, "Failed to download Kaggle dataset: #{response.message}" unless response.success?
+
+       # Process the downloaded data
+       result = process_dataset_response(response.body, dataset_path, :kaggle, options)
+
+       # Cache if requested
+       cache_processed_data(generate_cache_key(dataset_path, :kaggle), result) if options[:use_cache] && result
+
+       result
+     end
+
+     def process_csv_dataset(url, options = {})
+       cache_key = generate_cache_key(url, :csv)
+
+       # Check cache first if requested
+       if options[:use_cache]
+         cached_data = load_from_cache(cache_key)
+         return cached_data if cached_data
+       end
+
+       # Download CSV directly
+       response = HTTParty.get(url, timeout: @timeout, follow_redirects: true)
+
+       raise Error, "Failed to download CSV dataset: #{response.message}" unless response.success?
+
+       # Parse CSV content
+       result = parse_csv_content(response.body, options)
+
+       # Cache if requested
+       cache_processed_data(cache_key, result) if options[:use_cache] && result
+
+       result
+     end
+
+     def generate_dataset_hash(data)
+       content = case data
+                 when Hash
+                   data.to_json
+                 when Array
+                   data.to_json
+                 when String
+                   data
+                 else
+                   data.to_s
+                 end
+
+       Digest::SHA256.hexdigest(content)
+     end
+
+     def integrate_dataset_into_categorization(dataset, category_mappings = {})
+       categorized_data = {}
+
+       case dataset
+       when Hash
+         # Single dataset with multiple files
+         dataset.each do |file_name, data|
+           process_dataset_file(data, file_name, category_mappings, categorized_data)
+         end
+       when Array
+         # Single file dataset
+         process_dataset_file(dataset, 'default', category_mappings, categorized_data)
+       else
+         raise Error, "Unsupported dataset format: #{dataset.class}"
+       end
+
+       # Add metadata
+       categorized_data[:_metadata] = {
+         processed_at: Time.now,
+         data_hash: generate_dataset_hash(dataset),
+         total_entries: count_total_entries(dataset)
+       }
+
+       categorized_data
+     end
+
+     private
+
+     def kaggle_credentials_available?
+       valid_credential?(@username) && valid_credential?(@api_key)
+     end
+
+     def warn_if_kaggle_credentials_missing
+       return if kaggle_credentials_available?
+
+       warn 'Warning: Kaggle credentials not found. Kaggle datasets will only work if they are already cached. ' \
+            'To download new Kaggle datasets, set KAGGLE_USERNAME/KAGGLE_KEY environment variables, ' \
+            'provide credentials explicitly, or place kaggle.json file in ~/.kaggle/ directory.'
+     end
+
+     def valid_credential?(credential)
+       credential && !credential.to_s.strip.empty?
+     end
+
+     def load_credentials(username, api_key, credentials_file)
+       # Try provided credentials file first
+       if credentials_file && File.exist?(credentials_file)
+         credentials = load_credentials_from_file(credentials_file)
+         @username = username || credentials['username']
+         @api_key = api_key || credentials['key']
+       # Try default kaggle.json file if no explicit credentials
+       elsif !username && !api_key && File.exist?(DEFAULT_CREDENTIALS_FILE)
+         credentials = load_credentials_from_file(DEFAULT_CREDENTIALS_FILE)
+         @username = credentials['username']
+         @api_key = credentials['key']
+       else
+         # Fall back to environment variables
+         @username = username || ENV['KAGGLE_USERNAME']
+         @api_key = api_key || ENV['KAGGLE_KEY']
+       end
+     end
+
+     def load_credentials_from_file(file_path)
+       content = File.read(file_path)
+       JSON.parse(content)
+     rescue JSON::ParserError => e
+       raise Error, "Invalid credentials file format: #{e.message}"
+     rescue StandardError => e
+       raise Error, "Failed to read credentials file: #{e.message}"
+     end
+
+     def ensure_directories_exist
+       FileUtils.mkdir_p(@download_path) unless Dir.exist?(@download_path)
+       FileUtils.mkdir_p(@cache_path) unless Dir.exist?(@cache_path)
+     end
+
+     def setup_httparty_options
+       self.class.base_uri KAGGLE_BASE_URL
+       self.class.default_options.merge!({
+                                           headers: {
+                                             'User-Agent' => 'url_categorise-ruby-client'
+                                           },
+                                           timeout: @timeout,
+                                           basic_auth: {
+                                             username: @username,
+                                             password: @api_key
+                                           }
+                                         })
+     end
+
+     def authenticated_request(method, endpoint, options = {})
+       self.class.send(method, endpoint, options)
+     rescue Timeout::Error, Net::ReadTimeout, Net::OpenTimeout
+       raise Error, 'Request timed out'
+     rescue StandardError => e
+       raise Error, "Request failed: #{e.message}"
+     end
+
+     def process_dataset_response(content, dataset_path, source_type, options)
+       if source_type == :kaggle
+         # Kaggle returns ZIP files
+         zip_file = save_zip_file(dataset_path, content)
+         extracted_dir = get_extracted_dir(dataset_path)
+         extract_zip_file(zip_file, extracted_dir)
+         File.delete(zip_file) if File.exist?(zip_file)
+         handle_extracted_dataset(extracted_dir, options)
+       else
+         # Direct content processing
+         parse_csv_content(content, options)
+       end
+     end
+
+     def get_extracted_dir(dataset_path)
+       dir_name = dataset_path.gsub('/', '_').gsub(/[^a-zA-Z0-9_-]/, '_')
+       File.join(@download_path, dir_name)
+     end
+
+     def save_zip_file(dataset_path, content)
+       filename = "#{dataset_path.gsub('/', '_')}_#{Time.now.to_i}.zip"
+       file_path = File.join(@download_path, filename)
+
+       File.open(file_path, 'wb') do |file|
+         file.write(content)
+       end
+
+       file_path
+     end
+
+     def extract_zip_file(zip_file_path, extract_to_dir)
+       FileUtils.mkdir_p(extract_to_dir)
+
+       Zip::File.open(zip_file_path) do |zip_file|
+         zip_file.each do |entry|
+           extract_path = File.join(extract_to_dir, entry.name)
+
+           if entry.directory?
+             FileUtils.mkdir_p(extract_path)
+           else
+             parent_dir = File.dirname(extract_path)
+             FileUtils.mkdir_p(parent_dir) unless Dir.exist?(parent_dir)
+
+             File.open(extract_path, 'wb') do |f|
+               f.write entry.get_input_stream.read
+             end
+           end
+         end
+       end
+     rescue Zip::Error => e
+       raise Error, "Failed to extract zip file: #{e.message}"
+     end
+
+     def handle_existing_dataset(extracted_dir, _options)
+       csv_files = find_csv_files(extracted_dir)
+       return parse_csv_files_to_hash(csv_files) unless csv_files.empty?
+
+       extracted_dir
+     end
+
+     def handle_extracted_dataset(extracted_dir, _options)
+       csv_files = find_csv_files(extracted_dir)
+       return parse_csv_files_to_hash(csv_files) unless csv_files.empty?
+
+       extracted_dir
+     end
+
+     def find_csv_files(directory)
+       Dir.glob(File.join(directory, '**', '*.csv'))
+     end
+
+     def parse_csv_files_to_hash(csv_files)
+       result = {}
+
+       csv_files.each do |csv_file|
+         file_name = File.basename(csv_file, '.csv')
+         result[file_name] = parse_csv_file(csv_file)
+       end
+
+       # If there's only one CSV file, return its data directly
+       result.length == 1 ? result.values.first : result
+     end
+
+     def parse_csv_file(file_path)
+       raise Error, "File does not exist: #{file_path}" unless File.exist?(file_path)
+
+       data = []
+       CSV.foreach(file_path, headers: true, liberal_parsing: true) do |row|
+         data << row.to_hash
+       end
+
+       data
+     rescue CSV::MalformedCSVError => e
+       raise Error, "Failed to parse CSV file: #{e.message}"
+     end
+
+     def parse_csv_content(content, _options = {})
+       data = []
+       CSV.parse(content, headers: true, liberal_parsing: true) do |row|
+         data << row.to_hash
+       end
+
+       data
+     rescue CSV::MalformedCSVError => e
+       raise Error, "Failed to parse CSV content: #{e.message}"
+     end
+
+     def generate_cache_key(identifier, source_type)
+       sanitized = identifier.gsub(/[^a-zA-Z0-9_-]/, '_')
+       "#{source_type}_#{sanitized}_processed.json"
+     end
+
+     def load_from_cache(cache_key)
+       cache_file_path = File.join(@cache_path, cache_key)
+       return nil unless File.exist?(cache_file_path)
+
+       content = File.read(cache_file_path)
+       JSON.parse(content)
+     rescue JSON::ParserError
+       nil # Invalid cache, will re-process
+     rescue StandardError
+       nil # Cache read error, will re-process
+     end
+
+     def cache_processed_data(cache_key, data)
+       cache_file_path = File.join(@cache_path, cache_key)
+       File.write(cache_file_path, JSON.pretty_generate(data))
+     rescue StandardError
+       # Cache write failed, continue without caching
+     end
+
+     def process_dataset_file(data, file_name, category_mappings, categorized_data)
+       return unless data.is_a?(Array) && !data.empty?
+
+       # If explicit column mappings are provided, use them for all rows
+       if category_mappings[:url_column] && category_mappings[:category_column]
+         url_col = category_mappings[:url_column]
+         category_col = category_mappings[:category_column]
+
+         data.each do |row|
+           url = row[url_col]&.strip
+           next unless url && !url.empty?
+
+           # Extract domain from URL
+           domain = extract_domain(url)
+           next unless domain
+
+           # Determine category
+           category = determine_category(row, category_col, category_mappings, file_name)
+
+           # Add to categorized data
+           categorized_data[category] ||= []
+           categorized_data[category] << domain unless categorized_data[category].include?(domain)
+         end
+       else
+         # Auto-detect columns for each row (handles mixed column structures)
+         data.each do |row|
+           url_columns = detect_url_columns(row)
+           category_columns = detect_category_columns(row)
+
+           # Use detected columns for this specific row
+           url_col = url_columns.first
+           category_col = category_columns.first
+
+           next unless url_col # Must have URL column
+
+           url = row[url_col]&.strip
+           next unless url && !url.empty?
+
+           # Extract domain from URL
+           domain = extract_domain(url)
+           next unless domain
+
+           # Determine category
+           category = determine_category(row, category_col, category_mappings, file_name)
+
+           # Add to categorized data
+           categorized_data[category] ||= []
+           categorized_data[category] << domain unless categorized_data[category].include?(domain)
+         end
+       end
+     end
+
+     def detect_url_columns(sample_row)
+       url_indicators = %w[url domain website site link address]
+       sample_row.keys.select do |key|
+         key_lower = key.to_s.downcase
+         url_indicators.any? { |indicator| key_lower.include?(indicator) }
+       end
+     end
+
+     def detect_category_columns(sample_row)
+       category_indicators = %w[category class type classification label]
+       sample_row.keys.select do |key|
+         key_lower = key.to_s.downcase
+         category_indicators.any? { |indicator| key_lower.include?(indicator) }
+       end
+     end
+
+     def extract_domain(url)
+       # Handle both full URLs and domain-only entries
+       return nil if url.nil? || url.empty?
+
+       # Add protocol if missing
+       url = "http://#{url}" unless url.match?(%r{\A\w+://})
+
+       uri = URI.parse(url)
+       domain = uri.host&.downcase
+       domain = domain.gsub(/\Awww\./, '') if domain # Remove www prefix
+       domain
+     rescue URI::InvalidURIError
+       # If URI parsing fails, try to extract domain manually
+       cleaned = url.gsub(%r{\A\w+://}, '').gsub(%r{/.*\z}, '').downcase
+       cleaned = cleaned.gsub(/\Awww\./, '')
+       cleaned.empty? ? nil : cleaned
+     end
+
+     def determine_category(row, category_col, category_mappings, file_name)
+       # Use explicit category column if available
+       if category_col && row[category_col]
+         category = row[category_col].to_s.strip.downcase
+         return map_category_name(category, category_mappings)
+       end
+
+       # Use file name as category if no category column
+       map_category_name(file_name, category_mappings)
+     end
+
+     def map_category_name(original_name, category_mappings)
+       # Use provided mapping or sanitize the name
+       mapped = category_mappings[:category_map]&.[](original_name)
+       return mapped if mapped
+
+       # Sanitize and format category name
+       sanitized = original_name.to_s.downcase
+                                .gsub(/[^a-z0-9_]/, '_')
+                                .gsub(/_+/, '_')
+                                .gsub(/\A_|_\z/, '')
+
+       sanitized.empty? ? 'dataset_category' : sanitized
+     end
+
+     def count_total_entries(dataset)
+       case dataset
+       when Hash
+         dataset.values.map { |v| v.is_a?(Array) ? v.length : 1 }.sum
+       when Array
+         dataset.length
+       else
+         1
+       end
+     end
+   end
+ end
@@ -18,28 +18,28 @@ else
 
        class ListMetadata < ActiveRecord::Base
          self.table_name = 'url_categorise_list_metadata'
-
+
          validates :name, presence: true, uniqueness: true
          validates :url, presence: true
          validates :categories, presence: true
-
+
          serialize :categories, coder: JSON
-
+
          scope :by_category, ->(category) { where('categories LIKE ?', "%#{category}%") }
          scope :updated_since, ->(time) { where('updated_at > ?', time) }
        end
 
        class Domain < ActiveRecord::Base
          self.table_name = 'url_categorise_domains'
-
+
          validates :domain, presence: true, uniqueness: true
          validates :categories, presence: true
-
+
          serialize :categories, coder: JSON
-
+
          scope :by_category, ->(category) { where('categories LIKE ?', "%#{category}%") }
          scope :search, ->(term) { where('domain LIKE ?', "%#{term}%") }
-
+
          def self.categorise(domain_name)
            record = find_by(domain: domain_name.downcase.gsub('www.', ''))
            record ? record.categories : []
@@ -48,21 +48,45 @@ else
 
        class IpAddress < ActiveRecord::Base
          self.table_name = 'url_categorise_ip_addresses'
-
+
          validates :ip_address, presence: true, uniqueness: true
          validates :categories, presence: true
-
+
          serialize :categories, coder: JSON
-
+
          scope :by_category, ->(category) { where('categories LIKE ?', "%#{category}%") }
          scope :in_subnet, ->(subnet) { where('ip_address LIKE ?', "#{subnet}%") }
-
+
          def self.categorise(ip)
            record = find_by(ip_address: ip)
            record ? record.categories : []
          end
        end
 
+       class DatasetMetadata < ActiveRecord::Base
+         self.table_name = 'url_categorise_dataset_metadata'
+
+         validates :source_type, presence: true, inclusion: { in: %w[kaggle csv] }
+         validates :identifier, presence: true
+         validates :data_hash, presence: true, uniqueness: true
+         validates :total_entries, presence: true, numericality: { greater_than: 0 }
+
+         serialize :category_mappings, coder: JSON
+         serialize :processing_options, coder: JSON
+
+         scope :by_source, ->(source) { where(source_type: source) }
+         scope :by_identifier, ->(identifier) { where(identifier: identifier) }
+         scope :processed_since, ->(time) { where('processed_at > ?', time) }
+
+         def kaggle_dataset?
+           source_type == 'kaggle'
+         end
+
+         def csv_dataset?
+           source_type == 'csv'
+         end
+       end
+
        # Generator for Rails integration
        def self.generate_migration
          <<~MIGRATION
@@ -84,7 +108,7 @@ else
                  t.text :categories, null: false
                  t.timestamps
                end
-
+               #{' '}
                add_index :url_categorise_domains, :domain
                add_index :url_categorise_domains, :categories
 
@@ -93,13 +117,28 @@ else
                  t.text :categories, null: false
                  t.timestamps
                end
-
+               #{' '}
                add_index :url_categorise_ip_addresses, :ip_address
                add_index :url_categorise_ip_addresses, :categories
+
+               create_table :url_categorise_dataset_metadata do |t|
+                 t.string :source_type, null: false, index: true
+                 t.string :identifier, null: false
+                 t.string :data_hash, null: false, index: { unique: true }
+                 t.integer :total_entries, null: false
+                 t.text :category_mappings
+                 t.text :processing_options
+                 t.datetime :processed_at
+                 t.timestamps
+               end
+               #{' '}
+               add_index :url_categorise_dataset_metadata, :source_type
+               add_index :url_categorise_dataset_metadata, :identifier
+               add_index :url_categorise_dataset_metadata, :processed_at
              end
            end
          MIGRATION
        end
      end
    end
- end
+ end
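
Alongside the DatasetProcessor, this release adds a DatasetMetadata ActiveRecord model and a url_categorise_dataset_metadata table to the generated migration, so processed datasets can be tracked by source, identifier, and content hash. A rough sketch of querying it follows, assuming the generated migration has been run in a Rails app and that the model is reachable under UrlCategorise::Models (the enclosing module is not visible in this hunk):

    # List Kaggle-sourced datasets processed in the last week (illustrative only).
    recent = UrlCategorise::Models::DatasetMetadata
               .by_source('kaggle')
               .processed_since(1.week.ago)

    recent.each do |meta|
      puts "#{meta.identifier}: #{meta.total_entries} entries (hash #{meta.data_hash[0, 12]})"
    end
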
@@ -1,3 +1,3 @@
  module UrlCategorise
-   VERSION = "0.1.2"
+   VERSION = '0.1.3'
  end
@@ -8,6 +8,7 @@ require 'api-pattern'
 
  require 'url_categorise/version'
  require 'url_categorise/constants'
+ require 'url_categorise/dataset_processor'
 
  require 'url_categorise/client'