kaggle 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.claude/settings.local.json +13 -0
- data/.ruby-version +1 -0
- data/CLAUDE.md +154 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/LICENSE +21 -0
- data/README.md +210 -0
- data/Rakefile +10 -0
- data/kaggle.gemspec +46 -0
- data/lib/kaggle/client.rb +255 -0
- data/lib/kaggle/constants.rb +23 -0
- data/lib/kaggle/version.rb +3 -0
- data/lib/kaggle.rb +19 -0
- data/plans/benchmarks.md +179 -0
- data/plans/cli_tool.md +35 -0
- data/plans/initial_prompt.md +11 -0
- data/plans/lists.md +77 -0
- data/plans/models.md +147 -0
- data/plans/roadmap.md +192 -0
- metadata +259 -0
data/lib/kaggle/client.rb
ADDED
@@ -0,0 +1,255 @@

```ruby
module Kaggle
  class Client
    include HTTParty

    base_uri Constants::BASE_URL

    attr_reader :username, :api_key, :download_path, :cache_path, :timeout

    def initialize(username: nil, api_key: nil, credentials_file: nil, download_path: nil, cache_path: nil, timeout: nil)
      load_credentials(username, api_key, credentials_file)
      @download_path = download_path || Constants::DEFAULT_DOWNLOAD_PATH
      @cache_path = cache_path || Constants::DEFAULT_CACHE_PATH
      @timeout = timeout || Constants::DEFAULT_TIMEOUT

      raise AuthenticationError, 'Username and API key are required' unless valid_credential?(@username) && valid_credential?(@api_key)

      ensure_directories_exist
      setup_httparty_options
    end

    def download_dataset(dataset_owner, dataset_name, options = {})
      dataset_path = "#{dataset_owner}/#{dataset_name}"

      # Check cache first for parsed data
      if options[:use_cache] && options[:parse_csv]
        cache_key = generate_cache_key(dataset_path)
        if cached_file_exists?(cache_key)
          return load_from_cache(cache_key)
        end
      end

      # Check if we already have extracted files for this dataset
      extracted_dir = get_extracted_dir(dataset_path)
      if options[:use_cache] && Dir.exist?(extracted_dir) && !Dir.empty?(extracted_dir)
        return handle_existing_dataset(extracted_dir, options)
      end

      # Download the zip file
      response = authenticated_request(:get, "#{Constants::DATASET_ENDPOINTS[:download]}/#{dataset_path}")

      unless response.success?
        raise DownloadError, "Failed to download dataset: #{response.message}"
      end

      # Save zip file
      zip_file = save_zip_file(dataset_path, response.body)

      # Extract zip file
      extract_zip_file(zip_file, extracted_dir)

      # Clean up zip file
      File.delete(zip_file) if File.exist?(zip_file)

      # Handle the extracted files
      result = handle_extracted_dataset(extracted_dir, options)

      # Cache parsed CSV data if requested
      if options[:use_cache] && options[:parse_csv] && (result.is_a?(Hash) || result.is_a?(Array))
        cache_key = generate_cache_key(dataset_path)
        cache_parsed_data(cache_key, result)
      end

      result
    end

    def dataset_files(dataset_owner, dataset_name)
      dataset_path = "#{dataset_owner}/#{dataset_name}"
      response = authenticated_request(:get, "#{Constants::DATASET_ENDPOINTS[:files]}/#{dataset_path}")

      unless response.success?
        raise DatasetNotFoundError, "Dataset not found or not accessible: #{dataset_path}"
      end

      Oj.load(response.body)
    rescue Oj::ParseError => e
      raise ParseError, "Failed to parse dataset files response: #{e.message}"
    end

    def parse_csv_to_json(file_path)
      raise Error, "File does not exist: #{file_path}" unless File.exist?(file_path)
      raise Error, "File is not a CSV: #{file_path}" unless csv_file?(file_path)

      data = []
      CSV.foreach(file_path, headers: true) do |row|
        data << row.to_hash
      end

      data
    rescue CSV::MalformedCSVError => e
      raise ParseError, "Failed to parse CSV file: #{e.message}"
    end

    private

    def valid_credential?(credential)
      credential && !credential.to_s.strip.empty?
    end

    def load_credentials(username, api_key, credentials_file)
      # Try provided credentials file first
      if credentials_file && File.exist?(credentials_file)
        credentials = load_credentials_from_file(credentials_file)
        @username = username || credentials['username']
        @api_key = api_key || credentials['key']
      # Try default kaggle.json file if no explicit credentials
      elsif !username && !api_key && File.exist?(Constants::DEFAULT_CREDENTIALS_FILE)
        credentials = load_credentials_from_file(Constants::DEFAULT_CREDENTIALS_FILE)
        @username = credentials['username']
        @api_key = credentials['key']
      else
        # Fall back to environment variables
        @username = username || ENV['KAGGLE_USERNAME']
        @api_key = api_key || ENV['KAGGLE_KEY']
      end
    end

    def load_credentials_from_file(file_path)
      content = File.read(file_path)
      Oj.load(content)
    rescue Oj::ParseError => e
      raise AuthenticationError, "Invalid credentials file format: #{e.message}"
    rescue => e
      raise AuthenticationError, "Failed to read credentials file: #{e.message}"
    end

    def ensure_directories_exist
      FileUtils.mkdir_p(@download_path) unless Dir.exist?(@download_path)
      FileUtils.mkdir_p(@cache_path) unless Dir.exist?(@cache_path)
    end

    def setup_httparty_options
      self.class.default_options.merge!({
        headers: Constants::REQUIRED_HEADERS,
        timeout: @timeout,
        basic_auth: {
          username: @username,
          password: @api_key
        }
      })
    end

    def authenticated_request(method, endpoint, options = {})
      self.class.send(method, endpoint, options)
    rescue Timeout::Error, Net::ReadTimeout, Net::OpenTimeout
      raise Error, 'Request timed out'
    rescue => e
      raise Error, "Request failed: #{e.message}"
    end

    def get_extracted_dir(dataset_path)
      dir_name = dataset_path.gsub('/', '_')
      File.join(@download_path, dir_name)
    end

    def save_zip_file(dataset_path, content)
      filename = "#{dataset_path.gsub('/', '_')}.zip"
      file_path = File.join(@download_path, filename)

      File.open(file_path, 'wb') do |file|
        file.write(content)
      end

      file_path
    end

    def extract_zip_file(zip_file_path, extract_to_dir)
      FileUtils.mkdir_p(extract_to_dir)

      Zip::File.open(zip_file_path) do |zip_file|
        zip_file.each do |entry|
          extract_path = File.join(extract_to_dir, entry.name)

          if entry.directory?
            # Create directory
            FileUtils.mkdir_p(extract_path)
          else
            # Create parent directory if it doesn't exist
            parent_dir = File.dirname(extract_path)
            FileUtils.mkdir_p(parent_dir) unless Dir.exist?(parent_dir)

            # Extract file manually to avoid path issues
            File.open(extract_path, 'wb') do |f|
              f.write entry.get_input_stream.read
            end
          end
        end
      end
    rescue Zip::Error => e
      raise DownloadError, "Failed to extract zip file: #{e.message}"
    end

    def handle_existing_dataset(extracted_dir, options)
      if options[:parse_csv]
        csv_files = find_csv_files(extracted_dir)
        return parse_csv_files_to_json(csv_files) unless csv_files.empty?
      end

      extracted_dir
    end

    def handle_extracted_dataset(extracted_dir, options)
      if options[:parse_csv]
        csv_files = find_csv_files(extracted_dir)
        unless csv_files.empty?
          parsed_data = parse_csv_files_to_json(csv_files)
          return parsed_data
        end
      end

      extracted_dir
    end

    def find_csv_files(directory)
      Dir.glob(File.join(directory, '**', '*.csv'))
    end

    def parse_csv_files_to_json(csv_files)
      result = {}

      csv_files.each do |csv_file|
        file_name = File.basename(csv_file, '.csv')
        result[file_name] = parse_csv_to_json(csv_file)
      end

      # If there's only one CSV file, return its data directly
      result.length == 1 ? result.values.first : result
    end

    def generate_cache_key(dataset_path)
      "#{dataset_path.gsub('/', '_')}_parsed.json"
    end

    def cached_file_exists?(cache_key)
      File.exist?(File.join(@cache_path, cache_key))
    end

    def load_from_cache(cache_key)
      cache_file_path = File.join(@cache_path, cache_key)
      Oj.load(File.read(cache_file_path))
    rescue Oj::ParseError => e
      raise ParseError, "Failed to parse cached data: #{e.message}"
    end

    def cache_parsed_data(cache_key, data)
      cache_file_path = File.join(@cache_path, cache_key)
      File.write(cache_file_path, Oj.dump(data, mode: :compat, indent: 2))
    end

    def csv_file?(file_path)
      File.extname(file_path).downcase == '.csv'
    end
  end
end
```
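For orientation, here is a minimal usage sketch of the client above. The `owner/dataset-name` slug is a placeholder; credentials resolve exactly as `load_credentials` implements (explicit arguments, then `./kaggle.json`, then environment variables):

```ruby
require 'kaggle'

# Credentials come from keyword arguments, ./kaggle.json, or the
# KAGGLE_USERNAME / KAGGLE_KEY environment variables.
client = Kaggle::Client.new(
  download_path: './downloads',
  cache_path: './cache'
)

# With parse_csv: true this returns parsed rows (an Array of Hashes for a
# single-CSV dataset, or a Hash keyed by file name for several CSVs) and,
# with use_cache: true, writes the parsed JSON to the cache path.
data = client.download_dataset('owner', 'dataset-name',
                               parse_csv: true, use_cache: true)

# Lists a dataset's files via the /datasets/data endpoint.
files = client.dataset_files('owner', 'dataset-name')
```
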
data/lib/kaggle/constants.rb
ADDED
@@ -0,0 +1,23 @@

```ruby
module Kaggle
  module Constants
    BASE_URL = 'https://www.kaggle.com/api/v1'

    DEFAULT_DOWNLOAD_PATH = './downloads'
    DEFAULT_CACHE_PATH = './cache'
    DEFAULT_CREDENTIALS_FILE = './kaggle.json'
    DEFAULT_TIMEOUT = 30

    SUPPORTED_FORMATS = %w[csv json].freeze

    DATASET_ENDPOINTS = {
      view: '/datasets/view',
      download: '/datasets/download',
      files: '/datasets/data'
    }.freeze

    REQUIRED_HEADERS = {
      'User-Agent' => 'Kaggle Ruby Client/0.0.1',
      'Accept' => 'application/json'
    }.freeze
  end
end
```
data/lib/kaggle.rb
ADDED
@@ -0,0 +1,19 @@

```ruby
require 'httparty'
require 'csv'
require 'oj'
require 'fileutils'
require 'net/http'
require 'timeout'
require 'zip'

require_relative 'kaggle/version'
require_relative 'kaggle/constants'
require_relative 'kaggle/client'

module Kaggle
  class Error < StandardError; end
  class AuthenticationError < Error; end
  class DatasetNotFoundError < Error; end
  class DownloadError < Error; end
  class ParseError < Error; end
end
```
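Because every gem-specific error subclasses `Kaggle::Error`, callers can rescue as narrowly or broadly as they like. A brief sketch (the dataset slug is a placeholder):

```ruby
begin
  client.download_dataset('owner', 'dataset-name', parse_csv: true)
rescue Kaggle::AuthenticationError => e
  warn "Check credentials: #{e.message}"
rescue Kaggle::DownloadError, Kaggle::ParseError => e
  warn "Download or parse failed: #{e.message}"
rescue Kaggle::Error => e
  # Catch-all for any other gem-specific failure
  warn "Kaggle client error: #{e.message}"
end
```
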
data/plans/benchmarks.md
ADDED
@@ -0,0 +1,179 @@

# Benchmarks and Performance Plan

## Overview
Implement benchmarking capabilities to measure and compare model performance, dataset processing speeds, and API response times.

## Current State
- No benchmarking functionality exists
- Basic error handling and performance considerations in place
- Opportunity to build a comprehensive benchmarking suite

## Planned Features

### Phase 1: Dataset Benchmarks
- [ ] **Download Speed Metrics**: Measure dataset download speeds
- [ ] **Parsing Performance**: Benchmark CSV to JSON conversion speeds
- [ ] **Cache Performance**: Measure cache hit/miss ratios and speeds
- [ ] **Size vs Speed Analysis**: Correlate dataset size with processing time
- [ ] **Format Comparison**: Compare performance across different file formats

### Phase 2: Model Benchmarks
- [ ] **Model Download Times**: Track model download performance
- [ ] **Loading Benchmarks**: Measure model loading and initialization times
- [ ] **Inference Speed**: Benchmark model prediction performance
- [ ] **Memory Usage**: Monitor memory consumption during operations
- [ ] **Framework Comparison**: Compare performance across ML frameworks

### Phase 3: API Performance
- [ ] **Response Time Tracking**: Monitor API endpoint response times
- [ ] **Rate Limit Analysis**: Track API rate limiting and optimal usage patterns
- [ ] **Concurrent Request Performance**: Benchmark parallel API calls
- [ ] **Error Rate Monitoring**: Track API error rates over time
- [ ] **Geolocation Performance**: Compare performance from different regions

### Phase 4: System Benchmarks
- [ ] **Network Performance**: Measure the impact of network conditions
- [ ] **Disk I/O Performance**: Benchmark local file operations
- [ ] **CPU/Memory Usage**: Profile resource consumption
- [ ] **Platform Comparison**: Compare performance across operating systems
- [ ] **Ruby Version Impact**: Benchmark across different Ruby versions

## Technical Implementation

### Benchmarking Framework
```ruby
# lib/kaggle/benchmark.rb
require 'benchmark' # stdlib Benchmark module

module Kaggle
  class Benchmark
    attr_reader :results, :config

    def initialize(config = {})
      @config = default_config.merge(config)
      @results = []
    end

    def run_dataset_benchmark(dataset_path, iterations: 5)
      # Benchmark dataset operations
    end

    def run_api_benchmark(endpoint, iterations: 10)
      # Benchmark API endpoint performance
    end

    def generate_report
      # Generate performance report
    end
  end
end

# lib/kaggle/performance_monitor.rb
class Kaggle::PerformanceMonitor
  def self.monitor(operation_name, &block)
    # Monitor and log performance metrics
  end

  def self.track_memory_usage(&block)
    # Track memory usage during operations
  end

  def self.profile_cpu_usage(&block)
    # Profile CPU usage patterns
  end
end
```

### Metrics Collection
```ruby
# Performance metrics structure
{
  operation: 'dataset_download',
  timestamp: Time.now,
  duration_ms: 1234,
  memory_usage_mb: 45.6,
  cpu_usage_percent: 23.4,
  network_bytes: 1024000,
  cache_hit: true,
  error: nil,
  metadata: {
    dataset_size_mb: 100,
    file_count: 5,
    format: 'csv'
  }
}
```

### CLI Integration
```bash
# Run benchmarks
kaggle benchmark datasets --iterations 10
kaggle benchmark api --endpoint datasets/list
kaggle benchmark models --model-id example/model

# View benchmark results
kaggle benchmark report
kaggle benchmark compare --baseline v0.1.0
kaggle benchmark export --format json

# Performance profiling
kaggle profile download dataset-owner/dataset-name
kaggle profile parse large-dataset.csv
```

### Reporting and Visualization
- [ ] **HTML Reports**: Generate detailed HTML performance reports
- [ ] **CSV Export**: Export raw metrics for external analysis
- [ ] **Comparison Reports**: Compare performance across versions/configurations
- [ ] **Trend Analysis**: Track performance changes over time
- [ ] **Regression Detection**: Alert on performance degradation

### Integration with Testing
```ruby
# test/performance/benchmark_test.rb
class BenchmarkTest < Minitest::Test
  def test_dataset_download_performance
    benchmark = Kaggle::Benchmark.new
    result = benchmark.run_dataset_benchmark('test/dataset')

    # Assert performance meets requirements
    assert result.average_duration < 5000, "Download too slow"
    assert result.memory_usage < 100, "Memory usage too high"
  end
end
```

## Performance Targets

### Dataset Operations
- CSV parsing: at least 1 MB/second for typical datasets
- Download speed: Limited by network, not processing
- Cache retrieval: < 100ms for typical datasets
- Memory usage: < 2x dataset size during processing

### API Operations
- List requests: < 2 seconds response time
- Download initiation: < 5 seconds
- Metadata retrieval: < 1 second
- Error recovery: < 30 seconds for retries

### Model Operations
- Model listing: < 3 seconds response time
- Model download: Progress tracking every 5% completion
- Model loading: Framework-dependent, track baseline
- Inference: Model-specific, establish benchmarks

## Continuous Integration
- [ ] **Automated Benchmarks**: Run benchmarks in CI pipeline
- [ ] **Performance Regression Tests**: Fail CI on significant slowdowns
- [ ] **Baseline Tracking**: Maintain performance baselines across versions
- [ ] **Alert System**: Notify maintainers of performance issues

## Priority: Low
Target completion: Version 0.5.0

## Notes
- Benchmarks should be optional and not affect normal gem usage
- Consider integration with Ruby profiling tools (ruby-prof, memory_profiler)
- Benchmarks may reveal optimization opportunities in current code
- Results should be comparable across different environments
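As a sketch of what the planned `PerformanceMonitor.monitor` hook could look like using only the standard library: this is an illustration of the plan, not part of the released gem, and the metric keys mirror the structure described above.

```ruby
require 'benchmark'

module Kaggle
  class PerformanceMonitor
    # Illustrative sketch: times a block and returns a metrics Hash
    # shaped like the structure in the plan above.
    def self.monitor(operation_name)
      allocated_before = GC.stat(:total_allocated_objects)
      error = nil
      result = nil

      elapsed = ::Benchmark.realtime do
        begin
          result = yield
        rescue => e
          error = e.message
        end
      end

      {
        operation: operation_name,
        timestamp: Time.now,
        duration_ms: (elapsed * 1000).round,
        # Object allocations as a rough memory-pressure proxy
        allocated_objects: GC.stat(:total_allocated_objects) - allocated_before,
        error: error,
        result: result
      }
    end
  end
end
```
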
data/plans/cli_tool.md
ADDED
@@ -0,0 +1,35 @@

# CLI Tool Enhancement Plan

## Current State
The gem includes a basic CLI tool (`bin/kaggle`) with essential functionality for listing, downloading, and viewing dataset files.

## Planned Enhancements

### Phase 1: Core CLI Improvements
- [ ] **Interactive Mode**: Add interactive prompts for common operations
- [ ] **Progress Indicators**: Show download progress for large datasets
- [ ] **Better Output Formatting**: Improve table formatting for dataset lists
- [ ] **Configuration File Support**: Allow CLI configuration via YAML/JSON config files
- [ ] **Verbose/Quiet Modes**: Add -v and -q flags for different output levels

### Phase 2: Advanced Features
- [ ] **Bulk Operations**: Support downloading multiple datasets with patterns
- [ ] **Search Filters**: Advanced filtering options (date, size, format, etc.)
- [ ] **Export Formats**: Support exporting dataset lists to CSV/JSON
- [ ] **Parallel Downloads**: Download multiple datasets concurrently
- [ ] **Resume Downloads**: Resume interrupted downloads

### Phase 3: Competition Support
- [ ] **Competition Listing**: List available competitions
- [ ] **Competition Data**: Download competition datasets
- [ ] **Submission Management**: Submit competition entries via CLI
- [ ] **Leaderboard View**: View competition leaderboards

### Implementation Notes
- Use Thor or TTY toolkit for enhanced CLI functionality
- Add comprehensive help system with examples
- Include bash/zsh completion scripts
- Implement proper signal handling for graceful interruption

## Priority: Medium
Target completion: Version 0.2.0
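To make the Thor suggestion concrete, here is a minimal sketch of what a Thor-based `download` command might look like. Command names and flags are illustrative; the released `bin/kaggle` may differ.

```ruby
require 'thor'
require 'kaggle'

# Illustrative sketch only, not the shipped bin/kaggle.
class KaggleCLI < Thor
  desc 'download OWNER/DATASET', 'Download and optionally parse a dataset'
  option :parse_csv, type: :boolean, default: false
  option :use_cache, type: :boolean, default: true

  def download(slug)
    owner, name = slug.split('/', 2)
    client = Kaggle::Client.new
    result = client.download_dataset(owner, name,
                                     parse_csv: options[:parse_csv],
                                     use_cache: options[:use_cache])
    # A String result is the extraction directory; otherwise it is parsed data.
    puts result.is_a?(String) ? "Extracted to #{result}" : Oj.dump(result, mode: :compat)
  end
end

KaggleCLI.start(ARGV)
```
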
data/plans/initial_prompt.md
ADDED
@@ -0,0 +1,11 @@

Please help implement a new Ruby gem here. Use `/Users/trex22/development/url_categorise` as an example. The Version should be 0.0.1. Here is the API documentation: @https://www.kaggle.com/docs/api . Also here is another context resource: @https://github.com/Kaggle/kaggle-api. Create a new plan in plans/. For now the gem should only handle downloading and parsing datasets. For now only open CSV datasets into a json structure. Allow for optional parameters to specify download paths and caching paths as this gem will be used elsewhere where we want a cache download location so that the dataset does not always have to be downloaded. Create and update a README.md with relevant info and use `/Users/trex22/development/url_categorise` as an example. Also add in tests using `/Users/trex22/development/url_categorise` as an example with relevant bin/ scripts. Add in a CLAUDE.md as well. Lastly, update the plans with new plans for future development which include, CLI tool, lists, models, benchmarks etc ...

List and plan out all actions before actioning.

Use the oj gem version 3.16.11 instead of the ruby json library. Also please increase test coverage and fix the failing tests. Also copy codes of conduct from /Users/trex22/development/url_categorise/.

Oj has a slightly different interface: https://github.com/ohler55/oj
data/plans/lists.md
ADDED
@@ -0,0 +1,77 @@

# Lists Enhancement Plan

## Overview
Expand the current listing functionality to provide comprehensive discovery and filtering capabilities for Kaggle resources.

## Current State
- Basic dataset listing with search and pagination
- Simple dataset file listing

## Planned Enhancements

### Phase 1: Enhanced Dataset Lists
- [ ] **Advanced Filtering**: Filter by license, file formats, size, update date
- [ ] **Sorting Options**: Sort by popularity, date, size, downloads
- [ ] **Category Browsing**: Browse datasets by category/topic
- [ ] **User/Organization Datasets**: List datasets by specific users or organizations
- [ ] **Featured Datasets**: Highlight trending or featured datasets

### Phase 2: Competition Lists
- [ ] **Competition Discovery**: List active, completed, and upcoming competitions
- [ ] **Competition Filtering**: Filter by category, prize pool, participant count
- [ ] **Competition Search**: Search competitions by title, description, tags
- [ ] **Personal Competitions**: List competitions the user has participated in
- [ ] **Competition Metrics**: Show participation stats, deadlines, prizes

### Phase 3: Model Lists
- [ ] **Model Discovery**: List available models and frameworks
- [ ] **Model Filtering**: Filter by framework, task type, performance metrics
- [ ] **Model Versions**: Track different versions of models
- [ ] **Popular Models**: Highlight trending and highly-rated models
- [ ] **User Models**: List models by specific users

### Phase 4: Kernel/Notebook Lists
- [ ] **Code Discovery**: List public kernels and notebooks
- [ ] **Language Filtering**: Filter by programming language (R, Python, etc.)
- [ ] **Topic Browsing**: Browse by dataset or competition
- [ ] **Popular Code**: Highlight most-voted and most-forked notebooks
- [ ] **Recent Activity**: Show recently updated kernels

## Technical Implementation

### API Endpoints
- Implement consistent pagination across all list types
- Add caching layer for frequently accessed lists
- Support bulk operations for multiple list requests

### CLI Enhancements
- Interactive filtering and sorting in CLI
- Export capabilities (CSV, JSON, XML)
- Bookmarking and favorites functionality
- Watchlist for monitoring specific items

### Data Structures
```ruby
# Enhanced listing response format
{
  items: [],      # List of resources
  pagination: {   # Pagination metadata
    page: 1,
    per_page: 20,
    total_pages: 50,
    total_count: 1000
  },
  filters: {      # Applied filters
    category: 'finance',
    license: 'cc-by',
    updated_since: '2023-01-01'
  },
  sort: {         # Current sorting
    field: 'popularity',
    direction: 'desc'
  }
}
```

## Priority: High
Target completion: Version 0.3.0
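A possible shape for consuming the planned paginated responses, assuming a hypothetical `list_datasets` client method (not yet in the gem) returning the format sketched above:

```ruby
# Hypothetical sketch: walk every page of a filtered dataset list.
page = 1
loop do
  response = client.list_datasets(search: 'finance', page: page,
                                  sort: { field: 'popularity', direction: 'desc' })
  response[:items].each { |dataset| puts dataset['title'] }

  break if page >= response[:pagination][:total_pages]
  page += 1
end
```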