easy_ml 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +270 -0
  3. data/Rakefile +12 -0
  4. data/app/models/easy_ml/model.rb +59 -0
  5. data/app/models/easy_ml/models/xgboost.rb +9 -0
  6. data/app/models/easy_ml/models.rb +5 -0
  7. data/lib/easy_ml/core/model.rb +29 -0
  8. data/lib/easy_ml/core/model_core.rb +181 -0
  9. data/lib/easy_ml/core/model_evaluator.rb +137 -0
  10. data/lib/easy_ml/core/models/hyperparameters/base.rb +34 -0
  11. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +19 -0
  12. data/lib/easy_ml/core/models/hyperparameters.rb +8 -0
  13. data/lib/easy_ml/core/models/xgboost.rb +10 -0
  14. data/lib/easy_ml/core/models/xgboost_core.rb +220 -0
  15. data/lib/easy_ml/core/models.rb +10 -0
  16. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +63 -0
  17. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +50 -0
  18. data/lib/easy_ml/core/tuner/adapters.rb +10 -0
  19. data/lib/easy_ml/core/tuner.rb +105 -0
  20. data/lib/easy_ml/core/uploaders/model_uploader.rb +24 -0
  21. data/lib/easy_ml/core/uploaders.rb +7 -0
  22. data/lib/easy_ml/core.rb +9 -0
  23. data/lib/easy_ml/core_ext/pathname.rb +9 -0
  24. data/lib/easy_ml/core_ext.rb +5 -0
  25. data/lib/easy_ml/data/dataloader.rb +6 -0
  26. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +31 -0
  27. data/lib/easy_ml/data/dataset/data/sample_info.json +1 -0
  28. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +1 -0
  29. data/lib/easy_ml/data/dataset/splits/file_split.rb +140 -0
  30. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +49 -0
  31. data/lib/easy_ml/data/dataset/splits/split.rb +98 -0
  32. data/lib/easy_ml/data/dataset/splits.rb +11 -0
  33. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +43 -0
  34. data/lib/easy_ml/data/dataset/splitters.rb +9 -0
  35. data/lib/easy_ml/data/dataset.rb +430 -0
  36. data/lib/easy_ml/data/datasource/datasource_factory.rb +60 -0
  37. data/lib/easy_ml/data/datasource/file_datasource.rb +40 -0
  38. data/lib/easy_ml/data/datasource/merged_datasource.rb +64 -0
  39. data/lib/easy_ml/data/datasource/polars_datasource.rb +41 -0
  40. data/lib/easy_ml/data/datasource/s3_datasource.rb +89 -0
  41. data/lib/easy_ml/data/datasource.rb +33 -0
  42. data/lib/easy_ml/data/preprocessor/preprocessor.rb +205 -0
  43. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +403 -0
  44. data/lib/easy_ml/data/preprocessor/utils.rb +17 -0
  45. data/lib/easy_ml/data/preprocessor.rb +238 -0
  46. data/lib/easy_ml/data/utils.rb +50 -0
  47. data/lib/easy_ml/data.rb +8 -0
  48. data/lib/easy_ml/deployment.rb +5 -0
  49. data/lib/easy_ml/engine.rb +26 -0
  50. data/lib/easy_ml/initializers/inflections.rb +4 -0
  51. data/lib/easy_ml/logging.rb +38 -0
  52. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +42 -0
  53. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +23 -0
  54. data/lib/easy_ml/support/age.rb +27 -0
  55. data/lib/easy_ml/support/est.rb +1 -0
  56. data/lib/easy_ml/support/file_rotate.rb +23 -0
  57. data/lib/easy_ml/support/git_ignorable.rb +66 -0
  58. data/lib/easy_ml/support/synced_directory.rb +134 -0
  59. data/lib/easy_ml/support/utc.rb +1 -0
  60. data/lib/easy_ml/support.rb +10 -0
  61. data/lib/easy_ml/trainer.rb +92 -0
  62. data/lib/easy_ml/transforms.rb +29 -0
  63. data/lib/easy_ml/version.rb +5 -0
  64. data/lib/easy_ml.rb +23 -0
  65. metadata +353 -0
@@ -0,0 +1,8 @@
# Namespace for EasyML's data layer. Loading this file requires, in order,
# the data utilities, the preprocessor, the dataset and the datasource files.
module EasyML
  module Data
    %w[utils preprocessor dataset datasource].each do |component|
      require_relative "data/#{component}"
    end
  end
end
@@ -0,0 +1,5 @@
# Namespace for deployment helpers; loading it requires the model uploader.
# NOTE(review): the released package does not appear to ship a
# deployment/model_uploader.rb file — confirm this path resolves before
# requiring this file.
module EasyML
  module Deployment
    require_relative "deployment/model_uploader"
  end
end
@@ -0,0 +1,26 @@
require "rails/engine"

module EasyML
  # Rails engine for EasyML. It registers the inflection initializer, adds a
  # generator template path, eagerly requires the bundled generators and,
  # once the application has initialized, requires the engine's model files.
  class Engine < Rails::Engine
    isolate_namespace EasyML

    initializer "easy_ml.inflections" do
      require_relative "initializers/inflections"
    end

    initializer "easy_ml.setup_generators" do |app|
      app.config.generators do |generators|
        # NOTE(review): relative to this file's directory this resolves to a
        # sibling "templates" directory of easy_ml/ — confirm that is the
        # intended location.
        template_root = File.expand_path("../templates", __dir__)
        generators.templates.unshift(template_root)
      end
    end

    # Generators are required while the class body is being evaluated.
    generator_glob = File.join(File.expand_path("railtie/generators", __dir__), "**", "*.rb")
    Dir.glob(generator_glob).each do |generator_file|
      require generator_file
    end

    config.after_initialize do
      %w[model models].each do |model_file|
        require_relative "../../app/models/easy_ml/#{model_file}"
      end
    end
  end
end
@@ -0,0 +1,4 @@
# Registers "EasyML" and "ML" as acronyms for the :en inflector.
ActiveSupport::Inflector.inflections(:en) do |inflect|
  %w[EasyML ML].each { |acronym| inflect.acronym(acronym) }
end
@@ -0,0 +1,38 @@
module EasyML
  # Mixin providing small stdout logging helpers, plus a class-level
  # +log_method+ macro that wraps an existing instance method so a message is
  # logged every time the method is called.
  module Logging
    def self.included(base)
      base.extend(ClassMethods)
    end

    module ClassMethods
      # Wraps +method_name+ so that +message+ is logged before each call.
      #
      # Must be called after the method has been defined. Positional
      # arguments, keyword arguments and the block are all forwarded to the
      # original implementation, and its return value is returned unchanged.
      #
      # @param method_name [Symbol] instance method to wrap
      # @param message [String] text to log before the call
      # @param verbose [Boolean] when true the message is only printed if the
      #   receiver's @verbose is truthy
      def log_method(method_name, message, verbose: false)
        original_method = instance_method(method_name)

        # Keywords are captured separately: on Ruby 3 a bare *args would turn
        # them into a positional hash and break keyword-taking methods.
        define_method(method_name) do |*args, **kwargs, &block|
          log_message(message, verbose: verbose)
          original_method.bind(self).call(*args, **kwargs, &block)
        end
      end
    end

    # Prints +message+; with verbose: true it is printed only when the
    # receiver's @verbose is truthy.
    def log_message(message, verbose: false)
      return log_verbose(message) if verbose

      puts message
    end

    # Prints +message+ only when @verbose is truthy.
    def log_verbose(message)
      puts message if @verbose
    end

    # Prints +message+ as a yellow "WARNING:" line (ANSI colour codes).
    def log_warning(message)
      puts "\e[33mWARNING: #{message}\e[0m"
    end

    # Prints +message+ as a blue "INFO:" line (ANSI colour codes).
    def log_info(message)
      puts "\e[34mINFO: #{message}\e[0m"
    end
  end
end
@@ -0,0 +1,42 @@
# lib/easy_ml/railtie/generators/migration/migration_generator.rb
require "rails/generators"
require "rails/generators/active_record/migration"

module EasyML
  module Generators
    module Migration
      # Rails generator, registered under the "easy_ml:migration" namespace,
      # that renders the create_easy_ml_models migration template into
      # db/migrate.
      class MigrationGenerator < Rails::Generators::Base
        include Rails::Generators::Migration
        namespace "easy_ml:migration"

        # Templates are looked up two directories above this file, under
        # templates/migration.
        source_root File.expand_path("../../templates/migration", __dir__)

        desc "Generates a migration for EasyMLModel with version and file for remote storage"

        # Base name (without number or extension) of the generated migration.
        def self.migration_name
          "create_easy_ml_models"
        end

        # Returns the number used to prefix the migration file name: a UTC
        # timestamp on ActiveRecord < 7 or whenever timestamped migrations
        # are enabled, otherwise the next zero-padded sequence number.
        def self.next_migration_number(dirname)
          # `||` keeps ActiveRecord.timestamped_migrations from being called
          # on ActiveRecord versions below 7.
          timestamped = ActiveRecord.version < Gem::Version.new("7") || ActiveRecord.timestamped_migrations
          return Time.now.utc.strftime("%Y%m%d%H%M%S") if timestamped

          format("%.3d", (current_migration_number(dirname) + 1))
        end

        # Renders the template to db/migrate/<migration_name>.rb.
        def create_migration_file
          destination = "db/migrate/#{self.class.migration_name}.rb"
          migration_template "create_easy_ml_models.rb.tt", destination
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
# Migration template that creates the easy_ml_models table.
class CreateEasyMLModels < ActiveRecord::Migration[6.0]
  def change
    create_table :easy_ml_models do |t|
      t.string :name, null: false
      t.boolean :is_live, default: false
      t.string :version, null: false
      t.string :ml_model
      t.string :task
      t.json :metrics, default: []
      t.json :file, null: false

      t.timestamps

      # Single-column indexes.
      %i[created_at name version is_live].each { |column| t.index column }

      # Composite indexes; a (name, version) pair must be unique.
      t.index %i[name version], unique: true
      t.index %i[name version is_live]
    end
  end
end
@@ -0,0 +1,27 @@
require "active_support/duration"

module EasyML
  module Support
    # Helper for describing the time elapsed between two instants.
    module Age
      # Describes the time elapsed between +start_time+ and +end_time+.
      #
      # The difference (end_time - start_time) is truncated to whole seconds
      # and decomposed with ActiveSupport::Duration.build.
      #
      # @param start_time [#-] start instant
      # @param end_time [#-] end instant
      # @param format [String, Symbol] one of:
      #   "human"   - the duration's +inspect+ string
      #   "days", "hours", "minutes" - that single component of the
      #               decomposed duration (not the total), 0 when absent
      #   "integer" - total whole seconds
      # @return [String, Integer, nil] nil when either time is missing or the
      #   format is not recognised
      def self.age(start_time, end_time, format: "human")
        return nil unless start_time && end_time

        age_duration = ActiveSupport::Duration.build((end_time - start_time).to_i)
        requested = format.to_s

        case requested
        when "human"
          age_duration.inspect
        when "days", "hours", "minutes"
          # Zero-valued components are not guaranteed to be present in
          # Duration#parts, so default to 0 instead of leaking nil.
          age_duration.parts.fetch(requested.to_sym, 0)
        when "integer"
          age_duration.to_i
        end
      end
    end
  end
end
@@ -0,0 +1 @@
# Shared ActiveSupport time zone object for "America/New_York".
EST = ActiveSupport::TimeZone["America/New_York"]
@@ -0,0 +1,23 @@
module EasyML
  # Deletes files below a directory, except for an explicit keep-list.
  class FileRotate
    # @param directory [String, #to_s, nil] root directory to clean
    # @param files_to_keep [Array<String>, nil] paths (as produced by
    #   Dir.glob on +directory+) that must not be deleted
    def initialize(directory, files_to_keep)
      @directory = directory
      @files_to_keep = files_to_keep
    end

    # Deletes every regular file under the directory whose name ends with one
    # of +allowed_endings+ and that is not on the keep-list. An empty
    # +allowed_endings+ means "every file". Directories are never removed.
    #
    # Ownership and permissions are left untouched: removing a file only
    # needs write access to its directory, and kept files must not be made
    # world-writable as a side effect of cleanup.
    #
    # @param allowed_endings [Array<String>] file name suffixes to consider
    # @return [Array<String>, nil] the files that were examined, or nil when
    #   no directory is configured
    def cleanup(allowed_endings = %w[json])
      return if @directory.nil? || @directory.to_s.strip.empty?

      keep = Array(@files_to_keep).map(&:to_s)
      candidate_files(allowed_endings).each do |file|
        File.delete(file) unless keep.include?(file)
      end
    end

    private

    # Regular files under the directory that match the allowed endings.
    def candidate_files(allowed_endings)
      root = @directory.to_s
      patterns = allowed_endings.map { |ending| File.join(root, "**", "*#{ending}") }
      patterns = [File.join(root, "**/*")] if patterns.empty?

      Dir.glob(patterns).select { |path| File.file?(path) }
    end
  end
end
@@ -0,0 +1,66 @@
require "active_support/concern"
require "fileutils"

module EasyML
  module Support
    # Concern that lets a class declare attributes whose values (or patterns
    # derived from them) are appended to the .gitignore in the current
    # working directory whenever an instance is initialized.
    #
    #   gitignore :root_dir do |dir|
    #     File.join(dir, "**/*")
    #   end
    module GitIgnorable
      extend ActiveSupport::Concern

      included do
        class_attribute :gitignore_attributes, default: {}

        # Registers +attribute+ (and an optional block that maps its value to
        # one or more patterns) and hooks .gitignore updating into initialize.
        def self.set_gitignore_callbacks(attribute, &block)
          gitignore_attributes[attribute] = block

          prepend GitignoreInitializer
        end
      end

      # Prepended to including classes so .gitignore is updated right after
      # the class's own initialize has run.
      module GitignoreInitializer
        def initialize(options)
          super
          update_gitignore
        end
      end

      class_methods do
        # Public macro; see set_gitignore_callbacks.
        def gitignore(attribute, &block)
          set_gitignore_callbacks(attribute, &block)
        end
      end

      # Appends every registered pattern that is not yet listed to
      # <cwd>/.gitignore, creating the file if necessary. Existing lines are
      # kept in order, and the file is written with a trailing newline so
      # later appends start on their own line.
      def update_gitignore
        patterns = relativize(collect_gitignore_patterns).uniq
        return if patterns.empty?

        gitignore_path = File.join(Dir.pwd, ".gitignore")
        existing_lines = File.exist?(gitignore_path) ? File.read(gitignore_path).split("\n") : []

        new_patterns = patterns - existing_lines
        return if new_patterns.empty?

        merged = (existing_lines + new_patterns).join("\n").strip
        File.write(gitignore_path, "#{merged}\n")
      end

      private

      # Resolves each registered attribute to a flat list of patterns,
      # skipping attributes whose value or derived patterns are blank.
      def collect_gitignore_patterns
        self.class.gitignore_attributes.flat_map do |attribute, block|
          attribute_value = send(attribute)
          next [] if attribute_value.blank?

          patterns = block ? block.call(attribute_value) : attribute_value
          next [] if patterns.nil? || (patterns.respond_to?(:empty?) && patterns.empty?)

          patterns.is_a?(Array) ? patterns : [patterns]
        end
      end

      # Turn patterns like /Users/xyz/path/to/rails/x/**/* into: x/**/*
      # Patterns are compared against file lines, so they are normalised to
      # strings first.
      def relativize(patterns)
        working_dir_prefix = %r{\A#{Regexp.escape(Dir.pwd)}/}
        patterns.map { |pattern| pattern.to_s.sub(working_dir_prefix, "") }
      end
    end
  end
end
@@ -0,0 +1,134 @@
require "glue_gun"
require "fileutils"
require "zlib"

module EasyML
  module Support
    # Mirrors the objects under an S3 prefix into a local directory,
    # gunzipping ".gz" objects after download, and reports whether the local
    # copy is up to date with S3.
    class SyncedDirectory
      include GlueGun::DSL

      attribute :root_dir, :string
      attribute :s3_bucket, :string
      attribute :s3_prefix, :string
      attribute :s3_access_key_id, :string
      attribute :s3_secret_access_key, :string

      # Replaces the local directory with a fresh download unless it is
      # already in sync.
      #
      # @return [Boolean] true when a download happened, false when the
      #   directory was already synced
      def sync
        return false if synced?

        # Wipe first, then recreate, so root_dir exists even when the prefix
        # holds no objects.
        clean_dir!
        mk_dir
        download

        # Drop the memoised answer so synced?/stale? reflect the new state.
        @synced = nil
        true
      end

      # Local CSV files directly under root_dir/s3_prefix.
      def files
        Dir.glob(File.join(root_dir, File.join(s3_prefix, "*.csv")))
      end

      # Time since the newest local file was modified; see Age.age for the
      # supported formats. Returns nil when there are no local files.
      def age(format: "human")
        Age.age(last_updated_at, EST.now, format: format)
      end

      def stale?
        !synced?
      end

      # Memoised; reset by #sync.
      def synced?
        return @synced unless @synced.nil?

        @synced = calculate_synced
      end

      # Newest mtime among the local files, converted to the EST zone; nil
      # when there are none.
      def last_updated_at
        return nil if files.empty?

        files.map { |file| File.mtime(file) }.max.in_time_zone(EST)
      end

      private

      def mk_dir
        FileUtils.mkdir_p(root_dir)
      end

      def clean_dir!
        FileUtils.rm_rf(root_dir)
      end

      def s3
        @s3 ||= begin
          credentials = Aws::Credentials.new(s3_access_key_id, s3_secret_access_key)
          Aws::S3::Client.new(credentials: credentials)
        end
      end

      # Yields every object under the prefix, across all result pages
      # (a single list_objects_v2 response is capped in size), skipping
      # folder placeholder keys.
      def each_s3_object
        s3.list_objects_v2(bucket: s3_bucket, prefix: s3_prefix).each do |page|
          page.contents.each do |object|
            next if object.key.end_with?("/") # skip folders

            yield object
          end
        end
      end

      # Downloads each object to root_dir/<key> and gunzips it in place.
      def download
        each_s3_object do |object|
          gzipped_file_path = File.join(root_dir, object.key)
          FileUtils.mkdir_p(File.dirname(gzipped_file_path))

          s3.get_object(
            response_target: gzipped_file_path,
            bucket: s3_bucket,
            key: object.key
          )

          puts "Downloaded #{object.key} to #{gzipped_file_path}"

          # Ungzip the file
          ungzipped_file_path = ungzip_file(gzipped_file_path)
          puts "Ungzipped to #{ungzipped_file_path}"
        end
      end

      # Decompresses +gzipped_file_path+ next to itself (dropping the ".gz"
      # suffix), removes the archive and returns the new path. Paths without
      # a ".gz" suffix are returned untouched: source and target would be the
      # same file, so extracting would truncate and then delete it.
      def ungzip_file(gzipped_file_path)
        ungzipped_file_path = gzipped_file_path.sub(/\.gz\z/, "")
        return gzipped_file_path if ungzipped_file_path == gzipped_file_path

        Zlib::GzipReader.open(gzipped_file_path) do |gz|
          File.open(ungzipped_file_path, "wb") do |file|
            # Stream instead of loading the whole archive into memory.
            IO.copy_stream(gz, file)
          end
        end

        File.delete(gzipped_file_path)
        ungzipped_file_path
      end

      def expand_dir(dir)
        return dir if dir.to_s[0] == "/"

        Rails.root.join(dir)
      end

      # True when nothing is downloaded yet or S3 holds something newer than
      # the newest local file.
      def new_data_available?
        return true if files.empty?

        local_latest = last_updated_at
        s3_latest = s3_last_updated_at

        return false if s3_latest.nil?

        s3_latest > local_latest
      end

      def calculate_synced
        return false if age.nil?

        !new_data_available?
      end

      # Newest last_modified among the S3 objects in the EST zone, or nil
      # when the prefix holds no objects.
      def s3_last_updated_at
        s3_latest = nil

        each_s3_object do |object|
          s3_latest = [s3_latest, object.last_modified].compact.max
        end

        s3_latest&.in_time_zone(EST)
      end
    end
  end
end
@@ -0,0 +1 @@
# Shared ActiveSupport time zone object for "UTC".
UTC = ActiveSupport::TimeZone["UTC"]
@@ -0,0 +1,10 @@
# Namespace for EasyML's support helpers. Loading this file requires each
# helper file in the order listed.
module EasyML
  module Support
    %w[age git_ignorable synced_directory utc est file_rotate].each do |helper|
      require_relative "support/#{helper}"
    end
  end
end
@@ -0,0 +1,92 @@
module EasyML
  # Placeholder class: it currently defines no behaviour of its own. The
  # entire draft implementation below is commented out and is kept only as a
  # sketch of the intended train/evaluate workflow.
  class Trainer
    # include GlueGun::DSL
    # include EasyML::Logging

    # define_attr :verbose, default: false
    # define_attr :root_dir do |root_dir|
    #   File.join(root_dir, "trainer")
    # end

    # define_config :dataset do |config|
    #   config.define_option :default do |option|
    #     option.set_class EasyML::Data::Dataset
    #     option.define_attr :root_dir
    #     option.define_attr :target
    #     option.define_attr :batch_size
    #   end
    # end

    # define_config :model do |config|
    #   config.define_option :default do |option|
    #     option.set_class EasyML::Model
    #     option.define_attr :root_dir
    #     option.define_attr :name
    #     option.define_attr :hyperparameters
    #   end
    # end

    # def train
    #   log_info("Starting training process") if verbose

    #   dataset.refresh!

    #   log_info("Fitting model") if verbose
    #   dataset.train(split_ys: true) do |xs, ys|
    #     model.fit(xs, ys)
    #   end

    #   log_info("Saving model") if verbose
    #   model.save

    #   log_info("Training completed") if verbose
    # end

    # def evaluate
    #   log_info("Starting evaluation process") if verbose

    #   results = {}

    #   %i[train test valid].each do |split|
    #     log_info("Evaluating on #{split} set") if verbose
    #     predictions = []
    #     actuals = []

    #     dataset.send(split, split_ys: true) do |xs, ys|
    #       batch_predictions = model.predict(xs)
    #       predictions.concat(batch_predictions.to_a)
    #       actuals.concat(ys.to_a)
    #     end

    #     results[split] = calculate_metrics(predictions, actuals)
    #   end

    #   log_info("Evaluation completed") if verbose
    #   results
    # end

    # private

    # def calculate_metrics(predictions, actuals)
    #   # Implement your metric calculations here
    #   # This is a placeholder and should be replaced with actual metric calculations
    #   {
    #     mse: mean_squared_error(predictions, actuals),
    #     mae: mean_absolute_error(predictions, actuals),
    #     r2: r_squared(predictions, actuals)
    #   }
    # end

    # def mean_squared_error(predictions, actuals)
    #   # Implement MSE calculation
    # end

    # def mean_absolute_error(predictions, actuals)
    #   # Implement MAE calculation
    # end

    # def r_squared(predictions, actuals)
    #   # Implement R-squared calculation
    # end
  end
end
@@ -0,0 +1,29 @@
# Declared with explicit nesting so the file loads even when EasyML has not
# been defined yet.
module EasyML
  # Mixin for classes that declare an ordered pipeline of transform methods.
  #
  #   class MyTransforms
  #     include EasyML::Transforms
  #     transform :add_totals
  #
  #     def add_totals(df)
  #       ...
  #     end
  #   end
  #
  #   MyTransforms.apply_transforms(df)
  module Transforms
    def self.included(base)
      base.extend(ClassMethods)
    end

    module ClassMethods
      # Transform method names registered on this class, in declaration order.
      def transforms
        @transforms ||= []
      end

      # Registers +method_name+ as the next step of the pipeline.
      def transform(method_name)
        transforms << method_name
      end

      # Runs the pipeline on +df+ using a fresh instance (built with no
      # arguments) and returns the result.
      def apply_transforms(df)
        new.apply_transforms(df)
      end
    end

    # True when +list1+ contains at least one element that is not in +list2+
    # (nil and false elements count as well).
    def missing_any?(list1, list2)
      !(list1 - list2).empty?
    end

    # Feeds +df+ through every registered transform in order; each transform
    # receives the previous step's result. Returns +df+ unchanged when no
    # transforms are registered.
    def apply_transforms(df)
      self.class.transforms.reduce(df) do |current, transform_method|
        send(transform_method, current)
      end
    end
  end
end
@@ -0,0 +1,5 @@
# frozen_string_literal: true

module EasyML
  # Version string of the EasyML gem.
  VERSION = "0.1.1"
end
data/lib/easy_ml.rb ADDED
@@ -0,0 +1,23 @@
# frozen_string_literal: true

# Entry point of the gem: loads the third-party dependencies, the version
# and the Rails engine, then the EasyML components in dependency order.
require "rails"
require "active_record"
require "active_model"
require "active_support/all"
require "glue_gun"
require "numo/narray"
require "xgboost"
require_relative "easy_ml/version"
require_relative "easy_ml/engine"

module EasyML
  # Base error class for the gem.
  class Error < StandardError; end

  # Order matters: later components are loaded after the ones listed before
  # them.
  %w[support core_ext logging data transforms core trainer].each do |component|
    require_relative "easy_ml/#{component}"
  end
end