easy_ml 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +270 -0
  3. data/Rakefile +12 -0
  4. data/app/models/easy_ml/model.rb +59 -0
  5. data/app/models/easy_ml/models/xgboost.rb +9 -0
  6. data/app/models/easy_ml/models.rb +5 -0
  7. data/lib/easy_ml/core/model.rb +29 -0
  8. data/lib/easy_ml/core/model_core.rb +181 -0
  9. data/lib/easy_ml/core/model_evaluator.rb +137 -0
  10. data/lib/easy_ml/core/models/hyperparameters/base.rb +34 -0
  11. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +19 -0
  12. data/lib/easy_ml/core/models/hyperparameters.rb +8 -0
  13. data/lib/easy_ml/core/models/xgboost.rb +10 -0
  14. data/lib/easy_ml/core/models/xgboost_core.rb +220 -0
  15. data/lib/easy_ml/core/models.rb +10 -0
  16. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +63 -0
  17. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +50 -0
  18. data/lib/easy_ml/core/tuner/adapters.rb +10 -0
  19. data/lib/easy_ml/core/tuner.rb +105 -0
  20. data/lib/easy_ml/core/uploaders/model_uploader.rb +24 -0
  21. data/lib/easy_ml/core/uploaders.rb +7 -0
  22. data/lib/easy_ml/core.rb +9 -0
  23. data/lib/easy_ml/core_ext/pathname.rb +9 -0
  24. data/lib/easy_ml/core_ext.rb +5 -0
  25. data/lib/easy_ml/data/dataloader.rb +6 -0
  26. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +31 -0
  27. data/lib/easy_ml/data/dataset/data/sample_info.json +1 -0
  28. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +1 -0
  29. data/lib/easy_ml/data/dataset/splits/file_split.rb +140 -0
  30. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +49 -0
  31. data/lib/easy_ml/data/dataset/splits/split.rb +98 -0
  32. data/lib/easy_ml/data/dataset/splits.rb +11 -0
  33. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +43 -0
  34. data/lib/easy_ml/data/dataset/splitters.rb +9 -0
  35. data/lib/easy_ml/data/dataset.rb +430 -0
  36. data/lib/easy_ml/data/datasource/datasource_factory.rb +60 -0
  37. data/lib/easy_ml/data/datasource/file_datasource.rb +40 -0
  38. data/lib/easy_ml/data/datasource/merged_datasource.rb +64 -0
  39. data/lib/easy_ml/data/datasource/polars_datasource.rb +41 -0
  40. data/lib/easy_ml/data/datasource/s3_datasource.rb +89 -0
  41. data/lib/easy_ml/data/datasource.rb +33 -0
  42. data/lib/easy_ml/data/preprocessor/preprocessor.rb +205 -0
  43. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +403 -0
  44. data/lib/easy_ml/data/preprocessor/utils.rb +17 -0
  45. data/lib/easy_ml/data/preprocessor.rb +238 -0
  46. data/lib/easy_ml/data/utils.rb +50 -0
  47. data/lib/easy_ml/data.rb +8 -0
  48. data/lib/easy_ml/deployment.rb +5 -0
  49. data/lib/easy_ml/engine.rb +26 -0
  50. data/lib/easy_ml/initializers/inflections.rb +4 -0
  51. data/lib/easy_ml/logging.rb +38 -0
  52. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +42 -0
  53. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +23 -0
  54. data/lib/easy_ml/support/age.rb +27 -0
  55. data/lib/easy_ml/support/est.rb +1 -0
  56. data/lib/easy_ml/support/file_rotate.rb +23 -0
  57. data/lib/easy_ml/support/git_ignorable.rb +66 -0
  58. data/lib/easy_ml/support/synced_directory.rb +134 -0
  59. data/lib/easy_ml/support/utc.rb +1 -0
  60. data/lib/easy_ml/support.rb +10 -0
  61. data/lib/easy_ml/trainer.rb +92 -0
  62. data/lib/easy_ml/transforms.rb +29 -0
  63. data/lib/easy_ml/version.rb +5 -0
  64. data/lib/easy_ml.rb +23 -0
  65. metadata +353 -0
@@ -0,0 +1,8 @@
module EasyML
  # Namespace for the data layer. Loading this file pulls in each data
  # component, in the order listed.
  module Data
    %w[utils preprocessor dataset datasource].each do |component|
      require_relative "data/#{component}"
    end
  end
end
@@ -0,0 +1,5 @@
module EasyML
  # Deployment namespace.
  #
  # The packaged gem contains no deployment/model_uploader.rb; the model
  # uploader ships under core/uploaders. Requiring the missing path raised
  # LoadError, so load the uploaders that actually exist.
  # NOTE(review): confirm core/uploaders is the intended replacement.
  module Deployment
    require_relative "core/uploaders"
  end
end
@@ -0,0 +1,26 @@
require "rails/engine"

module EasyML
  # Rails engine that plugs EasyML into a host application: it registers the
  # gem's inflections, exposes its generator templates, loads its generators
  # and, once the app has initialized, loads the gem's models.
  class Engine < Rails::Engine
    isolate_namespace EasyML

    # Load the inflection rules bundled with the gem.
    initializer "easy_ml.inflections" do
      require_relative "initializers/inflections"
    end

    # Put the gem's templates first in the generator lookup path.
    # The templates live in lib/easy_ml/railtie/templates; the previous
    # "../templates" resolved to lib/templates, which the gem does not ship.
    initializer "easy_ml.setup_generators" do |app|
      app.config.generators do |g|
        g.templates.unshift File.expand_path("railtie/templates", __dir__)
      end
    end

    # Eagerly require every generator shipped with the gem. Sorting keeps the
    # load order deterministic regardless of filesystem or Ruby version.
    generators_path = File.expand_path("railtie/generators", __dir__)
    Dir[File.join(generators_path, "**", "*.rb")].sort.each { |file| require file }

    # Models are loaded only after the host application has initialized.
    config.after_initialize do
      require_relative "../../app/models/easy_ml/model"
      require_relative "../../app/models/easy_ml/models"
    end
  end
end
@@ -0,0 +1,4 @@
# Register the acronyms used by this gem with the English inflector.
ActiveSupport::Inflector.inflections(:en) do |inflect|
  %w[EasyML ML].each { |acronym| inflect.acronym(acronym) }
end
@@ -0,0 +1,38 @@
module EasyML
  # Mixin with small stdout logging helpers, plus a class-level `log_method`
  # macro that prints a message every time a given method is invoked.
  module Logging
    def self.included(base)
      base.extend(ClassMethods)
    end

    module ClassMethods
      # Wraps an already-defined instance method so that +message+ is logged
      # before each call.
      #
      # @param method_name [Symbol, String] method to wrap (must already exist)
      # @param message [String] text to log before the method runs
      # @param verbose [Boolean] when true, only log if the instance's
      #   @verbose is truthy
      # @return [Symbol] the name of the wrapped method
      def log_method(method_name, message, verbose: false)
        original_method = instance_method(method_name)

        # define_method below always produces a public method; remember the
        # original visibility so wrapping does not expose private methods.
        visibility =
          if private_method_defined?(method_name)
            :private
          elsif protected_method_defined?(method_name)
            :protected
          else
            :public
          end

        # Keyword arguments are forwarded explicitly: under Ruby 3 a bare
        # *args splat would turn them into a positional hash.
        wrapped = define_method(method_name) do |*args, **kwargs, &block|
          log_message(message, verbose: verbose)
          original_method.bind(self).call(*args, **kwargs, &block)
        end

        send(visibility, wrapped)
        wrapped
      end
    end

    # Prints +message+; with verbose: true it is printed only when the
    # instance is in verbose mode.
    def log_message(message, verbose: false)
      if verbose
        log_verbose(message)
      else
        puts message
      end
    end

    # Prints +message+ only when @verbose is truthy.
    def log_verbose(message)
      puts message if @verbose
    end

    # Prints +message+ as a yellow "WARNING:" line.
    def log_warning(message)
      puts "\e[33mWARNING: #{message}\e[0m"
    end

    # Prints +message+ as a blue "INFO:" line.
    def log_info(message)
      puts "\e[34mINFO: #{message}\e[0m"
    end
  end
end
@@ -0,0 +1,42 @@
# lib/easy_ml/railtie/generators/migration/migration_generator.rb
require "rails/generators"
require "rails/generators/active_record/migration"

module EasyML
  module Generators
    module Migration
      # Rails generator (namespace "easy_ml:migration") that renders the
      # create_easy_ml_models template into the host app's db/migrate folder.
      class MigrationGenerator < Rails::Generators::Base
        include Rails::Generators::Migration
        namespace "easy_ml:migration"

        # Directory the migration template is read from.
        source_root File.expand_path("../../templates/migration", __dir__)

        # Description for this generator.
        desc "Generates a migration for EasyMLModel with version and file for remote storage"

        # Base name of the generated migration file (without number prefix).
        def self.migration_name
          "create_easy_ml_models"
        end

        # Number to prefix the migration file with: a UTC timestamp on
        # ActiveRecord < 7 or when timestamped migrations are enabled,
        # otherwise the next zero-padded sequence number in +dirname+.
        def self.next_migration_number(dirname)
          timestamped = ActiveRecord.version < Gem::Version.new("7") || ActiveRecord.timestamped_migrations
          return Time.now.utc.strftime("%Y%m%d%H%M%S") if timestamped

          format("%.3d", (current_migration_number(dirname) + 1))
        end

        # Generator action: render the template into db/migrate.
        def create_migration_file
          destination = "db/migrate/#{self.class.migration_name}.rb"
          migration_template "create_easy_ml_models.rb.tt", destination
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
# lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt
# Creates the easy_ml_models table together with its lookup indexes.
class CreateEasyMLModels < ActiveRecord::Migration[6.0]
  def change
    create_table :easy_ml_models do |t|
      t.string :name, null: false
      t.boolean :is_live, default: false
      t.string :version, null: false
      t.string :ml_model
      t.string :task
      t.json :metrics, default: []
      t.json :file, null: false

      t.timestamps

      # Single-column indexes.
      %i[created_at name version is_live].each { |column| t.index column }

      # A name/version pair may only be stored once.
      t.index %i[name version], unique: true
      t.index %i[name version is_live]
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require "active_support/duration"
2
+
module EasyML
  module Support
    # Helpers for expressing the time elapsed between two instants.
    module Age
      # Returns the elapsed time between +start_time+ and +end_time+.
      #
      # @param start_time [Time, nil] beginning of the interval
      # @param end_time [Time, nil] end of the interval
      # @param format [String, Symbol] one of:
      #   "human"   - ActiveSupport::Duration#inspect text
      #   "days"    - total whole days
      #   "hours"   - total whole hours
      #   "minutes" - total whole minutes
      #   "integer" - total whole seconds
      # @return [String, Integer, nil] nil when either time is missing or the
      #   format is not recognised
      def self.age(start_time, end_time, format: "human")
        return nil unless start_time && end_time

        seconds = (end_time - start_time).to_i

        # Totals are derived from the raw second count. Reading
        # Duration#parts only yields one component of the decomposed
        # duration (a 10-day age would report 3 days, or nil), not the total.
        case format.to_s
        when "human"
          ActiveSupport::Duration.build(seconds).inspect
        when "days"
          seconds.fdiv(86_400).truncate
        when "hours"
          seconds.fdiv(3_600).truncate
        when "minutes"
          seconds.fdiv(60).truncate
        when "integer"
          seconds
        end
      end
    end
  end
end
@@ -0,0 +1 @@
# Time zone object for "America/New_York", exposed as a top-level constant.
EST = ActiveSupport::TimeZone["America/New_York"]
@@ -0,0 +1,23 @@
module EasyML
  # Deletes files under a directory, sparing an explicit keep-list.
  class FileRotate
    # @param directory [String, Pathname, nil] root directory to clean
    # @param files_to_keep [Array<String, Pathname>, nil] paths that must
    #   survive cleanup
    def initialize(directory, files_to_keep)
      @directory = directory
      @files_to_keep = files_to_keep
    end

    # Removes every regular file below the directory whose name ends with one
    # of +allowed_endings+ and which is not on the keep-list. An empty
    # +allowed_endings+ considers every file.
    #
    # Files are no longer chown'ed / chmod'ed to 0777 before deletion:
    # deleting needs write access to the containing directory, not the file,
    # and the old code left kept files world-writable and relied on a
    # hard-coded "staff" group.
    #
    # @param allowed_endings [Array<String>] filename suffixes to consider
    # @return [Array<String>, nil] the files examined, or nil when no
    #   directory is configured
    def cleanup(allowed_endings = %w[json])
      return if @directory.nil? || @directory.to_s.strip.empty?

      root = @directory.to_s
      patterns = Array(allowed_endings).map { |ending| File.join(root, "**", "*#{ending}") }
      patterns = [File.join(root, "**/*")] if patterns.empty?

      # Compare absolute paths so Pathname entries and relative/absolute
      # spellings of the same file all count as "keep".
      keep = Array(@files_to_keep).map { |path| File.expand_path(path.to_s) }

      # uniq: overlapping endings may match the same file more than once.
      candidates = Dir.glob(patterns).uniq.select { |path| File.file?(path) }
      candidates.each do |path|
        File.delete(path) unless keep.include?(File.expand_path(path))
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
+ require "active_support/concern"
2
+ require "fileutils"
3
+
module EasyML
  module Support
    # Concern that appends path patterns derived from an object's attributes
    # to the .gitignore in the current working directory whenever an instance
    # is created.
    #
    # Usage in an including class:
    #   gitignore :root_dir do |dir|
    #     File.join(dir, "**/*")
    #   end
    module GitIgnorable
      extend ActiveSupport::Concern

      included do
        class_attribute :gitignore_attributes, default: {}

        # Registers +attribute+ (and an optional block that turns its value
        # into one or more patterns) and hooks instance initialization.
        def self.set_gitignore_callbacks(attribute, &block)
          # Reassign rather than mutate: an in-place write would modify the
          # hash shared with parent and sibling classes.
          self.gitignore_attributes = gitignore_attributes.merge(attribute => block)

          prepend GitignoreInitializer
        end
      end

      # Prepended onto including classes so .gitignore is updated right after
      # the object has been constructed.
      module GitignoreInitializer
        # Accepts and forwards any argument list so the wrapped class keeps
        # its own initializer signature.
        def initialize(*args, **kwargs, &block)
          super
          update_gitignore
        end
      end

      class_methods do
        # Declares that +attribute+ holds path(s) that should be git-ignored.
        def gitignore(attribute, &block)
          set_gitignore_callbacks(attribute, &block)
        end
      end

      # Adds the patterns of every registered attribute to .gitignore,
      # skipping blank attributes and patterns that are already listed.
      def update_gitignore
        self.class.gitignore_attributes.each do |attribute, block|
          attribute_value = send(attribute)
          next if attribute_value.blank?

          patterns = block ? block.call(attribute_value) : attribute_value
          next if patterns.nil? || (patterns.respond_to?(:empty?) && patterns.empty?)

          patterns = [patterns] unless patterns.is_a?(Array)
          append_gitignore_patterns(relativize(patterns))
        end
      end

      private

      # Writes the patterns not yet present into <pwd>/.gitignore, creating
      # the file when needed.
      def append_gitignore_patterns(patterns)
        gitignore_path = File.join(Dir.pwd, ".gitignore")
        existing_content = File.exist?(gitignore_path) ? File.read(gitignore_path).split("\n") : []

        new_patterns = patterns.uniq - existing_content
        return if new_patterns.empty?

        # Finish with a newline so later appends start on a fresh line.
        File.write(gitignore_path, "#{(existing_content + new_patterns).join("\n").strip}\n")
      end

      # Turn patterns like /Users/xyz/path/to/rails/x/**/* into: x/**/*
      def relativize(patterns)
        working_dir_prefix = %r{\A#{Regexp.escape(Dir.pwd)}/}
        patterns.map { |pattern| pattern.to_s.sub(working_dir_prefix, "") }
      end
    end
  end
end
@@ -0,0 +1,134 @@
require "fileutils"
require "zlib"
require "glue_gun"

module EasyML
  module Support
    # Mirrors the objects stored under an S3 prefix into a local directory.
    # Objects are written beneath root_dir using their S3 key, and ".gz"
    # objects are decompressed after download.
    class SyncedDirectory
      include GlueGun::DSL

      attribute :root_dir, :string
      attribute :s3_bucket, :string
      attribute :s3_prefix, :string
      attribute :s3_access_key_id, :string
      attribute :s3_secret_access_key, :string

      # Replaces the local copy with the current S3 contents unless it is
      # already up to date.
      #
      # @return [Boolean] true when a download took place, false when skipped
      def sync
        return false if synced?

        # Wipe first, then recreate: the reverse order deleted the directory
        # that had just been created.
        clean_dir!
        mk_dir
        download
        # Drop the memoized answer so synced? reflects the fresh download.
        @synced = nil
        true
      end

      # Local CSV files found directly under root_dir/s3_prefix.
      def files
        Dir.glob(File.join(root_dir, s3_prefix, "*.csv"))
      end

      # Time since the newest local file was modified, formatted by Age.age;
      # nil when there are no local files.
      def age(format: "human")
        Age.age(last_updated_at, EST.now, format: format)
      end

      def stale?
        !synced?
      end

      # Memoized: true when local files exist and S3 holds nothing newer.
      def synced?
        return @synced unless @synced.nil?

        @synced = calculate_synced
      end

      # Modification time of the newest local file in the EST zone, or nil
      # when there are no local files.
      def last_updated_at
        newest = files.map { |file| File.mtime(file) }.max
        newest&.in_time_zone(EST)
      end

      private

      def mk_dir
        FileUtils.mkdir_p(root_dir)
      end

      def clean_dir!
        FileUtils.rm_rf(root_dir)
      end

      def s3
        @s3 ||= begin
          credentials = Aws::Credentials.new(s3_access_key_id, s3_secret_access_key)
          Aws::S3::Client.new(credentials: credentials)
        end
      end

      # Every object under the prefix, excluding folder placeholder keys.
      # Walks all result pages, since a single list_objects_v2 response is
      # capped in size.
      def s3_objects
        objects = []
        s3.list_objects_v2(bucket: s3_bucket, prefix: s3_prefix).each do |page|
          objects.concat(page.contents)
        end
        objects.reject { |object| object.key.end_with?("/") }
      end

      def download
        s3_objects.each do |object|
          local_path = File.join(root_dir, object.key)
          FileUtils.mkdir_p(File.dirname(local_path))

          s3.get_object(
            response_target: local_path,
            bucket: s3_bucket,
            key: object.key
          )

          puts "Downloaded #{object.key} to #{local_path}"

          # Only gzip archives are expanded; other files are left as-is.
          next unless local_path.end_with?(".gz")

          ungzipped_file_path = ungzip_file(local_path)
          puts "Ungzipped to #{ungzipped_file_path}"
        end
      end

      # Decompresses a ".gz" file next to itself and deletes the archive.
      # Paths without a ".gz" suffix are returned untouched; previously the
      # output path equalled the input path and the file was truncated.
      #
      # @return [String] path of the decompressed (or untouched) file
      def ungzip_file(gzipped_file_path)
        return gzipped_file_path unless gzipped_file_path.end_with?(".gz")

        ungzipped_file_path = gzipped_file_path.sub(/\.gz\z/, "")

        Zlib::GzipReader.open(gzipped_file_path) do |gz|
          # Stream instead of reading the whole archive into memory.
          File.open(ungzipped_file_path, "wb") { |file| IO.copy_stream(gz, file) }
        end

        File.delete(gzipped_file_path)
        ungzipped_file_path
      end

      # Absolute paths are returned unchanged; anything else is resolved
      # against Rails.root.
      def expand_dir(dir)
        return dir if dir.to_s[0] == "/"

        Rails.root.join(dir)
      end

      def new_data_available?
        return true if files.empty?

        local_latest = last_updated_at
        s3_latest = s3_last_updated_at

        return false if s3_latest.nil?

        s3_latest > local_latest
      end

      def calculate_synced
        return false if age.nil?

        !new_data_available?
      end

      # last_modified of the newest object under the prefix in the EST zone,
      # or nil when the prefix holds no objects.
      def s3_last_updated_at
        s3_objects.map(&:last_modified).compact.max&.in_time_zone(EST)
      end
    end
  end
end
@@ -0,0 +1 @@
# Time zone object for "UTC", exposed as a top-level constant.
UTC = ActiveSupport::TimeZone["UTC"]
@@ -0,0 +1,10 @@
module EasyML
  # Namespace for supporting utilities. Loading this file pulls in each
  # helper, in the order listed.
  module Support
    %w[age git_ignorable synced_directory utc est file_rotate].each do |helper|
      require_relative "support/#{helper}"
    end
  end
end
@@ -0,0 +1,92 @@
module EasyML
  # Placeholder class: it currently defines no behaviour. The draft
  # implementation below is kept commented out, exactly as shipped.
  class Trainer
    # include GlueGun::DSL
    # include EasyML::Logging

    # define_attr :verbose, default: false
    # define_attr :root_dir do |root_dir|
    #   File.join(root_dir, "trainer")
    # end

    # define_config :dataset do |config|
    #   config.define_option :default do |option|
    #     option.set_class EasyML::Data::Dataset
    #     option.define_attr :root_dir
    #     option.define_attr :target
    #     option.define_attr :batch_size
    #   end
    # end

    # define_config :model do |config|
    #   config.define_option :default do |option|
    #     option.set_class EasyML::Model
    #     option.define_attr :root_dir
    #     option.define_attr :name
    #     option.define_attr :hyperparameters
    #   end
    # end

    # def train
    #   log_info("Starting training process") if verbose

    #   dataset.refresh!

    #   log_info("Fitting model") if verbose
    #   dataset.train(split_ys: true) do |xs, ys|
    #     model.fit(xs, ys)
    #   end

    #   log_info("Saving model") if verbose
    #   model.save

    #   log_info("Training completed") if verbose
    # end

    # def evaluate
    #   log_info("Starting evaluation process") if verbose

    #   results = {}

    #   %i[train test valid].each do |split|
    #     log_info("Evaluating on #{split} set") if verbose
    #     predictions = []
    #     actuals = []

    #     dataset.send(split, split_ys: true) do |xs, ys|
    #       batch_predictions = model.predict(xs)
    #       predictions.concat(batch_predictions.to_a)
    #       actuals.concat(ys.to_a)
    #     end

    #     results[split] = calculate_metrics(predictions, actuals)
    #   end

    #   log_info("Evaluation completed") if verbose
    #   results
    # end

    # private

    # def calculate_metrics(predictions, actuals)
    #   # Implement your metric calculations here
    #   # This is a placeholder and should be replaced with actual metric calculations
    #   {
    #     mse: mean_squared_error(predictions, actuals),
    #     mae: mean_absolute_error(predictions, actuals),
    #     r2: r_squared(predictions, actuals)
    #   }
    # end

    # def mean_squared_error(predictions, actuals)
    #   # Implement MSE calculation
    # end

    # def mean_absolute_error(predictions, actuals)
    #   # Implement MAE calculation
    # end

    # def r_squared(predictions, actuals)
    #   # Implement R-squared calculation
    # end
  end
end
@@ -0,0 +1,29 @@
# Declared with nested modules, like the other files of this gem, so the file
# also loads when EasyML has not been defined yet.
module EasyML
  # Mixin for declaring an ordered list of transform methods on a class and
  # applying them, one after another, to a value.
  #
  #   class MyTransforms
  #     include EasyML::Transforms
  #     transform :add_feature
  #     def add_feature(df) ... end
  #   end
  #   MyTransforms.apply_transforms(df)
  module Transforms
    def self.included(base)
      base.extend(ClassMethods)
    end

    module ClassMethods
      # Transform method names registered on this class, in declaration order.
      def transforms
        @transforms ||= []
      end

      # Registers +method_name+ as the next transform to apply.
      def transform(method_name)
        transforms << method_name
      end

      # Convenience: instantiates the class (no arguments) and runs every
      # registered transform on +df+.
      def apply_transforms(df)
        new.apply_transforms(df)
      end
    end

    # True when +list1+ contains at least one element absent from +list2+.
    # Uses empty? rather than any? so a missing nil/false element counts too.
    def missing_any?(list1, list2)
      !(list1 - list2).empty?
    end

    # Feeds +df+ through each registered transform; the return value of one
    # transform is the input of the next.
    #
    # @return the result of the last transform, or +df+ when none are
    #   registered
    def apply_transforms(df)
      self.class.transforms.reduce(df) do |current, transform_method|
        send(transform_method, current)
      end
    end
  end
end
@@ -0,0 +1,5 @@
# frozen_string_literal: true

module EasyML
  # Version string of this gem release.
  VERSION = "0.1.1"
end
data/lib/easy_ml.rb ADDED
@@ -0,0 +1,23 @@
# frozen_string_literal: true

# External libraries, loaded in this order before any EasyML code.
%w[
  rails
  active_record
  active_model
  active_support/all
  glue_gun
  numo/narray
  xgboost
].each { |library| require library }

require_relative "easy_ml/version"
require_relative "easy_ml/engine"

# Top-level namespace of the gem.
module EasyML
  # Error class defined by this gem.
  class Error < StandardError; end

  # Components of the gem, loaded in this order.
  %w[support core_ext logging data transforms core trainer].each do |component|
    require_relative "easy_ml/#{component}"
  end
end