easy_ml 0.1.4 → 0.2.0.pre.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (239) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -5
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,79 @@
# == Schema Information
#
# Table name: easy_ml_events
#
#  id             :bigint           not null, primary key
#  name           :string           not null
#  status         :string           not null
#  eventable_type :string
#  eventable_id   :bigint
#  stacktrace     :text
#  created_at     :datetime         not null
#  updated_at     :datetime         not null
#
module EasyML
  # A status record ("started" / "success" / "failed") attached to any record
  # through the polymorphic +eventable+ association. Failed events also carry
  # a condensed, line-wrapped stacktrace.
  class Event < ActiveRecord::Base
    # Maximum width of a single line in the stored stacktrace text.
    MAX_LINE_LENGTH = 65
    self.table_name = "easy_ml_events"

    STATUSES = %w[started success failed].freeze

    belongs_to :eventable, polymorphic: true, optional: true

    validates :name, presence: true
    validates :status, presence: true, inclusion: { in: STATUSES }

    # Helper method to extract worker name from class.
    #
    # @param worker_class [Class, String] class (or class name) of the worker
    # @return [String] the name with any module namespace removed
    def self.worker_name(worker_class)
      worker_class.to_s.demodulize
    end

    # Scopes to help query events
    scope :for_worker, ->(worker_class) { where(name: worker_name(worker_class)) }
    scope :started, -> { where(status: "started") }
    scope :succeeded, -> { where(status: "success") }
    scope :failed, -> { where(status: "failed") }

    # Persists an event for +model+.
    #
    # @param model [Object] record the event is attached to; its demodulized
    #   class name becomes the event name
    # @param status [String] one of STATUSES
    # @param error [Exception, nil] optional error whose stacktrace is stored
    # @return [EasyML::Event] the created event
    # @raise [ActiveRecord::RecordInvalid] when validation fails (create!)
    def self.create_event(model, status, error = nil)
      EasyML::Event.create!(
        name: model.class.name.demodulize,
        status: status,
        eventable: model,
        stacktrace: format_stacktrace(error),
      )
    end

    # Records a "failed" event for +model+ and logs the failure.
    #
    # @param model [Object] record the failure relates to
    # @param error [Exception, String] the failure; a String is turned into a
    #   RuntimeError by raising and rescuing it so that it carries a backtrace
    def self.handle_error(model, error)
      if error.is_a?(String)
        begin
          raise error
        rescue StandardError => e
          error = e
        end
      end
      create_event(model, "failed", error)
      # Inside a class method `self.class.name` is always "Class"; name the
      # record that actually failed instead.
      Rails.logger.error("#{model.class.name} failed: #{error.message}")
    end

    # Builds the text stored in +stacktrace+: the inspected error followed by
    # the backtrace frames that mention "easy_ml", with runs of whitespace
    # collapsed and every line wrapped to MAX_LINE_LENGTH.
    #
    # @param error [Exception, nil]
    # @return [String, nil] nil when no error was given
    def self.format_stacktrace(error)
      return nil if error.nil?

      # Exception#backtrace is nil for an exception that was never raised.
      frames = Array(error.backtrace).select { |loc| loc.match?(/easy_ml/) }

      [error.inspect, *frames]
        .join("\n")
        .split("\n")
        .map { |line| line.gsub(/\s{2,}/, " ").strip }
        .flat_map { |line| wrap_text(line, MAX_LINE_LENGTH) }
        .join("\n")
    end

    # Splits +text+ into chunks of at most +max_length+ characters.
    #
    # @param text [String]
    # @param max_length [Integer]
    # @return [Array<String>] the chunks; empty for blank input
    def self.wrap_text(text, max_length)
      text.strip.scan(/.{1,#{max_length}}/)
    end
  end
end
@@ -0,0 +1,437 @@
# == Schema Information
#
# Table name: easy_ml_features
#
#  id               :bigint           not null, primary key
#  dataset_id       :bigint           not null
#  name             :string
#  version          :bigint
#  feature_class    :string           not null
#  feature_position :integer
#  batch_size       :integer
#  needs_fit        :boolean
#  sha              :string
#  primary_key      :string           is an Array
#  applied_at       :datetime
#  fit_at           :datetime
#  refresh_every    :bigint
#  created_at       :datetime         not null
#  updated_at       :datetime         not null
#
module EasyML
  # A feature attached to a dataset. The row stores bookkeeping (position,
  # sha of the implementing file, fit/apply timestamps) while the behaviour
  # lives in the class named by +feature_class+, reached through #adapter.
  class Feature < ActiveRecord::Base
    self.table_name = "easy_ml_features"
    include Historiographer::Silent
    historiographer_mode :snapshot_only

    class << self
      # SHA256 of the file that defines +feature_class+'s #transform method.
      # The digest is cached in Rails.cache keyed by path and reused while the
      # file's mtime is unchanged.
      #
      # @param feature_class [String] constant name of the feature class
      # @return [String] hex digest
      def compute_sha(feature_class)
        require "digest"
        path = feature_class.constantize.instance_method(:transform).source_location.first
        current_mtime = File.mtime(path)
        cache_key = "feature_sha/#{path}"

        cached = Rails.cache.read(cache_key)
        return cached[:sha] if cached && cached[:mtime] == current_mtime

        # Compute new SHA and cache it with the current mtime
        sha = Digest::SHA256.hexdigest(File.read(path))
        Rails.cache.write(cache_key, { sha: sha, mtime: current_mtime })
        sha
      end

      # Drops every cached feature sha.
      # NOTE(review): delete_matched is not implemented by every cache store - confirm.
      def clear_sha_cache!
        Rails.cache.delete_matched("feature_sha/*")
      end
    end

    belongs_to :dataset, class_name: "EasyML::Dataset"

    validates :feature_class, presence: true
    validates :feature_position, presence: true, numericality: { only_integer: true, greater_than_or_equal_to: 0 }
    before_validation :set_feature_position, on: :create

    scope :ordered, -> { order(feature_position: :asc) }
    # Features flagged needs_fit, or whose stored sha differs from the sha of
    # their class file, restricted to adapters that implement #fit. Records
    # are loaded to inspect the adapter, then re-selected by id.
    scope :has_changes, lambda {
      # Get all unique feature classes
      feature_classes = pluck(:feature_class).uniq

      # Build conditions for each feature class
      conditions = feature_classes.map do |klass|
        current_sha = compute_sha(klass)
        sanitize_sql_array(["(feature_class = ? AND (sha IS NULL OR sha != ?))", klass, current_sha])
      end

      # Combine all conditions with OR
      candidates = where(needs_fit: true).or(where(conditions.join(" OR ")))
      where(id: candidates.select { |f| f.adapter.respond_to?(:fit) }.map(&:id))
    }
    scope :never_applied, -> { where(applied_at: nil) }
    scope :never_fit, -> do
      fittable = where(fit_at: nil).select { |f| f.adapter.respond_to?(:fit) }
      where(id: fittable.map(&:id))
    end
    scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }

    before_save :apply_defaults, if: :new_record?
    before_save :update_sha
    after_find :update_from_feature_class
    before_save :update_from_feature_class

    # @return [Object] memoized instance of the feature class
    # @raise [EasyML::InvalidFeatureError] when the class cannot be resolved
    def adapter
      @adapter ||= feature_klass.new
    end

    # Human-readable reasons why this feature has to be (re)fit.
    #
    # @return [Array<String>] empty when the adapter has no #fit
    def fit_reasons
      return [] unless adapter.respond_to?(:fit)

      {
        "Needs fit manually set" => read_attribute(:needs_fit),
        "Datasource was refreshed" => datasource_was_refreshed?,
        "Code changed" => code_changed?,
        "Cache expired" => cache_expired?,
      }.select { |_reason, applies| applies }.keys
    end

    alias_method :refresh_reasons, :fit_reasons

    def needs_fit?
      fit_reasons.any?
    end

    # True when +refresh_every+ seconds have passed since the last fit.
    def cache_expired?
      return false if refresh_every.nil? || fit_at.nil?

      fit_at < refresh_every.seconds.ago
    end

    # True when the stored sha no longer matches the feature class file.
    def code_changed?
      sha != self.class.compute_sha(feature_class)
    end

    # True when never fit, or the datasource was refreshed after the last fit.
    def datasource_was_refreshed?
      return true if fit_at.nil?
      return false if dataset.datasource.refreshed_at.nil?

      dataset.datasource.refreshed_at > fit_at
    end

    def batchable?
      adapter.respond_to?(:batch) || (batch_size.present? && numeric_primary_key?)
    end

    def should_be_batchable?
      adapter.respond_to?(:batch) || config.dig(:batch_size).present?
    end

    # Samples one row of the raw data and checks that every primary key value
    # is numeric (or a numeric-looking string).
    #
    # @raise [RuntimeError] when no primary key is set but batching is expected
    def numeric_primary_key?
      if primary_key.nil?
        return false unless should_be_batchable?
        raise "Couldn't find primary key for feature #{feature_class}, check your feature class"
      end

      dataset.raw.data(limit: 1, select: primary_key)[primary_key].to_a.flat_map(&:values).all? do |value|
        case value
        when String then value.match?(/\A[-+]?\d+(\.\d+)?\z/)
        else
          value.is_a?(Numeric)
        end
      end
    end

    # @return [Array<Hash>] job arguments: one per batch when batchable,
    #   otherwise a single job covering the whole feature
    def build_batches
      if batchable?
        batch
      else
        [{ feature_id: id }]
      end
    end

    # Splits the id range into +batch_size+-wide, inclusive windows. The range
    # comes from the adapter's own #batch when it defines one, otherwise from
    # the min/max of the first primary key column of the raw data.
    #
    # @return [Array<Hash>] hashes with :feature_id, :batch_start, :batch_end;
    #   empty when there is nothing to batch
    # @raise [RuntimeError] when the primary key is missing or cannot be queried
    def batch
      reader = dataset.raw

      if adapter.respond_to?(:batch)
        array = adapter.batch(reader, self)
        min_id = array.min
        max_id = array.max
      else
        # Checked outside the begin/rescue below: raising it inside would be
        # caught by that rescue, which dereferences primary_key.first.
        unless primary_key.present?
          raise "Couldn't find primary key for feature #{feature_class}, check your feature class"
        end

        # Get all primary keys
        begin
          df = reader.query(select: [primary_key.first])
        rescue => e
          raise "Couldn't find primary key #{primary_key.first} for feature #{feature_class}: #{e.message}"
        end
        return [] if df.nil?

        min_id = df[primary_key.first].min
        max_id = df[primary_key.first].max
      end

      # No ids at all (empty column / empty adapter list): nothing to batch.
      return [] if min_id.nil? || max_id.nil?

      (min_id..max_id).step(batch_size).map do |batch_start|
        batch_end = [batch_start + batch_size, max_id + 1].min - 1
        {
          feature_id: id,
          batch_start: batch_start,
          batch_end: batch_end,
        }
      end
    end

    def wipe
      feature_store.wipe
    end

    # Builds the batches of +features+ and runs them through ComputeFeatureJob,
    # either enqueued as one batch (async) or performed inline in order.
    def fit(features: [self], async: false)
      jobs = features.flat_map(&:build_batches)
      if async
        EasyML::ComputeFeatureJob.enqueue_batch(jobs)
      else
        jobs.each do |job|
          EasyML::ComputeFeatureJob.perform(nil, job)
        end
      end
    end

    # Fit a single batch, used for testing the user's feature implementation.
    # Note: symbolizes the keys of +batch_args+ in place.
    def fit_batch(batch_args = {})
      batch_args.symbolize_keys!
      if batch_args.key?(:batch_start)
        actually_fit_batch(batch_args)
      else
        actually_fit_batch(get_batch_args(**batch_args))
      end
    end

    # Transform a single batch, used for testing the user's feature implementation
    def transform_batch(df = nil, batch_args = {})
      if df.present?
        actually_transform_batch(df)
      else
        actually_transform_batch(build_batch(get_batch_args(**batch_args)))
      end
    end

    # Picks one batch from #build_batches: a random one unless
    # +batch_args[:random]+ is explicitly falsy, in which case the first.
    def get_batch_args(batch_args = {})
      batch_args[:random] = true unless batch_args.key?(:random)
      batch_args[:random] ? build_batches.sample : build_batches.first
    end

    # Queries the raw data for one batch. With both bounds given, rows are
    # filtered to the inclusive primary key window and limited to
    # #needs_columns when any are configured; otherwise everything is read.
    def build_batch(batch_args = {})
      batch_start = batch_args.dig(:batch_start)
      batch_end = batch_args.dig(:batch_end)

      params =
        if batch_start && batch_end
          {
            select: needs_columns.present? ? needs_columns : nil,
            filter: Polars.col(primary_key.first).is_between(batch_start, batch_end),
          }.compact
        else
          {}
        end
      dataset.raw.query(**params)
    end

    # Runs the adapter's #fit for one batch, stores the result and marks the
    # feature as applied. Adapters with their own #batch receive the raw
    # reader; others receive the batch dataframe.
    # Note: symbolizes the keys of +batch_args+ in place.
    #
    # @return [Object, false] the fitted dataframe, or false without #fit
    # @raise [RuntimeError] when #fit returns nothing
    def actually_fit_batch(batch_args = {})
      return false unless adapter.respond_to?(:fit)

      batch_args.symbolize_keys!

      batch_df =
        if adapter.respond_to?(:batch)
          adapter.fit(dataset.raw, self, batch_args)
        else
          adapter.fit(build_batch(batch_args), self, batch_args)
        end
      raise "Feature #{feature_class}#fit must return a dataframe" unless batch_df.present?

      store(batch_df)
      update!(applied_at: Time.current, needs_fit: false)
      batch_df
    end

    # Applies the adapter's #transform to +df+. A fittable feature whose
    # store is still empty returns +df+ untouched.
    def actually_transform_batch(df)
      return nil unless df.present?
      return df if adapter.respond_to?(:fit) && feature_store.empty?

      result = adapter.transform(df, self)
      update!(applied_at: Time.current)
      result
    end

    def compute_sha
      self.class.compute_sha(feature_class)
    end

    # Position manipulation methods
    def insert
      save!
      self
    end

    # Shared positioning logic. Yields the position of the first dataset
    # feature of +feature_class+ (nil when none) so the caller can set its own
    # position, shifts every later feature (all of them when there is no
    # target) down by one, then renumbers and persists the list.
    def insert_where(feature_class)
      features = dataset.features.reload
      target = features.detect { |t| t.feature_class == feature_class.to_s }
      target_position = target&.feature_position
      yield target_position
      features.select { |t| target_position.nil? || t.feature_position > target_position }.each { |t| t.feature_position += 1 }
      features += [self]

      bulk_update_positions(features)
      self
    end

    def prepend
      insert_where(nil) do |_position|
        self.feature_position = 0
      end
    end

    # @raise [ArgumentError] when the dataset has no feature of +feature_class+
    def insert_before(feature_class)
      insert_where(feature_class) do |position|
        raise ArgumentError, "Feature #{feature_class} not found in dataset" if position.nil?

        # Ties with the predecessor's position; order_features resolves the
        # tie by placing this feature after it.
        self.feature_position = position - 1
      end
    end

    # @raise [ArgumentError] when the dataset has no feature of +feature_class+
    def insert_after(feature_class)
      insert_where(feature_class) do |position|
        raise ArgumentError, "Feature #{feature_class} not found in dataset" if position.nil?

        self.feature_position = position + 1
      end
    end

    # Increments +version+ in memory and copies the store from the old
    # version to the new one. Does not save the record.
    def bump_version
      old_version = version
      write_attribute(:version, version + 1)
      feature_store.cp(old_version, version)
      self
    end

    def apply_defaults
      self.name ||= self.feature_class.demodulize.titleize
      self.version ||= 1
    end

    # @return [Array] columns the feature class declares it needs
    def needs_columns
      config.dig(:needs_columns) || []
    end

    def feature_store
      @feature_store ||= EasyML::FeatureStore.new(self)
    end

    # A second, earlier definition calling feature_store.upload was shadowed
    # by this one and has been removed; this is the behaviour that was live.
    def upload_remote_files
      feature_store.upload_remote_files
    end

    def files
      feature_store.list_partitions
    end

    def query(filter: nil)
      feature_store.query(filter: filter)
    end

    def store(df)
      feature_store.store(df)
    end

    # Stored batch size, else the class-configured one, else 10_000 when the
    # feature is expected to be batchable, else nil.
    def batch_size
      read_attribute(:batch_size) ||
        config.dig(:batch_size) ||
        (should_be_batchable? ? 10_000 : nil)
    end

    private

    # Renumbers +features+ and writes the positions through bulk import:
    # persisted rows update feature_position only, new rows are inserted.
    def bulk_update_positions(features)
      # Use activerecord-import for bulk updates
      features = order_features(features)
      features.each(&:apply_defaults)
      new_features = features.reject(&:persisted?)
      existing_features = features.select(&:persisted?)
      Feature.import(
        existing_features,
        on_duplicate_key_update: [:feature_position],
        validate: false,
      )
      Feature.import(new_features)
    end

    # Sorts by position and renumbers from 0. sort_by is not stable, so ties
    # are broken explicitly: the feature being inserted goes after an
    # existing feature holding the same position.
    def order_features(features)
      ordered = features.sort_by { |t| [t.feature_position, t.equal?(self) ? 1 : 0] }
      ordered.each_with_index do |feature, index|
        feature.feature_position = index
      end
    end

    # Appends the feature after the dataset's current last position.
    def set_feature_position
      return if feature_position.present?

      max_feature_position = dataset&.features&.maximum(:feature_position) || -1
      self.feature_position = max_feature_position + 1
    end

    # Stores the current sha and flags the feature for refit when it changed.
    def update_sha
      new_sha = compute_sha
      if new_sha != self.sha
        self.sha = new_sha
        self.needs_fit = true
      end
    end

    # Copies batch_size, primary_key and refresh_every from the feature
    # class's declared configuration onto the record.
    def update_from_feature_class
      configured_batch_size = config.dig(:batch_size)
      if read_attribute(:batch_size) != configured_batch_size
        write_attribute(:batch_size, configured_batch_size)
        self.needs_fit = true
      end

      # Normalise to an array of names, or nil when none is configured, so a
      # missing key is never stored as [nil].
      configured_primary_key = [config.dig(:primary_key)].flatten.compact.presence
      self.primary_key = configured_primary_key if primary_key != configured_primary_key

      if (new_refresh_every = config.dig(:refresh_every))
        self.refresh_every = new_refresh_every.to_i
      end
    end

    # Resolves the implementing class through the feature registry.
    # (An earlier public definition that constantized +feature_class+
    # directly was shadowed by this one; its error contract is kept.)
    #
    # @return [Class]
    # @raise [EasyML::InvalidFeatureError] when the registry has no entry or
    #   the registered name is not a loadable constant
    def feature_klass
      @feature_klass ||= begin
        entry = EasyML::Features::Registry.find(feature_class.to_s)
        klass_name = entry&.dig(:feature_class)
        raise InvalidFeatureError, "Invalid feature class: #{feature_class}" if klass_name.nil?

        begin
          klass_name.constantize
        rescue NameError
          raise InvalidFeatureError, "Invalid feature class: #{feature_class}"
        end
      end
    end

    # First configuration entry declared by the feature class, or an empty
    # hash when it declares none, so callers can always #dig into it.
    def config
      feature_klass.features&.first || {}
    end
  end

  class InvalidFeatureError < StandardError; end
end
@@ -0,0 +1,38 @@
# == Schema Information
#
# Table name: easy_ml_feature_histories
#
#  id                 :bigint           not null, primary key
#  feature_id         :integer          not null
#  dataset_id         :integer          not null
#  name               :string
#  version            :integer
#  feature_class      :string           not null
#  feature_position   :integer
#  batch_size         :integer
#  needs_fit          :boolean
#  sha                :string
#  primary_key        :string
#  applied_at         :datetime
#  fit_at             :datetime
#  refresh_every      :integer
#  created_at         :datetime         not null
#  updated_at         :datetime         not null
#  history_started_at :datetime         not null
#  history_ended_at   :datetime
#  history_user_id    :integer
#  snapshot_id        :string
#
module EasyML
  # History rows for features, stored in easy_ml_feature_histories.
  class FeatureHistory < ActiveRecord::Base
    self.table_name = "easy_ml_feature_histories"
    include Historiographer::History

    after_find :download_remote_files
    scope :ordered, -> { order(:feature_position) }

    # Runs after every find: downloads the feature store when there is one.
    # NOTE(review): feature_store is not defined in this class; presumably it
    # is supplied through Historiographer::History - confirm.
    def download_remote_files
      store = feature_store
      store.download unless store.nil?
    end
  end
end