easy_ml 0.1.4 → 0.2.0.pre.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239)
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -5
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,79 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_events
4
+ #
5
+ # id :bigint not null, primary key
6
+ # name :string not null
7
+ # status :string not null
8
+ # eventable_type :string
9
+ # eventable_id :bigint
10
+ # stacktrace :text
11
+ # created_at :datetime not null
12
+ # updated_at :datetime not null
13
+ #
14
module EasyML
  # Persisted record of something that happened to an "eventable" record
  # (polymorphic, optional). Each event has a name, one of the STATUSES, and,
  # for failures, a formatted stacktrace.
  class Event < ActiveRecord::Base
    # Maximum width of a single line in the stored stacktrace text.
    MAX_LINE_LENGTH = 65
    self.table_name = "easy_ml_events"

    # The only values accepted for `status`.
    STATUSES = %w[started success failed].freeze

    belongs_to :eventable, polymorphic: true, optional: true

    validates :name, presence: true
    validates :status, presence: true, inclusion: { in: STATUSES }

    # Returns the class name of +worker_class+ without its namespace.
    def self.worker_name(worker_class)
      worker_class.to_s.demodulize
    end

    # Scopes to help query events
    scope :for_worker, ->(worker_class) { where(name: worker_name(worker_class)) }
    scope :started, -> { where(status: "started") }
    scope :succeeded, -> { where(status: "success") }
    scope :failed, -> { where(status: "failed") }

    # Creates (and raises on validation failure) an event for +model+.
    #
    # @param model  [Object] record the event is attached to; its demodulized
    #   class name becomes the event name
    # @param status [String] one of STATUSES
    # @param error  [Exception, nil] optional error whose formatted stacktrace
    #   is stored on the event
    def self.create_event(model, status, error = nil)
      EasyML::Event.create!(
        name: model.class.name.demodulize,
        status: status,
        eventable: model,
        stacktrace: format_stacktrace(error),
      )
    end

    # Records a "failed" event for +model+ and logs the failure.
    #
    # @param model [Object] record the failure belongs to
    # @param error [Exception, String] the failure; a String is turned into a
    #   RuntimeError by raising and rescuing it so that it carries a backtrace
    def self.handle_error(model, error)
      if error.is_a?(String)
        begin
          raise error
        rescue StandardError => e
          error = e
        end
      end
      create_event(model, "failed", error)
      # Inside a class method `self.class.name` is always "Class"; log the
      # class of the record that actually failed instead.
      Rails.logger.error("#{model.class.name} failed: #{error.message}")
    end

    # Builds the text stored in `stacktrace`: the error's `inspect` output
    # followed by the backtrace frames that mention "easy_ml", with runs of
    # whitespace collapsed and every line wrapped at MAX_LINE_LENGTH.
    #
    # @param error [Exception, nil]
    # @return [String, nil] nil when no error is given
    def self.format_stacktrace(error)
      return nil if error.nil?

      topline = error.inspect

      # An exception that was never raised has a nil backtrace.
      relevant_frames = Array(error.backtrace).select { |loc| loc.match?(/easy_ml/) }

      [topline, *relevant_frames]
        .flat_map { |chunk| chunk.split("\n") }
        .map { |line| line.gsub(/\s{2,}/, " ").strip }
        .flat_map { |line| wrap_text(line, MAX_LINE_LENGTH) }
        .join("\n")
    end

    # Splits +text+ (stripped) into chunks of at most +max_length+ characters.
    # An empty string yields an empty array.
    def self.wrap_text(text, max_length)
      text.strip.scan(/.{1,#{max_length}}/)
    end
  end
end
@@ -0,0 +1,437 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_features
4
+ #
5
+ # id :bigint not null, primary key
6
+ # dataset_id :bigint not null
7
+ # name :string
8
+ # version :bigint
9
+ # feature_class :string not null
10
+ # feature_position :integer
11
+ # batch_size :integer
12
+ # needs_fit :boolean
13
+ # sha :string
14
+ # primary_key :string is an Array
15
+ # applied_at :datetime
16
+ # fit_at :datetime
17
+ # refresh_every :bigint
18
+ # created_at :datetime not null
19
+ # updated_at :datetime not null
20
+ #
21
module EasyML
  # A feature attached to a dataset. The record stores bookkeeping (position,
  # version, sha of the implementing file, fit/apply timestamps) while the
  # actual computation is delegated to an "adapter": an instance of the class
  # named by `feature_class`. Adapters may optionally respond to `fit` and
  # `batch`; `transform` is called on them when transforming a batch.
  class Feature < ActiveRecord::Base
    self.table_name = "easy_ml_features"
    include Historiographer::Silent
    historiographer_mode :snapshot_only

    class << self
      # SHA-256 hex digest of the source file that defines `transform` for the
      # class named +feature_class+. The digest is cached in Rails.cache under
      # a key derived from the file path and is recomputed whenever the file's
      # mtime differs from the cached one.
      #
      # @param feature_class [String] constantizable class name
      # @return [String] hex digest
      def compute_sha(feature_class)
        require "digest"
        path = feature_class.constantize.instance_method(:transform).source_location.first
        current_mtime = File.mtime(path)
        cache_key = "feature_sha/#{path}"

        cached = Rails.cache.read(cache_key)
        return cached[:sha] if cached && cached[:mtime] == current_mtime

        # Compute new SHA and cache it with the current mtime
        sha = Digest::SHA256.hexdigest(File.read(path))
        Rails.cache.write(cache_key, { sha: sha, mtime: current_mtime })
        sha
      end

      # Removes every cached sha written by compute_sha.
      def clear_sha_cache!
        Rails.cache.delete_matched("feature_sha/*")
      end
    end

    belongs_to :dataset, class_name: "EasyML::Dataset"

    validates :feature_class, presence: true
    validates :feature_position, presence: true, numericality: { only_integer: true, greater_than_or_equal_to: 0 }
    before_validation :set_feature_position, on: :create

    scope :ordered, -> { order(feature_position: :asc) }

    # Features whose adapter responds to `fit` and that are either flagged
    # needs_fit or whose stored sha is missing / differs from the current sha
    # of their feature class.
    scope :has_changes, lambda {
      # One SQL condition per distinct feature class
      conditions = pluck(:feature_class).uniq.map do |klass|
        current_sha = compute_sha(klass)
        sanitize_sql_array(["(feature_class = ? AND (sha IS NULL OR sha != ?))", klass, current_sha])
      end

      # Combine all conditions with OR, then keep only fittable features
      candidates = where(needs_fit: true).or(where(conditions.join(" OR ")))
      where(id: candidates.select { |f| f.adapter.respond_to?(:fit) }.map(&:id))
    }
    scope :never_applied, -> { where(applied_at: nil) }
    # Features with no fit_at whose adapter responds to `fit`.
    scope :never_fit, lambda {
      fittable_ids = where(fit_at: nil).select { |f| f.adapter.respond_to?(:fit) }.map(&:id)
      where(id: fittable_ids)
    }
    scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }

    before_save :apply_defaults, if: :new_record?
    before_save :update_sha
    after_find :update_from_feature_class
    before_save :update_from_feature_class

    # Memoized instance of the feature's implementing class.
    def adapter
      @adapter ||= feature_klass.new
    end

    # Human-readable reasons why this feature should be (re)fit.
    #
    # @return [Array<String>] empty when the adapter does not respond to `fit`
    def fit_reasons
      return [] unless adapter.respond_to?(:fit)

      {
        "Needs fit manually set" => read_attribute(:needs_fit),
        "Datasource was refreshed" => datasource_was_refreshed?,
        "Code changed" => code_changed?,
        "Cache expired" => cache_expired?,
      }.select { |_reason, applies| applies }.keys
    end

    alias_method :refresh_reasons, :fit_reasons

    # True when at least one fit reason applies.
    def needs_fit?
      fit_reasons.any?
    end

    # True when both refresh_every and fit_at are set and fit_at is older than
    # refresh_every seconds.
    def cache_expired?
      return false if refresh_every.nil? || fit_at.nil?

      fit_at < refresh_every.seconds.ago
    end

    # True when the stored sha differs from the current sha of the feature class.
    def code_changed?
      sha != self.class.compute_sha(feature_class)
    end

    # True when the feature was never fit, or the dataset's datasource reports
    # a refreshed_at later than fit_at.
    def datasource_was_refreshed?
      return true if fit_at.nil?

      refreshed_at = dataset.datasource.refreshed_at
      return false if refreshed_at.nil?

      refreshed_at > fit_at
    end

    # True when work can be split into batches: either the adapter supplies
    # its own `batch`, or a batch size is available and the primary key is numeric.
    def batchable?
      adapter.respond_to?(:batch) || (batch_size.present? &&
                                      numeric_primary_key?)
    end

    # True when the adapter supplies `batch` or the feature config sets a batch size.
    def should_be_batchable?
      adapter.respond_to?(:batch) || config.dig(:batch_size).present?
    end

    # Checks a one-row sample of the raw dataset and reports whether every
    # primary key value is numeric (or a numeric-looking string).
    #
    # @raise [RuntimeError] when no primary key is set but the feature should be batchable
    def numeric_primary_key?
      if primary_key.nil?
        return false unless should_be_batchable?

        raise "Couldn't find primary key for feature #{feature_class}, check your feature class"
      end

      dataset.raw.data(limit: 1, select: primary_key)[primary_key].to_a.flat_map(&:values).all? do |value|
        case value
        when String then value.match?(/\A[-+]?\d+(\.\d+)?\z/)
        else
          value.is_a?(Numeric)
        end
      end
    end

    # Job descriptors for fitting this feature: the ranges from `batch` when
    # batchable, otherwise a single descriptor carrying only the feature id.
    def build_batches
      if batchable?
        batch
      else
        [{ feature_id: id }]
      end
    end

    # Splits the id range into consecutive batch descriptors of `batch_size`.
    # The range comes from the adapter's own `batch` when it has one,
    # otherwise from the min/max of the first primary key column.
    #
    # @return [Array<Hash>] hashes with :feature_id, :batch_start, :batch_end;
    #   empty when there is no data to derive a range from
    # @raise [RuntimeError] when the primary key is missing or cannot be queried
    def batch
      reader = dataset.raw

      if adapter.respond_to?(:batch)
        ids = adapter.batch(reader, self)
        min_id = ids.min
        max_id = ids.max
      else
        # Checked outside the begin/rescue below so the message is not
        # re-wrapped and `primary_key.first` is never called on nil.
        unless primary_key.present?
          raise "Couldn't find primary key for feature #{feature_class}, check your feature class"
        end

        key = primary_key.first
        begin
          df = reader.query(select: [key])
        rescue => e
          raise "Couldn't find primary key #{key} for feature #{feature_class}: #{e.message}"
        end
        return [] if df.nil?

        min_id = df[key].min
        max_id = df[key].max
      end
      # No ids (e.g. no rows): nothing to batch.
      return [] if min_id.nil? || max_id.nil?

      (min_id..max_id).step(batch_size).map do |batch_start|
        batch_end = [batch_start + batch_size, max_id + 1].min - 1
        {
          feature_id: id,
          batch_start: batch_start,
          batch_end: batch_end,
        }
      end
    end

    # Delegates to the feature store's `wipe`.
    def wipe
      feature_store.wipe
    end

    # Builds batches for +features+ and either enqueues them (async) or runs
    # EasyML::ComputeFeatureJob.perform for each one inline.
    def fit(features: [self], async: false)
      jobs = features.flat_map(&:build_batches)
      if async
        EasyML::ComputeFeatureJob.enqueue_batch(jobs)
      else
        jobs.each { |job| EasyML::ComputeFeatureJob.perform(nil, job) }
      end
    end

    # Fit a single batch, used for testing the user's feature implementation.
    # When +batch_args+ has no :batch_start, a batch is picked via get_batch_args.
    def fit_batch(batch_args = {})
      batch_args = batch_args.symbolize_keys
      batch_args = get_batch_args(batch_args) unless batch_args.key?(:batch_start)
      actually_fit_batch(batch_args)
    end

    # Transform a single batch, used for testing the user's feature implementation.
    # Uses +df+ when present, otherwise builds a batch picked via get_batch_args.
    def transform_batch(df = nil, batch_args = {})
      df = build_batch(get_batch_args(batch_args)) unless df.present?
      actually_transform_batch(df)
    end

    # Picks one batch descriptor from build_batches: a random one unless
    # +batch_args+ has a falsy :random, in which case the first. The argument
    # is not modified.
    #
    # @return [Hash, nil] nil when there are no batches
    def get_batch_args(batch_args = {})
      batches = build_batches
      batch_args.fetch(:random, true) ? batches.sample : batches.first
    end

    # Queries the raw dataset for one batch. With both :batch_start and
    # :batch_end, rows are filtered to that primary-key range and limited to
    # `needs_columns` when any are configured; otherwise the query is unrestricted.
    def build_batch(batch_args = {})
      batch_start = batch_args.dig(:batch_start)
      batch_end = batch_args.dig(:batch_end)

      if batch_start && batch_end
        select = needs_columns.present? ? needs_columns : nil
        filter = Polars.col(primary_key.first).is_between(batch_start, batch_end)
        params = {
          select: select,
          filter: filter,
        }.compact
      else
        params = {}
      end
      dataset.raw.query(**params)
    end

    # Runs the adapter's `fit` for one batch, stores the result and marks the
    # feature as applied. Adapters that define `batch` receive the raw dataset
    # reader; all others receive the dataframe built for the batch.
    #
    # @return [Object, false] the value returned by the adapter's `fit`, or
    #   false when the adapter has no `fit`
    # @raise [RuntimeError] when `fit` returns a blank value
    def actually_fit_batch(batch_args = {})
      return false unless adapter.respond_to?(:fit)

      batch_args = batch_args.symbolize_keys
      source = adapter.respond_to?(:batch) ? dataset.raw : build_batch(batch_args)
      batch_df = adapter.fit(source, self, batch_args)
      raise "Feature #{feature_class}#fit must return a dataframe" unless batch_df.present?

      store(batch_df)
      update!(applied_at: Time.current, needs_fit: false)
      batch_df
    end

    # Runs the adapter's `transform` on +df+ and stamps applied_at. Returns nil
    # for a blank +df+, and returns +df+ untouched when the adapter is fittable
    # but the feature store is still empty.
    def actually_transform_batch(df)
      return nil unless df.present?
      return df if adapter.respond_to?(:fit) && feature_store.empty?

      result = adapter.transform(df, self)
      update!(applied_at: Time.current)
      result
    end

    # Current sha of this feature's implementing file.
    def compute_sha
      self.class.compute_sha(feature_class)
    end

    # Position manipulation methods

    # Saves the feature (raising on failure) and returns it.
    def insert
      save!
      self
    end

    # Inserts this feature relative to the dataset feature whose class is
    # +feature_class+. The block receives the target's position (nil when no
    # such feature exists) and is expected to set this feature's position;
    # features positioned after the target (or all of them when there is no
    # target) are then shifted by one and every position is renumbered and persisted.
    def insert_where(feature_class)
      features = dataset.features.reload.to_a
      target = features.detect { |t| t.feature_class == feature_class.to_s }
      target_position = target&.feature_position
      yield target_position
      features.select { |t| target_position.nil? || t.feature_position > target_position }.each { |t| t.feature_position += 1 }
      features += [self]

      bulk_update_positions(features)
      self
    end

    # Makes this feature the first one in its dataset.
    def prepend
      insert_where(nil) do |_position|
        self.feature_position = 0
      end
    end

    # Places this feature directly before the feature of class +feature_class+.
    #
    # @raise [ArgumentError] when the dataset has no feature of that class
    def insert_before(feature_class)
      insert_where(feature_class) do |position|
        raise ArgumentError, "Couldn't find feature #{feature_class} to insert before" if position.nil?

        # Ties with the feature currently at position - 1; order_features
        # resolves the tie in favour of the existing feature.
        self.feature_position = position - 1
      end
    end

    # Places this feature directly after the feature of class +feature_class+.
    #
    # @raise [ArgumentError] when the dataset has no feature of that class
    def insert_after(feature_class)
      insert_where(feature_class) do |position|
        raise ArgumentError, "Couldn't find feature #{feature_class} to insert after" if position.nil?

        self.feature_position = position + 1
      end
    end

    # Increments `version` in memory and asks the feature store to copy the
    # old version to the new one. Does not save the record.
    def bump_version
      old_version = version
      write_attribute(:version, version + 1)
      feature_store.cp(old_version, version)
      self
    end

    # Fills in name (titleized, demodulized feature class) and version (1)
    # when they are not set.
    def apply_defaults
      self.name ||= self.feature_class.demodulize.titleize
      self.version ||= 1
    end

    # Columns the feature config declares it needs; empty when none.
    def needs_columns
      config.dig(:needs_columns) || []
    end

    # Memoized EasyML::FeatureStore for this feature.
    def feature_store
      @feature_store ||= EasyML::FeatureStore.new(self)
    end

    # Delegates to the feature store's `upload_remote_files`.
    # NOTE(review): the original file defined this method twice; the earlier
    # definition called `feature_store.upload` and was overridden by this one.
    # Confirm which FeatureStore method is intended.
    def upload_remote_files
      feature_store.upload_remote_files
    end

    # Partitions listed by the feature store.
    def files
      feature_store.list_partitions
    end

    # Queries the feature store, optionally with +filter+.
    def query(filter: nil)
      feature_store.query(filter: filter)
    end

    # Stores +df+ in the feature store.
    def store(df)
      feature_store.store(df)
    end

    # Batch size from the column, else the feature config, else 10_000 when
    # the feature should be batchable, else nil.
    def batch_size
      read_attribute(:batch_size) ||
        config.dig(:batch_size) ||
        (should_be_batchable? ? 10_000 : nil)
    end

    private

    # Renumbers +features+ and persists them: existing rows get their
    # feature_position updated through import (validations skipped), new rows
    # are imported.
    def bulk_update_positions(features)
      # Use activerecord-import for bulk updates
      features = order_features(features)
      features.each(&:apply_defaults)
      existing_features, new_features = features.partition(&:persisted?)
      Feature.import(
        existing_features,
        on_duplicate_key_update: [:feature_position],
        validate: false,
      )
      Feature.import(new_features)
    end

    # Sorts +features+ by position and renumbers them 0..n-1. Ties keep their
    # original relative order (sort_by alone is not stable), so a feature
    # appended last sorts after an existing feature with the same position.
    def order_features(features)
      features
        .each_with_index
        .sort_by { |feature, original_index| [feature.feature_position, original_index] }
        .map(&:first)
        .each_with_index { |feature, index| feature.feature_position = index }
    end

    # Defaults feature_position to one past the dataset's current maximum
    # (0 when there is none).
    def set_feature_position
      return if feature_position.present?

      max_feature_position = dataset&.features&.maximum(:feature_position) || -1
      self.feature_position = max_feature_position + 1
    end

    # Refreshes the stored sha and flags the feature for fitting when the
    # implementing file changed.
    def update_sha
      new_sha = compute_sha
      return if new_sha == sha

      self.sha = new_sha
      self.needs_fit = true
    end

    # Syncs batch_size, primary_key and refresh_every from the feature config.
    # A changed batch size also flags the feature for fitting.
    def update_from_feature_class
      if read_attribute(:batch_size) != config.dig(:batch_size)
        write_attribute(:batch_size, config.dig(:batch_size))
        self.needs_fit = true
      end

      # Normalise to an array of names, or nil when none is configured, so a
      # missing key is never stored as [nil].
      configured_key = [config.dig(:primary_key)].flatten.compact.presence
      self.primary_key = configured_key if primary_key != configured_key

      if (new_refresh_every = config.dig(:refresh_every))
        self.refresh_every = new_refresh_every.to_i
      end
    end

    # Resolves the implementing class through EasyML::Features::Registry and
    # memoizes it.
    #
    # @raise [InvalidFeatureError] when the registry has no entry for
    #   feature_class or the registered class name cannot be resolved
    def feature_klass
      @feature_klass ||= begin
        entry = EasyML::Features::Registry.find(feature_class.to_s)
        raise InvalidFeatureError, "Invalid feature class: #{feature_class}" if entry.nil?

        entry.dig(:feature_class).constantize
      end
    rescue NameError
      raise InvalidFeatureError, "Invalid feature class: #{feature_class}"
    end

    # First entry of the implementing class's `features` list, used as this
    # feature's configuration.
    def config
      raise "Feature not found: #{feature_class}" unless feature_klass

      feature_klass.features&.first
    end
  end

  # Raised when a feature's feature_class cannot be resolved to a class.
  class InvalidFeatureError < StandardError; end
end
@@ -0,0 +1,38 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_feature_histories
4
+ #
5
+ # id :bigint not null, primary key
6
+ # feature_id :integer not null
7
+ # dataset_id :integer not null
8
+ # name :string
9
+ # version :integer
10
+ # feature_class :string not null
11
+ # feature_position :integer
12
+ # batch_size :integer
13
+ # needs_fit :boolean
14
+ # sha :string
15
+ # primary_key :string
16
+ # applied_at :datetime
17
+ # fit_at :datetime
18
+ # refresh_every :integer
19
+ # created_at :datetime not null
20
+ # updated_at :datetime not null
21
+ # history_started_at :datetime not null
22
+ # history_ended_at :datetime
23
+ # history_user_id :integer
24
+ # snapshot_id :string
25
+ #
26
module EasyML
  # History table model for features (table "easy_ml_feature_histories"),
  # wired up through Historiographer::History.
  class FeatureHistory < ActiveRecord::Base
    include Historiographer::History
    self.table_name = "easy_ml_feature_histories"

    scope :ordered, -> { order(feature_position: :asc) }

    # Every time a history row is loaded, ask its feature store (when there
    # is one) to download.
    after_find :download_remote_files

    # Calls `download` on the feature store; a nil feature store is skipped.
    def download_remote_files
      store = feature_store
      store&.download
    end
  end
end
+ end