easy_ml 0.1.4 → 0.2.0.pre.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239)
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -5
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,56 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_dataset_histories
4
+ #
5
+ # id :bigint not null, primary key
6
+ # dataset_id :integer not null
7
+ # name :string not null
8
+ # description :string
9
+ # dataset_type :string
10
+ # status :string
11
+ # version :string
12
+ # datasource_id :integer
13
+ # root_dir :string
14
+ # configuration :json
15
+ # num_rows :integer
16
+ # workflow_status :string
17
+ # statistics :json
18
+ # preprocessor_statistics :json
19
+ # schema :json
20
+ # refreshed_at :datetime
21
+ # created_at :datetime not null
22
+ # updated_at :datetime not null
23
+ # history_started_at :datetime not null
24
+ # history_ended_at :datetime
25
+ # history_user_id :integer
26
+ # snapshot_id :string
27
+ #
28
module EasyML
  # ActiveRecord model backed by the easy_ml_dataset_histories table.
  # Mixes in Historiographer::History and answers a few dataset-style
  # predicates with fixed values.
  class DatasetHistory < ActiveRecord::Base
    self.table_name = "easy_ml_dataset_histories"
    include Historiographer::History

    # Column history rows that share this record's dataset_id and were
    # captured under the same snapshot_id.
    has_many(
      :columns,
      ->(history) { where(snapshot_id: history.snapshot_id) },
      class_name: "EasyML::ColumnHistory",
      foreign_key: "dataset_id",
      primary_key: "dataset_id",
      extend: EasyML::ColumnList
    )

    # @return [String, nil] the root_dir attribute exactly as stored
    def root_dir
      read_attribute(:root_dir)
    end

    # Always false for a history record.
    def fit
      false
    end

    # Always true for a history record.
    def processed?
      true
    end

    # Always false for a history record.
    def should_split?
      false
    end
  end
end
@@ -0,0 +1,182 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_datasources
4
+ #
5
+ # id :bigint not null, primary key
6
+ # name :string not null
7
+ # datasource_type :string
8
+ # root_dir :string
9
+ # configuration :json
10
+ # refreshed_at :datetime
11
+ # created_at :datetime not null
12
+ # updated_at :datetime not null
13
+ #
14
module EasyML
  # ActiveRecord model backed by the easy_ml_datasources table.
  #
  # The record itself stores name, type, root_dir and a JSON configuration;
  # all data access is forwarded to a type-specific adapter object chosen
  # from DATASOURCE_OPTIONS via +datasource_type+.
  class Datasource < ActiveRecord::Base
    self.table_name = "easy_ml_datasources"
    include Historiographer::Silent
    historiographer_mode :snapshot_only
    include EasyML::Concerns::Configurable

    # datasource_type => adapter class name (constantized lazily in #adapter).
    DATASOURCE_OPTIONS = {
      "s3" => "EasyML::Datasources::S3Datasource",
      "file" => "EasyML::Datasources::FileDatasource",
      "polars" => "EasyML::Datasources::PolarsDatasource",
    }
    # Value/label/description triples describing each datasource type.
    DATASOURCE_TYPES = [
      {
        value: "s3",
        label: "Amazon S3",
        description: "Connect to data stored in Amazon Simple Storage Service (S3) buckets",
      },
      {
        value: "file",
        label: "Local Files",
        description: "Connect to data stored in local files",
      },
      {
        value: "polars",
        label: "Polars DataFrame",
        description: "In-memory dataframe storage using Polars",
      },
    ].freeze
    DATASOURCE_NAMES = DATASOURCE_OPTIONS.keys.freeze
    DATASOURCE_CONSTANTS = DATASOURCE_OPTIONS.values.map(&:constantize)

    validates :name, presence: true
    validates :datasource_type, presence: true
    validates :datasource_type, inclusion: { in: DATASOURCE_NAMES }
    # validate :validate_datasource_exists

    before_save :set_root_dir
    # NOTE(review): after_initialize (when persisted) and after_find both
    # appear to fire when a record is loaded, so the adapter configuration is
    # probably read twice per load — confirm whether one callback can go.
    after_initialize :read_adapter_from_configuration, if: :persisted?
    after_find :read_adapter_from_configuration
    before_save :store_adapter_in_configuration
    after_create :refresh_async

    has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy
    attr_accessor :schema, :columns, :num_rows, :is_syncing

    add_configuration_attributes :schema, :columns, :num_rows, :polars_args, :verbose, :is_syncing
    # Also expose every configuration attribute declared by the adapter classes.
    DATASOURCE_CONSTANTS.flat_map(&:configuration_attributes).each do |attribute|
      add_configuration_attributes attribute
    end

    delegate :query, :in_batches, :files, :all_files, :last_updated_at, :data, :needs_refresh?,
             :should_sync?, :files_to_sync, :s3_access_key_id, :s3_secret_access_key,
             :download_file, :clean, to: :adapter

    # @return [Hash] the datasource type list plus the S3 adapter's constants
    def self.constants
      {
        DATASOURCE_TYPES: DATASOURCE_TYPES,
        s3: EasyML::Datasources::S3Datasource.constants,
      }
    end

    # Asks the adapter to convert its files to parquet when it supports that.
    #
    # @param columns [Object, nil] passed through to the adapter unchanged
    # @return [false, Object] false when the adapter cannot convert,
    #   otherwise whatever the adapter returns
    def reread(columns = nil)
      return false unless adapter.respond_to?(:convert_to_parquet)

      adapter.convert_to_parquet(columns)
    end

    # Existing CSV files known to the adapter, with the Rails root path
    # removed from each path.
    #
    # The root is removed as a literal string; building a Regexp from the
    # raw path would misinterpret characters such as "." or "+" in it.
    #
    # @return [Array<String>]
    def available_files
      rails_root = Rails.root.to_s
      all_files
        .select { |f| File.exist?(f) && File.extname(f) == ".csv" }
        .map { |f| f.gsub(rails_root, "") }
    end

    # @return [Boolean] true when this is a "polars" datasource
    def in_memory?
      datasource_type == "polars"
    end

    # @return [String, Object] the stored root_dir when present, otherwise
    #   the default directory derived from the name
    def root_dir
      persisted = read_attribute(:root_dir)
      return persisted if persisted.present?

      default_root_dir
    end

    # Marks the record as syncing and enqueues EasyML::SyncDatasourceJob.
    # Runs automatically after create.
    def refresh_async
      update(is_syncing: true)
      EasyML::SyncDatasourceJob.perform_later(id)
    end

    # Hook run before a sync: flags the record as syncing, then lets the
    # adapter prepare.
    def before_sync
      update!(is_syncing: true)
      adapter.before_sync
      Rails.logger.info("Starting sync for datasource #{id}")
    end

    # Hook run after a sync: lets the adapter finish, then records the
    # schema, columns, row count and refresh time from the adapter's data.
    #
    # @return [Boolean] result of #save
    def after_sync
      adapter.after_sync
      frame = data # fetched once; every statistic below comes from the same frame
      self.schema = frame.schema.each_with_object({}) do |(column, dtype), types|
        types[column] = EasyML::Data::PolarsColumn.polars_to_sym(dtype)
      end
      self.columns = frame.columns
      self.num_rows = frame.shape[0]
      self.is_syncing = false
      self.refreshed_at = Time.now
      save
    end

    # Refreshes through the adapter only when it reports needing it;
    # otherwise just clears the syncing flag.
    def refresh
      unless adapter.needs_refresh?
        update!(is_syncing: false)
        return
      end

      syncing do
        adapter.refresh
      end
    end

    # Refreshes through the adapter unconditionally.
    def refresh!
      syncing do
        adapter.refresh!
      end
    end

    # Runs the given block between #before_sync and #after_sync.
    #
    # @yield the work to perform while syncing
    # @return [Object] the block's return value
    def syncing
      before_sync
      yield.tap do
        after_sync
      end
    end

    private

    # Lazily builds the adapter for this record's datasource_type.
    #
    # @raise [RuntimeError] when the type has no entry in DATASOURCE_OPTIONS
    def adapter
      @adapter ||= begin
        adapter_class = DATASOURCE_OPTIONS[datasource_type]
        raise "Don't know how to use datasource adapter #{datasource_type}!" unless adapter_class.present?

        adapter_class.constantize.new(self)
      end
    end

    # Directory under the engine root named after the datasource: the name
    # is split on whitespace, joined with underscores and downcased.
    def default_root_dir
      folder = name.split.join("_").downcase
      EasyML::Engine.root_dir.join("datasources").join(folder)
    end

    # Persists the default root_dir when none was stored explicitly.
    def set_root_dir
      write_attribute(:root_dir, default_root_dir) unless read_attribute(:root_dir).present?
    end

    # Lets adapters that support it restore state from the configuration.
    def read_adapter_from_configuration
      return unless persisted?

      adapter.read_from_configuration if adapter.respond_to?(:read_from_configuration)
    end

    # Lets adapters that support it write state into the configuration.
    def store_adapter_in_configuration
      adapter.store_in_configuration if adapter.respond_to?(:store_in_configuration)
    end

    # Validation helper (currently not registered, see the commented-out
    # `validate` line above).
    def validate_datasource_exists
      return if adapter.exists?

      errors.add(:root_dir, adapter.error_not_exists)
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_datasource_histories
4
+ #
5
+ # id :bigint not null, primary key
6
+ # datasource_id :integer not null
7
+ # name :string not null
8
+ # datasource_type :string
9
+ # root_dir :string
10
+ # configuration :json
11
+ # refreshed_at :datetime
12
+ # created_at :datetime not null
13
+ # updated_at :datetime not null
14
+ # history_started_at :datetime not null
15
+ # history_ended_at :datetime
16
+ # history_user_id :integer
17
+ # snapshot_id :string
18
+ #
19
module EasyML
  # ActiveRecord model backed by the easy_ml_datasource_histories table.
  # All behaviour comes from the Historiographer::History mixin.
  class DatasourceHistory < ActiveRecord::Base
    self.table_name = "easy_ml_datasource_histories"
    include Historiographer::History
  end
end
@@ -0,0 +1,54 @@
1
module EasyML
  module Datasources
    # Common base for datasource adapters. An adapter wraps the owning
    # datasource record and implements the data-access methods that the
    # record forwards to it. Subclasses override the methods that raise
    # NotImplementedError here.
    class BaseDatasource
      include ActiveModel::Validations
      include EasyML::Concerns::Configurable

      # @return [Object] the datasource record this adapter was built for
      attr_reader :datasource

      # @param datasource [Object] the owning datasource record
      def initialize(datasource)
        @datasource = datasource
      end

      # Optional hooks; no-ops unless a subclass overrides them.
      def clean; end

      def before_sync; end

      def after_sync; end

      # @raise [NotImplementedError] unless overridden
      def query(*)
        raise NotImplementedError
      end

      # @raise [NotImplementedError] unless overridden
      def in_batches(*)
        raise NotImplementedError
      end

      # @raise [NotImplementedError] unless overridden
      def files
        raise NotImplementedError
      end

      # @raise [NotImplementedError] unless overridden
      def last_updated_at
        raise NotImplementedError
      end

      # @raise [NotImplementedError] unless overridden
      def data
        raise NotImplementedError
      end

      # @return [Boolean] false by default; adapters with a remote source override this
      def needs_refresh?
        false
      end

      # Default refresh: nothing to fetch.
      #
      # The owning record's #refresh / #refresh! already run the adapter
      # refresh inside `syncing`; re-entering `datasource.syncing` from here
      # made the before/after sync hooks run twice.
      #
      # @return [nil]
      def refresh; end

      # Forced refresh; same as #refresh unless overridden.
      def refresh!
        refresh
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
module EasyML
  module Datasources
    # Adapter for datasources whose files live under the datasource's
    # root_dir; reading is handled by an EasyML::Data::PolarsReader.
    class FileDatasource < BaseDatasource
      delegate :query, :convert_to_parquet, to: :reader

      # Normalizes the reader once a sync has finished.
      def after_sync
        reader.normalize
      end

      # Yields batches from the reader to the given block.
      def in_batches(&block)
        reader.in_batches(&block)
      end

      # @return [Object] every file reported by the reader
      def all_files
        reader.all_files
      end

      # @return [Object] the reader's file list
      def files
        reader.files
      end

      # @return [Time, nil] the newest modification time among #files
      def last_updated_at
        files.map(&File.method(:mtime)).max
      end

      # Local files never require a refresh.
      def needs_refresh?
        false
      end

      # All reader batches stacked into one frame. The result is reused on
      # later calls when it is present.
      def data
        return @combined_df if @combined_df.present?

        stacked = nil
        reader.in_batches do |batch|
          if stacked.nil?
            stacked = batch
          else
            stacked = stacked.vstack(batch)
          end
        end
        @combined_df = stacked
      end

      # @return [Boolean] true when any csv or parquet file exists under root_dir
      def exists?
        pattern = File.join(datasource.root_dir, "**/*.{csv,parquet}")
        !Dir.glob(pattern).empty?
      end

      # @return [String] message used when no datasource files are found
      def error_not_exists
        "Expected to find datasource files at #{datasource.root_dir}"
      end

      private

      # Memoized reader rooted at the datasource directory, configured with
      # the "polars_args" entry of the datasource configuration (if any).
      def reader
        @reader ||= begin
          settings = datasource.configuration || {}
          EasyML::Data::PolarsReader.new(
            root_dir: datasource.root_dir,
            polars_args: settings.dig("polars_args"),
          )
        end
      end
    end
  end
end
@@ -0,0 +1,89 @@
1
module EasyML
  module Datasources
    # Adapter for an in-memory dataframe held on the datasource record
    # (`datasource.df`). The frame is serialized into the record's
    # configuration under the "df" key and rebuilt from it on load.
    class PolarsDatasource < BaseDatasource
      validates :df, presence: true
      add_configuration_attributes :df

      # Returns a transformed copy of the dataframe.
      #
      # Steps are applied in this order: filter, select, unique, drop,
      # sort, limit. Only columns that exist in the frame are dropped.
      #
      # @return [Object, nil] the resulting frame, or nil when there is no dataframe
      def query(drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false)
        return if df.nil?

        frame = df.clone
        frame = frame.filter(filter) if filter
        frame = frame.select(select) if select.present?
        frame = frame.unique if unique
        droppable = drop_cols & frame.columns
        frame = frame.drop(droppable) unless droppable.empty?
        frame = frame.sort(sort, reverse: descending) if sort
        frame = frame.limit(limit) if limit
        frame
      end

      # Yields consecutive slices of at most +of+ rows.
      #
      # @param of [Integer] maximum rows per slice
      def in_batches(of: 10_000)
        row_count = df.shape.first
        (0...row_count).step(of) do |offset|
          length = [of, row_count - offset].min
          yield df.slice(offset, length)
        end
      end

      # An in-memory frame has no backing files.
      def all_files
        []
      end

      def files
        []
      end

      # @return [Object] the datasource record's updated_at
      def last_updated_at
        datasource.updated_at
      end

      # @return [Object, nil] the dataframe itself
      def data
        df
      end

      # @return [Object, nil] the dataframe stored on the datasource record
      def df
        datasource.df
      end

      # @return [Boolean] true when a dataframe is present
      def exists?
        df.present?
      end

      # @return [String] message used when the dataframe is missing
      def error_not_exists
        "Must have a dataframe"
      end

      # Writes the dataframe, round-tripped through JSON, into the
      # configuration under "df". Does nothing without a dataframe.
      def store_in_configuration
        return unless df

        existing = datasource.configuration || {}
        datasource.configuration = existing.merge(
          "df" => JSON.parse(df.write_json),
        )
      end

      # Rebuilds the dataframe from the "df" entry of the configuration,
      # one series per serialized column. Does nothing when the entry is
      # missing, blank, or has no "columns" key.
      def read_from_configuration
        return unless datasource.configuration&.key?("df")

        df_data = datasource.configuration["df"]
        return unless df_data.present? && df_data.key?("columns")

        series = df_data["columns"].map do |col|
          Polars::Series.new(col["name"], col["values"], dtype: dtype_for(col["datatype"]))
        end

        datasource.df = Polars::DataFrame.new(series)
      end

      private

      # Resolves a serialized datatype to a Polars dtype.
      #
      # Hash datatypes map to the Datetime class when they carry a
      # "Datetime" entry and to Utf8 otherwise; anything else is looked up
      # as a constant on Polars.
      def dtype_for(datatype)
        if datatype.is_a?(Hash)
          if datatype["Datetime"]
            # NOTE(review): `.class` discards the time unit just passed to
            # the constructor — confirm that is intended.
            Polars::Datetime.new(datatype["Datetime"][0].downcase.to_sym).class
          else
            Polars::Utf8
          end
        else
          # NOTE(review): const_get on a stored value; presumably only ever
          # written by #store_in_configuration — verify.
          Polars.const_get(datatype)
        end
      end
    end
  end
end
@@ -0,0 +1,97 @@
1
module EasyML
  module Datasources
    # Adapter for datasources stored in an S3 bucket. File listing,
    # syncing and reading are handled by an EasyML::Data::SyncedDirectory
    # rooted at the datasource's root_dir.
    class S3Datasource < BaseDatasource
      # Region choices as value/label pairs.
      REGIONS = [
        { value: "us-east-1", label: "US East (N. Virginia)" },
        { value: "us-east-2", label: "US East (Ohio)" },
        { value: "us-west-1", label: "US West (N. California)" },
        { value: "us-west-2", label: "US West (Oregon)" },
      ].freeze

      # @return [Hash] the region list keyed by :S3_REGIONS
      def self.constants
        { S3_REGIONS: REGIONS }
      end

      validates :s3_bucket, :s3_access_key_id, :s3_secret_access_key, presence: true

      add_configuration_attributes :s3_bucket, :s3_prefix, :s3_region, :cache_for

      delegate :query, :data, :s3_access_key_id, :s3_secret_access_key, :before_sync, :after_sync, :clean,
               to: :synced_directory

      # Yields batches from the synced directory to the given block.
      def in_batches(&block)
        synced_directory.in_batches(&block)
      end

      def all_files
        synced_directory.all_files
      end

      def files
        synced_directory.files
      end

      def last_updated_at
        synced_directory.last_updated_at
      end

      # @return [Boolean] whatever the synced directory's should_sync? reports
      def needs_refresh?
        synced_directory.should_sync?
      end

      # Syncs the local directory with S3.
      def refresh
        synced_directory.sync
      end

      # Syncs through the synced directory's bang variant.
      def refresh!
        synced_directory.sync!
      end

      def files_to_sync
        synced_directory.files_to_sync
      end

      # @param file [Object] passed through to the synced directory
      def download_file(file)
        synced_directory.download_file(file)
      end

      # @return [Boolean] true when the synced directory lists any file to sync
      def exists?
        synced_directory.files_to_sync.any?
      end

      # Message used when nothing is found at the configured location.
      #
      # Nil parts are skipped because File.join raises TypeError on nil,
      # which previously happened whenever no prefix was configured.
      #
      # @return [String]
      def error_not_exists
        location = [s3_bucket, s3_prefix].compact.map(&:to_s)
        "No files found at s3://#{File.join(*location)}"
      end

      # @return [String, nil] the configured bucket, falling back to the
      #   global EasyML configuration
      def s3_bucket
        datasource_config.dig("s3_bucket") || EasyML::Configuration.s3_bucket
      end

      # @return [String, nil] the configured key prefix
      def s3_prefix
        datasource_config.dig("s3_prefix")
      end

      # @return [Object] the configured cache_for value, 0 when unset
      def cache_for
        datasource_config.dig("cache_for") || 0
      end

      private

      # Memoized datasource configuration, an empty hash when none is set.
      def datasource_config
        @datasource_config ||= datasource.configuration || {}
      end

      # Memoized synced directory. Credentials come from the global EasyML
      # configuration, not from the datasource's own configuration.
      def synced_directory
        @synced_directory ||= EasyML::Data::SyncedDirectory.new(
          root_dir: datasource.root_dir,
          s3_bucket: s3_bucket,
          s3_prefix: s3_prefix,
          s3_access_key_id: EasyML::Configuration.s3_access_key_id,
          s3_secret_access_key: EasyML::Configuration.s3_secret_access_key,
          polars_args: datasource_config.dig("polars_args") || {},
          cache_for: cache_for,
        )
      end
    end
  end
end
@@ -0,0 +1,114 @@
1
+ # == Schema Information
2
+ #
3
+ # Table name: easy_ml_deploys
4
+ #
5
+ # id :bigint not null, primary key
6
+ # model_id :bigint
7
+ # model_history_id :bigint
8
+ # retraining_run_id :bigint
9
+ # model_file_id :bigint
10
+ # status :string not null
11
+ # trigger :string default("manual")
12
+ # stacktrace :text
13
+ # snapshot_id :string
14
+ # created_at :datetime not null
15
+ # updated_at :datetime not null
16
+ #
17
module EasyML
  # ActiveRecord model backed by the easy_ml_deploys table. A deploy ties a
  # model to the retraining run being deployed and, once it succeeds, to
  # the resulting model version (an EasyML::ModelHistory row).
  class Deploy < ActiveRecord::Base
    self.table_name = "easy_ml_deploys"

    belongs_to :model, class_name: "EasyML::Model"
    belongs_to :model_file, class_name: "EasyML::ModelFile", optional: true
    belongs_to :retraining_run, class_name: "EasyML::RetrainingRun"
    belongs_to :model_version, class_name: "EasyML::ModelHistory", optional: true, foreign_key: :model_history_id

    # Declared once: a second presence validation on the same attribute
    # only duplicated the error message.
    validates :status, presence: true, inclusion: { in: %w[pending running success failed] }
    after_initialize :set_defaults, if: :new_record?
    before_save :set_model_file, if: :new_record?

    # One row per model_id: the one with the highest id.
    scope :latest, -> { select("DISTINCT ON (model_id) *").order("model_id, id DESC") }

    # @return [Boolean] true when no pending/running deploy of this model
    #   has locked_at set
    #
    # NOTE(review): the schema annotation for easy_ml_deploys lists no
    # locked_at column — confirm the column exists or this query will fail.
    def unlocked?
      EasyML::Deploy.where(model_id: model_id).where.not(locked_at: nil).where(status: ["pending", "running"]).empty?
    end

    # @return [Boolean] the opposite of #unlocked?
    def locked?
      !unlocked?
    end

    # Deploys in the background (default) or inline.
    #
    # @param async [Boolean] when true, enqueue EasyML::DeployJob;
    #   otherwise run #actually_deploy immediately
    def deploy(async: true)
      if async
        EasyML::DeployJob.perform_later(id)
      else
        actually_deploy
      end
    end

    # Performs the deploy while holding the per-model lock.
    #
    # When another successful deploy of the same retraining run exists,
    # its model file and model version are reused; otherwise the model is
    # loaded and deployed. The deploy and its retraining run are then
    # updated in one transaction.
    #
    # @return [Object] the deployed model version
    def actually_deploy
      lock_deploy do
        update(status: "running")
        EasyML::Event.create_event(self, "started")

        # Look up once; the original issued the same query up to three times.
        previous = identical_deploy
        if previous.present?
          self.model_file = previous.model_file
          self.model_version = previous.model_version
        else
          model.model_file = model_file if model_file.present?
          model.load_model
          self.model_version = model.actually_deploy
        end

        EasyML::Deploy.transaction do
          update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, status: :success)
          model.retraining_runs.where(status: :deployed).update_all(status: :success)
          retraining_run.update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, deploy_id: id, status: :deployed, is_deploying: false)
        end

        model_version.tap do
          EasyML::Event.create_event(self, "success")
        end
      end
    end

    alias_method :rollback, :deploy

    # Releases this model's deploy lock.
    def unlock!
      EasyML::Support::Lockable.unlock!(lock_key)
    end

    # Runs the block while holding this model's deploy lock.
    def lock_deploy
      with_lock do |_client|
        yield
      end
    end

    # @return [EasyML::Deploy, nil] another successful deploy of the same
    #   retraining run, if any
    def identical_deploy
      EasyML::Deploy.where(retraining_run_id: retraining_run_id).
        where.not(id: id).where(status: :success).first
    end

    private

    # NOTE(review): this private method shares its name with ActiveRecord's
    # own `with_lock` — confirm the shadowing is intended.
    def with_lock
      EasyML::Support::Lockable.with_lock(lock_key, stale_timeout: 60, resources: 1) do |client|
        yield client
      end
    end

    # Lock key built from the model's name and id.
    def lock_key
      "deploy:#{self.model.name}:#{self.model.id}"
    end

    # New records start out pending unless a status was given.
    def set_defaults
      self.status ||= :pending
    end

    # Defaults the model file to the retraining run's file. Safe navigation
    # avoids a NoMethodError when no retraining run is attached.
    def set_model_file
      self.model_file ||= retraining_run&.model_file
    end
  end
end