easy_ml 0.1.4 → 0.2.0.pre.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (239) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -5
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,93 @@
1
+ require_relative "date_converter"
2
+ require_relative "polars_column"
3
+
4
+ module EasyML::Data
5
+ class StatisticsLearner
6
+ attr_accessor :verbose
7
+
8
# Builds a learner instance.
#
# @param options [Hash] only the :verbose key is read; it is stored as-is
def initialize(options = {})
  @verbose = options[:verbose]
end
11
+
12
# Learns statistics for the raw split and, when it holds data, the processed split.
#
# @param raw [#read] split passed to learn_split
# @param processed [#read, #data] split that is only learned when `data` is present
# @return [Hash] { raw: ... } plus :processed when processed data is present
def self.learn(raw, processed)
  stats = { raw: learn_split(raw) }
  return stats unless processed.data.present?

  stats.merge!(processed: learn_split(processed))
end
17
+
18
# Combines statistics of a split: counting statistics come from all rows,
# value statistics come from the training rows only.
#
# learn_df returns nil for a nil frame, and the train statistics may lack a
# column that exists in the full data; both cases are treated as "no stats"
# instead of raising NoMethodError on nil.
#
# @param split [#read] object answering read(:all) and read(:train)
# @return [Hash] column => merged statistics hash
def self.learn_split(split)
  df = split.read(:all)
  train_df = split.read(:train)
  all_stats = learn_df(df) || {}
  train_stats = learn_df(train_df) || {}

  all_stats.each_with_object({}) do |(column, column_stats), output|
    train_column_stats = train_stats[column] || {}
    output[column] = column_stats
      .slice(:num_rows, :null_count, :unique_count, :counts)
      .merge(train_column_stats.slice(:mean, :median, :min, :max, :std, :last_value, :most_frequent_value))
  end
end
32
+
33
# Computes per-column statistics for a data frame.
#
# @param df [Object, nil] frame answering `columns`, `[]` and whatever describe_to_h needs
# @return [Hash, nil] column name => statistics hash; nil when df is nil
def self.learn_df(df)
  return if df.nil?

  base_stats = describe_to_h(df).deep_symbolize_keys

  df.columns.each_with_object({}) do |col, stats|
    series = df[col]
    # Skip only this column. A `return {}` here would leave the whole method
    # and throw away the statistics already gathered for every other column.
    next if series.dtype == Polars::Null

    field_type = PolarsColumn.determine_type(series)
    described = base_stats[col.to_sym]

    # Basic statistics recorded for every column.
    # NOTE(review): num_rows stores whatever `series.shape` returns, unchanged - confirm consumers expect that form.
    stats[col] = {
      num_rows: series.shape,
      null_count: described[:null_count].to_i,
    }

    # Type-specific statistics.
    case field_type
    when :integer, :float
      # Id-like columns keep only a restricted set of the described attributes.
      allowed_attrs = id_column?(col) ? %i[field_type null_count min max] : described.keys
      stats[col].merge!(described.slice(*allowed_attrs))
    when :categorical, :string, :text, :boolean
      stats[col].merge!(most_frequent_value: series.mode.sort.to_a&.first)
      if field_type == :categorical
        stats[col].merge!(
          unique_count: series.n_unique,
          counts: Hash[series.value_counts.to_hashes.map(&:values)],
        )
      end
    when :datetime
      stats[col].merge!(
        unique_count: series.n_unique,
        last_value: series.sort[-1],
      )
    end
  end
end
74
+
75
# Tells whether a column name looks like an identifier column.
# The name is lower-cased first; it matches when a line of it is exactly
# "id" or when "_id" occurs anywhere in it.
#
# @param column [#to_s] column name
# @return [Boolean]
def self.id_column?(column)
  name = column.to_s.downcase
  name.match?(/^id$/) || name.include?("_id")
end
79
+
80
# Reshapes `df.describe.to_h` into a hash keyed by column name.
# The first entry of the described hash supplies the statistic names; every
# following entry supplies one column's values in the same order.
#
# @param df [#describe] frame whose describe result answers `to_h`
# @return [Hash] column name => { statistic name => value }
def self.describe_to_h(df)
  described = df.describe.to_h
  stat_names, *value_rows = described.values.map(&:to_a)
  column_names = described.keys[1..-1]

  column_names.zip(value_rows).each_with_object({}) do |(name, values), result|
    result[name] = stat_names.zip(values).to_h
  end
end
92
+ end
93
+ end
@@ -0,0 +1,341 @@
1
+ require_relative "polars_reader"
2
+
3
+ module EasyML
4
+ module Data
5
+ class SyncedDirectory
6
+ attr_accessor :root_dir, :s3_bucket, :s3_prefix, :s3_access_key_id, :s3_secret_access_key, :cache_for, :polars_args
7
+
8
# Stores the directory and S3 settings found in +options+.
# Keys that are absent leave the corresponding attribute nil.
#
# @param options [Hash] may contain :root_dir, :s3_bucket, :s3_prefix,
#   :s3_access_key_id, :s3_secret_access_key, :cache_for and :polars_args
def initialize(options = {})
  %i[
    root_dir s3_bucket s3_prefix s3_access_key_id
    s3_secret_access_key cache_for polars_args
  ].each do |setting|
    instance_variable_set(:"@#{setting}", options.dig(setting))
  end
end
17
+
18
+ delegate :query, :data, :all_files, :files, to: :reader
19
+
20
# Cleans the local directory, but only when a sync is due.
def before_sync
  clean if should_sync?
end
25
+
26
# Post-sync hook: asks the reader to normalize.
def after_sync
  reader.normalize
end
29
+
30
# Resets local state in three ordered steps: make sure the directory exists,
# wipe it (clean_dir! enforces its own safety check), then let the reader
# clean up. Returns whatever the reader's clean returns.
def clean
  mk_dir
  clean_dir!
  reader.clean
end
35
+
36
# Lists the objects stored under the configured bucket and prefix.
#
# @return [Object] the raw list_objects_v2 response of the S3 client
def remote_files
  s3.list_objects_v2(
    bucket: s3_bucket,
    prefix: s3_prefix
  )
end
39
+
40
# A sync is due when it is forced or when the directory is not synced.
#
# @param force [Boolean] a truthy value short-circuits the synced? check
def should_sync?(force = false)
  force || !synced?
end
43
+
44
# Forced variant of sync: always downloads, in parallel by default.
#
# @param parallel [Boolean] forwarded to sync
def sync!(parallel: true)
  sync(parallel: parallel, force: true)
end
47
+
48
# Downloads every remote object when a sync is due.
#
# @param force [Boolean] sync even when the directory counts as synced
# @param parallel [Boolean] download with 4 worker processes instead of sequentially
# @return [Boolean] false when nothing was due, true after downloading
def sync(force: false, parallel: false)
  return false unless should_sync?(force)

  pending = files_to_sync

  if parallel
    Parallel.each(pending, in_processes: 4, timeout: 10) { |remote_object| download_file(remote_object) }
  else
    pending.each { |remote_object| download_file(remote_object) }
  end

  true
end
60
+
61
# Remote objects to download: everything under the prefix except keys that
# end with "/" .
#
# @return [Array] objects taken from the list_objects_v2 response contents
def files_to_sync
  listing = s3.list_objects_v2(bucket: s3_bucket, prefix: s3_prefix)
  listing.contents.reject { |remote_object| remote_object.key.end_with?("/") }
end
65
+
66
# Hands the given block to the reader's in_batches.
def in_batches(&block)
  reader.in_batches(&block)
end
69
+
70
# Files reported by the reader.
def files
  reader.files
end
73
+
74
# Age of the local data, measured from last_updated_at to the current EST time.
#
# @param format [String] passed through to EasyML::Support::Age.age
def age(format: "human")
  EasyML::Support::Age.age(
    last_updated_at,
    EasyML::Support::EST.now,
    format: format
  )
end
77
+
78
# Inverse of synced?.
def stale?
  !synced?
end
81
+
82
# Whether local files are up to date.
# A previously computed answer (true or false) is reused. Otherwise a fresh
# enough cache counts as synced without being memoized, and only the
# calculate_synced result is stored in @synced.
def synced?
  if @synced.nil?
    use_cached? ? true : (@synced = calculate_synced)
  else
    @synced
  end
end
89
+
90
# Whether the local copy is recent enough to skip checking S3.
# False when no cache_for is configured or when there is no last update time;
# otherwise true while the age in seconds is below cache_for.
def use_cached?
  return false if !cache_for.present? || last_updated_at.nil?

  EasyML::Support::Age.age(
    last_updated_at,
    EasyML::Support::EST.now,
    format: "integer"
  ) < cache_for.to_i
end
97
+
98
# Most recent modification time among the local files, converted to EST.
#
# @return [Object, nil] nil when there are no local files
def last_updated_at
  return nil if files.empty?

  newest_mtime = files.map { |path| File.mtime(path) }.max
  newest_mtime.in_time_zone(EasyML::Support::EST)
end
103
+
104
# Schema reported by the reader.
def schema
  reader.schema
end
107
+
108
# Row count reported by the reader.
def num_rows
  reader.num_rows
end
111
+
112
# Downloads one S3 object into root_dir and ungzips it.
# Only the basename of the key is used locally, so the file always lands
# directly inside root_dir. Any failure is logged and re-raised.
#
# @param object [#key] entry from an S3 listing
def download_file(object)
  remote_key = object.key
  # Strip the configured prefix (when present) before taking the filename.
  key_without_prefix =
    if s3_prefix.present?
      remote_key.sub(/^#{Regexp.escape(s3_prefix)}\//, "")
    else
      remote_key
    end
  local_file_path = File.join(root_dir, File.basename(key_without_prefix))
  FileUtils.mkdir_p(File.dirname(local_file_path))

  Rails.logger.info("Downloading object #{remote_key} to #{local_file_path}")

  s3.get_object(
    response_target: local_file_path,
    bucket: s3_bucket,
    key: remote_key,
  )

  Rails.logger.info("Downloaded #{remote_key} to #{local_file_path}")
  ungzipped_file_path = ungzip_file(local_file_path)
  Rails.logger.info("Ungzipped to #{ungzipped_file_path}")
rescue Aws::S3::Errors::ServiceError, Net::OpenTimeout, Net::ReadTimeout, StandardError => e
  Rails.logger.error("Failed to process #{object.key}: #{e.message}")
  raise e
end
133
+
134
# Forced variant of upload: uploads every candidate file without the per-file check.
#
# @param parallel [Boolean] forwarded to upload
def upload!(parallel: true)
  upload(parallel: parallel, force: true)
end
137
+
138
# Uploads local files to S3.
#
# @param force [Boolean] when false, each candidate is additionally filtered by should_upload?
# @param parallel [Boolean] upload with 4 worker processes instead of sequentially
# @return [true] also when there was nothing to upload
def upload(force: false, parallel: true)
  candidates = files_to_upload
  candidates = candidates.select { |path| should_upload?(path) } unless force
  return true if candidates.empty?

  if parallel
    Parallel.each(candidates, in_processes: 4, timeout: 10) { |path| upload_file(path) }
  else
    candidates.each { |path| upload_file(path) }
  end

  true
end
149
+
150
# Local files that are missing remotely or newer than their remote copy.
#
# @return [Array<String>] paths under root_dir; empty when root_dir does not exist
def files_to_upload
  return [] unless Dir.exist?(root_dir)

  local_files = Dir.glob(File.join(root_dir, "**", "*")).select { |path| File.file?(path) }

  # Map each remote key to its modification time, after dropping the ".gz"
  # suffix and the s3_prefix so the key can be compared with local relative paths.
  remote_mtimes = remote_files.contents.each_with_object({}) do |object, mtimes|
    next if object.key.end_with?("/")

    local_key = object.key.sub(/\.gz$/, "")
    local_key = local_key.sub(%r{^#{Regexp.escape(s3_prefix)}/}, "") if s3_prefix.present?
    mtimes[local_key] = object.last_modified.in_time_zone(EasyML::Support::EST)
  end

  local_files.select do |file_path|
    relative_path = Pathname.new(file_path).relative_path_from(Pathname.new(root_dir)).to_s
    local_mtime = File.mtime(file_path).in_time_zone(EasyML::Support::EST)

    # Keep the file when it is unknown remotely or newer locally.
    !remote_mtimes.key?(relative_path) || local_mtime > remote_mtimes[relative_path]
  end
end
175
+
176
+ # Add aliases for sync methods
177
+ alias download! sync!
178
+ alias download sync
179
+
180
+ private
181
+
182
# Alias for root_dir, used when building the reader.
def dir
  root_dir
end
185
+
186
# Removes the first occurrence of s3_prefix from +path+ and strips a trailing slash.
# Without a prefix the path is returned unchanged.
#
# The prefix is passed to String#sub as a plain String, which already matches
# literally. Escaping it with Regexp.escape first would insert backslashes and
# stop any prefix containing regex metacharacters (".", "-", "+" ...) from matching.
#
# @param path [String]
# @return [String]
def relative_path(path)
  return path unless s3_prefix.present?

  path.sub(s3_prefix, "").gsub(%r{/$}, "")
end
193
+
194
# Lazily builds and memoizes the PolarsReader for this directory.
def reader
  @reader ||= EasyML::Data::PolarsReader.new(
    root_dir: dir,
    polars_args: polars_args,
    refresh: false,
  )
end
203
+
204
# Creates root_dir, including missing parent directories.
def mk_dir
  FileUtils.mkdir_p(root_dir)
end
207
+
208
# Recursively deletes root_dir, refusing anything outside the Rails root.
#
# Both paths are normalized with File.expand_path and compared on a directory
# boundary, so a sibling such as "/app-other" (for root "/app") or a path that
# uses ".." to leave the root is rejected. root_dir may be a String or a Pathname.
#
# @raise [RuntimeError] when root_dir is not the Rails root or inside it
def clean_dir!
  app_root = File.expand_path(Rails.root.to_s)
  target = File.expand_path(root_dir.to_s)

  unless target == app_root || target.start_with?(File.join(app_root, ""))
    raise "Refusing to wipe directory #{root_dir}, as it is not in the scope of #{Rails.root}"
  end

  FileUtils.rm_rf(root_dir)
end
215
+
216
# Builds a new S3 client from the configured access key pair.
# A fresh client is constructed on every call (no memoization).
def s3
  Aws::S3::Client.new(
    credentials: Aws::Credentials.new(s3_access_key_id, s3_secret_access_key),
    http_open_timeout: 5, # seconds allowed for establishing the connection
    http_read_timeout: 30, # seconds allowed for reading the response
    http_wire_trace: false, # verbose HTTP wire logging stays off
    http_idle_timeout: 0,
    logger: Logger.new(STDOUT), # client log output goes to STDOUT
  )
end
230
+
231
# Decompresses a ".gz" file next to itself and removes the compressed original.
#
# A path that does not end in ".gz" is returned untouched. Without that guard
# the source and destination would be the same file, which would be truncated
# while being read and then deleted. Data is streamed rather than read into
# memory in one piece.
#
# @param gzipped_file_path [String] path of the downloaded file
# @return [String] path of the decompressed file (or the input path when not gzipped)
def ungzip_file(gzipped_file_path)
  return gzipped_file_path unless gzipped_file_path.end_with?(".gz")

  ungzipped_file_path = gzipped_file_path.sub(/\.gz\z/, "")

  Zlib::GzipReader.open(gzipped_file_path) do |gz|
    File.open(ungzipped_file_path, "wb") do |file|
      IO.copy_stream(gz, file)
    end
  end

  File.delete(gzipped_file_path) # the compressed copy is no longer needed
  ungzipped_file_path
end
243
+
244
# Returns +dir+ unchanged when it starts with "/", otherwise joins it onto Rails.root.
def expand_dir(dir)
  dir.to_s.start_with?("/") ? dir : Rails.root.join(dir)
end
249
+
250
# Whether S3 holds data newer than the local files.
# False when nothing is listed remotely; true when remote files exist but no
# local files do; otherwise compares the latest remote and local timestamps.
def new_data_available?
  return false if files_to_sync.empty?
  return true if files.empty?

  local_latest = last_updated_at
  s3_latest = s3_last_updated_at

  !s3_latest.nil? && s3_latest > local_latest
end
261
+
262
# The directory counts as synced when no newer remote data is available.
def calculate_synced
  !new_data_available?
end
265
+
266
# Latest modification time among the remote objects, converted to EST.
# Keys ending in "/" are ignored.
#
# @return [Object, nil] nil when no remote object qualifies. The caller
#   (new_data_available?) checks for nil, so nil is returned here instead of
#   calling in_time_zone on nil.
def s3_last_updated_at
  objects = s3.list_objects_v2(bucket: s3_bucket, prefix: s3_prefix).contents

  s3_latest = objects
    .reject { |object| object.key.end_with?("/") }
    .map(&:last_modified)
    .compact
    .max

  s3_latest&.in_time_zone(EasyML::Support::EST)
end
277
+
278
# Gzips one local file and uploads it to S3 under "<key>.gz".
# With an s3_prefix the key is the prefix plus the file's basename; without
# one it is the path relative to root_dir.
#
# The file is streamed into the gzip writer instead of being read into memory
# in one piece. The temporary ".gz" file is always removed, and failures are
# logged and re-raised.
#
# @param file_path [String] local file inside root_dir
def upload_file(file_path)
  relative_path = Pathname.new(file_path).relative_path_from(Pathname.new(root_dir)).to_s
  s3_key = s3_prefix.present? ? File.join(s3_prefix, File.basename(relative_path)) : relative_path

  # Temporary gzipped copy written next to the source file.
  gzipped_file_path = "#{file_path}.gz"

  begin
    Rails.logger.info("Compressing and uploading #{file_path} to s3://#{s3_bucket}/#{s3_key}")

    Zlib::GzipWriter.open(gzipped_file_path) do |gz|
      File.open(file_path, "rb") { |file| IO.copy_stream(file, gz) }
    end

    File.open(gzipped_file_path, "rb") do |file|
      s3.put_object(
        bucket: s3_bucket,
        key: "#{s3_key}.gz",
        body: file,
        content_encoding: "gzip",
      )
    end

    Rails.logger.info("Successfully uploaded #{file_path} to s3://#{s3_bucket}/#{s3_key}.gz")
  rescue StandardError => e
    # StandardError already covers the AWS service errors.
    Rails.logger.error("Failed to upload #{file_path}: #{e.message}")
    raise
  ensure
    File.delete(gzipped_file_path) if File.exist?(gzipped_file_path)
  end
end
314
+
315
# Whether a local file must be uploaded: true when its remote object is
# missing or older than the local file.
#
# The key is built the same way upload_file builds it (prefix + basename when
# an s3_prefix is set). Checking prefix + relative path instead would never
# find files from sub-directories, so they would be uploaded on every run.
#
# @param file_path [String] local file inside root_dir
# @return [Boolean]
# @raise [Aws::S3::Errors::ServiceError] for S3 errors other than "not found"
def should_upload?(file_path)
  relative_path = Pathname.new(file_path).relative_path_from(Pathname.new(root_dir)).to_s
  s3_key = s3_prefix.present? ? File.join(s3_prefix, File.basename(relative_path)) : relative_path

  begin
    response = s3.head_object(
      bucket: s3_bucket,
      key: "#{s3_key}.gz",
    )

    local_mtime = File.mtime(file_path).in_time_zone(EasyML::Support::EST)
    remote_mtime = response.last_modified.in_time_zone(EasyML::Support::EST)

    local_mtime > remote_mtime
  rescue Aws::S3::Errors::NotFound
    # Nothing stored remotely yet, so the file has to be uploaded.
    true
  rescue Aws::S3::Errors::ServiceError => e
    Rails.logger.error("Error checking S3 object: #{e.message}")
    raise e
  end
end
339
+ end
340
+ end
341
+ end
data/lib/easy_ml/data.rb CHANGED
@@ -1,8 +1,12 @@
1
1
  module EasyML
2
2
  module Data
3
3
  require_relative "data/utils"
4
+ require_relative "data/polars_reader"
5
+ require_relative "data/synced_directory"
4
6
  require_relative "data/preprocessor"
5
- require_relative "data/dataset"
6
- require_relative "data/datasource"
7
+ require_relative "data/splits"
8
+ require_relative "data/polars_column"
9
+ require_relative "data/statistics_learner"
10
+ require_relative "data/date_converter"
7
11
  end
8
12
  end
@@ -1,26 +1,125 @@
1
+ require "aws-sdk"
2
+ require "awesome_print"
3
+ require "action_controller"
4
+ require "inertia_rails"
5
+ require "jsonapi/serializer"
6
+ require "numo/narray"
7
+ require "numpy"
8
+ require "parallel"
9
+ require "polars-df"
10
+ require "pycall"
11
+ require "optuna"
12
+ require "tailwindcss-rails"
13
+ require "wandb"
14
+ require "xgb"
15
+ require "sidekiq"
16
+ require "vite_ruby"
1
17
  require "rails/engine"
18
+ require "activerecord-import"
19
+ require "historiographer"
2
20
 
3
21
  module EasyML
4
22
  class Engine < Rails::Engine
5
23
  isolate_namespace EasyML
6
24
 
25
+ def root_dir
26
+ Rails.root.join("easy_ml")
27
+ end
28
+
29
+ config.autoload_paths += [
30
+ root.join("app/models"),
31
+ root.join("app/models/datasources"),
32
+ root.join("app/models/models"),
33
+ root.join("lib/easy_ml"),
34
+ ]
35
+
36
+ config.eager_load_paths += [
37
+ root.join("app/models"),
38
+ root.join("app/models/datasources"),
39
+ root.join("app/models/models"),
40
+ root.join("lib/easy_ml"),
41
+ ]
42
+
7
43
  initializer "easy_ml.inflections" do
8
44
  require_relative "initializers/inflections"
45
+ EasyML::Initializers::Inflections.inflect
46
+ end
47
+
48
+ initializer "easy_ml.enable_string_cache" do
49
+ Polars.enable_string_cache
50
+ end
51
+
52
+ unless %w[rake rails].include?(File.basename($0)) && %w[generate db:migrate db:drop easy_ml:migration].include?(ARGV.first)
53
+ config.after_initialize do
54
+ Dir.glob(File.expand_path("app/models/easy_ml/datasources/*.rb", EasyML::Engine.root)).each do |file|
55
+ require file
56
+ end
57
+ Dir.glob(File.expand_path("app/models/easy_ml/models/*.rb", EasyML::Engine.root)).each do |file|
58
+ require file
59
+ end
60
+ Dir.glob(File.expand_path("app/models/easy_ml/splitters/*.rb", EasyML::Engine.root)).each do |file|
61
+ require file
62
+ end
63
+ Dir.glob(File.expand_path("app/models/easy_ml/**/*.rb", EasyML::Engine.root)).each do |file|
64
+ require file
65
+ end
66
+ end
67
+ end
68
+
69
+ initializer "easy_ml.active_job_config" do
70
+ ActiveSupport.on_load(:active_job) do
71
+ self.queue_adapter = :resque
72
+ end
73
+ end
74
+
75
+ # This tells our demo app where to look for assets like css, js
76
+ initializer "easy_ml.assets" do |app|
77
+ if app.config.respond_to?(:assets)
78
+ app.config.assets.precompile += %w[
79
+ easy_ml/application.js
80
+ easy_ml/application.css
81
+ ]
82
+ app.config.assets.paths << root.join("app", "frontend")
83
+ end
9
84
  end
10
85
 
11
86
  initializer "easy_ml.setup_generators" do |app|
87
+ generators_path = EasyML::Engine.root.join("lib/easy_ml/railtie/generators")
88
+ generators_dirs = Dir[File.join(generators_path, "**", "*.rb")]
89
+ generators_dirs.each { |file| require file }
90
+
12
91
  app.config.generators do |g|
13
92
  g.templates.unshift File.expand_path("../templates", __dir__)
14
93
  end
15
94
  end
16
95
 
17
- generators_path = File.expand_path("railtie/generators", __dir__)
18
- generators_dirs = Dir[File.join(generators_path, "**", "*.rb")]
19
- generators_dirs.each { |file| require file }
96
+ delegate :vite_ruby, to: :class
97
+
98
+ def self.vite_ruby
99
+ @vite_ruby ||= ViteRuby.new(root: root)
100
+ end
101
+
102
+ unless Rails.env.development?
103
+ config.app_middleware.use(Rack::Static,
104
+ urls: ["/#{vite_ruby.config.public_output_dir}"],
105
+ root: root.join(vite_ruby.config.public_dir))
106
+ end
107
+
108
+ initializer "vite_rails_engine.proxy" do |app|
109
+ if vite_ruby.run_proxy?
110
+ # Use Vite proxy in development for live assets
111
+ app.middleware.insert_before 0, ViteRuby::DevServerProxy, ssl_verify_none: true, vite_ruby: vite_ruby
112
+ end
113
+ end
114
+
115
+ initializer "vite_rails_engine.logger" do
116
+ config.after_initialize do
117
+ vite_ruby.logger = Rails.logger
118
+ end
119
+ end
20
120
 
21
- config.after_initialize do
22
- require_relative "../../app/models/easy_ml/model"
23
- require_relative "../../app/models/easy_ml/models"
121
+ def list_routes
122
+ EasyML::Engine.routes.routes.map { |r| "#{r.name} #{r.path.spec}" }
24
123
  end
25
124
  end
26
125
  end