easy_ml 0.2.0.pre.rc45 → 0.2.0.pre.rc46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/datasets_controller.rb +2 -1
  3. data/app/frontend/components/dataset/ColumnConfigModal.tsx +91 -28
  4. data/app/frontend/components/dataset/PreprocessingConfig.tsx +7 -2
  5. data/app/frontend/types/dataset.ts +1 -0
  6. data/app/helpers/easy_ml/application_helper.rb +3 -3
  7. data/app/models/easy_ml/column.rb +22 -0
  8. data/app/models/easy_ml/column_history.rb +1 -0
  9. data/app/models/easy_ml/column_list.rb +3 -0
  10. data/app/models/easy_ml/dataset.rb +29 -7
  11. data/app/models/easy_ml/event_context.rb +1 -0
  12. data/app/models/easy_ml/feature_history.rb +1 -0
  13. data/app/models/easy_ml/model_file.rb +0 -1
  14. data/app/models/easy_ml/model_file_history.rb +0 -1
  15. data/app/models/easy_ml/splitter.rb +12 -0
  16. data/app/models/easy_ml/splitters/date_splitter.rb +4 -0
  17. data/app/serializers/easy_ml/dataset_serializer.rb +4 -0
  18. data/config/initializers/dataframe.rb +9 -0
  19. data/lib/easy_ml/data/preprocessor.rb +9 -2
  20. data/lib/easy_ml/data/statistics_learner.rb +44 -9
  21. data/lib/easy_ml/engine.rb +9 -1
  22. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +1 -0
  23. data/lib/easy_ml/railtie/templates/migration/add_is_date_column_to_easy_ml_columns.rb.tt +13 -0
  24. data/lib/easy_ml/version.rb +1 -1
  25. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  26. data/public/easy_ml/assets/assets/{Application-zpGA_Q9c.css → Application-D6L0eW4P.css} +1 -1
  27. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KJwHDm3F.js +474 -0
  28. data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BQL_uYxE.js.map → Application.tsx-KJwHDm3F.js.map} +1 -1
  29. metadata +21 -5
  30. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-BQL_uYxE.js +0 -474
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c6df30d23d49530894614483d7bebe0599aa1aa1fc62108024b7c417cd1bb550
4
- data.tar.gz: ce317fa72227d618313d9b41b1d632d18628029253d33a69834c7ca878f18c4c
3
+ metadata.gz: 6b4c7d737a4d6775aa7bc7be7408994b4a68e76f55288563ed56990fe1aa2bb2
4
+ data.tar.gz: d1c147bbc6087489e908cb2df4f61de0021f553d4673f82955929177b4ddd754
5
5
  SHA512:
6
- metadata.gz: 1508d885e06d23f8e255d56499e2ab6757e9a5725b1f367a66b2b0a6e904c15aff437f0dc3b660a067e2d45730659248af4ff31f0dc02198b840735c3845c7a2
7
- data.tar.gz: 3631f30b7825a0cb5a6a5b6a0044fbfc5ce2bc4f627d395c66045f34c9c2fde43be249e996ee37ab9eb36ef5bd74cbfdfcc4ad6d5f93325ea07a48914f29efd4
6
+ metadata.gz: 9f182c7d0d9cd5f01124d4a3af4cb9c2f99f1450e3f98e77151761b77fd387cd8ee810d38c9797a8af182f44bc6896ab7789d71f2d475d962733fbd8cd8f0e5a
7
+ data.tar.gz: 1d4d930f6be09ab56838e79ae9b52a464ebbc25905644557aa2911bd1d2dfdffe96bc7563753b02a9d09c1016da050ef7c6e2ddf2124f9b11d1cddc4775f4792
data/app/controllers/easy_ml/datasets_controller.rb CHANGED
@@ -23,7 +23,7 @@
23
23
  module EasyML
24
24
  class DatasetsController < ApplicationController
25
25
  def index
26
- datasets = Dataset.all
26
+ datasets = Dataset.all.order(id: :desc)
27
27
 
28
28
  render inertia: "pages/DatasetsPage", props: {
29
29
  datasets: datasets.map { |dataset| dataset_to_json(dataset) },
@@ -131,6 +131,7 @@ module EasyML
131
131
  :datatype,
132
132
  :polars_datatype,
133
133
  :is_target,
134
+ :is_date_column,
134
135
  :hidden,
135
136
  :drop_if_null,
136
137
  :sample_values,
data/app/frontend/components/dataset/ColumnConfigModal.tsx CHANGED
@@ -10,6 +10,7 @@ import {
10
10
  Play,
11
11
  Loader2,
12
12
  Sparkles,
13
+ Calendar,
13
14
  } from "lucide-react";
14
15
  import { PreprocessingConfig } from "./PreprocessingConfig";
15
16
  import { ColumnList } from "./ColumnList";
@@ -24,6 +25,7 @@ import { router } from "@inertiajs/react";
24
25
 
25
26
  interface ColumnConfig {
26
27
  targetColumn?: string;
28
+ dateColumn?: string;
27
29
  }
28
30
 
29
31
  interface ColumnConfigModalProps {
@@ -45,9 +47,13 @@ export function ColumnConfigModal({
45
47
  const [activeTab, setActiveTab] = useState<"columns" | "features">(
46
48
  "columns"
47
49
  );
50
+ const [activeColumnSubTab, setActiveColumnSubTab] = useState<
51
+ "target" | "date"
52
+ >("target");
48
53
  const [isApplying, setIsApplying] = useState(false);
49
54
  const [config, setConfig] = useState<ColumnConfig>({
50
55
  targetColumn: dataset.target,
56
+ dateColumn: dataset.date_column,
51
57
  });
52
58
  const [selectedColumn, setSelectedColumn] = useState<string | null>(null);
53
59
  const [searchQuery, setSearchQuery] = useState("");
@@ -127,6 +133,15 @@ export function ColumnConfigModal({
127
133
  [dataset.columns]
128
134
  );
129
135
 
136
+ const dateColumnOptions = useMemo(() => {
137
+ return dataset.columns
138
+ .filter((column) => column.datatype === "datetime")
139
+ .map((column) => ({
140
+ value: column.name,
141
+ label: column.name,
142
+ }));
143
+ }, [dataset.columns]);
144
+
130
145
  const handleColumnSelect = (columnName: string) => {
131
146
  setSelectedColumn(columnName);
132
147
  };
@@ -146,7 +161,7 @@ export function ColumnConfigModal({
146
161
 
147
162
  const setTargetColumn = (columnName: string) => {
148
163
  const name = String(columnName);
149
- setConfig({ targetColumn: columnName });
164
+ setConfig({ ...config, targetColumn: columnName });
150
165
  const updatedColumns = dataset.columns.map((c) => ({
151
166
  ...c,
152
167
  is_target: c.name === name,
@@ -159,6 +174,21 @@ export function ColumnConfigModal({
159
174
  setNeedsRefresh(true);
160
175
  };
161
176
 
177
+ const setDateColumn = (columnName: string) => {
178
+ const name = String(columnName);
179
+ setConfig((prev) => ({ ...prev, dateColumn: columnName }));
180
+ const updatedColumns = dataset.columns.map((c) => ({
181
+ ...c,
182
+ is_date_column: c.name === name,
183
+ }));
184
+
185
+ setDataset({
186
+ ...dataset,
187
+ columns: updatedColumns,
188
+ });
189
+ setNeedsRefresh(true);
190
+ };
191
+
162
192
  const setColumnType = (columnName: string, datatype: string) => {
163
193
  const updatedColumns = dataset.columns.map((c) => ({
164
194
  ...c,
@@ -285,32 +315,25 @@ export function ColumnConfigModal({
285
315
  <div className="flex border-b shrink-0">
286
316
  <button
287
317
  onClick={() => setActiveTab("columns")}
288
- className={`px-4 py-2 text-sm font-medium border-b-2 ${
318
+ className={`flex items-center gap-2 px-4 py-2 border-b-2 ${
289
319
  activeTab === "columns"
290
320
  ? "border-blue-500 text-blue-600"
291
321
  : "border-transparent text-gray-500 hover:text-gray-700"
292
322
  }`}
293
323
  >
294
- <div className="flex items-center gap-2">
295
- <Settings2 className="w-4 h-4" />
296
- Column Configuration
297
- </div>
324
+ <Settings2 className="w-4 h-4" />
325
+ Preprocessing
298
326
  </button>
299
327
  <button
300
328
  onClick={() => setActiveTab("features")}
301
- className={`px-4 py-2 text-sm font-medium border-b-2 ${
329
+ className={`flex items-center gap-2 px-4 py-2 border-b-2 ${
302
330
  activeTab === "features"
303
331
  ? "border-blue-500 text-blue-600"
304
332
  : "border-transparent text-gray-500 hover:text-gray-700"
305
333
  }`}
306
334
  >
307
- <div className="flex items-center gap-2">
308
- <Wand2 className="w-4 h-4" />
309
- Features
310
- <span className="px-1.5 py-0.5 text-xs font-medium bg-blue-100 text-blue-600 rounded-full">
311
- {constants.feature_options.length}
312
- </span>
313
- </div>
335
+ <Wand2 className="w-4 h-4" />
336
+ Feature Engineering
314
337
  </button>
315
338
 
316
339
  {needsRefresh && (
@@ -341,20 +364,60 @@ export function ColumnConfigModal({
341
364
  <React.Fragment>
342
365
  <div className="grid grid-cols-7 flex-1 min-h-0">
343
366
  <div className="col-span-3 border-r overflow-hidden flex flex-col">
344
- <div className="p-4 border-b shrink-0">
345
- <label className="block text-sm font-medium text-gray-700">
346
- Target Column
347
- </label>
348
- <SearchableSelect
349
- options={dataset.columns.map((column) => ({
350
- value: column.name,
351
- label: column.name,
352
- }))}
353
- value={config.targetColumn || ""}
354
- onChange={(value) =>
355
- value && setTargetColumn(String(value))
356
- }
357
- />
367
+ <div className="p-4 border-b">
368
+ <div className="flex border-b">
369
+ <button
370
+ onClick={() => setActiveColumnSubTab("target")}
371
+ className={`flex items-center gap-2 px-4 py-2 border-b-2 ${
372
+ activeColumnSubTab === "target"
373
+ ? "border-blue-500 text-blue-600"
374
+ : "border-transparent text-gray-500 hover:text-gray-700"
375
+ }`}
376
+ >
377
+ <Target className="w-4 h-4" />
378
+ Target Column
379
+ </button>
380
+ <button
381
+ onClick={() => setActiveColumnSubTab("date")}
382
+ className={`flex items-center gap-2 px-4 py-2 border-b-2 ${
383
+ activeColumnSubTab === "date"
384
+ ? "border-blue-500 text-blue-600"
385
+ : "border-transparent text-gray-500 hover:text-gray-700"
386
+ }`}
387
+ >
388
+ <Calendar className="w-4 h-4" />
389
+ Date Column
390
+ </button>
391
+ </div>
392
+
393
+ {activeColumnSubTab === "target" ? (
394
+ <div className="mt-4">
395
+ <SearchableSelect
396
+ value={config.targetColumn || ""}
397
+ onChange={(value) => setTargetColumn(value)}
398
+ options={dataset.columns.map((column) => ({
399
+ value: column.name,
400
+ label: column.name,
401
+ }))}
402
+ placeholder="Select target column..."
403
+ />
404
+ </div>
405
+ ) : (
406
+ <div className="mt-4">
407
+ {dateColumnOptions.length > 0 ? (
408
+ <SearchableSelect
409
+ options={dateColumnOptions}
410
+ value={config.dateColumn}
411
+ onChange={setDateColumn}
412
+ placeholder="Select a date column..."
413
+ />
414
+ ) : (
415
+ <div className="text-center py-4 text-gray-500 bg-gray-50 rounded-md">
416
+ No date columns available
417
+ </div>
418
+ )}
419
+ </div>
420
+ )}
358
421
  </div>
359
422
  <div className="shrink-0">
360
423
  <ColumnFilters
data/app/frontend/components/dataset/PreprocessingConfig.tsx CHANGED
@@ -266,8 +266,13 @@ export function PreprocessingConfig({
266
266
  let content;
267
267
  if (strategy.method === 'most_frequent' && column.statistics?.raw.most_frequent_value) {
268
268
  content = `Most Frequent Value: ${column.statistics.raw.most_frequent_value}`
269
- } else if (strategy.method === 'ffill' && column.statistics?.raw.last_value) {
270
- content = `Last Value: ${column.statistics.raw.last_value}`
269
+ } else if (strategy.method === 'ffill') {
270
+ const lastValue = column.statistics?.raw.last_value;
271
+ if (lastValue !== undefined && lastValue !== null) {
272
+ content = `Forward Fill using Last Value: ${lastValue}`;
273
+ } else {
274
+ content = 'Set date column & apply preprocessing to see last value';
275
+ }
271
276
  } else if (strategy.method === 'median' && column.statistics?.raw?.median) {
272
277
  content = `Median: ${column.statistics.raw.median}`
273
278
  } else if (strategy.method === 'mean' && column.statistics?.raw?.mean) {
data/app/frontend/types/dataset.ts CHANGED
@@ -94,6 +94,7 @@ export interface Dataset {
94
94
  needs_refresh: boolean;
95
95
  workflow_status: DatasetWorkflowStatus;
96
96
  target?: string;
97
+ date_column?: string;
97
98
  num_rows?: number;
98
99
  drop_cols?: string[];
99
100
  datasource_id: number;
data/app/helpers/easy_ml/application_helper.rb CHANGED
@@ -2,9 +2,9 @@
2
2
 
3
3
  module EasyML
4
4
  module ApplicationHelper
5
- # Override: Returns the engine assets manifest.
6
- def easy_ml_manifest
7
- ViteRuby.new(root: EasyML::Engine.root).manifest
5
+ def vite_manifest
6
+ # ViteRuby.new(root: EasyML::Engine.root).manifest
7
+ EasyML::Engine.vite_ruby.manifest
8
8
  end
9
9
 
10
10
  def prod_script_tags
data/app/models/easy_ml/column.rb CHANGED
@@ -16,6 +16,7 @@
16
16
  # statistics :json
17
17
  # created_at :datetime not null
18
18
  # updated_at :datetime not null
19
+ # is_date_column :boolean default(FALSE)
19
20
  #
20
21
  module EasyML
21
22
  class Column < ActiveRecord::Base
@@ -29,12 +30,15 @@ module EasyML
29
30
  validates :name, uniqueness: { scope: :dataset_id }
30
31
 
31
32
  before_save :ensure_valid_datatype
33
+ after_create :set_date_column_if_date_splitter
34
+ after_save :handle_date_column_change
32
35
 
33
36
  # Scopes
34
37
  scope :visible, -> { where(hidden: false) }
35
38
  scope :numeric, -> { where(datatype: %w[float integer]) }
36
39
  scope :categorical, -> { where(datatype: %w[categorical string boolean]) }
37
40
  scope :datetime, -> { where(datatype: "datetime") }
41
+ scope :date_column, -> { where(is_date_column: true) }
38
42
 
39
43
  def datatype=(dtype)
40
44
  write_attribute(:datatype, dtype)
@@ -88,8 +92,26 @@ module EasyML
88
92
  dataset.preprocessor.statistics.dup.to_h.dig(name.to_sym, :allowed_categories).sort.concat(["other"])
89
93
  end
90
94
 
95
+ def date_column?
96
+ is_date_column
97
+ end
98
+
91
99
  private
92
100
 
101
+ def set_date_column_if_date_splitter
102
+ binding.pry
103
+ end
104
+
105
+ def handle_date_column_change
106
+ return unless saved_change_to_is_date_column? && is_date_column?
107
+
108
+ Column.transaction do
109
+ dataset.columns.where.not(id: id).update_all(is_date_column: false)
110
+ dataset.learn_statistics
111
+ dataset.columns.sync
112
+ end
113
+ end
114
+
93
115
  def ensure_valid_datatype
94
116
  return if datatype.blank?
95
117
 
data/app/models/easy_ml/column_history.rb CHANGED
@@ -21,6 +21,7 @@
21
21
  # history_ended_at :datetime
22
22
  # history_user_id :integer
23
23
  # snapshot_id :string
24
+ # is_date_column :boolean default(FALSE)
24
25
  #
25
26
  module EasyML
26
27
  class ColumnHistory < ActiveRecord::Base
data/app/models/easy_ml/column_list.rb CHANGED
@@ -9,6 +9,9 @@ module EasyML
9
9
  import_new(col_names, existing_columns)
10
10
  update_existing(existing_columns)
11
11
  delete_missing(existing_columns)
12
+ if existing_columns.none? # Totally new dataset
13
+ dataset.after_create_columns
14
+ end
12
15
  end
13
16
  end
14
17
 
data/app/models/easy_ml/dataset.rb CHANGED
@@ -336,9 +336,12 @@ module EasyML
336
336
  end
337
337
 
338
338
  def learn_statistics
339
- update(
340
- statistics: EasyML::Data::StatisticsLearner.learn(raw, processed),
341
- )
339
+ stats = {
340
+ raw: EasyML::Data::StatisticsLearner.learn(raw, self),
341
+ }
342
+ stats.merge!(processed: EasyML::Data::StatisticsLearner.learn(processed, self)) if processed.data.present?
343
+
344
+ update(statistics: stats)
342
345
  end
343
346
 
344
347
  def process_data
@@ -508,6 +511,10 @@ module EasyML
508
511
  @target ||= preloaded_columns.find(&:is_target)&.name
509
512
  end
510
513
 
514
+ def date_column
515
+ @date_column ||= columns.date_column.first
516
+ end
517
+
511
518
  def drop_cols
512
519
  @drop_cols ||= preloaded_columns.select(&:hidden).map(&:name)
513
520
  end
@@ -588,8 +595,18 @@ module EasyML
588
595
  self
589
596
  end
590
597
 
598
+ def after_create_columns
599
+ apply_date_splitter_config
600
+ end
601
+
591
602
  private
592
603
 
604
+ def apply_date_splitter_config
605
+ return unless splitter.date_splitter?
606
+
607
+ set_date_column(splitter.date_col)
608
+ end
609
+
593
610
  def preloaded_features
594
611
  @preloaded_features ||= features.includes(:dataset).load
595
612
  end
@@ -670,10 +687,8 @@ module EasyML
670
687
  end
671
688
  end
672
689
 
673
- def fit(xs = nil)
674
- xs = raw.train(all_columns: true) if xs.nil?
675
-
676
- preprocessor.fit(xs)
690
+ def fit
691
+ preprocessor.fit(raw.train(all_columns: true))
677
692
  self.preprocessor_statistics = preprocessor.statistics
678
693
  end
679
694
 
@@ -712,6 +727,12 @@ module EasyML
712
727
  end
713
728
  end
714
729
 
730
+ def set_date_column(column_name)
731
+ return unless column_name.present?
732
+
733
+ columns.find_by(name: column_name).update(is_date_column: true)
734
+ end
735
+
715
736
  def apply_features(df, features = self.features)
716
737
  if features.nil? || features.empty?
717
738
  df
@@ -753,6 +774,7 @@ module EasyML
753
774
  EasyML::Data::Preprocessor.new(
754
775
  directory: Pathname.new(root_dir).append("preprocessor"),
755
776
  preprocessing_steps: preprocessing_steps,
777
+ dataset: self,
756
778
  ).tap do |preprocessor|
757
779
  preprocessor.statistics = preprocessor_statistics
758
780
  end
data/app/models/easy_ml/event_context.rb CHANGED
@@ -5,6 +5,7 @@
5
5
  # id :bigint not null, primary key
6
6
  # event_id :bigint not null
7
7
  # context :jsonb not null
8
+ # format :string
8
9
  # created_at :datetime not null
9
10
  # updated_at :datetime not null
10
11
  #
data/app/models/easy_ml/feature_history.rb CHANGED
@@ -22,6 +22,7 @@
22
22
  # history_ended_at :datetime
23
23
  # history_user_id :integer
24
24
  # snapshot_id :string
25
+ # workflow_status :string
25
26
  #
26
27
  module EasyML
27
28
  class FeatureHistory < ActiveRecord::Base
data/app/models/easy_ml/model_file.rb CHANGED
@@ -4,7 +4,6 @@
4
4
  #
5
5
  # id :bigint not null, primary key
6
6
  # filename :string not null
7
- # path :string not null
8
7
  # configuration :json
9
8
  # model_type :string
10
9
  # model_id :bigint
data/app/models/easy_ml/model_file_history.rb CHANGED
@@ -5,7 +5,6 @@
5
5
  # id :bigint not null, primary key
6
6
  # model_file_id :integer not null
7
7
  # filename :string not null
8
- # path :string not null
9
8
  # configuration :json
10
9
  # model_type :string
11
10
  # model_id :integer
data/app/models/easy_ml/splitter.rb CHANGED
@@ -52,6 +52,18 @@ module EasyML
52
52
  add_configuration_attributes attribute
53
53
  end
54
54
 
55
+ def date_splitter?
56
+ splitter_type == "date"
57
+ end
58
+
59
+ def random_splitter?
60
+ splitter_type == "random"
61
+ end
62
+
63
+ def predefined_splitter?
64
+ splitter_type == "predefined"
65
+ end
66
+
55
67
  def self.constants
56
68
  {
57
69
  SPLITTER_TYPES: SPLITTER_TYPES,
data/app/models/easy_ml/splitters/date_splitter.rb CHANGED
@@ -86,6 +86,10 @@ module EasyML
86
86
  def today
87
87
  to_datetime(@today, default: UTC.today)
88
88
  end
89
+
90
+ def update_dataset_date_col
91
+ dataset.apply_date_splitter_config
92
+ end
89
93
  end
90
94
  end
91
95
  end
data/app/serializers/easy_ml/dataset_serializer.rb CHANGED
@@ -29,6 +29,10 @@ module EasyML
29
29
  attributes :id, :name, :description, :target, :num_rows, :status,
30
30
  :datasource_id, :preprocessing_steps, :workflow_status, :statistics
31
31
 
32
+ attribute :date_column do |dataset|
33
+ dataset.date_column&.name
34
+ end
35
+
32
36
  attribute :splitter do |dataset|
33
37
  dataset.splitter
34
38
  end
data/config/initializers/dataframe.rb ADDED
@@ -0,0 +1,9 @@
1
+ class Polars::DataFrame
2
+ def to_hash
3
+ if columns.count == 2
4
+ to_a.map(&:values).to_h
5
+ else
6
+ self
7
+ end
8
+ end
9
+ end
data/lib/easy_ml/data/preprocessor.rb CHANGED
@@ -18,16 +18,19 @@ module EasyML::Data
18
18
 
19
19
  PREPROCESSING_STRATEGIES = {
20
20
  float: [
21
+ { value: "ffill", label: "Forward Fill" },
21
22
  { value: "mean", label: "Mean" },
22
23
  { value: "median", label: "Median" },
23
24
  { value: "constant", label: "Constant Value" },
24
25
  ],
25
26
  integer: [
27
+ { value: "ffill", label: "Forward Fill" },
26
28
  { value: "mean", label: "Mean" },
27
29
  { value: "median", label: "Median" },
28
30
  { value: "constant", label: "Constant Value" },
29
31
  ],
30
32
  boolean: [
33
+ { value: "ffill", label: "Forward Fill" },
31
34
  { value: "most_frequent", label: "Most Frequent" },
32
35
  { value: "constant", label: "Constant Value" },
33
36
  ],
@@ -37,21 +40,24 @@ module EasyML::Data
37
40
  { value: "today", label: "Current Date" },
38
41
  ],
39
42
  string: [
43
+ { value: "ffill", label: "Forward Fill" },
40
44
  { value: "most_frequent", label: "Most Frequent" },
41
45
  { value: "constant", label: "Constant Value" },
42
46
  ],
43
47
  text: [
48
+ { value: "ffill", label: "Forward Fill" },
44
49
  { value: "most_frequent", label: "Most Frequent" },
45
50
  { value: "constant", label: "Constant Value" },
46
51
  ],
47
52
  categorical: [
53
+ { value: "ffill", label: "Forward Fill" },
48
54
  { value: "categorical", label: "Categorical" },
49
55
  { value: "most_frequent", label: "Most Frequent" },
50
56
  { value: "constant", label: "Constant Value" },
51
57
  ],
52
58
  }.freeze
53
59
 
54
- attr_accessor :directory, :verbose, :imputers, :preprocessing_steps
60
+ attr_accessor :directory, :verbose, :imputers, :preprocessing_steps, :dataset
55
61
  attr_reader :statistics
56
62
 
57
63
  def initialize(options = {})
@@ -59,6 +65,7 @@ module EasyML::Data
59
65
  @verbose = options[:verbose]
60
66
  @imputers = options[:imputers]
61
67
  @preprocessing_steps = options[:preprocessing_steps]
68
+ @dataset = options[:dataset]
62
69
  @statistics = {}
63
70
  end
64
71
 
@@ -110,7 +117,7 @@ module EasyML::Data
110
117
  df = apply_clip(df, preprocessing_steps)
111
118
  allowed_categories = learn_categorical_min(df, preprocessing_steps)
112
119
 
113
- self.statistics = StatisticsLearner.learn_df(df).deep_symbolize_keys
120
+ self.statistics = StatisticsLearner.learn_df(df, dataset: dataset).deep_symbolize_keys
114
121
 
115
122
  # Merge allowed categories into statistics
116
123
  allowed_categories.each do |col, categories|
data/lib/easy_ml/data/statistics_learner.rb CHANGED
@@ -9,28 +9,55 @@ module EasyML::Data
9
9
  @verbose = options[:verbose]
10
10
  end
11
11
 
12
- def self.learn(raw, processed)
13
- output = { raw: learn_split(raw) }
14
- output[:processed] = learn_split(processed) if processed.data.present?
15
- output
12
+ def self.learn(df, dataset = nil)
13
+ new(df, dataset).learn
16
14
  end
17
15
 
18
- def self.learn_split(split)
16
+ attr_reader :df, :dataset
17
+
18
+ def initialize(df, dataset)
19
+ @df = df
20
+ @dataset = dataset
21
+ end
22
+
23
+ def learn
24
+ learn_split(df)
25
+ end
26
+
27
+ def learn_split(split)
19
28
  df = split.read(:all)
20
29
  train_df = split.read(:train)
21
- all_stats = learn_df(df)
22
- train_stats = learn_df(train_df)
30
+ all_stats = learn_df(df, dataset: dataset)
31
+ train_stats = learn_df(train_df, dataset: dataset)
23
32
 
24
33
  all_stats.reduce({}) do |output, (k, _)|
25
34
  output.tap do
26
35
  output[k] = all_stats[k].slice(:num_rows, :null_count, :unique_count, :counts).merge!(
27
- train_stats[k].slice(:mean, :median, :min, :max, :std, :last_value, :most_frequent_value)
36
+ train_stats[k].slice(:mean, :median, :min, :max, :std, :last_value, :most_frequent_value, :last_known_value)
28
37
  )
29
38
  end
30
39
  end
31
40
  end
32
41
 
33
- def self.learn_df(df)
42
+ def last_known_value(df, col, date_col)
43
+ return nil if df.empty? || !df.columns.include?(date_col)
44
+
45
+ # Sort by date and get the last non-null value
46
+ sorted_df = df.sort(date_col, reverse: true)
47
+ last_value = sorted_df
48
+ .filter(Polars.col(col).is_not_null)
49
+ .select(col)
50
+ .head(1)
51
+ .item
52
+
53
+ last_value
54
+ end
55
+
56
+ def learn_df(df, dataset: nil)
57
+ self.class.learn_df(df, dataset: dataset)
58
+ end
59
+
60
+ def self.learn_df(df, dataset: nil)
34
61
  return if df.nil?
35
62
 
36
63
  base_stats = describe_to_h(df).deep_symbolize_keys
@@ -46,6 +73,10 @@ module EasyML::Data
46
73
  null_count: base_stats[col.to_sym][:null_count].to_i,
47
74
  }
48
75
 
76
+ if dataset&.date_column.present?
77
+ stats[col][:last_value] = last_value(df, col, dataset.date_column.name)
78
+ end
79
+
49
80
  # Add type-specific statistics
50
81
  case field_type
51
82
  when :integer, :float
@@ -77,6 +108,10 @@ module EasyML::Data
77
108
  col.match?(/^id$/) || col.match?(/.*_id/)
78
109
  end
79
110
 
111
+ def self.last_value(df, col, date_col)
112
+ df.filter(Polars.col(col).is_not_null).sort(date_col)[col][-1]
113
+ end
114
+
80
115
  def self.describe_to_h(df)
81
116
  init_h = df.describe.to_h
82
117
  rows = init_h.values.map(&:to_a)
data/lib/easy_ml/engine.rb CHANGED
@@ -18,11 +18,16 @@ require "resque-batched-job"
18
18
  require "rake"
19
19
  require "resque/tasks"
20
20
  require "zhong"
21
+ require "vite_ruby"
22
+ require "vite_rails"
23
+ require "dotenv"
21
24
 
22
25
  module EasyML
23
26
  class Engine < Rails::Engine
24
27
  isolate_namespace EasyML
25
28
 
29
+ Dotenv.load if File.exist?(".env")
30
+
26
31
  def root_dir
27
32
  Rails.root.join("easy_ml")
28
33
  end
@@ -118,7 +123,10 @@ module EasyML
118
123
  end
119
124
 
120
125
  puts "Running dev proxy"
121
- config.app_middleware.insert_before 0, ViteRuby::DevServerProxy, ssl_verify_none: true, vite_ruby: vite_ruby
126
+ config.app_middleware.insert_before 0,
127
+ ViteRuby::DevServerProxy,
128
+ vite_ruby: vite_ruby,
129
+ ssl_verify_none: true
122
130
  else
123
131
  config.app_middleware.use(
124
132
  Rack::Static,