polars-df 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,8 @@
1
1
  module Polars
2
2
  # Two-dimensional data structure representing data as a table with rows and columns.
3
3
  class DataFrame
4
+ include Plot
5
+
4
6
  # @private
5
7
  attr_accessor :_df
6
8
 
@@ -95,7 +97,7 @@ module Polars
95
97
  eol_char: "\n"
96
98
  )
97
99
  if Utils.pathlike?(file)
98
- path = Utils.format_path(file)
100
+ path = Utils.normalise_filepath(file)
99
101
  else
100
102
  path = nil
101
103
  # if defined?(StringIO) && file.is_a?(StringIO)
@@ -194,32 +196,56 @@ module Polars
194
196
 
195
197
  # @private
196
198
  def self._read_parquet(
197
- file,
199
+ source,
198
200
  columns: nil,
199
201
  n_rows: nil,
200
202
  parallel: "auto",
201
203
  row_count_name: nil,
202
204
  row_count_offset: 0,
203
- low_memory: false
205
+ low_memory: false,
206
+ use_statistics: true,
207
+ rechunk: true
204
208
  )
205
- if Utils.pathlike?(file)
206
- file = Utils.format_path(file)
209
+ if Utils.pathlike?(source)
210
+ source = Utils.normalise_filepath(source)
211
+ end
212
+ if columns.is_a?(String)
213
+ columns = [columns]
207
214
  end
208
215
 
209
- if file.is_a?(String) && file.include?("*")
210
- raise Todo
216
+ if source.is_a?(String) && source.include?("*") && Utils.local_file?(source)
217
+ scan =
218
+ Polars.scan_parquet(
219
+ source,
220
+ n_rows: n_rows,
221
+ rechunk: true,
222
+ parallel: parallel,
223
+ row_count_name: row_count_name,
224
+ row_count_offset: row_count_offset,
225
+ low_memory: low_memory
226
+ )
227
+
228
+ if columns.nil?
229
+ return self._from_rbdf(scan.collect._df)
230
+ elsif Utils.is_str_sequence(columns, allow_str: false)
231
+ return self._from_rbdf(scan.select(columns).collect._df)
232
+ else
233
+ raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
234
+ end
211
235
  end
212
236
 
213
237
  projection, columns = Utils.handle_projection_columns(columns)
214
238
  _from_rbdf(
215
239
  RbDataFrame.read_parquet(
216
- file,
240
+ source,
217
241
  columns,
218
242
  projection,
219
243
  n_rows,
220
244
  parallel,
221
245
  Utils._prepare_row_count_args(row_count_name, row_count_offset),
222
- low_memory
246
+ low_memory,
247
+ use_statistics,
248
+ rechunk
223
249
  )
224
250
  )
225
251
  end
@@ -227,7 +253,7 @@ module Polars
227
253
  # @private
228
254
  def self._read_avro(file, columns: nil, n_rows: nil)
229
255
  if Utils.pathlike?(file)
230
- file = Utils.format_path(file)
256
+ file = Utils.normalise_filepath(file)
231
257
  end
232
258
  projection, columns = Utils.handle_projection_columns(columns)
233
259
  _from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
@@ -244,7 +270,7 @@ module Polars
244
270
  memory_map: true
245
271
  )
246
272
  if Utils.pathlike?(file)
247
- file = Utils.format_path(file)
273
+ file = Utils.normalise_filepath(file)
248
274
  end
249
275
  if columns.is_a?(String)
250
276
  columns = [columns]
@@ -270,7 +296,7 @@ module Polars
270
296
  # @private
271
297
  def self._read_json(file)
272
298
  if Utils.pathlike?(file)
273
- file = Utils.format_path(file)
299
+ file = Utils.normalise_filepath(file)
274
300
  end
275
301
 
276
302
  _from_rbdf(RbDataFrame.read_json(file))
@@ -279,7 +305,7 @@ module Polars
279
305
  # @private
280
306
  def self._read_ndjson(file)
281
307
  if Utils.pathlike?(file)
282
- file = Utils.format_path(file)
308
+ file = Utils.normalise_filepath(file)
283
309
  end
284
310
 
285
311
  _from_rbdf(RbDataFrame.read_ndjson(file))
@@ -604,10 +630,10 @@ module Polars
604
630
  return Slice.new(self).apply(item)
605
631
  end
606
632
 
607
- if Utils.is_str_sequence(item, allow_str: false)
633
+ if item.is_a?(Array) && item.all? { |v| Utils.strlike?(v) }
608
634
  # select multiple columns
609
635
  # df[["foo", "bar"]]
610
- return _from_rbdf(_df.select(item))
636
+ return _from_rbdf(_df.select(item.map(&:to_s)))
611
637
  end
612
638
 
613
639
  if Utils.is_int_sequence(item)
@@ -689,7 +715,8 @@ module Polars
689
715
  # @example
690
716
  # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
691
717
  # df.to_hashes
692
- # [{'foo': 1, 'bar': 4}, {'foo': 2, 'bar': 5}, {'foo': 3, 'bar': 6}]
718
+ # # =>
719
+ # # [{"foo"=>1, "bar"=>4}, {"foo"=>2, "bar"=>5}, {"foo"=>3, "bar"=>6}]
693
720
  def to_hashes
694
721
  rbdf = _df
695
722
  names = columns
@@ -699,8 +726,26 @@ module Polars
699
726
  end
700
727
  end
701
728
 
702
- # def to_numo
703
- # end
729
+ # Convert DataFrame to a 2D Numo array.
730
+ #
731
+ # This operation clones data.
732
+ #
733
+ # @return [Numo::NArray]
734
+ #
735
+ # @example
736
+ # df = Polars::DataFrame.new(
737
+ # {"foo" => [1, 2, 3], "bar" => [6, 7, 8], "ham" => ["a", "b", "c"]}
738
+ # )
739
+ # df.to_numo.class
740
+ # # => Numo::RObject
741
+ def to_numo
742
+ out = _df.to_numo
743
+ if out.nil?
744
+ Numo::NArray.vstack(width.times.map { |i| to_series(i).to_numo }).transpose
745
+ else
746
+ out
747
+ end
748
+ end
704
749
 
705
750
  # no to_pandas
706
751
 
@@ -753,7 +798,7 @@ module Polars
753
798
  row_oriented: false
754
799
  )
755
800
  if Utils.pathlike?(file)
756
- file = Utils.format_path(file)
801
+ file = Utils.normalise_filepath(file)
757
802
  end
758
803
 
759
804
  _df.write_json(file, pretty, row_oriented)
@@ -768,7 +813,7 @@ module Polars
768
813
  # @return [nil]
769
814
  def write_ndjson(file)
770
815
  if Utils.pathlike?(file)
771
- file = Utils.format_path(file)
816
+ file = Utils.normalise_filepath(file)
772
817
  end
773
818
 
774
819
  _df.write_ndjson(file)
@@ -858,7 +903,7 @@ module Polars
858
903
  end
859
904
 
860
905
  if Utils.pathlike?(file)
861
- file = Utils.format_path(file)
906
+ file = Utils.normalise_filepath(file)
862
907
  end
863
908
 
864
909
  _df.write_csv(
@@ -896,7 +941,7 @@ module Polars
896
941
  compression = "uncompressed"
897
942
  end
898
943
  if Utils.pathlike?(file)
899
- file = Utils.format_path(file)
944
+ file = Utils.normalise_filepath(file)
900
945
  end
901
946
 
902
947
  _df.write_avro(file, compression)
@@ -915,7 +960,7 @@ module Polars
915
960
  compression = "uncompressed"
916
961
  end
917
962
  if Utils.pathlike?(file)
918
- file = Utils.format_path(file)
963
+ file = Utils.normalise_filepath(file)
919
964
  end
920
965
 
921
966
  _df.write_ipc(file, compression)
@@ -957,7 +1002,7 @@ module Polars
957
1002
  compression = "uncompressed"
958
1003
  end
959
1004
  if Utils.pathlike?(file)
960
- file = Utils.format_path(file)
1005
+ file = Utils.normalise_filepath(file)
961
1006
  end
962
1007
 
963
1008
  _df.write_parquet(
@@ -3021,24 +3066,28 @@ module Polars
3021
3066
  if aggregate_fn.is_a?(String)
3022
3067
  case aggregate_fn
3023
3068
  when "first"
3024
- aggregate_fn = Polars.element.first
3069
+ aggregate_expr = Polars.element.first._rbexpr
3025
3070
  when "sum"
3026
- aggregate_fn = Polars.element.sum
3071
+ aggregate_expr = Polars.element.sum._rbexpr
3027
3072
  when "max"
3028
- aggregate_fn = Polars.element.max
3073
+ aggregate_expr = Polars.element.max._rbexpr
3029
3074
  when "min"
3030
- aggregate_fn = Polars.element.min
3075
+ aggregate_expr = Polars.element.min._rbexpr
3031
3076
  when "mean"
3032
- aggregate_fn = Polars.element.mean
3077
+ aggregate_expr = Polars.element.mean._rbexpr
3033
3078
  when "median"
3034
- aggregate_fn = Polars.element.median
3079
+ aggregate_expr = Polars.element.median._rbexpr
3035
3080
  when "last"
3036
- aggregate_fn = Polars.element.last
3081
+ aggregate_expr = Polars.element.last._rbexpr
3037
3082
  when "count"
3038
- aggregate_fn = Polars.count
3083
+ aggregate_expr = Polars.count._rbexpr
3039
3084
  else
3040
3085
  raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
3041
3086
  end
3087
+ elsif aggregate_fn.nil?
3088
+ aggregate_expr = nil
3089
+ else
3090
+ aggregate_expr = aggregate_function._rbexpr
3042
3091
  end
3043
3092
 
3044
3093
  _from_rbdf(
@@ -3046,9 +3095,9 @@ module Polars
3046
3095
  values,
3047
3096
  index,
3048
3097
  columns,
3049
- aggregate_fn._rbexpr,
3050
3098
  maintain_order,
3051
3099
  sort_columns,
3100
+ aggregate_expr,
3052
3101
  separator
3053
3102
  )
3054
3103
  )
@@ -3153,7 +3202,7 @@ module Polars
3153
3202
  # # │ B ┆ 1 │
3154
3203
  # # │ C ┆ 2 │
3155
3204
  # # │ D ┆ 3 │
3156
- # # │ ... ┆ ... │
3205
+ # # │ … ┆ … │
3157
3206
  # # │ F ┆ 5 │
3158
3207
  # # │ G ┆ 6 │
3159
3208
  # # │ H ┆ 7 │
@@ -4032,15 +4081,12 @@ module Polars
4032
4081
  # # │ 5 ┆ 3.0 ┆ true │
4033
4082
  # # └─────┴─────┴───────┘
4034
4083
  def unique(maintain_order: true, subset: nil, keep: "first")
4035
- if !subset.nil?
4036
- if subset.is_a?(String)
4037
- subset = [subset]
4038
- elsif !subset.is_a?(Array)
4039
- subset = subset.to_a
4040
- end
4041
- end
4042
-
4043
- _from_rbdf(_df.unique(maintain_order, subset, keep))
4084
+ self._from_rbdf(
4085
+ lazy
4086
+ .unique(maintain_order: maintain_order, subset: subset, keep: keep)
4087
+ .collect(no_optimization: true)
4088
+ ._df
4089
+ )
4044
4090
  end
4045
4091
 
4046
4092
  # Return the number of unique rows, or the number of unique row-subsets.
@@ -84,6 +84,8 @@ module Polars
84
84
 
85
85
  # Calendar date and time type.
86
86
  class Datetime < TemporalType
87
+ attr_reader :tu
88
+
87
89
  def initialize(time_unit = "us", time_zone = nil)
88
90
  @tu = time_unit || "us"
89
91
  @time_zone = time_zone
@@ -92,6 +94,8 @@ module Polars
92
94
 
93
95
  # Time duration/delta type.
94
96
  class Duration < TemporalType
97
+ attr_reader :tu
98
+
95
99
  def initialize(time_unit = "us")
96
100
  @tu = time_unit
97
101
  end
@@ -1130,7 +1130,7 @@ module Polars
1130
1130
  # ]
1131
1131
  # )
1132
1132
  # # =>
1133
- # # shape: (1001, 2)
1133
+ # # shape: (1_001, 2)
1134
1134
  # # ┌─────────────────────────┬───────────────────┐
1135
1135
  # # │ date ┆ milliseconds_diff │
1136
1136
  # # │ --- ┆ --- │
@@ -1140,7 +1140,7 @@ module Polars
1140
1140
  # # │ 2020-01-01 00:00:00.001 ┆ 1 │
1141
1141
  # # │ 2020-01-01 00:00:00.002 ┆ 1 │
1142
1142
  # # │ 2020-01-01 00:00:00.003 ┆ 1 │
1143
- # # │ ... ┆ ... │
1143
+ # # │ … ┆ … │
1144
1144
  # # │ 2020-01-01 00:00:00.997 ┆ 1 │
1145
1145
  # # │ 2020-01-01 00:00:00.998 ┆ 1 │
1146
1146
  # # │ 2020-01-01 00:00:00.999 ┆ 1 │
@@ -1169,7 +1169,7 @@ module Polars
1169
1169
  # ]
1170
1170
  # )
1171
1171
  # # =>
1172
- # # shape: (1001, 2)
1172
+ # # shape: (1_001, 2)
1173
1173
  # # ┌─────────────────────────┬───────────────────┐
1174
1174
  # # │ date ┆ microseconds_diff │
1175
1175
  # # │ --- ┆ --- │
@@ -1179,7 +1179,7 @@ module Polars
1179
1179
  # # │ 2020-01-01 00:00:00.001 ┆ 1000 │
1180
1180
  # # │ 2020-01-01 00:00:00.002 ┆ 1000 │
1181
1181
  # # │ 2020-01-01 00:00:00.003 ┆ 1000 │
1182
- # # │ ... ┆ ... │
1182
+ # # │ … ┆ … │
1183
1183
  # # │ 2020-01-01 00:00:00.997 ┆ 1000 │
1184
1184
  # # │ 2020-01-01 00:00:00.998 ┆ 1000 │
1185
1185
  # # │ 2020-01-01 00:00:00.999 ┆ 1000 │
@@ -1208,7 +1208,7 @@ module Polars
1208
1208
  # ]
1209
1209
  # )
1210
1210
  # # =>
1211
- # # shape: (1001, 2)
1211
+ # # shape: (1_001, 2)
1212
1212
  # # ┌─────────────────────────┬──────────────────┐
1213
1213
  # # │ date ┆ nanoseconds_diff │
1214
1214
  # # │ --- ┆ --- │
@@ -1218,7 +1218,7 @@ module Polars
1218
1218
  # # │ 2020-01-01 00:00:00.001 ┆ 1000000 │
1219
1219
  # # │ 2020-01-01 00:00:00.002 ┆ 1000000 │
1220
1220
  # # │ 2020-01-01 00:00:00.003 ┆ 1000000 │
1221
- # # │ ... ┆ ... │
1221
+ # # │ … ┆ … │
1222
1222
  # # │ 2020-01-01 00:00:00.997 ┆ 1000000 │
1223
1223
  # # │ 2020-01-01 00:00:00.998 ┆ 1000000 │
1224
1224
  # # │ 2020-01-01 00:00:00.999 ┆ 1000000 │
data/lib/polars/expr.rb CHANGED
@@ -2194,7 +2194,7 @@ module Polars
2194
2194
  # # │ 4 │
2195
2195
  # # │ 6 │
2196
2196
  # # │ 6 │
2197
- # # │ ... │
2197
+ # # │ … │
2198
2198
  # # │ 6 │
2199
2199
  # # │ 6 │
2200
2200
  # # │ 6 │
@@ -2571,7 +2571,7 @@ module Polars
2571
2571
  # # │ e │
2572
2572
  # # │ l │
2573
2573
  # # │ l │
2574
- # # │ ... │
2574
+ # # │ … │
2575
2575
  # # │ o │
2576
2576
  # # │ r │
2577
2577
  # # │ l │
@@ -4962,6 +4962,13 @@ module Polars
4962
4962
  ListExpr.new(self)
4963
4963
  end
4964
4964
 
4965
+ # Create an object namespace of all binary related methods.
4966
+ #
4967
+ # @return [BinaryExpr]
4968
+ def bin
4969
+ BinaryExpr.new(self)
4970
+ end
4971
+
4965
4972
  # Create an object namespace of all categorical related methods.
4966
4973
  #
4967
4974
  # @return [CatExpr]
@@ -571,5 +571,16 @@ module Polars
571
571
  def agg_list
572
572
  agg(Polars.all.list)
573
573
  end
574
+
575
+ # Plot data.
576
+ #
577
+ # @return [Vega::LiteChart]
578
+ def plot(*args, **options)
579
+ raise ArgumentError, "Multiple groups not supported" if by.is_a?(Array) && by.size > 1
580
+ # same message as Ruby
581
+ raise ArgumentError, "unknown keyword: :group" if options.key?(:group)
582
+
583
+ Utils.wrap_df(_df).plot(*args, **options, group: by)
584
+ end
574
585
  end
575
586
  end