polars-df 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,22 @@
  module Polars
+ # Two-dimensional data structure representing data as a table with rows and columns.
  class DataFrame
+ # @private
  attr_accessor :_df
 
- def initialize(data = nil)
+ # Create a new DataFrame.
+ #
+ # @param data [Hash, Array, Series, nil]
+ # Two-dimensional data in various forms. Hash must contain Arrays.
+ # Array may contain Series.
+ # @param columns [Array, Hash, nil]
+ # Column labels to use for resulting DataFrame. If specified, overrides any
+ # labels already present in the data. Must match data dimensions.
+ # @param orient ["col", "row", nil]
+ # Whether to interpret two-dimensional data as columns or as rows. If `nil`,
+ # the orientation is inferred by matching the columns and data dimensions. If
+ # this does not yield conclusive results, column orientation is used.
+ def initialize(data = nil, columns: nil, orient: nil)
  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
  result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
  data = {}
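The expanded constructor above is the headline change of this release. A minimal usage sketch based on the new docs (note that in 0.1.2 only the Hash form of `columns`, mapping names to dtypes, appears wired up for Hash data — see the private `hash_to_rbdf` at the end of this diff — while `columns`/`orient` on Array data still raise `Todo`):

    df = Polars::DataFrame.new(
      {"a" => [1, 2, 3]},
      columns: {"a" => :f64}   # name => dtype overrides
    )
    df.schema
    # => {"a"=>:f64}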
@@ -12,33 +26,135 @@ module Polars
  end
 
  if data.nil?
- self._df = hash_to_rbdf({})
+ self._df = hash_to_rbdf({}, columns: columns)
  elsif data.is_a?(Hash)
  data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
- self._df = hash_to_rbdf(data)
+ self._df = hash_to_rbdf(data, columns: columns)
  elsif data.is_a?(Array)
- self._df = sequence_to_rbdf(data)
+ self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
  elsif data.is_a?(Series)
- self._df = series_to_rbdf(data)
+ self._df = series_to_rbdf(data, columns: columns)
  else
  raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
  end
  end
 
+ # @private
  def self._from_rbdf(rb_df)
  df = DataFrame.allocate
  df._df = rb_df
  df
  end
 
- def self._read_csv(file, has_header: true)
+ # def self._from_hashes
+ # end
+
+ # def self._from_hash
+ # end
+
+ # def self._from_records
+ # end
+
+ # def self._from_numo
+ # end
+
+ # no self._from_arrow
+
+ # no self._from_pandas
+
+ # @private
+ def self._read_csv(
+ file,
+ has_header: true,
+ columns: nil,
+ sep: ",",
+ comment_char: nil,
+ quote_char: '"',
+ skip_rows: 0,
+ dtypes: nil,
+ null_values: nil,
+ ignore_errors: false,
+ parse_dates: false,
+ n_threads: nil,
+ infer_schema_length: 100,
+ batch_size: 8192,
+ n_rows: nil,
+ encoding: "utf8",
+ low_memory: false,
+ rechunk: true,
+ skip_rows_after_header: 0,
+ row_count_name: nil,
+ row_count_offset: 0,
+ sample_size: 1024,
+ eol_char: "\n"
+ )
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
- file = Utils.format_path(file)
+ path = Utils.format_path(file)
+ else
+ path = nil
+ # if defined?(StringIO) && file.is_a?(StringIO)
+ # file = file.string
+ # end
+ end
+
+ dtype_list = nil
+ dtype_slice = nil
+ if !dtypes.nil?
+ if dtypes.is_a?(Hash)
+ dtype_list = []
+ dtypes.each do |k, v|
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
+ end
+ elsif dtypes.is_a?(Array)
+ dtype_slice = dtypes
+ else
+ raise ArgumentError, "dtypes arg should be a Hash or Array"
+ end
  end
 
- _from_rbdf(RbDataFrame.read_csv(file, has_header))
+ processed_null_values = Utils._process_null_values(null_values)
+
+ if columns.is_a?(String)
+ columns = [columns]
+ end
+ if file.is_a?(String) && file.include?("*")
+ raise Todo
+ end
+
+ projection, columns = Utils.handle_projection_columns(columns)
+
+ _from_rbdf(
+ RbDataFrame.read_csv(
+ file,
+ infer_schema_length,
+ batch_size,
+ has_header,
+ ignore_errors,
+ n_rows,
+ skip_rows,
+ projection,
+ sep,
+ rechunk,
+ columns,
+ encoding,
+ n_threads,
+ path,
+ dtype_list,
+ dtype_slice,
+ low_memory,
+ comment_char,
+ quote_char,
+ processed_null_values,
+ parse_dates,
+ skip_rows_after_header,
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
+ sample_size,
+ eol_char
+ )
+ )
  end
 
+ # @private
  def self._read_parquet(file)
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
  file = Utils.format_path(file)
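Per the branching above, `_read_csv` accepts `dtypes` either as a name-keyed Hash or as a positional Array. A hedged sketch of both forms (`_read_csv` is marked `@private`, so normal callers would reach it through the gem's public CSV reader):

    # per-column overrides by name (builds dtype_list)
    Polars::DataFrame._read_csv("data.csv", dtypes: {"id" => :i64})

    # positional overrides, one entry per column (becomes dtype_slice)
    Polars::DataFrame._read_csv("data.csv", dtypes: [:i64, :str])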
@@ -47,6 +163,44 @@ module Polars
  _from_rbdf(RbDataFrame.read_parquet(file))
  end
 
+ # def self._read_avro
+ # end
+
+ # @private
+ def self._read_ipc(
+ file,
+ columns: nil,
+ n_rows: nil,
+ row_count_name: nil,
+ row_count_offset: 0,
+ rechunk: true,
+ memory_map: true
+ )
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+ file = Utils.format_path(file)
+ end
+ if columns.is_a?(String)
+ columns = [columns]
+ end
+
+ if file.is_a?(String) && file.include?("*")
+ raise Todo
+ end
+
+ projection, columns = Utils.handle_projection_columns(columns)
+ _from_rbdf(
+ RbDataFrame.read_ipc(
+ file,
+ columns,
+ projection,
+ n_rows,
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
+ memory_map
+ )
+ )
+ end
+
+ # @private
  def self._read_json(file)
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
  file = Utils.format_path(file)
@@ -55,6 +209,7 @@ module Polars
  _from_rbdf(RbDataFrame.read_json(file))
  end
 
+ # @private
  def self._read_ndjson(file)
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
  file = Utils.format_path(file)
@@ -63,26 +218,157 @@ module Polars
  _from_rbdf(RbDataFrame.read_ndjson(file))
  end
 
+ # Get the shape of the DataFrame.
+ #
+ # @return [Array]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
+ # df.shape
+ # # => [5, 1]
  def shape
  _df.shape
  end
 
+ # Get the height of the DataFrame.
+ #
+ # @return [Integer]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
+ # df.height
+ # # => 5
  def height
  _df.height
  end
 
+ # Get the width of the DataFrame.
+ #
+ # @return [Integer]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
+ # df.width
+ # # => 1
  def width
  _df.width
  end
 
+ # Get column names.
+ #
+ # @return [Array]
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "foo" => [1, 2, 3],
+ # "bar" => [6, 7, 8],
+ # "ham" => ["a", "b", "c"]
+ # })
+ # df.columns
+ # # => ["foo", "bar", "ham"]
  def columns
  _df.columns
  end
 
+ # Change the column names of the DataFrame.
+ #
+ # @param columns [Array]
+ # A list with new names for the DataFrame.
+ # The length of the list should be equal to the width of the DataFrame.
+ #
+ # @return [Object]
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "foo" => [1, 2, 3],
+ # "bar" => [6, 7, 8],
+ # "ham" => ["a", "b", "c"]
+ # })
+ # df.columns = ["apple", "banana", "orange"]
+ # df
+ # # =>
+ # # shape: (3, 3)
+ # # ┌───────┬────────┬────────┐
+ # # │ apple ┆ banana ┆ orange │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ str │
+ # # ╞═══════╪════════╪════════╡
+ # # │ 1 ┆ 6 ┆ a │
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ 2 ┆ 7 ┆ b │
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ 3 ┆ 8 ┆ c │
+ # # └───────┴────────┴────────┘
+ def columns=(columns)
+ _df.set_column_names(columns)
+ end
+
+ # Get dtypes of columns in DataFrame. Dtypes can also be found in column headers when printing the DataFrame.
+ #
+ # @return [Array]
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "foo" => [1, 2, 3],
+ # "bar" => [6.0, 7.0, 8.0],
+ # "ham" => ["a", "b", "c"]
+ # })
+ # df.dtypes
+ # # => [:i64, :f64, :str]
  def dtypes
- _df.dtypes.map(&:to_sym)
+ _df.dtypes
+ end
+
+ # Get the schema.
+ #
+ # @return [Hash]
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "foo" => [1, 2, 3],
+ # "bar" => [6.0, 7.0, 8.0],
+ # "ham" => ["a", "b", "c"]
+ # })
+ # df.schema
+ # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
+ def schema
+ columns.zip(dtypes).to_h
  end
 
+ # def ==(other)
+ # end
+
+ # def !=(other)
+ # end
+
+ # def >(other)
+ # end
+
+ # def <(other)
+ # end
+
+ # def >=(other)
+ # end
+
+ # def <=(other)
+ # end
+
+ # def *(other)
+ # end
+
+ # def /(other)
+ # end
+
+ # def +(other)
+ # end
+
+ # def -(other)
+ # end
+
+ # def %(other)
+ # end
+
+ #
  def to_s
  _df.to_s
  end
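As the new `schema` method shows, the schema is literally `columns.zip(dtypes).to_h`, so the three accessors stay consistent by construction:

    df = Polars::DataFrame.new({"foo" => [1, 2], "bar" => ["a", "b"]})
    df.columns  # => ["foo", "bar"]
    df.dtypes   # => [:i64, :str]
    df.schema   # => {"foo"=>:i64, "bar"=>:str}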
@@ -92,10 +378,64 @@ module Polars
  columns.include?(name)
  end
 
+ # def each
+ # end
+
+ # def _pos_idx
+ # end
+
+ # def _pos_idxs
+ # end
+
+ #
  def [](name)
  Utils.wrap_s(_df.column(name))
  end
 
+ # def []=(key, value)
+ # end
+
+ # no to_arrow
+
+ #
+ def to_h(as_series: true)
+ if as_series
+ get_columns.to_h { |s| [s.name, s] }
+ else
+ get_columns.to_h { |s| [s.name, s.to_a] }
+ end
+ end
+
+ # def to_hashes / to_a
+ # end
+
+ # def to_numo
+ # end
+
+ # no to_pandas
+
+ # Select column as Series at index location.
+ #
+ # @param index [Integer]
+ # Location of selection.
+ #
+ # @return [Series]
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "foo" => [1, 2, 3],
+ # "bar" => [6, 7, 8],
+ # "ham" => ["a", "b", "c"]
+ # })
+ # df.to_series(1)
+ # # =>
+ # # shape: (3,)
+ # # Series: 'bar' [i64]
+ # # [
+ # # 6
+ # # 7
+ # # 8
+ # # ]
  def to_series(index = 0)
  if index < 0
  index = columns.length + index
@@ -103,6 +443,18 @@ module Polars
  Utils.wrap_s(_df.select_at_idx(index))
  end
 
+ # Serialize to JSON representation.
+ #
+ # @param file [String]
+ # File path to which the result should be written.
+ # @param pretty [Boolean]
+ # Pretty serialize JSON.
+ # @param row_oriented [Boolean]
+ # Write row-oriented JSON. This is slower, but more common.
+ #
+ # @return [nil]
+ #
+ # @see #write_ndjson
  def write_json(
  file,
  pretty: false,
@@ -116,6 +468,12 @@ module Polars
  nil
  end
 
+ # Serialize to newline delimited JSON representation.
+ #
+ # @param file [String]
+ # File path to which the result should be written.
+ #
+ # @return [nil]
  def write_ndjson(file)
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
  file = Utils.format_path(file)
@@ -125,6 +483,48 @@ module Polars
  nil
  end
 
+ # Write to comma-separated values (CSV) file.
+ #
+ # @param file [String, nil]
+ # File path to which the result should be written. If set to `nil`
+ # (default), the output is returned as a string instead.
+ # @param has_header [Boolean]
+ # Whether to include header in the CSV output.
+ # @param sep [String]
+ # Separate CSV fields with this symbol.
+ # @param quote [String]
+ # Byte to use as quoting character.
+ # @param batch_size [Integer]
+ # Number of rows that will be processed per thread.
+ # @param datetime_format [String, nil]
+ # A format string, with the specifiers defined by the
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+ # Rust crate. If no format is specified, the default fractional-second
+ # precision is inferred from the maximum timeunit found in the frame's
+ # Datetime cols (if any).
+ # @param date_format [String, nil]
+ # A format string, with the specifiers defined by the
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+ # Rust crate.
+ # @param time_format [String, nil]
+ # A format string, with the specifiers defined by the
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+ # Rust crate.
+ # @param float_precision [Integer, nil]
+ # Number of decimal places to write, applied to both `:f32` and
+ # `:f64` datatypes.
+ # @param null_value [String, nil]
+ # A string representing null values (defaulting to the empty string).
+ #
+ # @return [String, nil]
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "foo" => [1, 2, 3, 4, 5],
+ # "bar" => [6, 7, 8, 9, 10],
+ # "ham" => ["a", "b", "c", "d", "e"]
+ # })
+ # df.write_csv("file.csv")
  def write_csv(
  file = nil,
  has_header: true,
@@ -160,8 +560,7 @@ module Polars
  float_precision,
  null_value
  )
- buffer.rewind
- return buffer.read.force_encoding(Encoding::UTF_8)
+ return buffer.string.force_encoding(Encoding::UTF_8)
  end
 
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
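The `buffer.string` change above also drops the explicit rewind: `StringIO#string` returns the whole underlying string regardless of the read position, whereas `#read` only returns data from the current offset. A quick illustration:

    require "stringio"

    io = StringIO.new
    io << "foo,bar\n1,2\n"
    io.read    # => "" (position sits at the end after writing)
    io.string  # => "foo,bar\n1,2\n" (position-independent)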
@@ -183,6 +582,53 @@ module Polars
  nil
  end
 
+ # def write_avro
+ # end
+
+ # Write to Arrow IPC binary stream or Feather file.
+ #
+ # @param file [String]
+ # File path to which the file should be written.
+ # @param compression ["uncompressed", "lz4", "zstd"]
+ # Compression method. Defaults to "uncompressed".
+ #
+ # @return [nil]
+ def write_ipc(file, compression: "uncompressed")
+ if compression.nil?
+ compression = "uncompressed"
+ end
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+ file = Utils.format_path(file)
+ end
+
+ _df.write_ipc(file, compression)
+ end
+
+ # Write to Apache Parquet file.
+ #
+ # @param file [String]
+ # File path to which the file should be written.
+ # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
+ # Choose "zstd" for good compression performance.
+ # Choose "lz4" for fast compression/decompression.
+ # Choose "snappy" for more backwards compatibility guarantees
+ # when you deal with older parquet readers.
+ # @param compression_level [Integer, nil]
+ # The level of compression to use. Higher compression means smaller files on
+ # disk.
+ #
+ # - "gzip" : min-level: 0, max-level: 10.
+ # - "brotli" : min-level: 0, max-level: 11.
+ # - "zstd" : min-level: 1, max-level: 22.
+ # @param statistics [Boolean]
+ # Write statistics to the parquet headers. This requires extra compute.
+ # @param row_group_size [Integer, nil]
+ # Size of the row groups in number of rows.
+ # If `nil` (default), the chunks of the DataFrame are
+ # used. Writing in smaller chunks may reduce memory pressure and improve
+ # writing speeds.
+ #
+ # @return [nil]
  def write_parquet(
  file,
  compression: "zstd",
@@ -202,10 +648,177 @@ module Polars
  )
  end
 
+ # Return an estimation of the total (heap) allocated size of the DataFrame.
+ #
+ # Estimated size is given in the specified unit (bytes by default).
+ #
+ # This estimation is the sum of the size of its buffers, validity, including
+ # nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
+ # size of 2 arrays is not the sum of the sizes computed from this function. In
+ # particular, StructArray's size is an upper bound.
+ #
+ # When an array is sliced, its allocated size remains constant because the buffer
+ # is unchanged. However, this function will yield a smaller number. This is because
+ # this function returns the visible size of the buffer, not its total capacity.
+ #
+ # FFI buffers are included in this estimation.
+ #
+ # @param unit ["b", "kb", "mb", "gb", "tb"]
+ # Scale the returned size to the given unit.
+ #
+ # @return [Numeric]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "x" => 1_000_000.times.to_a.reverse,
+ # "y" => 1_000_000.times.map { |v| v / 1000.0 },
+ # "z" => 1_000_000.times.map(&:to_s)
+ # },
+ # columns: {"x" => :u32, "y" => :f64, "z" => :str}
+ # )
+ # df.estimated_size
+ # # => 25888898
+ # df.estimated_size("mb")
+ # # => 24.689577102661133
+ def estimated_size(unit = "b")
+ sz = _df.estimated_size
+ Utils.scale_bytes(sz, to: unit)
+ end
+
+ # def transpose
+ # end
+
+ # Reverse the DataFrame.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "key" => ["a", "b", "c"],
+ # "val" => [1, 2, 3]
+ # })
+ # df.reverse
+ # # =>
+ # # shape: (3, 2)
+ # # ┌─────┬─────┐
+ # # │ key ┆ val │
+ # # │ --- ┆ --- │
+ # # │ str ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ c ┆ 3 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ b ┆ 2 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ a ┆ 1 │
+ # # └─────┴─────┘
+ def reverse
+ select(Polars.col("*").reverse)
+ end
+
+ # Rename column names.
+ #
+ # @param mapping [Hash]
+ # Key value pairs that map from old name to new name.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "foo" => [1, 2, 3],
+ # "bar" => [6, 7, 8],
+ # "ham" => ["a", "b", "c"]
+ # })
+ # df.rename({"foo" => "apple"})
+ # # =>
+ # # shape: (3, 3)
+ # # ┌───────┬─────┬─────┐
+ # # │ apple ┆ bar ┆ ham │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ str │
+ # # ╞═══════╪═════╪═════╡
+ # # │ 1 ┆ 6 ┆ a │
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2 ┆ 7 ┆ b │
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 3 ┆ 8 ┆ c │
+ # # └───────┴─────┴─────┘
+ def rename(mapping)
+ lazy.rename(mapping).collect(no_optimization: true)
+ end
+
+ # Insert a Series at a certain column index. This operation is in place.
+ #
+ # @param index [Integer]
+ # Index at which to insert the new `Series` column.
+ # @param series [Series]
+ # `Series` to insert.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
+ # s = Polars::Series.new("baz", [97, 98, 99])
+ # df.insert_at_idx(1, s)
+ # # =>
+ # # shape: (3, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ baz ┆ bar │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ i64 │
+ # # ╞═════╪═════╪═════╡
+ # # │ 1 ┆ 97 ┆ 4 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2 ┆ 98 ┆ 5 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 3 ┆ 99 ┆ 6 │
+ # # └─────┴─────┴─────┘
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "a" => [1, 2, 3, 4],
+ # "b" => [0.5, 4, 10, 13],
+ # "c" => [true, true, false, true]
+ # })
+ # s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
+ # df.insert_at_idx(3, s)
+ # # =>
+ # # shape: (4, 4)
+ # # ┌─────┬──────┬───────┬──────┐
+ # # │ a ┆ b ┆ c ┆ d │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 │
+ # # ╞═════╪══════╪═══════╪══════╡
+ # # │ 1 ┆ 0.5 ┆ true ┆ -2.5 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 2 ┆ 4.0 ┆ true ┆ 15.0 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
+ # # └─────┴──────┴───────┴──────┘
+ def insert_at_idx(index, series)
+ if index < 0
+ index = columns.length + index
+ end
+ _df.insert_at_idx(index, series._s)
+ self
+ end
+
  def filter(predicate)
  lazy.filter(predicate).collect
  end
 
+ # def describe
+ # end
+
+ # def find_idx_by_name
+ # end
+
+ # def replace_at_idx
+ # end
+
+ #
  def sort(by, reverse: false, nulls_last: false)
  _from_rbdf(_df.sort(by, reverse, nulls_last))
  end
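Note that `insert_at_idx` normalizes negative indices against the current width, mirroring `to_series` above; a small sketch beyond the documented examples:

    df = Polars::DataFrame.new({"foo" => [1, 2], "bar" => [3, 4]})
    s = Polars::Series.new("baz", [5, 6])
    df.insert_at_idx(-1, s)  # -1 resolves to 2 + -1 = 1
    df.columns               # => ["foo", "baz", "bar"]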
@@ -214,6 +827,17 @@ module Polars
  _df.frame_equal(other._df, null_equal)
  end
 
+ # def replace
+ # end
+
+ #
+ def slice(offset, length = nil)
+ if !length.nil? && length < 0
+ length = height - offset + length
+ end
+ _from_rbdf(_df.slice(offset, length))
+ end
+
  def limit(n = 5)
  head(n)
  end
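The new `slice` treats a negative `length` as "stop that many rows before the end" (`height - offset + length`); a hedged sketch:

    df = Polars::DataFrame.new({"a" => [1, 2, 3, 4, 5]})
    df.slice(1, -1).height  # => 3 (5 - 1 + -1), i.e. the rows 2, 3, 4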
@@ -226,10 +850,33 @@ module Polars
  _from_rbdf(_df.tail(n))
  end
 
+ # def drop_nulls
+ # end
+
+ # def pipe
+ # end
+
+ # def with_row_count
+ # end
+
+ #
  def groupby(by, maintain_order: false)
  lazy.groupby(by, maintain_order: maintain_order)
  end
 
+ # def groupby_rolling
+ # end
+
+ # def groupby_dynamic
+ # end
+
+ # def upsample
+ # end
+
+ # def join_asof
+ # end
+
+ #
  def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
  lazy
  .join(
@@ -243,12 +890,86 @@ module Polars
  .collect(no_optimization: true)
  end
 
+ # def apply
+ # end
+
+ #
  def with_column(column)
  lazy
  .with_column(column)
  .collect(no_optimization: true, string_cache: false)
  end
 
+ # def hstack
+ # end
+
+ # def vstack
+ # end
+
+ #
+ def extend(other)
+ _df.extend(other._df)
+ self
+ end
+
+ # def drop
+ # end
+
+ # def drop_in_place
+ # end
+
+ # def cleared
+ # end
+
+ # clone handled by initialize_copy
+
+ #
+ def get_columns
+ _df.get_columns.map { |s| Utils.wrap_s(s) }
+ end
+
+ def get_column(name)
+ self[name]
+ end
+
+ # def fill_null
+ # end
+
+ #
+ def fill_nan(fill_value)
+ lazy.fill_nan(fill_value).collect(no_optimization: true)
+ end
+
+ # def explode
+ # end
+
+ # def pivot
+ # end
+
+ # def melt
+ # end
+
+ # def unstack
+ # end
+
+ # def partition_by
+ # end
+
+ # def shift
+ # end
+
+ # def shift_and_fill
+ # end
+
+ #
+ def is_duplicated
+ Utils.wrap_s(_df.is_duplicated)
+ end
+
+ def is_unique
+ Utils.wrap_s(_df.is_unique)
+ end
+
  def lazy
  wrap_ldf(_df.lazy)
  end
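`extend` appends the rows of `other` to `self` in place and returns `self`, unlike the lazy, frame-returning methods around it; a minimal sketch:

    df1 = Polars::DataFrame.new({"a" => [1, 2]})
    df2 = Polars::DataFrame.new({"a" => [3, 4]})
    df1.extend(df2)  # mutates df1 via the native _df.extend
    df1.height       # => 4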
@@ -262,6 +983,56 @@ module Polars
  )
  end
 
+ def with_columns(exprs)
+ if !exprs.nil? && !exprs.is_a?(Array)
+ exprs = [exprs]
+ end
+ lazy
+ .with_columns(exprs)
+ .collect(no_optimization: true, string_cache: false)
+ end
+
+ def n_chunks(strategy: "first")
+ if strategy == "first"
+ _df.n_chunks
+ elsif strategy == "all"
+ get_columns.map(&:n_chunks)
+ else
+ raise ArgumentError, "Strategy: '#{strategy}' not understood. Choose one of {'first', 'all'}"
+ end
+ end
+
+ def max(axis: 0)
+ if axis == 0
+ _from_rbdf(_df.max)
+ elsif axis == 1
+ Utils.wrap_s(_df.hmax)
+ else
+ raise ArgumentError, "Axis should be 0 or 1."
+ end
+ end
+
+ def min(axis: 0)
+ if axis == 0
+ _from_rbdf(_df.min)
+ elsif axis == 1
+ Utils.wrap_s(_df.hmin)
+ else
+ raise ArgumentError, "Axis should be 0 or 1."
+ end
+ end
+
+ def sum(axis: 0, null_strategy: "ignore")
+ case axis
+ when 0
+ _from_rbdf(_df.sum)
+ when 1
+ Utils.wrap_s(_df.hsum(null_strategy))
+ else
+ raise ArgumentError, "Axis should be 0 or 1."
+ end
+ end
+
  def mean(axis: 0, null_strategy: "ignore")
  case axis
  when 0
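`max`, `min`, `sum` (and `mean` below) share the same axis convention: `axis: 0` reduces each column into a one-row DataFrame, while `axis: 1` reduces each row into a Series via the native `h*` methods. For example:

    df = Polars::DataFrame.new({"a" => [1, 2], "b" => [3, 4]})
    df.sum           # 1x2 frame: a => 3, b => 7
    df.sum(axis: 1)  # Series of row sums: [4, 6]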
@@ -273,15 +1044,34 @@ module Polars
  end
  end
 
- def with_columns(exprs)
- if !exprs.nil? && !exprs.is_a?(Array)
- exprs = [exprs]
- end
- lazy
- .with_columns(exprs)
- .collect(no_optimization: true, string_cache: false)
+ def std(ddof: 1)
+ _from_rbdf(_df.std(ddof))
+ end
+
+ def var(ddof: 1)
+ _from_rbdf(_df.var(ddof))
  end
 
+ def median
+ _from_rbdf(_df.median)
+ end
+
+ # def product
+ # end
+
+ # def quantile(quantile, interpolation: "nearest")
+ # end
+
+ # def to_dummies
+ # end
+
+ # def unique
+ # end
+
+ # def n_unique
+ # end
+
+ #
  def rechunk
  _from_rbdf(_df.rechunk)
  end
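`std` and `var` default to `ddof: 1` (sample statistics, dividing by n - 1); pass `ddof: 0` for population statistics. A small sketch:

    df = Polars::DataFrame.new({"a" => [1.0, 2.0, 3.0]})
    df.std           # sample standard deviation => 1.0
    df.var(ddof: 0)  # population variance => 0.666...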
@@ -290,17 +1080,98 @@ module Polars
  _from_rbdf(_df.null_count)
  end
 
+ # def sample
+ # end
+
+ # def fold
+ # end
+
+ # def row
+ # end
+
+ # def rows
+ # end
+
+ # def shrink_to_fit
+ # end
+
+ # def take_every
+ # end
+
+ # def hash_rows
+ # end
+
+ # def interpolate
+ # end
+
+ #
+ def is_empty
+ height == 0
+ end
+ alias_method :empty?, :is_empty
+
+ # def to_struct(name)
+ # end
+
+ # def unnest
+ # end
+
  private
 
- def hash_to_rbdf(data)
+ def initialize_copy(other)
+ super
+ self._df = _df._clone
+ end
+
+ def hash_to_rbdf(data, columns: nil)
+ if !columns.nil?
+ columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
+
+ if data.empty? && dtypes
+ data_series = columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
+ else
+ data_series = data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
+ end
+ data_series = _handle_columns_arg(data_series, columns: columns)
+ return RbDataFrame.new(data_series)
+ end
+
  RbDataFrame.read_hash(data)
  end
 
- def sequence_to_rbdf(data)
+ def _unpack_columns(columns, lookup_names: nil)
+ [columns.keys, columns]
+ end
+
+ def _handle_columns_arg(data, columns: nil)
+ if columns.nil?
+ data
+ else
+ if data.empty?
+ columns.map { |c| Series.new(c, nil)._s }
+ elsif data.length == columns.length
+ columns.each_with_index do |c, i|
+ # not in-place?
+ data[i].rename(c)
+ end
+ data
+ else
+ raise ArgumentError, "Dimensions of columns arg must match data dimensions."
+ end
+ end
+ end
+
+ def sequence_to_rbdf(data, columns: nil, orient: nil)
+ if columns || orient
+ raise Todo
+ end
  RbDataFrame.new(data.map(&:_s))
  end
 
- def series_to_rbdf(data)
+ def series_to_rbdf(data, columns: nil)
+ if columns
+ raise Todo
+ end
  RbDataFrame.new([data._s])
  end
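Finally, the private `initialize_copy` override means `dup` and `clone` now deep-copy the underlying Rust frame instead of sharing it; a hedged sketch:

    df = Polars::DataFrame.new({"a" => [1, 2]})
    df2 = df.dup          # initialize_copy clones the native RbDataFrame
    df2.equal?(df)        # => false (distinct objects)
    df2.frame_equal(df)   # => true (same data)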