polars-df 0.8.0-x86_64-linux → 0.10.0-x86_64-linux

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +42 -1
  3. data/Cargo.lock +159 -66
  4. data/Cargo.toml +0 -3
  5. data/LICENSE-THIRD-PARTY.txt +3112 -1613
  6. data/LICENSE.txt +1 -1
  7. data/README.md +3 -2
  8. data/lib/polars/3.1/polars.so +0 -0
  9. data/lib/polars/3.2/polars.so +0 -0
  10. data/lib/polars/3.3/polars.so +0 -0
  11. data/lib/polars/array_expr.rb +453 -0
  12. data/lib/polars/array_name_space.rb +346 -0
  13. data/lib/polars/batched_csv_reader.rb +4 -2
  14. data/lib/polars/cat_expr.rb +24 -0
  15. data/lib/polars/cat_name_space.rb +75 -0
  16. data/lib/polars/config.rb +2 -2
  17. data/lib/polars/data_frame.rb +306 -96
  18. data/lib/polars/data_types.rb +191 -28
  19. data/lib/polars/date_time_expr.rb +41 -18
  20. data/lib/polars/date_time_name_space.rb +9 -3
  21. data/lib/polars/exceptions.rb +12 -1
  22. data/lib/polars/expr.rb +898 -215
  23. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  24. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  25. data/lib/polars/functions/as_datatype.rb +248 -0
  26. data/lib/polars/functions/col.rb +47 -0
  27. data/lib/polars/functions/eager.rb +182 -0
  28. data/lib/polars/functions/lazy.rb +1280 -0
  29. data/lib/polars/functions/len.rb +49 -0
  30. data/lib/polars/functions/lit.rb +35 -0
  31. data/lib/polars/functions/random.rb +16 -0
  32. data/lib/polars/functions/range/date_range.rb +103 -0
  33. data/lib/polars/functions/range/int_range.rb +51 -0
  34. data/lib/polars/functions/repeat.rb +144 -0
  35. data/lib/polars/functions/whenthen.rb +96 -0
  36. data/lib/polars/functions.rb +29 -416
  37. data/lib/polars/group_by.rb +2 -2
  38. data/lib/polars/io.rb +36 -31
  39. data/lib/polars/lazy_frame.rb +405 -88
  40. data/lib/polars/list_expr.rb +158 -8
  41. data/lib/polars/list_name_space.rb +102 -0
  42. data/lib/polars/meta_expr.rb +175 -7
  43. data/lib/polars/series.rb +282 -41
  44. data/lib/polars/string_cache.rb +75 -0
  45. data/lib/polars/string_expr.rb +413 -96
  46. data/lib/polars/string_name_space.rb +4 -4
  47. data/lib/polars/testing.rb +507 -0
  48. data/lib/polars/utils.rb +106 -8
  49. data/lib/polars/version.rb +1 -1
  50. data/lib/polars/whenthen.rb +83 -0
  51. data/lib/polars.rb +16 -4
  52. metadata +34 -6
  53. data/lib/polars/lazy_functions.rb +0 -1181
  54. data/lib/polars/when.rb +0 -16
  55. data/lib/polars/when_then.rb +0 -19
@@ -47,8 +47,8 @@ module Polars
47
47
  end
48
48
 
49
49
  # @private
50
- def self._from_hashes(data, infer_schema_length: 100, schema: nil)
51
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema)
50
+ def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
51
+ rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
52
52
  _from_rbdf(rbdf)
53
53
  end
54
54
 
@@ -91,7 +91,8 @@ module Polars
91
91
  row_count_name: nil,
92
92
  row_count_offset: 0,
93
93
  sample_size: 1024,
94
- eol_char: "\n"
94
+ eol_char: "\n",
95
+ truncate_ragged_lines: false
95
96
  )
96
97
  if Utils.pathlike?(file)
97
98
  path = Utils.normalise_filepath(file)
@@ -147,7 +148,8 @@ module Polars
147
148
  skip_rows_after_header: skip_rows_after_header,
148
149
  row_count_name: row_count_name,
149
150
  row_count_offset: row_count_offset,
150
- eol_char: eol_char
151
+ eol_char: eol_char,
152
+ truncate_ragged_lines: truncate_ragged_lines
151
153
  )
152
154
  if columns.nil?
153
155
  return _from_rbdf(scan.collect._df)
@@ -186,7 +188,8 @@ module Polars
186
188
  skip_rows_after_header,
187
189
  Utils._prepare_row_count_args(row_count_name, row_count_offset),
188
190
  sample_size,
189
- eol_char
191
+ eol_char,
192
+ truncate_ragged_lines
190
193
  )
191
194
  )
192
195
  end
@@ -622,7 +625,7 @@ module Polars
622
625
  # select single column
623
626
  # df["foo"]
624
627
  if item.is_a?(::String) || item.is_a?(Symbol)
625
- return Utils.wrap_s(_df.column(item.to_s))
628
+ return Utils.wrap_s(_df.get_column(item.to_s))
626
629
  end
627
630
 
628
631
  # df[idx]
@@ -814,8 +817,6 @@ module Polars
814
817
 
815
818
  # Serialize to JSON representation.
816
819
  #
817
- # @return [nil]
818
- #
819
820
  # @param file [String]
820
821
  # File path to which the result should be written.
821
822
  # @param pretty [Boolean]
@@ -823,17 +824,45 @@ module Polars
823
824
  # @param row_oriented [Boolean]
824
825
  # Write to row oriented json. This is slower, but more common.
825
826
  #
826
- # @see #write_ndjson
827
+ # @return [String, nil]
828
+ #
829
+ # @example
830
+ # df = Polars::DataFrame.new(
831
+ # {
832
+ # "foo" => [1, 2, 3],
833
+ # "bar" => [6, 7, 8]
834
+ # }
835
+ # )
836
+ # df.write_json
837
+ # # => "{\"columns\":[{\"name\":\"foo\",\"datatype\":\"Int64\",\"bit_settings\":\"\",\"values\":[1,2,3]},{\"name\":\"bar\",\"datatype\":\"Int64\",\"bit_settings\":\"\",\"values\":[6,7,8]}]}"
838
+ #
839
+ # @example
840
+ # df.write_json(row_oriented: true)
841
+ # # => "[{\"foo\":1,\"bar\":6},{\"foo\":2,\"bar\":7},{\"foo\":3,\"bar\":8}]"
827
842
  def write_json(
828
- file,
843
+ file = nil,
829
844
  pretty: false,
830
845
  row_oriented: false
831
846
  )
832
847
  if Utils.pathlike?(file)
833
848
  file = Utils.normalise_filepath(file)
834
849
  end
835
-
836
- _df.write_json(file, pretty, row_oriented)
850
+ to_string_io = !file.nil? && file.is_a?(StringIO)
851
+ if file.nil? || to_string_io
852
+ buf = StringIO.new
853
+ buf.set_encoding(Encoding::BINARY)
854
+ _df.write_json(buf, pretty, row_oriented)
855
+ json_bytes = buf.string
856
+
857
+ json_str = json_bytes.force_encoding(Encoding::UTF_8)
858
+ if to_string_io
859
+ file.write(json_str)
860
+ else
861
+ return json_str
862
+ end
863
+ else
864
+ _df.write_json(file, pretty, row_oriented)
865
+ end
837
866
  nil
838
867
  end
839
868
 
@@ -843,12 +872,36 @@ module Polars
843
872
  # File path to which the result should be written.
844
873
  #
845
874
  # @return [nil]
846
- def write_ndjson(file)
875
+ #
876
+ # @example
877
+ # df = Polars::DataFrame.new(
878
+ # {
879
+ # "foo" => [1, 2, 3],
880
+ # "bar" => [6, 7, 8]
881
+ # }
882
+ # )
883
+ # df.write_ndjson()
884
+ # # => "{\"foo\":1,\"bar\":6}\n{\"foo\":2,\"bar\":7}\n{\"foo\":3,\"bar\":8}\n"
885
+ def write_ndjson(file = nil)
847
886
  if Utils.pathlike?(file)
848
887
  file = Utils.normalise_filepath(file)
849
888
  end
850
-
851
- _df.write_ndjson(file)
889
+ to_string_io = !file.nil? && file.is_a?(StringIO)
890
+ if file.nil? || to_string_io
891
+ buf = StringIO.new
892
+ buf.set_encoding(Encoding::BINARY)
893
+ _df.write_ndjson(buf)
894
+ json_bytes = buf.string
895
+
896
+ json_str = json_bytes.force_encoding(Encoding::UTF_8)
897
+ if to_string_io
898
+ file.write(json_str)
899
+ else
900
+ return json_str
901
+ end
902
+ else
903
+ _df.write_ndjson(file)
904
+ end
852
905
  nil
853
906
  end
854
907
 
@@ -1010,7 +1063,7 @@ module Polars
1010
1063
 
1011
1064
  # Write to Apache Parquet file.
1012
1065
  #
1013
- # @param file [String]
1066
+ # @param file [String, Pathname, StringIO]
1014
1067
  # File path to which the file should be written.
1015
1068
  # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
1016
1069
  # Choose "zstd" for good compression performance.
@@ -1027,10 +1080,9 @@ module Polars
1027
1080
  # @param statistics [Boolean]
1028
1081
  # Write statistics to the parquet headers. This requires extra compute.
1029
1082
  # @param row_group_size [Integer, nil]
1030
- # Size of the row groups in number of rows.
1031
- # If `nil` (default), the chunks of the DataFrame are
1032
- # used. Writing in smaller chunks may reduce memory pressure and improve
1033
- # writing speeds.
1083
+ # Size of the row groups in number of rows. Defaults to 512^2 rows.
1084
+ # @param data_page_size [Integer, nil]
1085
+ # Size of the data page in bytes. Defaults to 1024^2 bytes.
1034
1086
  #
1035
1087
  # @return [nil]
1036
1088
  def write_parquet(
@@ -1038,7 +1090,8 @@ module Polars
1038
1090
  compression: "zstd",
1039
1091
  compression_level: nil,
1040
1092
  statistics: false,
1041
- row_group_size: nil
1093
+ row_group_size: nil,
1094
+ data_page_size: nil
1042
1095
  )
1043
1096
  if compression.nil?
1044
1097
  compression = "uncompressed"
@@ -1048,7 +1101,7 @@ module Polars
1048
1101
  end
1049
1102
 
1050
1103
  _df.write_parquet(
1051
- file, compression, compression_level, statistics, row_group_size
1104
+ file, compression, compression_level, statistics, row_group_size, data_page_size
1052
1105
  )
1053
1106
  end
1054
1107
 
@@ -1084,7 +1137,7 @@ module Polars
1084
1137
  # df.estimated_size
1085
1138
  # # => 25888898
1086
1139
  # df.estimated_size("mb")
1087
- # # => 24.689577102661133
1140
+ # # => 17.0601749420166
1088
1141
  def estimated_size(unit = "b")
1089
1142
  sz = _df.estimated_size
1090
1143
  Utils.scale_bytes(sz, to: unit)
@@ -1782,7 +1835,7 @@ module Polars
1782
1835
  # "b" => [2, 4, 6]
1783
1836
  # }
1784
1837
  # )
1785
- # df.with_row_count
1838
+ # df.with_row_index
1786
1839
  # # =>
1787
1840
  # # shape: (3, 3)
1788
1841
  # # ┌────────┬─────┬─────┐
@@ -1794,9 +1847,10 @@ module Polars
1794
1847
  # # │ 1 ┆ 3 ┆ 4 │
1795
1848
  # # │ 2 ┆ 5 ┆ 6 │
1796
1849
  # # └────────┴─────┴─────┘
1797
- def with_row_count(name: "row_nr", offset: 0)
1798
- _from_rbdf(_df.with_row_count(name, offset))
1850
+ def with_row_index(name: "row_nr", offset: 0)
1851
+ _from_rbdf(_df.with_row_index(name, offset))
1799
1852
  end
1853
+ alias_method :with_row_count, :with_row_index
1800
1854
 
1801
1855
  # Start a group by operation.
1802
1856
  #
@@ -2160,12 +2214,13 @@ module Polars
2160
2214
  # closed: "right"
2161
2215
  # ).agg(Polars.col("A").alias("A_agg_list"))
2162
2216
  # # =>
2163
- # # shape: (3, 4)
2217
+ # # shape: (4, 4)
2164
2218
  # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
2165
2219
  # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
2166
2220
  # # │ --- ┆ --- ┆ --- ┆ --- │
2167
2221
  # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
2168
2222
  # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
2223
+ # # │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │
2169
2224
  # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
2170
2225
  # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
2171
2226
  # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
@@ -2433,6 +2488,8 @@ module Polars
2433
2488
  # Join strategy.
2434
2489
  # @param suffix [String]
2435
2490
  # Suffix to append to columns with a duplicate name.
2491
+ # @param join_nulls [Boolean]
2492
+ # Join on null values. By default null values will never produce matches.
2436
2493
  #
2437
2494
  # @return [DataFrame]
2438
2495
  #
@@ -2515,7 +2572,7 @@ module Polars
2515
2572
  # # ╞═════╪═════╪═════╡
2516
2573
  # # │ 3 ┆ 8.0 ┆ c │
2517
2574
  # # └─────┴─────┴─────┘
2518
- def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
2575
+ def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right", join_nulls: false)
2519
2576
  lazy
2520
2577
  .join(
2521
2578
  other.lazy,
@@ -2524,6 +2581,7 @@ module Polars
2524
2581
  on: on,
2525
2582
  how: how,
2526
2583
  suffix: suffix,
2584
+ join_nulls: join_nulls
2527
2585
  )
2528
2586
  .collect(no_optimization: true)
2529
2587
  end
@@ -2617,26 +2675,26 @@ module Polars
2617
2675
  # # ┌─────┬─────┬───────────┐
2618
2676
  # # │ a ┆ b ┆ b_squared │
2619
2677
  # # │ --- ┆ --- ┆ --- │
2620
- # # │ i64 ┆ i64 ┆ f64
2678
+ # # │ i64 ┆ i64 ┆ i64
2621
2679
  # # ╞═════╪═════╪═══════════╡
2622
- # # │ 1 ┆ 2 ┆ 4.0
2623
- # # │ 3 ┆ 4 ┆ 16.0
2624
- # # │ 5 ┆ 6 ┆ 36.0
2680
+ # # │ 1 ┆ 2 ┆ 4
2681
+ # # │ 3 ┆ 4 ┆ 16
2682
+ # # │ 5 ┆ 6 ┆ 36
2625
2683
  # # └─────┴─────┴───────────┘
2626
2684
  #
2627
2685
  # @example Replaced
2628
2686
  # df.with_column(Polars.col("a") ** 2)
2629
2687
  # # =>
2630
2688
  # # shape: (3, 2)
2631
- # # ┌──────┬─────┐
2632
- # # │ a ┆ b │
2633
- # # │ --- ┆ --- │
2634
- # # │ f64 ┆ i64 │
2635
- # # ╞══════╪═════╡
2636
- # # │ 1.0 ┆ 2 │
2637
- # # │ 9.0 ┆ 4 │
2638
- # # │ 25.0 ┆ 6 │
2639
- # # └──────┴─────┘
2689
+ # # ┌─────┬─────┐
2690
+ # # │ a ┆ b │
2691
+ # # │ --- ┆ --- │
2692
+ # # │ i64 ┆ i64 │
2693
+ # # ╞═════╪═════╡
2694
+ # # │ 1 ┆ 2 │
2695
+ # # │ 9 ┆ 4 │
2696
+ # # │ 25 ┆ 6 │
2697
+ # # └─────┴─────┘
2640
2698
  def with_column(column)
2641
2699
  lazy
2642
2700
  .with_column(column)
@@ -2803,16 +2861,36 @@ module Polars
2803
2861
  # # │ 2 ┆ 7.0 │
2804
2862
  # # │ 3 ┆ 8.0 │
2805
2863
  # # └─────┴─────┘
2806
- def drop(columns)
2807
- if columns.is_a?(::Array)
2808
- df = clone
2809
- columns.each do |n|
2810
- df._df.drop_in_place(n)
2811
- end
2812
- df
2813
- else
2814
- _from_rbdf(_df.drop(columns))
2815
- end
2864
+ #
2865
+ # @example Drop multiple columns by passing a list of column names.
2866
+ # df.drop(["bar", "ham"])
2867
+ # # =>
2868
+ # # shape: (3, 1)
2869
+ # # ┌─────┐
2870
+ # # │ foo │
2871
+ # # │ --- │
2872
+ # # │ i64 │
2873
+ # # ╞═════╡
2874
+ # # │ 1 │
2875
+ # # │ 2 │
2876
+ # # │ 3 │
2877
+ # # └─────┘
2878
+ #
2879
+ # @example Use positional arguments to drop multiple columns.
2880
+ # df.drop("foo", "ham")
2881
+ # # =>
2882
+ # # shape: (3, 1)
2883
+ # # ┌─────┐
2884
+ # # │ bar │
2885
+ # # │ --- │
2886
+ # # │ f64 │
2887
+ # # ╞═════╡
2888
+ # # │ 6.0 │
2889
+ # # │ 7.0 │
2890
+ # # │ 8.0 │
2891
+ # # └─────┘
2892
+ def drop(*columns)
2893
+ lazy.drop(*columns).collect(_eager: true)
2816
2894
  end
2817
2895
 
2818
2896
  # Drop in place.
@@ -2867,7 +2945,7 @@ module Polars
2867
2945
  # "c" => [true, true, false, nil]
2868
2946
  # }
2869
2947
  # )
2870
- # df.cleared
2948
+ # df.clear
2871
2949
  # # =>
2872
2950
  # # shape: (0, 3)
2873
2951
  # # ┌─────┬─────┬──────┐
@@ -2876,9 +2954,31 @@ module Polars
2876
2954
  # # │ i64 ┆ f64 ┆ bool │
2877
2955
  # # ╞═════╪═════╪══════╡
2878
2956
  # # └─────┴─────┴──────┘
2879
- def cleared
2880
- height > 0 ? head(0) : clone
2957
+ #
2958
+ # @example
2959
+ # df.clear(2)
2960
+ # # =>
2961
+ # # shape: (2, 3)
2962
+ # # ┌──────┬──────┬──────┐
2963
+ # # │ a ┆ b ┆ c │
2964
+ # # │ --- ┆ --- ┆ --- │
2965
+ # # │ i64 ┆ f64 ┆ bool │
2966
+ # # ╞══════╪══════╪══════╡
2967
+ # # │ null ┆ null ┆ null │
2968
+ # # │ null ┆ null ┆ null │
2969
+ # # └──────┴──────┴──────┘
2970
+ def clear(n = 0)
2971
+ if n == 0
2972
+ _from_rbdf(_df.clear)
2973
+ elsif n > 0 || len > 0
2974
+ self.class.new(
2975
+ schema.to_h { |nm, tp| [nm, Series.new(nm, [], dtype: tp).extend_constant(nil, n)] }
2976
+ )
2977
+ else
2978
+ clone
2979
+ end
2881
2980
  end
2981
+ alias_method :cleared, :clear
2882
2982
 
2883
2983
  # clone handled by initialize_copy
2884
2984
 
@@ -3141,8 +3241,11 @@ module Polars
3141
3241
  aggregate_expr = Polars.element.median._rbexpr
3142
3242
  when "last"
3143
3243
  aggregate_expr = Polars.element.last._rbexpr
3244
+ when "len"
3245
+ aggregate_expr = Polars.len._rbexpr
3144
3246
  when "count"
3145
- aggregate_expr = Polars.count._rbexpr
3247
+ warn "`aggregate_function: \"count\"` input for `pivot` is deprecated. Use `aggregate_function: \"len\"` instead."
3248
+ aggregate_expr = Polars.len._rbexpr
3146
3249
  else
3147
3250
  raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
3148
3251
  end
@@ -3154,9 +3257,9 @@ module Polars
3154
3257
 
3155
3258
  _from_rbdf(
3156
3259
  _df.pivot_expr(
3157
- values,
3158
3260
  index,
3159
3261
  columns,
3262
+ values,
3160
3263
  maintain_order,
3161
3264
  sort_columns,
3162
3265
  aggregate_expr,
@@ -3591,8 +3694,13 @@ module Polars
3591
3694
 
3592
3695
  # Select columns from this DataFrame.
3593
3696
  #
3594
- # @param exprs [Object]
3595
- # Column or columns to select.
3697
+ # @param exprs [Array]
3698
+ # Column(s) to select, specified as positional arguments.
3699
+ # Accepts expression input. Strings are parsed as column names,
3700
+ # other non-expression inputs are parsed as literals.
3701
+ # @param named_exprs [Hash]
3702
+ # Additional columns to select, specified as keyword arguments.
3703
+ # The columns will be renamed to the keyword used.
3596
3704
  #
3597
3705
  # @return [DataFrame]
3598
3706
  #
@@ -3672,23 +3780,25 @@ module Polars
3672
3780
  # # │ 0 │
3673
3781
  # # │ 10 │
3674
3782
  # # └─────────┘
3675
- def select(exprs)
3676
- _from_rbdf(
3677
- lazy
3678
- .select(exprs)
3679
- .collect(no_optimization: true, string_cache: false)
3680
- ._df
3681
- )
3783
+ def select(*exprs, **named_exprs)
3784
+ lazy.select(*exprs, **named_exprs).collect(_eager: true)
3682
3785
  end
3683
3786
 
3684
- # Add or overwrite multiple columns in a DataFrame.
3787
+ # Add columns to this DataFrame.
3788
+ #
3789
+ # Added columns will replace existing columns with the same name.
3685
3790
  #
3686
3791
  # @param exprs [Array]
3687
- # Array of Expressions that evaluate to columns.
3792
+ # Column(s) to add, specified as positional arguments.
3793
+ # Accepts expression input. Strings are parsed as column names, other
3794
+ # non-expression inputs are parsed as literals.
3795
+ # @param named_exprs [Hash]
3796
+ # Additional columns to add, specified as keyword arguments.
3797
+ # The columns will be renamed to the keyword used.
3688
3798
  #
3689
3799
  # @return [DataFrame]
3690
3800
  #
3691
- # @example
3801
+ # @example Pass an expression to add it as a new column.
3692
3802
  # df = Polars::DataFrame.new(
3693
3803
  # {
3694
3804
  # "a" => [1, 2, 3, 4],
@@ -3696,32 +3806,94 @@ module Polars
3696
3806
  # "c" => [true, true, false, true]
3697
3807
  # }
3698
3808
  # )
3809
+ # df.with_columns((Polars.col("a") ** 2).alias("a^2"))
3810
+ # # =>
3811
+ # # shape: (4, 4)
3812
+ # # ┌─────┬──────┬───────┬─────┐
3813
+ # # │ a ┆ b ┆ c ┆ a^2 │
3814
+ # # │ --- ┆ --- ┆ --- ┆ --- │
3815
+ # # │ i64 ┆ f64 ┆ bool ┆ i64 │
3816
+ # # ╞═════╪══════╪═══════╪═════╡
3817
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1 │
3818
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4 │
3819
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9 │
3820
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16 │
3821
+ # # └─────┴──────┴───────┴─────┘
3822
+ #
3823
+ # @example Added columns will replace existing columns with the same name.
3824
+ # df.with_columns(Polars.col("a").cast(Polars::Float64))
3825
+ # # =>
3826
+ # # shape: (4, 3)
3827
+ # # ┌─────┬──────┬───────┐
3828
+ # # │ a ┆ b ┆ c │
3829
+ # # │ --- ┆ --- ┆ --- │
3830
+ # # │ f64 ┆ f64 ┆ bool │
3831
+ # # ╞═════╪══════╪═══════╡
3832
+ # # │ 1.0 ┆ 0.5 ┆ true │
3833
+ # # │ 2.0 ┆ 4.0 ┆ true │
3834
+ # # │ 3.0 ┆ 10.0 ┆ false │
3835
+ # # │ 4.0 ┆ 13.0 ┆ true │
3836
+ # # └─────┴──────┴───────┘
3837
+ #
3838
+ # @example Multiple columns can be added by passing a list of expressions.
3699
3839
  # df.with_columns(
3700
3840
  # [
3701
3841
  # (Polars.col("a") ** 2).alias("a^2"),
3702
3842
  # (Polars.col("b") / 2).alias("b/2"),
3703
- # (Polars.col("c").is_not).alias("not c")
3843
+ # (Polars.col("c").not_).alias("not c"),
3704
3844
  # ]
3705
3845
  # )
3706
3846
  # # =>
3707
3847
  # # shape: (4, 6)
3708
- # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
3709
- # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
3710
- # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3711
- # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
3712
- # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
3713
- # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
3714
- # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
3715
- # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
3716
- # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
3717
- # # └─────┴──────┴───────┴──────┴──────┴───────┘
3718
- def with_columns(exprs)
3719
- if !exprs.nil? && !exprs.is_a?(::Array)
3720
- exprs = [exprs]
3721
- end
3722
- lazy
3723
- .with_columns(exprs)
3724
- .collect(no_optimization: true, string_cache: false)
3848
+ # # ┌─────┬──────┬───────┬─────┬──────┬───────┐
3849
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
3850
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3851
+ # # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
3852
+ # # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
3853
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
3854
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
3855
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
3856
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
3857
+ # # └─────┴──────┴───────┴─────┴──────┴───────┘
3858
+ #
3859
+ # @example Multiple columns can also be added using positional arguments instead of a list.
3860
+ # df.with_columns(
3861
+ # (Polars.col("a") ** 2).alias("a^2"),
3862
+ # (Polars.col("b") / 2).alias("b/2"),
3863
+ # (Polars.col("c").not_).alias("not c"),
3864
+ # )
3865
+ # # =>
3866
+ # # shape: (4, 6)
3867
+ # # ┌─────┬──────┬───────┬─────┬──────┬───────┐
3868
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
3869
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3870
+ # # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
3871
+ # # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
3872
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
3873
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
3874
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
3875
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
3876
+ # # └─────┴──────┴───────┴─────┴──────┴───────┘
3877
+ #
3878
+ # @example Use keyword arguments to easily name your expression inputs.
3879
+ # df.with_columns(
3880
+ # ab: Polars.col("a") * Polars.col("b"),
3881
+ # not_c: Polars.col("c").not_
3882
+ # )
3883
+ # # =>
3884
+ # # shape: (4, 5)
3885
+ # # ┌─────┬──────┬───────┬──────┬───────┐
3886
+ # # │ a ┆ b ┆ c ┆ ab ┆ not_c │
3887
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3888
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │
3889
+ # # ╞═════╪══════╪═══════╪══════╪═══════╡
3890
+ # # │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │
3891
+ # # │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │
3892
+ # # │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │
3893
+ # # │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │
3894
+ # # └─────┴──────┴───────┴──────┴───────┘
3895
+ def with_columns(*exprs, **named_exprs)
3896
+ lazy.with_columns(*exprs, **named_exprs).collect(_eager: true)
3725
3897
  end
3726
3898
 
3727
3899
  # Get number of chunks used by the ChunkedArrays of this DataFrame.
@@ -4363,7 +4535,7 @@ module Polars
4363
4535
  # # null
4364
4536
  # # ]
4365
4537
  #
4366
- # @example A horizontal boolean or, similar to a row-wise .any():
4538
+ # @example A horizontal boolean or, similar to a row-wise .any:
4367
4539
  # df = Polars::DataFrame.new(
4368
4540
  # {
4369
4541
  # "a" => [false, false, true],
@@ -4486,7 +4658,7 @@ module Polars
4486
4658
  # # => [{"a"=>1, "b"=>2}, {"a"=>3, "b"=>4}, {"a"=>5, "b"=>6}]
4487
4659
  def rows(named: false)
4488
4660
  if named
4489
- columns = columns()
4661
+ columns = self.columns
4490
4662
  _df.row_tuples.map do |v|
4491
4663
  columns.zip(v).to_h
4492
4664
  end
@@ -4527,7 +4699,7 @@ module Polars
4527
4699
  return to_enum(:iter_rows, named: named, buffer_size: buffer_size) unless block_given?
4528
4700
 
4529
4701
  # load into the local namespace for a modest performance boost in the hot loops
4530
- columns = columns()
4702
+ columns = self.columns
4531
4703
 
4532
4704
  # note: buffering rows results in a 2-4x speedup over individual calls
4533
4705
  # to ".row(i)", so it should only be disabled in extremely specific cases.
@@ -4764,13 +4936,51 @@ module Polars
4764
4936
  _from_rbdf(_df.unnest(names))
4765
4937
  end
4766
4938
 
4767
- # TODO
4939
+ # Requires NumPy
4768
4940
  # def corr
4769
4941
  # end
4770
4942
 
4771
- # TODO
4772
- # def merge_sorted
4773
- # end
4943
+ # Take two sorted DataFrames and merge them by the sorted key.
4944
+ #
4945
+ # The output of this operation will also be sorted.
4946
+ # It is the caller's responsibility to ensure that the frames are sorted
4947
+ # by that key; otherwise the output will not make sense.
4948
+ #
4949
+ # The schemas of both DataFrames must be equal.
4950
+ #
4951
+ # @param other [DataFrame]
4952
+ # Other DataFrame that must be merged.
4953
+ # @param key [String]
4954
+ # Key that is sorted.
4955
+ #
4956
+ # @return [DataFrame]
4957
+ #
4958
+ # @example
4959
+ # df0 = Polars::DataFrame.new(
4960
+ # {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
4961
+ # ).sort("age")
4962
+ # df1 = Polars::DataFrame.new(
4963
+ # {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
4964
+ # ).sort("age")
4965
+ # df0.merge_sorted(df1, "age")
4966
+ # # =>
4967
+ # # shape: (7, 2)
4968
+ # # ┌────────┬─────┐
4969
+ # # │ name ┆ age │
4970
+ # # │ --- ┆ --- │
4971
+ # # │ str ┆ i64 │
4972
+ # # ╞════════╪═════╡
4973
+ # # │ bob ┆ 18 │
4974
+ # # │ thomas ┆ 20 │
4975
+ # # │ anna ┆ 21 │
4976
+ # # │ megan ┆ 33 │
4977
+ # # │ steve ┆ 42 │
4978
+ # # │ steve ┆ 42 │
4979
+ # # │ elise ┆ 44 │
4980
+ # # └────────┴─────┘
4981
+ def merge_sorted(other, key)
4982
+ lazy.merge_sorted(other.lazy, key).collect(_eager: true)
4983
+ end
4774
4984
 
4775
4985
  # Indicate that one or multiple columns are sorted.
4776
4986
  #
@@ -4812,7 +5022,7 @@ module Polars
4812
5022
  end
4813
5023
 
4814
5024
  def _pos_idxs(idxs, dim)
4815
- idx_type = Polars._get_idx_type
5025
+ idx_type = Plr.get_index_type
4816
5026
 
4817
5027
  if idxs.is_a?(Series)
4818
5028
  if idxs.dtype == idx_type
@@ -5045,14 +5255,14 @@ module Polars
5045
5255
  elsif data[0].is_a?(Hash)
5046
5256
  column_names, dtypes = _unpack_schema(columns)
5047
5257
  schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
5048
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
5258
+ rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
5049
5259
  if column_names
5050
5260
  rbdf = _post_apply_columns(rbdf, column_names)
5051
5261
  end
5052
5262
  return rbdf
5053
5263
  elsif data[0].is_a?(::Array)
5264
+ first_element = data[0]
5054
5265
  if orient.nil? && !columns.nil?
5055
- first_element = data[0]
5056
5266
  row_types = first_element.filter_map { |value| value.class }.uniq
5057
5267
  if row_types.include?(Integer) && row_types.include?(Float)
5058
5268
  row_types.delete(Integer)