polars-df 0.8.0-aarch64-linux → 0.10.0-aarch64-linux

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +42 -1
  3. data/Cargo.lock +159 -66
  4. data/Cargo.toml +0 -3
  5. data/LICENSE-THIRD-PARTY.txt +3112 -1613
  6. data/LICENSE.txt +1 -1
  7. data/README.md +3 -2
  8. data/lib/polars/3.1/polars.so +0 -0
  9. data/lib/polars/3.2/polars.so +0 -0
  10. data/lib/polars/3.3/polars.so +0 -0
  11. data/lib/polars/array_expr.rb +453 -0
  12. data/lib/polars/array_name_space.rb +346 -0
  13. data/lib/polars/batched_csv_reader.rb +4 -2
  14. data/lib/polars/cat_expr.rb +24 -0
  15. data/lib/polars/cat_name_space.rb +75 -0
  16. data/lib/polars/config.rb +2 -2
  17. data/lib/polars/data_frame.rb +306 -96
  18. data/lib/polars/data_types.rb +191 -28
  19. data/lib/polars/date_time_expr.rb +41 -18
  20. data/lib/polars/date_time_name_space.rb +9 -3
  21. data/lib/polars/exceptions.rb +12 -1
  22. data/lib/polars/expr.rb +898 -215
  23. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  24. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  25. data/lib/polars/functions/as_datatype.rb +248 -0
  26. data/lib/polars/functions/col.rb +47 -0
  27. data/lib/polars/functions/eager.rb +182 -0
  28. data/lib/polars/functions/lazy.rb +1280 -0
  29. data/lib/polars/functions/len.rb +49 -0
  30. data/lib/polars/functions/lit.rb +35 -0
  31. data/lib/polars/functions/random.rb +16 -0
  32. data/lib/polars/functions/range/date_range.rb +103 -0
  33. data/lib/polars/functions/range/int_range.rb +51 -0
  34. data/lib/polars/functions/repeat.rb +144 -0
  35. data/lib/polars/functions/whenthen.rb +96 -0
  36. data/lib/polars/functions.rb +29 -416
  37. data/lib/polars/group_by.rb +2 -2
  38. data/lib/polars/io.rb +36 -31
  39. data/lib/polars/lazy_frame.rb +405 -88
  40. data/lib/polars/list_expr.rb +158 -8
  41. data/lib/polars/list_name_space.rb +102 -0
  42. data/lib/polars/meta_expr.rb +175 -7
  43. data/lib/polars/series.rb +282 -41
  44. data/lib/polars/string_cache.rb +75 -0
  45. data/lib/polars/string_expr.rb +413 -96
  46. data/lib/polars/string_name_space.rb +4 -4
  47. data/lib/polars/testing.rb +507 -0
  48. data/lib/polars/utils.rb +106 -8
  49. data/lib/polars/version.rb +1 -1
  50. data/lib/polars/whenthen.rb +83 -0
  51. data/lib/polars.rb +16 -4
  52. metadata +34 -6
  53. data/lib/polars/lazy_functions.rb +0 -1181
  54. data/lib/polars/when.rb +0 -16
  55. data/lib/polars/when_then.rb +0 -19
@@ -47,8 +47,8 @@ module Polars
47
47
  end
48
48
 
49
49
  # @private
50
- def self._from_hashes(data, infer_schema_length: 100, schema: nil)
51
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema)
50
+ def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
51
+ rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
52
52
  _from_rbdf(rbdf)
53
53
  end
54
54
 
@@ -91,7 +91,8 @@ module Polars
91
91
  row_count_name: nil,
92
92
  row_count_offset: 0,
93
93
  sample_size: 1024,
94
- eol_char: "\n"
94
+ eol_char: "\n",
95
+ truncate_ragged_lines: false
95
96
  )
96
97
  if Utils.pathlike?(file)
97
98
  path = Utils.normalise_filepath(file)
@@ -147,7 +148,8 @@ module Polars
147
148
  skip_rows_after_header: skip_rows_after_header,
148
149
  row_count_name: row_count_name,
149
150
  row_count_offset: row_count_offset,
150
- eol_char: eol_char
151
+ eol_char: eol_char,
152
+ truncate_ragged_lines: truncate_ragged_lines
151
153
  )
152
154
  if columns.nil?
153
155
  return _from_rbdf(scan.collect._df)
@@ -186,7 +188,8 @@ module Polars
186
188
  skip_rows_after_header,
187
189
  Utils._prepare_row_count_args(row_count_name, row_count_offset),
188
190
  sample_size,
189
- eol_char
191
+ eol_char,
192
+ truncate_ragged_lines
190
193
  )
191
194
  )
192
195
  end
@@ -622,7 +625,7 @@ module Polars
622
625
  # select single column
623
626
  # df["foo"]
624
627
  if item.is_a?(::String) || item.is_a?(Symbol)
625
- return Utils.wrap_s(_df.column(item.to_s))
628
+ return Utils.wrap_s(_df.get_column(item.to_s))
626
629
  end
627
630
 
628
631
  # df[idx]
@@ -814,8 +817,6 @@ module Polars
814
817
 
815
818
  # Serialize to JSON representation.
816
819
  #
817
- # @return [nil]
818
- #
819
820
  # @param file [String]
820
821
  # File path to which the result should be written.
821
822
  # @param pretty [Boolean]
@@ -823,17 +824,45 @@ module Polars
823
824
  # @param row_oriented [Boolean]
824
825
  # Write to row oriented json. This is slower, but more common.
825
826
  #
826
- # @see #write_ndjson
827
+ # @return [nil]
828
+ #
829
+ # @example
830
+ # df = Polars::DataFrame.new(
831
+ # {
832
+ # "foo" => [1, 2, 3],
833
+ # "bar" => [6, 7, 8]
834
+ # }
835
+ # )
836
+ # df.write_json
837
+ # # => "{\"columns\":[{\"name\":\"foo\",\"datatype\":\"Int64\",\"bit_settings\":\"\",\"values\":[1,2,3]},{\"name\":\"bar\",\"datatype\":\"Int64\",\"bit_settings\":\"\",\"values\":[6,7,8]}]}"
838
+ #
839
+ # @example
840
+ # df.write_json(row_oriented: true)
841
+ # # => "[{\"foo\":1,\"bar\":6},{\"foo\":2,\"bar\":7},{\"foo\":3,\"bar\":8}]"
827
842
  def write_json(
828
- file,
843
+ file = nil,
829
844
  pretty: false,
830
845
  row_oriented: false
831
846
  )
832
847
  if Utils.pathlike?(file)
833
848
  file = Utils.normalise_filepath(file)
834
849
  end
835
-
836
- _df.write_json(file, pretty, row_oriented)
850
+ to_string_io = !file.nil? && file.is_a?(StringIO)
851
+ if file.nil? || to_string_io
852
+ buf = StringIO.new
853
+ buf.set_encoding(Encoding::BINARY)
854
+ _df.write_json(buf, pretty, row_oriented)
855
+ json_bytes = buf.string
856
+
857
+ json_str = json_bytes.force_encoding(Encoding::UTF_8)
858
+ if to_string_io
859
+ file.write(json_str)
860
+ else
861
+ return json_str
862
+ end
863
+ else
864
+ _df.write_json(file, pretty, row_oriented)
865
+ end
837
866
  nil
838
867
  end
839
868
 
@@ -843,12 +872,36 @@ module Polars
843
872
  # File path to which the result should be written.
844
873
  #
845
874
  # @return [nil]
846
- def write_ndjson(file)
875
+ #
876
+ # @example
877
+ # df = Polars::DataFrame.new(
878
+ # {
879
+ # "foo" => [1, 2, 3],
880
+ # "bar" => [6, 7, 8]
881
+ # }
882
+ # )
883
+ # df.write_ndjson()
884
+ # # => "{\"foo\":1,\"bar\":6}\n{\"foo\":2,\"bar\":7}\n{\"foo\":3,\"bar\":8}\n"
885
+ def write_ndjson(file = nil)
847
886
  if Utils.pathlike?(file)
848
887
  file = Utils.normalise_filepath(file)
849
888
  end
850
-
851
- _df.write_ndjson(file)
889
+ to_string_io = !file.nil? && file.is_a?(StringIO)
890
+ if file.nil? || to_string_io
891
+ buf = StringIO.new
892
+ buf.set_encoding(Encoding::BINARY)
893
+ _df.write_ndjson(buf)
894
+ json_bytes = buf.string
895
+
896
+ json_str = json_bytes.force_encoding(Encoding::UTF_8)
897
+ if to_string_io
898
+ file.write(json_str)
899
+ else
900
+ return json_str
901
+ end
902
+ else
903
+ _df.write_ndjson(file)
904
+ end
852
905
  nil
853
906
  end
854
907
 
@@ -1010,7 +1063,7 @@ module Polars
1010
1063
 
1011
1064
  # Write to Apache Parquet file.
1012
1065
  #
1013
- # @param file [String]
1066
+ # @param file [String, Pathname, StringIO]
1014
1067
  # File path to which the file should be written.
1015
1068
  # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
1016
1069
  # Choose "zstd" for good compression performance.
@@ -1027,10 +1080,9 @@ module Polars
1027
1080
  # @param statistics [Boolean]
1028
1081
  # Write statistics to the parquet headers. This requires extra compute.
1029
1082
  # @param row_group_size [Integer, nil]
1030
- # Size of the row groups in number of rows.
1031
- # If `nil` (default), the chunks of the DataFrame are
1032
- # used. Writing in smaller chunks may reduce memory pressure and improve
1033
- # writing speeds.
1083
+ # Size of the row groups in number of rows. Defaults to 512^2 rows.
1084
+ # @param data_page_size [Integer, nil]
1085
+ # Size of the data page in bytes. Defaults to 1024^2 bytes.
1034
1086
  #
1035
1087
  # @return [nil]
1036
1088
  def write_parquet(
@@ -1038,7 +1090,8 @@ module Polars
1038
1090
  compression: "zstd",
1039
1091
  compression_level: nil,
1040
1092
  statistics: false,
1041
- row_group_size: nil
1093
+ row_group_size: nil,
1094
+ data_page_size: nil
1042
1095
  )
1043
1096
  if compression.nil?
1044
1097
  compression = "uncompressed"
@@ -1048,7 +1101,7 @@ module Polars
1048
1101
  end
1049
1102
 
1050
1103
  _df.write_parquet(
1051
- file, compression, compression_level, statistics, row_group_size
1104
+ file, compression, compression_level, statistics, row_group_size, data_page_size
1052
1105
  )
1053
1106
  end
1054
1107
 
@@ -1084,7 +1137,7 @@ module Polars
1084
1137
  # df.estimated_size
1085
1138
  # # => 25888898
1086
1139
  # df.estimated_size("mb")
1087
- # # => 24.689577102661133
1140
+ # # => 17.0601749420166
1088
1141
  def estimated_size(unit = "b")
1089
1142
  sz = _df.estimated_size
1090
1143
  Utils.scale_bytes(sz, to: unit)
@@ -1782,7 +1835,7 @@ module Polars
1782
1835
  # "b" => [2, 4, 6]
1783
1836
  # }
1784
1837
  # )
1785
- # df.with_row_count
1838
+ # df.with_row_index
1786
1839
  # # =>
1787
1840
  # # shape: (3, 3)
1788
1841
  # # ┌────────┬─────┬─────┐
@@ -1794,9 +1847,10 @@ module Polars
1794
1847
  # # │ 1 ┆ 3 ┆ 4 │
1795
1848
  # # │ 2 ┆ 5 ┆ 6 │
1796
1849
  # # └────────┴─────┴─────┘
1797
- def with_row_count(name: "row_nr", offset: 0)
1798
- _from_rbdf(_df.with_row_count(name, offset))
1850
+ def with_row_index(name: "row_nr", offset: 0)
1851
+ _from_rbdf(_df.with_row_index(name, offset))
1799
1852
  end
1853
+ alias_method :with_row_count, :with_row_index
1800
1854
 
1801
1855
  # Start a group by operation.
1802
1856
  #
@@ -2160,12 +2214,13 @@ module Polars
2160
2214
  # closed: "right"
2161
2215
  # ).agg(Polars.col("A").alias("A_agg_list"))
2162
2216
  # # =>
2163
- # # shape: (3, 4)
2217
+ # # shape: (4, 4)
2164
2218
  # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
2165
2219
  # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
2166
2220
  # # │ --- ┆ --- ┆ --- ┆ --- │
2167
2221
  # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
2168
2222
  # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
2223
+ # # │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │
2169
2224
  # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
2170
2225
  # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
2171
2226
  # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
@@ -2433,6 +2488,8 @@ module Polars
2433
2488
  # Join strategy.
2434
2489
  # @param suffix [String]
2435
2490
  # Suffix to append to columns with a duplicate name.
2491
+ # @param join_nulls [Boolean]
2492
+ # Join on null values. By default null values will never produce matches.
2436
2493
  #
2437
2494
  # @return [DataFrame]
2438
2495
  #
@@ -2515,7 +2572,7 @@ module Polars
2515
2572
  # # ╞═════╪═════╪═════╡
2516
2573
  # # │ 3 ┆ 8.0 ┆ c │
2517
2574
  # # └─────┴─────┴─────┘
2518
- def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
2575
+ def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right", join_nulls: false)
2519
2576
  lazy
2520
2577
  .join(
2521
2578
  other.lazy,
@@ -2524,6 +2581,7 @@ module Polars
2524
2581
  on: on,
2525
2582
  how: how,
2526
2583
  suffix: suffix,
2584
+ join_nulls: join_nulls
2527
2585
  )
2528
2586
  .collect(no_optimization: true)
2529
2587
  end
@@ -2617,26 +2675,26 @@ module Polars
2617
2675
  # # ┌─────┬─────┬───────────┐
2618
2676
  # # │ a ┆ b ┆ b_squared │
2619
2677
  # # │ --- ┆ --- ┆ --- │
2620
- # # │ i64 ┆ i64 ┆ f64
2678
+ # # │ i64 ┆ i64 ┆ i64
2621
2679
  # # ╞═════╪═════╪═══════════╡
2622
- # # │ 1 ┆ 2 ┆ 4.0
2623
- # # │ 3 ┆ 4 ┆ 16.0
2624
- # # │ 5 ┆ 6 ┆ 36.0
2680
+ # # │ 1 ┆ 2 ┆ 4
2681
+ # # │ 3 ┆ 4 ┆ 16
2682
+ # # │ 5 ┆ 6 ┆ 36
2625
2683
  # # └─────┴─────┴───────────┘
2626
2684
  #
2627
2685
  # @example Replaced
2628
2686
  # df.with_column(Polars.col("a") ** 2)
2629
2687
  # # =>
2630
2688
  # # shape: (3, 2)
2631
- # # ┌──────┬─────┐
2632
- # # │ a ┆ b │
2633
- # # │ --- ┆ --- │
2634
- # # │ f64 ┆ i64 │
2635
- # # ╞══════╪═════╡
2636
- # # │ 1.0 ┆ 2 │
2637
- # # │ 9.0 ┆ 4 │
2638
- # # │ 25.0 ┆ 6 │
2639
- # # └──────┴─────┘
2689
+ # # ┌─────┬─────┐
2690
+ # # │ a ┆ b │
2691
+ # # │ --- ┆ --- │
2692
+ # # │ i64 ┆ i64 │
2693
+ # # ╞═════╪═════╡
2694
+ # # │ 1 ┆ 2 │
2695
+ # # │ 9 ┆ 4 │
2696
+ # # │ 25 ┆ 6 │
2697
+ # # └─────┴─────┘
2640
2698
  def with_column(column)
2641
2699
  lazy
2642
2700
  .with_column(column)
@@ -2803,16 +2861,36 @@ module Polars
2803
2861
  # # │ 2 ┆ 7.0 │
2804
2862
  # # │ 3 ┆ 8.0 │
2805
2863
  # # └─────┴─────┘
2806
- def drop(columns)
2807
- if columns.is_a?(::Array)
2808
- df = clone
2809
- columns.each do |n|
2810
- df._df.drop_in_place(n)
2811
- end
2812
- df
2813
- else
2814
- _from_rbdf(_df.drop(columns))
2815
- end
2864
+ #
2865
+ # @example Drop multiple columns by passing a list of column names.
2866
+ # df.drop(["bar", "ham"])
2867
+ # # =>
2868
+ # # shape: (3, 1)
2869
+ # # ┌─────┐
2870
+ # # │ foo │
2871
+ # # │ --- │
2872
+ # # │ i64 │
2873
+ # # ╞═════╡
2874
+ # # │ 1 │
2875
+ # # │ 2 │
2876
+ # # │ 3 │
2877
+ # # └─────┘
2878
+ #
2879
+ # @example Use positional arguments to drop multiple columns.
2880
+ # df.drop("foo", "ham")
2881
+ # # =>
2882
+ # # shape: (3, 1)
2883
+ # # ┌─────┐
2884
+ # # │ bar │
2885
+ # # │ --- │
2886
+ # # │ f64 │
2887
+ # # ╞═════╡
2888
+ # # │ 6.0 │
2889
+ # # │ 7.0 │
2890
+ # # │ 8.0 │
2891
+ # # └─────┘
2892
+ def drop(*columns)
2893
+ lazy.drop(*columns).collect(_eager: true)
2816
2894
  end
2817
2895
 
2818
2896
  # Drop in place.
@@ -2867,7 +2945,7 @@ module Polars
2867
2945
  # "c" => [true, true, false, nil]
2868
2946
  # }
2869
2947
  # )
2870
- # df.cleared
2948
+ # df.clear
2871
2949
  # # =>
2872
2950
  # # shape: (0, 3)
2873
2951
  # # ┌─────┬─────┬──────┐
@@ -2876,9 +2954,31 @@ module Polars
2876
2954
  # # │ i64 ┆ f64 ┆ bool │
2877
2955
  # # ╞═════╪═════╪══════╡
2878
2956
  # # └─────┴─────┴──────┘
2879
- def cleared
2880
- height > 0 ? head(0) : clone
2957
+ #
2958
+ # @example
2959
+ # df.clear(2)
2960
+ # # =>
2961
+ # # shape: (2, 3)
2962
+ # # ┌──────┬──────┬──────┐
2963
+ # # │ a ┆ b ┆ c │
2964
+ # # │ --- ┆ --- ┆ --- │
2965
+ # # │ i64 ┆ f64 ┆ bool │
2966
+ # # ╞══════╪══════╪══════╡
2967
+ # # │ null ┆ null ┆ null │
2968
+ # # │ null ┆ null ┆ null │
2969
+ # # └──────┴──────┴──────┘
2970
+ def clear(n = 0)
2971
+ if n == 0
2972
+ _from_rbdf(_df.clear)
2973
+ elsif n > 0 || len > 0
2974
+ self.class.new(
2975
+ schema.to_h { |nm, tp| [nm, Series.new(nm, [], dtype: tp).extend_constant(nil, n)] }
2976
+ )
2977
+ else
2978
+ clone
2979
+ end
2881
2980
  end
2981
+ alias_method :cleared, :clear
2882
2982
 
2883
2983
  # clone handled by initialize_copy
2884
2984
 
@@ -3141,8 +3241,11 @@ module Polars
3141
3241
  aggregate_expr = Polars.element.median._rbexpr
3142
3242
  when "last"
3143
3243
  aggregate_expr = Polars.element.last._rbexpr
3244
+ when "len"
3245
+ aggregate_expr = Polars.len._rbexpr
3144
3246
  when "count"
3145
- aggregate_expr = Polars.count._rbexpr
3247
+ warn "`aggregate_function: \"count\"` input for `pivot` is deprecated. Use `aggregate_function: \"len\"` instead."
3248
+ aggregate_expr = Polars.len._rbexpr
3146
3249
  else
3147
3250
  raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
3148
3251
  end
@@ -3154,9 +3257,9 @@ module Polars
3154
3257
 
3155
3258
  _from_rbdf(
3156
3259
  _df.pivot_expr(
3157
- values,
3158
3260
  index,
3159
3261
  columns,
3262
+ values,
3160
3263
  maintain_order,
3161
3264
  sort_columns,
3162
3265
  aggregate_expr,
@@ -3591,8 +3694,13 @@ module Polars
3591
3694
 
3592
3695
  # Select columns from this DataFrame.
3593
3696
  #
3594
- # @param exprs [Object]
3595
- # Column or columns to select.
3697
+ # @param exprs [Array]
3698
+ # Column(s) to select, specified as positional arguments.
3699
+ # Accepts expression input. Strings are parsed as column names,
3700
+ # other non-expression inputs are parsed as literals.
3701
+ # @param named_exprs [Hash]
3702
+ # Additional columns to select, specified as keyword arguments.
3703
+ # The columns will be renamed to the keyword used.
3596
3704
  #
3597
3705
  # @return [DataFrame]
3598
3706
  #
@@ -3672,23 +3780,25 @@ module Polars
3672
3780
  # # │ 0 │
3673
3781
  # # │ 10 │
3674
3782
  # # └─────────┘
3675
- def select(exprs)
3676
- _from_rbdf(
3677
- lazy
3678
- .select(exprs)
3679
- .collect(no_optimization: true, string_cache: false)
3680
- ._df
3681
- )
3783
+ def select(*exprs, **named_exprs)
3784
+ lazy.select(*exprs, **named_exprs).collect(_eager: true)
3682
3785
  end
3683
3786
 
3684
- # Add or overwrite multiple columns in a DataFrame.
3787
+ # Add columns to this DataFrame.
3788
+ #
3789
+ # Added columns will replace existing columns with the same name.
3685
3790
  #
3686
3791
  # @param exprs [Array]
3687
- # Array of Expressions that evaluate to columns.
3792
+ # Column(s) to add, specified as positional arguments.
3793
+ # Accepts expression input. Strings are parsed as column names, other
3794
+ # non-expression inputs are parsed as literals.
3795
+ # @param named_exprs [Hash]
3796
+ # Additional columns to add, specified as keyword arguments.
3797
+ # The columns will be renamed to the keyword used.
3688
3798
  #
3689
3799
  # @return [DataFrame]
3690
3800
  #
3691
- # @example
3801
+ # @example Pass an expression to add it as a new column.
3692
3802
  # df = Polars::DataFrame.new(
3693
3803
  # {
3694
3804
  # "a" => [1, 2, 3, 4],
@@ -3696,32 +3806,94 @@ module Polars
3696
3806
  # "c" => [true, true, false, true]
3697
3807
  # }
3698
3808
  # )
3809
+ # df.with_columns((Polars.col("a") ** 2).alias("a^2"))
3810
+ # # =>
3811
+ # # shape: (4, 4)
3812
+ # # ┌─────┬──────┬───────┬─────┐
3813
+ # # │ a ┆ b ┆ c ┆ a^2 │
3814
+ # # │ --- ┆ --- ┆ --- ┆ --- │
3815
+ # # │ i64 ┆ f64 ┆ bool ┆ i64 │
3816
+ # # ╞═════╪══════╪═══════╪═════╡
3817
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1 │
3818
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4 │
3819
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9 │
3820
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16 │
3821
+ # # └─────┴──────┴───────┴─────┘
3822
+ #
3823
+ # @example Added columns will replace existing columns with the same name.
3824
+ # df.with_columns(Polars.col("a").cast(Polars::Float64))
3825
+ # # =>
3826
+ # # shape: (4, 3)
3827
+ # # ┌─────┬──────┬───────┐
3828
+ # # │ a ┆ b ┆ c │
3829
+ # # │ --- ┆ --- ┆ --- │
3830
+ # # │ f64 ┆ f64 ┆ bool │
3831
+ # # ╞═════╪══════╪═══════╡
3832
+ # # │ 1.0 ┆ 0.5 ┆ true │
3833
+ # # │ 2.0 ┆ 4.0 ┆ true │
3834
+ # # │ 3.0 ┆ 10.0 ┆ false │
3835
+ # # │ 4.0 ┆ 13.0 ┆ true │
3836
+ # # └─────┴──────┴───────┘
3837
+ #
3838
+ # @example Multiple columns can be added by passing a list of expressions.
3699
3839
  # df.with_columns(
3700
3840
  # [
3701
3841
  # (Polars.col("a") ** 2).alias("a^2"),
3702
3842
  # (Polars.col("b") / 2).alias("b/2"),
3703
- # (Polars.col("c").is_not).alias("not c")
3843
+ # (Polars.col("c").not_).alias("not c"),
3704
3844
  # ]
3705
3845
  # )
3706
3846
  # # =>
3707
3847
  # # shape: (4, 6)
3708
- # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
3709
- # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
3710
- # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3711
- # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
3712
- # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
3713
- # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
3714
- # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
3715
- # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
3716
- # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
3717
- # # └─────┴──────┴───────┴──────┴──────┴───────┘
3718
- def with_columns(exprs)
3719
- if !exprs.nil? && !exprs.is_a?(::Array)
3720
- exprs = [exprs]
3721
- end
3722
- lazy
3723
- .with_columns(exprs)
3724
- .collect(no_optimization: true, string_cache: false)
3848
+ # # ┌─────┬──────┬───────┬─────┬──────┬───────┐
3849
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
3850
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3851
+ # # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
3852
+ # # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
3853
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
3854
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
3855
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
3856
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
3857
+ # # └─────┴──────┴───────┴─────┴──────┴───────┘
3858
+ #
3859
+ # @example Multiple columns also can be added using positional arguments instead of a list.
3860
+ # df.with_columns(
3861
+ # (Polars.col("a") ** 2).alias("a^2"),
3862
+ # (Polars.col("b") / 2).alias("b/2"),
3863
+ # (Polars.col("c").not_).alias("not c"),
3864
+ # )
3865
+ # # =>
3866
+ # # shape: (4, 6)
3867
+ # # ┌─────┬──────┬───────┬─────┬──────┬───────┐
3868
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
3869
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3870
+ # # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
3871
+ # # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
3872
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
3873
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
3874
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
3875
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
3876
+ # # └─────┴──────┴───────┴─────┴──────┴───────┘
3877
+ #
3878
+ # @example Use keyword arguments to easily name your expression inputs.
3879
+ # df.with_columns(
3880
+ # ab: Polars.col("a") * Polars.col("b"),
3881
+ # not_c: Polars.col("c").not_
3882
+ # )
3883
+ # # =>
3884
+ # # shape: (4, 5)
3885
+ # # ┌─────┬──────┬───────┬──────┬───────┐
3886
+ # # │ a ┆ b ┆ c ┆ ab ┆ not_c │
3887
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3888
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │
3889
+ # # ╞═════╪══════╪═══════╪══════╪═══════╡
3890
+ # # │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │
3891
+ # # │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │
3892
+ # # │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │
3893
+ # # │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │
3894
+ # # └─────┴──────┴───────┴──────┴───────┘
3895
+ def with_columns(*exprs, **named_exprs)
3896
+ lazy.with_columns(*exprs, **named_exprs).collect(_eager: true)
3725
3897
  end
3726
3898
 
3727
3899
  # Get number of chunks used by the ChunkedArrays of this DataFrame.
@@ -4363,7 +4535,7 @@ module Polars
4363
4535
  # # null
4364
4536
  # # ]
4365
4537
  #
4366
- # @example A horizontal boolean or, similar to a row-wise .any():
4538
+ # @example A horizontal boolean or, similar to a row-wise .any:
4367
4539
  # df = Polars::DataFrame.new(
4368
4540
  # {
4369
4541
  # "a" => [false, false, true],
@@ -4486,7 +4658,7 @@ module Polars
4486
4658
  # # => [{"a"=>1, "b"=>2}, {"a"=>3, "b"=>4}, {"a"=>5, "b"=>6}]
4487
4659
  def rows(named: false)
4488
4660
  if named
4489
- columns = columns()
4661
+ columns = self.columns
4490
4662
  _df.row_tuples.map do |v|
4491
4663
  columns.zip(v).to_h
4492
4664
  end
@@ -4527,7 +4699,7 @@ module Polars
4527
4699
  return to_enum(:iter_rows, named: named, buffer_size: buffer_size) unless block_given?
4528
4700
 
4529
4701
  # load into the local namespace for a modest performance boost in the hot loops
4530
- columns = columns()
4702
+ columns = self.columns
4531
4703
 
4532
4704
  # note: buffering rows results in a 2-4x speedup over individual calls
4533
4705
  # to ".row(i)", so it should only be disabled in extremely specific cases.
@@ -4764,13 +4936,51 @@ module Polars
4764
4936
  _from_rbdf(_df.unnest(names))
4765
4937
  end
4766
4938
 
4767
- # TODO
4939
+ # Requires NumPy
4768
4940
  # def corr
4769
4941
  # end
4770
4942
 
4771
- # TODO
4772
- # def merge_sorted
4773
- # end
4943
+ # Take two sorted DataFrames and merge them by the sorted key.
4944
+ #
4945
+ # The output of this operation will also be sorted.
4946
+ # It is the callers responsibility that the frames are sorted
4947
+ # by that key otherwise the output will not make sense.
4948
+ #
4949
+ # The schemas of both DataFrames must be equal.
4950
+ #
4951
+ # @param other [DataFrame]
4952
+ # Other DataFrame that must be merged
4953
+ # @param key [String]
4954
+ # Key that is sorted.
4955
+ #
4956
+ # @return [DataFrame]
4957
+ #
4958
+ # @example
4959
+ # df0 = Polars::DataFrame.new(
4960
+ # {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
4961
+ # ).sort("age")
4962
+ # df1 = Polars::DataFrame.new(
4963
+ # {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
4964
+ # ).sort("age")
4965
+ # df0.merge_sorted(df1, "age")
4966
+ # # =>
4967
+ # # shape: (7, 2)
4968
+ # # ┌────────┬─────┐
4969
+ # # │ name ┆ age │
4970
+ # # │ --- ┆ --- │
4971
+ # # │ str ┆ i64 │
4972
+ # # ╞════════╪═════╡
4973
+ # # │ bob ┆ 18 │
4974
+ # # │ thomas ┆ 20 │
4975
+ # # │ anna ┆ 21 │
4976
+ # # │ megan ┆ 33 │
4977
+ # # │ steve ┆ 42 │
4978
+ # # │ steve ┆ 42 │
4979
+ # # │ elise ┆ 44 │
4980
+ # # └────────┴─────┘
4981
+ def merge_sorted(other, key)
4982
+ lazy.merge_sorted(other.lazy, key).collect(_eager: true)
4983
+ end
4774
4984
 
4775
4985
  # Indicate that one or multiple columns are sorted.
4776
4986
  #
@@ -4812,7 +5022,7 @@ module Polars
4812
5022
  end
4813
5023
 
4814
5024
  def _pos_idxs(idxs, dim)
4815
- idx_type = Polars._get_idx_type
5025
+ idx_type = Plr.get_index_type
4816
5026
 
4817
5027
  if idxs.is_a?(Series)
4818
5028
  if idxs.dtype == idx_type
@@ -5045,14 +5255,14 @@ module Polars
5045
5255
  elsif data[0].is_a?(Hash)
5046
5256
  column_names, dtypes = _unpack_schema(columns)
5047
5257
  schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
5048
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
5258
+ rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
5049
5259
  if column_names
5050
5260
  rbdf = _post_apply_columns(rbdf, column_names)
5051
5261
  end
5052
5262
  return rbdf
5053
5263
  elsif data[0].is_a?(::Array)
5264
+ first_element = data[0]
5054
5265
  if orient.nil? && !columns.nil?
5055
- first_element = data[0]
5056
5266
  row_types = first_element.filter_map { |value| value.class }.uniq
5057
5267
  if row_types.include?(Integer) && row_types.include?(Float)
5058
5268
  row_types.delete(Integer)