polars-df 0.8.0-x86_64-linux → 0.9.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +30 -1
  3. data/Cargo.lock +107 -59
  4. data/Cargo.toml +0 -3
  5. data/LICENSE-THIRD-PARTY.txt +1726 -754
  6. data/LICENSE.txt +1 -1
  7. data/README.md +2 -2
  8. data/lib/polars/3.1/polars.so +0 -0
  9. data/lib/polars/3.2/polars.so +0 -0
  10. data/lib/polars/3.3/polars.so +0 -0
  11. data/lib/polars/array_expr.rb +449 -0
  12. data/lib/polars/array_name_space.rb +346 -0
  13. data/lib/polars/cat_expr.rb +24 -0
  14. data/lib/polars/cat_name_space.rb +75 -0
  15. data/lib/polars/config.rb +2 -2
  16. data/lib/polars/data_frame.rb +179 -43
  17. data/lib/polars/data_types.rb +191 -28
  18. data/lib/polars/date_time_expr.rb +31 -14
  19. data/lib/polars/exceptions.rb +12 -1
  20. data/lib/polars/expr.rb +866 -186
  21. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  22. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  23. data/lib/polars/functions/as_datatype.rb +248 -0
  24. data/lib/polars/functions/col.rb +47 -0
  25. data/lib/polars/functions/eager.rb +182 -0
  26. data/lib/polars/functions/lazy.rb +1280 -0
  27. data/lib/polars/functions/len.rb +49 -0
  28. data/lib/polars/functions/lit.rb +35 -0
  29. data/lib/polars/functions/random.rb +16 -0
  30. data/lib/polars/functions/range/date_range.rb +103 -0
  31. data/lib/polars/functions/range/int_range.rb +51 -0
  32. data/lib/polars/functions/repeat.rb +144 -0
  33. data/lib/polars/functions/whenthen.rb +27 -0
  34. data/lib/polars/functions.rb +29 -416
  35. data/lib/polars/group_by.rb +2 -2
  36. data/lib/polars/io.rb +18 -25
  37. data/lib/polars/lazy_frame.rb +367 -53
  38. data/lib/polars/list_expr.rb +152 -6
  39. data/lib/polars/list_name_space.rb +102 -0
  40. data/lib/polars/meta_expr.rb +175 -7
  41. data/lib/polars/series.rb +273 -34
  42. data/lib/polars/string_cache.rb +75 -0
  43. data/lib/polars/string_expr.rb +412 -96
  44. data/lib/polars/string_name_space.rb +4 -4
  45. data/lib/polars/testing.rb +507 -0
  46. data/lib/polars/utils.rb +52 -8
  47. data/lib/polars/version.rb +1 -1
  48. data/lib/polars.rb +15 -2
  49. metadata +33 -4
  50. data/lib/polars/lazy_functions.rb +0 -1181
@@ -47,8 +47,8 @@ module Polars
47
47
  end
48
48
 
49
49
  # @private
50
- def self._from_hashes(data, infer_schema_length: 100, schema: nil)
51
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema)
50
+ def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
51
+ rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
52
52
  _from_rbdf(rbdf)
53
53
  end
54
54
 
@@ -622,7 +622,7 @@ module Polars
622
622
  # select single column
623
623
  # df["foo"]
624
624
  if item.is_a?(::String) || item.is_a?(Symbol)
625
- return Utils.wrap_s(_df.column(item.to_s))
625
+ return Utils.wrap_s(_df.get_column(item.to_s))
626
626
  end
627
627
 
628
628
  # df[idx]
@@ -1084,7 +1084,7 @@ module Polars
1084
1084
  # df.estimated_size
1085
1085
  # # => 25888898
1086
1086
  # df.estimated_size("mb")
1087
- # # => 24.689577102661133
1087
+ # # => 26.702880859375
1088
1088
  def estimated_size(unit = "b")
1089
1089
  sz = _df.estimated_size
1090
1090
  Utils.scale_bytes(sz, to: unit)
@@ -1782,7 +1782,7 @@ module Polars
1782
1782
  # "b" => [2, 4, 6]
1783
1783
  # }
1784
1784
  # )
1785
- # df.with_row_count
1785
+ # df.with_row_index
1786
1786
  # # =>
1787
1787
  # # shape: (3, 3)
1788
1788
  # # ┌────────┬─────┬─────┐
@@ -1794,9 +1794,10 @@ module Polars
1794
1794
  # # │ 1 ┆ 3 ┆ 4 │
1795
1795
  # # │ 2 ┆ 5 ┆ 6 │
1796
1796
  # # └────────┴─────┴─────┘
1797
- def with_row_count(name: "row_nr", offset: 0)
1798
- _from_rbdf(_df.with_row_count(name, offset))
1797
+ def with_row_index(name: "row_nr", offset: 0)
1798
+ _from_rbdf(_df.with_row_index(name, offset))
1799
1799
  end
1800
+ alias_method :with_row_count, :with_row_index
1800
1801
 
1801
1802
  # Start a group by operation.
1802
1803
  #
@@ -2433,6 +2434,8 @@ module Polars
2433
2434
  # Join strategy.
2434
2435
  # @param suffix [String]
2435
2436
  # Suffix to append to columns with a duplicate name.
2437
+ # @param join_nulls [Boolean]
2438
+ # Join on null values. By default null values will never produce matches.
2436
2439
  #
2437
2440
  # @return [DataFrame]
2438
2441
  #
@@ -2515,7 +2518,7 @@ module Polars
2515
2518
  # # ╞═════╪═════╪═════╡
2516
2519
  # # │ 3 ┆ 8.0 ┆ c │
2517
2520
  # # └─────┴─────┴─────┘
2518
- def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
2521
+ def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right", join_nulls: false)
2519
2522
  lazy
2520
2523
  .join(
2521
2524
  other.lazy,
@@ -2524,6 +2527,7 @@ module Polars
2524
2527
  on: on,
2525
2528
  how: how,
2526
2529
  suffix: suffix,
2530
+ join_nulls: join_nulls
2527
2531
  )
2528
2532
  .collect(no_optimization: true)
2529
2533
  end
@@ -2867,7 +2871,7 @@ module Polars
2867
2871
  # "c" => [true, true, false, nil]
2868
2872
  # }
2869
2873
  # )
2870
- # df.cleared
2874
+ # df.clear
2871
2875
  # # =>
2872
2876
  # # shape: (0, 3)
2873
2877
  # # ┌─────┬─────┬──────┐
@@ -2876,9 +2880,31 @@ module Polars
2876
2880
  # # │ i64 ┆ f64 ┆ bool │
2877
2881
  # # ╞═════╪═════╪══════╡
2878
2882
  # # └─────┴─────┴──────┘
2879
- def cleared
2880
- height > 0 ? head(0) : clone
2883
+ #
2884
+ # @example
2885
+ # df.clear(2)
2886
+ # # =>
2887
+ # # shape: (2, 3)
2888
+ # # ┌──────┬──────┬──────┐
2889
+ # # │ a ┆ b ┆ c │
2890
+ # # │ --- ┆ --- ┆ --- │
2891
+ # # │ i64 ┆ f64 ┆ bool │
2892
+ # # ╞══════╪══════╪══════╡
2893
+ # # │ null ┆ null ┆ null │
2894
+ # # │ null ┆ null ┆ null │
2895
+ # # └──────┴──────┴──────┘
2896
+ def clear(n = 0)
2897
+ if n == 0
2898
+ _from_rbdf(_df.clear)
2899
+ elsif n > 0 || len > 0
2900
+ self.class.new(
2901
+ schema.to_h { |nm, tp| [nm, Series.new(nm, [], dtype: tp).extend_constant(nil, n)] }
2902
+ )
2903
+ else
2904
+ clone
2905
+ end
2881
2906
  end
2907
+ alias_method :cleared, :clear
2882
2908
 
2883
2909
  # clone handled by initialize_copy
2884
2910
 
@@ -3141,8 +3167,11 @@ module Polars
3141
3167
  aggregate_expr = Polars.element.median._rbexpr
3142
3168
  when "last"
3143
3169
  aggregate_expr = Polars.element.last._rbexpr
3170
+ when "len"
3171
+ aggregate_expr = Polars.len._rbexpr
3144
3172
  when "count"
3145
- aggregate_expr = Polars.count._rbexpr
3173
+ warn "`aggregate_function: \"count\"` input for `pivot` is deprecated. Use `aggregate_function: \"len\"` instead."
3174
+ aggregate_expr = Polars.len._rbexpr
3146
3175
  else
3147
3176
  raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
3148
3177
  end
@@ -3154,9 +3183,9 @@ module Polars
3154
3183
 
3155
3184
  _from_rbdf(
3156
3185
  _df.pivot_expr(
3157
- values,
3158
3186
  index,
3159
3187
  columns,
3188
+ values,
3160
3189
  maintain_order,
3161
3190
  sort_columns,
3162
3191
  aggregate_expr,
@@ -3591,8 +3620,13 @@ module Polars
3591
3620
 
3592
3621
  # Select columns from this DataFrame.
3593
3622
  #
3594
- # @param exprs [Object]
3595
- # Column or columns to select.
3623
+ # @param exprs [Array]
3624
+ # Column(s) to select, specified as positional arguments.
3625
+ # Accepts expression input. Strings are parsed as column names,
3626
+ # other non-expression inputs are parsed as literals.
3627
+ # @param named_exprs [Hash]
3628
+ # Additional columns to select, specified as keyword arguments.
3629
+ # The columns will be renamed to the keyword used.
3596
3630
  #
3597
3631
  # @return [DataFrame]
3598
3632
  #
@@ -3672,23 +3706,25 @@ module Polars
3672
3706
  # # │ 0 │
3673
3707
  # # │ 10 │
3674
3708
  # # └─────────┘
3675
- def select(exprs)
3676
- _from_rbdf(
3677
- lazy
3678
- .select(exprs)
3679
- .collect(no_optimization: true, string_cache: false)
3680
- ._df
3681
- )
3709
+ def select(*exprs, **named_exprs)
3710
+ lazy.select(*exprs, **named_exprs).collect(_eager: true)
3682
3711
  end
3683
3712
 
3684
- # Add or overwrite multiple columns in a DataFrame.
3713
+ # Add columns to this DataFrame.
3714
+ #
3715
+ # Added columns will replace existing columns with the same name.
3685
3716
  #
3686
3717
  # @param exprs [Array]
3687
- # Array of Expressions that evaluate to columns.
3718
+ # Column(s) to add, specified as positional arguments.
3719
+ # Accepts expression input. Strings are parsed as column names, other
3720
+ # non-expression inputs are parsed as literals.
3721
+ # @param named_exprs [Hash]
3722
+ # Additional columns to add, specified as keyword arguments.
3723
+ # The columns will be renamed to the keyword used.
3688
3724
  #
3689
3725
  # @return [DataFrame]
3690
3726
  #
3691
- # @example
3727
+ # @example Pass an expression to add it as a new column.
3692
3728
  # df = Polars::DataFrame.new(
3693
3729
  # {
3694
3730
  # "a" => [1, 2, 3, 4],
@@ -3696,11 +3732,41 @@ module Polars
3696
3732
  # "c" => [true, true, false, true]
3697
3733
  # }
3698
3734
  # )
3735
+ # df.with_columns((Polars.col("a") ** 2).alias("a^2"))
3736
+ # # =>
3737
+ # # shape: (4, 4)
3738
+ # # ┌─────┬──────┬───────┬──────┐
3739
+ # # │ a ┆ b ┆ c ┆ a^2 │
3740
+ # # │ --- ┆ --- ┆ --- ┆ --- │
3741
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 │
3742
+ # # ╞═════╪══════╪═══════╪══════╡
3743
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 │
3744
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 │
3745
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 │
3746
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 │
3747
+ # # └─────┴──────┴───────┴──────┘
3748
+ #
3749
+ # @example Added columns will replace existing columns with the same name.
3750
+ # df.with_columns(Polars.col("a").cast(Polars::Float64))
3751
+ # # =>
3752
+ # # shape: (4, 3)
3753
+ # # ┌─────┬──────┬───────┐
3754
+ # # │ a ┆ b ┆ c │
3755
+ # # │ --- ┆ --- ┆ --- │
3756
+ # # │ f64 ┆ f64 ┆ bool │
3757
+ # # ╞═════╪══════╪═══════╡
3758
+ # # │ 1.0 ┆ 0.5 ┆ true │
3759
+ # # │ 2.0 ┆ 4.0 ┆ true │
3760
+ # # │ 3.0 ┆ 10.0 ┆ false │
3761
+ # # │ 4.0 ┆ 13.0 ┆ true │
3762
+ # # └─────┴──────┴───────┘
3763
+ #
3764
+ # @example Multiple columns can be added by passing a list of expressions.
3699
3765
  # df.with_columns(
3700
3766
  # [
3701
3767
  # (Polars.col("a") ** 2).alias("a^2"),
3702
3768
  # (Polars.col("b") / 2).alias("b/2"),
3703
- # (Polars.col("c").is_not).alias("not c")
3769
+ # (Polars.col("c").not_).alias("not c"),
3704
3770
  # ]
3705
3771
  # )
3706
3772
  # # =>
@@ -3715,13 +3781,45 @@ module Polars
3715
3781
  # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
3716
3782
  # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
3717
3783
  # # └─────┴──────┴───────┴──────┴──────┴───────┘
3718
- def with_columns(exprs)
3719
- if !exprs.nil? && !exprs.is_a?(::Array)
3720
- exprs = [exprs]
3721
- end
3722
- lazy
3723
- .with_columns(exprs)
3724
- .collect(no_optimization: true, string_cache: false)
3784
+ #
3785
+ # @example Multiple columns also can be added using positional arguments instead of a list.
3786
+ # df.with_columns(
3787
+ # (Polars.col("a") ** 2).alias("a^2"),
3788
+ # (Polars.col("b") / 2).alias("b/2"),
3789
+ # (Polars.col("c").not_).alias("not c"),
3790
+ # )
3791
+ # # =>
3792
+ # # shape: (4, 6)
3793
+ # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
3794
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
3795
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3796
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
3797
+ # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
3798
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
3799
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
3800
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
3801
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
3802
+ # # └─────┴──────┴───────┴──────┴──────┴───────┘
3803
+ #
3804
+ # @example Use keyword arguments to easily name your expression inputs.
3805
+ # df.with_columns(
3806
+ # ab: Polars.col("a") * Polars.col("b"),
3807
+ # not_c: Polars.col("c").not_
3808
+ # )
3809
+ # # =>
3810
+ # # shape: (4, 5)
3811
+ # # ┌─────┬──────┬───────┬──────┬───────┐
3812
+ # # │ a ┆ b ┆ c ┆ ab ┆ not_c │
3813
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3814
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │
3815
+ # # ╞═════╪══════╪═══════╪══════╪═══════╡
3816
+ # # │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │
3817
+ # # │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │
3818
+ # # │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │
3819
+ # # │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │
3820
+ # # └─────┴──────┴───────┴──────┴───────┘
3821
+ def with_columns(*exprs, **named_exprs)
3822
+ lazy.with_columns(*exprs, **named_exprs).collect(_eager: true)
3725
3823
  end
3726
3824
 
3727
3825
  # Get number of chunks used by the ChunkedArrays of this DataFrame.
@@ -4363,7 +4461,7 @@ module Polars
4363
4461
  # # null
4364
4462
  # # ]
4365
4463
  #
4366
- # @example A horizontal boolean or, similar to a row-wise .any():
4464
+ # @example A horizontal boolean or, similar to a row-wise .any:
4367
4465
  # df = Polars::DataFrame.new(
4368
4466
  # {
4369
4467
  # "a" => [false, false, true],
@@ -4486,7 +4584,7 @@ module Polars
4486
4584
  # # => [{"a"=>1, "b"=>2}, {"a"=>3, "b"=>4}, {"a"=>5, "b"=>6}]
4487
4585
  def rows(named: false)
4488
4586
  if named
4489
- columns = columns()
4587
+ columns = self.columns
4490
4588
  _df.row_tuples.map do |v|
4491
4589
  columns.zip(v).to_h
4492
4590
  end
@@ -4527,7 +4625,7 @@ module Polars
4527
4625
  return to_enum(:iter_rows, named: named, buffer_size: buffer_size) unless block_given?
4528
4626
 
4529
4627
  # load into the local namespace for a modest performance boost in the hot loops
4530
- columns = columns()
4628
+ columns = self.columns
4531
4629
 
4532
4630
  # note: buffering rows results in a 2-4x speedup over individual calls
4533
4631
  # to ".row(i)", so it should only be disabled in extremely specific cases.
@@ -4764,13 +4862,51 @@ module Polars
4764
4862
  _from_rbdf(_df.unnest(names))
4765
4863
  end
4766
4864
 
4767
- # TODO
4865
+ # Requires NumPy
4768
4866
  # def corr
4769
4867
  # end
4770
4868
 
4771
- # TODO
4772
- # def merge_sorted
4773
- # end
4869
+ # Take two sorted DataFrames and merge them by the sorted key.
4870
+ #
4871
+ # The output of this operation will also be sorted.
4872
+ # It is the callers responsibility that the frames are sorted
4873
+ # by that key otherwise the output will not make sense.
4874
+ #
4875
+ # The schemas of both DataFrames must be equal.
4876
+ #
4877
+ # @param other [DataFrame]
4878
+ # Other DataFrame that must be merged
4879
+ # @param key [String]
4880
+ # Key that is sorted.
4881
+ #
4882
+ # @return [DataFrame]
4883
+ #
4884
+ # @example
4885
+ # df0 = Polars::DataFrame.new(
4886
+ # {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
4887
+ # ).sort("age")
4888
+ # df1 = Polars::DataFrame.new(
4889
+ # {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
4890
+ # ).sort("age")
4891
+ # df0.merge_sorted(df1, "age")
4892
+ # # =>
4893
+ # # shape: (7, 2)
4894
+ # # ┌────────┬─────┐
4895
+ # # │ name ┆ age │
4896
+ # # │ --- ┆ --- │
4897
+ # # │ str ┆ i64 │
4898
+ # # ╞════════╪═════╡
4899
+ # # │ bob ┆ 18 │
4900
+ # # │ thomas ┆ 20 │
4901
+ # # │ anna ┆ 21 │
4902
+ # # │ megan ┆ 33 │
4903
+ # # │ steve ┆ 42 │
4904
+ # # │ steve ┆ 42 │
4905
+ # # │ elise ┆ 44 │
4906
+ # # └────────┴─────┘
4907
+ def merge_sorted(other, key)
4908
+ lazy.merge_sorted(other.lazy, key).collect(_eager: true)
4909
+ end
4774
4910
 
4775
4911
  # Indicate that one or multiple columns are sorted.
4776
4912
  #
@@ -4812,7 +4948,7 @@ module Polars
4812
4948
  end
4813
4949
 
4814
4950
  def _pos_idxs(idxs, dim)
4815
- idx_type = Polars._get_idx_type
4951
+ idx_type = Plr.get_index_type
4816
4952
 
4817
4953
  if idxs.is_a?(Series)
4818
4954
  if idxs.dtype == idx_type
@@ -5045,14 +5181,14 @@ module Polars
5045
5181
  elsif data[0].is_a?(Hash)
5046
5182
  column_names, dtypes = _unpack_schema(columns)
5047
5183
  schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
5048
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
5184
+ rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
5049
5185
  if column_names
5050
5186
  rbdf = _post_apply_columns(rbdf, column_names)
5051
5187
  end
5052
5188
  return rbdf
5053
5189
  elsif data[0].is_a?(::Array)
5190
+ first_element = data[0]
5054
5191
  if orient.nil? && !columns.nil?
5055
- first_element = data[0]
5056
5192
  row_types = first_element.filter_map { |value| value.class }.uniq
5057
5193
  if row_types.include?(Integer) && row_types.include?(Float)
5058
5194
  row_types.delete(Integer)