polars-df 0.8.0-x86_64-darwin → 0.9.0-x86_64-darwin

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +30 -1
  3. data/Cargo.lock +107 -59
  4. data/Cargo.toml +0 -3
  5. data/LICENSE-THIRD-PARTY.txt +1726 -754
  6. data/LICENSE.txt +1 -1
  7. data/README.md +2 -2
  8. data/lib/polars/3.1/polars.bundle +0 -0
  9. data/lib/polars/3.2/polars.bundle +0 -0
  10. data/lib/polars/3.3/polars.bundle +0 -0
  11. data/lib/polars/array_expr.rb +449 -0
  12. data/lib/polars/array_name_space.rb +346 -0
  13. data/lib/polars/cat_expr.rb +24 -0
  14. data/lib/polars/cat_name_space.rb +75 -0
  15. data/lib/polars/config.rb +2 -2
  16. data/lib/polars/data_frame.rb +179 -43
  17. data/lib/polars/data_types.rb +191 -28
  18. data/lib/polars/date_time_expr.rb +31 -14
  19. data/lib/polars/exceptions.rb +12 -1
  20. data/lib/polars/expr.rb +866 -186
  21. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  22. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  23. data/lib/polars/functions/as_datatype.rb +248 -0
  24. data/lib/polars/functions/col.rb +47 -0
  25. data/lib/polars/functions/eager.rb +182 -0
  26. data/lib/polars/functions/lazy.rb +1280 -0
  27. data/lib/polars/functions/len.rb +49 -0
  28. data/lib/polars/functions/lit.rb +35 -0
  29. data/lib/polars/functions/random.rb +16 -0
  30. data/lib/polars/functions/range/date_range.rb +103 -0
  31. data/lib/polars/functions/range/int_range.rb +51 -0
  32. data/lib/polars/functions/repeat.rb +144 -0
  33. data/lib/polars/functions/whenthen.rb +27 -0
  34. data/lib/polars/functions.rb +29 -416
  35. data/lib/polars/group_by.rb +2 -2
  36. data/lib/polars/io.rb +18 -25
  37. data/lib/polars/lazy_frame.rb +367 -53
  38. data/lib/polars/list_expr.rb +152 -6
  39. data/lib/polars/list_name_space.rb +102 -0
  40. data/lib/polars/meta_expr.rb +175 -7
  41. data/lib/polars/series.rb +273 -34
  42. data/lib/polars/string_cache.rb +75 -0
  43. data/lib/polars/string_expr.rb +412 -96
  44. data/lib/polars/string_name_space.rb +4 -4
  45. data/lib/polars/testing.rb +507 -0
  46. data/lib/polars/utils.rb +52 -8
  47. data/lib/polars/version.rb +1 -1
  48. data/lib/polars.rb +15 -2
  49. metadata +33 -4
  50. data/lib/polars/lazy_functions.rb +0 -1181
@@ -47,8 +47,8 @@ module Polars
47
47
  end
48
48
 
49
49
  # @private
50
- def self._from_hashes(data, infer_schema_length: 100, schema: nil)
51
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema)
50
+ def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
51
+ rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
52
52
  _from_rbdf(rbdf)
53
53
  end
54
54
 
@@ -622,7 +622,7 @@ module Polars
622
622
  # select single column
623
623
  # df["foo"]
624
624
  if item.is_a?(::String) || item.is_a?(Symbol)
625
- return Utils.wrap_s(_df.column(item.to_s))
625
+ return Utils.wrap_s(_df.get_column(item.to_s))
626
626
  end
627
627
 
628
628
  # df[idx]
@@ -1084,7 +1084,7 @@ module Polars
1084
1084
  # df.estimated_size
1085
1085
  # # => 28000000
1086
1086
  # df.estimated_size("mb")
1087
- # # => 24.689577102661133
1087
+ # # => 26.702880859375
1088
1088
  def estimated_size(unit = "b")
1089
1089
  sz = _df.estimated_size
1090
1090
  Utils.scale_bytes(sz, to: unit)
@@ -1782,7 +1782,7 @@ module Polars
1782
1782
  # "b" => [2, 4, 6]
1783
1783
  # }
1784
1784
  # )
1785
- # df.with_row_count
1785
+ # df.with_row_index
1786
1786
  # # =>
1787
1787
  # # shape: (3, 3)
1788
1788
  # # ┌────────┬─────┬─────┐
@@ -1794,9 +1794,10 @@ module Polars
1794
1794
  # # │ 1 ┆ 3 ┆ 4 │
1795
1795
  # # │ 2 ┆ 5 ┆ 6 │
1796
1796
  # # └────────┴─────┴─────┘
1797
- def with_row_count(name: "row_nr", offset: 0)
1798
- _from_rbdf(_df.with_row_count(name, offset))
1797
+ def with_row_index(name: "row_nr", offset: 0)
1798
+ _from_rbdf(_df.with_row_index(name, offset))
1799
1799
  end
1800
+ alias_method :with_row_count, :with_row_index
1800
1801
 
1801
1802
  # Start a group by operation.
1802
1803
  #
@@ -2433,6 +2434,8 @@ module Polars
2433
2434
  # Join strategy.
2434
2435
  # @param suffix [String]
2435
2436
  # Suffix to append to columns with a duplicate name.
2437
+ # @param join_nulls [Boolean]
2438
+ # Join on null values. By default, null values never produce matches.
2436
2439
  #
2437
2440
  # @return [DataFrame]
2438
2441
  #
@@ -2515,7 +2518,7 @@ module Polars
2515
2518
  # # ╞═════╪═════╪═════╡
2516
2519
  # # │ 3 ┆ 8.0 ┆ c │
2517
2520
  # # └─────┴─────┴─────┘
2518
- def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
2521
+ def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right", join_nulls: false)
2519
2522
  lazy
2520
2523
  .join(
2521
2524
  other.lazy,
@@ -2524,6 +2527,7 @@ module Polars
2524
2527
  on: on,
2525
2528
  how: how,
2526
2529
  suffix: suffix,
2530
+ join_nulls: join_nulls
2527
2531
  )
2528
2532
  .collect(no_optimization: true)
2529
2533
  end
@@ -2867,7 +2871,7 @@ module Polars
2867
2871
  # "c" => [true, true, false, nil]
2868
2872
  # }
2869
2873
  # )
2870
- # df.cleared
2874
+ # df.clear
2871
2875
  # # =>
2872
2876
  # # shape: (0, 3)
2873
2877
  # # ┌─────┬─────┬──────┐
@@ -2876,9 +2880,31 @@ module Polars
2876
2880
  # # │ i64 ┆ f64 ┆ bool │
2877
2881
  # # ╞═════╪═════╪══════╡
2878
2882
  # # └─────┴─────┴──────┘
2879
- def cleared
2880
- height > 0 ? head(0) : clone
2883
+ #
2884
+ # @example
2885
+ # df.clear(2)
2886
+ # # =>
2887
+ # # shape: (2, 3)
2888
+ # # ┌──────┬──────┬──────┐
2889
+ # # │ a ┆ b ┆ c │
2890
+ # # │ --- ┆ --- ┆ --- │
2891
+ # # │ i64 ┆ f64 ┆ bool │
2892
+ # # ╞══════╪══════╪══════╡
2893
+ # # │ null ┆ null ┆ null │
2894
+ # # │ null ┆ null ┆ null │
2895
+ # # └──────┴──────┴──────┘
2896
+ def clear(n = 0)
2897
+ if n == 0
2898
+ _from_rbdf(_df.clear)
2899
+ elsif n > 0 || len > 0
2900
+ self.class.new(
2901
+ schema.to_h { |nm, tp| [nm, Series.new(nm, [], dtype: tp).extend_constant(nil, n)] }
2902
+ )
2903
+ else
2904
+ clone
2905
+ end
2881
2906
  end
2907
+ alias_method :cleared, :clear
2882
2908
 
2883
2909
  # clone handled by initialize_copy
2884
2910
 
@@ -3141,8 +3167,11 @@ module Polars
3141
3167
  aggregate_expr = Polars.element.median._rbexpr
3142
3168
  when "last"
3143
3169
  aggregate_expr = Polars.element.last._rbexpr
3170
+ when "len"
3171
+ aggregate_expr = Polars.len._rbexpr
3144
3172
  when "count"
3145
- aggregate_expr = Polars.count._rbexpr
3173
+ warn "`aggregate_function: \"count\"` input for `pivot` is deprecated. Use `aggregate_function: \"len\"` instead."
3174
+ aggregate_expr = Polars.len._rbexpr
3146
3175
  else
3147
3176
  raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
3148
3177
  end
@@ -3154,9 +3183,9 @@ module Polars
3154
3183
 
3155
3184
  _from_rbdf(
3156
3185
  _df.pivot_expr(
3157
- values,
3158
3186
  index,
3159
3187
  columns,
3188
+ values,
3160
3189
  maintain_order,
3161
3190
  sort_columns,
3162
3191
  aggregate_expr,
@@ -3591,8 +3620,13 @@ module Polars
3591
3620
 
3592
3621
  # Select columns from this DataFrame.
3593
3622
  #
3594
- # @param exprs [Object]
3595
- # Column or columns to select.
3623
+ # @param exprs [Array]
3624
+ # Column(s) to select, specified as positional arguments.
3625
+ # Accepts expression input. Strings are parsed as column names,
3626
+ # other non-expression inputs are parsed as literals.
3627
+ # @param named_exprs [Hash]
3628
+ # Additional columns to select, specified as keyword arguments.
3629
+ # The columns will be renamed to the keyword used.
3596
3630
  #
3597
3631
  # @return [DataFrame]
3598
3632
  #
@@ -3672,23 +3706,25 @@ module Polars
3672
3706
  # # │ 0 │
3673
3707
  # # │ 10 │
3674
3708
  # # └─────────┘
3675
- def select(exprs)
3676
- _from_rbdf(
3677
- lazy
3678
- .select(exprs)
3679
- .collect(no_optimization: true, string_cache: false)
3680
- ._df
3681
- )
3709
+ def select(*exprs, **named_exprs)
3710
+ lazy.select(*exprs, **named_exprs).collect(_eager: true)
3682
3711
  end
3683
3712
 
3684
- # Add or overwrite multiple columns in a DataFrame.
3713
+ # Add columns to this DataFrame.
3714
+ #
3715
+ # Added columns will replace existing columns with the same name.
3685
3716
  #
3686
3717
  # @param exprs [Array]
3687
- # Array of Expressions that evaluate to columns.
3718
+ # Column(s) to add, specified as positional arguments.
3719
+ # Accepts expression input. Strings are parsed as column names, other
3720
+ # non-expression inputs are parsed as literals.
3721
+ # @param named_exprs [Hash]
3722
+ # Additional columns to add, specified as keyword arguments.
3723
+ # The columns will be renamed to the keyword used.
3688
3724
  #
3689
3725
  # @return [DataFrame]
3690
3726
  #
3691
- # @example
3727
+ # @example Pass an expression to add it as a new column.
3692
3728
  # df = Polars::DataFrame.new(
3693
3729
  # {
3694
3730
  # "a" => [1, 2, 3, 4],
@@ -3696,11 +3732,41 @@ module Polars
3696
3732
  # "c" => [true, true, false, true]
3697
3733
  # }
3698
3734
  # )
3735
+ # df.with_columns((Polars.col("a") ** 2).alias("a^2"))
3736
+ # # =>
3737
+ # # shape: (4, 4)
3738
+ # # ┌─────┬──────┬───────┬──────┐
3739
+ # # │ a ┆ b ┆ c ┆ a^2 │
3740
+ # # │ --- ┆ --- ┆ --- ┆ --- │
3741
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 │
3742
+ # # ╞═════╪══════╪═══════╪══════╡
3743
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 │
3744
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 │
3745
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 │
3746
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 │
3747
+ # # └─────┴──────┴───────┴──────┘
3748
+ #
3749
+ # @example Added columns will replace existing columns with the same name.
3750
+ # df.with_columns(Polars.col("a").cast(Polars::Float64))
3751
+ # # =>
3752
+ # # shape: (4, 3)
3753
+ # # ┌─────┬──────┬───────┐
3754
+ # # │ a ┆ b ┆ c │
3755
+ # # │ --- ┆ --- ┆ --- │
3756
+ # # │ f64 ┆ f64 ┆ bool │
3757
+ # # ╞═════╪══════╪═══════╡
3758
+ # # │ 1.0 ┆ 0.5 ┆ true │
3759
+ # # │ 2.0 ┆ 4.0 ┆ true │
3760
+ # # │ 3.0 ┆ 10.0 ┆ false │
3761
+ # # │ 4.0 ┆ 13.0 ┆ true │
3762
+ # # └─────┴──────┴───────┘
3763
+ #
3764
+ # @example Multiple columns can be added by passing a list of expressions.
3699
3765
  # df.with_columns(
3700
3766
  # [
3701
3767
  # (Polars.col("a") ** 2).alias("a^2"),
3702
3768
  # (Polars.col("b") / 2).alias("b/2"),
3703
- # (Polars.col("c").is_not).alias("not c")
3769
+ # (Polars.col("c").not_).alias("not c"),
3704
3770
  # ]
3705
3771
  # )
3706
3772
  # # =>
@@ -3715,13 +3781,45 @@ module Polars
3715
3781
  # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
3716
3782
  # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
3717
3783
  # # └─────┴──────┴───────┴──────┴──────┴───────┘
3718
- def with_columns(exprs)
3719
- if !exprs.nil? && !exprs.is_a?(::Array)
3720
- exprs = [exprs]
3721
- end
3722
- lazy
3723
- .with_columns(exprs)
3724
- .collect(no_optimization: true, string_cache: false)
3784
+ #
3785
+ # @example Multiple columns can also be added using positional arguments instead of a list.
3786
+ # df.with_columns(
3787
+ # (Polars.col("a") ** 2).alias("a^2"),
3788
+ # (Polars.col("b") / 2).alias("b/2"),
3789
+ # (Polars.col("c").not_).alias("not c"),
3790
+ # )
3791
+ # # =>
3792
+ # # shape: (4, 6)
3793
+ # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
3794
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
3795
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3796
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
3797
+ # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
3798
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
3799
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
3800
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
3801
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
3802
+ # # └─────┴──────┴───────┴──────┴──────┴───────┘
3803
+ #
3804
+ # @example Use keyword arguments to easily name your expression inputs.
3805
+ # df.with_columns(
3806
+ # ab: Polars.col("a") * Polars.col("b"),
3807
+ # not_c: Polars.col("c").not_
3808
+ # )
3809
+ # # =>
3810
+ # # shape: (4, 5)
3811
+ # # ┌─────┬──────┬───────┬──────┬───────┐
3812
+ # # │ a ┆ b ┆ c ┆ ab ┆ not_c │
3813
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3814
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │
3815
+ # # ╞═════╪══════╪═══════╪══════╪═══════╡
3816
+ # # │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │
3817
+ # # │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │
3818
+ # # │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │
3819
+ # # │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │
3820
+ # # └─────┴──────┴───────┴──────┴───────┘
3821
+ def with_columns(*exprs, **named_exprs)
3822
+ lazy.with_columns(*exprs, **named_exprs).collect(_eager: true)
3725
3823
  end
3726
3824
 
3727
3825
  # Get number of chunks used by the ChunkedArrays of this DataFrame.
@@ -4363,7 +4461,7 @@ module Polars
4363
4461
  # # null
4364
4462
  # # ]
4365
4463
  #
4366
- # @example A horizontal boolean or, similar to a row-wise .any():
4464
+ # @example A horizontal boolean or, similar to a row-wise .any:
4367
4465
  # df = Polars::DataFrame.new(
4368
4466
  # {
4369
4467
  # "a" => [false, false, true],
@@ -4486,7 +4584,7 @@ module Polars
4486
4584
  # # => [{"a"=>1, "b"=>2}, {"a"=>3, "b"=>4}, {"a"=>5, "b"=>6}]
4487
4585
  def rows(named: false)
4488
4586
  if named
4489
- columns = columns()
4587
+ columns = self.columns
4490
4588
  _df.row_tuples.map do |v|
4491
4589
  columns.zip(v).to_h
4492
4590
  end
@@ -4527,7 +4625,7 @@ module Polars
4527
4625
  return to_enum(:iter_rows, named: named, buffer_size: buffer_size) unless block_given?
4528
4626
 
4529
4627
  # load into the local namespace for a modest performance boost in the hot loops
4530
- columns = columns()
4628
+ columns = self.columns
4531
4629
 
4532
4630
  # note: buffering rows results in a 2-4x speedup over individual calls
4533
4631
  # to ".row(i)", so it should only be disabled in extremely specific cases.
@@ -4764,13 +4862,51 @@ module Polars
4764
4862
  _from_rbdf(_df.unnest(names))
4765
4863
  end
4766
4864
 
4767
- # TODO
4865
+ # Requires NumPy
4768
4866
  # def corr
4769
4867
  # end
4770
4868
 
4771
- # TODO
4772
- # def merge_sorted
4773
- # end
4869
+ # Take two sorted DataFrames and merge them by the sorted key.
4870
+ #
4871
+ # The output of this operation will also be sorted.
4872
+ # It is the caller's responsibility to ensure that the frames are sorted
4873
+ # by that key; otherwise the output will not make sense.
4874
+ #
4875
+ # The schemas of both DataFrames must be equal.
4876
+ #
4877
+ # @param other [DataFrame]
4878
+ # Other DataFrame that must be merged.
4879
+ # @param key [String]
4880
+ # Key that is sorted.
4881
+ #
4882
+ # @return [DataFrame]
4883
+ #
4884
+ # @example
4885
+ # df0 = Polars::DataFrame.new(
4886
+ # {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
4887
+ # ).sort("age")
4888
+ # df1 = Polars::DataFrame.new(
4889
+ # {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
4890
+ # ).sort("age")
4891
+ # df0.merge_sorted(df1, "age")
4892
+ # # =>
4893
+ # # shape: (7, 2)
4894
+ # # ┌────────┬─────┐
4895
+ # # │ name ┆ age │
4896
+ # # │ --- ┆ --- │
4897
+ # # │ str ┆ i64 │
4898
+ # # ╞════════╪═════╡
4899
+ # # │ bob ┆ 18 │
4900
+ # # │ thomas ┆ 20 │
4901
+ # # │ anna ┆ 21 │
4902
+ # # │ megan ┆ 33 │
4903
+ # # │ steve ┆ 42 │
4904
+ # # │ steve ┆ 42 │
4905
+ # # │ elise ┆ 44 │
4906
+ # # └────────┴─────┘
4907
+ def merge_sorted(other, key)
4908
+ lazy.merge_sorted(other.lazy, key).collect(_eager: true)
4909
+ end
4774
4910
 
4775
4911
  # Indicate that one or multiple columns are sorted.
4776
4912
  #
@@ -4812,7 +4948,7 @@ module Polars
4812
4948
  end
4813
4949
 
4814
4950
  def _pos_idxs(idxs, dim)
4815
- idx_type = Polars._get_idx_type
4951
+ idx_type = Plr.get_index_type
4816
4952
 
4817
4953
  if idxs.is_a?(Series)
4818
4954
  if idxs.dtype == idx_type
@@ -5045,14 +5181,14 @@ module Polars
5045
5181
  elsif data[0].is_a?(Hash)
5046
5182
  column_names, dtypes = _unpack_schema(columns)
5047
5183
  schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
5048
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
5184
+ rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
5049
5185
  if column_names
5050
5186
  rbdf = _post_apply_columns(rbdf, column_names)
5051
5187
  end
5052
5188
  return rbdf
5053
5189
  elsif data[0].is_a?(::Array)
5190
+ first_element = data[0]
5054
5191
  if orient.nil? && !columns.nil?
5055
- first_element = data[0]
5056
5192
  row_types = first_element.filter_map { |value| value.class }.uniq
5057
5193
  if row_types.include?(Integer) && row_types.include?(Float)
5058
5194
  row_types.delete(Integer)