polars-df 0.6.0-x86_64-darwin → 0.8.0-x86_64-darwin

Sign up to get free protection for your applications and to get access to all the features.
data/lib/polars/series.rb CHANGED
@@ -34,7 +34,7 @@ module Polars
34
34
  # s3 = Polars::Series.new([1, 2, 3])
35
35
  def initialize(name = nil, values = nil, dtype: nil, strict: true, nan_to_null: false, dtype_if_empty: nil)
36
36
  # Handle case where values are passed as the first argument
37
- if !name.nil? && !name.is_a?(String)
37
+ if !name.nil? && !name.is_a?(::String)
38
38
  if values.nil?
39
39
  values = name
40
40
  name = nil
@@ -46,7 +46,7 @@ module Polars
46
46
  name = "" if name.nil?
47
47
 
48
48
  # TODO improve
49
- if values.is_a?(Range) && values.begin.is_a?(String)
49
+ if values.is_a?(Range) && values.begin.is_a?(::String)
50
50
  values = values.to_a
51
51
  end
52
52
 
@@ -341,7 +341,7 @@ module Polars
341
341
  def []=(key, value)
342
342
  if value.is_a?(::Array)
343
343
  if is_numeric || is_datelike
344
- set_at_idx(key, value)
344
+ scatter(key, value)
345
345
  return
346
346
  end
347
347
  raise ArgumentError, "cannot set Series of dtype: #{dtype} with list/tuple as value; use a scalar value"
@@ -351,9 +351,9 @@ module Polars
351
351
  if key.dtype == Boolean
352
352
  self._s = set(key, value)._s
353
353
  elsif key.dtype == UInt64
354
- self._s = set_at_idx(key.cast(UInt32), value)._s
354
+ self._s = scatter(key.cast(UInt32), value)._s
355
355
  elsif key.dtype == UInt32
356
- self._s = set_at_idx(key, value)._s
356
+ self._s = scatter(key, value)._s
357
357
  else
358
358
  raise Todo
359
359
  end
@@ -432,6 +432,18 @@ module Polars
432
432
  end
433
433
  alias_method :all, :all?
434
434
 
435
+ # Check if all boolean values in the column are `false`.
436
+ #
437
+ # @return [Boolean]
438
+ def none?(&block)
439
+ if block_given?
440
+ apply(&block).none?
441
+ else
442
+ to_frame.select(Polars.col(name).is_not.all).to_series[0]
443
+ end
444
+ end
445
+ alias_method :none, :none?
446
+
435
447
  # Compute the logarithm to a given base.
436
448
  #
437
449
  # @param base [Float]
@@ -723,6 +735,212 @@ module Polars
723
735
  Utils.wrap_df(_s.to_dummies(separator, drop_first))
724
736
  end
725
737
 
738
+ # Bin continuous values into discrete categories.
739
+ #
740
+ # @param breaks [Array]
741
+ # List of unique cut points.
742
+ # @param labels [Array]
743
+ # Names of the categories. The number of labels must be equal to the number
744
+ # of cut points plus one.
745
+ # @param left_closed [Boolean]
746
+ # Set the intervals to be left-closed instead of right-closed.
747
+ # @param include_breaks [Boolean]
748
+ # Include a column with the right endpoint of the bin each observation falls
749
+ # in. This will change the data type of the output from a
750
+ # `Categorical` to a `Struct`.
751
+ #
752
+ # @return [Series]
753
+ #
754
+ # @example Divide the column into three categories.
755
+ # s = Polars::Series.new("foo", [-2, -1, 0, 1, 2])
756
+ # s.cut([-1, 1], labels: ["a", "b", "c"])
757
+ # # =>
758
+ # # shape: (5,)
759
+ # # Series: 'foo' [cat]
760
+ # # [
761
+ # # "a"
762
+ # # "a"
763
+ # # "b"
764
+ # # "b"
765
+ # # "c"
766
+ # # ]
767
+ #
768
+ # @example Create a DataFrame with the breakpoint and category for each value.
769
+ # cut = s.cut([-1, 1], include_breaks: true).alias("cut")
770
+ # s.to_frame.with_columns(cut).unnest("cut")
771
+ # # =>
772
+ # # shape: (5, 3)
773
+ # # ┌─────┬─────────────┬────────────┐
774
+ # # │ foo ┆ break_point ┆ category │
775
+ # # │ --- ┆ --- ┆ --- │
776
+ # # │ i64 ┆ f64 ┆ cat │
777
+ # # ╞═════╪═════════════╪════════════╡
778
+ # # │ -2 ┆ -1.0 ┆ (-inf, -1] │
779
+ # # │ -1 ┆ -1.0 ┆ (-inf, -1] │
780
+ # # │ 0 ┆ 1.0 ┆ (-1, 1] │
781
+ # # │ 1 ┆ 1.0 ┆ (-1, 1] │
782
+ # # │ 2 ┆ inf ┆ (1, inf] │
783
+ # # └─────┴─────────────┴────────────┘
784
+ def cut(breaks, labels: nil, left_closed: false, include_breaks: false)
785
+ result = (
786
+ to_frame
787
+ .select(
788
+ Polars.col(name).cut(
789
+ breaks,
790
+ labels: labels,
791
+ left_closed: left_closed,
792
+ include_breaks: include_breaks
793
+ )
794
+ )
795
+ .to_series
796
+ )
797
+
798
+ if include_breaks
799
+ result = result.struct.rename_fields(["break_point", "category"])
800
+ end
801
+
802
+ result
803
+ end
804
+
805
+ # Bin continuous values into discrete categories based on their quantiles.
806
+ #
807
+ # @param quantiles [Array, Integer]
808
+ # Either a list of quantile probabilities between 0 and 1 or a positive
809
+ # integer determining the number of bins with uniform probability.
810
+ # @param labels [Array]
811
+ # Names of the categories. The number of labels must be equal to the number
812
+ # of cut points plus one.
813
+ # @param left_closed [Boolean]
814
+ # Set the intervals to be left-closed instead of right-closed.
815
+ # @param allow_duplicates [Boolean]
816
+ # If set to `true`, duplicates in the resulting quantiles are dropped,
817
+ # rather than raising a `DuplicateError`. This can happen even with unique
818
+ # probabilities, depending on the data.
819
+ # @param include_breaks [Boolean]
820
+ # Include a column with the right endpoint of the bin each observation falls
821
+ # in. This will change the data type of the output from a
822
+ # `Categorical` to a `Struct`.
823
+ #
824
+ # @return [Series]
825
+ #
826
+ # @example Divide a column into three categories according to pre-defined quantile probabilities.
827
+ # s = Polars::Series.new("foo", [-2, -1, 0, 1, 2])
828
+ # s.qcut([0.25, 0.75], labels: ["a", "b", "c"])
829
+ # # =>
830
+ # # shape: (5,)
831
+ # # Series: 'foo' [cat]
832
+ # # [
833
+ # # "a"
834
+ # # "a"
835
+ # # "b"
836
+ # # "b"
837
+ # # "c"
838
+ # # ]
839
+ #
840
+ # @example Divide a column into two categories using uniform quantile probabilities.
841
+ # s.qcut(2, labels: ["low", "high"], left_closed: true)
842
+ # # =>
843
+ # # shape: (5,)
844
+ # # Series: 'foo' [cat]
845
+ # # [
846
+ # # "low"
847
+ # # "low"
848
+ # # "high"
849
+ # # "high"
850
+ # # "high"
851
+ # # ]
852
+ #
853
+ # @example Create a DataFrame with the breakpoint and category for each value.
854
+ # cut = s.qcut([0.25, 0.75], include_breaks: true).alias("cut")
855
+ # s.to_frame.with_columns(cut).unnest("cut")
856
+ # # =>
857
+ # # shape: (5, 3)
858
+ # # ┌─────┬─────────────┬────────────┐
859
+ # # │ foo ┆ break_point ┆ category │
860
+ # # │ --- ┆ --- ┆ --- │
861
+ # # │ i64 ┆ f64 ┆ cat │
862
+ # # ╞═════╪═════════════╪════════════╡
863
+ # # │ -2 ┆ -1.0 ┆ (-inf, -1] │
864
+ # # │ -1 ┆ -1.0 ┆ (-inf, -1] │
865
+ # # │ 0 ┆ 1.0 ┆ (-1, 1] │
866
+ # # │ 1 ┆ 1.0 ┆ (-1, 1] │
867
+ # # │ 2 ┆ inf ┆ (1, inf] │
868
+ # # └─────┴─────────────┴────────────┘
869
+ def qcut(quantiles, labels: nil, left_closed: false, allow_duplicates: false, include_breaks: false)
870
+ result = (
871
+ to_frame
872
+ .select(
873
+ Polars.col(name).qcut(
874
+ quantiles,
875
+ labels: labels,
876
+ left_closed: left_closed,
877
+ allow_duplicates: allow_duplicates,
878
+ include_breaks: include_breaks
879
+ )
880
+ )
881
+ .to_series
882
+ )
883
+
884
+ if include_breaks
885
+ result = result.struct.rename_fields(["break_point", "category"])
886
+ end
887
+
888
+ result
889
+ end
890
+
891
+ # Get the lengths of runs of identical values.
892
+ #
893
+ # @return [Series]
894
+ #
895
+ # @example
896
+ # s = Polars::Series.new("s", [1, 1, 2, 1, nil, 1, 3, 3])
897
+ # s.rle.struct.unnest
898
+ # # =>
899
+ # # shape: (6, 2)
900
+ # # ┌─────────┬────────┐
901
+ # # │ lengths ┆ values │
902
+ # # │ --- ┆ --- │
903
+ # # │ i32 ┆ i64 │
904
+ # # ╞═════════╪════════╡
905
+ # # │ 2 ┆ 1 │
906
+ # # │ 1 ┆ 2 │
907
+ # # │ 1 ┆ 1 │
908
+ # # │ 1 ┆ null │
909
+ # # │ 1 ┆ 1 │
910
+ # # │ 2 ┆ 3 │
911
+ # # └─────────┴────────┘
912
+ def rle
913
+ super
914
+ end
915
+
916
+ # Map values to run IDs.
917
+ #
918
+ # Similar to RLE, but it maps each value to an ID corresponding to the run into
919
+ # which it falls. This is especially useful when you want to define groups by
920
+ # runs of identical values rather than the values themselves.
921
+ #
922
+ # @return [Series]
923
+ #
924
+ # @example
925
+ # s = Polars::Series.new("s", [1, 1, 2, 1, nil, 1, 3, 3])
926
+ # s.rle_id()
927
+ # # =>
928
+ # # shape: (8,)
929
+ # # Series: 's' [u32]
930
+ # # [
931
+ # # 0
932
+ # # 0
933
+ # # 1
934
+ # # 2
935
+ # # 3
936
+ # # 4
937
+ # # 5
938
+ # # 5
939
+ # # ]
940
+ def rle_id
941
+ super
942
+ end
943
+
726
944
  # Count the unique values in a Series.
727
945
  #
728
946
  # @param sort [Boolean]
@@ -799,7 +1017,7 @@ module Polars
799
1017
  # Number of valid values there should be in the window before the expression
800
1018
  # is evaluated. valid values = `length - null_count`
801
1019
  # @param parallel [Boolean]
802
- # Run in parallel. Don't do this in a groupby or another operation that
1020
+ # Run in parallel. Don't do this in a group by or another operation that
803
1021
  # already has much parallelization.
804
1022
  #
805
1023
  # @return [Series]
@@ -1236,15 +1454,45 @@ module Polars
1236
1454
 
1237
1455
  # Return the `k` largest elements.
1238
1456
  #
1239
- # If `reverse: true`, the smallest elements will be given.
1457
+ # @param k [Integer]
1458
+ # Number of elements to return.
1459
+ #
1460
+ # @return [Series]
1461
+ #
1462
+ # @example
1463
+ # s = Polars::Series.new("a", [2, 5, 1, 4, 3])
1464
+ # s.top_k(k: 3)
1465
+ # # =>
1466
+ # # shape: (3,)
1467
+ # # Series: 'a' [i64]
1468
+ # # [
1469
+ # # 5
1470
+ # # 4
1471
+ # # 3
1472
+ # # ]
1473
+ def top_k(k: 5)
1474
+ super
1475
+ end
1476
+
1477
+ # Return the `k` smallest elements.
1240
1478
  #
1241
1479
  # @param k [Integer]
1242
1480
  # Number of elements to return.
1243
- # @param reverse [Boolean]
1244
- # Return the smallest elements.
1245
1481
  #
1246
1482
  # @return [Series]
1247
- def top_k(k: 5, reverse: false)
1483
+ #
1484
+ # @example
1485
+ # s = Polars::Series.new("a", [2, 5, 1, 4, 3])
1486
+ # s.bottom_k(k: 3)
1487
+ # # =>
1488
+ # # shape: (3,)
1489
+ # # Series: 'a' [i64]
1490
+ # # [
1491
+ # # 1
1492
+ # # 2
1493
+ # # 3
1494
+ # # ]
1495
+ def bottom_k(k: 5)
1248
1496
  super
1249
1497
  end
1250
1498
 
@@ -1693,26 +1941,40 @@ module Polars
1693
1941
  # @example
1694
1942
  # s = Polars::Series.new("a", [1, 2, 3])
1695
1943
  # s2 = Polars::Series.new("b", [4, 5, 6])
1696
- # s.series_equal(s)
1944
+ # s.equals(s)
1697
1945
  # # => true
1698
- # s.series_equal(s2)
1946
+ # s.equals(s2)
1699
1947
  # # => false
1700
- def series_equal(other, null_equal: false, strict: false)
1701
- _s.series_equal(other._s, null_equal, strict)
1948
+ def equals(other, null_equal: false, strict: false)
1949
+ _s.equals(other._s, null_equal, strict)
1702
1950
  end
1951
+ alias_method :series_equal, :equals
1703
1952
 
1704
- # Length of this Series.
1953
+ # Return the number of elements in the Series (null values are currently counted; they will be excluded in 0.9.0).
1705
1954
  #
1706
1955
  # @return [Integer]
1707
1956
  #
1708
1957
  # @example
1709
- # s = Polars::Series.new("a", [1, 2, 3])
1958
+ # s = Polars::Series.new("a", [1, 2, nil])
1959
+ # s.count
1960
+ # # => 3
1961
+ def count
1962
+ warn "`Series#count` will exclude null values in 0.9.0. Use `Series#length` instead."
1963
+ # len - null_count
1964
+ len
1965
+ end
1966
+
1967
+ # Return the number of elements in the Series.
1968
+ #
1969
+ # @return [Integer]
1970
+ #
1971
+ # @example
1972
+ # s = Polars::Series.new("a", [1, 2, nil])
1710
1973
  # s.len
1711
1974
  # # => 3
1712
1975
  def len
1713
1976
  _s.len
1714
1977
  end
1715
- alias_method :count, :len
1716
1978
  alias_method :length, :len
1717
1979
  alias_method :size, :len
1718
1980
 
@@ -1874,7 +2136,7 @@ module Polars
1874
2136
  # s.is_utf8
1875
2137
  # # => true
1876
2138
  def is_utf8
1877
- dtype == Utf8
2139
+ dtype == String
1878
2140
  end
1879
2141
  alias_method :utf8?, :is_utf8
1880
2142
 
@@ -1970,7 +2232,7 @@ module Polars
1970
2232
  # # 10
1971
2233
  # # 3
1972
2234
  # # ]
1973
- def set_at_idx(idx, value)
2235
+ def scatter(idx, value)
1974
2236
  if idx.is_a?(Integer)
1975
2237
  idx = [idx]
1976
2238
  end
@@ -1979,7 +2241,7 @@ module Polars
1979
2241
  end
1980
2242
 
1981
2243
  idx = Series.new("", idx)
1982
- if value.is_a?(Integer) || value.is_a?(Float) || Utils.bool?(value) || value.is_a?(String) || value.nil?
2244
+ if value.is_a?(Integer) || value.is_a?(Float) || Utils.bool?(value) || value.is_a?(::String) || value.nil?
1983
2245
  value = Series.new("", [value])
1984
2246
 
1985
2247
  # if we need to set more than a single value, we extend it
@@ -1989,9 +2251,10 @@ module Polars
1989
2251
  elsif !value.is_a?(Series)
1990
2252
  value = Series.new("", value)
1991
2253
  end
1992
- _s.set_at_idx(idx._s, value._s)
2254
+ _s.scatter(idx._s, value._s)
1993
2255
  self
1994
2256
  end
2257
+ alias_method :set_at_idx, :scatter
1995
2258
 
1996
2259
  # Create an empty copy of the current Series.
1997
2260
  #
@@ -2818,7 +3081,8 @@ module Polars
2818
3081
  weights: nil,
2819
3082
  min_periods: nil,
2820
3083
  center: false,
2821
- ddof: 1
3084
+ ddof: 1,
3085
+ warn_if_unsorted: true
2822
3086
  )
2823
3087
  to_frame
2824
3088
  .select(
@@ -2827,7 +3091,8 @@ module Polars
2827
3091
  weights: weights,
2828
3092
  min_periods: min_periods,
2829
3093
  center: center,
2830
- ddof: ddof
3094
+ ddof: ddof,
3095
+ warn_if_unsorted: warn_if_unsorted
2831
3096
  )
2832
3097
  )
2833
3098
  .to_series
@@ -2871,7 +3136,8 @@ module Polars
2871
3136
  weights: nil,
2872
3137
  min_periods: nil,
2873
3138
  center: false,
2874
- ddof: 1
3139
+ ddof: 1,
3140
+ warn_if_unsorted: true
2875
3141
  )
2876
3142
  to_frame
2877
3143
  .select(
@@ -2880,7 +3146,8 @@ module Polars
2880
3146
  weights: weights,
2881
3147
  min_periods: min_periods,
2882
3148
  center: center,
2883
- ddof: ddof
3149
+ ddof: ddof,
3150
+ warn_if_unsorted: warn_if_unsorted
2884
3151
  )
2885
3152
  )
2886
3153
  .to_series
@@ -2922,7 +3189,8 @@ module Polars
2922
3189
  window_size,
2923
3190
  weights: nil,
2924
3191
  min_periods: nil,
2925
- center: false
3192
+ center: false,
3193
+ warn_if_unsorted: true
2926
3194
  )
2927
3195
  if min_periods.nil?
2928
3196
  min_periods = window_size
@@ -2934,7 +3202,8 @@ module Polars
2934
3202
  window_size,
2935
3203
  weights: weights,
2936
3204
  min_periods: min_periods,
2937
- center: center
3205
+ center: center,
3206
+ warn_if_unsorted: warn_if_unsorted
2938
3207
  )
2939
3208
  )
2940
3209
  .to_series
@@ -2993,7 +3262,8 @@ module Polars
2993
3262
  window_size: 2,
2994
3263
  weights: nil,
2995
3264
  min_periods: nil,
2996
- center: false
3265
+ center: false,
3266
+ warn_if_unsorted: true
2997
3267
  )
2998
3268
  if min_periods.nil?
2999
3269
  min_periods = window_size
@@ -3007,7 +3277,8 @@ module Polars
3007
3277
  window_size: window_size,
3008
3278
  weights: weights,
3009
3279
  min_periods: min_periods,
3010
- center: center
3280
+ center: center,
3281
+ warn_if_unsorted: warn_if_unsorted
3011
3282
  )
3012
3283
  )
3013
3284
  .to_series
@@ -3097,7 +3368,7 @@ module Polars
3097
3368
  # s.peak_max
3098
3369
  # # =>
3099
3370
  # # shape: (5,)
3100
- # # Series: '' [bool]
3371
+ # # Series: 'a' [bool]
3101
3372
  # # [
3102
3373
  # # false
3103
3374
  # # false
@@ -3106,7 +3377,7 @@ module Polars
3106
3377
  # # true
3107
3378
  # # ]
3108
3379
  def peak_max
3109
- Utils.wrap_s(_s.peak_max)
3380
+ super
3110
3381
  end
3111
3382
 
3112
3383
  # Get a boolean mask of the local minimum peaks.
@@ -3118,7 +3389,7 @@ module Polars
3118
3389
  # s.peak_min
3119
3390
  # # =>
3120
3391
  # # shape: (5,)
3121
- # # Series: '' [bool]
3392
+ # # Series: 'a' [bool]
3122
3393
  # # [
3123
3394
  # # false
3124
3395
  # # true
@@ -3127,7 +3398,7 @@ module Polars
3127
3398
  # # false
3128
3399
  # # ]
3129
3400
  def peak_min
3130
- Utils.wrap_s(_s.peak_min)
3401
+ super
3131
3402
  end
3132
3403
 
3133
3404
  # Count the number of unique values in this Series.
@@ -3211,13 +3482,13 @@ module Polars
3211
3482
  # s.interpolate
3212
3483
  # # =>
3213
3484
  # # shape: (5,)
3214
- # # Series: 'a' [i64]
3485
+ # # Series: 'a' [f64]
3215
3486
  # # [
3216
- # # 1
3217
- # # 2
3218
- # # 3
3219
- # # 4
3220
- # # 5
3487
+ # # 1.0
3488
+ # # 2.0
3489
+ # # 3.0
3490
+ # # 4.0
3491
+ # # 5.0
3221
3492
  # # ]
3222
3493
  def interpolate(method: "linear")
3223
3494
  super
@@ -3260,7 +3531,7 @@ module Polars
3260
3531
  # s.rank
3261
3532
  # # =>
3262
3533
  # # shape: (5,)
3263
- # # Series: 'a' [f32]
3534
+ # # Series: 'a' [f64]
3264
3535
  # # [
3265
3536
  # # 3.0
3266
3537
  # # 4.5
@@ -3749,7 +4020,7 @@ module Polars
3749
4020
  return Utils.wrap_s(_s.send(op, other._s))
3750
4021
  end
3751
4022
 
3752
- if (other.is_a?(Float) || other.is_a?(::Date) || other.is_a?(::DateTime) || other.is_a?(::Time) || other.is_a?(String)) && !is_float
4023
+ if (other.is_a?(Float) || other.is_a?(::Date) || other.is_a?(::DateTime) || other.is_a?(::Time) || other.is_a?(::String)) && !is_float
3753
4024
  _s2 = sequence_to_rbseries(name, [other])
3754
4025
  return Utils.wrap_s(_s.send(op, _s2))
3755
4026
  end
@@ -3915,7 +4186,7 @@ module Polars
3915
4186
  return RbSeries.new_series_list(name, values, strict)
3916
4187
  else
3917
4188
  constructor =
3918
- if value.is_a?(String)
4189
+ if value.is_a?(::String)
3919
4190
  if value.encoding == Encoding::UTF_8
3920
4191
  RbSeries.method(:new_str)
3921
4192
  else
@@ -3998,7 +4269,8 @@ module Polars
3998
4269
  Integer => RbSeries.method(:new_opt_i64),
3999
4270
  TrueClass => RbSeries.method(:new_opt_bool),
4000
4271
  FalseClass => RbSeries.method(:new_opt_bool),
4001
- BigDecimal => RbSeries.method(:new_decimal)
4272
+ BigDecimal => RbSeries.method(:new_decimal),
4273
+ NilClass => RbSeries.method(:new_null)
4002
4274
  }
4003
4275
 
4004
4276
  def rb_type_to_constructor(dtype)