polars-df 0.4.0-arm64-darwin → 0.6.0-arm64-darwin
- checksums.yaml +4 -4
- data/CHANGELOG.md +26 -0
- data/Cargo.lock +447 -410
- data/Cargo.toml +0 -1
- data/LICENSE-THIRD-PARTY.txt +2142 -972
- data/README.md +6 -5
- data/lib/polars/3.0/polars.bundle +0 -0
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +84 -0
- data/lib/polars/array_name_space.rb +77 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/convert.rb +2 -2
- data/lib/polars/data_frame.rb +289 -96
- data/lib/polars/data_types.rb +169 -33
- data/lib/polars/date_time_expr.rb +142 -2
- data/lib/polars/date_time_name_space.rb +17 -3
- data/lib/polars/expr.rb +145 -78
- data/lib/polars/functions.rb +0 -1
- data/lib/polars/group_by.rb +1 -22
- data/lib/polars/lazy_frame.rb +84 -31
- data/lib/polars/lazy_functions.rb +71 -32
- data/lib/polars/list_expr.rb +94 -45
- data/lib/polars/list_name_space.rb +13 -13
- data/lib/polars/rolling_group_by.rb +4 -2
- data/lib/polars/series.rb +249 -87
- data/lib/polars/string_expr.rb +277 -45
- data/lib/polars/string_name_space.rb +137 -22
- data/lib/polars/struct_name_space.rb +32 -0
- data/lib/polars/utils.rb +138 -54
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +5 -2
- metadata +4 -2
data/lib/polars/data_frame.rb
CHANGED
@@ -18,7 +18,10 @@ module Polars
    # Whether to interpret two-dimensional data as columns or as rows. If `nil`,
    # the orientation is inferred by matching the columns and data dimensions. If
    # this does not yield conclusive results, column orientation is used.
-   def initialize(data = nil, columns: nil, orient: nil)
+   def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
+     schema ||= columns
+     raise Todo if schema_overrides
+
      # TODO deprecate in favor of read_sql
      if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
        result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
@@ -29,14 +32,14 @@ module Polars
      end

      if data.nil?
-       self._df = self.class.hash_to_rbdf({},
+       self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
      elsif data.is_a?(Hash)
        data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
-       self._df = self.class.hash_to_rbdf(data,
-     elsif data.is_a?(Array)
-       self._df = self.class.sequence_to_rbdf(data,
+       self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
+     elsif data.is_a?(::Array)
+       self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
      elsif data.is_a?(Series)
-       self._df = self.class.series_to_rbdf(data,
+       self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
      else
        raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
      end
@@ -56,8 +59,8 @@ module Polars
    end

    # @private
-   def self._from_hash(data,
-     _from_rbdf(hash_to_rbdf(data,
+   def self._from_hash(data, schema: nil, schema_overrides: nil)
+     _from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
    end

    # def self._from_records
@@ -113,7 +116,7 @@ module Polars
        dtypes.each do |k, v|
          dtype_list << [k, Utils.rb_type_to_dtype(v)]
        end
-     elsif dtypes.is_a?(Array)
+     elsif dtypes.is_a?(::Array)
        dtype_slice = dtypes
      else
        raise ArgumentError, "dtype arg should be list or dict"
@@ -336,6 +339,7 @@ module Polars
    end
    alias_method :count, :height
    alias_method :length, :height
+   alias_method :size, :height

    # Get the width of the DataFrame.
    #
@@ -546,6 +550,13 @@ module Polars
    end
    alias_method :inspect, :to_s

+   # Returns an array representing the DataFrame
+   #
+   # @return [Array]
+   def to_a
+     rows(named: true)
+   end
+
    # Check if DataFrame includes column.
    #
    # @return [Boolean]
@@ -579,7 +590,7 @@ module Polars

      # df[2, ..] (select row as df)
      if row_selection.is_a?(Integer)
-       if col_selection.is_a?(Array)
+       if col_selection.is_a?(::Array)
          df = self[0.., col_selection]
          return df.slice(row_selection, 1)
        end
@@ -600,7 +611,7 @@ module Polars
          return series[row_selection]
        end

-       if col_selection.is_a?(Array)
+       if col_selection.is_a?(::Array)
          # df[.., [1, 2]]
          if Utils.is_int_sequence(col_selection)
            series_list = col_selection.map { |i| to_series(i) }
@@ -630,7 +641,7 @@ module Polars
        return Slice.new(self).apply(item)
      end

-     if item.is_a?(Array) && item.all? { |v| Utils.strlike?(v) }
+     if item.is_a?(::Array) && item.all? { |v| Utils.strlike?(v) }
        # select multiple columns
        # df[["foo", "bar"]]
        return _from_rbdf(_df.select(item.map(&:to_s)))
@@ -655,7 +666,7 @@ module Polars
      end

      # Ruby-specific
-     if item.is_a?(Expr)
+     if item.is_a?(Expr) || item.is_a?(Series)
        return filter(item)
      end

@@ -665,15 +676,42 @@ module Polars
    # Set item.
    #
    # @return [Object]
-
-
-
-
-
+   def []=(*key, value)
+     if key.length == 1
+       key = key.first
+     elsif key.length != 2
+       raise ArgumentError, "wrong number of arguments (given #{key.length + 1}, expected 2..3)"
+     end

-
-
+     if Utils.strlike?(key)
+       if value.is_a?(::Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
+         value = Series.new(value)
+       elsif !value.is_a?(Series)
+         value = Polars.lit(value)
+       end
+       self._df = with_column(value.alias(key.to_s))._df
+     elsif key.is_a?(::Array)
+       row_selection, col_selection = key
+
+       if Utils.strlike?(col_selection)
+         s = self[col_selection]
+       elsif col_selection.is_a?(Integer)
+         raise Todo
+       else
+         raise ArgumentError, "column selection not understood: #{col_selection}"
+       end

+       s[row_selection] = value
+
+       if col_selection.is_a?(Integer)
+         replace_at_idx(col_selection, s)
+       elsif Utils.strlike?(col_selection)
+         replace(col_selection, s)
+       end
+     else
+       raise Todo
+     end
+   end

    # Return the dataframe as a scalar.
    #
@@ -956,14 +994,21 @@ module Polars
    #
    # @return [nil]
    def write_ipc(file, compression: "uncompressed")
-
-
+     return_bytes = file.nil?
+     if return_bytes
+       file = StringIO.new
+       file.set_encoding(Encoding::BINARY)
      end
      if Utils.pathlike?(file)
        file = Utils.normalise_filepath(file)
      end

+     if compression.nil?
+       compression = "uncompressed"
+     end
+
      _df.write_ipc(file, compression)
+     return_bytes ? file.string : nil
    end

    # Write to Apache Parquet file.
@@ -1453,13 +1498,23 @@ module Polars
    # # │ 1 ┆ 6.0 ┆ a │
    # # └─────┴─────┴─────┘
    def sort(by, reverse: false, nulls_last: false)
-
-
-
-
-
-
+     lazy
+       .sort(by, reverse: reverse, nulls_last: nulls_last)
+       .collect(no_optimization: true)
+   end
+
+   # Sort the DataFrame by column in-place.
+   #
+   # @param by [String]
+   #   By which column to sort.
+   # @param reverse [Boolean]
+   #   Reverse/descending sort.
+   # @param nulls_last [Boolean]
+   #   Place null values last. Can only be used if sorted by a single column.
+   #
+   # @return [DataFrame]
+   def sort!(by, reverse: false, nulls_last: false)
+     self._df = sort(by, reverse: reverse, nulls_last: nulls_last)._df
    end

    # Check if DataFrame is equal to other.
@@ -1519,7 +1574,7 @@ module Polars
    # # │ 30 ┆ 6 │
    # # └─────┴─────┘
    def replace(column, new_col)
-     _df.replace(column, new_col._s)
+     _df.replace(column.to_s, new_col._s)
      self
    end

@@ -1847,6 +1902,12 @@ module Polars
    #   Define whether the temporal window interval is closed or not.
    # @param by [Object]
    #   Also group by this column/these columns.
+   # @param check_sorted [Boolean]
+   #   When the `by` argument is given, polars can not check sortedness
+   #   by the metadata and has to do a full scan on the index column to
+   #   verify data is sorted. This is expensive. If you are sure the
+   #   data within the by groups is sorted, you can set this to `false`.
+   #   Doing so incorrectly will lead to incorrect output
    #
    # @return [RollingGroupBy]
    #
@@ -1860,7 +1921,7 @@ module Polars
    #     "2020-01-08 23:16:43"
    #   ]
    #   df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
-   #     Polars.col("dt").str.strptime(
+   #     Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
    #   )
    #   df.groupby_rolling(index_column: "dt", period: "2d").agg(
    #     [
@@ -1888,9 +1949,10 @@ module Polars
      period:,
      offset: nil,
      closed: "right",
-     by: nil
+     by: nil,
+     check_sorted: true
    )
-     RollingGroupBy.new(self, index_column, period, offset, closed, by)
+     RollingGroupBy.new(self, index_column, period, offset, closed, by, check_sorted)
    end

    # Group based on a time value (or index value of type `:i32`, `:i64`).
@@ -2026,21 +2088,21 @@ module Polars
    #   df.groupby_dynamic("time", every: "1h", closed: "left").agg(
    #     [
    #       Polars.col("time").count.alias("time_count"),
-   #       Polars.col("time").
+   #       Polars.col("time").alias("time_agg_list")
    #     ]
    #   )
    #   # =>
    #   # shape: (4, 3)
-   #   #
-   #   # │ time ┆ time_count ┆ time_agg_list
-   #   # │ --- ┆ --- ┆ ---
-   #   # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
-   #   #
-   #   # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16
-   #   # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16
-   #   # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16
-   #   # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
-   #   #
+   #   # ┌─────────────────────┬────────────┬───────────────────────────────────┐
+   #   # │ time                ┆ time_count ┆ time_agg_list                     │
+   #   # │ ---                 ┆ ---        ┆ ---                               │
+   #   # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]                │
+   #   # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
+   #   # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-16… │
+   #   # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-16… │
+   #   # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-16… │
+   #   # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]             │
+   #   # └─────────────────────┴────────────┴───────────────────────────────────┘
    #
    # @example When closed="both" the time values at the window boundaries belong to 2 groups.
    #   df.groupby_dynamic("time", every: "1h", closed: "both").agg(
@@ -2107,7 +2169,7 @@ module Polars
    #     period: "3i",
    #     include_boundaries: true,
    #     closed: "right"
-   #   ).agg(Polars.col("A").
+   #   ).agg(Polars.col("A").alias("A_agg_list"))
    #   # =>
    #   # shape: (3, 4)
    #   # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
@@ -2190,7 +2252,7 @@ module Polars
    #       "groups" => ["A", "B", "A", "B"],
    #       "values" => [0, 1, 2, 3]
    #     }
-   #   )
+   #   ).set_sorted("time")
    #   df.upsample(
    #     time_column: "time", every: "1mo", by: "groups", maintain_order: true
    #   ).select(Polars.all.forward_fill)
@@ -2308,7 +2370,7 @@ module Polars
    #       ], # note record date: Jan 1st (sorted!)
    #       "gdp" => [4164, 4411, 4566, 4696]
    #     }
-   #   )
+   #   ).set_sorted("date")
    #   population = Polars::DataFrame.new(
    #     {
    #       "date" => [
@@ -2319,7 +2381,7 @@ module Polars
    #       ], # note record date: May 12th (sorted!)
    #       "population" => [82.19, 82.66, 83.12, 83.52]
    #     }
-   #   )
+   #   ).set_sorted("date")
    #   population.join_asof(
    #     gdp, left_on: "date", right_on: "date", strategy: "backward"
    #   )
@@ -2622,7 +2684,7 @@ module Polars
    # # │ 3 ┆ 8 ┆ c ┆ 30 │
    # # └─────┴─────┴─────┴───────┘
    def hstack(columns, in_place: false)
-     if !columns.is_a?(Array)
+     if !columns.is_a?(::Array)
        columns = columns.get_columns
      end
      if in_place
@@ -2752,7 +2814,7 @@ module Polars
    # # │ 3 ┆ 8.0 │
    # # └─────┴─────┘
    def drop(columns)
-     if columns.is_a?(Array)
+     if columns.is_a?(::Array)
        df = clone
        columns.each do |n|
          df._df.drop_in_place(n)
|
|
2791
2853
|
Utils.wrap_s(_df.drop_in_place(name))
|
2792
2854
|
end
|
2793
2855
|
|
2856
|
+
# Drop in place if exists.
|
2857
|
+
#
|
2858
|
+
# @param name [Object]
|
2859
|
+
# Column to drop.
|
2860
|
+
#
|
2861
|
+
# @return [Series]
|
2862
|
+
def delete(name)
|
2863
|
+
drop_in_place(name) if include?(name)
|
2864
|
+
end
|
2865
|
+
|
2794
2866
|
# Create an empty copy of the current DataFrame.
|
2795
2867
|
#
|
2796
2868
|
# Returns a DataFrame with identical schema but no data.
|
@@ -3202,7 +3274,7 @@ module Polars
|
|
3202
3274
|
# # │ B ┆ 1 │
|
3203
3275
|
# # │ C ┆ 2 │
|
3204
3276
|
# # │ D ┆ 3 │
|
3205
|
-
# # │
|
3277
|
+
# # │ E ┆ 4 │
|
3206
3278
|
# # │ F ┆ 5 │
|
3207
3279
|
# # │ G ┆ 6 │
|
3208
3280
|
# # │ H ┆ 7 │
|
@@ -3255,7 +3327,7 @@ module Polars
|
|
3255
3327
|
n_fill = n_cols * n_rows - height
|
3256
3328
|
|
3257
3329
|
if n_fill > 0
|
3258
|
-
if !fill_values.is_a?(Array)
|
3330
|
+
if !fill_values.is_a?(::Array)
|
3259
3331
|
fill_values = [fill_values] * df.width
|
3260
3332
|
end
|
3261
3333
|
|
@@ -3364,29 +3436,29 @@ module Polars
|
|
3364
3436
|
# # ╞═════╪═════╪═════╡
|
3365
3437
|
# # │ C ┆ 2 ┆ l │
|
3366
3438
|
# # └─────┴─────┴─────┘}
|
3367
|
-
def partition_by(groups, maintain_order: true, as_dict: false)
|
3439
|
+
def partition_by(groups, maintain_order: true, include_key: true, as_dict: false)
|
3368
3440
|
if groups.is_a?(String)
|
3369
3441
|
groups = [groups]
|
3370
|
-
elsif !groups.is_a?(Array)
|
3442
|
+
elsif !groups.is_a?(::Array)
|
3371
3443
|
groups = Array(groups)
|
3372
3444
|
end
|
3373
3445
|
|
3374
3446
|
if as_dict
|
3375
3447
|
out = {}
|
3376
3448
|
if groups.length == 1
|
3377
|
-
_df.partition_by(groups, maintain_order).each do |df|
|
3449
|
+
_df.partition_by(groups, maintain_order, include_key).each do |df|
|
3378
3450
|
df = _from_rbdf(df)
|
3379
3451
|
out[df[groups][0, 0]] = df
|
3380
3452
|
end
|
3381
3453
|
else
|
3382
|
-
_df.partition_by(groups, maintain_order).each do |df|
|
3454
|
+
_df.partition_by(groups, maintain_order, include_key).each do |df|
|
3383
3455
|
df = _from_rbdf(df)
|
3384
3456
|
out[df[groups].row(0)] = df
|
3385
3457
|
end
|
3386
3458
|
end
|
3387
3459
|
out
|
3388
3460
|
else
|
3389
|
-
_df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
|
3461
|
+
_df.partition_by(groups, maintain_order, include_key).map { |df| _from_rbdf(df) }
|
3390
3462
|
end
|
3391
3463
|
end
|
3392
3464
|
|
@@ -3654,7 +3726,7 @@ module Polars
|
|
3654
3726
|
# # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
|
3655
3727
|
# # └─────┴──────┴───────┴──────┴──────┴───────┘
|
3656
3728
|
def with_columns(exprs)
|
3657
|
-
if !exprs.nil? && !exprs.is_a?(Array)
|
3729
|
+
if !exprs.nil? && !exprs.is_a?(::Array)
|
3658
3730
|
exprs = [exprs]
|
3659
3731
|
end
|
3660
3732
|
lazy
|
@@ -4035,11 +4107,11 @@ module Polars
|
|
4035
4107
|
# # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
|
4036
4108
|
# # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
|
4037
4109
|
# # └───────┴───────┴───────┴───────┴───────┴───────┘
|
4038
|
-
def to_dummies(columns: nil, separator: "_")
|
4110
|
+
def to_dummies(columns: nil, separator: "_", drop_first: false)
|
4039
4111
|
if columns.is_a?(String)
|
4040
4112
|
columns = [columns]
|
4041
4113
|
end
|
4042
|
-
_from_rbdf(_df.to_dummies(columns, separator))
|
4114
|
+
_from_rbdf(_df.to_dummies(columns, separator, drop_first))
|
4043
4115
|
end
|
4044
4116
|
|
4045
4117
|
# Drop duplicate rows from this DataFrame.
|
@@ -4127,7 +4199,7 @@ module Polars
|
|
4127
4199
|
subset = [subset]
|
4128
4200
|
end
|
4129
4201
|
|
4130
|
-
if subset.is_a?(Array) && subset.length == 1
|
4202
|
+
if subset.is_a?(::Array) && subset.length == 1
|
4131
4203
|
expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
|
4132
4204
|
else
|
4133
4205
|
struct_fields = subset.nil? ? Polars.all : subset
|
@@ -4428,7 +4500,7 @@ module Polars
|
|
4428
4500
|
end
|
4429
4501
|
end
|
4430
4502
|
|
4431
|
-
# Returns an iterator over the DataFrame of rows of
|
4503
|
+
# Returns an iterator over the DataFrame of rows of Ruby-native values.
|
4432
4504
|
#
|
4433
4505
|
# @param named [Boolean]
|
4434
4506
|
# Return hashes instead of arrays. The hashes are a mapping of
|
@@ -4489,6 +4561,24 @@ module Polars
|
|
4489
4561
|
end
|
4490
4562
|
end
|
4491
4563
|
|
4564
|
+
# Returns an iterator over the DataFrame of rows of Ruby-native values.
|
4565
|
+
#
|
4566
|
+
# @param named [Boolean]
|
4567
|
+
# Return hashes instead of arrays. The hashes are a mapping of
|
4568
|
+
# column name to row value. This is more expensive than returning an
|
4569
|
+
# array, but allows for accessing values by column name.
|
4570
|
+
# @param buffer_size [Integer]
|
4571
|
+
# Determines the number of rows that are buffered internally while iterating
|
4572
|
+
# over the data; you should only modify this in very specific cases where the
|
4573
|
+
# default value is determined not to be a good fit to your access pattern, as
|
4574
|
+
# the speedup from using the buffer is significant (~2-4x). Setting this
|
4575
|
+
# value to zero disables row buffering.
|
4576
|
+
#
|
4577
|
+
# @return [Object]
|
4578
|
+
def each_row(named: true, buffer_size: 500, &block)
|
4579
|
+
iter_rows(named: named, buffer_size: buffer_size, &block)
|
4580
|
+
end
|
4581
|
+
|
4492
4582
|
# Shrink DataFrame memory usage.
|
4493
4583
|
#
|
4494
4584
|
# Shrinks to fit the exact capacity needed to hold the data.
|
@@ -4678,6 +4768,38 @@ module Polars
|
|
4678
4768
|
_from_rbdf(_df.unnest(names))
|
4679
4769
|
end
|
4680
4770
|
|
4771
|
+
# TODO
|
4772
|
+
# def corr
|
4773
|
+
# end
|
4774
|
+
|
4775
|
+
# TODO
|
4776
|
+
# def merge_sorted
|
4777
|
+
# end
|
4778
|
+
|
4779
|
+
# Indicate that one or multiple columns are sorted.
|
4780
|
+
#
|
4781
|
+
# @param column [Object]
|
4782
|
+
# Columns that are sorted
|
4783
|
+
# @param more_columns [Object]
|
4784
|
+
# Additional columns that are sorted, specified as positional arguments.
|
4785
|
+
# @param descending [Boolean]
|
4786
|
+
# Whether the columns are sorted in descending order.
|
4787
|
+
#
|
4788
|
+
# @return [DataFrame]
|
4789
|
+
def set_sorted(
|
4790
|
+
column,
|
4791
|
+
*more_columns,
|
4792
|
+
descending: false
|
4793
|
+
)
|
4794
|
+
lazy
|
4795
|
+
.set_sorted(column, *more_columns, descending: descending)
|
4796
|
+
.collect(no_optimization: true)
|
4797
|
+
end
|
4798
|
+
|
4799
|
+
# TODO
|
4800
|
+
# def update
|
4801
|
+
# end
|
4802
|
+
|
4681
4803
|
private
|
4682
4804
|
|
4683
4805
|
def initialize_copy(other)
|
@@ -4742,20 +4864,63 @@ module Polars
|
|
4742
4864
|
end
|
4743
4865
|
|
4744
4866
|
# @private
|
4745
|
-
def self.
|
4746
|
-
|
4747
|
-
|
4867
|
+
def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
|
4868
|
+
updated_data = {}
|
4869
|
+
unless data.empty?
|
4870
|
+
dtypes = schema_overrides || {}
|
4871
|
+
array_len = data.values.map { |val| Utils.arrlen(val) || 0 }.max
|
4872
|
+
if array_len > 0
|
4873
|
+
data.each do |name, val|
|
4874
|
+
dtype = dtypes[name]
|
4875
|
+
if val.is_a?(Hash) && dtype != Struct
|
4876
|
+
updated_data[name] = DataFrame.new(val).to_struct(name)
|
4877
|
+
elsif !Utils.arrlen(val).nil?
|
4878
|
+
updated_data[name] = Series.new(String.new(name), val, dtype: dtype)
|
4879
|
+
elsif val.nil? || [Integer, Float, TrueClass, FalseClass, String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
|
4880
|
+
dtype = Polars::Float64 if val.nil? && dtype.nil?
|
4881
|
+
updated_data[name] = Series.new(String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
|
4882
|
+
else
|
4883
|
+
raise Todo
|
4884
|
+
end
|
4885
|
+
end
|
4886
|
+
elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
|
4887
|
+
data.each do |name, val|
|
4888
|
+
updated_data[name] = Series.new(name, val, dtype: dtypes[name])
|
4889
|
+
end
|
4890
|
+
elsif data.values.all? { |val| Utils.arrlen(val).nil? }
|
4891
|
+
data.each do |name, val|
|
4892
|
+
updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
|
4893
|
+
end
|
4894
|
+
end
|
4895
|
+
end
|
4896
|
+
updated_data
|
4897
|
+
end
|
4748
4898
|
|
4749
|
-
|
4750
|
-
|
4751
|
-
|
4752
|
-
|
4899
|
+
# @private
|
4900
|
+
def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
|
4901
|
+
if schema.is_a?(Hash) && !data.empty?
|
4902
|
+
if !data.all? { |col, _| schema[col] }
|
4903
|
+
raise ArgumentError, "The given column-schema names do not match the data dictionary"
|
4753
4904
|
end
|
4754
|
-
|
4755
|
-
|
4905
|
+
|
4906
|
+
data = schema.to_h { |col| [col, data[col]] }
|
4756
4907
|
end
|
4757
4908
|
|
4758
|
-
|
4909
|
+
column_names, schema_overrides = _unpack_schema(
|
4910
|
+
schema, lookup_names: data.keys, schema_overrides: schema_overrides
|
4911
|
+
)
|
4912
|
+
if column_names.empty?
|
4913
|
+
column_names = data.keys
|
4914
|
+
end
|
4915
|
+
|
4916
|
+
if data.empty? && !schema_overrides.empty?
|
4917
|
+
data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
|
4918
|
+
else
|
4919
|
+
data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
|
4920
|
+
end
|
4921
|
+
|
4922
|
+
data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
|
4923
|
+
RbDataFrame.new(data_series)
|
4759
4924
|
end
|
4760
4925
|
|
4761
4926
|
# @private
|
@@ -4764,14 +4929,12 @@ module Polars
|
|
4764
4929
|
end
|
4765
4930
|
|
4766
4931
|
# @private
|
4767
|
-
def self.
|
4768
|
-
|
4769
|
-
|
4770
|
-
if columns.is_a?(Hash)
|
4771
|
-
columns = columns.to_a
|
4932
|
+
def self._unpack_schema(schema, schema_overrides: nil, n_expected: nil, lookup_names: nil, include_overrides_in_columns: false)
|
4933
|
+
if schema.is_a?(Hash)
|
4934
|
+
schema = schema.to_a
|
4772
4935
|
end
|
4773
4936
|
column_names =
|
4774
|
-
(
|
4937
|
+
(schema || []).map.with_index do |col, i|
|
4775
4938
|
if col.is_a?(String)
|
4776
4939
|
col || "column_#{i}"
|
4777
4940
|
else
|
@@ -4784,21 +4947,38 @@ module Polars
|
|
4784
4947
|
# TODO zip_longest
|
4785
4948
|
lookup = column_names.zip(lookup_names || []).to_h
|
4786
4949
|
|
4787
|
-
|
4788
|
-
|
4789
|
-
(columns || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
|
4950
|
+
column_dtypes =
|
4951
|
+
(schema || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
|
4790
4952
|
[lookup[col[0]] || col[0], col[1]]
|
4791
4953
|
end
|
4792
|
-
|
4954
|
+
|
4955
|
+
if schema_overrides
|
4956
|
+
raise Todo
|
4957
|
+
end
|
4958
|
+
|
4959
|
+
column_dtypes.each do |col, dtype|
|
4960
|
+
if !Utils.is_polars_dtype(dtype, include_unknown: true) && !dtype.nil?
|
4961
|
+
column_dtypes[col] = Utils.rb_type_to_dtype(dtype)
|
4962
|
+
end
|
4963
|
+
end
|
4964
|
+
|
4965
|
+
[column_names, column_dtypes]
|
4793
4966
|
end
|
4794
4967
|
|
4795
|
-
def self._handle_columns_arg(data, columns: nil)
|
4796
|
-
if columns.nil?
|
4968
|
+
def self._handle_columns_arg(data, columns: nil, from_hash: false)
|
4969
|
+
if columns.nil? || columns.empty?
|
4797
4970
|
data
|
4798
4971
|
else
|
4799
4972
|
if data.empty?
|
4800
4973
|
columns.map { |c| Series.new(c, nil)._s }
|
4801
4974
|
elsif data.length == columns.length
|
4975
|
+
if from_hash
|
4976
|
+
series_map = data.to_h { |s| [s.name, s] }
|
4977
|
+
if columns.all? { |col| series_map.key?(col) }
|
4978
|
+
return columns.map { |col| series_map[col] }
|
4979
|
+
end
|
4980
|
+
end
|
4981
|
+
|
4802
4982
|
columns.each_with_index do |c, i|
|
4803
4983
|
# not in-place?
|
4804
4984
|
data[i].rename(c)
|
@@ -4813,7 +4993,7 @@ module Polars
|
|
4813
4993
|
def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
|
4814
4994
|
rbdf_columns = rbdf.columns
|
4815
4995
|
rbdf_dtypes = rbdf.dtypes
|
4816
|
-
columns, dtypes =
|
4996
|
+
columns, dtypes = _unpack_schema(
|
4817
4997
|
(columns || rbdf_columns), schema_overrides: schema_overrides
|
4818
4998
|
)
|
4819
4999
|
column_subset = []
|
@@ -4829,7 +5009,7 @@ module Polars
|
|
4829
5009
|
columns.each do |col, i|
|
4830
5010
|
if dtypes[col] == Categorical # != rbdf_dtypes[i]
|
4831
5011
|
column_casts << Polars.col(col).cast(Categorical)._rbexpr
|
4832
|
-
elsif structs
|
5012
|
+
elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
|
4833
5013
|
column_casts << Polars.col(col).cast(structs[col])._rbexpr
|
4834
5014
|
elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
|
4835
5015
|
column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
|
@@ -4851,27 +5031,30 @@ module Polars
|
|
4851
5031
|
end
|
4852
5032
|
|
4853
5033
|
# @private
|
4854
|
-
def self.sequence_to_rbdf(data,
|
5034
|
+
def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
|
5035
|
+
raise Todo if schema_overrides
|
5036
|
+
columns = schema
|
5037
|
+
|
4855
5038
|
if data.length == 0
|
4856
|
-
return hash_to_rbdf({},
|
5039
|
+
return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
|
4857
5040
|
end
|
4858
5041
|
|
4859
5042
|
if data[0].is_a?(Series)
|
4860
5043
|
# series_names = data.map(&:name)
|
4861
|
-
# columns, dtypes =
|
5044
|
+
# columns, dtypes = _unpack_schema(columns || series_names, n_expected: data.length)
|
4862
5045
|
data_series = []
|
4863
5046
|
data.each do |s|
|
4864
5047
|
data_series << s._s
|
4865
5048
|
end
|
4866
5049
|
elsif data[0].is_a?(Hash)
|
4867
|
-
column_names, dtypes =
|
5050
|
+
column_names, dtypes = _unpack_schema(columns)
|
4868
5051
|
schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
|
4869
5052
|
rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
|
4870
5053
|
if column_names
|
4871
5054
|
rbdf = _post_apply_columns(rbdf, column_names)
|
4872
5055
|
end
|
4873
5056
|
return rbdf
|
4874
|
-
elsif data[0].is_a?(Array)
|
5057
|
+
elsif data[0].is_a?(::Array)
|
4875
5058
|
if orient.nil? && !columns.nil?
|
4876
5059
|
orient = columns.length == data.length ? "col" : "row"
|
4877
5060
|
end
|
@@ -4890,11 +5073,21 @@ module Polars
|
|
4890
5073
|
end
|
4891
5074
|
|
4892
5075
|
# @private
|
4893
|
-
def self.series_to_rbdf(data,
|
4894
|
-
|
4895
|
-
|
5076
|
+
def self.series_to_rbdf(data, schema: nil, schema_overrides: nil)
|
5077
|
+
data_series = [data._s]
|
5078
|
+
series_name = data_series.map(&:name)
|
5079
|
+
column_names, schema_overrides = _unpack_schema(
|
5080
|
+
schema || series_name, schema_overrides: schema_overrides, n_expected: 1
|
5081
|
+
)
|
5082
|
+
if schema_overrides.any?
|
5083
|
+
new_dtype = schema_overrides.values[0]
|
5084
|
+
if new_dtype != data.dtype
|
5085
|
+
data_series[0] = data_series[0].cast(new_dtype, true)
|
5086
|
+
end
|
4896
5087
|
end
|
4897
|
-
|
5088
|
+
|
5089
|
+
data_series = _handle_columns_arg(data_series, columns: column_names)
|
5090
|
+
RbDataFrame.new(data_series)
|
4898
5091
|
end
|
4899
5092
|
|
4900
5093
|
def wrap_ldf(ldf)
|
@@ -4966,7 +5159,7 @@ module Polars
|
|
4966
5159
|
|
4967
5160
|
def _prepare_other_arg(other)
|
4968
5161
|
if !other.is_a?(Series)
|
4969
|
-
if other.is_a?(Array)
|
5162
|
+
if other.is_a?(::Array)
|
4970
5163
|
raise ArgumentError, "Operation not supported."
|
4971
5164
|
end
|
4972
5165
|
|