polars-df 0.4.0-x86_64-darwin → 0.6.0-x86_64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +26 -0
- data/Cargo.lock +447 -410
- data/Cargo.toml +0 -1
- data/LICENSE-THIRD-PARTY.txt +2142 -972
- data/README.md +6 -5
- data/lib/polars/3.0/polars.bundle +0 -0
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +84 -0
- data/lib/polars/array_name_space.rb +77 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/convert.rb +2 -2
- data/lib/polars/data_frame.rb +289 -96
- data/lib/polars/data_types.rb +169 -33
- data/lib/polars/date_time_expr.rb +142 -2
- data/lib/polars/date_time_name_space.rb +17 -3
- data/lib/polars/expr.rb +145 -78
- data/lib/polars/functions.rb +0 -1
- data/lib/polars/group_by.rb +1 -22
- data/lib/polars/lazy_frame.rb +84 -31
- data/lib/polars/lazy_functions.rb +71 -32
- data/lib/polars/list_expr.rb +94 -45
- data/lib/polars/list_name_space.rb +13 -13
- data/lib/polars/rolling_group_by.rb +4 -2
- data/lib/polars/series.rb +249 -87
- data/lib/polars/string_expr.rb +277 -45
- data/lib/polars/string_name_space.rb +137 -22
- data/lib/polars/struct_name_space.rb +32 -0
- data/lib/polars/utils.rb +138 -54
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +5 -2
- metadata +4 -2
data/lib/polars/data_frame.rb
CHANGED
@@ -18,7 +18,10 @@ module Polars
|
|
18
18
|
# Whether to interpret two-dimensional data as columns or as rows. If `nil`,
|
19
19
|
# the orientation is inferred by matching the columns and data dimensions. If
|
20
20
|
# this does not yield conclusive results, column orientation is used.
|
21
|
-
def initialize(data = nil, columns: nil, orient: nil)
|
21
|
+
def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
|
22
|
+
schema ||= columns
|
23
|
+
raise Todo if schema_overrides
|
24
|
+
|
22
25
|
# TODO deprecate in favor of read_sql
|
23
26
|
if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
|
24
27
|
result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
|
@@ -29,14 +32,14 @@ module Polars
|
|
29
32
|
end
|
30
33
|
|
31
34
|
if data.nil?
|
32
|
-
self._df = self.class.hash_to_rbdf({},
|
35
|
+
self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
|
33
36
|
elsif data.is_a?(Hash)
|
34
37
|
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
|
35
|
-
self._df = self.class.hash_to_rbdf(data,
|
36
|
-
elsif data.is_a?(Array)
|
37
|
-
self._df = self.class.sequence_to_rbdf(data,
|
38
|
+
self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
|
39
|
+
elsif data.is_a?(::Array)
|
40
|
+
self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
|
38
41
|
elsif data.is_a?(Series)
|
39
|
-
self._df = self.class.series_to_rbdf(data,
|
42
|
+
self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
|
40
43
|
else
|
41
44
|
raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
|
42
45
|
end
|
@@ -56,8 +59,8 @@ module Polars
|
|
56
59
|
end
|
57
60
|
|
58
61
|
# @private
|
59
|
-
def self._from_hash(data,
|
60
|
-
_from_rbdf(hash_to_rbdf(data,
|
62
|
+
def self._from_hash(data, schema: nil, schema_overrides: nil)
|
63
|
+
_from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
|
61
64
|
end
|
62
65
|
|
63
66
|
# def self._from_records
|
@@ -113,7 +116,7 @@ module Polars
|
|
113
116
|
dtypes.each do|k, v|
|
114
117
|
dtype_list << [k, Utils.rb_type_to_dtype(v)]
|
115
118
|
end
|
116
|
-
elsif dtypes.is_a?(Array)
|
119
|
+
elsif dtypes.is_a?(::Array)
|
117
120
|
dtype_slice = dtypes
|
118
121
|
else
|
119
122
|
raise ArgumentError, "dtype arg should be list or dict"
|
@@ -336,6 +339,7 @@ module Polars
|
|
336
339
|
end
|
337
340
|
alias_method :count, :height
|
338
341
|
alias_method :length, :height
|
342
|
+
alias_method :size, :height
|
339
343
|
|
340
344
|
# Get the width of the DataFrame.
|
341
345
|
#
|
@@ -546,6 +550,13 @@ module Polars
|
|
546
550
|
end
|
547
551
|
alias_method :inspect, :to_s
|
548
552
|
|
553
|
+
# Returns an array representing the DataFrame
|
554
|
+
#
|
555
|
+
# @return [Array]
|
556
|
+
def to_a
|
557
|
+
rows(named: true)
|
558
|
+
end
|
559
|
+
|
549
560
|
# Check if DataFrame includes column.
|
550
561
|
#
|
551
562
|
# @return [Boolean]
|
@@ -579,7 +590,7 @@ module Polars
|
|
579
590
|
|
580
591
|
# df[2, ..] (select row as df)
|
581
592
|
if row_selection.is_a?(Integer)
|
582
|
-
if col_selection.is_a?(Array)
|
593
|
+
if col_selection.is_a?(::Array)
|
583
594
|
df = self[0.., col_selection]
|
584
595
|
return df.slice(row_selection, 1)
|
585
596
|
end
|
@@ -600,7 +611,7 @@ module Polars
|
|
600
611
|
return series[row_selection]
|
601
612
|
end
|
602
613
|
|
603
|
-
if col_selection.is_a?(Array)
|
614
|
+
if col_selection.is_a?(::Array)
|
604
615
|
# df[.., [1, 2]]
|
605
616
|
if Utils.is_int_sequence(col_selection)
|
606
617
|
series_list = col_selection.map { |i| to_series(i) }
|
@@ -630,7 +641,7 @@ module Polars
|
|
630
641
|
return Slice.new(self).apply(item)
|
631
642
|
end
|
632
643
|
|
633
|
-
if item.is_a?(Array) && item.all? { |v| Utils.strlike?(v) }
|
644
|
+
if item.is_a?(::Array) && item.all? { |v| Utils.strlike?(v) }
|
634
645
|
# select multiple columns
|
635
646
|
# df[["foo", "bar"]]
|
636
647
|
return _from_rbdf(_df.select(item.map(&:to_s)))
|
@@ -655,7 +666,7 @@ module Polars
|
|
655
666
|
end
|
656
667
|
|
657
668
|
# Ruby-specific
|
658
|
-
if item.is_a?(Expr)
|
669
|
+
if item.is_a?(Expr) || item.is_a?(Series)
|
659
670
|
return filter(item)
|
660
671
|
end
|
661
672
|
|
@@ -665,15 +676,42 @@ module Polars
|
|
665
676
|
# Set item.
|
666
677
|
#
|
667
678
|
# @return [Object]
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
679
|
+
def []=(*key, value)
|
680
|
+
if key.length == 1
|
681
|
+
key = key.first
|
682
|
+
elsif key.length != 2
|
683
|
+
raise ArgumentError, "wrong number of arguments (given #{key.length + 1}, expected 2..3)"
|
684
|
+
end
|
673
685
|
|
674
|
-
|
675
|
-
|
686
|
+
if Utils.strlike?(key)
|
687
|
+
if value.is_a?(::Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
|
688
|
+
value = Series.new(value)
|
689
|
+
elsif !value.is_a?(Series)
|
690
|
+
value = Polars.lit(value)
|
691
|
+
end
|
692
|
+
self._df = with_column(value.alias(key.to_s))._df
|
693
|
+
elsif key.is_a?(::Array)
|
694
|
+
row_selection, col_selection = key
|
695
|
+
|
696
|
+
if Utils.strlike?(col_selection)
|
697
|
+
s = self[col_selection]
|
698
|
+
elsif col_selection.is_a?(Integer)
|
699
|
+
raise Todo
|
700
|
+
else
|
701
|
+
raise ArgumentError, "column selection not understood: #{col_selection}"
|
702
|
+
end
|
676
703
|
|
704
|
+
s[row_selection] = value
|
705
|
+
|
706
|
+
if col_selection.is_a?(Integer)
|
707
|
+
replace_at_idx(col_selection, s)
|
708
|
+
elsif Utils.strlike?(col_selection)
|
709
|
+
replace(col_selection, s)
|
710
|
+
end
|
711
|
+
else
|
712
|
+
raise Todo
|
713
|
+
end
|
714
|
+
end
|
677
715
|
|
678
716
|
# Return the dataframe as a scalar.
|
679
717
|
#
|
@@ -956,14 +994,21 @@ module Polars
|
|
956
994
|
#
|
957
995
|
# @return [nil]
|
958
996
|
def write_ipc(file, compression: "uncompressed")
|
959
|
-
|
960
|
-
|
997
|
+
return_bytes = file.nil?
|
998
|
+
if return_bytes
|
999
|
+
file = StringIO.new
|
1000
|
+
file.set_encoding(Encoding::BINARY)
|
961
1001
|
end
|
962
1002
|
if Utils.pathlike?(file)
|
963
1003
|
file = Utils.normalise_filepath(file)
|
964
1004
|
end
|
965
1005
|
|
1006
|
+
if compression.nil?
|
1007
|
+
compression = "uncompressed"
|
1008
|
+
end
|
1009
|
+
|
966
1010
|
_df.write_ipc(file, compression)
|
1011
|
+
return_bytes ? file.string : nil
|
967
1012
|
end
|
968
1013
|
|
969
1014
|
# Write to Apache Parquet file.
|
@@ -1453,13 +1498,23 @@ module Polars
|
|
1453
1498
|
# # │ 1 ┆ 6.0 ┆ a │
|
1454
1499
|
# # └─────┴─────┴─────┘
|
1455
1500
|
def sort(by, reverse: false, nulls_last: false)
|
1456
|
-
|
1457
|
-
|
1458
|
-
|
1459
|
-
|
1460
|
-
|
1461
|
-
|
1462
|
-
|
1501
|
+
lazy
|
1502
|
+
.sort(by, reverse: reverse, nulls_last: nulls_last)
|
1503
|
+
.collect(no_optimization: true)
|
1504
|
+
end
|
1505
|
+
|
1506
|
+
# Sort the DataFrame by column in-place.
|
1507
|
+
#
|
1508
|
+
# @param by [String]
|
1509
|
+
# By which column to sort.
|
1510
|
+
# @param reverse [Boolean]
|
1511
|
+
# Reverse/descending sort.
|
1512
|
+
# @param nulls_last [Boolean]
|
1513
|
+
# Place null values last. Can only be used if sorted by a single column.
|
1514
|
+
#
|
1515
|
+
# @return [DataFrame]
|
1516
|
+
def sort!(by, reverse: false, nulls_last: false)
|
1517
|
+
self._df = sort(by, reverse: reverse, nulls_last: nulls_last)._df
|
1463
1518
|
end
|
1464
1519
|
|
1465
1520
|
# Check if DataFrame is equal to other.
|
@@ -1519,7 +1574,7 @@ module Polars
|
|
1519
1574
|
# # │ 30 ┆ 6 │
|
1520
1575
|
# # └─────┴─────┘
|
1521
1576
|
def replace(column, new_col)
|
1522
|
-
_df.replace(column, new_col._s)
|
1577
|
+
_df.replace(column.to_s, new_col._s)
|
1523
1578
|
self
|
1524
1579
|
end
|
1525
1580
|
|
@@ -1847,6 +1902,12 @@ module Polars
|
|
1847
1902
|
# Define whether the temporal window interval is closed or not.
|
1848
1903
|
# @param by [Object]
|
1849
1904
|
# Also group by this column/these columns.
|
1905
|
+
# @param check_sorted [Boolean]
|
1906
|
+
# When the `by` argument is given, polars can not check sortedness
|
1907
|
+
# by the metadata and has to do a full scan on the index column to
|
1908
|
+
# verify data is sorted. This is expensive. If you are sure the
|
1909
|
+
# data within the by groups is sorted, you can set this to `false`.
|
1910
|
+
# Doing so incorrectly will lead to incorrect output
|
1850
1911
|
#
|
1851
1912
|
# @return [RollingGroupBy]
|
1852
1913
|
#
|
@@ -1860,7 +1921,7 @@ module Polars
|
|
1860
1921
|
# "2020-01-08 23:16:43"
|
1861
1922
|
# ]
|
1862
1923
|
# df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
1863
|
-
# Polars.col("dt").str.strptime(
|
1924
|
+
# Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
|
1864
1925
|
# )
|
1865
1926
|
# df.groupby_rolling(index_column: "dt", period: "2d").agg(
|
1866
1927
|
# [
|
@@ -1888,9 +1949,10 @@ module Polars
|
|
1888
1949
|
period:,
|
1889
1950
|
offset: nil,
|
1890
1951
|
closed: "right",
|
1891
|
-
by: nil
|
1952
|
+
by: nil,
|
1953
|
+
check_sorted: true
|
1892
1954
|
)
|
1893
|
-
RollingGroupBy.new(self, index_column, period, offset, closed, by)
|
1955
|
+
RollingGroupBy.new(self, index_column, period, offset, closed, by, check_sorted)
|
1894
1956
|
end
|
1895
1957
|
|
1896
1958
|
# Group based on a time value (or index value of type `:i32`, `:i64`).
|
@@ -2026,21 +2088,21 @@ module Polars
|
|
2026
2088
|
# df.groupby_dynamic("time", every: "1h", closed: "left").agg(
|
2027
2089
|
# [
|
2028
2090
|
# Polars.col("time").count.alias("time_count"),
|
2029
|
-
# Polars.col("time").
|
2091
|
+
# Polars.col("time").alias("time_agg_list")
|
2030
2092
|
# ]
|
2031
2093
|
# )
|
2032
2094
|
# # =>
|
2033
2095
|
# # shape: (4, 3)
|
2034
|
-
# #
|
2035
|
-
# # │ time ┆ time_count ┆ time_agg_list
|
2036
|
-
# # │ --- ┆ --- ┆ ---
|
2037
|
-
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
|
2038
|
-
# #
|
2039
|
-
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16
|
2040
|
-
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16
|
2041
|
-
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16
|
2042
|
-
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
|
2043
|
-
# #
|
2096
|
+
# # ┌─────────────────────┬────────────┬───────────────────────────────────┐
|
2097
|
+
# # │ time ┆ time_count ┆ time_agg_list │
|
2098
|
+
# # │ --- ┆ --- ┆ --- │
|
2099
|
+
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
|
2100
|
+
# # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
|
2101
|
+
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │
|
2102
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │
|
2103
|
+
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │
|
2104
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
|
2105
|
+
# # └─────────────────────┴────────────┴───────────────────────────────────┘
|
2044
2106
|
#
|
2045
2107
|
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
2046
2108
|
# df.groupby_dynamic("time", every: "1h", closed: "both").agg(
|
@@ -2107,7 +2169,7 @@ module Polars
|
|
2107
2169
|
# period: "3i",
|
2108
2170
|
# include_boundaries: true,
|
2109
2171
|
# closed: "right"
|
2110
|
-
# ).agg(Polars.col("A").
|
2172
|
+
# ).agg(Polars.col("A").alias("A_agg_list"))
|
2111
2173
|
# # =>
|
2112
2174
|
# # shape: (3, 4)
|
2113
2175
|
# # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
|
@@ -2190,7 +2252,7 @@ module Polars
|
|
2190
2252
|
# "groups" => ["A", "B", "A", "B"],
|
2191
2253
|
# "values" => [0, 1, 2, 3]
|
2192
2254
|
# }
|
2193
|
-
# )
|
2255
|
+
# ).set_sorted("time")
|
2194
2256
|
# df.upsample(
|
2195
2257
|
# time_column: "time", every: "1mo", by: "groups", maintain_order: true
|
2196
2258
|
# ).select(Polars.all.forward_fill)
|
@@ -2308,7 +2370,7 @@ module Polars
|
|
2308
2370
|
# ], # note record date: Jan 1st (sorted!)
|
2309
2371
|
# "gdp" => [4164, 4411, 4566, 4696]
|
2310
2372
|
# }
|
2311
|
-
# )
|
2373
|
+
# ).set_sorted("date")
|
2312
2374
|
# population = Polars::DataFrame.new(
|
2313
2375
|
# {
|
2314
2376
|
# "date" => [
|
@@ -2319,7 +2381,7 @@ module Polars
|
|
2319
2381
|
# ], # note record date: May 12th (sorted!)
|
2320
2382
|
# "population" => [82.19, 82.66, 83.12, 83.52]
|
2321
2383
|
# }
|
2322
|
-
# )
|
2384
|
+
# ).set_sorted("date")
|
2323
2385
|
# population.join_asof(
|
2324
2386
|
# gdp, left_on: "date", right_on: "date", strategy: "backward"
|
2325
2387
|
# )
|
@@ -2622,7 +2684,7 @@ module Polars
|
|
2622
2684
|
# # │ 3 ┆ 8 ┆ c ┆ 30 │
|
2623
2685
|
# # └─────┴─────┴─────┴───────┘
|
2624
2686
|
def hstack(columns, in_place: false)
|
2625
|
-
if !columns.is_a?(Array)
|
2687
|
+
if !columns.is_a?(::Array)
|
2626
2688
|
columns = columns.get_columns
|
2627
2689
|
end
|
2628
2690
|
if in_place
|
@@ -2752,7 +2814,7 @@ module Polars
|
|
2752
2814
|
# # │ 3 ┆ 8.0 │
|
2753
2815
|
# # └─────┴─────┘
|
2754
2816
|
def drop(columns)
|
2755
|
-
if columns.is_a?(Array)
|
2817
|
+
if columns.is_a?(::Array)
|
2756
2818
|
df = clone
|
2757
2819
|
columns.each do |n|
|
2758
2820
|
df._df.drop_in_place(n)
|
@@ -2791,6 +2853,16 @@ module Polars
|
|
2791
2853
|
Utils.wrap_s(_df.drop_in_place(name))
|
2792
2854
|
end
|
2793
2855
|
|
2856
|
+
# Drop in place if exists.
|
2857
|
+
#
|
2858
|
+
# @param name [Object]
|
2859
|
+
# Column to drop.
|
2860
|
+
#
|
2861
|
+
# @return [Series]
|
2862
|
+
def delete(name)
|
2863
|
+
drop_in_place(name) if include?(name)
|
2864
|
+
end
|
2865
|
+
|
2794
2866
|
# Create an empty copy of the current DataFrame.
|
2795
2867
|
#
|
2796
2868
|
# Returns a DataFrame with identical schema but no data.
|
@@ -3202,7 +3274,7 @@ module Polars
|
|
3202
3274
|
# # │ B ┆ 1 │
|
3203
3275
|
# # │ C ┆ 2 │
|
3204
3276
|
# # │ D ┆ 3 │
|
3205
|
-
# # │
|
3277
|
+
# # │ E ┆ 4 │
|
3206
3278
|
# # │ F ┆ 5 │
|
3207
3279
|
# # │ G ┆ 6 │
|
3208
3280
|
# # │ H ┆ 7 │
|
@@ -3255,7 +3327,7 @@ module Polars
|
|
3255
3327
|
n_fill = n_cols * n_rows - height
|
3256
3328
|
|
3257
3329
|
if n_fill > 0
|
3258
|
-
if !fill_values.is_a?(Array)
|
3330
|
+
if !fill_values.is_a?(::Array)
|
3259
3331
|
fill_values = [fill_values] * df.width
|
3260
3332
|
end
|
3261
3333
|
|
@@ -3364,29 +3436,29 @@ module Polars
|
|
3364
3436
|
# # ╞═════╪═════╪═════╡
|
3365
3437
|
# # │ C ┆ 2 ┆ l │
|
3366
3438
|
# # └─────┴─────┴─────┘}
|
3367
|
-
def partition_by(groups, maintain_order: true, as_dict: false)
|
3439
|
+
def partition_by(groups, maintain_order: true, include_key: true, as_dict: false)
|
3368
3440
|
if groups.is_a?(String)
|
3369
3441
|
groups = [groups]
|
3370
|
-
elsif !groups.is_a?(Array)
|
3442
|
+
elsif !groups.is_a?(::Array)
|
3371
3443
|
groups = Array(groups)
|
3372
3444
|
end
|
3373
3445
|
|
3374
3446
|
if as_dict
|
3375
3447
|
out = {}
|
3376
3448
|
if groups.length == 1
|
3377
|
-
_df.partition_by(groups, maintain_order).each do |df|
|
3449
|
+
_df.partition_by(groups, maintain_order, include_key).each do |df|
|
3378
3450
|
df = _from_rbdf(df)
|
3379
3451
|
out[df[groups][0, 0]] = df
|
3380
3452
|
end
|
3381
3453
|
else
|
3382
|
-
_df.partition_by(groups, maintain_order).each do |df|
|
3454
|
+
_df.partition_by(groups, maintain_order, include_key).each do |df|
|
3383
3455
|
df = _from_rbdf(df)
|
3384
3456
|
out[df[groups].row(0)] = df
|
3385
3457
|
end
|
3386
3458
|
end
|
3387
3459
|
out
|
3388
3460
|
else
|
3389
|
-
_df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
|
3461
|
+
_df.partition_by(groups, maintain_order, include_key).map { |df| _from_rbdf(df) }
|
3390
3462
|
end
|
3391
3463
|
end
|
3392
3464
|
|
@@ -3654,7 +3726,7 @@ module Polars
|
|
3654
3726
|
# # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
|
3655
3727
|
# # └─────┴──────┴───────┴──────┴──────┴───────┘
|
3656
3728
|
def with_columns(exprs)
|
3657
|
-
if !exprs.nil? && !exprs.is_a?(Array)
|
3729
|
+
if !exprs.nil? && !exprs.is_a?(::Array)
|
3658
3730
|
exprs = [exprs]
|
3659
3731
|
end
|
3660
3732
|
lazy
|
@@ -4035,11 +4107,11 @@ module Polars
|
|
4035
4107
|
# # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
|
4036
4108
|
# # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
|
4037
4109
|
# # └───────┴───────┴───────┴───────┴───────┴───────┘
|
4038
|
-
def to_dummies(columns: nil, separator: "_")
|
4110
|
+
def to_dummies(columns: nil, separator: "_", drop_first: false)
|
4039
4111
|
if columns.is_a?(String)
|
4040
4112
|
columns = [columns]
|
4041
4113
|
end
|
4042
|
-
_from_rbdf(_df.to_dummies(columns, separator))
|
4114
|
+
_from_rbdf(_df.to_dummies(columns, separator, drop_first))
|
4043
4115
|
end
|
4044
4116
|
|
4045
4117
|
# Drop duplicate rows from this DataFrame.
|
@@ -4127,7 +4199,7 @@ module Polars
|
|
4127
4199
|
subset = [subset]
|
4128
4200
|
end
|
4129
4201
|
|
4130
|
-
if subset.is_a?(Array) && subset.length == 1
|
4202
|
+
if subset.is_a?(::Array) && subset.length == 1
|
4131
4203
|
expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
|
4132
4204
|
else
|
4133
4205
|
struct_fields = subset.nil? ? Polars.all : subset
|
@@ -4428,7 +4500,7 @@ module Polars
|
|
4428
4500
|
end
|
4429
4501
|
end
|
4430
4502
|
|
4431
|
-
# Returns an iterator over the DataFrame of rows of
|
4503
|
+
# Returns an iterator over the DataFrame of rows of Ruby-native values.
|
4432
4504
|
#
|
4433
4505
|
# @param named [Boolean]
|
4434
4506
|
# Return hashes instead of arrays. The hashes are a mapping of
|
@@ -4489,6 +4561,24 @@ module Polars
|
|
4489
4561
|
end
|
4490
4562
|
end
|
4491
4563
|
|
4564
|
+
# Returns an iterator over the DataFrame of rows of Ruby-native values.
|
4565
|
+
#
|
4566
|
+
# @param named [Boolean]
|
4567
|
+
# Return hashes instead of arrays. The hashes are a mapping of
|
4568
|
+
# column name to row value. This is more expensive than returning an
|
4569
|
+
# array, but allows for accessing values by column name.
|
4570
|
+
# @param buffer_size [Integer]
|
4571
|
+
# Determines the number of rows that are buffered internally while iterating
|
4572
|
+
# over the data; you should only modify this in very specific cases where the
|
4573
|
+
# default value is determined not to be a good fit to your access pattern, as
|
4574
|
+
# the speedup from using the buffer is significant (~2-4x). Setting this
|
4575
|
+
# value to zero disables row buffering.
|
4576
|
+
#
|
4577
|
+
# @return [Object]
|
4578
|
+
def each_row(named: true, buffer_size: 500, &block)
|
4579
|
+
iter_rows(named: named, buffer_size: buffer_size, &block)
|
4580
|
+
end
|
4581
|
+
|
4492
4582
|
# Shrink DataFrame memory usage.
|
4493
4583
|
#
|
4494
4584
|
# Shrinks to fit the exact capacity needed to hold the data.
|
@@ -4678,6 +4768,38 @@ module Polars
|
|
4678
4768
|
_from_rbdf(_df.unnest(names))
|
4679
4769
|
end
|
4680
4770
|
|
4771
|
+
# TODO
|
4772
|
+
# def corr
|
4773
|
+
# end
|
4774
|
+
|
4775
|
+
# TODO
|
4776
|
+
# def merge_sorted
|
4777
|
+
# end
|
4778
|
+
|
4779
|
+
# Indicate that one or multiple columns are sorted.
|
4780
|
+
#
|
4781
|
+
# @param column [Object]
|
4782
|
+
# Columns that are sorted
|
4783
|
+
# @param more_columns [Object]
|
4784
|
+
# Additional columns that are sorted, specified as positional arguments.
|
4785
|
+
# @param descending [Boolean]
|
4786
|
+
# Whether the columns are sorted in descending order.
|
4787
|
+
#
|
4788
|
+
# @return [DataFrame]
|
4789
|
+
def set_sorted(
|
4790
|
+
column,
|
4791
|
+
*more_columns,
|
4792
|
+
descending: false
|
4793
|
+
)
|
4794
|
+
lazy
|
4795
|
+
.set_sorted(column, *more_columns, descending: descending)
|
4796
|
+
.collect(no_optimization: true)
|
4797
|
+
end
|
4798
|
+
|
4799
|
+
# TODO
|
4800
|
+
# def update
|
4801
|
+
# end
|
4802
|
+
|
4681
4803
|
private
|
4682
4804
|
|
4683
4805
|
def initialize_copy(other)
|
@@ -4742,20 +4864,63 @@ module Polars
|
|
4742
4864
|
end
|
4743
4865
|
|
4744
4866
|
# @private
|
4745
|
-
def self.
|
4746
|
-
|
4747
|
-
|
4867
|
+
def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
|
4868
|
+
updated_data = {}
|
4869
|
+
unless data.empty?
|
4870
|
+
dtypes = schema_overrides || {}
|
4871
|
+
array_len = data.values.map { |val| Utils.arrlen(val) || 0 }.max
|
4872
|
+
if array_len > 0
|
4873
|
+
data.each do |name, val|
|
4874
|
+
dtype = dtypes[name]
|
4875
|
+
if val.is_a?(Hash) && dtype != Struct
|
4876
|
+
updated_data[name] = DataFrame.new(val).to_struct(name)
|
4877
|
+
elsif !Utils.arrlen(val).nil?
|
4878
|
+
updated_data[name] = Series.new(String.new(name), val, dtype: dtype)
|
4879
|
+
elsif val.nil? || [Integer, Float, TrueClass, FalseClass, String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
|
4880
|
+
dtype = Polars::Float64 if val.nil? && dtype.nil?
|
4881
|
+
updated_data[name] = Series.new(String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
|
4882
|
+
else
|
4883
|
+
raise Todo
|
4884
|
+
end
|
4885
|
+
end
|
4886
|
+
elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
|
4887
|
+
data.each do |name, val|
|
4888
|
+
updated_data[name] = Series.new(name, val, dtype: dtypes[name])
|
4889
|
+
end
|
4890
|
+
elsif data.values.all? { |val| Utils.arrlen(val).nil? }
|
4891
|
+
data.each do |name, val|
|
4892
|
+
updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
|
4893
|
+
end
|
4894
|
+
end
|
4895
|
+
end
|
4896
|
+
updated_data
|
4897
|
+
end
|
4748
4898
|
|
4749
|
-
|
4750
|
-
|
4751
|
-
|
4752
|
-
|
4899
|
+
# @private
|
4900
|
+
def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
|
4901
|
+
if schema.is_a?(Hash) && !data.empty?
|
4902
|
+
if !data.all? { |col, _| schema[col] }
|
4903
|
+
raise ArgumentError, "The given column-schema names do not match the data dictionary"
|
4753
4904
|
end
|
4754
|
-
|
4755
|
-
|
4905
|
+
|
4906
|
+
data = schema.to_h { |col| [col, data[col]] }
|
4756
4907
|
end
|
4757
4908
|
|
4758
|
-
|
4909
|
+
column_names, schema_overrides = _unpack_schema(
|
4910
|
+
schema, lookup_names: data.keys, schema_overrides: schema_overrides
|
4911
|
+
)
|
4912
|
+
if column_names.empty?
|
4913
|
+
column_names = data.keys
|
4914
|
+
end
|
4915
|
+
|
4916
|
+
if data.empty? && !schema_overrides.empty?
|
4917
|
+
data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
|
4918
|
+
else
|
4919
|
+
data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
|
4920
|
+
end
|
4921
|
+
|
4922
|
+
data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
|
4923
|
+
RbDataFrame.new(data_series)
|
4759
4924
|
end
|
4760
4925
|
|
4761
4926
|
# @private
|
@@ -4764,14 +4929,12 @@ module Polars
|
|
4764
4929
|
end
|
4765
4930
|
|
4766
4931
|
# @private
|
4767
|
-
def self.
|
4768
|
-
|
4769
|
-
|
4770
|
-
if columns.is_a?(Hash)
|
4771
|
-
columns = columns.to_a
|
4932
|
+
def self._unpack_schema(schema, schema_overrides: nil, n_expected: nil, lookup_names: nil, include_overrides_in_columns: false)
|
4933
|
+
if schema.is_a?(Hash)
|
4934
|
+
schema = schema.to_a
|
4772
4935
|
end
|
4773
4936
|
column_names =
|
4774
|
-
(
|
4937
|
+
(schema || []).map.with_index do |col, i|
|
4775
4938
|
if col.is_a?(String)
|
4776
4939
|
col || "column_#{i}"
|
4777
4940
|
else
|
@@ -4784,21 +4947,38 @@ module Polars
|
|
4784
4947
|
# TODO zip_longest
|
4785
4948
|
lookup = column_names.zip(lookup_names || []).to_h
|
4786
4949
|
|
4787
|
-
|
4788
|
-
|
4789
|
-
(columns || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
|
4950
|
+
column_dtypes =
|
4951
|
+
(schema || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
|
4790
4952
|
[lookup[col[0]] || col[0], col[1]]
|
4791
4953
|
end
|
4792
|
-
|
4954
|
+
|
4955
|
+
if schema_overrides
|
4956
|
+
raise Todo
|
4957
|
+
end
|
4958
|
+
|
4959
|
+
column_dtypes.each do |col, dtype|
|
4960
|
+
if !Utils.is_polars_dtype(dtype, include_unknown: true) && !dtype.nil?
|
4961
|
+
column_dtypes[col] = Utils.rb_type_to_dtype(dtype)
|
4962
|
+
end
|
4963
|
+
end
|
4964
|
+
|
4965
|
+
[column_names, column_dtypes]
|
4793
4966
|
end
|
4794
4967
|
|
4795
|
-
def self._handle_columns_arg(data, columns: nil)
|
4796
|
-
if columns.nil?
|
4968
|
+
def self._handle_columns_arg(data, columns: nil, from_hash: false)
|
4969
|
+
if columns.nil? || columns.empty?
|
4797
4970
|
data
|
4798
4971
|
else
|
4799
4972
|
if data.empty?
|
4800
4973
|
columns.map { |c| Series.new(c, nil)._s }
|
4801
4974
|
elsif data.length == columns.length
|
4975
|
+
if from_hash
|
4976
|
+
series_map = data.to_h { |s| [s.name, s] }
|
4977
|
+
if columns.all? { |col| series_map.key?(col) }
|
4978
|
+
return columns.map { |col| series_map[col] }
|
4979
|
+
end
|
4980
|
+
end
|
4981
|
+
|
4802
4982
|
columns.each_with_index do |c, i|
|
4803
4983
|
# not in-place?
|
4804
4984
|
data[i].rename(c)
|
@@ -4813,7 +4993,7 @@ module Polars
|
|
4813
4993
|
def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
|
4814
4994
|
rbdf_columns = rbdf.columns
|
4815
4995
|
rbdf_dtypes = rbdf.dtypes
|
4816
|
-
columns, dtypes =
|
4996
|
+
columns, dtypes = _unpack_schema(
|
4817
4997
|
(columns || rbdf_columns), schema_overrides: schema_overrides
|
4818
4998
|
)
|
4819
4999
|
column_subset = []
|
@@ -4829,7 +5009,7 @@ module Polars
|
|
4829
5009
|
columns.each do |col, i|
|
4830
5010
|
if dtypes[col] == Categorical # != rbdf_dtypes[i]
|
4831
5011
|
column_casts << Polars.col(col).cast(Categorical)._rbexpr
|
4832
|
-
elsif structs
|
5012
|
+
elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
|
4833
5013
|
column_casts << Polars.col(col).cast(structs[col])._rbexpr
|
4834
5014
|
elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
|
4835
5015
|
column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
|
@@ -4851,27 +5031,30 @@ module Polars
|
|
4851
5031
|
end
|
4852
5032
|
|
4853
5033
|
# @private
|
4854
|
-
def self.sequence_to_rbdf(data,
|
5034
|
+
def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
|
5035
|
+
raise Todo if schema_overrides
|
5036
|
+
columns = schema
|
5037
|
+
|
4855
5038
|
if data.length == 0
|
4856
|
-
return hash_to_rbdf({},
|
5039
|
+
return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
|
4857
5040
|
end
|
4858
5041
|
|
4859
5042
|
if data[0].is_a?(Series)
|
4860
5043
|
# series_names = data.map(&:name)
|
4861
|
-
# columns, dtypes =
|
5044
|
+
# columns, dtypes = _unpack_schema(columns || series_names, n_expected: data.length)
|
4862
5045
|
data_series = []
|
4863
5046
|
data.each do |s|
|
4864
5047
|
data_series << s._s
|
4865
5048
|
end
|
4866
5049
|
elsif data[0].is_a?(Hash)
|
4867
|
-
column_names, dtypes =
|
5050
|
+
column_names, dtypes = _unpack_schema(columns)
|
4868
5051
|
schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
|
4869
5052
|
rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
|
4870
5053
|
if column_names
|
4871
5054
|
rbdf = _post_apply_columns(rbdf, column_names)
|
4872
5055
|
end
|
4873
5056
|
return rbdf
|
4874
|
-
elsif data[0].is_a?(Array)
|
5057
|
+
elsif data[0].is_a?(::Array)
|
4875
5058
|
if orient.nil? && !columns.nil?
|
4876
5059
|
orient = columns.length == data.length ? "col" : "row"
|
4877
5060
|
end
|
@@ -4890,11 +5073,21 @@ module Polars
|
|
4890
5073
|
end
|
4891
5074
|
|
4892
5075
|
# @private
|
4893
|
-
def self.series_to_rbdf(data,
|
4894
|
-
|
4895
|
-
|
5076
|
+
def self.series_to_rbdf(data, schema: nil, schema_overrides: nil)
|
5077
|
+
data_series = [data._s]
|
5078
|
+
series_name = data_series.map(&:name)
|
5079
|
+
column_names, schema_overrides = _unpack_schema(
|
5080
|
+
schema || series_name, schema_overrides: schema_overrides, n_expected: 1
|
5081
|
+
)
|
5082
|
+
if schema_overrides.any?
|
5083
|
+
new_dtype = schema_overrides.values[0]
|
5084
|
+
if new_dtype != data.dtype
|
5085
|
+
data_series[0] = data_series[0].cast(new_dtype, true)
|
5086
|
+
end
|
4896
5087
|
end
|
4897
|
-
|
5088
|
+
|
5089
|
+
data_series = _handle_columns_arg(data_series, columns: column_names)
|
5090
|
+
RbDataFrame.new(data_series)
|
4898
5091
|
end
|
4899
5092
|
|
4900
5093
|
def wrap_ldf(ldf)
|
@@ -4966,7 +5159,7 @@ module Polars
|
|
4966
5159
|
|
4967
5160
|
def _prepare_other_arg(other)
|
4968
5161
|
if !other.is_a?(Series)
|
4969
|
-
if other.is_a?(Array)
|
5162
|
+
if other.is_a?(::Array)
|
4970
5163
|
raise ArgumentError, "Operation not supported."
|
4971
5164
|
end
|
4972
5165
|
|