polars-df 0.3.1-x86_64-linux → 0.5.0-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -1
- data/Cargo.lock +486 -380
- data/Cargo.toml +0 -2
- data/LICENSE-THIRD-PARTY.txt +7353 -8473
- data/README.md +31 -2
- data/lib/polars/3.0/polars.so +0 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/convert.rb +2 -2
- data/lib/polars/data_frame.rb +263 -87
- data/lib/polars/data_types.rb +6 -4
- data/lib/polars/date_time_expr.rb +148 -8
- data/lib/polars/expr.rb +78 -11
- data/lib/polars/io.rb +73 -62
- data/lib/polars/lazy_frame.rb +107 -10
- data/lib/polars/lazy_functions.rb +7 -3
- data/lib/polars/list_expr.rb +70 -21
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/series.rb +190 -74
- data/lib/polars/string_expr.rb +150 -44
- data/lib/polars/string_name_space.rb +4 -4
- data/lib/polars/struct_name_space.rb +32 -0
- data/lib/polars/utils.rb +51 -9
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +4 -2
- metadata +4 -2
data/lib/polars/data_frame.rb
CHANGED
@@ -18,7 +18,10 @@ module Polars
|
|
18
18
|
# Whether to interpret two-dimensional data as columns or as rows. If `nil`,
|
19
19
|
# the orientation is inferred by matching the columns and data dimensions. If
|
20
20
|
# this does not yield conclusive results, column orientation is used.
|
21
|
-
def initialize(data = nil, columns: nil, orient: nil)
|
21
|
+
def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
|
22
|
+
schema ||= columns
|
23
|
+
raise Todo if schema_overrides
|
24
|
+
|
22
25
|
# TODO deprecate in favor of read_sql
|
23
26
|
if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
|
24
27
|
result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
|
@@ -29,14 +32,14 @@ module Polars
|
|
29
32
|
end
|
30
33
|
|
31
34
|
if data.nil?
|
32
|
-
self._df = self.class.hash_to_rbdf({},
|
35
|
+
self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
|
33
36
|
elsif data.is_a?(Hash)
|
34
37
|
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
|
35
|
-
self._df = self.class.hash_to_rbdf(data,
|
38
|
+
self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
|
36
39
|
elsif data.is_a?(Array)
|
37
|
-
self._df = self.class.sequence_to_rbdf(data,
|
40
|
+
self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
|
38
41
|
elsif data.is_a?(Series)
|
39
|
-
self._df = self.class.series_to_rbdf(data,
|
42
|
+
self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
|
40
43
|
else
|
41
44
|
raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
|
42
45
|
end
|
@@ -56,8 +59,8 @@ module Polars
|
|
56
59
|
end
|
57
60
|
|
58
61
|
# @private
|
59
|
-
def self._from_hash(data,
|
60
|
-
_from_rbdf(hash_to_rbdf(data,
|
62
|
+
def self._from_hash(data, schema: nil, schema_overrides: nil)
|
63
|
+
_from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
|
61
64
|
end
|
62
65
|
|
63
66
|
# def self._from_records
|
@@ -97,7 +100,7 @@ module Polars
|
|
97
100
|
eol_char: "\n"
|
98
101
|
)
|
99
102
|
if Utils.pathlike?(file)
|
100
|
-
path = Utils.
|
103
|
+
path = Utils.normalise_filepath(file)
|
101
104
|
else
|
102
105
|
path = nil
|
103
106
|
# if defined?(StringIO) && file.is_a?(StringIO)
|
@@ -196,32 +199,56 @@ module Polars
|
|
196
199
|
|
197
200
|
# @private
|
198
201
|
def self._read_parquet(
|
199
|
-
|
202
|
+
source,
|
200
203
|
columns: nil,
|
201
204
|
n_rows: nil,
|
202
205
|
parallel: "auto",
|
203
206
|
row_count_name: nil,
|
204
207
|
row_count_offset: 0,
|
205
|
-
low_memory: false
|
208
|
+
low_memory: false,
|
209
|
+
use_statistics: true,
|
210
|
+
rechunk: true
|
206
211
|
)
|
207
|
-
if Utils.pathlike?(
|
208
|
-
|
212
|
+
if Utils.pathlike?(source)
|
213
|
+
source = Utils.normalise_filepath(source)
|
214
|
+
end
|
215
|
+
if columns.is_a?(String)
|
216
|
+
columns = [columns]
|
209
217
|
end
|
210
218
|
|
211
|
-
if
|
212
|
-
|
219
|
+
if source.is_a?(String) && source.include?("*") && Utils.local_file?(source)
|
220
|
+
scan =
|
221
|
+
Polars.scan_parquet(
|
222
|
+
source,
|
223
|
+
n_rows: n_rows,
|
224
|
+
rechunk: true,
|
225
|
+
parallel: parallel,
|
226
|
+
row_count_name: row_count_name,
|
227
|
+
row_count_offset: row_count_offset,
|
228
|
+
low_memory: low_memory
|
229
|
+
)
|
230
|
+
|
231
|
+
if columns.nil?
|
232
|
+
return self._from_rbdf(scan.collect._df)
|
233
|
+
elsif Utils.is_str_sequence(columns, allow_str: false)
|
234
|
+
return self._from_rbdf(scan.select(columns).collect._df)
|
235
|
+
else
|
236
|
+
raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
|
237
|
+
end
|
213
238
|
end
|
214
239
|
|
215
240
|
projection, columns = Utils.handle_projection_columns(columns)
|
216
241
|
_from_rbdf(
|
217
242
|
RbDataFrame.read_parquet(
|
218
|
-
|
243
|
+
source,
|
219
244
|
columns,
|
220
245
|
projection,
|
221
246
|
n_rows,
|
222
247
|
parallel,
|
223
248
|
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
224
|
-
low_memory
|
249
|
+
low_memory,
|
250
|
+
use_statistics,
|
251
|
+
rechunk
|
225
252
|
)
|
226
253
|
)
|
227
254
|
end
|
@@ -229,7 +256,7 @@ module Polars
|
|
229
256
|
# @private
|
230
257
|
def self._read_avro(file, columns: nil, n_rows: nil)
|
231
258
|
if Utils.pathlike?(file)
|
232
|
-
file = Utils.
|
259
|
+
file = Utils.normalise_filepath(file)
|
233
260
|
end
|
234
261
|
projection, columns = Utils.handle_projection_columns(columns)
|
235
262
|
_from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
|
@@ -246,7 +273,7 @@ module Polars
|
|
246
273
|
memory_map: true
|
247
274
|
)
|
248
275
|
if Utils.pathlike?(file)
|
249
|
-
file = Utils.
|
276
|
+
file = Utils.normalise_filepath(file)
|
250
277
|
end
|
251
278
|
if columns.is_a?(String)
|
252
279
|
columns = [columns]
|
@@ -272,7 +299,7 @@ module Polars
|
|
272
299
|
# @private
|
273
300
|
def self._read_json(file)
|
274
301
|
if Utils.pathlike?(file)
|
275
|
-
file = Utils.
|
302
|
+
file = Utils.normalise_filepath(file)
|
276
303
|
end
|
277
304
|
|
278
305
|
_from_rbdf(RbDataFrame.read_json(file))
|
@@ -281,7 +308,7 @@ module Polars
|
|
281
308
|
# @private
|
282
309
|
def self._read_ndjson(file)
|
283
310
|
if Utils.pathlike?(file)
|
284
|
-
file = Utils.
|
311
|
+
file = Utils.normalise_filepath(file)
|
285
312
|
end
|
286
313
|
|
287
314
|
_from_rbdf(RbDataFrame.read_ndjson(file))
|
@@ -312,6 +339,7 @@ module Polars
|
|
312
339
|
end
|
313
340
|
alias_method :count, :height
|
314
341
|
alias_method :length, :height
|
342
|
+
alias_method :size, :height
|
315
343
|
|
316
344
|
# Get the width of the DataFrame.
|
317
345
|
#
|
@@ -522,6 +550,13 @@ module Polars
|
|
522
550
|
end
|
523
551
|
alias_method :inspect, :to_s
|
524
552
|
|
553
|
+
# Returns an array representing the DataFrame
|
554
|
+
#
|
555
|
+
# @return [Array]
|
556
|
+
def to_a
|
557
|
+
rows(named: true)
|
558
|
+
end
|
559
|
+
|
525
560
|
# Check if DataFrame includes column.
|
526
561
|
#
|
527
562
|
# @return [Boolean]
|
@@ -631,7 +666,7 @@ module Polars
|
|
631
666
|
end
|
632
667
|
|
633
668
|
# Ruby-specific
|
634
|
-
if item.is_a?(Expr)
|
669
|
+
if item.is_a?(Expr) || item.is_a?(Series)
|
635
670
|
return filter(item)
|
636
671
|
end
|
637
672
|
|
@@ -641,15 +676,42 @@ module Polars
|
|
641
676
|
# Set item.
|
642
677
|
#
|
643
678
|
# @return [Object]
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
679
|
+
def []=(*key, value)
|
680
|
+
if key.length == 1
|
681
|
+
key = key.first
|
682
|
+
elsif key.length != 2
|
683
|
+
raise ArgumentError, "wrong number of arguments (given #{key.length + 1}, expected 2..3)"
|
684
|
+
end
|
649
685
|
|
650
|
-
|
651
|
-
|
686
|
+
if Utils.strlike?(key)
|
687
|
+
if value.is_a?(Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
|
688
|
+
value = Series.new(value)
|
689
|
+
elsif !value.is_a?(Series)
|
690
|
+
value = Polars.lit(value)
|
691
|
+
end
|
692
|
+
self._df = with_column(value.alias(key.to_s))._df
|
693
|
+
elsif key.is_a?(Array)
|
694
|
+
row_selection, col_selection = key
|
695
|
+
|
696
|
+
if Utils.strlike?(col_selection)
|
697
|
+
s = self[col_selection]
|
698
|
+
elsif col_selection.is_a?(Integer)
|
699
|
+
raise Todo
|
700
|
+
else
|
701
|
+
raise ArgumentError, "column selection not understood: #{col_selection}"
|
702
|
+
end
|
703
|
+
|
704
|
+
s[row_selection] = value
|
652
705
|
|
706
|
+
if col_selection.is_a?(Integer)
|
707
|
+
replace_at_idx(col_selection, s)
|
708
|
+
elsif Utils.strlike?(col_selection)
|
709
|
+
replace(col_selection, s)
|
710
|
+
end
|
711
|
+
else
|
712
|
+
raise Todo
|
713
|
+
end
|
714
|
+
end
|
653
715
|
|
654
716
|
# Return the dataframe as a scalar.
|
655
717
|
#
|
@@ -774,7 +836,7 @@ module Polars
|
|
774
836
|
row_oriented: false
|
775
837
|
)
|
776
838
|
if Utils.pathlike?(file)
|
777
|
-
file = Utils.
|
839
|
+
file = Utils.normalise_filepath(file)
|
778
840
|
end
|
779
841
|
|
780
842
|
_df.write_json(file, pretty, row_oriented)
|
@@ -789,7 +851,7 @@ module Polars
|
|
789
851
|
# @return [nil]
|
790
852
|
def write_ndjson(file)
|
791
853
|
if Utils.pathlike?(file)
|
792
|
-
file = Utils.
|
854
|
+
file = Utils.normalise_filepath(file)
|
793
855
|
end
|
794
856
|
|
795
857
|
_df.write_ndjson(file)
|
@@ -879,7 +941,7 @@ module Polars
|
|
879
941
|
end
|
880
942
|
|
881
943
|
if Utils.pathlike?(file)
|
882
|
-
file = Utils.
|
944
|
+
file = Utils.normalise_filepath(file)
|
883
945
|
end
|
884
946
|
|
885
947
|
_df.write_csv(
|
@@ -917,7 +979,7 @@ module Polars
|
|
917
979
|
compression = "uncompressed"
|
918
980
|
end
|
919
981
|
if Utils.pathlike?(file)
|
920
|
-
file = Utils.
|
982
|
+
file = Utils.normalise_filepath(file)
|
921
983
|
end
|
922
984
|
|
923
985
|
_df.write_avro(file, compression)
|
@@ -936,7 +998,7 @@ module Polars
|
|
936
998
|
compression = "uncompressed"
|
937
999
|
end
|
938
1000
|
if Utils.pathlike?(file)
|
939
|
-
file = Utils.
|
1001
|
+
file = Utils.normalise_filepath(file)
|
940
1002
|
end
|
941
1003
|
|
942
1004
|
_df.write_ipc(file, compression)
|
@@ -978,7 +1040,7 @@ module Polars
|
|
978
1040
|
compression = "uncompressed"
|
979
1041
|
end
|
980
1042
|
if Utils.pathlike?(file)
|
981
|
-
file = Utils.
|
1043
|
+
file = Utils.normalise_filepath(file)
|
982
1044
|
end
|
983
1045
|
|
984
1046
|
_df.write_parquet(
|
@@ -1438,6 +1500,20 @@ module Polars
|
|
1438
1500
|
end
|
1439
1501
|
end
|
1440
1502
|
|
1503
|
+
# Sort the DataFrame by column in-place.
|
1504
|
+
#
|
1505
|
+
# @param by [String]
|
1506
|
+
# By which column to sort.
|
1507
|
+
# @param reverse [Boolean]
|
1508
|
+
# Reverse/descending sort.
|
1509
|
+
# @param nulls_last [Boolean]
|
1510
|
+
# Place null values last. Can only be used if sorted by a single column.
|
1511
|
+
#
|
1512
|
+
# @return [DataFrame]
|
1513
|
+
def sort!(by, reverse: false, nulls_last: false)
|
1514
|
+
self._df = sort(by, reverse: reverse, nulls_last: nulls_last)._df
|
1515
|
+
end
|
1516
|
+
|
1441
1517
|
# Check if DataFrame is equal to other.
|
1442
1518
|
#
|
1443
1519
|
# @param other [DataFrame]
|
@@ -1495,7 +1571,7 @@ module Polars
|
|
1495
1571
|
# # │ 30 ┆ 6 │
|
1496
1572
|
# # └─────┴─────┘
|
1497
1573
|
def replace(column, new_col)
|
1498
|
-
_df.replace(column, new_col._s)
|
1574
|
+
_df.replace(column.to_s, new_col._s)
|
1499
1575
|
self
|
1500
1576
|
end
|
1501
1577
|
|
@@ -1836,7 +1912,7 @@ module Polars
|
|
1836
1912
|
# "2020-01-08 23:16:43"
|
1837
1913
|
# ]
|
1838
1914
|
# df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
1839
|
-
# Polars.col("dt").str.strptime(
|
1915
|
+
# Polars.col("dt").str.strptime(Polars::Datetime)
|
1840
1916
|
# )
|
1841
1917
|
# df.groupby_rolling(index_column: "dt", period: "2d").agg(
|
1842
1918
|
# [
|
@@ -2767,6 +2843,16 @@ module Polars
|
|
2767
2843
|
Utils.wrap_s(_df.drop_in_place(name))
|
2768
2844
|
end
|
2769
2845
|
|
2846
|
+
# Drop in place if exists.
|
2847
|
+
#
|
2848
|
+
# @param name [Object]
|
2849
|
+
# Column to drop.
|
2850
|
+
#
|
2851
|
+
# @return [Series]
|
2852
|
+
def delete(name)
|
2853
|
+
drop_in_place(name) if include?(name)
|
2854
|
+
end
|
2855
|
+
|
2770
2856
|
# Create an empty copy of the current DataFrame.
|
2771
2857
|
#
|
2772
2858
|
# Returns a DataFrame with identical schema but no data.
|
@@ -3042,24 +3128,28 @@ module Polars
|
|
3042
3128
|
if aggregate_fn.is_a?(String)
|
3043
3129
|
case aggregate_fn
|
3044
3130
|
when "first"
|
3045
|
-
|
3131
|
+
aggregate_expr = Polars.element.first._rbexpr
|
3046
3132
|
when "sum"
|
3047
|
-
|
3133
|
+
aggregate_expr = Polars.element.sum._rbexpr
|
3048
3134
|
when "max"
|
3049
|
-
|
3135
|
+
aggregate_expr = Polars.element.max._rbexpr
|
3050
3136
|
when "min"
|
3051
|
-
|
3137
|
+
aggregate_expr = Polars.element.min._rbexpr
|
3052
3138
|
when "mean"
|
3053
|
-
|
3139
|
+
aggregate_expr = Polars.element.mean._rbexpr
|
3054
3140
|
when "median"
|
3055
|
-
|
3141
|
+
aggregate_expr = Polars.element.median._rbexpr
|
3056
3142
|
when "last"
|
3057
|
-
|
3143
|
+
aggregate_expr = Polars.element.last._rbexpr
|
3058
3144
|
when "count"
|
3059
|
-
|
3145
|
+
aggregate_expr = Polars.count._rbexpr
|
3060
3146
|
else
|
3061
3147
|
raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
|
3062
3148
|
end
|
3149
|
+
elsif aggregate_fn.nil?
|
3150
|
+
aggregate_expr = nil
|
3151
|
+
else
|
3152
|
+
aggregate_expr = aggregate_function._rbexpr
|
3063
3153
|
end
|
3064
3154
|
|
3065
3155
|
_from_rbdf(
|
@@ -3067,9 +3157,9 @@ module Polars
|
|
3067
3157
|
values,
|
3068
3158
|
index,
|
3069
3159
|
columns,
|
3070
|
-
aggregate_fn._rbexpr,
|
3071
3160
|
maintain_order,
|
3072
3161
|
sort_columns,
|
3162
|
+
aggregate_expr,
|
3073
3163
|
separator
|
3074
3164
|
)
|
3075
3165
|
)
|
@@ -3174,7 +3264,7 @@ module Polars
|
|
3174
3264
|
# # │ B ┆ 1 │
|
3175
3265
|
# # │ C ┆ 2 │
|
3176
3266
|
# # │ D ┆ 3 │
|
3177
|
-
# # │
|
3267
|
+
# # │ E ┆ 4 │
|
3178
3268
|
# # │ F ┆ 5 │
|
3179
3269
|
# # │ G ┆ 6 │
|
3180
3270
|
# # │ H ┆ 7 │
|
@@ -4053,15 +4143,12 @@ module Polars
|
|
4053
4143
|
# # │ 5 ┆ 3.0 ┆ true │
|
4054
4144
|
# # └─────┴─────┴───────┘
|
4055
4145
|
def unique(maintain_order: true, subset: nil, keep: "first")
|
4056
|
-
|
4057
|
-
|
4058
|
-
subset
|
4059
|
-
|
4060
|
-
|
4061
|
-
|
4062
|
-
end
|
4063
|
-
|
4064
|
-
_from_rbdf(_df.unique(maintain_order, subset, keep))
|
4146
|
+
self._from_rbdf(
|
4147
|
+
lazy
|
4148
|
+
.unique(maintain_order: maintain_order, subset: subset, keep: keep)
|
4149
|
+
.collect(no_optimization: true)
|
4150
|
+
._df
|
4151
|
+
)
|
4065
4152
|
end
|
4066
4153
|
|
4067
4154
|
# Return the number of unique rows, or the number of unique row-subsets.
|
@@ -4403,7 +4490,7 @@ module Polars
|
|
4403
4490
|
end
|
4404
4491
|
end
|
4405
4492
|
|
4406
|
-
# Returns an iterator over the DataFrame of rows of
|
4493
|
+
# Returns an iterator over the DataFrame of rows of Ruby-native values.
|
4407
4494
|
#
|
4408
4495
|
# @param named [Boolean]
|
4409
4496
|
# Return hashes instead of arrays. The hashes are a mapping of
|
@@ -4464,6 +4551,24 @@ module Polars
|
|
4464
4551
|
end
|
4465
4552
|
end
|
4466
4553
|
|
4554
|
+
# Returns an iterator over the DataFrame of rows of Ruby-native values.
|
4555
|
+
#
|
4556
|
+
# @param named [Boolean]
|
4557
|
+
# Return hashes instead of arrays. The hashes are a mapping of
|
4558
|
+
# column name to row value. This is more expensive than returning an
|
4559
|
+
# array, but allows for accessing values by column name.
|
4560
|
+
# @param buffer_size [Integer]
|
4561
|
+
# Determines the number of rows that are buffered internally while iterating
|
4562
|
+
# over the data; you should only modify this in very specific cases where the
|
4563
|
+
# default value is determined not to be a good fit to your access pattern, as
|
4564
|
+
# the speedup from using the buffer is significant (~2-4x). Setting this
|
4565
|
+
# value to zero disables row buffering.
|
4566
|
+
#
|
4567
|
+
# @return [Object]
|
4568
|
+
def each_row(named: true, buffer_size: 500, &block)
|
4569
|
+
iter_rows(named: named, buffer_size: buffer_size, &block)
|
4570
|
+
end
|
4571
|
+
|
4467
4572
|
# Shrink DataFrame memory usage.
|
4468
4573
|
#
|
4469
4574
|
# Shrinks to fit the exact capacity needed to hold the data.
|
@@ -4717,20 +4822,63 @@ module Polars
|
|
4717
4822
|
end
|
4718
4823
|
|
4719
4824
|
# @private
|
4720
|
-
def self.
|
4721
|
-
|
4722
|
-
|
4825
|
+
def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
|
4826
|
+
updated_data = {}
|
4827
|
+
unless data.empty?
|
4828
|
+
dtypes = schema_overrides || {}
|
4829
|
+
array_len = data.values.map { |val| Utils.arrlen(val) || 0 }.max
|
4830
|
+
if array_len > 0
|
4831
|
+
data.each do |name, val|
|
4832
|
+
dtype = dtypes[name]
|
4833
|
+
if val.is_a?(Hash) && dtype != Struct
|
4834
|
+
updated_data[name] = DataFrame.new(val).to_struct(name)
|
4835
|
+
elsif !Utils.arrlen(val).nil?
|
4836
|
+
updated_data[name] = Series.new(String.new(name), val, dtype: dtype)
|
4837
|
+
elsif val.nil? || [Integer, Float, TrueClass, FalseClass, String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
|
4838
|
+
dtype = Polars::Float64 if val.nil? && dtype.nil?
|
4839
|
+
updated_data[name] = Series.new(String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
|
4840
|
+
else
|
4841
|
+
raise Todo
|
4842
|
+
end
|
4843
|
+
end
|
4844
|
+
elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
|
4845
|
+
data.each do |name, val|
|
4846
|
+
updated_data[name] = Series.new(name, val, dtype: dtypes[name])
|
4847
|
+
end
|
4848
|
+
elsif data.values.all? { |val| Utils.arrlen(val).nil? }
|
4849
|
+
data.each do |name, val|
|
4850
|
+
updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
|
4851
|
+
end
|
4852
|
+
end
|
4853
|
+
end
|
4854
|
+
updated_data
|
4855
|
+
end
|
4723
4856
|
|
4724
|
-
|
4725
|
-
|
4726
|
-
|
4727
|
-
|
4857
|
+
# @private
|
4858
|
+
def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
|
4859
|
+
if schema.is_a?(Hash) && !data.empty?
|
4860
|
+
if !data.all? { |col, _| schema[col] }
|
4861
|
+
raise ArgumentError, "The given column-schema names do not match the data dictionary"
|
4728
4862
|
end
|
4729
|
-
|
4730
|
-
|
4863
|
+
|
4864
|
+
data = schema.to_h { |col| [col, data[col]] }
|
4865
|
+
end
|
4866
|
+
|
4867
|
+
column_names, schema_overrides = _unpack_schema(
|
4868
|
+
schema, lookup_names: data.keys, schema_overrides: schema_overrides
|
4869
|
+
)
|
4870
|
+
if column_names.empty?
|
4871
|
+
column_names = data.keys
|
4872
|
+
end
|
4873
|
+
|
4874
|
+
if data.empty? && !schema_overrides.empty?
|
4875
|
+
data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
|
4876
|
+
else
|
4877
|
+
data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
|
4731
4878
|
end
|
4732
4879
|
|
4733
|
-
|
4880
|
+
data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
|
4881
|
+
RbDataFrame.new(data_series)
|
4734
4882
|
end
|
4735
4883
|
|
4736
4884
|
# @private
|
@@ -4739,14 +4887,12 @@ module Polars
|
|
4739
4887
|
end
|
4740
4888
|
|
4741
4889
|
# @private
|
4742
|
-
def self.
|
4743
|
-
|
4744
|
-
|
4745
|
-
if columns.is_a?(Hash)
|
4746
|
-
columns = columns.to_a
|
4890
|
+
def self._unpack_schema(schema, schema_overrides: nil, n_expected: nil, lookup_names: nil, include_overrides_in_columns: false)
|
4891
|
+
if schema.is_a?(Hash)
|
4892
|
+
schema = schema.to_a
|
4747
4893
|
end
|
4748
4894
|
column_names =
|
4749
|
-
(
|
4895
|
+
(schema || []).map.with_index do |col, i|
|
4750
4896
|
if col.is_a?(String)
|
4751
4897
|
col || "column_#{i}"
|
4752
4898
|
else
|
@@ -4759,21 +4905,38 @@ module Polars
|
|
4759
4905
|
# TODO zip_longest
|
4760
4906
|
lookup = column_names.zip(lookup_names || []).to_h
|
4761
4907
|
|
4762
|
-
|
4763
|
-
|
4764
|
-
(columns || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
|
4908
|
+
column_dtypes =
|
4909
|
+
(schema || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
|
4765
4910
|
[lookup[col[0]] || col[0], col[1]]
|
4766
4911
|
end
|
4767
|
-
|
4912
|
+
|
4913
|
+
if schema_overrides
|
4914
|
+
raise Todo
|
4915
|
+
end
|
4916
|
+
|
4917
|
+
column_dtypes.each do |col, dtype|
|
4918
|
+
if !Utils.is_polars_dtype(dtype, include_unknown: true) && !dtype.nil?
|
4919
|
+
column_dtypes[col] = Utils.rb_type_to_dtype(dtype)
|
4920
|
+
end
|
4921
|
+
end
|
4922
|
+
|
4923
|
+
[column_names, column_dtypes]
|
4768
4924
|
end
|
4769
4925
|
|
4770
|
-
def self._handle_columns_arg(data, columns: nil)
|
4771
|
-
if columns.nil?
|
4926
|
+
def self._handle_columns_arg(data, columns: nil, from_hash: false)
|
4927
|
+
if columns.nil? || columns.empty?
|
4772
4928
|
data
|
4773
4929
|
else
|
4774
4930
|
if data.empty?
|
4775
4931
|
columns.map { |c| Series.new(c, nil)._s }
|
4776
4932
|
elsif data.length == columns.length
|
4933
|
+
if from_hash
|
4934
|
+
series_map = data.to_h { |s| [s.name, s] }
|
4935
|
+
if columns.all? { |col| series_map.key?(col) }
|
4936
|
+
return columns.map { |col| series_map[col] }
|
4937
|
+
end
|
4938
|
+
end
|
4939
|
+
|
4777
4940
|
columns.each_with_index do |c, i|
|
4778
4941
|
# not in-place?
|
4779
4942
|
data[i].rename(c)
|
@@ -4788,7 +4951,7 @@ module Polars
|
|
4788
4951
|
def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
|
4789
4952
|
rbdf_columns = rbdf.columns
|
4790
4953
|
rbdf_dtypes = rbdf.dtypes
|
4791
|
-
columns, dtypes =
|
4954
|
+
columns, dtypes = _unpack_schema(
|
4792
4955
|
(columns || rbdf_columns), schema_overrides: schema_overrides
|
4793
4956
|
)
|
4794
4957
|
column_subset = []
|
@@ -4826,20 +4989,23 @@ module Polars
|
|
4826
4989
|
end
|
4827
4990
|
|
4828
4991
|
# @private
|
4829
|
-
def self.sequence_to_rbdf(data,
|
4992
|
+
def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
|
4993
|
+
raise Todo if schema_overrides
|
4994
|
+
columns = schema
|
4995
|
+
|
4830
4996
|
if data.length == 0
|
4831
|
-
return hash_to_rbdf({},
|
4997
|
+
return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
|
4832
4998
|
end
|
4833
4999
|
|
4834
5000
|
if data[0].is_a?(Series)
|
4835
5001
|
# series_names = data.map(&:name)
|
4836
|
-
# columns, dtypes =
|
5002
|
+
# columns, dtypes = _unpack_schema(columns || series_names, n_expected: data.length)
|
4837
5003
|
data_series = []
|
4838
5004
|
data.each do |s|
|
4839
5005
|
data_series << s._s
|
4840
5006
|
end
|
4841
5007
|
elsif data[0].is_a?(Hash)
|
4842
|
-
column_names, dtypes =
|
5008
|
+
column_names, dtypes = _unpack_schema(columns)
|
4843
5009
|
schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
|
4844
5010
|
rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
|
4845
5011
|
if column_names
|
@@ -4865,11 +5031,21 @@ module Polars
|
|
4865
5031
|
end
|
4866
5032
|
|
4867
5033
|
# @private
|
4868
|
-
def self.series_to_rbdf(data,
|
4869
|
-
|
4870
|
-
|
5034
|
+
def self.series_to_rbdf(data, schema: nil, schema_overrides: nil)
|
5035
|
+
data_series = [data._s]
|
5036
|
+
series_name = data_series.map(&:name)
|
5037
|
+
column_names, schema_overrides = _unpack_schema(
|
5038
|
+
schema || series_name, schema_overrides: schema_overrides, n_expected: 1
|
5039
|
+
)
|
5040
|
+
if schema_overrides.any?
|
5041
|
+
new_dtype = schema_overrides.values[0]
|
5042
|
+
if new_dtype != data.dtype
|
5043
|
+
data_series[0] = data_series[0].cast(new_dtype, true)
|
5044
|
+
end
|
4871
5045
|
end
|
4872
|
-
|
5046
|
+
|
5047
|
+
data_series = _handle_columns_arg(data_series, columns: column_names)
|
5048
|
+
RbDataFrame.new(data_series)
|
4873
5049
|
end
|
4874
5050
|
|
4875
5051
|
def wrap_ldf(ldf)
|
data/lib/polars/data_types.rb
CHANGED
@@ -84,20 +84,22 @@ module Polars
|
|
84
84
|
|
85
85
|
# Calendar date and time type.
|
86
86
|
class Datetime < TemporalType
|
87
|
-
attr_reader :
|
87
|
+
attr_reader :time_unit, :time_zone
|
88
|
+
alias_method :tu, :time_unit
|
88
89
|
|
89
90
|
def initialize(time_unit = "us", time_zone = nil)
|
90
|
-
@
|
91
|
+
@time_unit = time_unit || "us"
|
91
92
|
@time_zone = time_zone
|
92
93
|
end
|
93
94
|
end
|
94
95
|
|
95
96
|
# Time duration/delta type.
|
96
97
|
class Duration < TemporalType
|
97
|
-
attr_reader :
|
98
|
+
attr_reader :time_unit
|
99
|
+
alias_method :tu, :time_unit
|
98
100
|
|
99
101
|
def initialize(time_unit = "us")
|
100
|
-
@
|
102
|
+
@time_unit = time_unit
|
101
103
|
end
|
102
104
|
end
|
103
105
|
|